arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/data/storages/base.py +4 -15
- arekit/common/docs/entities_grouping.py +2 -1
- arekit/common/docs/parser.py +27 -22
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +39 -2
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
- arekit/common/utils.py +11 -52
- arekit/contrib/utils/data/contents/opinions.py +13 -3
- arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- arekit/contrib/utils/data/storages/pandas_based.py +2 -17
- arekit/contrib/utils/data/storages/row_cache.py +8 -2
- arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
- arekit/contrib/utils/io_utils/utils.py +1 -18
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
- arekit-0.25.1.data/data/logo.png +0 -0
- arekit-0.25.1.dist-info/METADATA +81 -0
- arekit-0.25.1.dist-info/RECORD +186 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +0 -68
- arekit/common/data/input/repositories/sample.py +0 -22
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +0 -26
- arekit/common/docs/objects_parser.py +0 -37
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +0 -149
- arekit/contrib/networks/embedding_io.py +0 -18
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +0 -6
- arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit/contrib/networks/input/providers/text.py +0 -24
- arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit/contrib/networks/input/term_types.py +0 -13
- arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit/contrib/networks/vectorizer.py +0 -6
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -84
- arekit/contrib/source/brat/doc.py +0 -28
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -13
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -74
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/doc.py +0 -51
- arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -59
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/docs_reader.py +0 -51
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -31
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +0 -7
- arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +0 -50
- arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
- arekit/contrib/utils/download.py +0 -77
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit/contrib/utils/io_utils/opinions.py +0 -37
- arekit/contrib/utils/io_utils/samples.py +0 -79
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -41
- arekit/contrib/utils/lexicons/relation.py +0 -42
- arekit/contrib/utils/lexicons/rusentilex.py +0 -37
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
- arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
- arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit/contrib/utils/resources.py +0 -25
- arekit/contrib/utils/serializer.py +0 -43
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- arekit/download_data.py +0 -11
- arekit-0.24.0.dist-info/METADATA +0 -23
- arekit-0.24.0.dist-info/RECORD +0 -374
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
2
|
-
from arekit.common.opinions.base import Opinion
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class OpinionConverter(object):
|
|
6
|
-
""" Opinion type <-> string Converter.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
@staticmethod
|
|
10
|
-
def try_from_string(line, labels_formatter):
|
|
11
|
-
assert(isinstance(line, str))
|
|
12
|
-
|
|
13
|
-
args = line.strip().split(',')
|
|
14
|
-
assert (len(args) >= 3)
|
|
15
|
-
|
|
16
|
-
source_value = args[0].strip()
|
|
17
|
-
target_value = args[1].strip()
|
|
18
|
-
str_label = args[2].strip()
|
|
19
|
-
|
|
20
|
-
if not labels_formatter.supports_value(str_label):
|
|
21
|
-
return None
|
|
22
|
-
|
|
23
|
-
return Opinion(source_value=source_value,
|
|
24
|
-
target_value=target_value,
|
|
25
|
-
label=labels_formatter.str_to_label(str_label))
|
|
26
|
-
|
|
27
|
-
@staticmethod
|
|
28
|
-
def try_to_string(opinion, labels_formatter):
|
|
29
|
-
assert(isinstance(opinion, Opinion))
|
|
30
|
-
assert(isinstance(labels_formatter, StringLabelsFormatter))
|
|
31
|
-
|
|
32
|
-
label = opinion.Label
|
|
33
|
-
|
|
34
|
-
if not labels_formatter.supports_label(label):
|
|
35
|
-
return None
|
|
36
|
-
|
|
37
|
-
return "{}, {}, {}, current".format(
|
|
38
|
-
opinion.SourceValue,
|
|
39
|
-
opinion.TargetValue,
|
|
40
|
-
labels_formatter.label_to_str(opinion.Label))
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
from arekit.common.opinions.provider import OpinionCollectionsProvider
|
|
2
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
3
|
-
from arekit.contrib.source.rusentrel.opinions.converter import OpinionConverter
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class RuSentRelOpinionCollectionProvider(OpinionCollectionsProvider):
|
|
7
|
-
|
|
8
|
-
@staticmethod
|
|
9
|
-
def _iter_opinions_from_file(input_file, labels_formatter, error_on_non_supported):
|
|
10
|
-
assert(isinstance(labels_formatter, StringLabelsFormatter))
|
|
11
|
-
assert(isinstance(error_on_non_supported, bool))
|
|
12
|
-
|
|
13
|
-
for line in input_file.readlines():
|
|
14
|
-
|
|
15
|
-
# Force perform decoding if needed.
|
|
16
|
-
if isinstance(line, bytes):
|
|
17
|
-
line = line.decode()
|
|
18
|
-
|
|
19
|
-
if line == '\n':
|
|
20
|
-
continue
|
|
21
|
-
|
|
22
|
-
str_opinion = OpinionConverter.try_from_string(
|
|
23
|
-
line=line,
|
|
24
|
-
labels_formatter=labels_formatter)
|
|
25
|
-
|
|
26
|
-
if str_opinion is None:
|
|
27
|
-
if error_on_non_supported:
|
|
28
|
-
raise Exception("Line '{line}' has non supported label")
|
|
29
|
-
else:
|
|
30
|
-
continue
|
|
31
|
-
|
|
32
|
-
yield str_opinion
|
|
33
|
-
|
|
34
|
-
# region public methods
|
|
35
|
-
|
|
36
|
-
def iter_opinions(self, source, encoding, labels_formatter, error_on_non_supported=True):
|
|
37
|
-
"""
|
|
38
|
-
Important: For externally saved collections (using save_to_file method) and related usage
|
|
39
|
-
"""
|
|
40
|
-
assert(isinstance(source, str))
|
|
41
|
-
assert(isinstance(labels_formatter, StringLabelsFormatter))
|
|
42
|
-
assert(isinstance(error_on_non_supported, bool))
|
|
43
|
-
|
|
44
|
-
with open(source, 'r', encoding=encoding) as input_file:
|
|
45
|
-
|
|
46
|
-
it = RuSentRelOpinionCollectionProvider._iter_opinions_from_file(
|
|
47
|
-
input_file=input_file,
|
|
48
|
-
labels_formatter=labels_formatter,
|
|
49
|
-
error_on_non_supported=error_on_non_supported)
|
|
50
|
-
|
|
51
|
-
for opinion in it:
|
|
52
|
-
yield opinion
|
|
53
|
-
|
|
54
|
-
# endregion
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
import io
|
|
2
|
-
|
|
3
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
4
|
-
from arekit.common.opinions.base import Opinion
|
|
5
|
-
from arekit.common.opinions.collection import OpinionCollection
|
|
6
|
-
from arekit.common.opinions.writer import OpinionCollectionWriter
|
|
7
|
-
from arekit.common.utils import create_dir_if_not_exists
|
|
8
|
-
from arekit.contrib.source.rusentrel.opinions.converter import OpinionConverter
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class RuSentRelOpinionCollectionWriter(OpinionCollectionWriter):
|
|
12
|
-
|
|
13
|
-
def serialize(self, collection, target, encoding, labels_formatter, error_on_non_supported=True):
|
|
14
|
-
assert(isinstance(collection, OpinionCollection))
|
|
15
|
-
assert(isinstance(target, str))
|
|
16
|
-
assert(isinstance(labels_formatter, StringLabelsFormatter))
|
|
17
|
-
assert(isinstance(error_on_non_supported, bool))
|
|
18
|
-
|
|
19
|
-
def __opinion_key(opinion):
|
|
20
|
-
assert (isinstance(opinion, Opinion))
|
|
21
|
-
return opinion.SourceValue + opinion.TargetValue
|
|
22
|
-
|
|
23
|
-
sorted_ops = sorted(collection, key=__opinion_key)
|
|
24
|
-
|
|
25
|
-
create_dir_if_not_exists(target)
|
|
26
|
-
|
|
27
|
-
with io.open(target, 'w', encoding=encoding) as f:
|
|
28
|
-
for o in sorted_ops:
|
|
29
|
-
|
|
30
|
-
str_value = OpinionConverter.try_to_string(
|
|
31
|
-
opinion=o,
|
|
32
|
-
labels_formatter=labels_formatter)
|
|
33
|
-
|
|
34
|
-
if str_value is None:
|
|
35
|
-
if error_on_non_supported:
|
|
36
|
-
raise Exception("Opinion label `{label}` is not supported by formatter".format(
|
|
37
|
-
label=o.Label))
|
|
38
|
-
else:
|
|
39
|
-
continue
|
|
40
|
-
|
|
41
|
-
f.write(str_value)
|
|
42
|
-
f.write('\n')
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.source.synonyms.utils import iter_synonym_groups
|
|
2
|
-
from arekit.contrib.source.rusentrel.io_utils import RuSentRelIOUtils
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class RuSentRelSynonymsCollectionHelper(object):
|
|
6
|
-
|
|
7
|
-
@staticmethod
|
|
8
|
-
def iter_groups(version):
|
|
9
|
-
it = RuSentRelIOUtils.iter_from_zip(
|
|
10
|
-
inner_path=RuSentRelIOUtils.get_synonyms_innerpath(),
|
|
11
|
-
process_func=lambda input_file: iter_synonym_groups(
|
|
12
|
-
input_file,
|
|
13
|
-
desc="Loading RuSentRel Collection"),
|
|
14
|
-
version=version)
|
|
15
|
-
|
|
16
|
-
for group in it:
|
|
17
|
-
yield group
|
|
File without changes
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.collection import EntityCollection
|
|
2
|
-
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
|
|
3
|
-
from arekit.contrib.source.brat.annot import BratAnnotationParser
|
|
4
|
-
from arekit.contrib.source.brat.entities.entity import BratEntity
|
|
5
|
-
from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils
|
|
6
|
-
from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
|
|
7
|
-
from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class SentiNerelEntityCollection(EntityCollection):
|
|
11
|
-
|
|
12
|
-
def __init__(self, contents, value_to_group_id_func, entities_to_ignore=None):
|
|
13
|
-
"""
|
|
14
|
-
entities_to_ignore: list or None
|
|
15
|
-
this parameter is required because of the simplified implmentation of
|
|
16
|
-
the nested objects of the BRAT annotation.
|
|
17
|
-
"""
|
|
18
|
-
assert(isinstance(contents, dict))
|
|
19
|
-
assert(BratAnnotationParser.ENTITIES in contents)
|
|
20
|
-
assert(isinstance(entities_to_ignore, list) or entities_to_ignore is None)
|
|
21
|
-
|
|
22
|
-
self.__dicard_entities = set([] if entities_to_ignore is None else entities_to_ignore)
|
|
23
|
-
contents[BratAnnotationParser.ENTITIES] = [e for e in contents[BratAnnotationParser.ENTITIES]
|
|
24
|
-
if self.__keep_entity(e)]
|
|
25
|
-
|
|
26
|
-
super(SentiNerelEntityCollection, self).__init__(
|
|
27
|
-
entities=contents[BratAnnotationParser.ENTITIES],
|
|
28
|
-
value_to_group_id_func=value_to_group_id_func)
|
|
29
|
-
|
|
30
|
-
self._sort_entities(key=lambda entity: entity.IndexBegin)
|
|
31
|
-
|
|
32
|
-
def __keep_entity(self, entity):
|
|
33
|
-
assert(isinstance(entity, BratEntity))
|
|
34
|
-
return entity.Type not in self.__dicard_entities
|
|
35
|
-
|
|
36
|
-
@classmethod
|
|
37
|
-
def read_collection(cls, filename, version, entities_to_ignore=None):
|
|
38
|
-
assert(isinstance(filename, str))
|
|
39
|
-
|
|
40
|
-
# Since this dataset does not provide the synonyms collection by default,
|
|
41
|
-
# it is necessary to declare an empty collection to populate so in further.
|
|
42
|
-
synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)
|
|
43
|
-
|
|
44
|
-
return SentiNerelIOUtils.read_from_zip(
|
|
45
|
-
inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
|
|
46
|
-
process_func=lambda input_file: cls(
|
|
47
|
-
contents=BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig'),
|
|
48
|
-
entities_to_ignore=entities_to_ignore,
|
|
49
|
-
value_to_group_id_func=lambda value:
|
|
50
|
-
SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
|
|
51
|
-
synonyms, value)),
|
|
52
|
-
version=version)
|
|
File without changes
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.source.sentinerel.folding.fixed import create_fixed_folding_doc_ids
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class SentiNERELFoldingFactory:
|
|
5
|
-
""" Factory of the variety types of the splits that
|
|
6
|
-
are considered within the present experiments.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
@staticmethod
|
|
10
|
-
def create_fixed_folding(file, limit=None):
|
|
11
|
-
""" limit: int
|
|
12
|
-
Allows to limit amount of documents (utilized for testing reasons)
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
train_filenames, test_filenames = SentiNERELFoldingFactory._read_train_test(f=file)
|
|
16
|
-
if limit is not None:
|
|
17
|
-
train_filenames = train_filenames[:limit]
|
|
18
|
-
test_filenames = test_filenames[:limit]
|
|
19
|
-
filenames_by_ids, data_folding = create_fixed_folding_doc_ids(train_filenames=train_filenames,
|
|
20
|
-
test_filenames=test_filenames)
|
|
21
|
-
|
|
22
|
-
return filenames_by_ids, data_folding
|
|
23
|
-
|
|
24
|
-
@staticmethod
|
|
25
|
-
def _read_train_test(f):
|
|
26
|
-
parts = []
|
|
27
|
-
for line in f.readlines():
|
|
28
|
-
if isinstance(line, bytes):
|
|
29
|
-
line = line.decode('utf-8')
|
|
30
|
-
parts.append(line.strip().split(','))
|
|
31
|
-
return parts[0], parts[1]
|
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
from collections import OrderedDict
|
|
2
|
-
|
|
3
|
-
from arekit.common.experiment.data_type import DataType
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def create_fixed_folding_doc_ids(train_filenames, test_filenames):
|
|
7
|
-
""" Create fixed data-folding based on the predefined list of filenames,
|
|
8
|
-
written in file.
|
|
9
|
-
"""
|
|
10
|
-
assert(isinstance(train_filenames, list))
|
|
11
|
-
assert(isinstance(test_filenames, list))
|
|
12
|
-
|
|
13
|
-
filenames_by_ids = __create_filenames_by_ids(filenames=train_filenames + test_filenames)
|
|
14
|
-
|
|
15
|
-
ids_by_filenames = {}
|
|
16
|
-
for doc_id, filename in filenames_by_ids.items():
|
|
17
|
-
ids_by_filenames[filename] = doc_id
|
|
18
|
-
|
|
19
|
-
train_doc_ids = [ids_by_filenames[filename] for filename in train_filenames]
|
|
20
|
-
test_doc_ids = [ids_by_filenames[filename] for filename in test_filenames]
|
|
21
|
-
|
|
22
|
-
return {
|
|
23
|
-
DataType.Train: train_doc_ids,
|
|
24
|
-
DataType.Test: test_doc_ids,
|
|
25
|
-
DataType.Etalon: test_doc_ids,
|
|
26
|
-
DataType.Dev: test_doc_ids
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def __create_filenames_by_ids(filenames):
|
|
31
|
-
""" Indexing filenames
|
|
32
|
-
"""
|
|
33
|
-
|
|
34
|
-
def __create_new_id(default_id):
|
|
35
|
-
new_id = default_id
|
|
36
|
-
while new_id in filenames_by_ids:
|
|
37
|
-
new_id += 1
|
|
38
|
-
return new_id
|
|
39
|
-
|
|
40
|
-
default_id = 0
|
|
41
|
-
|
|
42
|
-
filenames_by_ids = OrderedDict()
|
|
43
|
-
for fname in filenames:
|
|
44
|
-
|
|
45
|
-
doc_id = __number_from_string(fname)
|
|
46
|
-
|
|
47
|
-
if doc_id is None:
|
|
48
|
-
doc_id = __create_new_id(default_id)
|
|
49
|
-
default_id = doc_id
|
|
50
|
-
|
|
51
|
-
filenames_by_ids[doc_id] = fname
|
|
52
|
-
|
|
53
|
-
return filenames_by_ids
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def __number_from_string(s):
|
|
57
|
-
assert(isinstance(s, str))
|
|
58
|
-
|
|
59
|
-
digit_chars_prefix = []
|
|
60
|
-
|
|
61
|
-
for chr in s:
|
|
62
|
-
if chr.isdigit():
|
|
63
|
-
digit_chars_prefix.append(chr)
|
|
64
|
-
else:
|
|
65
|
-
break
|
|
66
|
-
|
|
67
|
-
if len(digit_chars_prefix) == 0:
|
|
68
|
-
return None
|
|
69
|
-
|
|
70
|
-
return int("".join(digit_chars_prefix))
|
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
from enum import Enum
|
|
2
|
-
from os import path
|
|
3
|
-
from os.path import basename, join
|
|
4
|
-
|
|
5
|
-
import enum
|
|
6
|
-
|
|
7
|
-
from arekit.contrib.source.sentinerel.folding.factory import SentiNERELFoldingFactory
|
|
8
|
-
from arekit.contrib.source.zip_utils import ZipArchiveUtils
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class SentiNerelVersions(Enum):
|
|
12
|
-
""" List of the supported version of this collection
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
# Initial version.
|
|
16
|
-
V1 = "v1_0"
|
|
17
|
-
# Updated annotation within the second half of the texts. (September 2022)
|
|
18
|
-
V2 = "v2_0"
|
|
19
|
-
# Updated annotation within the first half of the texts. (October 2022)
|
|
20
|
-
# Become a source of the RuSentNE-2023 competition.
|
|
21
|
-
# https://github.com/dialogue-evaluation/RuSentNE-evaluation
|
|
22
|
-
V21 = "v2_1"
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
DEFAULT_VERSION = SentiNerelVersions.V21
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class SentiNerelIOUtils(ZipArchiveUtils):
|
|
29
|
-
|
|
30
|
-
inner_root = "sentiment_dataset"
|
|
31
|
-
|
|
32
|
-
@staticmethod
|
|
33
|
-
def get_archive_filepath(version):
|
|
34
|
-
return path.join(SentiNerelIOUtils.get_data_root(), "sentinerel-{}.zip".format(version))
|
|
35
|
-
|
|
36
|
-
@staticmethod
|
|
37
|
-
def get_annotation_innerpath(filename):
|
|
38
|
-
assert(isinstance(filename, str))
|
|
39
|
-
return path.join(SentiNerelIOUtils.inner_root, "{}.ann".format(filename))
|
|
40
|
-
|
|
41
|
-
@staticmethod
|
|
42
|
-
def get_doc_innerpath(filename):
|
|
43
|
-
assert(isinstance(filename, str))
|
|
44
|
-
return path.join(SentiNerelIOUtils.inner_root, "{}.txt".format(filename))
|
|
45
|
-
|
|
46
|
-
@staticmethod
|
|
47
|
-
def __iter_filenames_from_dataset(folder_name, version):
|
|
48
|
-
assert(isinstance(version, enum.Enum))
|
|
49
|
-
assert(isinstance(folder_name, str))
|
|
50
|
-
|
|
51
|
-
for filename in SentiNerelIOUtils.iter_filenames_from_zip(version):
|
|
52
|
-
|
|
53
|
-
extension = filename[-4:]
|
|
54
|
-
|
|
55
|
-
# Crop extension.
|
|
56
|
-
filename = filename[:-4]
|
|
57
|
-
|
|
58
|
-
if extension != ".txt":
|
|
59
|
-
continue
|
|
60
|
-
|
|
61
|
-
if not folder_name in filename:
|
|
62
|
-
continue
|
|
63
|
-
|
|
64
|
-
yield basename(filename)
|
|
65
|
-
|
|
66
|
-
# region public methods
|
|
67
|
-
|
|
68
|
-
@staticmethod
|
|
69
|
-
def iter_collection_filenames(version=DEFAULT_VERSION):
|
|
70
|
-
filenames_it = SentiNerelIOUtils.__iter_filenames_from_dataset(
|
|
71
|
-
folder_name=SentiNerelIOUtils.inner_root, version=version)
|
|
72
|
-
|
|
73
|
-
for doc_id, filename in enumerate(filenames_it):
|
|
74
|
-
yield doc_id, filename
|
|
75
|
-
|
|
76
|
-
@staticmethod
|
|
77
|
-
def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
|
|
78
|
-
""" Provides a fixed split of the dataset onto
|
|
79
|
-
`test` and `training` part:
|
|
80
|
-
https://github.com/nicolay-r/SentiNEREL-attitude-extraction
|
|
81
|
-
"""
|
|
82
|
-
return SentiNerelIOUtils.read_from_zip(
|
|
83
|
-
inner_path=join(SentiNerelIOUtils.inner_root, "split_fixed.txt"),
|
|
84
|
-
process_func=lambda f: SentiNERELFoldingFactory.create_fixed_folding(file=f, limit=docs_limit),
|
|
85
|
-
version=version)
|
|
86
|
-
|
|
87
|
-
# endregion
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.base import Label
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class OpinionBelongsTo(Label):
|
|
5
|
-
pass
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class OpinionRelatesTo(Label):
|
|
9
|
-
pass
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class NegEffectFrom(Label):
|
|
13
|
-
pass
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class NegStateFrom(Label):
|
|
17
|
-
pass
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class PosEffectFrom(Label):
|
|
21
|
-
pass
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class PosAuthorFrom(Label):
|
|
25
|
-
pass
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class NegAuthorFrom(Label):
|
|
29
|
-
pass
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
class PosStateFrom(Label):
|
|
33
|
-
pass
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class NegativeTo(Label):
|
|
37
|
-
pass
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class PositiveTo(Label):
|
|
41
|
-
pass
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class AlternativeName(Label):
|
|
45
|
-
pass
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class StateBelongsTo(Label):
|
|
49
|
-
pass
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class OriginsFrom(Label):
|
|
53
|
-
pass
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
from collections import OrderedDict
|
|
2
|
-
|
|
3
|
-
from arekit.common.labels.scaler.base import BaseLabelScaler
|
|
4
|
-
from arekit.contrib.source.sentinerel import labels
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class SentiNerelLabelScaler(BaseLabelScaler):
|
|
8
|
-
""" This is a complete label scaler of all the labels supported by NEREL dataset.
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
def __init__(self):
|
|
12
|
-
|
|
13
|
-
self.__uint_to_label_dict = OrderedDict([
|
|
14
|
-
(labels.OpinionBelongsTo(), 0),
|
|
15
|
-
(labels.OpinionRelatesTo(), 1),
|
|
16
|
-
(labels.NegEffectFrom(), 2),
|
|
17
|
-
(labels.PosEffectFrom(), 3),
|
|
18
|
-
(labels.NegStateFrom(), 4),
|
|
19
|
-
(labels.PosStateFrom(), 5),
|
|
20
|
-
(labels.NegativeTo(), 6),
|
|
21
|
-
(labels.PositiveTo(), 7),
|
|
22
|
-
(labels.StateBelongsTo(), 8),
|
|
23
|
-
(labels.PosAuthorFrom(), 9),
|
|
24
|
-
(labels.NegAuthorFrom(), 10),
|
|
25
|
-
(labels.AlternativeName(), 11),
|
|
26
|
-
(labels.OriginsFrom(), 12)
|
|
27
|
-
])
|
|
28
|
-
|
|
29
|
-
super(SentiNerelLabelScaler, self).__init__(int_dict=self.__uint_to_label_dict,
|
|
30
|
-
uint_dict=self.__uint_to_label_dict)
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.source.brat.annot import BratAnnotationParser
|
|
2
|
-
from arekit.contrib.source.brat.doc import BratDocument
|
|
3
|
-
from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
|
|
4
|
-
from arekit.contrib.source.sentinerel.entities import SentiNerelEntityCollection
|
|
5
|
-
from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils, DEFAULT_VERSION
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class SentiNerelDocReader(object):
    """ Reader of the SentiNEREL documents; all the contents are
        accessed via `SentiNerelIOUtils.read_from_zip`.
    """

    @staticmethod
    def read_text_relations(filename, version):
        """ Provides the list of the relations, parsed from the annotation
            that corresponds to the given `filename`.
        """
        assert isinstance(filename, str)

        def parse_relations(input_file):
            annotations = BratAnnotationParser.parse_annotations(
                input_file=input_file, encoding='utf-8-sig')
            return list(annotations["relations"])

        return SentiNerelIOUtils.read_from_zip(
            inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
            process_func=parse_relations,
            version=version)

    @staticmethod
    def read_document(filename, doc_id, version=DEFAULT_VERSION, entities_to_ignore=None):
        """ Composes a `BratDocument` out of the sentences, entities
            and text relations which correspond to the given `filename`.

            entities_to_ignore:
                list of the entity types passed to the entities collection reader;
                when None, the default list declared below is used.
        """
        assert isinstance(filename, str)
        assert isinstance(doc_id, int)

        # TODO. #398 issue -- in some cases entities might be nested. Therefore we limit the set
        # TODO. of the potential named entities.
        if entities_to_ignore is None:
            eti = ["EFFECT_NEG", "EFFECT_POS", "ARGUMENT_NEG", "ARGUMENT_POS", "EVENT"]
        else:
            eti = entities_to_ignore

        entities = SentiNerelEntityCollection.read_collection(
            filename=filename, version=version, entities_to_ignore=eti)
        text_relations = SentiNerelDocReader.read_text_relations(
            filename=filename, version=version)

        def file_to_doc(input_file):
            # Relies on the `entities` and `text_relations` obtained above.
            sentences = BratDocumentSentencesReader.from_file(
                input_file=input_file, entities=entities)
            return BratDocument(doc_id=doc_id,
                                sentences=sentences,
                                text_relations=text_relations)

        return SentiNerelIOUtils.read_from_zip(
            inner_path=SentiNerelIOUtils.get_doc_innerpath(filename=filename),
            process_func=file_to_doc,
            version=version)
|
|
File without changes
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
from arekit.common.utils import progress_bar_defined
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def iter_synonym_groups(input_file, sep=",", desc=""):
    """ Iterates over the groups of synonyms: every line of the `input_file`
        is treated as a single group with the entries separated by `sep`.
        Entries are provided as-is, i.e. no stripping is applied.
    """
    rows = input_file.readlines()

    for row in progress_bar_defined(rows, total=len(rows), desc=desc, unit="opins"):
        # Lines of a binary stream have to be decoded before splitting.
        text = row.decode() if isinstance(row, bytes) else row
        yield text.split(sep)
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
import zipfile
|
|
2
|
-
|
|
3
|
-
import enum
|
|
4
|
-
|
|
5
|
-
from arekit.common import utils
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class ZipArchiveUtils(object):
    """ Helpers for reading the contents of a versioned zip archive.
        The archive location is provided by `get_archive_filepath`,
        which is not implemented in this class.
    """

    @staticmethod
    def get_archive_filepath(version):
        """ Path of the archive for the given version value (not implemented here).
        """
        raise NotImplementedError()

    @classmethod
    def read_from_zip(cls, inner_path, process_func, version):
        """ Opens `inner_path` of the archive and returns the result of `process_func`.

            process_func:
                func which receives a file reader
        """
        assert isinstance(inner_path, str)
        assert callable(process_func)
        assert isinstance(version, enum.Enum)

        archive_path = cls.get_archive_filepath(version.value)
        with zipfile.ZipFile(archive_path, "r") as archive, \
                archive.open(inner_path, mode='r') as inner_file:
            return process_func(inner_file)

    @classmethod
    def iter_from_zip(cls, inner_path, process_func, version):
        """ Opens `inner_path` of the archive and yields every item
            provided by `process_func` for the related file reader.
        """
        assert isinstance(inner_path, str)
        assert callable(process_func)
        assert isinstance(version, enum.Enum)

        archive_path = cls.get_archive_filepath(version.value)
        with zipfile.ZipFile(archive_path, "r") as archive, \
                archive.open(inner_path, mode='r') as inner_file:
            yield from process_func(inner_file)

    @classmethod
    def iter_filenames_from_zip(cls, version):
        """ Provides an iterator over the names of the archive members.
        """
        assert isinstance(version, enum.Enum)

        with zipfile.ZipFile(cls.get_archive_filepath(version.value), "r") as archive:
            filenames = archive.namelist()

        return iter(filenames)

    @staticmethod
    def get_data_root():
        """ Root directory of the data, as declared by `utils.get_default_download_dir`.
        """
        return utils.get_default_download_dir()
|
|
File without changes
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
from arekit.common.frames.connotations.provider import FrameConnotationProvider
|
|
2
|
-
from arekit.contrib.source.rusentiframes.collection import RuSentiFramesCollection
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class RuSentiFramesConnotationProvider(FrameConnotationProvider):
    """ Connotation provider that relies on the A0->A1 label type
        of the RuSentiFrames collection.
        For greater details, check out the related collection at:
        https://github.com/nicolay-r/RuSentiFrames

        Papers:
            [1] Natalia Loukachevitch, Nicolay Rusnachenko: Sentiment Frames
                for Attitude Extraction in Russian, 2020
            [2] Distant Supervision for Sentiment Attitude Extraction, 2019
    """

    def __init__(self, collection):
        assert isinstance(collection, RuSentiFramesCollection)
        self.__collection = collection

    def try_provide(self, frame_id):
        """ Requests the collection for the polarity of the frame `frame_id`,
            considered from the role 'a0' towards the role 'a1'.
        """
        roles = dict(role_src='a0', role_dest='a1')
        return self.__collection.try_get_frame_polarity(frame_id=frame_id, **roles)
|
|
File without changes
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
import importlib
|
|
2
|
-
|
|
3
|
-
from arekit.contrib.utils.data.readers.base import BaseReader
|
|
4
|
-
from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class PandasCsvReader(BaseReader):
    """ Represents a CSV-based reader, implemented via pandas API.
    """

    def __init__(self, sep='\t', header='infer', compression='infer', encoding='utf-8', col_types=None,
                 custom_extension=None):
        self.__sep = sep
        self.__header = header
        self.__compression = compression
        self.__encoding = encoding
        self.__custom_extension = custom_extension

        # Special assignation of types for certain columns (empty when not provided).
        self.__col_types = dict() if col_types is None else col_types

    def extension(self):
        """ Extension of the target files: the custom one, or ".tsv.gz" by default.
        """
        if self.__custom_extension is not None:
            return self.__custom_extension
        return ".tsv.gz"

    def __from_csv(self, filepath):
        # pandas is imported on demand, at the time of the actual reading.
        pandas = importlib.import_module("pandas")
        options = dict(sep=self.__sep,
                       encoding=self.__encoding,
                       compression=self.__compression,
                       dtype=self.__col_types,
                       header=self.__header)
        return pandas.read_csv(filepath, **options)

    def read(self, target):
        """ Reads the `target` file and wraps the contents into `PandasBasedRowsStorage`.
        """
        return PandasBasedRowsStorage(self.__from_csv(filepath=target))
|