arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/context/terms_mapper.py +2 -2
- arekit/common/data/const.py +5 -4
- arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
- arekit/common/data/input/providers/columns/sample.py +6 -1
- arekit/common/data/input/providers/instances/base.py +1 -1
- arekit/common/data/input/providers/rows/base.py +36 -13
- arekit/common/data/input/providers/rows/samples.py +57 -55
- arekit/common/data/input/providers/sample/cropped.py +2 -2
- arekit/common/data/input/sample.py +1 -1
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/base.py +23 -18
- arekit/common/data/views/samples.py +2 -8
- arekit/common/{news → docs}/base.py +2 -2
- arekit/common/{news → docs}/entities_grouping.py +2 -1
- arekit/common/{news → docs}/entity.py +2 -1
- arekit/common/{news → docs}/parsed/base.py +5 -5
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
- arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
- arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
- arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parser.py +66 -0
- arekit/common/{news → docs}/sentence.py +1 -1
- arekit/common/entities/base.py +11 -2
- arekit/common/experiment/api/base_samples_io.py +1 -1
- arekit/common/frames/variants/collection.py +2 -2
- arekit/common/linkage/base.py +2 -2
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +1 -1
- arekit/common/linkage/text_opinions.py +2 -2
- arekit/common/opinions/annot/algo/base.py +1 -1
- arekit/common/opinions/annot/algo/pair_based.py +15 -13
- arekit/common/opinions/annot/algo/predefined.py +4 -4
- arekit/common/opinions/annot/algo_based.py +5 -5
- arekit/common/opinions/annot/base.py +3 -3
- arekit/common/opinions/base.py +7 -7
- arekit/common/opinions/collection.py +3 -3
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +38 -1
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/service/sqlite.py +36 -0
- arekit/common/synonyms/base.py +2 -2
- arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
- arekit/common/text_opinions/base.py +11 -11
- arekit/common/utils.py +33 -46
- arekit/contrib/networks/embedding.py +3 -3
- arekit/contrib/networks/embedding_io.py +5 -5
- arekit/contrib/networks/input/const.py +0 -2
- arekit/contrib/networks/input/providers/sample.py +15 -29
- arekit/contrib/networks/input/rows_parser.py +47 -134
- arekit/contrib/prompt/sample.py +18 -16
- arekit/contrib/utils/data/contents/opinions.py +17 -5
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
- arekit/contrib/utils/data/readers/base.py +3 -0
- arekit/contrib/utils/data/readers/csv_pd.py +10 -4
- arekit/contrib/utils/data/readers/jsonl.py +3 -0
- arekit/contrib/utils/data/readers/sqlite.py +14 -0
- arekit/contrib/utils/data/service/balance.py +0 -1
- arekit/contrib/utils/data/storages/pandas_based.py +3 -5
- arekit/contrib/utils/data/storages/row_cache.py +18 -6
- arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
- arekit/contrib/utils/data/writers/base.py +5 -0
- arekit/contrib/utils/data/writers/csv_native.py +3 -0
- arekit/contrib/utils/data/writers/csv_pd.py +3 -0
- arekit/contrib/utils/data/writers/json_opennre.py +31 -13
- arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
- arekit/contrib/utils/io_utils/embedding.py +25 -33
- arekit/contrib/utils/io_utils/utils.py +3 -24
- arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
- arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
- arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
- arekit/contrib/utils/serializer.py +4 -23
- arekit-0.25.0.data/data/logo.png +0 -0
- arekit-0.25.0.dist-info/METADATA +82 -0
- arekit-0.25.0.dist-info/RECORD +259 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
- arekit/common/data/row_ids/base.py +0 -79
- arekit/common/data/row_ids/binary.py +0 -38
- arekit/common/data/row_ids/multiple.py +0 -14
- arekit/common/folding/base.py +0 -36
- arekit/common/folding/fixed.py +0 -42
- arekit/common/folding/nofold.py +0 -15
- arekit/common/folding/united.py +0 -46
- arekit/common/news/objects_parser.py +0 -37
- arekit/common/news/parsed/providers/base.py +0 -48
- arekit/common/news/parsed/service.py +0 -31
- arekit/common/news/parser.py +0 -34
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -83
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -33
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/news.py +0 -28
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -75
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/news.py +0 -51
- arekit/contrib/source/ruattitudes/news_brat.py +0 -44
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -57
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/news_reader.py +0 -51
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -32
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/bert/rows.py +0 -0
- arekit/contrib/utils/bert/text_b_rus.py +0 -18
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/cv/__init__.py +0 -0
- arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
- arekit/contrib/utils/cv/doc_stat/base.py +0 -37
- arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
- arekit/contrib/utils/cv/splitters/__init__.py +0 -0
- arekit/contrib/utils/cv/splitters/base.py +0 -4
- arekit/contrib/utils/cv/splitters/default.py +0 -53
- arekit/contrib/utils/cv/splitters/statistical.py +0 -57
- arekit/contrib/utils/cv/two_class.py +0 -77
- arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
- arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
- arekit/contrib/utils/data/ext.py +0 -31
- arekit/contrib/utils/data/views/__init__.py +0 -0
- arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
- arekit/contrib/utils/data/views/linkages/base.py +0 -58
- arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
- arekit/contrib/utils/data/views/linkages/utils.py +0 -24
- arekit/contrib/utils/data/views/opinions.py +0 -14
- arekit/contrib/utils/download.py +0 -78
- arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
- arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
- arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
- arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
- arekit/contrib/utils/io_utils/opinions.py +0 -39
- arekit/contrib/utils/io_utils/samples.py +0 -78
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -43
- arekit/contrib/utils/lexicons/relation.py +0 -45
- arekit/contrib/utils/lexicons/rusentilex.py +0 -34
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/to_output.py +0 -101
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/resources.py +0 -26
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/utils_folding.py +0 -19
- arekit/download_data.py +0 -11
- arekit-0.23.1.dist-info/METADATA +0 -23
- arekit-0.23.1.dist-info/RECORD +0 -403
- /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
- /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
- /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
- /arekit/common/{news → docs}/parsed/term_position.py +0 -0
- /arekit/common/{news/parsed → service}/__init__.py +0 -0
- /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
|
@@ -1,134 +1,47 @@
|
|
|
1
|
-
|
|
2
|
-
from arekit.common.
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
network_input_const.SynonymSubject: lambda value: __process_indices_list(value),
|
|
49
|
-
network_input_const.PosTags: lambda value: __process_int_values_list(value)
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
class ParsedSampleRow(object):
|
|
54
|
-
""" Provides a parsed information for a sample row.
|
|
55
|
-
"""
|
|
56
|
-
|
|
57
|
-
def __init__(self, row):
|
|
58
|
-
""" row: dict
|
|
59
|
-
dict of the pairs ("field_name", value)
|
|
60
|
-
"""
|
|
61
|
-
assert(isinstance(row, dict))
|
|
62
|
-
|
|
63
|
-
self.__uint_label = None
|
|
64
|
-
self.__params = {}
|
|
65
|
-
|
|
66
|
-
for key, value in row.items():
|
|
67
|
-
|
|
68
|
-
if key == const.LABEL:
|
|
69
|
-
self.__uint_label = int(value)
|
|
70
|
-
# TODO: To be adopted in future instead of __uint_label
|
|
71
|
-
self.__params[key] = value
|
|
72
|
-
continue
|
|
73
|
-
|
|
74
|
-
if key not in parse_value:
|
|
75
|
-
continue
|
|
76
|
-
|
|
77
|
-
self.__params[key] = parse_value[key](value)
|
|
78
|
-
|
|
79
|
-
def __value_or_none(self, key):
|
|
80
|
-
return self.__params[key] if key in self.__params else no_value()
|
|
81
|
-
|
|
82
|
-
@property
|
|
83
|
-
def SampleID(self):
|
|
84
|
-
return self.__params[const.ID]
|
|
85
|
-
|
|
86
|
-
@property
|
|
87
|
-
def Terms(self):
|
|
88
|
-
return self.__params[const.TEXT]
|
|
89
|
-
|
|
90
|
-
@property
|
|
91
|
-
def SubjectIndex(self):
|
|
92
|
-
return self.__params[const.S_IND]
|
|
93
|
-
|
|
94
|
-
@property
|
|
95
|
-
def ObjectIndex(self):
|
|
96
|
-
return self.__params[const.T_IND]
|
|
97
|
-
|
|
98
|
-
@property
|
|
99
|
-
def UintLabel(self):
|
|
100
|
-
return self.__uint_label
|
|
101
|
-
|
|
102
|
-
@property
|
|
103
|
-
def PartOfSpeechTags(self):
|
|
104
|
-
return self.__value_or_none(network_input_const.PosTags)
|
|
105
|
-
|
|
106
|
-
@property
|
|
107
|
-
def TextFrameVariantIndices(self):
|
|
108
|
-
return self.__value_or_none(network_input_const.FrameVariantIndices)
|
|
109
|
-
|
|
110
|
-
@property
|
|
111
|
-
def TextFrameConnotations(self):
|
|
112
|
-
return self.__value_or_none(network_input_const.FrameConnotations)
|
|
113
|
-
|
|
114
|
-
@property
|
|
115
|
-
def EntityInds(self):
|
|
116
|
-
return self.__value_or_none(const.ENTITIES)
|
|
117
|
-
|
|
118
|
-
@property
|
|
119
|
-
def SynonymObjectInds(self):
|
|
120
|
-
return self.__value_or_none(network_input_const.SynonymObject)
|
|
121
|
-
|
|
122
|
-
@property
|
|
123
|
-
def SynonymSubjectInds(self):
|
|
124
|
-
return self.__value_or_none(network_input_const.SynonymSubject)
|
|
125
|
-
|
|
126
|
-
def __getitem__(self, item):
|
|
127
|
-
assert (isinstance(item, str) or item is None)
|
|
128
|
-
if item not in self.__params:
|
|
129
|
-
return no_value()
|
|
130
|
-
return self.__params[item] if item is not None else no_value()
|
|
131
|
-
|
|
132
|
-
@classmethod
|
|
133
|
-
def parse(cls, row):
|
|
134
|
-
return cls(row=row)
|
|
1
|
+
import arekit.contrib.networks.input.const as const
|
|
2
|
+
from arekit.common.data.rows_fmt import process_indices_list
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def create_nn_column_formatters(no_value_func=lambda: None, args_sep=","):
|
|
6
|
+
assert(callable(no_value_func))
|
|
7
|
+
|
|
8
|
+
empty_list = []
|
|
9
|
+
|
|
10
|
+
def str_to_list(value):
|
|
11
|
+
return process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
|
|
12
|
+
|
|
13
|
+
def list_to_str(inds_iter):
|
|
14
|
+
return args_sep.join([str(i) for i in inds_iter])
|
|
15
|
+
|
|
16
|
+
return {
|
|
17
|
+
const.FrameVariantIndices: {
|
|
18
|
+
"writer": lambda value: list_to_str(value),
|
|
19
|
+
"parser": lambda value: process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
|
|
20
|
+
if isinstance(value, str) else empty_list
|
|
21
|
+
},
|
|
22
|
+
const.FrameConnotations: {
|
|
23
|
+
"writer": lambda value: list_to_str(value),
|
|
24
|
+
"parser": lambda value: process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
|
|
25
|
+
if isinstance(value, str) else empty_list
|
|
26
|
+
},
|
|
27
|
+
const.SynonymObject: {
|
|
28
|
+
"writer": lambda value: list_to_str(value),
|
|
29
|
+
"parser": lambda value: process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
|
|
30
|
+
},
|
|
31
|
+
const.SynonymSubject: {
|
|
32
|
+
"writer": lambda value: list_to_str(value),
|
|
33
|
+
"parser": lambda value: process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
|
|
34
|
+
},
|
|
35
|
+
const.PosTags: {
|
|
36
|
+
"writer": lambda value: list_to_str(value),
|
|
37
|
+
"parser": lambda value: str_to_list(value)
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def create_nn_val_writer_fmt(fmt_type, args_sep=","):
|
|
43
|
+
assert(isinstance(fmt_type, str))
|
|
44
|
+
d = create_nn_column_formatters(args_sep=args_sep)
|
|
45
|
+
for k, v in d.items():
|
|
46
|
+
d[k] = v[fmt_type]
|
|
47
|
+
return d
|
arekit/contrib/prompt/sample.py
CHANGED
|
@@ -28,32 +28,34 @@ class PromptedSampleRowProvider(CroppedSampleRowProvider):
|
|
|
28
28
|
self.__labels_fmt = label_fmt
|
|
29
29
|
|
|
30
30
|
def _fill_row_core(self, row, text_opinion_linkage, index_in_linked, etalon_label,
|
|
31
|
-
|
|
31
|
+
parsed_doc, sentence_ind, s_ind, t_ind):
|
|
32
32
|
|
|
33
33
|
super(PromptedSampleRowProvider, self)._fill_row_core(row=row,
|
|
34
34
|
text_opinion_linkage=text_opinion_linkage,
|
|
35
35
|
index_in_linked=index_in_linked,
|
|
36
36
|
etalon_label=etalon_label,
|
|
37
|
-
|
|
37
|
+
parsed_doc=parsed_doc,
|
|
38
38
|
sentence_ind=sentence_ind,
|
|
39
39
|
s_ind=s_ind,
|
|
40
40
|
t_ind=t_ind)
|
|
41
41
|
original_text = row[BaseSingleTextProvider.TEXT_A]
|
|
42
42
|
|
|
43
43
|
sentence_terms, actual_s_ind, actual_t_ind = self._provide_sentence_terms(
|
|
44
|
-
|
|
44
|
+
parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
|
|
45
45
|
|
|
46
|
-
label_uint = row[const.
|
|
46
|
+
label_uint = row[const.LABEL_UINT] if const.LABEL_UINT in row else None
|
|
47
47
|
label_val = str(label_uint) if label_uint is None or self.__labels_fmt is None else \
|
|
48
|
-
self.__labels_fmt.label_to_str(self._label_provider.LabelScaler.uint_to_label(row[const.
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
48
|
+
self.__labels_fmt.label_to_str(self._label_provider.LabelScaler.uint_to_label(row[const.LABEL_UINT]))
|
|
49
|
+
|
|
50
|
+
vm = {
|
|
51
|
+
const.TEXT: self.__prompt.format(
|
|
52
|
+
text=original_text,
|
|
53
|
+
s_ind=row[const.S_IND],
|
|
54
|
+
t_ind=row[const.T_IND],
|
|
55
|
+
s_val=sentence_terms[actual_s_ind].DisplayValue,
|
|
56
|
+
t_val=sentence_terms[actual_t_ind].DisplayValue,
|
|
57
|
+
label_uint=label_uint,
|
|
58
|
+
label_val=label_val)
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
self._apply_row_data(row=row, vm=vm, val_fmt=self._val_fmt)
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
from arekit.common.data.input.providers.const import IDLE_MODE
|
|
2
2
|
from arekit.common.data.input.providers.contents import ContentsProvider
|
|
3
|
+
from arekit.common.linkage.base import LinkedDataWrapper
|
|
3
4
|
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
|
|
4
|
-
from arekit.common.pipeline.base import
|
|
5
|
+
from arekit.common.pipeline.base import BasePipelineLauncher
|
|
6
|
+
from arekit.common.pipeline.context import PipelineContext
|
|
5
7
|
from arekit.common.text_opinions.base import TextOpinion
|
|
6
8
|
|
|
7
9
|
|
|
@@ -12,7 +14,7 @@ class InputTextOpinionProvider(ContentsProvider):
|
|
|
12
14
|
results in a TextOpinionLinkage instances.
|
|
13
15
|
pipeline: id -> ... -> TextOpinionLinkage[]
|
|
14
16
|
"""
|
|
15
|
-
assert(isinstance(pipeline,
|
|
17
|
+
assert(isinstance(pipeline, list))
|
|
16
18
|
self.__pipeline = pipeline
|
|
17
19
|
self.__current_id = None
|
|
18
20
|
|
|
@@ -29,7 +31,17 @@ class InputTextOpinionProvider(ContentsProvider):
|
|
|
29
31
|
|
|
30
32
|
def from_doc_ids(self, doc_ids, idle_mode=False):
|
|
31
33
|
self.__current_id = 0
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
34
|
+
|
|
35
|
+
ctx = PipelineContext(d={
|
|
36
|
+
"result": doc_ids,
|
|
37
|
+
IDLE_MODE: idle_mode
|
|
38
|
+
})
|
|
39
|
+
|
|
40
|
+
# Launching pipeline with the passed context
|
|
41
|
+
BasePipelineLauncher.run(pipeline=self.__pipeline, pipeline_ctx=ctx)
|
|
42
|
+
|
|
43
|
+
for linkage in ctx.provide("result"):
|
|
44
|
+
assert(isinstance(linkage, LinkedDataWrapper))
|
|
45
|
+
if isinstance(linkage, TextOpinionsLinkage):
|
|
46
|
+
self.__assign_ids(linkage)
|
|
35
47
|
yield linkage
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DictionaryBasedDocumentProvider(DocumentProvider):
|
|
5
|
+
|
|
6
|
+
def __init__(self, d):
|
|
7
|
+
assert(isinstance(d, dict))
|
|
8
|
+
super(DictionaryBasedDocumentProvider, self).__init__()
|
|
9
|
+
self.__d = d
|
|
10
|
+
|
|
11
|
+
def by_id(self, doc_id):
|
|
12
|
+
assert(isinstance(doc_id, int))
|
|
13
|
+
return self.__d[doc_id]
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
from os.path import join
|
|
2
2
|
|
|
3
|
-
from arekit.common.
|
|
4
|
-
from arekit.common.
|
|
5
|
-
from arekit.common.
|
|
3
|
+
from arekit.common.data.doc_provider import DocumentProvider
|
|
4
|
+
from arekit.common.docs.base import Document
|
|
5
|
+
from arekit.common.docs.sentence import BaseDocumentSentence
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
class
|
|
9
|
-
""" Document
|
|
8
|
+
class DirectoryFilesDocProvider(DocumentProvider):
|
|
9
|
+
""" Document Providers based on the list of provided file paths
|
|
10
10
|
for the particular directory.
|
|
11
11
|
"""
|
|
12
12
|
|
|
@@ -36,10 +36,10 @@ class DirectoryFilesDocOperations(DocumentOperations):
|
|
|
36
36
|
"""
|
|
37
37
|
# setup input data.
|
|
38
38
|
sentences = self.__sentence_parser(contents)
|
|
39
|
-
sentences = list(map(lambda text:
|
|
39
|
+
sentences = list(map(lambda text: BaseDocumentSentence(text), sentences))
|
|
40
40
|
|
|
41
41
|
# Parse text.
|
|
42
|
-
return
|
|
42
|
+
return Document(doc_id=doc_id, sentences=sentences)
|
|
43
43
|
|
|
44
44
|
def by_id(self, doc_id):
|
|
45
45
|
""" Perform reading operation of the document.
|
|
@@ -1,23 +1,29 @@
|
|
|
1
1
|
import importlib
|
|
2
|
+
|
|
2
3
|
from arekit.contrib.utils.data.readers.base import BaseReader
|
|
3
4
|
from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
class PandasCsvReader(BaseReader):
|
|
7
|
-
""" Represents a CSV-based reader,
|
|
8
|
+
""" Represents a CSV-based reader, implmented via pandas API.
|
|
8
9
|
"""
|
|
9
10
|
|
|
10
|
-
def __init__(self, sep='\t', header='infer', compression='infer', encoding='utf-8', col_types=None
|
|
11
|
+
def __init__(self, sep='\t', header='infer', compression='infer', encoding='utf-8', col_types=None,
|
|
12
|
+
custom_extension=None):
|
|
11
13
|
self.__sep = sep
|
|
12
14
|
self.__compression = compression
|
|
13
15
|
self.__encoding = encoding
|
|
14
16
|
self.__header = header
|
|
17
|
+
self.__custom_extension = custom_extension
|
|
15
18
|
|
|
16
|
-
#
|
|
19
|
+
# Special assignation of types for certain columns.
|
|
17
20
|
self.__col_types = col_types
|
|
18
21
|
if self.__col_types is None:
|
|
19
22
|
self.__col_types = dict()
|
|
20
23
|
|
|
24
|
+
def extension(self):
|
|
25
|
+
return ".tsv.gz" if self.__custom_extension is None else self.__custom_extension
|
|
26
|
+
|
|
21
27
|
def __from_csv(self, filepath):
|
|
22
28
|
pd = importlib.import_module("pandas")
|
|
23
29
|
return pd.read_csv(filepath,
|
|
@@ -29,4 +35,4 @@ class PandasCsvReader(BaseReader):
|
|
|
29
35
|
|
|
30
36
|
def read(self, target):
|
|
31
37
|
df = self.__from_csv(filepath=target)
|
|
32
|
-
return PandasBasedRowsStorage(df)
|
|
38
|
+
return PandasBasedRowsStorage(df)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from arekit.contrib.utils.data.readers.base import BaseReader
|
|
2
|
+
from arekit.contrib.utils.data.storages.sqlite_based import SQliteBasedRowsStorage
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class SQliteReader(BaseReader):
|
|
6
|
+
|
|
7
|
+
def __init__(self, table_name):
|
|
8
|
+
self.__table_name = table_name
|
|
9
|
+
|
|
10
|
+
def extension(self):
|
|
11
|
+
return ".sqlite"
|
|
12
|
+
|
|
13
|
+
def read(self, target):
|
|
14
|
+
return SQliteBasedRowsStorage(path=target, table_name=self.__table_name)
|
|
@@ -48,6 +48,9 @@ class PandasBasedRowsStorage(BaseRowsStorage):
|
|
|
48
48
|
def iter_column_names(self):
|
|
49
49
|
return iter(self._df.columns)
|
|
50
50
|
|
|
51
|
+
def iter_column_types(self):
|
|
52
|
+
return iter(self._df.dtypes)
|
|
53
|
+
|
|
51
54
|
def _set_row_value(self, row_ind, column, value):
|
|
52
55
|
self._df.at[row_ind, column] = value
|
|
53
56
|
|
|
@@ -105,11 +108,6 @@ class PandasBasedRowsStorage(BaseRowsStorage):
|
|
|
105
108
|
def find_by_value(self, column_name, value):
|
|
106
109
|
return self.__filter(column_name=column_name, value=value)
|
|
107
110
|
|
|
108
|
-
def find_first_by_value(self, column_name, value):
|
|
109
|
-
# TODO. Return new storage. (Encapsulation)
|
|
110
|
-
rows = self.__filter(column_name=column_name, value=value)
|
|
111
|
-
return rows.iloc[0]
|
|
112
|
-
|
|
113
111
|
def init_empty(self, columns_provider):
|
|
114
112
|
cols_with_types = columns_provider.get_columns_list_with_types()
|
|
115
113
|
self._df = self.__create_empty(cols_with_types)
|
|
@@ -15,7 +15,8 @@ class RowCacheStorage(BaseRowsStorage):
|
|
|
15
15
|
assert(isinstance(force_collect_columns, list) or force_collect_columns is None)
|
|
16
16
|
self.__f = None
|
|
17
17
|
self.__row_cache = {}
|
|
18
|
-
self.
|
|
18
|
+
self.__column_names = []
|
|
19
|
+
self.__column_types = []
|
|
19
20
|
self.__force_collect_columns = [] if force_collect_columns is None else force_collect_columns
|
|
20
21
|
|
|
21
22
|
@property
|
|
@@ -24,15 +25,26 @@ class RowCacheStorage(BaseRowsStorage):
|
|
|
24
25
|
|
|
25
26
|
def init_empty(self, columns_provider):
|
|
26
27
|
assert (isinstance(columns_provider, BaseColumnsProvider))
|
|
27
|
-
|
|
28
|
-
|
|
28
|
+
|
|
29
|
+
self.__column_names.clear()
|
|
30
|
+
for col_name, col_type in columns_provider.get_columns_list_with_types():
|
|
31
|
+
self.__column_names.append(col_name)
|
|
32
|
+
self.__column_types.append(col_type)
|
|
29
33
|
|
|
30
34
|
# Expand with columns that are forced to be provided.
|
|
31
|
-
existed_set = set(self.
|
|
32
|
-
|
|
35
|
+
existed_set = set(self.__column_names)
|
|
36
|
+
|
|
37
|
+
# Calculate extension: columns that were not mentioned in column names list.
|
|
38
|
+
extension = [c for c in self.__force_collect_columns if c not in existed_set]
|
|
39
|
+
|
|
40
|
+
self.__column_names += extension
|
|
41
|
+
self.__column_types += [str] * len(extension)
|
|
33
42
|
|
|
34
43
|
def iter_column_names(self):
|
|
35
|
-
return iter(self.
|
|
44
|
+
return iter(self.__column_names)
|
|
45
|
+
|
|
46
|
+
def iter_column_types(self):
|
|
47
|
+
return iter(self.__column_types)
|
|
36
48
|
|
|
37
49
|
def _set_row_value(self, row_ind, column, value):
|
|
38
50
|
self.__row_cache[column] = value
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import sqlite3
|
|
2
|
+
from arekit.common.data.storages.base import BaseRowsStorage
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class SQliteBasedRowsStorage(BaseRowsStorage):
|
|
6
|
+
|
|
7
|
+
def __init__(self, path, table_name):
|
|
8
|
+
self.__path = path
|
|
9
|
+
self.__table_name = table_name
|
|
10
|
+
self.__conn = None
|
|
11
|
+
|
|
12
|
+
def _iter_rows(self):
|
|
13
|
+
with sqlite3.connect(self.__path) as conn:
|
|
14
|
+
cursor = conn.execute(f"select * from {self.__table_name}")
|
|
15
|
+
for row_index, row in enumerate(cursor.fetchall()):
|
|
16
|
+
row_dict = {cursor.description[i][0]: value for i, value in enumerate(row)}
|
|
17
|
+
yield row_index, row_dict
|
|
@@ -15,6 +15,9 @@ class PandasCsvWriter(BaseWriter):
|
|
|
15
15
|
super(PandasCsvWriter, self).__init__()
|
|
16
16
|
self.__write_header = write_header
|
|
17
17
|
|
|
18
|
+
def extension(self):
|
|
19
|
+
return ".tsv.gz"
|
|
20
|
+
|
|
18
21
|
def write_all(self, storage, target):
|
|
19
22
|
assert(isinstance(storage, PandasBasedRowsStorage))
|
|
20
23
|
assert(isinstance(target, str))
|
|
@@ -27,9 +27,8 @@ class OpenNREJsonWriter(BaseWriter):
|
|
|
27
27
|
During the dataset reading stage via OpenNRE, these linkages automaticaly groups into bags.
|
|
28
28
|
"""
|
|
29
29
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def __init__(self, text_columns, encoding="utf-8"):
|
|
30
|
+
def __init__(self, text_columns, encoding="utf-8", na_value="NA", keep_extra_columns=True,
|
|
31
|
+
skip_extra_existed=True):
|
|
33
32
|
""" text_columns: list
|
|
34
33
|
column names that expected to be joined into a single (token) column.
|
|
35
34
|
"""
|
|
@@ -38,16 +37,23 @@ class OpenNREJsonWriter(BaseWriter):
|
|
|
38
37
|
self.__text_columns = text_columns
|
|
39
38
|
self.__encoding = encoding
|
|
40
39
|
self.__target_f = None
|
|
40
|
+
self.__keep_extra_columns = keep_extra_columns
|
|
41
|
+
self.__na_value = na_value
|
|
42
|
+
self.__skip_extra_existed = skip_extra_existed
|
|
43
|
+
|
|
44
|
+
def extension(self):
|
|
45
|
+
return ".jsonl"
|
|
41
46
|
|
|
42
47
|
@staticmethod
|
|
43
|
-
def __format_row(row, text_columns):
|
|
48
|
+
def __format_row(row, na_value, text_columns, keep_extra_columns, skip_extra_existed):
|
|
44
49
|
""" Formatting that is compatible with the OpenNRE.
|
|
45
50
|
"""
|
|
51
|
+
assert(isinstance(na_value, str))
|
|
46
52
|
|
|
47
53
|
sample_id = row[const.ID]
|
|
48
54
|
s_ind = int(row[const.S_IND])
|
|
49
55
|
t_ind = int(row[const.T_IND])
|
|
50
|
-
bag_id =
|
|
56
|
+
bag_id = str(row[const.OPINION_ID])
|
|
51
57
|
|
|
52
58
|
# Gather tokens.
|
|
53
59
|
tokens = []
|
|
@@ -62,13 +68,18 @@ class OpenNREJsonWriter(BaseWriter):
|
|
|
62
68
|
"token": tokens,
|
|
63
69
|
"h": {"pos": [s_ind, s_ind + 1], "id": str(bag_id + "s")},
|
|
64
70
|
"t": {"pos": [t_ind, t_ind + 1], "id": str(bag_id + "t")},
|
|
65
|
-
"relation": str(int(row[const.
|
|
71
|
+
"relation": str(int(row[const.LABEL_UINT])) if const.LABEL_UINT in row else na_value
|
|
66
72
|
}
|
|
67
73
|
|
|
68
|
-
# Register extra fields.
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
formatted_data
|
|
74
|
+
# Register extra fields (optionally).
|
|
75
|
+
if keep_extra_columns:
|
|
76
|
+
for key, value in row.items():
|
|
77
|
+
if key not in formatted_data and key not in text_columns:
|
|
78
|
+
formatted_data[key] = value
|
|
79
|
+
else:
|
|
80
|
+
if not skip_extra_existed:
|
|
81
|
+
raise Exception(f"key `{key}` is already exist in formatted data "
|
|
82
|
+
f"or a part of the text columns list: {text_columns}")
|
|
72
83
|
|
|
73
84
|
return formatted_data
|
|
74
85
|
|
|
@@ -90,8 +101,12 @@ class OpenNREJsonWriter(BaseWriter):
|
|
|
90
101
|
continue
|
|
91
102
|
row_data[col_name] = storage.RowCache[col_name]
|
|
92
103
|
|
|
93
|
-
|
|
94
|
-
|
|
104
|
+
bag = self.__format_row(row_data, text_columns=self.__text_columns,
|
|
105
|
+
keep_extra_columns=self.__keep_extra_columns,
|
|
106
|
+
na_value=self.__na_value,
|
|
107
|
+
skip_extra_existed=self.__skip_extra_existed)
|
|
108
|
+
|
|
109
|
+
self.__write_bag(bag=bag, json_file=self.__target_f)
|
|
95
110
|
|
|
96
111
|
@staticmethod
|
|
97
112
|
def __write_bag(bag, json_file):
|
|
@@ -108,7 +123,10 @@ class OpenNREJsonWriter(BaseWriter):
|
|
|
108
123
|
os.makedirs(os.path.dirname(target), exist_ok=True)
|
|
109
124
|
with open(target, "w", encoding=self.__encoding) as json_file:
|
|
110
125
|
for row_index, row in storage:
|
|
111
|
-
self.__write_bag(bag=self.__format_row(row, text_columns=self.__text_columns
|
|
126
|
+
self.__write_bag(bag=self.__format_row(row, text_columns=self.__text_columns,
|
|
127
|
+
keep_extra_columns=self.__keep_extra_columns,
|
|
128
|
+
na_value=self.__na_value,
|
|
129
|
+
skip_extra_existed=self.__skip_extra_existed),
|
|
112
130
|
json_file=json_file)
|
|
113
131
|
|
|
114
132
|
logger.info("Saving completed!")
|