arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff reflects the content of publicly available package versions as released to their public registries and is provided for informational purposes only.
Files changed (267)
  1. arekit/common/context/terms_mapper.py +2 -2
  2. arekit/common/data/const.py +5 -4
  3. arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
  4. arekit/common/data/input/providers/columns/sample.py +6 -1
  5. arekit/common/data/input/providers/instances/base.py +1 -1
  6. arekit/common/data/input/providers/rows/base.py +36 -13
  7. arekit/common/data/input/providers/rows/samples.py +57 -55
  8. arekit/common/data/input/providers/sample/cropped.py +2 -2
  9. arekit/common/data/input/sample.py +1 -1
  10. arekit/common/data/rows_fmt.py +82 -0
  11. arekit/common/data/rows_parser.py +43 -0
  12. arekit/common/data/storages/base.py +23 -18
  13. arekit/common/data/views/samples.py +2 -8
  14. arekit/common/{news → docs}/base.py +2 -2
  15. arekit/common/{news → docs}/entities_grouping.py +2 -1
  16. arekit/common/{news → docs}/entity.py +2 -1
  17. arekit/common/{news → docs}/parsed/base.py +5 -5
  18. arekit/common/docs/parsed/providers/base.py +68 -0
  19. arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
  20. arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
  21. arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
  22. arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
  23. arekit/common/docs/parsed/service.py +31 -0
  24. arekit/common/docs/parser.py +66 -0
  25. arekit/common/{news → docs}/sentence.py +1 -1
  26. arekit/common/entities/base.py +11 -2
  27. arekit/common/experiment/api/base_samples_io.py +1 -1
  28. arekit/common/frames/variants/collection.py +2 -2
  29. arekit/common/linkage/base.py +2 -2
  30. arekit/common/linkage/meta.py +23 -0
  31. arekit/common/linkage/opinions.py +1 -1
  32. arekit/common/linkage/text_opinions.py +2 -2
  33. arekit/common/opinions/annot/algo/base.py +1 -1
  34. arekit/common/opinions/annot/algo/pair_based.py +15 -13
  35. arekit/common/opinions/annot/algo/predefined.py +4 -4
  36. arekit/common/opinions/annot/algo_based.py +5 -5
  37. arekit/common/opinions/annot/base.py +3 -3
  38. arekit/common/opinions/base.py +7 -7
  39. arekit/common/opinions/collection.py +3 -3
  40. arekit/common/pipeline/base.py +12 -16
  41. arekit/common/pipeline/batching.py +28 -0
  42. arekit/common/pipeline/context.py +5 -1
  43. arekit/common/pipeline/items/base.py +38 -1
  44. arekit/common/pipeline/items/flatten.py +5 -1
  45. arekit/common/pipeline/items/handle.py +2 -1
  46. arekit/common/pipeline/items/iter.py +2 -1
  47. arekit/common/pipeline/items/map.py +2 -1
  48. arekit/common/pipeline/items/map_nested.py +4 -0
  49. arekit/common/pipeline/utils.py +32 -0
  50. arekit/common/service/sqlite.py +36 -0
  51. arekit/common/synonyms/base.py +2 -2
  52. arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
  53. arekit/common/text_opinions/base.py +11 -11
  54. arekit/common/utils.py +33 -46
  55. arekit/contrib/networks/embedding.py +3 -3
  56. arekit/contrib/networks/embedding_io.py +5 -5
  57. arekit/contrib/networks/input/const.py +0 -2
  58. arekit/contrib/networks/input/providers/sample.py +15 -29
  59. arekit/contrib/networks/input/rows_parser.py +47 -134
  60. arekit/contrib/prompt/sample.py +18 -16
  61. arekit/contrib/utils/data/contents/opinions.py +17 -5
  62. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  63. arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
  64. arekit/contrib/utils/data/readers/base.py +3 -0
  65. arekit/contrib/utils/data/readers/csv_pd.py +10 -4
  66. arekit/contrib/utils/data/readers/jsonl.py +3 -0
  67. arekit/contrib/utils/data/readers/sqlite.py +14 -0
  68. arekit/contrib/utils/data/service/balance.py +0 -1
  69. arekit/contrib/utils/data/storages/pandas_based.py +3 -5
  70. arekit/contrib/utils/data/storages/row_cache.py +18 -6
  71. arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
  72. arekit/contrib/utils/data/writers/base.py +5 -0
  73. arekit/contrib/utils/data/writers/csv_native.py +3 -0
  74. arekit/contrib/utils/data/writers/csv_pd.py +3 -0
  75. arekit/contrib/utils/data/writers/json_opennre.py +31 -13
  76. arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
  77. arekit/contrib/utils/io_utils/embedding.py +25 -33
  78. arekit/contrib/utils/io_utils/utils.py +3 -24
  79. arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
  80. arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
  81. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  82. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  83. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
  84. arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
  85. arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
  86. arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
  87. arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
  88. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
  89. arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
  90. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
  91. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
  92. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
  93. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
  94. arekit/contrib/utils/serializer.py +4 -23
  95. arekit-0.25.0.data/data/logo.png +0 -0
  96. arekit-0.25.0.dist-info/METADATA +82 -0
  97. arekit-0.25.0.dist-info/RECORD +259 -0
  98. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
  99. arekit/common/data/row_ids/base.py +0 -79
  100. arekit/common/data/row_ids/binary.py +0 -38
  101. arekit/common/data/row_ids/multiple.py +0 -14
  102. arekit/common/folding/base.py +0 -36
  103. arekit/common/folding/fixed.py +0 -42
  104. arekit/common/folding/nofold.py +0 -15
  105. arekit/common/folding/united.py +0 -46
  106. arekit/common/news/objects_parser.py +0 -37
  107. arekit/common/news/parsed/providers/base.py +0 -48
  108. arekit/common/news/parsed/service.py +0 -31
  109. arekit/common/news/parser.py +0 -34
  110. arekit/common/text/parser.py +0 -12
  111. arekit/common/text/partitioning/__init__.py +0 -0
  112. arekit/common/text/partitioning/base.py +0 -4
  113. arekit/common/text/partitioning/terms.py +0 -35
  114. arekit/contrib/source/__init__.py +0 -0
  115. arekit/contrib/source/brat/__init__.py +0 -0
  116. arekit/contrib/source/brat/annot.py +0 -83
  117. arekit/contrib/source/brat/entities/__init__.py +0 -0
  118. arekit/contrib/source/brat/entities/compound.py +0 -33
  119. arekit/contrib/source/brat/entities/entity.py +0 -42
  120. arekit/contrib/source/brat/entities/parser.py +0 -53
  121. arekit/contrib/source/brat/news.py +0 -28
  122. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  123. arekit/contrib/source/brat/opinions/converter.py +0 -19
  124. arekit/contrib/source/brat/relation.py +0 -32
  125. arekit/contrib/source/brat/sentence.py +0 -69
  126. arekit/contrib/source/brat/sentences_reader.py +0 -128
  127. arekit/contrib/source/download.py +0 -41
  128. arekit/contrib/source/nerel/__init__.py +0 -0
  129. arekit/contrib/source/nerel/entities.py +0 -55
  130. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  131. arekit/contrib/source/nerel/folding/fixed.py +0 -75
  132. arekit/contrib/source/nerel/io_utils.py +0 -62
  133. arekit/contrib/source/nerel/labels.py +0 -241
  134. arekit/contrib/source/nerel/reader.py +0 -46
  135. arekit/contrib/source/nerel/utils.py +0 -24
  136. arekit/contrib/source/nerel/versions.py +0 -12
  137. arekit/contrib/source/nerelbio/__init__.py +0 -0
  138. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  139. arekit/contrib/source/nerelbio/labels.py +0 -265
  140. arekit/contrib/source/nerelbio/reader.py +0 -8
  141. arekit/contrib/source/nerelbio/versions.py +0 -8
  142. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  143. arekit/contrib/source/ruattitudes/collection.py +0 -36
  144. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  145. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  146. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  147. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  148. arekit/contrib/source/ruattitudes/news.py +0 -51
  149. arekit/contrib/source/ruattitudes/news_brat.py +0 -44
  150. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  151. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  152. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  153. arekit/contrib/source/ruattitudes/reader.py +0 -268
  154. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  155. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  156. arekit/contrib/source/ruattitudes/text_object.py +0 -57
  157. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  158. arekit/contrib/source/rusentiframes/collection.py +0 -157
  159. arekit/contrib/source/rusentiframes/effect.py +0 -24
  160. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  161. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  162. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  163. arekit/contrib/source/rusentiframes/role.py +0 -15
  164. arekit/contrib/source/rusentiframes/state.py +0 -24
  165. arekit/contrib/source/rusentiframes/types.py +0 -42
  166. arekit/contrib/source/rusentiframes/value.py +0 -2
  167. arekit/contrib/source/rusentrel/__init__.py +0 -0
  168. arekit/contrib/source/rusentrel/const.py +0 -3
  169. arekit/contrib/source/rusentrel/entities.py +0 -26
  170. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  171. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  172. arekit/contrib/source/rusentrel/news_reader.py +0 -51
  173. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  174. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  175. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  176. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  177. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  178. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  179. arekit/contrib/source/sentinerel/__init__.py +0 -0
  180. arekit/contrib/source/sentinerel/entities.py +0 -52
  181. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  182. arekit/contrib/source/sentinerel/folding/factory.py +0 -32
  183. arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
  184. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  185. arekit/contrib/source/sentinerel/labels.py +0 -53
  186. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  187. arekit/contrib/source/sentinerel/reader.py +0 -42
  188. arekit/contrib/source/synonyms/__init__.py +0 -0
  189. arekit/contrib/source/synonyms/utils.py +0 -19
  190. arekit/contrib/source/zip_utils.py +0 -47
  191. arekit/contrib/utils/bert/rows.py +0 -0
  192. arekit/contrib/utils/bert/text_b_rus.py +0 -18
  193. arekit/contrib/utils/connotations/__init__.py +0 -0
  194. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  195. arekit/contrib/utils/cv/__init__.py +0 -0
  196. arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
  197. arekit/contrib/utils/cv/doc_stat/base.py +0 -37
  198. arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
  199. arekit/contrib/utils/cv/splitters/__init__.py +0 -0
  200. arekit/contrib/utils/cv/splitters/base.py +0 -4
  201. arekit/contrib/utils/cv/splitters/default.py +0 -53
  202. arekit/contrib/utils/cv/splitters/statistical.py +0 -57
  203. arekit/contrib/utils/cv/two_class.py +0 -77
  204. arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
  205. arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
  206. arekit/contrib/utils/data/ext.py +0 -31
  207. arekit/contrib/utils/data/views/__init__.py +0 -0
  208. arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
  209. arekit/contrib/utils/data/views/linkages/base.py +0 -58
  210. arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
  211. arekit/contrib/utils/data/views/linkages/utils.py +0 -24
  212. arekit/contrib/utils/data/views/opinions.py +0 -14
  213. arekit/contrib/utils/download.py +0 -78
  214. arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
  215. arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
  216. arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
  217. arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
  218. arekit/contrib/utils/io_utils/opinions.py +0 -39
  219. arekit/contrib/utils/io_utils/samples.py +0 -78
  220. arekit/contrib/utils/lexicons/__init__.py +0 -0
  221. arekit/contrib/utils/lexicons/lexicon.py +0 -43
  222. arekit/contrib/utils/lexicons/relation.py +0 -45
  223. arekit/contrib/utils/lexicons/rusentilex.py +0 -34
  224. arekit/contrib/utils/nn/__init__.py +0 -0
  225. arekit/contrib/utils/nn/rows.py +0 -83
  226. arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
  227. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  228. arekit/contrib/utils/pipelines/items/to_output.py +0 -101
  229. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  230. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  231. arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
  232. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
  233. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  234. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  235. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
  236. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
  237. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  238. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  239. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
  240. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
  241. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
  242. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  243. arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
  244. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
  245. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  246. arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
  247. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  248. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
  249. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  250. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  251. arekit/contrib/utils/resources.py +0 -26
  252. arekit/contrib/utils/sources/__init__.py +0 -0
  253. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  254. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  255. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  256. arekit/contrib/utils/utils_folding.py +0 -19
  257. arekit/download_data.py +0 -11
  258. arekit-0.23.1.dist-info/METADATA +0 -23
  259. arekit-0.23.1.dist-info/RECORD +0 -403
  260. /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
  261. /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
  262. /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
  263. /arekit/common/{news → docs}/parsed/term_position.py +0 -0
  264. /arekit/common/{news/parsed → service}/__init__.py +0 -0
  265. /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
  266. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
  267. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
arekit/contrib/networks/input/rows_parser.py
@@ -1,134 +1,47 @@
-from arekit.common.data import const
-from arekit.common.utils import filter_whitespaces, split_by_whitespaces
-
-import arekit.contrib.networks.input.const as network_input_const
-
-empty_list = []
-
-
-def no_value():
-    return None
-
-
-def __process_values_list(value):
-    return value.split(network_input_const.ArgsSep)
-
-
-def __process_indices_list(value):
-    return no_value() if not value else [int(v) for v in str(value).split(network_input_const.ArgsSep)]
-
-
-def __process_int_values_list(value):
-    return __process_indices_list(value)
-
-
-def __handle_text(value):
-    """ The core method of the input text processing.
-    """
-    assert(isinstance(value, str) or isinstance(value, list))
-    return filter_whitespaces([term for term in split_by_whitespaces(value)]
-                              if isinstance(value, str) else value)
-
-
-parse_value = {
-    const.ID: lambda value: value,
-    const.DOC_ID: lambda value: value,
-    const.S_IND: lambda value: int(value),
-    const.T_IND: lambda value: int(value),
-    const.SENT_IND: lambda value: int(value),
-    const.ENTITY_VALUES: lambda value: __process_values_list(value),
-    const.ENTITY_TYPES: lambda value: __process_values_list(value),
-    const.ENTITIES: lambda value: __process_indices_list(value),
-    const.TEXT: lambda value: __handle_text(value),
-    network_input_const.FrameVariantIndices: lambda value:
-        __process_indices_list(value) if isinstance(value, str) else empty_list,
-    network_input_const.FrameConnotations: lambda value:
-        __process_indices_list(value) if isinstance(value, str) else empty_list,
-    network_input_const.SynonymObject: lambda value: __process_indices_list(value),
-    network_input_const.SynonymSubject: lambda value: __process_indices_list(value),
-    network_input_const.PosTags: lambda value: __process_int_values_list(value)
-}
-
-
-class ParsedSampleRow(object):
-    """ Provides a parsed information for a sample row.
-    """
-
-    def __init__(self, row):
-        """ row: dict
-                dict of the pairs ("field_name", value)
-        """
-        assert(isinstance(row, dict))
-
-        self.__uint_label = None
-        self.__params = {}
-
-        for key, value in row.items():
-
-            if key == const.LABEL:
-                self.__uint_label = int(value)
-                # TODO: To be adopted in future instead of __uint_label
-                self.__params[key] = value
-                continue
-
-            if key not in parse_value:
-                continue
-
-            self.__params[key] = parse_value[key](value)
-
-    def __value_or_none(self, key):
-        return self.__params[key] if key in self.__params else no_value()
-
-    @property
-    def SampleID(self):
-        return self.__params[const.ID]
-
-    @property
-    def Terms(self):
-        return self.__params[const.TEXT]
-
-    @property
-    def SubjectIndex(self):
-        return self.__params[const.S_IND]
-
-    @property
-    def ObjectIndex(self):
-        return self.__params[const.T_IND]
-
-    @property
-    def UintLabel(self):
-        return self.__uint_label
-
-    @property
-    def PartOfSpeechTags(self):
-        return self.__value_or_none(network_input_const.PosTags)
-
-    @property
-    def TextFrameVariantIndices(self):
-        return self.__value_or_none(network_input_const.FrameVariantIndices)
-
-    @property
-    def TextFrameConnotations(self):
-        return self.__value_or_none(network_input_const.FrameConnotations)
-
-    @property
-    def EntityInds(self):
-        return self.__value_or_none(const.ENTITIES)
-
-    @property
-    def SynonymObjectInds(self):
-        return self.__value_or_none(network_input_const.SynonymObject)
-
-    @property
-    def SynonymSubjectInds(self):
-        return self.__value_or_none(network_input_const.SynonymSubject)
-
-    def __getitem__(self, item):
-        assert (isinstance(item, str) or item is None)
-        if item not in self.__params:
-            return no_value()
-        return self.__params[item] if item is not None else no_value()
-
-    @classmethod
-    def parse(cls, row):
-        return cls(row=row)
+import arekit.contrib.networks.input.const as const
+from arekit.common.data.rows_fmt import process_indices_list
+
+
+def create_nn_column_formatters(no_value_func=lambda: None, args_sep=","):
+    assert(callable(no_value_func))
+
+    empty_list = []
+
+    def str_to_list(value):
+        return process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
+
+    def list_to_str(inds_iter):
+        return args_sep.join([str(i) for i in inds_iter])
+
+    return {
+        const.FrameVariantIndices: {
+            "writer": lambda value: list_to_str(value),
+            "parser": lambda value: process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
+            if isinstance(value, str) else empty_list
+        },
+        const.FrameConnotations: {
+            "writer": lambda value: list_to_str(value),
+            "parser": lambda value: process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
+            if isinstance(value, str) else empty_list
+        },
+        const.SynonymObject: {
+            "writer": lambda value: list_to_str(value),
+            "parser": lambda value: process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
+        },
+        const.SynonymSubject: {
+            "writer": lambda value: list_to_str(value),
+            "parser": lambda value: process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
+        },
+        const.PosTags: {
+            "writer": lambda value: list_to_str(value),
+            "parser": lambda value: str_to_list(value)
+        }
+    }
+
+
+def create_nn_val_writer_fmt(fmt_type, args_sep=","):
+    assert(isinstance(fmt_type, str))
+    d = create_nn_column_formatters(args_sep=args_sep)
+    for k, v in d.items():
+        d[k] = v[fmt_type]
+    return d
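For orientation, a minimal usage sketch of the new factory (not part of the diff): with fmt_type="writer" every NN-specific column maps to a single serialization callable, so a list of indices is joined into a separator-delimited string.

    import arekit.contrib.networks.input.const as const
    from arekit.contrib.networks.input.rows_parser import create_nn_val_writer_fmt

    # "writer" picks the serialization side of each column formatter;
    # "parser" would pick the reading side instead.
    writers = create_nn_val_writer_fmt(fmt_type="writer", args_sep=",")
    print(writers[const.PosTags]([1, 5, 9]))  # -> "1,5,9"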
arekit/contrib/prompt/sample.py
@@ -28,32 +28,34 @@ class PromptedSampleRowProvider(CroppedSampleRowProvider):
         self.__labels_fmt = label_fmt
 
     def _fill_row_core(self, row, text_opinion_linkage, index_in_linked, etalon_label,
-                       parsed_news, sentence_ind, s_ind, t_ind):
+                       parsed_doc, sentence_ind, s_ind, t_ind):
 
         super(PromptedSampleRowProvider, self)._fill_row_core(row=row,
                                                               text_opinion_linkage=text_opinion_linkage,
                                                               index_in_linked=index_in_linked,
                                                               etalon_label=etalon_label,
-                                                              parsed_news=parsed_news,
+                                                              parsed_doc=parsed_doc,
                                                               sentence_ind=sentence_ind,
                                                               s_ind=s_ind,
                                                               t_ind=t_ind)
         original_text = row[BaseSingleTextProvider.TEXT_A]
 
         sentence_terms, actual_s_ind, actual_t_ind = self._provide_sentence_terms(
-            parsed_news=parsed_news, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
+            parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
 
-        label_uint = row[const.LABEL] if const.LABEL in row else None
+        label_uint = row[const.LABEL_UINT] if const.LABEL_UINT in row else None
         label_val = str(label_uint) if label_uint is None or self.__labels_fmt is None else \
-            self.__labels_fmt.label_to_str(self._label_provider.LabelScaler.uint_to_label(row[const.LABEL]))
-
-        row[BaseSingleTextProvider.TEXT_A] = self.__prompt.format(
-            text=original_text,
-            s_ind=row[const.S_IND],
-            t_ind=row[const.T_IND],
-            s_val=sentence_terms[actual_s_ind].DisplayValue,
-            t_val=sentence_terms[actual_t_ind].DisplayValue,
-            label_uint=label_uint,
-            label_val=label_val)
-
-        return row
+            self.__labels_fmt.label_to_str(self._label_provider.LabelScaler.uint_to_label(row[const.LABEL_UINT]))
+
+        vm = {
+            const.TEXT: self.__prompt.format(
+                text=original_text,
+                s_ind=row[const.S_IND],
+                t_ind=row[const.T_IND],
+                s_val=sentence_terms[actual_s_ind].DisplayValue,
+                t_val=sentence_terms[actual_t_ind].DisplayValue,
+                label_uint=label_uint,
+                label_val=label_val)
+        }
+
+        self._apply_row_data(row=row, vm=vm, val_fmt=self._val_fmt)
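For illustration only: the prompt is formatted with the keyword arguments shown in the hunk above, so under the assumption that self.__prompt is an ordinary format string, a template could look like the (hypothetical) one below.

    # Hypothetical prompt template; only the placeholder names are taken from the diff.
    prompt = "What is the attitude of {s_val} towards {t_val} in the text: {text}"
    # Unused keywords (s_ind, t_ind, label_uint, label_val) are simply ignored by str.format.
    print(prompt.format(text="...", s_ind=0, t_ind=3, s_val="USA", t_val="Russia",
                        label_uint=None, label_val="neutral"))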
arekit/contrib/utils/data/contents/opinions.py
@@ -1,7 +1,9 @@
 from arekit.common.data.input.providers.const import IDLE_MODE
 from arekit.common.data.input.providers.contents import ContentsProvider
+from arekit.common.linkage.base import LinkedDataWrapper
 from arekit.common.linkage.text_opinions import TextOpinionsLinkage
-from arekit.common.pipeline.base import BasePipeline
+from arekit.common.pipeline.base import BasePipelineLauncher
+from arekit.common.pipeline.context import PipelineContext
 from arekit.common.text_opinions.base import TextOpinion
 
 
@@ -12,7 +14,7 @@ class InputTextOpinionProvider(ContentsProvider):
         results in a TextOpinionLinkage instances.
         pipeline: id -> ... -> TextOpinionLinkage[]
         """
-        assert(isinstance(pipeline, BasePipeline))
+        assert(isinstance(pipeline, list))
         self.__pipeline = pipeline
         self.__current_id = None
 
@@ -29,7 +31,17 @@ class InputTextOpinionProvider(ContentsProvider):
 
     def from_doc_ids(self, doc_ids, idle_mode=False):
         self.__current_id = 0
-        for linkage in self.__pipeline.run(doc_ids, params_dict={IDLE_MODE: idle_mode}):
-            assert(isinstance(linkage, TextOpinionsLinkage))
-            self.__assign_ids(linkage)
+
+        ctx = PipelineContext(d={
+            "result": doc_ids,
+            IDLE_MODE: idle_mode
+        })
+
+        # Launching pipeline with the passed context
+        BasePipelineLauncher.run(pipeline=self.__pipeline, pipeline_ctx=ctx)
+
+        for linkage in ctx.provide("result"):
+            assert(isinstance(linkage, LinkedDataWrapper))
+            if isinstance(linkage, TextOpinionsLinkage):
+                self.__assign_ids(linkage)
             yield linkage
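Outside of this class, the same launching pattern looks roughly as follows; a sketch under the assumption that suitable pipeline items are already constructed (the list is left as a placeholder), with the input seeded under the "result" key as above.

    from arekit.common.pipeline.base import BasePipelineLauncher
    from arekit.common.pipeline.context import PipelineContext

    pipeline = []  # placeholder: a plain list of pipeline items.
    ctx = PipelineContext(d={"result": [10, 11, 12]})     # seed the input under "result".
    BasePipelineLauncher.run(pipeline=pipeline, pipeline_ctx=ctx)
    print(list(ctx.provide("result")))                    # output of the last pipeline item.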
arekit/contrib/utils/data/doc_provider/dict_based.py
@@ -0,0 +1,13 @@
+from arekit.common.data.doc_provider import DocumentProvider
+
+
+class DictionaryBasedDocumentProvider(DocumentProvider):
+
+    def __init__(self, d):
+        assert(isinstance(d, dict))
+        super(DictionaryBasedDocumentProvider, self).__init__()
+        self.__d = d
+
+    def by_id(self, doc_id):
+        assert(isinstance(doc_id, int))
+        return self.__d[doc_id]
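A small usage sketch, pairing the new provider with the renamed Document and BaseDocumentSentence classes that appear elsewhere in this diff (the sentence text is made up).

    from arekit.common.docs.base import Document
    from arekit.common.docs.sentence import BaseDocumentSentence
    from arekit.contrib.utils.data.doc_provider.dict_based import DictionaryBasedDocumentProvider

    doc = Document(doc_id=0, sentences=[BaseDocumentSentence("E1 cooperates with E2.")])
    provider = DictionaryBasedDocumentProvider({0: doc})
    assert provider.by_id(0) is doc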
arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py
@@ -1,12 +1,12 @@
 from os.path import join
 
-from arekit.common.experiment.api.ops_doc import DocumentOperations
-from arekit.common.news.base import News
-from arekit.common.news.sentence import BaseNewsSentence
+from arekit.common.data.doc_provider import DocumentProvider
+from arekit.common.docs.base import Document
+from arekit.common.docs.sentence import BaseDocumentSentence
 
 
-class DirectoryFilesDocOperations(DocumentOperations):
-    """ Document Operations based on the list of provided file paths
+class DirectoryFilesDocProvider(DocumentProvider):
+    """ Document Providers based on the list of provided file paths
         for the particular directory.
     """
 
@@ -36,10 +36,10 @@ class DirectoryFilesDocOperations(DocumentOperations):
         """
         # setup input data.
         sentences = self.__sentence_parser(contents)
-        sentences = list(map(lambda text: BaseNewsSentence(text), sentences))
+        sentences = list(map(lambda text: BaseDocumentSentence(text), sentences))
 
         # Parse text.
-        return News(doc_id=doc_id, sentences=sentences)
+        return Document(doc_id=doc_id, sentences=sentences)
 
     def by_id(self, doc_id):
         """ Perform reading operation of the document.
arekit/contrib/utils/data/readers/base.py
@@ -1,4 +1,7 @@
 class BaseReader(object):
 
+    def extension(self):
+        raise NotImplementedError()
+
     def read(self, target):
         raise NotImplementedError()
arekit/contrib/utils/data/readers/csv_pd.py
@@ -1,23 +1,29 @@
 import importlib
+
 from arekit.contrib.utils.data.readers.base import BaseReader
 from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
 
 
 class PandasCsvReader(BaseReader):
-    """ Represents a CSV-based reader, implemented via pandas API.
+    """ Represents a CSV-based reader, implmented via pandas API.
     """
 
-    def __init__(self, sep='\t', header='infer', compression='infer', encoding='utf-8', col_types=None):
+    def __init__(self, sep='\t', header='infer', compression='infer', encoding='utf-8', col_types=None,
+                 custom_extension=None):
         self.__sep = sep
         self.__compression = compression
         self.__encoding = encoding
         self.__header = header
+        self.__custom_extension = custom_extension
 
-        # Speciall assignation of types for certain columns.
+        # Special assignation of types for certain columns.
         self.__col_types = col_types
         if self.__col_types is None:
             self.__col_types = dict()
 
+    def extension(self):
+        return ".tsv.gz" if self.__custom_extension is None else self.__custom_extension
+
     def __from_csv(self, filepath):
         pd = importlib.import_module("pandas")
         return pd.read_csv(filepath,
@@ -29,4 +35,4 @@ class PandasCsvReader(BaseReader):
 
     def read(self, target):
         df = self.__from_csv(filepath=target)
-        return PandasBasedRowsStorage(df)
+        return PandasBasedRowsStorage(df)
arekit/contrib/utils/data/readers/jsonl.py
@@ -4,6 +4,9 @@ from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage
 
 class JsonlReader(BaseReader):
 
+    def extension(self):
+        return ".jsonl"
+
     def read(self, target):
         rows = []
         with open(target, "r") as f:
arekit/contrib/utils/data/readers/sqlite.py
@@ -0,0 +1,14 @@
+from arekit.contrib.utils.data.readers.base import BaseReader
+from arekit.contrib.utils.data.storages.sqlite_based import SQliteBasedRowsStorage
+
+
+class SQliteReader(BaseReader):
+
+    def __init__(self, table_name):
+        self.__table_name = table_name
+
+    def extension(self):
+        return ".sqlite"
+
+    def read(self, target):
+        return SQliteBasedRowsStorage(path=target, table_name=self.__table_name)
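Usage sketch for the new reader (the file name and table name below are hypothetical): read() wraps the table into the SQLite-backed storage added in this release.

    from arekit.contrib.utils.data.readers.sqlite import SQliteReader

    reader = SQliteReader(table_name="contents")
    print(reader.extension())                             # ".sqlite"
    storage = reader.read(target="sample-train.sqlite")   # -> SQliteBasedRowsStorage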
arekit/contrib/utils/data/service/balance.py
@@ -1,6 +1,5 @@
 import gc
 import importlib
-
 from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
 
 
arekit/contrib/utils/data/storages/pandas_based.py
@@ -48,6 +48,9 @@ class PandasBasedRowsStorage(BaseRowsStorage):
     def iter_column_names(self):
         return iter(self._df.columns)
 
+    def iter_column_types(self):
+        return iter(self._df.dtypes)
+
     def _set_row_value(self, row_ind, column, value):
         self._df.at[row_ind, column] = value
 
@@ -105,11 +108,6 @@
     def find_by_value(self, column_name, value):
         return self.__filter(column_name=column_name, value=value)
 
-    def find_first_by_value(self, column_name, value):
-        # TODO. Return new storage. (Encapsulation)
-        rows = self.__filter(column_name=column_name, value=value)
-        return rows.iloc[0]
-
     def init_empty(self, columns_provider):
         cols_with_types = columns_provider.get_columns_list_with_types()
         self._df = self.__create_empty(cols_with_types)
arekit/contrib/utils/data/storages/row_cache.py
@@ -15,7 +15,8 @@ class RowCacheStorage(BaseRowsStorage):
         assert(isinstance(force_collect_columns, list) or force_collect_columns is None)
         self.__f = None
         self.__row_cache = {}
-        self.__columns = []
+        self.__column_names = []
+        self.__column_types = []
         self.__force_collect_columns = [] if force_collect_columns is None else force_collect_columns
 
     @property
@@ -24,15 +25,26 @@
 
     def init_empty(self, columns_provider):
         assert (isinstance(columns_provider, BaseColumnsProvider))
-        for col_name, _ in columns_provider.get_columns_list_with_types():
-            self.__columns.append(col_name)
+
+        self.__column_names.clear()
+        for col_name, col_type in columns_provider.get_columns_list_with_types():
+            self.__column_names.append(col_name)
+            self.__column_types.append(col_type)
 
         # Expand with columns that are forced to be provided.
-        existed_set = set(self.__columns)
-        self.__columns += [c for c in self.__force_collect_columns if c not in existed_set]
+        existed_set = set(self.__column_names)
+
+        # Calculate extension: columns that were not mentioned in column names list.
+        extension = [c for c in self.__force_collect_columns if c not in existed_set]
+
+        self.__column_names += extension
+        self.__column_types += [str] * len(extension)
 
     def iter_column_names(self):
-        return iter(self.__columns)
+        return iter(self.__column_names)
+
+    def iter_column_types(self):
+        return iter(self.__column_types)
 
     def _set_row_value(self, row_ind, column, value):
         self.__row_cache[column] = value
arekit/contrib/utils/data/storages/sqlite_based.py
@@ -0,0 +1,17 @@
+import sqlite3
+from arekit.common.data.storages.base import BaseRowsStorage
+
+
+class SQliteBasedRowsStorage(BaseRowsStorage):
+
+    def __init__(self, path, table_name):
+        self.__path = path
+        self.__table_name = table_name
+        self.__conn = None
+
+    def _iter_rows(self):
+        with sqlite3.connect(self.__path) as conn:
+            cursor = conn.execute(f"select * from {self.__table_name}")
+            for row_index, row in enumerate(cursor.fetchall()):
+                row_dict = {cursor.description[i][0]: value for i, value in enumerate(row)}
+                yield row_index, row_dict
arekit/contrib/utils/data/writers/base.py
@@ -1,5 +1,10 @@
 class BaseWriter(object):
 
+    def extension(self):
+        """ Expected output extension type.
+        """
+        raise NotImplementedError()
+
     def open_target(self, target):
         pass
 
arekit/contrib/utils/data/writers/csv_native.py
@@ -17,6 +17,9 @@ class NativeCsvWriter(BaseWriter):
         self.__header = header
         self.__header_written = None
 
+    def extension(self):
+        return ".csv"
+
     @staticmethod
     def __iter_storage_column_names(storage):
         """ Iter only those columns that existed in storage.
arekit/contrib/utils/data/writers/csv_pd.py
@@ -15,6 +15,9 @@ class PandasCsvWriter(BaseWriter):
         super(PandasCsvWriter, self).__init__()
         self.__write_header = write_header
 
+    def extension(self):
+        return ".tsv.gz"
+
     def write_all(self, storage, target):
         assert(isinstance(storage, PandasBasedRowsStorage))
         assert(isinstance(target, str))
arekit/contrib/utils/data/writers/json_opennre.py
@@ -27,9 +27,8 @@ class OpenNREJsonWriter(BaseWriter):
         During the dataset reading stage via OpenNRE, these linkages automaticaly groups into bags.
     """
 
-    EXTRA_KEYS_TEMPLATE = "_{}"
-
-    def __init__(self, text_columns, encoding="utf-8"):
+    def __init__(self, text_columns, encoding="utf-8", na_value="NA", keep_extra_columns=True,
+                 skip_extra_existed=True):
         """ text_columns: list
                 column names that expected to be joined into a single (token) column.
         """
@@ -38,16 +37,23 @@ class OpenNREJsonWriter(BaseWriter):
         self.__text_columns = text_columns
         self.__encoding = encoding
         self.__target_f = None
+        self.__keep_extra_columns = keep_extra_columns
+        self.__na_value = na_value
+        self.__skip_extra_existed = skip_extra_existed
+
+    def extension(self):
+        return ".jsonl"
 
     @staticmethod
-    def __format_row(row, text_columns):
+    def __format_row(row, na_value, text_columns, keep_extra_columns, skip_extra_existed):
         """ Formatting that is compatible with the OpenNRE.
         """
+        assert(isinstance(na_value, str))
 
         sample_id = row[const.ID]
         s_ind = int(row[const.S_IND])
         t_ind = int(row[const.T_IND])
-        bag_id = sample_id[0:sample_id.find('_i')]
+        bag_id = str(row[const.OPINION_ID])
 
         # Gather tokens.
         tokens = []
@@ -62,13 +68,18 @@ class OpenNREJsonWriter(BaseWriter):
             "token": tokens,
             "h": {"pos": [s_ind, s_ind + 1], "id": str(bag_id + "s")},
             "t": {"pos": [t_ind, t_ind + 1], "id": str(bag_id + "t")},
-            "relation": str(int(row[const.LABEL])) if const.LABEL in row else "NA"
+            "relation": str(int(row[const.LABEL_UINT])) if const.LABEL_UINT in row else na_value
         }
 
-        # Register extra fields.
-        for key, value in row.items():
-            if key not in formatted_data and key not in text_columns:
-                formatted_data[OpenNREJsonWriter.EXTRA_KEYS_TEMPLATE.format(key)] = value
+        # Register extra fields (optionally).
+        if keep_extra_columns:
+            for key, value in row.items():
+                if key not in formatted_data and key not in text_columns:
+                    formatted_data[key] = value
+                else:
+                    if not skip_extra_existed:
+                        raise Exception(f"key `{key}` is already exist in formatted data "
+                                        f"or a part of the text columns list: {text_columns}")
 
         return formatted_data
 
@@ -90,8 +101,12 @@ class OpenNREJsonWriter(BaseWriter):
                 continue
            row_data[col_name] = storage.RowCache[col_name]
 
-        self.__write_bag(bag=self.__format_row(row_data, text_columns=self.__text_columns),
-                         json_file=self.__target_f)
+        bag = self.__format_row(row_data, text_columns=self.__text_columns,
+                                keep_extra_columns=self.__keep_extra_columns,
+                                na_value=self.__na_value,
+                                skip_extra_existed=self.__skip_extra_existed)
+
+        self.__write_bag(bag=bag, json_file=self.__target_f)
 
     @staticmethod
     def __write_bag(bag, json_file):
@@ -108,7 +123,10 @@ class OpenNREJsonWriter(BaseWriter):
         os.makedirs(os.path.dirname(target), exist_ok=True)
         with open(target, "w", encoding=self.__encoding) as json_file:
             for row_index, row in storage:
-                self.__write_bag(bag=self.__format_row(row, text_columns=self.__text_columns),
+                self.__write_bag(bag=self.__format_row(row, text_columns=self.__text_columns,
+                                                       keep_extra_columns=self.__keep_extra_columns,
+                                                       na_value=self.__na_value,
+                                                       skip_extra_existed=self.__skip_extra_existed),
                                  json_file=json_file)
 
         logger.info("Saving completed!")
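A hedged configuration sketch for the extended writer: the column name in text_columns is an assumption, while the remaining keyword arguments mirror the new constructor above.

    from arekit.contrib.utils.data.writers.json_opennre import OpenNREJsonWriter

    writer = OpenNREJsonWriter(text_columns=["text_a"],   # hypothetical text column name.
                               na_value="NA",             # relation emitted for unlabeled rows.
                               keep_extra_columns=True,   # copy non-text columns into each bag.
                               skip_extra_existed=True)   # don't raise on already-present keys.
    print(writer.extension())  # ".jsonl"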