arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267) hide show
  1. arekit/common/context/terms_mapper.py +2 -2
  2. arekit/common/data/const.py +5 -4
  3. arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
  4. arekit/common/data/input/providers/columns/sample.py +6 -1
  5. arekit/common/data/input/providers/instances/base.py +1 -1
  6. arekit/common/data/input/providers/rows/base.py +36 -13
  7. arekit/common/data/input/providers/rows/samples.py +57 -55
  8. arekit/common/data/input/providers/sample/cropped.py +2 -2
  9. arekit/common/data/input/sample.py +1 -1
  10. arekit/common/data/rows_fmt.py +82 -0
  11. arekit/common/data/rows_parser.py +43 -0
  12. arekit/common/data/storages/base.py +23 -18
  13. arekit/common/data/views/samples.py +2 -8
  14. arekit/common/{news → docs}/base.py +2 -2
  15. arekit/common/{news → docs}/entities_grouping.py +2 -1
  16. arekit/common/{news → docs}/entity.py +2 -1
  17. arekit/common/{news → docs}/parsed/base.py +5 -5
  18. arekit/common/docs/parsed/providers/base.py +68 -0
  19. arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
  20. arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
  21. arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
  22. arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
  23. arekit/common/docs/parsed/service.py +31 -0
  24. arekit/common/docs/parser.py +66 -0
  25. arekit/common/{news → docs}/sentence.py +1 -1
  26. arekit/common/entities/base.py +11 -2
  27. arekit/common/experiment/api/base_samples_io.py +1 -1
  28. arekit/common/frames/variants/collection.py +2 -2
  29. arekit/common/linkage/base.py +2 -2
  30. arekit/common/linkage/meta.py +23 -0
  31. arekit/common/linkage/opinions.py +1 -1
  32. arekit/common/linkage/text_opinions.py +2 -2
  33. arekit/common/opinions/annot/algo/base.py +1 -1
  34. arekit/common/opinions/annot/algo/pair_based.py +15 -13
  35. arekit/common/opinions/annot/algo/predefined.py +4 -4
  36. arekit/common/opinions/annot/algo_based.py +5 -5
  37. arekit/common/opinions/annot/base.py +3 -3
  38. arekit/common/opinions/base.py +7 -7
  39. arekit/common/opinions/collection.py +3 -3
  40. arekit/common/pipeline/base.py +12 -16
  41. arekit/common/pipeline/batching.py +28 -0
  42. arekit/common/pipeline/context.py +5 -1
  43. arekit/common/pipeline/items/base.py +38 -1
  44. arekit/common/pipeline/items/flatten.py +5 -1
  45. arekit/common/pipeline/items/handle.py +2 -1
  46. arekit/common/pipeline/items/iter.py +2 -1
  47. arekit/common/pipeline/items/map.py +2 -1
  48. arekit/common/pipeline/items/map_nested.py +4 -0
  49. arekit/common/pipeline/utils.py +32 -0
  50. arekit/common/service/sqlite.py +36 -0
  51. arekit/common/synonyms/base.py +2 -2
  52. arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
  53. arekit/common/text_opinions/base.py +11 -11
  54. arekit/common/utils.py +33 -46
  55. arekit/contrib/networks/embedding.py +3 -3
  56. arekit/contrib/networks/embedding_io.py +5 -5
  57. arekit/contrib/networks/input/const.py +0 -2
  58. arekit/contrib/networks/input/providers/sample.py +15 -29
  59. arekit/contrib/networks/input/rows_parser.py +47 -134
  60. arekit/contrib/prompt/sample.py +18 -16
  61. arekit/contrib/utils/data/contents/opinions.py +17 -5
  62. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  63. arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
  64. arekit/contrib/utils/data/readers/base.py +3 -0
  65. arekit/contrib/utils/data/readers/csv_pd.py +10 -4
  66. arekit/contrib/utils/data/readers/jsonl.py +3 -0
  67. arekit/contrib/utils/data/readers/sqlite.py +14 -0
  68. arekit/contrib/utils/data/service/balance.py +0 -1
  69. arekit/contrib/utils/data/storages/pandas_based.py +3 -5
  70. arekit/contrib/utils/data/storages/row_cache.py +18 -6
  71. arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
  72. arekit/contrib/utils/data/writers/base.py +5 -0
  73. arekit/contrib/utils/data/writers/csv_native.py +3 -0
  74. arekit/contrib/utils/data/writers/csv_pd.py +3 -0
  75. arekit/contrib/utils/data/writers/json_opennre.py +31 -13
  76. arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
  77. arekit/contrib/utils/io_utils/embedding.py +25 -33
  78. arekit/contrib/utils/io_utils/utils.py +3 -24
  79. arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
  80. arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
  81. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  82. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  83. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
  84. arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
  85. arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
  86. arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
  87. arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
  88. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
  89. arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
  90. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
  91. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
  92. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
  93. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
  94. arekit/contrib/utils/serializer.py +4 -23
  95. arekit-0.25.0.data/data/logo.png +0 -0
  96. arekit-0.25.0.dist-info/METADATA +82 -0
  97. arekit-0.25.0.dist-info/RECORD +259 -0
  98. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
  99. arekit/common/data/row_ids/base.py +0 -79
  100. arekit/common/data/row_ids/binary.py +0 -38
  101. arekit/common/data/row_ids/multiple.py +0 -14
  102. arekit/common/folding/base.py +0 -36
  103. arekit/common/folding/fixed.py +0 -42
  104. arekit/common/folding/nofold.py +0 -15
  105. arekit/common/folding/united.py +0 -46
  106. arekit/common/news/objects_parser.py +0 -37
  107. arekit/common/news/parsed/providers/base.py +0 -48
  108. arekit/common/news/parsed/service.py +0 -31
  109. arekit/common/news/parser.py +0 -34
  110. arekit/common/text/parser.py +0 -12
  111. arekit/common/text/partitioning/__init__.py +0 -0
  112. arekit/common/text/partitioning/base.py +0 -4
  113. arekit/common/text/partitioning/terms.py +0 -35
  114. arekit/contrib/source/__init__.py +0 -0
  115. arekit/contrib/source/brat/__init__.py +0 -0
  116. arekit/contrib/source/brat/annot.py +0 -83
  117. arekit/contrib/source/brat/entities/__init__.py +0 -0
  118. arekit/contrib/source/brat/entities/compound.py +0 -33
  119. arekit/contrib/source/brat/entities/entity.py +0 -42
  120. arekit/contrib/source/brat/entities/parser.py +0 -53
  121. arekit/contrib/source/brat/news.py +0 -28
  122. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  123. arekit/contrib/source/brat/opinions/converter.py +0 -19
  124. arekit/contrib/source/brat/relation.py +0 -32
  125. arekit/contrib/source/brat/sentence.py +0 -69
  126. arekit/contrib/source/brat/sentences_reader.py +0 -128
  127. arekit/contrib/source/download.py +0 -41
  128. arekit/contrib/source/nerel/__init__.py +0 -0
  129. arekit/contrib/source/nerel/entities.py +0 -55
  130. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  131. arekit/contrib/source/nerel/folding/fixed.py +0 -75
  132. arekit/contrib/source/nerel/io_utils.py +0 -62
  133. arekit/contrib/source/nerel/labels.py +0 -241
  134. arekit/contrib/source/nerel/reader.py +0 -46
  135. arekit/contrib/source/nerel/utils.py +0 -24
  136. arekit/contrib/source/nerel/versions.py +0 -12
  137. arekit/contrib/source/nerelbio/__init__.py +0 -0
  138. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  139. arekit/contrib/source/nerelbio/labels.py +0 -265
  140. arekit/contrib/source/nerelbio/reader.py +0 -8
  141. arekit/contrib/source/nerelbio/versions.py +0 -8
  142. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  143. arekit/contrib/source/ruattitudes/collection.py +0 -36
  144. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  145. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  146. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  147. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  148. arekit/contrib/source/ruattitudes/news.py +0 -51
  149. arekit/contrib/source/ruattitudes/news_brat.py +0 -44
  150. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  151. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  152. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  153. arekit/contrib/source/ruattitudes/reader.py +0 -268
  154. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  155. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  156. arekit/contrib/source/ruattitudes/text_object.py +0 -57
  157. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  158. arekit/contrib/source/rusentiframes/collection.py +0 -157
  159. arekit/contrib/source/rusentiframes/effect.py +0 -24
  160. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  161. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  162. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  163. arekit/contrib/source/rusentiframes/role.py +0 -15
  164. arekit/contrib/source/rusentiframes/state.py +0 -24
  165. arekit/contrib/source/rusentiframes/types.py +0 -42
  166. arekit/contrib/source/rusentiframes/value.py +0 -2
  167. arekit/contrib/source/rusentrel/__init__.py +0 -0
  168. arekit/contrib/source/rusentrel/const.py +0 -3
  169. arekit/contrib/source/rusentrel/entities.py +0 -26
  170. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  171. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  172. arekit/contrib/source/rusentrel/news_reader.py +0 -51
  173. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  174. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  175. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  176. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  177. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  178. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  179. arekit/contrib/source/sentinerel/__init__.py +0 -0
  180. arekit/contrib/source/sentinerel/entities.py +0 -52
  181. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  182. arekit/contrib/source/sentinerel/folding/factory.py +0 -32
  183. arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
  184. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  185. arekit/contrib/source/sentinerel/labels.py +0 -53
  186. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  187. arekit/contrib/source/sentinerel/reader.py +0 -42
  188. arekit/contrib/source/synonyms/__init__.py +0 -0
  189. arekit/contrib/source/synonyms/utils.py +0 -19
  190. arekit/contrib/source/zip_utils.py +0 -47
  191. arekit/contrib/utils/bert/rows.py +0 -0
  192. arekit/contrib/utils/bert/text_b_rus.py +0 -18
  193. arekit/contrib/utils/connotations/__init__.py +0 -0
  194. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  195. arekit/contrib/utils/cv/__init__.py +0 -0
  196. arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
  197. arekit/contrib/utils/cv/doc_stat/base.py +0 -37
  198. arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
  199. arekit/contrib/utils/cv/splitters/__init__.py +0 -0
  200. arekit/contrib/utils/cv/splitters/base.py +0 -4
  201. arekit/contrib/utils/cv/splitters/default.py +0 -53
  202. arekit/contrib/utils/cv/splitters/statistical.py +0 -57
  203. arekit/contrib/utils/cv/two_class.py +0 -77
  204. arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
  205. arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
  206. arekit/contrib/utils/data/ext.py +0 -31
  207. arekit/contrib/utils/data/views/__init__.py +0 -0
  208. arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
  209. arekit/contrib/utils/data/views/linkages/base.py +0 -58
  210. arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
  211. arekit/contrib/utils/data/views/linkages/utils.py +0 -24
  212. arekit/contrib/utils/data/views/opinions.py +0 -14
  213. arekit/contrib/utils/download.py +0 -78
  214. arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
  215. arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
  216. arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
  217. arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
  218. arekit/contrib/utils/io_utils/opinions.py +0 -39
  219. arekit/contrib/utils/io_utils/samples.py +0 -78
  220. arekit/contrib/utils/lexicons/__init__.py +0 -0
  221. arekit/contrib/utils/lexicons/lexicon.py +0 -43
  222. arekit/contrib/utils/lexicons/relation.py +0 -45
  223. arekit/contrib/utils/lexicons/rusentilex.py +0 -34
  224. arekit/contrib/utils/nn/__init__.py +0 -0
  225. arekit/contrib/utils/nn/rows.py +0 -83
  226. arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
  227. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  228. arekit/contrib/utils/pipelines/items/to_output.py +0 -101
  229. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  230. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  231. arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
  232. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
  233. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  234. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  235. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
  236. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
  237. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  238. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  239. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
  240. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
  241. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
  242. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  243. arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
  244. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
  245. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  246. arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
  247. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  248. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
  249. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  250. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  251. arekit/contrib/utils/resources.py +0 -26
  252. arekit/contrib/utils/sources/__init__.py +0 -0
  253. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  254. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  255. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  256. arekit/contrib/utils/utils_folding.py +0 -19
  257. arekit/download_data.py +0 -11
  258. arekit-0.23.1.dist-info/METADATA +0 -23
  259. arekit-0.23.1.dist-info/RECORD +0 -403
  260. /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
  261. /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
  262. /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
  263. /arekit/common/{news → docs}/parsed/term_position.py +0 -0
  264. /arekit/common/{news/parsed → service}/__init__.py +0 -0
  265. /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
  266. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
  267. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.37.1)
2
+ Generator: bdist_wheel (0.44.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,79 +0,0 @@
1
- from arekit.common.linkage.text_opinions import TextOpinionsLinkage
2
-
3
-
4
- class BaseIDProvider(object):
5
- """
6
- Opinion in text is a sequence of opinions in context
7
- o1, o2, o3, ..., on
8
-
9
- o1 -- first_text_opinion
10
- i -- index in lined (for example: i=3 => 03)
11
-
12
- # TODO. #376. This should be definitely refactored. This implementation
13
- TODO. combines opinion-based and sample-based data sources, which allows
14
- TODO. us to bypass such connection via external foreign keys.
15
-
16
- Since we are head to remove opinions, there is a need to refactor so in a
17
- way of an additional column that provides such information for further connection
18
- between rows of different storages.
19
- """
20
-
21
- SEPARATOR = '_'
22
- OPINION = "o{}" + SEPARATOR
23
- INDEX = "i{}" + SEPARATOR
24
-
25
- # region 'create' methods
26
-
27
- @staticmethod
28
- def create_opinion_id(text_opinions_linkage, index_in_linked):
29
- assert(isinstance(text_opinions_linkage, TextOpinionsLinkage))
30
- assert(isinstance(index_in_linked, int))
31
-
32
- template = ''.join([BaseIDProvider.OPINION,
33
- BaseIDProvider.INDEX])
34
-
35
- text_opinion_id = text_opinions_linkage.First.TextOpinionID
36
- assert(isinstance(text_opinion_id, int))
37
-
38
- return template.format(text_opinion_id,
39
- index_in_linked)
40
-
41
- @staticmethod
42
- def create_sample_id(linked_opinions, index_in_linked, label_scaler):
43
- raise NotImplementedError()
44
-
45
- @staticmethod
46
- def create_pattern(id_value, p_type):
47
- assert(isinstance(id_value, int))
48
- assert(isinstance(p_type, str))
49
- return p_type.format(id_value)
50
-
51
- # endregion
52
-
53
- @staticmethod
54
- def convert_sample_id_to_opinion_id(sample_id):
55
- assert(isinstance(sample_id, str))
56
- return sample_id[:sample_id.index(BaseIDProvider.INDEX[0])] + BaseIDProvider.INDEX.format(0)
57
-
58
- # region 'parse' methods
59
-
60
- @staticmethod
61
- def _parse(row_id, pattern):
62
- assert(isinstance(pattern, str))
63
-
64
- _from = row_id.index(pattern[0]) + 1
65
- _to = row_id.index(BaseIDProvider.SEPARATOR, _from, len(row_id))
66
-
67
- return int(row_id[_from:_to])
68
-
69
- @staticmethod
70
- def parse_opinion_in_opinion_id(opinion_id):
71
- assert(isinstance(opinion_id, str))
72
- return BaseIDProvider._parse(opinion_id, BaseIDProvider.OPINION)
73
-
74
- @staticmethod
75
- def parse_opinion_in_sample_id(sample_id):
76
- assert(isinstance(sample_id, str))
77
- return BaseIDProvider._parse(sample_id, BaseIDProvider.OPINION)
78
-
79
- # endregion
@@ -1,38 +0,0 @@
1
- from arekit.common.data.row_ids.base import BaseIDProvider
2
- from arekit.common.labels.scaler.base import BaseLabelScaler
3
- from arekit.common.linkage.text_opinions import TextOpinionsLinkage
4
-
5
-
6
- class BinaryIDProvider(BaseIDProvider):
7
- """
8
- Considered that label of opinion IS A PART OF id.
9
- # TODO. #376 related. This should be removed after refactoring, because
10
- # TODO. we consider an ordinary IDs, that not based on the other data.
11
- """
12
-
13
- LABEL = 'l{}' + BaseIDProvider.SEPARATOR
14
-
15
- @staticmethod
16
- def create_sample_id(linked_opinions, index_in_linked, label_scaler):
17
- assert(isinstance(linked_opinions, TextOpinionsLinkage))
18
- assert(isinstance(index_in_linked, int))
19
- assert(isinstance(label_scaler, BaseLabelScaler))
20
-
21
- o_id = BaseIDProvider.create_opinion_id(text_opinions_linkage=linked_opinions,
22
- index_in_linked=index_in_linked)
23
-
24
- template = ''.join(["{}", BinaryIDProvider.LABEL])
25
-
26
- return template.format(o_id,
27
- label_scaler.label_to_uint(linked_opinions.get_linked_label()))
28
-
29
- @staticmethod
30
- def parse_label_in_sample_id(sample_id):
31
- assert(isinstance(sample_id, str))
32
- return BinaryIDProvider._parse(row_id=sample_id, pattern=BinaryIDProvider.LABEL)
33
-
34
- @staticmethod
35
- def parse_index_in_sample_id(sample_id):
36
- assert(isinstance(sample_id, str))
37
- return BinaryIDProvider._parse(row_id=sample_id, pattern=BinaryIDProvider.INDEX)
38
-
@@ -1,14 +0,0 @@
1
- from arekit.common.data.row_ids.base import BaseIDProvider
2
-
3
-
4
- class MultipleIDProvider(BaseIDProvider):
5
- """
6
- Considered that label of opinion is not a part of id.
7
- # TODO. #376 related. This should be removed after refactoring, because
8
- # TODO. we consider an ordinary IDs, that not based on the other data.
9
- """
10
-
11
- @staticmethod
12
- def create_sample_id(linked_opinions, index_in_linked, label_scaler):
13
- return BaseIDProvider.create_opinion_id(text_opinions_linkage=linked_opinions,
14
- index_in_linked=index_in_linked)
@@ -1,36 +0,0 @@
1
- import collections
2
-
3
-
4
- class BaseDataFolding(object):
5
- """ Describes and provides API on how to handle doc_ids during experiment,
6
- i.e. how many states does nested folding algorithm supports,
7
- how to perform folding for a particular state (current),
8
- and how to such state into string.
9
- """
10
-
11
- def __init__(self, doc_ids_to_fold, supported_data_types):
12
- assert(isinstance(doc_ids_to_fold, collections.Iterable))
13
- assert(isinstance(supported_data_types, list))
14
- self._doc_ids_to_fold_set = set(doc_ids_to_fold)
15
- self._supported_data_types = supported_data_types
16
-
17
- def contains_doc_id(self, doc_id):
18
- assert(isinstance(doc_id, int))
19
- return doc_id in self._doc_ids_to_fold_set
20
-
21
- def iter_doc_ids(self):
22
- return iter(self._doc_ids_to_fold_set)
23
-
24
- def iter_supported_data_types(self):
25
- """ Iterates through data_types, supported in a related experiment
26
- Note:
27
- In CV-split algorithm, the first part corresponds to a LARGE split,
28
- Jand second to small; therefore, the correct sequence is as follows:
29
- DataType.Train, DataType.Test.
30
- """
31
- return iter(self._supported_data_types)
32
-
33
- def fold_doc_ids_set(self):
34
- """ Perform the doc_ids folding process onto provided data_types
35
- """
36
- raise NotImplementedError()
@@ -1,42 +0,0 @@
1
- from arekit.common.folding.base import BaseDataFolding
2
-
3
-
4
- class FixedFolding(BaseDataFolding):
5
-
6
- def __init__(self, doc_to_datatypes_func, doc_ids_to_fold, supported_data_types):
7
- assert(callable(doc_to_datatypes_func))
8
-
9
- super(FixedFolding, self).__init__(doc_ids_to_fold=doc_ids_to_fold,
10
- supported_data_types=supported_data_types)
11
-
12
- self.__doc_to_datatypes_func = doc_to_datatypes_func
13
-
14
- @classmethod
15
- def from_parts(cls, parts):
16
- """ parts: dict
17
- dictionary of {data_type: [doc_ids]}
18
- """
19
- assert(isinstance(parts, dict))
20
-
21
- doc_to_datatypes = {}
22
- for data_type, doc_ids in parts.items():
23
- for doc_id in doc_ids:
24
- if doc_id not in doc_to_datatypes:
25
- doc_to_datatypes[doc_id] = []
26
- doc_to_datatypes[doc_id].append(data_type)
27
-
28
- return cls(doc_to_datatypes_func=lambda doc_id: doc_to_datatypes[doc_id],
29
- doc_ids_to_fold=doc_to_datatypes.keys(),
30
- supported_data_types=list(parts.keys()))
31
-
32
- def fold_doc_ids_set(self):
33
-
34
- folded = {}
35
- for data_type in self._supported_data_types:
36
- folded[data_type] = []
37
-
38
- for doc_id in self._doc_ids_to_fold_set:
39
- for data_type in self.__doc_to_datatypes_func(doc_id):
40
- folded[data_type].append(doc_id)
41
-
42
- return folded
@@ -1,15 +0,0 @@
1
- from arekit.common.folding.base import BaseDataFolding
2
-
3
-
4
- class NoFolding(BaseDataFolding):
5
- """ The case of absent folding in experiment.
6
- """
7
-
8
- def __init__(self, doc_ids, supported_data_type):
9
- super(NoFolding, self).__init__(doc_ids_to_fold=doc_ids,
10
- supported_data_types=[supported_data_type])
11
-
12
- def fold_doc_ids_set(self):
13
- return {
14
- self._supported_data_types[0]: list(self._doc_ids_to_fold_set)
15
- }
@@ -1,46 +0,0 @@
1
- from arekit.common.folding.base import BaseDataFolding
2
-
3
-
4
- class UnitedFolding(BaseDataFolding):
5
-
6
- def __init__(self, foldings):
7
- assert(isinstance(foldings, list))
8
- self.__foldings = foldings
9
- super(UnitedFolding, self).__init__(
10
- doc_ids_to_fold=UnitedFolding.__iter_all_doc_ids(foldings),
11
- supported_data_types=list(set(UnitedFolding.__iter_all_data_types(foldings))))
12
-
13
- @staticmethod
14
- def __iter_all_doc_ids(foldings):
15
- for folding in foldings:
16
- assert(isinstance(folding, BaseDataFolding))
17
- for doc_id in folding.iter_doc_ids():
18
- yield doc_id
19
-
20
- @staticmethod
21
- def __iter_all_data_types(foldings):
22
- for folding in foldings:
23
- assert(isinstance(folding, BaseDataFolding))
24
- for d_type in folding.iter_supported_data_types():
25
- yield d_type
26
-
27
- @staticmethod
28
- def __merge(origin, new_data):
29
- assert(isinstance(origin, dict))
30
- assert(isinstance(new_data, dict))
31
- for key, value in new_data.items():
32
- if key not in origin:
33
- # Assign list
34
- origin[key] = value
35
- else:
36
- # Combine lists
37
- origin[key] += value
38
-
39
- def fold_doc_ids_set(self):
40
- origin = {}
41
- for folding in self.__foldings:
42
- assert(isinstance(folding, BaseDataFolding))
43
- new_data = folding.fold_doc_ids_set()
44
- self.__merge(origin=origin, new_data=new_data)
45
-
46
- return origin
@@ -1,37 +0,0 @@
1
- from arekit.common.pipeline.items.base import BasePipelineItem
2
- from arekit.common.text.partitioning.base import BasePartitioning
3
- from arekit.common.pipeline.context import PipelineContext
4
-
5
-
6
- class SentenceObjectsParserPipelineItem(BasePipelineItem):
7
-
8
- def __init__(self, partitioning):
9
- assert(isinstance(partitioning, BasePartitioning))
10
- self.__partitioning = partitioning
11
-
12
- # region protected
13
-
14
- def _get_text(self, pipeline_ctx):
15
- return None
16
-
17
- def _get_parts_provider_func(self, input_data, pipeline_ctx):
18
- raise NotImplementedError()
19
-
20
- # endregion
21
-
22
- def apply_core(self, input_data, pipeline_ctx):
23
- assert(isinstance(pipeline_ctx, PipelineContext))
24
- external_input = self._get_text(pipeline_ctx)
25
- actual_input = input_data if external_input is None else external_input
26
- parts_it = self._get_parts_provider_func(input_data=actual_input, pipeline_ctx=pipeline_ctx)
27
- return self.__partitioning.provide(text=actual_input, parts_it=parts_it)
28
-
29
- # region base
30
-
31
- def __enter__(self):
32
- return self
33
-
34
- def __exit__(self, exc_type, exc_val, exc_tb):
35
- pass
36
-
37
- # endregion
@@ -1,48 +0,0 @@
1
- from arekit.common.entities.base import Entity
2
- from arekit.common.news.entity import DocumentEntity
3
- from arekit.common.news.parsed.base import ParsedNews
4
-
5
-
6
- class BaseParsedNewsServiceProvider(object):
7
-
8
- def __init__(self, entity_index_func=None):
9
- """ Outside enity indexing function
10
- entity_index_func: provides id for a given entity, i.e.
11
- func(entity) -> int (id)
12
- """
13
- assert(callable(entity_index_func) or entity_index_func is None)
14
- self._doc_entities = None
15
- self.__entity_map = {}
16
- self.__entity_index_func = entity_index_func
17
-
18
- @property
19
- def Name(self):
20
- raise NotImplementedError()
21
-
22
- def init_parsed_news(self, parsed_news):
23
- assert(isinstance(parsed_news, ParsedNews))
24
-
25
- self._doc_entities = []
26
- self.__entity_map.clear()
27
-
28
- for index, entity in enumerate(parsed_news.iter_entities()):
29
-
30
- doc_entity = DocumentEntity(id_in_doc=index,
31
- value=entity.Value,
32
- e_type=entity.Type,
33
- display_value=entity.DisplayValue,
34
- group_index=entity.GroupIndex)
35
-
36
- self._doc_entities.append(doc_entity)
37
-
38
- if self.__entity_index_func is not None:
39
- self.__entity_map[self.__entity_index_func(entity)] = doc_entity
40
-
41
- def get_document_entity(self, entity):
42
- """ Maps entity to the related one with DocumentEntity type
43
- """
44
- assert(isinstance(entity, Entity))
45
- return self.__entity_map[self.__entity_index_func(entity)]
46
-
47
- def contains_entity(self, entity):
48
- return self.__entity_index_func(entity) in self.__entity_map
@@ -1,31 +0,0 @@
1
- from arekit.common.news.parsed.base import ParsedNews
2
- from arekit.common.news.parsed.providers.base import BaseParsedNewsServiceProvider
3
-
4
-
5
- class ParsedNewsService(object):
6
- """ Represents a collection of providers, combined with the parsed news.
7
- """
8
-
9
- def __init__(self, parsed_news, providers):
10
- assert(isinstance(parsed_news, ParsedNews))
11
- assert(isinstance(providers, list))
12
- self.__parsed_news = parsed_news
13
- self.__providers = {}
14
-
15
- for provider in providers:
16
- assert(isinstance(provider, BaseParsedNewsServiceProvider))
17
- assert(provider.Name not in self.__providers)
18
-
19
- # Link provider with the related name.
20
- self.__providers[provider.Name] = provider
21
-
22
- # Post initialize with the related parsed news.
23
- provider.init_parsed_news(self.__parsed_news)
24
-
25
-
26
- @property
27
- def ParsedNews(self):
28
- return self.__parsed_news
29
-
30
- def get_provider(self, name):
31
- return self.__providers[name]
@@ -1,34 +0,0 @@
1
- from arekit.common.news.base import News
2
- from arekit.common.news.parsed.base import ParsedNews
3
- from arekit.common.pipeline.context import PipelineContext
4
- from arekit.common.text.parser import BaseTextParser
5
-
6
-
7
- class NewsParser(object):
8
-
9
- @staticmethod
10
- def __get_sent(news, sent_ind):
11
- return news.get_sentence(sent_ind)
12
-
13
- @staticmethod
14
- def parse(news, text_parser, parent_ppl_ctx=None):
15
- assert(isinstance(news, News))
16
- assert(isinstance(text_parser, BaseTextParser))
17
- assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None)
18
-
19
- parsed_sentences = [text_parser.run(input_data=NewsParser.__get_sent(news, sent_ind).Text,
20
- params_dict=NewsParser.__create_ppl_params(news=news, sent_ind=sent_ind),
21
- parent_ctx=parent_ppl_ctx)
22
- for sent_ind in range(news.SentencesCount)]
23
-
24
- return ParsedNews(doc_id=news.ID,
25
- parsed_sentences=parsed_sentences)
26
-
27
- @staticmethod
28
- def __create_ppl_params(news, sent_ind):
29
- assert(isinstance(news, News))
30
- return {
31
- "s_ind": sent_ind, # sentence index. (as Metadata)
32
- "doc_id": news.ID, # document index. (as Metadata)
33
- "sentence": NewsParser.__get_sent(news, sent_ind), # Required for special sources.
34
- }
@@ -1,12 +0,0 @@
1
- from arekit.common.pipeline.base import BasePipeline
2
- from arekit.common.text.parsed import BaseParsedText
3
-
4
-
5
- class BaseTextParser(BasePipeline):
6
-
7
- def run(self, input_data, params_dict=None, parent_ctx=None):
8
- output_data = super(BaseTextParser, self).run(input_data=input_data,
9
- params_dict=params_dict,
10
- parent_ctx=parent_ctx)
11
-
12
- return BaseParsedText(terms=output_data)
File without changes
@@ -1,4 +0,0 @@
1
- class BasePartitioning(object):
2
-
3
- def provide(self, text, parts_it):
4
- raise NotImplementedError()
@@ -1,35 +0,0 @@
1
- import collections
2
-
3
- from arekit.common.bound import Bound
4
- from arekit.common.text.partitioning.base import BasePartitioning
5
-
6
-
7
class TermsPartitioning(BasePartitioning):
    """ NOTE: considering that provided parts
        has no intersections between each other
    """

    def provide(self, text, parts_it):
        """ Replace the bounded spans of `text` with the related values.

            text: list
                terms to be partitioned.
            parts_it: iterable of (value, bound) pairs
                every `bound` is a Bound; its `Position` must not precede
                the end of the previously processed bound.
            returns: list
                terms of `text` in which every span
                [Position, Position + Length) is replaced by its `value`.
        """
        # `collections.Iterable` alias was removed in Python 3.10;
        # the ABC is available from `collections.abc` only.
        from collections.abc import Iterable

        assert(isinstance(text, list))
        assert(isinstance(parts_it, Iterable))

        start = 0
        parts = []
        for value, bound in parts_it:
            assert(isinstance(bound, Bound))
            assert(bound.Position >= start)

            # Release everything till the current value position.
            parts.extend(text[start:bound.Position])

            # Release the entity value.
            parts.append(value)

            start = bound.Position + bound.Length

        # Release everything after the last entity.
        parts.extend(text[start:])

        return parts
File without changes
File without changes
@@ -1,83 +0,0 @@
1
- from arekit.contrib.source.brat.entities.entity import BratEntity
2
- from arekit.contrib.source.brat.relation import BratRelation
3
-
4
-
5
class BratAnnotationParser:
    """ Reads BRAT annotation records: entities (lines prefixed with `T`)
        and relations (lines prefixed with `R`).
    """

    ENTITIES = "entities"
    RELATIONS = "relations"

    @staticmethod
    def __non_prefixed_id(value):
        # Drops the leading record-type character, i.e. "T12" -> "12".
        assert (isinstance(value, str))
        return value[1:]

    @staticmethod
    def handle_entity(args):
        """ Compose an entity from the tab-separated fields of a `T` record:
                T2 Location 10 23 South America
                T1 Location 0 5;16 23 North America
            returns: BratEntity, or None when the second field does not
                consist of exactly three items (type, begin, end).
        """
        assert(len(args) == 3)

        entity_id = int(BratAnnotationParser.__non_prefixed_id(args[0]))
        params = args[1].split()

        # We do not support the case of a non-continuous entity mentions.
        if len(params) != 3:
            return None

        type_str, begin_str, end_str = params
        return BratEntity(id_in_doc=entity_id,
                          e_type=type_str,
                          index_begin=int(begin_str),
                          index_end=int(end_str),
                          value=args[2].strip())

    @staticmethod
    def handle_relation(args):
        """ Compose a relation from the tab-separated fields of an `R` record.
            Example:
                R1 Origin Arg1:T3 Arg2:T4
        """
        # Identifier without the record-type prefix (kept as a string).
        relation_id = args[0][1:]

        # Relation type followed by the `ArgN:<entity-id>` pair.
        rel_type, source, target = args[1].split()
        source_ref = source.split(':')[1]
        target_ref = target.split(':')[1]

        return BratRelation(id_in_doc=relation_id,
                            source_id=int(BratAnnotationParser.__non_prefixed_id(source_ref)),
                            target_id=int(BratAnnotationParser.__non_prefixed_id(target_ref)),
                            rel_type=rel_type)

    @staticmethod
    def parse_annotations(input_file, encoding='utf-8'):
        """ Read annotation collection from file

            input_file: object providing `readlines()`, whose lines
                support `decode(encoding)`.
            returns: dict
                lists of parsed entities and relations, stored under
                the ENTITIES and RELATIONS keys respectively.
        """
        found_entities, found_relations = [], []

        for raw_line in input_file.readlines():
            fields = raw_line.decode(encoding).split('\t')
            prefix = fields[0][0]

            if prefix == "T":
                # Entities (objects) are prefixed with `T`
                parsed_entity = BratAnnotationParser.handle_entity(fields)
                if parsed_entity is not None:
                    found_entities.append(parsed_entity)
            elif prefix == "R":
                found_relations.append(BratAnnotationParser.handle_relation(fields))

        return {
            BratAnnotationParser.ENTITIES: found_entities,
            BratAnnotationParser.RELATIONS: found_relations,
        }
File without changes
@@ -1,33 +0,0 @@
1
- from arekit.contrib.source.brat.entities.entity import BratEntity
2
-
3
-
4
class BratCompoundEntity(BratEntity):
    """ Entity which contains the hierarchy of the other entities.
    """

    def __init__(self, id_in_doc, value, e_type, root, entities, index_begin, index_end,
                 display_value=None, group_index=None):
        """ entities: list
                nested entities, available through `iter_childs`.
            root: BratCompoundEntity or None
                available through the `Root` property.
            The remaining parameters are forwarded to BratEntity as is.
        """
        assert(isinstance(entities, list))
        assert(root is None or isinstance(root, BratCompoundEntity))
        super().__init__(id_in_doc=id_in_doc,
                         e_type=e_type,
                         index_begin=index_begin,
                         index_end=index_end,
                         value=value,
                         display_value=display_value,
                         group_index=group_index)
        self.__root = root
        self.__entities = entities

    @classmethod
    def from_list(cls, root, childs):
        """ Create a compound entity that copies the ID, value, type and
            bounds of the `root` entity and keeps `childs` as nested ones.
            The `Root` of the created instance is None.
        """
        assert(isinstance(root, BratEntity))
        assert(isinstance(childs, list) and len(childs) > 0)
        return cls(id_in_doc=root.ID,
                   value=root.Value,
                   e_type=root.Type,
                   root=None,
                   entities=childs,
                   index_begin=root.IndexBegin,
                   index_end=root.IndexEnd)

    @property
    def Root(self):
        return self.__root

    def iter_childs(self):
        """ Iterate over the nested entities.
        """
        return iter(self.__entities)
@@ -1,42 +0,0 @@
1
- from arekit.common.entities.base import Entity
2
-
3
-
4
class BratEntity(Entity):
    """ Annotated entity in Brat-based collection corpus.
        Provides bounds, i.e. char indices in related sentence.
    """

    def __init__(self, id_in_doc, e_type, index_begin, index_end, value, display_value=None, group_index=None):
        """ id_in_doc:
                identifier, available through the `ID` property.
            e_type: str
                entity type, available through the `Type` property.
            index_begin: int
                - char index (in case of string type of `text`)
                - term index (in case of list type of `text`)
            index_end: int
                - char index (in case of string type of `text`)
                - term index (in case of list type of `text`)
            value, display_value, group_index:
                forwarded to the base Entity together with `e_type`.
        """
        assert(isinstance(e_type, str))
        for bound_index in (index_begin, index_end):
            assert(isinstance(bound_index, int))

        super().__init__(value=value,
                         e_type=e_type,
                         display_value=display_value,
                         group_index=group_index)

        self.__id = id_in_doc
        self.__e_type = e_type
        self.__begin = index_begin
        self.__end = index_end

    @property
    def ID(self):
        return self.__id

    @property
    def Type(self):
        return self.__e_type

    @property
    def IndexBegin(self):
        return self.__begin

    @property
    def IndexEnd(self):
        return self.__end