arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267) hide show
  1. arekit/common/context/terms_mapper.py +2 -2
  2. arekit/common/data/const.py +5 -4
  3. arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
  4. arekit/common/data/input/providers/columns/sample.py +6 -1
  5. arekit/common/data/input/providers/instances/base.py +1 -1
  6. arekit/common/data/input/providers/rows/base.py +36 -13
  7. arekit/common/data/input/providers/rows/samples.py +57 -55
  8. arekit/common/data/input/providers/sample/cropped.py +2 -2
  9. arekit/common/data/input/sample.py +1 -1
  10. arekit/common/data/rows_fmt.py +82 -0
  11. arekit/common/data/rows_parser.py +43 -0
  12. arekit/common/data/storages/base.py +23 -18
  13. arekit/common/data/views/samples.py +2 -8
  14. arekit/common/{news → docs}/base.py +2 -2
  15. arekit/common/{news → docs}/entities_grouping.py +2 -1
  16. arekit/common/{news → docs}/entity.py +2 -1
  17. arekit/common/{news → docs}/parsed/base.py +5 -5
  18. arekit/common/docs/parsed/providers/base.py +68 -0
  19. arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
  20. arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
  21. arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
  22. arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
  23. arekit/common/docs/parsed/service.py +31 -0
  24. arekit/common/docs/parser.py +66 -0
  25. arekit/common/{news → docs}/sentence.py +1 -1
  26. arekit/common/entities/base.py +11 -2
  27. arekit/common/experiment/api/base_samples_io.py +1 -1
  28. arekit/common/frames/variants/collection.py +2 -2
  29. arekit/common/linkage/base.py +2 -2
  30. arekit/common/linkage/meta.py +23 -0
  31. arekit/common/linkage/opinions.py +1 -1
  32. arekit/common/linkage/text_opinions.py +2 -2
  33. arekit/common/opinions/annot/algo/base.py +1 -1
  34. arekit/common/opinions/annot/algo/pair_based.py +15 -13
  35. arekit/common/opinions/annot/algo/predefined.py +4 -4
  36. arekit/common/opinions/annot/algo_based.py +5 -5
  37. arekit/common/opinions/annot/base.py +3 -3
  38. arekit/common/opinions/base.py +7 -7
  39. arekit/common/opinions/collection.py +3 -3
  40. arekit/common/pipeline/base.py +12 -16
  41. arekit/common/pipeline/batching.py +28 -0
  42. arekit/common/pipeline/context.py +5 -1
  43. arekit/common/pipeline/items/base.py +38 -1
  44. arekit/common/pipeline/items/flatten.py +5 -1
  45. arekit/common/pipeline/items/handle.py +2 -1
  46. arekit/common/pipeline/items/iter.py +2 -1
  47. arekit/common/pipeline/items/map.py +2 -1
  48. arekit/common/pipeline/items/map_nested.py +4 -0
  49. arekit/common/pipeline/utils.py +32 -0
  50. arekit/common/service/sqlite.py +36 -0
  51. arekit/common/synonyms/base.py +2 -2
  52. arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
  53. arekit/common/text_opinions/base.py +11 -11
  54. arekit/common/utils.py +33 -46
  55. arekit/contrib/networks/embedding.py +3 -3
  56. arekit/contrib/networks/embedding_io.py +5 -5
  57. arekit/contrib/networks/input/const.py +0 -2
  58. arekit/contrib/networks/input/providers/sample.py +15 -29
  59. arekit/contrib/networks/input/rows_parser.py +47 -134
  60. arekit/contrib/prompt/sample.py +18 -16
  61. arekit/contrib/utils/data/contents/opinions.py +17 -5
  62. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  63. arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
  64. arekit/contrib/utils/data/readers/base.py +3 -0
  65. arekit/contrib/utils/data/readers/csv_pd.py +10 -4
  66. arekit/contrib/utils/data/readers/jsonl.py +3 -0
  67. arekit/contrib/utils/data/readers/sqlite.py +14 -0
  68. arekit/contrib/utils/data/service/balance.py +0 -1
  69. arekit/contrib/utils/data/storages/pandas_based.py +3 -5
  70. arekit/contrib/utils/data/storages/row_cache.py +18 -6
  71. arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
  72. arekit/contrib/utils/data/writers/base.py +5 -0
  73. arekit/contrib/utils/data/writers/csv_native.py +3 -0
  74. arekit/contrib/utils/data/writers/csv_pd.py +3 -0
  75. arekit/contrib/utils/data/writers/json_opennre.py +31 -13
  76. arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
  77. arekit/contrib/utils/io_utils/embedding.py +25 -33
  78. arekit/contrib/utils/io_utils/utils.py +3 -24
  79. arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
  80. arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
  81. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  82. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  83. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
  84. arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
  85. arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
  86. arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
  87. arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
  88. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
  89. arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
  90. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
  91. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
  92. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
  93. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
  94. arekit/contrib/utils/serializer.py +4 -23
  95. arekit-0.25.0.data/data/logo.png +0 -0
  96. arekit-0.25.0.dist-info/METADATA +82 -0
  97. arekit-0.25.0.dist-info/RECORD +259 -0
  98. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
  99. arekit/common/data/row_ids/base.py +0 -79
  100. arekit/common/data/row_ids/binary.py +0 -38
  101. arekit/common/data/row_ids/multiple.py +0 -14
  102. arekit/common/folding/base.py +0 -36
  103. arekit/common/folding/fixed.py +0 -42
  104. arekit/common/folding/nofold.py +0 -15
  105. arekit/common/folding/united.py +0 -46
  106. arekit/common/news/objects_parser.py +0 -37
  107. arekit/common/news/parsed/providers/base.py +0 -48
  108. arekit/common/news/parsed/service.py +0 -31
  109. arekit/common/news/parser.py +0 -34
  110. arekit/common/text/parser.py +0 -12
  111. arekit/common/text/partitioning/__init__.py +0 -0
  112. arekit/common/text/partitioning/base.py +0 -4
  113. arekit/common/text/partitioning/terms.py +0 -35
  114. arekit/contrib/source/__init__.py +0 -0
  115. arekit/contrib/source/brat/__init__.py +0 -0
  116. arekit/contrib/source/brat/annot.py +0 -83
  117. arekit/contrib/source/brat/entities/__init__.py +0 -0
  118. arekit/contrib/source/brat/entities/compound.py +0 -33
  119. arekit/contrib/source/brat/entities/entity.py +0 -42
  120. arekit/contrib/source/brat/entities/parser.py +0 -53
  121. arekit/contrib/source/brat/news.py +0 -28
  122. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  123. arekit/contrib/source/brat/opinions/converter.py +0 -19
  124. arekit/contrib/source/brat/relation.py +0 -32
  125. arekit/contrib/source/brat/sentence.py +0 -69
  126. arekit/contrib/source/brat/sentences_reader.py +0 -128
  127. arekit/contrib/source/download.py +0 -41
  128. arekit/contrib/source/nerel/__init__.py +0 -0
  129. arekit/contrib/source/nerel/entities.py +0 -55
  130. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  131. arekit/contrib/source/nerel/folding/fixed.py +0 -75
  132. arekit/contrib/source/nerel/io_utils.py +0 -62
  133. arekit/contrib/source/nerel/labels.py +0 -241
  134. arekit/contrib/source/nerel/reader.py +0 -46
  135. arekit/contrib/source/nerel/utils.py +0 -24
  136. arekit/contrib/source/nerel/versions.py +0 -12
  137. arekit/contrib/source/nerelbio/__init__.py +0 -0
  138. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  139. arekit/contrib/source/nerelbio/labels.py +0 -265
  140. arekit/contrib/source/nerelbio/reader.py +0 -8
  141. arekit/contrib/source/nerelbio/versions.py +0 -8
  142. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  143. arekit/contrib/source/ruattitudes/collection.py +0 -36
  144. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  145. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  146. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  147. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  148. arekit/contrib/source/ruattitudes/news.py +0 -51
  149. arekit/contrib/source/ruattitudes/news_brat.py +0 -44
  150. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  151. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  152. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  153. arekit/contrib/source/ruattitudes/reader.py +0 -268
  154. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  155. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  156. arekit/contrib/source/ruattitudes/text_object.py +0 -57
  157. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  158. arekit/contrib/source/rusentiframes/collection.py +0 -157
  159. arekit/contrib/source/rusentiframes/effect.py +0 -24
  160. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  161. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  162. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  163. arekit/contrib/source/rusentiframes/role.py +0 -15
  164. arekit/contrib/source/rusentiframes/state.py +0 -24
  165. arekit/contrib/source/rusentiframes/types.py +0 -42
  166. arekit/contrib/source/rusentiframes/value.py +0 -2
  167. arekit/contrib/source/rusentrel/__init__.py +0 -0
  168. arekit/contrib/source/rusentrel/const.py +0 -3
  169. arekit/contrib/source/rusentrel/entities.py +0 -26
  170. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  171. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  172. arekit/contrib/source/rusentrel/news_reader.py +0 -51
  173. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  174. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  175. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  176. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  177. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  178. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  179. arekit/contrib/source/sentinerel/__init__.py +0 -0
  180. arekit/contrib/source/sentinerel/entities.py +0 -52
  181. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  182. arekit/contrib/source/sentinerel/folding/factory.py +0 -32
  183. arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
  184. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  185. arekit/contrib/source/sentinerel/labels.py +0 -53
  186. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  187. arekit/contrib/source/sentinerel/reader.py +0 -42
  188. arekit/contrib/source/synonyms/__init__.py +0 -0
  189. arekit/contrib/source/synonyms/utils.py +0 -19
  190. arekit/contrib/source/zip_utils.py +0 -47
  191. arekit/contrib/utils/bert/rows.py +0 -0
  192. arekit/contrib/utils/bert/text_b_rus.py +0 -18
  193. arekit/contrib/utils/connotations/__init__.py +0 -0
  194. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  195. arekit/contrib/utils/cv/__init__.py +0 -0
  196. arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
  197. arekit/contrib/utils/cv/doc_stat/base.py +0 -37
  198. arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
  199. arekit/contrib/utils/cv/splitters/__init__.py +0 -0
  200. arekit/contrib/utils/cv/splitters/base.py +0 -4
  201. arekit/contrib/utils/cv/splitters/default.py +0 -53
  202. arekit/contrib/utils/cv/splitters/statistical.py +0 -57
  203. arekit/contrib/utils/cv/two_class.py +0 -77
  204. arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
  205. arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
  206. arekit/contrib/utils/data/ext.py +0 -31
  207. arekit/contrib/utils/data/views/__init__.py +0 -0
  208. arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
  209. arekit/contrib/utils/data/views/linkages/base.py +0 -58
  210. arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
  211. arekit/contrib/utils/data/views/linkages/utils.py +0 -24
  212. arekit/contrib/utils/data/views/opinions.py +0 -14
  213. arekit/contrib/utils/download.py +0 -78
  214. arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
  215. arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
  216. arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
  217. arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
  218. arekit/contrib/utils/io_utils/opinions.py +0 -39
  219. arekit/contrib/utils/io_utils/samples.py +0 -78
  220. arekit/contrib/utils/lexicons/__init__.py +0 -0
  221. arekit/contrib/utils/lexicons/lexicon.py +0 -43
  222. arekit/contrib/utils/lexicons/relation.py +0 -45
  223. arekit/contrib/utils/lexicons/rusentilex.py +0 -34
  224. arekit/contrib/utils/nn/__init__.py +0 -0
  225. arekit/contrib/utils/nn/rows.py +0 -83
  226. arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
  227. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  228. arekit/contrib/utils/pipelines/items/to_output.py +0 -101
  229. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  230. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  231. arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
  232. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
  233. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  234. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  235. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
  236. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
  237. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  238. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  239. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
  240. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
  241. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
  242. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  243. arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
  244. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
  245. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  246. arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
  247. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  248. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
  249. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  250. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  251. arekit/contrib/utils/resources.py +0 -26
  252. arekit/contrib/utils/sources/__init__.py +0 -0
  253. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  254. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  255. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  256. arekit/contrib/utils/utils_folding.py +0 -19
  257. arekit/download_data.py +0 -11
  258. arekit-0.23.1.dist-info/METADATA +0 -23
  259. arekit-0.23.1.dist-info/RECORD +0 -403
  260. /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
  261. /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
  262. /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
  263. /arekit/common/{news → docs}/parsed/term_position.py +0 -0
  264. /arekit/common/{news/parsed → service}/__init__.py +0 -0
  265. /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
  266. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
  267. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
@@ -1,53 +0,0 @@
1
- import random
2
-
3
- from arekit.contrib.utils.cv.splitters.base import CrossValidationSplitter
4
-
5
-
6
class SimpleCrossValidationSplitter(CrossValidationSplitter):
    """ Splits document ids into cross-validation folds of (nearly) equal size,
        without relying on any additional statistics of the related documents.
    """

    def __init__(self, shuffle=True, seed=1):
        """ shuffle: bool
                whether document ids are shuffled before being chunked.
            seed: int
                seed of the private random generator used for shuffling.
        """
        self.__shuffle = shuffle
        self.__seed = seed

    # region private methods

    @staticmethod
    def __chunk_it(sequence, num):
        """ Cuts `sequence` into exactly `num` consecutive chunks.

            Chunk boundaries are computed with integer arithmetic
            (floor(i * len / num)). Accumulating a float step instead drifts:
            e.g. adding 1.1 ten times gives 10.999999999999998, which used to
            produce an extra, eleventh chunk for 11 items and num=10.

            An empty sequence results in no chunks at all.
            Raises ValueError when `num` is not positive.
        """
        if num < 1:
            raise ValueError("Amount of chunks must be positive, got: {}".format(num))

        total = len(sequence)
        if total == 0:
            return []

        return [sequence[i * total // num:(i + 1) * total // num]
                for i in range(num)]

    # endregion

    def items_to_cv_pairs(self, doc_ids, cv_count):
        """ Splits the set of document ids into `cv_count` folds and yields,
            for every fold, a pair (train_ids_list, test_ids_list), where the
            test part is the fold itself and the train part is everything else.
        """
        assert(isinstance(doc_ids, set))
        assert(isinstance(cv_count, int))

        doc_ids_list = list(doc_ids)

        if self.__shuffle:
            # A dedicated generator keeps the global `random` state untouched.
            random.Random(self.__seed).shuffle(doc_ids_list)

        chunks = self.__chunk_it(doc_ids_list, cv_count)

        for test_index, small in enumerate(chunks):
            large = [v
                     for chunk_index, chunk in enumerate(chunks) if chunk_index != test_index
                     for v in chunk]

            yield large, small
@@ -1,57 +0,0 @@
1
- import numpy as np
2
- from arekit.contrib.utils.cv.doc_stat.base import BaseDocumentStatGenerator
3
- from arekit.contrib.utils.cv.splitters.base import CrossValidationSplitter
4
-
5
-
6
class StatBasedCrossValidationSplitter(CrossValidationSplitter):
    """ Statistics-based splitter: documents are distributed between folds
        so that the per-fold sums of the document statistic stay balanced.
    """

    def __init__(self, docs_stat, doc_ids):
        """ docs_stat: BaseDocumentStatGenerator
                generator whose `calculate` result is iterated below
                as pairs (doc_id, statistic value).
            doc_ids:
                document ids passed to `docs_stat.calculate`.
        """
        assert(isinstance(docs_stat, BaseDocumentStatGenerator))
        super(StatBasedCrossValidationSplitter, self).__init__()
        self.__docs_info = docs_stat.calculate(doc_ids_iter=doc_ids)

    # region private methods

    @staticmethod
    def __select_group(group_totals, item):
        """ Returns the index of the group for which adding `item`
            results in the smallest delta (first one in case of ties).
        """
        deltas = [
            StatBasedCrossValidationSplitter.__calc_cv_group_delta(
                group_totals=group_totals, item=item, g_index_to_add=group_index)
            for group_index in range(len(group_totals))
        ]

        return int(np.argmin(deltas))

    @staticmethod
    def __calc_cv_group_delta(group_totals, item, g_index_to_add):
        """ Difference between the largest and the mean group total,
            assuming that `item` is added to the group `g_index_to_add`.
        """
        sums = list(group_totals)
        sums[g_index_to_add] += item
        return max(sums) - np.mean(sums)

    # endregion

    def items_to_cv_pairs(self, doc_ids, cv_count):
        """ Greedy separation: documents are visited in descending order of
            their statistic and every document goes to the group selected by
            `__select_group`. Yields a pair (large, small) per group, where
            `small` is the group itself and `large` keeps the rest documents.

            NOTE: `doc_ids` is only type-checked here; the documents being
            split are those passed to the constructor.
        """
        assert(isinstance(doc_ids, set))
        assert(isinstance(cv_count, int))

        sorted_stat = reversed(sorted(self.__docs_info, key=lambda pair: pair[1]))
        cv_group_docs = [[] for _ in range(cv_count)]
        # Running totals replace re-summing of every group for every candidate.
        group_totals = [0] * cv_count

        for doc_id, s_count in sorted_stat:
            group_index = self.__select_group(group_totals=group_totals, item=s_count)
            cv_group_docs[group_index].append(doc_id)
            group_totals[group_index] += s_count

        for small in cv_group_docs:
            # Set membership keeps composing of the large part linear.
            small_ids = set(small)
            large = [doc_id for doc_id, _ in self.__docs_info if doc_id not in small_ids]

            yield large, small
@@ -1,77 +0,0 @@
1
- from arekit.common.folding.base import BaseDataFolding
2
- from arekit.contrib.utils.cv.splitters.base import CrossValidationSplitter
3
-
4
-
5
class TwoClassCVFolding(BaseDataFolding):
    """ Cross-validation folding of document ids between
        a pair of data types (two-class cv-folding algorithm).
    """

    def __init__(self, supported_data_types, doc_ids_to_fold, cv_count, splitter):
        """ supported_data_types:
                at most two data types; the first one receives the large
                part of every split and the second one receives the small part.
            doc_ids_to_fold:
                document ids forwarded to the base folding.
            cv_count: int
                positive amount of folds (and folding states).
            splitter: CrossValidationSplitter
                provides (large, small) pairs of document ids.
        """
        assert(isinstance(splitter, CrossValidationSplitter))
        assert(isinstance(cv_count, int) and cv_count > 0)

        if len(supported_data_types) > 2:
            raise NotImplementedError("Experiments with such amount of data-types are not supported!")

        super().__init__(doc_ids_to_fold=doc_ids_to_fold,
                         supported_data_types=supported_data_types)

        self.__cv_count = cv_count
        self.__splitter = splitter
        self.__state_index = 0

    # region Properties

    @property
    def StateIndex(self):
        return self.__state_index

    @property
    def CVCount(self):
        return self.__cv_count

    # endregion

    # region BaseFolding

    def iter_states(self):
        """ Iterates over `cv_count` states: the current state index is
            updated before every yield; the yielded value itself is None.
        """
        for state_index in range(self.__cv_count):
            self.__state_index = state_index
            yield None

    def fold_doc_ids_set(self):
        """ Returns a dictionary data_type -> list of document ids
            for the current state.

            Returns None when the splitter provides no pair
            for the current state index.
        """
        # Access to protected fields
        data_types = self._supported_data_types
        doc_ids = self._doc_ids_to_fold_set

        if len(data_types) == 1:
            # Nothing to split: the only data type receives every document.
            return {data_types[0]: list(doc_ids)}

        if self.__splitter is None:
            raise NotImplementedError("Splitter has not been intialized!")

        pairs = self.__splitter.items_to_cv_pairs(doc_ids=set(doc_ids),
                                                  cv_count=self.__cv_count)

        for fold_index, (large, small) in enumerate(pairs):
            if fold_index != self.__state_index:
                continue
            return {
                data_types[0]: large,
                data_types[1]: small
            }

        return None

    # endregion
File without changes
@@ -1,13 +0,0 @@
1
- from arekit.common.experiment.api.ops_doc import DocumentOperations
2
-
3
-
4
class DictionaryBasedDocumentOperations(DocumentOperations):
    """ Document operations backed by an in-memory dictionary
        which is indexed by integer document ids.
    """

    def __init__(self, d):
        """ d: dict
                dictionary that is queried by `by_id`.
        """
        assert(isinstance(d, dict))
        super().__init__()
        self.__d = d

    def by_id(self, doc_id):
        """ Returns the value stored under `doc_id`;
            a missing id results in KeyError.
        """
        assert(isinstance(doc_id, int))
        document = self.__d[doc_id]
        return document
@@ -1,31 +0,0 @@
1
- from arekit.contrib.utils.data.readers.base import BaseReader
2
- from arekit.contrib.utils.data.writers.base import BaseWriter
3
- from arekit.contrib.utils.data.writers.csv_native import NativeCsvWriter
4
- from arekit.contrib.utils.data.writers.json_opennre import OpenNREJsonWriter
5
-
6
-
7
- PANDAS_CSV_EXTENSION = ".tsv.gz"
8
- OPENNRE_EXTENSION = ".jsonl"
9
-
10
-
11
def create_writer_extension(writer):
    """ Returns the file extension that corresponds to the type of `writer`.
    """
    assert(isinstance(writer, BaseWriter))

    # Checked in order: the first matching writer type wins.
    known_extensions = (
        (OpenNREJsonWriter, OPENNRE_EXTENSION),
        (NativeCsvWriter, ".csv"),
    )

    for writer_type, extension in known_extensions:
        if isinstance(writer, writer_type):
            return extension

    # Any other writer is considered to be the Pandas-based one (".tsv.gz").
    return PANDAS_CSV_EXTENSION
21
-
22
-
23
def create_reader_extension(writer):
    """ Returns the file extension that corresponds to the type of the reader.

        writer: BaseReader
            the reader instance (the parameter name is kept
            for compatibility with the existing callers).
    """
    # Imported locally, since the module-level imports cover writers only.
    # NOTE(review): assumes the JSONL reader class is `JsonlReader`
    # declared in `readers/jsonl.py` -- confirm against the package.
    from arekit.contrib.utils.data.readers.jsonl import JsonlReader

    assert(isinstance(writer, BaseReader))

    # The original check was performed against the *writer* type only, which a
    # reader instance does not normally satisfy; that check is kept alongside
    # the reader type for backward compatibility.
    if isinstance(writer, (JsonlReader, OpenNREJsonWriter)):
        return OPENNRE_EXTENSION

    # consider ".tsv.gz" and assuming it is a Pandas.
    # other options are not available in 0.23.1
    return PANDAS_CSV_EXTENSION
File without changes
File without changes
@@ -1,58 +0,0 @@
1
- from arekit.common.data import const
2
- from arekit.common.data.row_ids.base import BaseIDProvider
3
- from arekit.common.data.storages.base import BaseRowsStorage
4
- from arekit.common.linkage.opinions import OpinionsLinkage
5
- from arekit.contrib.utils.data.views.linkages import utils
6
- from arekit.contrib.utils.data.views.opinions import BaseOpinionStorageView
7
-
8
-
9
class BaseOpinionLinkagesView(object):
    """ Base view onto the rows storage in terms of opinion linkages.
    """

    def __init__(self, ids_provider, storage):
        """ ids_provider: BaseIDProvider
                parses row ids and composes opinion id patterns.
            storage: BaseRowsStorage
                storage the rows are searched in.
        """
        assert(isinstance(ids_provider, BaseIDProvider))
        assert(isinstance(storage, BaseRowsStorage))
        self._ids_provider = ids_provider
        self._storage = storage

    # region protected methods

    def _iter_by_opinions(self, linked_df, opinions_view):
        """ Expected to be implemented by the nested classes.
        """
        raise NotImplementedError()

    # endregion

    # region public methods

    def iter_opinion_linkages(self, doc_id, opinions_view):
        """ Lazily iterates over OpinionsLinkage instances
            composed for the rows related to the document `doc_id`.
        """
        assert(isinstance(opinions_view, BaseOpinionStorageView))

        ids_provider = self._ids_provider
        doc_df = self._storage.find_by_value(column_name=const.DOC_ID, value=doc_id)
        row_ids = list(doc_df[const.ID])  # TODO. Adopt storage.

        # Every stage below is lazy: nothing is evaluated until
        # the resulting iterator is actually consumed.
        doc_opin_ids = (ids_provider.parse_opinion_in_opinion_id(row_id)
                        for row_id in row_ids)

        doc_opin_id_patterns = (
            ids_provider.create_pattern(id_value=opinion_id, p_type=BaseIDProvider.OPINION)
            for opinion_id in doc_opin_ids)

        linkages_df = (utils.filter_by_id(doc_df=doc_df, column=const.ID, value=opin_id)
                       for opin_id in doc_opin_id_patterns)

        opinions_iter = (self._iter_by_opinions(linked_df=df_linkage, opinions_view=opinions_view)
                         for df_linkage in linkages_df)

        return map(OpinionsLinkage, opinions_iter)

    # endregion
@@ -1,48 +0,0 @@
1
- import numpy as np
2
-
3
- from arekit.common.data import const
4
- from arekit.common.data.row_ids.multiple import MultipleIDProvider
5
- from arekit.common.labels.scaler.base import BaseLabelScaler
6
- from arekit.contrib.utils.data.views.linkages import utils
7
- from arekit.contrib.utils.data.views.linkages.base import BaseOpinionLinkagesView
8
- from arekit.contrib.utils.data.views.opinions import BaseOpinionStorageView
9
-
10
-
11
class MultilableOpinionLinkagesView(BaseOpinionLinkagesView):
    """ View onto the storage in which every opinion-related row
        keeps a value per every supported label.
    """

    def __init__(self, labels_scaler, storage):
        """ labels_scaler: BaseLabelScaler
                converts labels to uint values and backwards.
            storage:
                rows storage, forwarded to the base view.
        """
        assert(isinstance(labels_scaler, BaseLabelScaler))
        super().__init__(ids_provider=MultipleIDProvider(), storage=storage)
        self.__labels_scaler = labels_scaler

    # region private methods

    def __get_column_header(self):
        """ Column names: string forms of the uint values of the supported labels.
        """
        scaler = self.__labels_scaler
        return [str(scaler.label_to_uint(label))
                for label in scaler.ordered_suppoted_labels()]

    def __calculate_label(self, row):
        """ Picks the label whose column keeps the greatest value in a single row
            (first one in case of ties).
        """
        labels_prob = []
        for column in self.__get_column_header():
            labels_prob.append(row[column])

        best_index = int(np.argmax(labels_prob))
        return self.__labels_scaler.uint_to_label(value=best_index)

    # endregion

    # region protected methods

    def _iter_by_opinions(self, linked_df, opinions_view):
        """ Yields an opinion per every row of `linked_df`.
        """
        assert(isinstance(opinions_view, BaseOpinionStorageView))

        for _, series in linked_df.iterrows():

            def calc_label(row=series):
                return self.__calculate_label(row)

            yield utils.compose_opinion_by_opinion_id(
                ids_provider=self._ids_provider,
                sample_id=series[const.ID],
                opinions_view=opinions_view,
                calc_label_func=calc_label)

    # endregion
@@ -1,24 +0,0 @@
1
- from arekit.common.data import const
2
- from arekit.common.data.row_ids.base import BaseIDProvider
3
- from arekit.common.opinions.base import Opinion
4
- from arekit.contrib.utils.data.views.opinions import BaseOpinionStorageView
5
-
6
-
7
def compose_opinion_by_opinion_id(ids_provider, sample_id, opinions_view, calc_label_func):
    """ Composes an Opinion for the given sample: source and target values are
        taken from the opinion row found via `opinions_view`, and the sentiment
        is the result of `calc_label_func`, called without arguments.
    """
    assert(isinstance(ids_provider, BaseIDProvider))
    assert(isinstance(sample_id, str))
    assert(isinstance(opinions_view, BaseOpinionStorageView))
    assert(callable(calc_label_func))

    row = opinions_view.row_by_id(
        opinion_id=ids_provider.convert_sample_id_to_opinion_id(sample_id=sample_id))

    source_value = row[const.SOURCE]
    target_value = row[const.TARGET]
    sentiment = calc_label_func()

    return Opinion(source_value=source_value,
                   target_value=target_value,
                   sentiment=sentiment)
19
-
20
-
21
# TODO. Adopt storage.
def filter_by_id(doc_df, column, value):
    """ Keeps only those rows of `doc_df` whose string value in `column`
        matches `value`, as decided by `Series.str.contains`.
    """
    assert(isinstance(column, str))
    matches_mask = doc_df[column].str.contains(value)
    return doc_df[matches_mask]
@@ -1,14 +0,0 @@
1
- from arekit.common.data import const
2
- from arekit.common.data.storages.base import BaseRowsStorage
3
-
4
-
5
class BaseOpinionStorageView(object):
    """ View onto the rows storage that allows to find a row by opinion id.
    """

    def __init__(self, storage):
        """ storage: BaseRowsStorage
                storage the rows are searched in.
        """
        assert(isinstance(storage, BaseRowsStorage))
        self._storage = storage

    def row_by_id(self, opinion_id):
        """ Returns the first row whose id column value equals to `opinion_id`.
        """
        assert(isinstance(opinion_id, str))
        query = dict(column_name=const.ID, value=opinion_id)
        return self._storage.find_first_by_value(**query)
@@ -1,78 +0,0 @@
1
- import os
2
- import tarfile
3
- from os.path import join, exists
4
-
5
- from arekit.common import utils
6
- from arekit.contrib.utils.np_utils.embedding import NpzEmbeddingHelper
7
- from arekit.contrib.utils.np_utils.vocab import VocabRepositoryUtils
8
-
9
- NEWS_MYSTEM_SKIPGRAM_1000_20_2015 = "news_mystem_skipgram_1000_20_2015.tar.gz"
10
-
11
-
12
def __get_resource(local_name, check_existance=False, download_if_missed=False):
    """ Returns the path of the resource `local_name` in the default download dir.

        check_existance: bool
            when set, a missing resource is either downloaded
            or results in an exception.
        download_if_missed: bool
            whether a missing resource should be downloaded first.
    """
    assert(isinstance(local_name, str))
    filepath = join(utils.get_default_download_dir(), local_name)

    # Nothing to verify, or the resource is already in place.
    if not check_existance or exists(filepath):
        return filepath

    if not download_if_missed:
        raise Exception("Resource could not be found: {}".format(filepath))

    download()
    # Verify the resource once again, this time without a download attempt:
    # the nested call raises when the resource is still missing.
    __get_resource(local_name, check_existance=check_existance, download_if_missed=False)

    return filepath
25
-
26
-
27
def __get_embedding_dir(filepath):
    """ Returns the directory path that corresponds to the ".tar.gz" archive,
        i.e. the archive path without its trailing ".tar.gz" suffix.

        Only the suffix is removed: ".tar.gz" occurrences in the middle of
        the path (for instance, in the names of parent directories) are kept.
        A path that does not end with ".tar.gz" is returned as it is.
    """
    suffix = ".tar.gz"
    if filepath.endswith(suffix):
        return filepath[:-len(suffix)]
    return filepath
29
-
30
-
31
def load_embedding_and_vocab(local_name, check_existance=False, download_if_missed=False):
    """ Loads the pair (embedding, vocab) from the directory that corresponds
        to the archive `local_name`: "embedding.npz" and "vocab.txt" files.

        The flags are forwarded to the resource lookup as they are.
    """
    archive_path = __get_resource(local_name,
                                  check_existance=check_existance,
                                  download_if_missed=download_if_missed)
    embedding_dir = __get_embedding_dir(archive_path)

    embedding_path = join(embedding_dir, "embedding.npz")
    embedding = NpzEmbeddingHelper.load_embedding(embedding_path)

    vocab_path = join(embedding_dir, "vocab.txt")
    vocab = VocabRepositoryUtils.load(vocab_path)

    return embedding, vocab
38
-
39
-
40
def download():
    """ Downloads the predefined resources into the default download dir
        and unpacks every ".tar.gz" archive into the related directory.

        Raises Exception when an archive member points outside
        of the extraction directory.
    """

    data = {
        NEWS_MYSTEM_SKIPGRAM_1000_20_2015: "https://www.dropbox.com/s/0omnlgzgnjhxlmf/{filename}?dl=1".format(
            filename=NEWS_MYSTEM_SKIPGRAM_1000_20_2015),
    }

    def is_within_directory(directory, target):
        """ Whether `target` is located inside of `directory`. """
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)

        # `commonpath` compares whole path components. The character-based
        # `commonprefix` accepts siblings that merely share a name prefix,
        # e.g. "/x/emb_evil/file" for the directory "/x/emb".
        try:
            return os.path.commonpath([abs_directory, abs_target]) == abs_directory
        except ValueError:
            # Raised by `commonpath` when the paths cannot be compared
            # (for instance, they belong to different drives).
            return False

    def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
        """ Extracts the archive only when every member stays within `path`. """
        # NOTE(review): only member names are validated; link targets of
        # symlink / hardlink members are not inspected here.
        for member in tar.getmembers():
            member_path = os.path.join(path, member.name)
            if not is_within_directory(path, member_path):
                raise Exception("Attempted Path Traversal in Tar File")

        tar.extractall(path, members, numeric_owner=numeric_owner)

    # Perform downloading ...
    for local_name, url_link in data.items():
        utils.download(dest_file_path=__get_resource(local_name),
                       source_url=url_link)

    # Untar files ...
    for local_name in data:

        if ".tar.gz" not in local_name:
            continue

        target_filepath = __get_resource(local_name)
        with tarfile.open(target_filepath) as file:
            safe_extract(file, __get_embedding_dir(target_filepath))
@@ -1,78 +0,0 @@
1
- from arekit.common.entities.base import Entity
2
- from arekit.common.entities.str_fmt import StringEntitiesFormatter
3
- from arekit.common.entities.types import OpinionEntityType
4
- from arekit.contrib.utils.processing.languages.ru.cases import RussianCases
5
- from arekit.contrib.utils.processing.languages.ru.number import RussianNumberType
6
- from arekit.contrib.utils.processing.pos.russian import RussianPOSTagger
7
-
8
-
9
class RussianEntitiesCasedFormatter(StringEntitiesFormatter):
    """Replaces an entity with a Russian role noun declined like the entity value.

    The role noun stem is "объект", "субъект" or "сущност", depending on the
    entity type. The ending is picked by the grammatical case and number that
    the POS tagger reports for the original entity value.
    """

    # Endings for "объект" / "субъект", stored as [singular, plural].
    obj_subj_cases_map = {
        RussianCases.UNKN: ['', ''],        # unknown case
        RussianCases.NOM: ['', 'ы'],        # nominative
        RussianCases.GEN: ['а', 'ов'],      # genitive
        RussianCases.DAT: ['у', 'ам'],      # dative
        RussianCases.ACC: ['', 'ы'],        # accusative
        RussianCases.INS: ['ом', 'ами'],    # instrumental
        RussianCases.ABL: ['е', 'ах']       # prepositional
    }

    # Endings for the stem "сущност", stored as [singular, plural].
    entity_cases_map = {
        RussianCases.UNKN: ['ь', 'и'],      # unknown case
        RussianCases.NOM: ['ь', 'и'],       # nominative
        RussianCases.GEN: ['и', 'ей'],      # genitive
        RussianCases.DAT: ['и', 'ям'],      # dative
        RussianCases.ACC: ['ь', 'и'],       # accusative
        RussianCases.INS: ['ью', 'ями'],    # instrumental
        RussianCases.ABL: ['и', 'ях']       # prepositional
    }

    def __init__(self, pos_tagger):
        """
        :param pos_tagger: RussianPOSTagger, used to detect the number and the
            case of entity values.
        """
        assert(isinstance(pos_tagger, RussianPOSTagger))
        self.__pos_tagger = pos_tagger

    def to_string(self, original_value, entity_type):
        """Return the declined role noun for the given entity.

        :param original_value: Entity, whose ``Value`` defines case and number.
        :param entity_type: OpinionEntityType, selects the role noun.
        :return: str
        :raises NotImplementedError: for an entity type without a role noun.
        """
        assert(isinstance(original_value, Entity))
        assert(isinstance(entity_type, OpinionEntityType))

        if entity_type in (OpinionEntityType.Object, OpinionEntityType.SynonymObject):
            template = "объект"
            cases_map = self.obj_subj_cases_map
        elif entity_type in (OpinionEntityType.Subject, OpinionEntityType.SynonymSubject):
            template = "субъект"
            cases_map = self.obj_subj_cases_map
        elif entity_type == OpinionEntityType.Other:
            template = "сущност"
            cases_map = self.entity_cases_map
        else:
            # Fail explicitly instead of relying on asserts, which are
            # stripped when the interpreter runs with optimizations enabled.
            raise NotImplementedError(
                "Entity type is not supported: {}".format(entity_type))

        return self.__get_correct_declention(value=original_value.Value,
                                             template=template,
                                             cases_map=cases_map)

    def __get_correct_declention(self, value, template, cases_map):
        """Append to ``template`` the ending that matches ``value``.

        :param value: str, text whose number and case are detected.
        :param template: str, stem of the role noun.
        :param cases_map: dict, case -> [singular ending, plural ending].
        :return: str
        """
        assert(isinstance(value, str))
        assert(isinstance(template, str))
        assert(isinstance(cases_map, dict))

        num = self.__pos_tagger.get_term_number(value)
        case = self.__pos_tagger.get_term_case(value)

        assert(isinstance(num, RussianNumberType))
        assert(isinstance(case, RussianCases))

        # An unknown number is treated as singular.
        if num == RussianNumberType.UNKN or num == RussianNumberType.Single:
            num_int = 0
        else:
            num_int = 1

        # Cases missing from the table fall back to the UNKN row.
        if case not in cases_map:
            case = RussianCases.UNKN

        return template + cases_map[case][num_int]
78
-
@@ -1,15 +0,0 @@
1
- from arekit.common.entities.str_fmt import StringEntitiesFormatter
2
- from arekit.common.entities.types import OpinionEntityType
3
-
4
-
5
class RussianEntitiesFormatter(StringEntitiesFormatter):
    """Replaces an entity with a fixed Russian role noun chosen by entity type."""

    def to_string(self, original_value, entity_type):
        """Return the Russian role noun for ``entity_type``.

        :param original_value: not used by this formatter.
        :param entity_type: OpinionEntityType
        :return: str, or None when the type has no role noun assigned.
        """
        assert isinstance(entity_type, OpinionEntityType)

        object_like = (OpinionEntityType.Object, OpinionEntityType.SynonymObject)
        subject_like = (OpinionEntityType.Subject, OpinionEntityType.SynonymSubject)

        if entity_type in object_like:
            return "объект"
        if entity_type in subject_like:
            return "субъект"

        return "сущность" if entity_type == OpinionEntityType.Other else None
@@ -1,24 +0,0 @@
1
- from arekit.common.entities.str_fmt import StringEntitiesFormatter
2
- from arekit.common.entities.types import OpinionEntityType
3
-
4
-
5
class StringEntitiesSimpleFormatter(StringEntitiesFormatter):
    """
    Maps an entity type onto a plain word that can be looked up
    in a word embedding.
    """

    def to_string(self, original_value, entity_type):
        """
        Returns: str (unicode)
            Word assumed to be used for the Word2Vec model embedding search;
            None when the entity type has no word assigned.
        """
        assert isinstance(entity_type, OpinionEntityType)

        if entity_type == OpinionEntityType.Other:
            return "e"

        if entity_type in (OpinionEntityType.Object, OpinionEntityType.SynonymObject):
            return "object"

        if entity_type in (OpinionEntityType.Subject, OpinionEntityType.SynonymSubject):
            return "subject"

        return None
@@ -1,21 +0,0 @@
1
- from arekit.common.entities.base import Entity
2
- from arekit.common.entities.str_fmt import StringEntitiesFormatter
3
- from arekit.common.entities.types import OpinionEntityType
4
-
5
-
6
class SimpleUppercasedEntityFormatter(StringEntitiesFormatter):
    """Replaces an entity with an uppercased mask chosen by entity type."""

    def to_string(self, original_value, entity_type):
        """Return the mask that stands for ``entity_type``.

        :param original_value: Entity or None; only its type is checked.
        :param entity_type: OpinionEntityType
        :return: str, one of "ENTITY", "E_SUBJ", "E_OBJ".
        :raises NotImplementedError: for any other entity type.
        """
        assert original_value is None or isinstance(original_value, Entity)
        assert isinstance(entity_type, OpinionEntityType)

        if entity_type == OpinionEntityType.Other:
            return "ENTITY"

        if entity_type in (OpinionEntityType.Subject, OpinionEntityType.SynonymSubject):
            return "E_SUBJ"

        if entity_type in (OpinionEntityType.Object, OpinionEntityType.SynonymObject):
            return "E_OBJ"

        raise NotImplementedError()
@@ -1,39 +0,0 @@
1
- from os.path import join
2
-
3
- from arekit.contrib.utils.data.ext import create_reader_extension
4
- from arekit.contrib.utils.data.readers.base import BaseReader
5
- from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
6
- from arekit.contrib.utils.io_utils.utils import filename_template
7
-
8
-
9
class OpinionsIO(BaseSamplesIO):
    """Builds target file paths for opinion data and keeps the related reader."""

    def __init__(self, target_dir, reader=None, prefix="opinion", target_extension=".tsv.gz"):
        """
        :param target_dir: directory in which target files are located.
        :param reader: BaseReader or None.
        :param prefix: str, leading part of every target filename.
        :param target_extension: str or None; when None, the extension is
            obtained from ``create_reader_extension(reader)``.
        """
        # None has to be accepted here, otherwise the declared default value
        # of `reader` could never be used.
        assert(isinstance(reader, BaseReader) or reader is None)
        self.__target_dir = target_dir
        self.__prefix = prefix
        self.__reader = reader
        # NOTE(review): with target_extension=None the reader is passed to
        # create_reader_extension as is -- confirm that it tolerates None.
        self.__target_extension = create_reader_extension(reader) \
            if target_extension is None else target_extension

    @property
    def Reader(self):
        """Reader instance passed to the constructor (may be None)."""
        return self.__reader

    def create_target(self, data_type, data_folding):
        """Return the target filepath for the given data type and folding."""
        return self.__get_input_opinions_target(data_type, data_folding=data_folding)

    def __get_input_opinions_target(self, data_type, data_folding):
        """Compose the filepath from the folding-specific filename template."""
        template = filename_template(data_type=data_type, data_folding=data_folding)
        return self.__get_filepath(out_dir=self.__target_dir,
                                   template=template,
                                   prefix=self.__prefix,
                                   extension=self.__target_extension)

    @staticmethod
    def __get_filepath(out_dir, template, prefix, extension):
        """Join ``out_dir`` with "<prefix>-<template><extension>"."""
        assert(isinstance(template, str))
        assert(isinstance(prefix, str))
        assert(isinstance(extension, str))
        return join(out_dir, "{prefix}-{template}{extension}".format(
            prefix=prefix, template=template, extension=extension))