arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267) hide show
  1. arekit/common/context/terms_mapper.py +2 -2
  2. arekit/common/data/const.py +5 -4
  3. arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
  4. arekit/common/data/input/providers/columns/sample.py +6 -1
  5. arekit/common/data/input/providers/instances/base.py +1 -1
  6. arekit/common/data/input/providers/rows/base.py +36 -13
  7. arekit/common/data/input/providers/rows/samples.py +57 -55
  8. arekit/common/data/input/providers/sample/cropped.py +2 -2
  9. arekit/common/data/input/sample.py +1 -1
  10. arekit/common/data/rows_fmt.py +82 -0
  11. arekit/common/data/rows_parser.py +43 -0
  12. arekit/common/data/storages/base.py +23 -18
  13. arekit/common/data/views/samples.py +2 -8
  14. arekit/common/{news → docs}/base.py +2 -2
  15. arekit/common/{news → docs}/entities_grouping.py +2 -1
  16. arekit/common/{news → docs}/entity.py +2 -1
  17. arekit/common/{news → docs}/parsed/base.py +5 -5
  18. arekit/common/docs/parsed/providers/base.py +68 -0
  19. arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
  20. arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
  21. arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
  22. arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
  23. arekit/common/docs/parsed/service.py +31 -0
  24. arekit/common/docs/parser.py +66 -0
  25. arekit/common/{news → docs}/sentence.py +1 -1
  26. arekit/common/entities/base.py +11 -2
  27. arekit/common/experiment/api/base_samples_io.py +1 -1
  28. arekit/common/frames/variants/collection.py +2 -2
  29. arekit/common/linkage/base.py +2 -2
  30. arekit/common/linkage/meta.py +23 -0
  31. arekit/common/linkage/opinions.py +1 -1
  32. arekit/common/linkage/text_opinions.py +2 -2
  33. arekit/common/opinions/annot/algo/base.py +1 -1
  34. arekit/common/opinions/annot/algo/pair_based.py +15 -13
  35. arekit/common/opinions/annot/algo/predefined.py +4 -4
  36. arekit/common/opinions/annot/algo_based.py +5 -5
  37. arekit/common/opinions/annot/base.py +3 -3
  38. arekit/common/opinions/base.py +7 -7
  39. arekit/common/opinions/collection.py +3 -3
  40. arekit/common/pipeline/base.py +12 -16
  41. arekit/common/pipeline/batching.py +28 -0
  42. arekit/common/pipeline/context.py +5 -1
  43. arekit/common/pipeline/items/base.py +38 -1
  44. arekit/common/pipeline/items/flatten.py +5 -1
  45. arekit/common/pipeline/items/handle.py +2 -1
  46. arekit/common/pipeline/items/iter.py +2 -1
  47. arekit/common/pipeline/items/map.py +2 -1
  48. arekit/common/pipeline/items/map_nested.py +4 -0
  49. arekit/common/pipeline/utils.py +32 -0
  50. arekit/common/service/sqlite.py +36 -0
  51. arekit/common/synonyms/base.py +2 -2
  52. arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
  53. arekit/common/text_opinions/base.py +11 -11
  54. arekit/common/utils.py +33 -46
  55. arekit/contrib/networks/embedding.py +3 -3
  56. arekit/contrib/networks/embedding_io.py +5 -5
  57. arekit/contrib/networks/input/const.py +0 -2
  58. arekit/contrib/networks/input/providers/sample.py +15 -29
  59. arekit/contrib/networks/input/rows_parser.py +47 -134
  60. arekit/contrib/prompt/sample.py +18 -16
  61. arekit/contrib/utils/data/contents/opinions.py +17 -5
  62. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  63. arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
  64. arekit/contrib/utils/data/readers/base.py +3 -0
  65. arekit/contrib/utils/data/readers/csv_pd.py +10 -4
  66. arekit/contrib/utils/data/readers/jsonl.py +3 -0
  67. arekit/contrib/utils/data/readers/sqlite.py +14 -0
  68. arekit/contrib/utils/data/service/balance.py +0 -1
  69. arekit/contrib/utils/data/storages/pandas_based.py +3 -5
  70. arekit/contrib/utils/data/storages/row_cache.py +18 -6
  71. arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
  72. arekit/contrib/utils/data/writers/base.py +5 -0
  73. arekit/contrib/utils/data/writers/csv_native.py +3 -0
  74. arekit/contrib/utils/data/writers/csv_pd.py +3 -0
  75. arekit/contrib/utils/data/writers/json_opennre.py +31 -13
  76. arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
  77. arekit/contrib/utils/io_utils/embedding.py +25 -33
  78. arekit/contrib/utils/io_utils/utils.py +3 -24
  79. arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
  80. arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
  81. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  82. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  83. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
  84. arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
  85. arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
  86. arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
  87. arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
  88. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
  89. arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
  90. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
  91. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
  92. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
  93. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
  94. arekit/contrib/utils/serializer.py +4 -23
  95. arekit-0.25.0.data/data/logo.png +0 -0
  96. arekit-0.25.0.dist-info/METADATA +82 -0
  97. arekit-0.25.0.dist-info/RECORD +259 -0
  98. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
  99. arekit/common/data/row_ids/base.py +0 -79
  100. arekit/common/data/row_ids/binary.py +0 -38
  101. arekit/common/data/row_ids/multiple.py +0 -14
  102. arekit/common/folding/base.py +0 -36
  103. arekit/common/folding/fixed.py +0 -42
  104. arekit/common/folding/nofold.py +0 -15
  105. arekit/common/folding/united.py +0 -46
  106. arekit/common/news/objects_parser.py +0 -37
  107. arekit/common/news/parsed/providers/base.py +0 -48
  108. arekit/common/news/parsed/service.py +0 -31
  109. arekit/common/news/parser.py +0 -34
  110. arekit/common/text/parser.py +0 -12
  111. arekit/common/text/partitioning/__init__.py +0 -0
  112. arekit/common/text/partitioning/base.py +0 -4
  113. arekit/common/text/partitioning/terms.py +0 -35
  114. arekit/contrib/source/__init__.py +0 -0
  115. arekit/contrib/source/brat/__init__.py +0 -0
  116. arekit/contrib/source/brat/annot.py +0 -83
  117. arekit/contrib/source/brat/entities/__init__.py +0 -0
  118. arekit/contrib/source/brat/entities/compound.py +0 -33
  119. arekit/contrib/source/brat/entities/entity.py +0 -42
  120. arekit/contrib/source/brat/entities/parser.py +0 -53
  121. arekit/contrib/source/brat/news.py +0 -28
  122. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  123. arekit/contrib/source/brat/opinions/converter.py +0 -19
  124. arekit/contrib/source/brat/relation.py +0 -32
  125. arekit/contrib/source/brat/sentence.py +0 -69
  126. arekit/contrib/source/brat/sentences_reader.py +0 -128
  127. arekit/contrib/source/download.py +0 -41
  128. arekit/contrib/source/nerel/__init__.py +0 -0
  129. arekit/contrib/source/nerel/entities.py +0 -55
  130. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  131. arekit/contrib/source/nerel/folding/fixed.py +0 -75
  132. arekit/contrib/source/nerel/io_utils.py +0 -62
  133. arekit/contrib/source/nerel/labels.py +0 -241
  134. arekit/contrib/source/nerel/reader.py +0 -46
  135. arekit/contrib/source/nerel/utils.py +0 -24
  136. arekit/contrib/source/nerel/versions.py +0 -12
  137. arekit/contrib/source/nerelbio/__init__.py +0 -0
  138. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  139. arekit/contrib/source/nerelbio/labels.py +0 -265
  140. arekit/contrib/source/nerelbio/reader.py +0 -8
  141. arekit/contrib/source/nerelbio/versions.py +0 -8
  142. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  143. arekit/contrib/source/ruattitudes/collection.py +0 -36
  144. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  145. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  146. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  147. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  148. arekit/contrib/source/ruattitudes/news.py +0 -51
  149. arekit/contrib/source/ruattitudes/news_brat.py +0 -44
  150. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  151. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  152. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  153. arekit/contrib/source/ruattitudes/reader.py +0 -268
  154. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  155. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  156. arekit/contrib/source/ruattitudes/text_object.py +0 -57
  157. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  158. arekit/contrib/source/rusentiframes/collection.py +0 -157
  159. arekit/contrib/source/rusentiframes/effect.py +0 -24
  160. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  161. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  162. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  163. arekit/contrib/source/rusentiframes/role.py +0 -15
  164. arekit/contrib/source/rusentiframes/state.py +0 -24
  165. arekit/contrib/source/rusentiframes/types.py +0 -42
  166. arekit/contrib/source/rusentiframes/value.py +0 -2
  167. arekit/contrib/source/rusentrel/__init__.py +0 -0
  168. arekit/contrib/source/rusentrel/const.py +0 -3
  169. arekit/contrib/source/rusentrel/entities.py +0 -26
  170. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  171. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  172. arekit/contrib/source/rusentrel/news_reader.py +0 -51
  173. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  174. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  175. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  176. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  177. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  178. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  179. arekit/contrib/source/sentinerel/__init__.py +0 -0
  180. arekit/contrib/source/sentinerel/entities.py +0 -52
  181. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  182. arekit/contrib/source/sentinerel/folding/factory.py +0 -32
  183. arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
  184. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  185. arekit/contrib/source/sentinerel/labels.py +0 -53
  186. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  187. arekit/contrib/source/sentinerel/reader.py +0 -42
  188. arekit/contrib/source/synonyms/__init__.py +0 -0
  189. arekit/contrib/source/synonyms/utils.py +0 -19
  190. arekit/contrib/source/zip_utils.py +0 -47
  191. arekit/contrib/utils/bert/rows.py +0 -0
  192. arekit/contrib/utils/bert/text_b_rus.py +0 -18
  193. arekit/contrib/utils/connotations/__init__.py +0 -0
  194. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  195. arekit/contrib/utils/cv/__init__.py +0 -0
  196. arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
  197. arekit/contrib/utils/cv/doc_stat/base.py +0 -37
  198. arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
  199. arekit/contrib/utils/cv/splitters/__init__.py +0 -0
  200. arekit/contrib/utils/cv/splitters/base.py +0 -4
  201. arekit/contrib/utils/cv/splitters/default.py +0 -53
  202. arekit/contrib/utils/cv/splitters/statistical.py +0 -57
  203. arekit/contrib/utils/cv/two_class.py +0 -77
  204. arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
  205. arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
  206. arekit/contrib/utils/data/ext.py +0 -31
  207. arekit/contrib/utils/data/views/__init__.py +0 -0
  208. arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
  209. arekit/contrib/utils/data/views/linkages/base.py +0 -58
  210. arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
  211. arekit/contrib/utils/data/views/linkages/utils.py +0 -24
  212. arekit/contrib/utils/data/views/opinions.py +0 -14
  213. arekit/contrib/utils/download.py +0 -78
  214. arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
  215. arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
  216. arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
  217. arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
  218. arekit/contrib/utils/io_utils/opinions.py +0 -39
  219. arekit/contrib/utils/io_utils/samples.py +0 -78
  220. arekit/contrib/utils/lexicons/__init__.py +0 -0
  221. arekit/contrib/utils/lexicons/lexicon.py +0 -43
  222. arekit/contrib/utils/lexicons/relation.py +0 -45
  223. arekit/contrib/utils/lexicons/rusentilex.py +0 -34
  224. arekit/contrib/utils/nn/__init__.py +0 -0
  225. arekit/contrib/utils/nn/rows.py +0 -83
  226. arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
  227. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  228. arekit/contrib/utils/pipelines/items/to_output.py +0 -101
  229. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  230. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  231. arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
  232. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
  233. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  234. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  235. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
  236. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
  237. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  238. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  239. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
  240. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
  241. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
  242. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  243. arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
  244. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
  245. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  246. arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
  247. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  248. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
  249. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  250. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  251. arekit/contrib/utils/resources.py +0 -26
  252. arekit/contrib/utils/sources/__init__.py +0 -0
  253. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  254. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  255. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  256. arekit/contrib/utils/utils_folding.py +0 -19
  257. arekit/download_data.py +0 -11
  258. arekit-0.23.1.dist-info/METADATA +0 -23
  259. arekit-0.23.1.dist-info/RECORD +0 -403
  260. /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
  261. /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
  262. /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
  263. /arekit/common/{news → docs}/parsed/term_position.py +0 -0
  264. /arekit/common/{news/parsed → service}/__init__.py +0 -0
  265. /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
  266. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
  267. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
@@ -1,54 +0,0 @@
1
- from arekit.common.opinions.provider import OpinionCollectionsProvider
2
- from arekit.common.labels.str_fmt import StringLabelsFormatter
3
- from arekit.contrib.source.rusentrel.opinions.converter import OpinionConverter
4
-
5
-
6
- class RuSentRelOpinionCollectionProvider(OpinionCollectionsProvider):
7
-
8
- @staticmethod
9
- def _iter_opinions_from_file(input_file, labels_formatter, error_on_non_supported):
10
- assert(isinstance(labels_formatter, StringLabelsFormatter))
11
- assert(isinstance(error_on_non_supported, bool))
12
-
13
- for line in input_file.readlines():
14
-
15
- # Force perform decoding if needed.
16
- if isinstance(line, bytes):
17
- line = line.decode()
18
-
19
- if line == '\n':
20
- continue
21
-
22
- str_opinion = OpinionConverter.try_from_string(
23
- line=line,
24
- labels_formatter=labels_formatter)
25
-
26
- if str_opinion is None:
27
- if error_on_non_supported:
28
- raise Exception("Line '{line}' has non supported label")
29
- else:
30
- continue
31
-
32
- yield str_opinion
33
-
34
- # region public methods
35
-
36
- def iter_opinions(self, source, encoding, labels_formatter, error_on_non_supported=True):
37
- """
38
- Important: For externally saved collections (using save_to_file method) and related usage
39
- """
40
- assert(isinstance(source, str))
41
- assert(isinstance(labels_formatter, StringLabelsFormatter))
42
- assert(isinstance(error_on_non_supported, bool))
43
-
44
- with open(source, 'r', encoding=encoding) as input_file:
45
-
46
- it = RuSentRelOpinionCollectionProvider._iter_opinions_from_file(
47
- input_file=input_file,
48
- labels_formatter=labels_formatter,
49
- error_on_non_supported=error_on_non_supported)
50
-
51
- for opinion in it:
52
- yield opinion
53
-
54
- # endregion
@@ -1,42 +0,0 @@
1
- import io
2
-
3
- from arekit.common.labels.str_fmt import StringLabelsFormatter
4
- from arekit.common.opinions.base import Opinion
5
- from arekit.common.opinions.collection import OpinionCollection
6
- from arekit.common.opinions.writer import OpinionCollectionWriter
7
- from arekit.common.utils import create_dir_if_not_exists
8
- from arekit.contrib.source.rusentrel.opinions.converter import OpinionConverter
9
-
10
-
11
- class RuSentRelOpinionCollectionWriter(OpinionCollectionWriter):
12
-
13
- def serialize(self, collection, target, encoding, labels_formatter, error_on_non_supported=True):
14
- assert(isinstance(collection, OpinionCollection))
15
- assert(isinstance(target, str))
16
- assert(isinstance(labels_formatter, StringLabelsFormatter))
17
- assert(isinstance(error_on_non_supported, bool))
18
-
19
- def __opinion_key(opinion):
20
- assert (isinstance(opinion, Opinion))
21
- return opinion.SourceValue + opinion.TargetValue
22
-
23
- sorted_ops = sorted(collection, key=__opinion_key)
24
-
25
- create_dir_if_not_exists(target)
26
-
27
- with io.open(target, 'w', encoding=encoding) as f:
28
- for o in sorted_ops:
29
-
30
- str_value = OpinionConverter.try_to_string(
31
- opinion=o,
32
- labels_formatter=labels_formatter)
33
-
34
- if str_value is None:
35
- if error_on_non_supported:
36
- raise Exception("Opinion label `{label}` is not supported by formatter".format(
37
- label=o.Sentiment))
38
- else:
39
- continue
40
-
41
- f.write(str_value)
42
- f.write('\n')
@@ -1,17 +0,0 @@
1
- from arekit.contrib.source.synonyms.utils import iter_synonym_groups
2
- from arekit.contrib.source.rusentrel.io_utils import RuSentRelIOUtils
3
-
4
-
5
- class RuSentRelSynonymsCollectionHelper(object):
6
-
7
- @staticmethod
8
- def iter_groups(version):
9
- it = RuSentRelIOUtils.iter_from_zip(
10
- inner_path=RuSentRelIOUtils.get_synonyms_innerpath(),
11
- process_func=lambda input_file: iter_synonym_groups(
12
- input_file,
13
- desc="Loading RuSentRel Collection"),
14
- version=version)
15
-
16
- for group in it:
17
- yield group
File without changes
@@ -1,52 +0,0 @@
1
- from arekit.common.entities.collection import EntityCollection
2
- from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
3
- from arekit.contrib.source.brat.annot import BratAnnotationParser
4
- from arekit.contrib.source.brat.entities.entity import BratEntity
5
- from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils
6
- from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
7
- from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection
8
-
9
-
10
- class SentiNerelEntityCollection(EntityCollection):
11
-
12
- def __init__(self, contents, value_to_group_id_func, entities_to_ignore=None):
13
- """
14
- entities_to_ignore: list or None
15
- this parameter is required because of the simplified implmentation of
16
- the nested objects of the BRAT annotation.
17
- """
18
- assert(isinstance(contents, dict))
19
- assert(BratAnnotationParser.ENTITIES in contents)
20
- assert(isinstance(entities_to_ignore, list) or entities_to_ignore is None)
21
-
22
- self.__dicard_entities = set([] if entities_to_ignore is None else entities_to_ignore)
23
- contents[BratAnnotationParser.ENTITIES] = [e for e in contents[BratAnnotationParser.ENTITIES]
24
- if self.__keep_entity(e)]
25
-
26
- super(SentiNerelEntityCollection, self).__init__(
27
- entities=contents[BratAnnotationParser.ENTITIES],
28
- value_to_group_id_func=value_to_group_id_func)
29
-
30
- self._sort_entities(key=lambda entity: entity.IndexBegin)
31
-
32
- def __keep_entity(self, entity):
33
- assert(isinstance(entity, BratEntity))
34
- return entity.Type not in self.__dicard_entities
35
-
36
- @classmethod
37
- def read_collection(cls, filename, version, entities_to_ignore=None):
38
- assert(isinstance(filename, str))
39
-
40
- # Since this dataset does not provide the synonyms collection by default,
41
- # it is necessary to declare an empty collection to populate so in further.
42
- synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)
43
-
44
- return SentiNerelIOUtils.read_from_zip(
45
- inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
46
- process_func=lambda input_file: cls(
47
- contents=BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig'),
48
- entities_to_ignore=entities_to_ignore,
49
- value_to_group_id_func=lambda value:
50
- SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
51
- synonyms, value)),
52
- version=version)
File without changes
@@ -1,32 +0,0 @@
1
- from arekit.contrib.source.sentinerel.folding.fixed import create_fixed_folding
2
-
3
-
4
- class SentiNERELFoldingFactory:
5
- """ Factory of the variety types of the splits that
6
- are considered within the present experiments.
7
- """
8
-
9
- @staticmethod
10
- def create_fixed_folding(file, limit=None):
11
- """
12
- limit: int
13
- Allows to limit amount of documents (utilized for testing reasons)
14
- """
15
-
16
- train_filenames, test_filenames = SentiNERELFoldingFactory._read_train_test(f=file)
17
- if limit is not None:
18
- train_filenames = train_filenames[:limit]
19
- test_filenames = test_filenames[:limit]
20
- filenames_by_ids, data_folding = create_fixed_folding(train_filenames=train_filenames,
21
- test_filenames=test_filenames)
22
-
23
- return filenames_by_ids, data_folding
24
-
25
- @staticmethod
26
- def _read_train_test(f):
27
- parts = []
28
- for line in f.readlines():
29
- if isinstance(line, bytes):
30
- line = line.decode('utf-8')
31
- parts.append(line.strip().split(','))
32
- return parts[0], parts[1]
@@ -1,73 +0,0 @@
1
- from collections import OrderedDict
2
-
3
- from arekit.common.experiment.data_type import DataType
4
- from arekit.common.folding.fixed import FixedFolding
5
-
6
-
7
- def create_fixed_folding(train_filenames, test_filenames):
8
- """ Create fixed data-folding based on the predefined list of filenames,
9
- written in file.
10
- """
11
- assert(isinstance(train_filenames, list))
12
- assert(isinstance(test_filenames, list))
13
-
14
- filenames_by_ids = create_filenames_by_ids(filenames=train_filenames + test_filenames)
15
-
16
- ids_by_filenames = {}
17
- for doc_id, filename in filenames_by_ids.items():
18
- ids_by_filenames[filename] = doc_id
19
-
20
- train_doc_ids = [ids_by_filenames[filename] for filename in train_filenames]
21
- test_doc_ids = [ids_by_filenames[filename] for filename in test_filenames]
22
-
23
- fixed_folding = FixedFolding.from_parts({
24
- DataType.Train: train_doc_ids,
25
- DataType.Test: test_doc_ids,
26
- DataType.Etalon: test_doc_ids,
27
- DataType.Dev: test_doc_ids
28
- })
29
-
30
- return filenames_by_ids, fixed_folding
31
-
32
-
33
- def create_filenames_by_ids(filenames):
34
- """ Indexing filenames
35
- """
36
-
37
- def __create_new_id(default_id):
38
- new_id = default_id
39
- while new_id in filenames_by_ids:
40
- new_id += 1
41
- return new_id
42
-
43
- default_id = 0
44
-
45
- filenames_by_ids = OrderedDict()
46
- for fname in filenames:
47
-
48
- doc_id = number_from_string(fname)
49
-
50
- if doc_id is None:
51
- doc_id = __create_new_id(default_id)
52
- default_id = doc_id
53
-
54
- filenames_by_ids[doc_id] = fname
55
-
56
- return filenames_by_ids
57
-
58
-
59
- def number_from_string(s):
60
- assert(isinstance(s, str))
61
-
62
- digit_chars_prefix = []
63
-
64
- for chr in s:
65
- if chr.isdigit():
66
- digit_chars_prefix.append(chr)
67
- else:
68
- break
69
-
70
- if len(digit_chars_prefix) == 0:
71
- return None
72
-
73
- return int("".join(digit_chars_prefix))
@@ -1,87 +0,0 @@
1
- from enum import Enum
2
- from os import path
3
- from os.path import basename, join
4
-
5
- import enum
6
-
7
- from arekit.contrib.source.sentinerel.folding.factory import SentiNERELFoldingFactory
8
- from arekit.contrib.source.zip_utils import ZipArchiveUtils
9
-
10
-
11
- class SentiNerelVersions(Enum):
12
- """ List of the supported version of this collection
13
- """
14
-
15
- # Initial version.
16
- V1 = "v1_0"
17
- # Updated annotation within the second half of the texts. (September 2022)
18
- V2 = "v2_0"
19
- # Updated annotation within the first half of the texts. (October 2022)
20
- # Become a source of the RuSentNE-2023 competition.
21
- # https://github.com/dialogue-evaluation/RuSentNE-evaluation
22
- V21 = "v2_1"
23
-
24
-
25
- DEFAULT_VERSION = SentiNerelVersions.V21
26
-
27
-
28
- class SentiNerelIOUtils(ZipArchiveUtils):
29
-
30
- inner_root = "sentiment_dataset"
31
-
32
- @staticmethod
33
- def get_archive_filepath(version):
34
- return path.join(SentiNerelIOUtils.get_data_root(), "sentinerel-{}.zip".format(version))
35
-
36
- @staticmethod
37
- def get_annotation_innerpath(filename):
38
- assert(isinstance(filename, str))
39
- return path.join(SentiNerelIOUtils.inner_root, "{}.ann".format(filename))
40
-
41
- @staticmethod
42
- def get_news_innerpath(filename):
43
- assert(isinstance(filename, str))
44
- return path.join(SentiNerelIOUtils.inner_root, "{}.txt".format(filename))
45
-
46
- @staticmethod
47
- def __iter_filenames_from_dataset(folder_name, version):
48
- assert(isinstance(version, enum.Enum))
49
- assert(isinstance(folder_name, str))
50
-
51
- for filename in SentiNerelIOUtils.iter_filenames_from_zip(version):
52
-
53
- extension = filename[-4:]
54
-
55
- # Crop extension.
56
- filename = filename[:-4]
57
-
58
- if extension != ".txt":
59
- continue
60
-
61
- if not folder_name in filename:
62
- continue
63
-
64
- yield basename(filename)
65
-
66
- # region public methods
67
-
68
- @staticmethod
69
- def iter_collection_filenames(version=DEFAULT_VERSION):
70
- filenames_it = SentiNerelIOUtils.__iter_filenames_from_dataset(
71
- folder_name=SentiNerelIOUtils.inner_root, version=version)
72
-
73
- for doc_id, filename in enumerate(filenames_it):
74
- yield doc_id, filename
75
-
76
- @staticmethod
77
- def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
78
- """ Provides a fixed split of the dataset onto
79
- `test` and `training` part:
80
- https://github.com/nicolay-r/SentiNEREL-attitude-extraction
81
- """
82
- return SentiNerelIOUtils.read_from_zip(
83
- inner_path=join(SentiNerelIOUtils.inner_root, "split_fixed.txt"),
84
- process_func=lambda f: SentiNERELFoldingFactory.create_fixed_folding(file=f, limit=docs_limit),
85
- version=version)
86
-
87
- # endregion
@@ -1,53 +0,0 @@
1
- from arekit.common.labels.base import Label
2
-
3
-
4
- class OpinionBelongsTo(Label):
5
- pass
6
-
7
-
8
- class OpinionRelatesTo(Label):
9
- pass
10
-
11
-
12
- class NegEffectFrom(Label):
13
- pass
14
-
15
-
16
- class NegStateFrom(Label):
17
- pass
18
-
19
-
20
- class PosEffectFrom(Label):
21
- pass
22
-
23
-
24
- class PosAuthorFrom(Label):
25
- pass
26
-
27
-
28
- class NegAuthorFrom(Label):
29
- pass
30
-
31
-
32
- class PosStateFrom(Label):
33
- pass
34
-
35
-
36
- class NegativeTo(Label):
37
- pass
38
-
39
-
40
- class PositiveTo(Label):
41
- pass
42
-
43
-
44
- class AlternativeName(Label):
45
- pass
46
-
47
-
48
- class StateBelongsTo(Label):
49
- pass
50
-
51
-
52
- class OriginsFrom(Label):
53
- pass
@@ -1,30 +0,0 @@
1
- from collections import OrderedDict
2
-
3
- from arekit.common.labels.scaler.base import BaseLabelScaler
4
- from arekit.contrib.source.sentinerel import labels
5
-
6
-
7
- class SentiNerelLabelScaler(BaseLabelScaler):
8
- """ This is a complete label scaler of all the labels supported by NEREL dataset.
9
- """
10
-
11
- def __init__(self):
12
-
13
- self.__uint_to_label_dict = OrderedDict([
14
- (labels.OpinionBelongsTo(), 0),
15
- (labels.OpinionRelatesTo(), 1),
16
- (labels.NegEffectFrom(), 2),
17
- (labels.PosEffectFrom(), 3),
18
- (labels.NegStateFrom(), 4),
19
- (labels.PosStateFrom(), 5),
20
- (labels.NegativeTo(), 6),
21
- (labels.PositiveTo(), 7),
22
- (labels.StateBelongsTo(), 8),
23
- (labels.PosAuthorFrom(), 9),
24
- (labels.NegAuthorFrom(), 10),
25
- (labels.AlternativeName(), 11),
26
- (labels.OriginsFrom(), 12)
27
- ])
28
-
29
- super(SentiNerelLabelScaler, self).__init__(int_dict=self.__uint_to_label_dict,
30
- uint_dict=self.__uint_to_label_dict)
@@ -1,42 +0,0 @@
1
- from arekit.contrib.source.brat.annot import BratAnnotationParser
2
- from arekit.contrib.source.brat.news import BratNews
3
- from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
4
- from arekit.contrib.source.sentinerel.entities import SentiNerelEntityCollection
5
- from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils, DEFAULT_VERSION
6
-
7
-
8
- class SentiNerelDocReader(object):
9
-
10
- @staticmethod
11
- def read_text_relations(filename, version):
12
- assert(isinstance(filename, str))
13
-
14
- return SentiNerelIOUtils.read_from_zip(
15
- inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
16
- process_func=lambda input_file: [
17
- relation for relation in BratAnnotationParser.parse_annotations(
18
- input_file=input_file, encoding='utf-8-sig')["relations"]],
19
- version=version)
20
-
21
- @staticmethod
22
- def read_document(filename, doc_id, version=DEFAULT_VERSION, entities_to_ignore=None):
23
- assert(isinstance(filename, str))
24
- assert(isinstance(doc_id, int))
25
-
26
- def file_to_doc(input_file):
27
- sentences = BratDocumentSentencesReader.from_file(input_file=input_file, entities=entities)
28
- return BratNews(doc_id=doc_id, sentences=sentences, text_relations=text_relations)
29
-
30
- # TODO. #398 issue -- in some cases entities might be nested. Therefore we limit the set
31
- # TODO. of the potential named entities.
32
- eti = ["EFFECT_NEG", "EFFECT_POS", "ARGUMENT_NEG", "ARGUMENT_POS", "EVENT"] \
33
- if entities_to_ignore is None else entities_to_ignore
34
-
35
- entities = SentiNerelEntityCollection.read_collection(
36
- filename=filename, version=version, entities_to_ignore=eti)
37
- text_relations = SentiNerelDocReader.read_text_relations(filename=filename, version=version)
38
-
39
- return SentiNerelIOUtils.read_from_zip(
40
- inner_path=SentiNerelIOUtils.get_news_innerpath(filename=filename),
41
- process_func=file_to_doc,
42
- version=version)
File without changes
@@ -1,19 +0,0 @@
1
- from arekit.common.utils import progress_bar_defined
2
-
3
-
4
def iter_synonym_groups(input_file, sep=",", desc=""):
    """ Yields synonym groups, one group per line of `input_file`.

        input_file: object providing `readlines()`; all the lines are read at once
            so that the total amount is known to the progress bar.
        sep: separator of the values within a single line.
        desc: description forwarded to the progress bar.

        Every yielded group is a list of str. Lines of the bytes type are decoded
        with the default codec first. Lines are split as is, i.e. the line terminator
        (if any) remains a part of the last value of the group.
    """
    raw_lines = input_file.readlines()

    tracked_lines = progress_bar_defined(raw_lines,
                                         total=len(raw_lines),
                                         desc=desc,
                                         unit="opins")

    for entry in tracked_lines:
        text = entry.decode() if isinstance(entry, bytes) else entry
        yield text.split(sep)
@@ -1,47 +0,0 @@
1
- import zipfile
2
-
3
- import enum
4
-
5
- from arekit.common import utils
6
-
7
-
8
class ZipArchiveUtils(object):
    """ Base helper for reading the contents of a versioned zip archive.
        The archive location is resolved by `get_archive_filepath`,
        which raises NotImplementedError in this base class.
    """

    @staticmethod
    def get_archive_filepath(version):
        """ version: the `value` of the version enum member.
            returns: path to the zip archive of the related version.
        """
        raise NotImplementedError()

    @classmethod
    def _open_archive(cls, version):
        """ Opens the archive of the given version (enum member) for reading.
            Shared by all the reading methods; the caller is responsible for closing.
        """
        assert(isinstance(version, enum.Enum))
        return zipfile.ZipFile(cls.get_archive_filepath(version.value), "r")

    @classmethod
    def read_from_zip(cls, inner_path, process_func, version):
        """
        inner_path: str
            path of the file inside the archive.
        process_func:
            func which receives a file reader (binary mode)
        version: enum.Enum
            its `value` is passed to `get_archive_filepath`.
        returns:
            the result of `process_func`; the reader is closed once this method
            returns, so the result should not depend on it being open.
        """
        assert(isinstance(inner_path, str))
        assert(callable(process_func))

        with cls._open_archive(version) as zip_ref:
            with zip_ref.open(inner_path, mode='r') as c_file:
                return process_func(c_file)

    @classmethod
    def iter_from_zip(cls, inner_path, process_func, version):
        """ Lazy variant of `read_from_zip`: yields the items of the iterable
            returned by `process_func`.

            This is a generator: the arguments are checked and the archive is opened
            only when the iteration starts, and the archive remains open until the
            iteration is finished.
        """
        assert(isinstance(inner_path, str))
        assert(callable(process_func))

        with cls._open_archive(version) as zip_ref:
            with zip_ref.open(inner_path, mode='r') as c_file:
                yield from process_func(c_file)

    @classmethod
    def iter_filenames_from_zip(cls, version):
        """ returns: iterator over the names of the archive members.
        """
        with cls._open_archive(version) as zip_ref:
            # `namelist` provides a list, hence the iterator
            # remains valid after the archive is closed.
            return iter(zip_ref.namelist())

    @staticmethod
    def get_data_root():
        return utils.get_default_download_dir()
File without changes
@@ -1,18 +0,0 @@
1
- from enum import Enum
2
-
3
-
4
class BertTextBRussianPrompts(Enum):
    """
    Russian templates of the extra `text_b` input (default, based on COLA).
    None of the templates mentions S.P (S.P -- sentiment polarity):
        NLI -- text_b is a pseudo-sentence;
        QA -- text_b is a question.

    Every template declares the `subject`, `object` and `context` placeholders.

    Multilabel variant

    The notation follows the paper:
    https://www.aclweb.org/anthology/N19-1035.pdf
    """

    NLI = "{subject} к {object} в контексте : << {context} >>"

    QA = "Что вы думаете по поводу отношения {subject} к {object} в контексте : << {context} >> ?"
File without changes
@@ -1,23 +0,0 @@
1
- from arekit.common.frames.connotations.provider import FrameConnotationProvider
2
- from arekit.contrib.source.rusentiframes.collection import RuSentiFramesCollection
3
-
4
-
5
class RuSentiFramesConnotationProvider(FrameConnotationProvider):
    """ Connotation provider which relies on the A0->A1 label type
        of the RuSentiFrames collection.
        The collection itself is described at:
        https://github.com/nicolay-r/RuSentiFrames

        Papers:
            [1] Natalia Loukachevitch, Nicolay Rusnachenko: Sentiment Frames
                for Attitude Extraction in Russian, 2020
            [2] Distant Supervision for Sentiment Attitude Extraction, 2019
    """

    def __init__(self, collection):
        assert isinstance(collection, RuSentiFramesCollection)
        self.__collection = collection

    def try_provide(self, frame_id):
        """ Queries the collection for the polarity of the frame `frame_id`,
            directed from the 'a0' role towards the 'a1' role.
        """
        polarity = self.__collection.try_get_frame_polarity(
            frame_id=frame_id, role_src='a0', role_dest='a1')
        return polarity
File without changes
File without changes
@@ -1,37 +0,0 @@
1
class BaseDocumentStatGenerator(object):
    """
    Provides statistic on certain document.
    Abstract, considered a specific implementation for document processing operation:
    the `_calc` method is expected to be overridden.
    """

    def __init__(self, doc_reader_func):
        """
        doc_reader_func: func (doc_id) -> document
            assumes to provide a document by a certain doc_id
        """
        assert(callable(doc_reader_func))
        self.__doc_reader_func = doc_reader_func

    # region abstract protected methods

    def _calc(self, news):
        """ Abstract method that provides quantitative statistic
            for a particular document (`news`).
        """
        raise NotImplementedError()

    # endregion

    # region public methods

    def calculate(self, doc_ids_iter):
        """ Reads every document listed in `doc_ids_iter` and measures it with `_calc`.

            doc_ids_iter: iterable of document identifiers.
            returns: list of (doc_id, statistic) pairs, in the order of iteration.
        """
        return [(doc_id, self._calc(self.__doc_reader_func(doc_id)))
                for doc_id in doc_ids_iter]

    # endregion
@@ -1,12 +0,0 @@
1
- from arekit.common.news.base import News
2
- from arekit.contrib.utils.cv.doc_stat.base import BaseDocumentStatGenerator
3
-
4
-
5
class SentenceBasedDocumentStatGenerator(BaseDocumentStatGenerator):
    """ Statistic generator in which a document is measured
        by its `SentencesCount` property.
    """

    def __init__(self, doc_reader_func):
        super().__init__(doc_reader_func)

    def _calc(self, news):
        assert isinstance(news, News)
        sentences_count = news.SentencesCount
        return sentences_count
File without changes
@@ -1,4 +0,0 @@
1
class CrossValidationSplitter(object):
    """ Declares the `items_to_cv_pairs` operation.
        NOTE(review): looks like an interface whose implementations
        are provided by subclasses -- confirm.
    """

    def items_to_cv_pairs(self, doc_ids, cv_count):
        """ Placeholder without any logic: the arguments are ignored
            and None is returned.
        """
        return None