arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267) hide show
  1. arekit/common/context/terms_mapper.py +2 -2
  2. arekit/common/data/const.py +5 -4
  3. arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
  4. arekit/common/data/input/providers/columns/sample.py +6 -1
  5. arekit/common/data/input/providers/instances/base.py +1 -1
  6. arekit/common/data/input/providers/rows/base.py +36 -13
  7. arekit/common/data/input/providers/rows/samples.py +57 -55
  8. arekit/common/data/input/providers/sample/cropped.py +2 -2
  9. arekit/common/data/input/sample.py +1 -1
  10. arekit/common/data/rows_fmt.py +82 -0
  11. arekit/common/data/rows_parser.py +43 -0
  12. arekit/common/data/storages/base.py +23 -18
  13. arekit/common/data/views/samples.py +2 -8
  14. arekit/common/{news → docs}/base.py +2 -2
  15. arekit/common/{news → docs}/entities_grouping.py +2 -1
  16. arekit/common/{news → docs}/entity.py +2 -1
  17. arekit/common/{news → docs}/parsed/base.py +5 -5
  18. arekit/common/docs/parsed/providers/base.py +68 -0
  19. arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
  20. arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
  21. arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
  22. arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
  23. arekit/common/docs/parsed/service.py +31 -0
  24. arekit/common/docs/parser.py +66 -0
  25. arekit/common/{news → docs}/sentence.py +1 -1
  26. arekit/common/entities/base.py +11 -2
  27. arekit/common/experiment/api/base_samples_io.py +1 -1
  28. arekit/common/frames/variants/collection.py +2 -2
  29. arekit/common/linkage/base.py +2 -2
  30. arekit/common/linkage/meta.py +23 -0
  31. arekit/common/linkage/opinions.py +1 -1
  32. arekit/common/linkage/text_opinions.py +2 -2
  33. arekit/common/opinions/annot/algo/base.py +1 -1
  34. arekit/common/opinions/annot/algo/pair_based.py +15 -13
  35. arekit/common/opinions/annot/algo/predefined.py +4 -4
  36. arekit/common/opinions/annot/algo_based.py +5 -5
  37. arekit/common/opinions/annot/base.py +3 -3
  38. arekit/common/opinions/base.py +7 -7
  39. arekit/common/opinions/collection.py +3 -3
  40. arekit/common/pipeline/base.py +12 -16
  41. arekit/common/pipeline/batching.py +28 -0
  42. arekit/common/pipeline/context.py +5 -1
  43. arekit/common/pipeline/items/base.py +38 -1
  44. arekit/common/pipeline/items/flatten.py +5 -1
  45. arekit/common/pipeline/items/handle.py +2 -1
  46. arekit/common/pipeline/items/iter.py +2 -1
  47. arekit/common/pipeline/items/map.py +2 -1
  48. arekit/common/pipeline/items/map_nested.py +4 -0
  49. arekit/common/pipeline/utils.py +32 -0
  50. arekit/common/service/sqlite.py +36 -0
  51. arekit/common/synonyms/base.py +2 -2
  52. arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
  53. arekit/common/text_opinions/base.py +11 -11
  54. arekit/common/utils.py +33 -46
  55. arekit/contrib/networks/embedding.py +3 -3
  56. arekit/contrib/networks/embedding_io.py +5 -5
  57. arekit/contrib/networks/input/const.py +0 -2
  58. arekit/contrib/networks/input/providers/sample.py +15 -29
  59. arekit/contrib/networks/input/rows_parser.py +47 -134
  60. arekit/contrib/prompt/sample.py +18 -16
  61. arekit/contrib/utils/data/contents/opinions.py +17 -5
  62. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  63. arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
  64. arekit/contrib/utils/data/readers/base.py +3 -0
  65. arekit/contrib/utils/data/readers/csv_pd.py +10 -4
  66. arekit/contrib/utils/data/readers/jsonl.py +3 -0
  67. arekit/contrib/utils/data/readers/sqlite.py +14 -0
  68. arekit/contrib/utils/data/service/balance.py +0 -1
  69. arekit/contrib/utils/data/storages/pandas_based.py +3 -5
  70. arekit/contrib/utils/data/storages/row_cache.py +18 -6
  71. arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
  72. arekit/contrib/utils/data/writers/base.py +5 -0
  73. arekit/contrib/utils/data/writers/csv_native.py +3 -0
  74. arekit/contrib/utils/data/writers/csv_pd.py +3 -0
  75. arekit/contrib/utils/data/writers/json_opennre.py +31 -13
  76. arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
  77. arekit/contrib/utils/io_utils/embedding.py +25 -33
  78. arekit/contrib/utils/io_utils/utils.py +3 -24
  79. arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
  80. arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
  81. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  82. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  83. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
  84. arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
  85. arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
  86. arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
  87. arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
  88. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
  89. arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
  90. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
  91. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
  92. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
  93. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
  94. arekit/contrib/utils/serializer.py +4 -23
  95. arekit-0.25.0.data/data/logo.png +0 -0
  96. arekit-0.25.0.dist-info/METADATA +82 -0
  97. arekit-0.25.0.dist-info/RECORD +259 -0
  98. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
  99. arekit/common/data/row_ids/base.py +0 -79
  100. arekit/common/data/row_ids/binary.py +0 -38
  101. arekit/common/data/row_ids/multiple.py +0 -14
  102. arekit/common/folding/base.py +0 -36
  103. arekit/common/folding/fixed.py +0 -42
  104. arekit/common/folding/nofold.py +0 -15
  105. arekit/common/folding/united.py +0 -46
  106. arekit/common/news/objects_parser.py +0 -37
  107. arekit/common/news/parsed/providers/base.py +0 -48
  108. arekit/common/news/parsed/service.py +0 -31
  109. arekit/common/news/parser.py +0 -34
  110. arekit/common/text/parser.py +0 -12
  111. arekit/common/text/partitioning/__init__.py +0 -0
  112. arekit/common/text/partitioning/base.py +0 -4
  113. arekit/common/text/partitioning/terms.py +0 -35
  114. arekit/contrib/source/__init__.py +0 -0
  115. arekit/contrib/source/brat/__init__.py +0 -0
  116. arekit/contrib/source/brat/annot.py +0 -83
  117. arekit/contrib/source/brat/entities/__init__.py +0 -0
  118. arekit/contrib/source/brat/entities/compound.py +0 -33
  119. arekit/contrib/source/brat/entities/entity.py +0 -42
  120. arekit/contrib/source/brat/entities/parser.py +0 -53
  121. arekit/contrib/source/brat/news.py +0 -28
  122. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  123. arekit/contrib/source/brat/opinions/converter.py +0 -19
  124. arekit/contrib/source/brat/relation.py +0 -32
  125. arekit/contrib/source/brat/sentence.py +0 -69
  126. arekit/contrib/source/brat/sentences_reader.py +0 -128
  127. arekit/contrib/source/download.py +0 -41
  128. arekit/contrib/source/nerel/__init__.py +0 -0
  129. arekit/contrib/source/nerel/entities.py +0 -55
  130. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  131. arekit/contrib/source/nerel/folding/fixed.py +0 -75
  132. arekit/contrib/source/nerel/io_utils.py +0 -62
  133. arekit/contrib/source/nerel/labels.py +0 -241
  134. arekit/contrib/source/nerel/reader.py +0 -46
  135. arekit/contrib/source/nerel/utils.py +0 -24
  136. arekit/contrib/source/nerel/versions.py +0 -12
  137. arekit/contrib/source/nerelbio/__init__.py +0 -0
  138. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  139. arekit/contrib/source/nerelbio/labels.py +0 -265
  140. arekit/contrib/source/nerelbio/reader.py +0 -8
  141. arekit/contrib/source/nerelbio/versions.py +0 -8
  142. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  143. arekit/contrib/source/ruattitudes/collection.py +0 -36
  144. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  145. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  146. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  147. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  148. arekit/contrib/source/ruattitudes/news.py +0 -51
  149. arekit/contrib/source/ruattitudes/news_brat.py +0 -44
  150. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  151. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  152. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  153. arekit/contrib/source/ruattitudes/reader.py +0 -268
  154. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  155. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  156. arekit/contrib/source/ruattitudes/text_object.py +0 -57
  157. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  158. arekit/contrib/source/rusentiframes/collection.py +0 -157
  159. arekit/contrib/source/rusentiframes/effect.py +0 -24
  160. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  161. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  162. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  163. arekit/contrib/source/rusentiframes/role.py +0 -15
  164. arekit/contrib/source/rusentiframes/state.py +0 -24
  165. arekit/contrib/source/rusentiframes/types.py +0 -42
  166. arekit/contrib/source/rusentiframes/value.py +0 -2
  167. arekit/contrib/source/rusentrel/__init__.py +0 -0
  168. arekit/contrib/source/rusentrel/const.py +0 -3
  169. arekit/contrib/source/rusentrel/entities.py +0 -26
  170. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  171. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  172. arekit/contrib/source/rusentrel/news_reader.py +0 -51
  173. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  174. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  175. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  176. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  177. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  178. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  179. arekit/contrib/source/sentinerel/__init__.py +0 -0
  180. arekit/contrib/source/sentinerel/entities.py +0 -52
  181. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  182. arekit/contrib/source/sentinerel/folding/factory.py +0 -32
  183. arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
  184. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  185. arekit/contrib/source/sentinerel/labels.py +0 -53
  186. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  187. arekit/contrib/source/sentinerel/reader.py +0 -42
  188. arekit/contrib/source/synonyms/__init__.py +0 -0
  189. arekit/contrib/source/synonyms/utils.py +0 -19
  190. arekit/contrib/source/zip_utils.py +0 -47
  191. arekit/contrib/utils/bert/rows.py +0 -0
  192. arekit/contrib/utils/bert/text_b_rus.py +0 -18
  193. arekit/contrib/utils/connotations/__init__.py +0 -0
  194. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  195. arekit/contrib/utils/cv/__init__.py +0 -0
  196. arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
  197. arekit/contrib/utils/cv/doc_stat/base.py +0 -37
  198. arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
  199. arekit/contrib/utils/cv/splitters/__init__.py +0 -0
  200. arekit/contrib/utils/cv/splitters/base.py +0 -4
  201. arekit/contrib/utils/cv/splitters/default.py +0 -53
  202. arekit/contrib/utils/cv/splitters/statistical.py +0 -57
  203. arekit/contrib/utils/cv/two_class.py +0 -77
  204. arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
  205. arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
  206. arekit/contrib/utils/data/ext.py +0 -31
  207. arekit/contrib/utils/data/views/__init__.py +0 -0
  208. arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
  209. arekit/contrib/utils/data/views/linkages/base.py +0 -58
  210. arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
  211. arekit/contrib/utils/data/views/linkages/utils.py +0 -24
  212. arekit/contrib/utils/data/views/opinions.py +0 -14
  213. arekit/contrib/utils/download.py +0 -78
  214. arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
  215. arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
  216. arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
  217. arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
  218. arekit/contrib/utils/io_utils/opinions.py +0 -39
  219. arekit/contrib/utils/io_utils/samples.py +0 -78
  220. arekit/contrib/utils/lexicons/__init__.py +0 -0
  221. arekit/contrib/utils/lexicons/lexicon.py +0 -43
  222. arekit/contrib/utils/lexicons/relation.py +0 -45
  223. arekit/contrib/utils/lexicons/rusentilex.py +0 -34
  224. arekit/contrib/utils/nn/__init__.py +0 -0
  225. arekit/contrib/utils/nn/rows.py +0 -83
  226. arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
  227. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  228. arekit/contrib/utils/pipelines/items/to_output.py +0 -101
  229. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  230. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  231. arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
  232. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
  233. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  234. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  235. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
  236. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
  237. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  238. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  239. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
  240. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
  241. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
  242. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  243. arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
  244. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
  245. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  246. arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
  247. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  248. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
  249. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  250. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  251. arekit/contrib/utils/resources.py +0 -26
  252. arekit/contrib/utils/sources/__init__.py +0 -0
  253. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  254. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  255. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  256. arekit/contrib/utils/utils_folding.py +0 -19
  257. arekit/download_data.py +0 -11
  258. arekit-0.23.1.dist-info/METADATA +0 -23
  259. arekit-0.23.1.dist-info/RECORD +0 -403
  260. /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
  261. /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
  262. /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
  263. /arekit/common/{news → docs}/parsed/term_position.py +0 -0
  264. /arekit/common/{news/parsed → service}/__init__.py +0 -0
  265. /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
  266. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
  267. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
@@ -1,56 +0,0 @@
1
- from os import path
2
-
3
- from enum import Enum
4
-
5
- from arekit.contrib.source.zip_utils import ZipArchiveUtils
6
-
7
-
8
- class RuAttitudesVersions(Enum):
9
- Debug = "dbg"
10
- V10 = "v1_0"
11
- V11 = "v1_1"
12
- V20Base = 'v2_0_base'
13
- V20Large = 'v2_0_large'
14
- V20BaseNeut = 'v2_0_base_neut'
15
- V20LargeNeut = 'v2_0_large_neut'
16
-
17
-
18
- class RuAttitudesVersionsService:
19
-
20
- @staticmethod
21
- def __iter_type_and_names():
22
- for version_type in RuAttitudesVersions:
23
- yield version_type, version_type.value
24
-
25
- @staticmethod
26
- def find_by_name(name):
27
- for version_type, related_name in RuAttitudesVersionsService.__iter_type_and_names():
28
- if name == related_name:
29
- return version_type
30
- raise Exception("Version `{}` does not supported".format(name))
31
-
32
- @staticmethod
33
- def iter_supported_names():
34
- for _, name in RuAttitudesVersionsService.__iter_type_and_names():
35
- yield name
36
-
37
-
38
- class RuAttitudesIOUtils(ZipArchiveUtils):
39
-
40
- # region internal methods
41
-
42
- @staticmethod
43
- def get_archive_filepath(version):
44
- assert(isinstance(version, str))
45
- return path.join(RuAttitudesIOUtils.get_data_root(),
46
- "ruattitudes-{version}.zip".format(version=version))
47
-
48
- @staticmethod
49
- def get_collection_filepath():
50
- return "collection.txt"
51
-
52
- @classmethod
53
- def get_synonyms_innerpath(cls):
54
- return "synonyms.txt"
55
-
56
- # endregion
@@ -1,12 +0,0 @@
1
- from arekit.common.labels.scaler.base import BaseLabelScaler
2
- from arekit.common.labels.str_fmt import StringLabelsFormatter
3
-
4
-
5
- class RuAttitudesLabelFormatter(StringLabelsFormatter):
6
-
7
- def __init__(self, label_scaler):
8
- assert(isinstance(label_scaler, BaseLabelScaler))
9
- stol = {}
10
- for int_label in [-1, 0, 1]:
11
- stol[str(int_label)] = type(label_scaler.int_to_label(int_label))
12
- super(RuAttitudesLabelFormatter, self).__init__(stol=stol)
@@ -1,51 +0,0 @@
1
- from arekit.contrib.source.ruattitudes.sentence import RuAttitudesSentence
2
-
3
-
4
- class RuAttitudesNews(object):
5
-
6
- def __init__(self, sentences, news_index):
7
- assert(len(sentences) > 0)
8
-
9
- self.__sentences = sentences
10
- self.__objects_before_sentence = self.__cache_objects_declared_before()
11
- self.__news_index = news_index
12
-
13
- self.__set_owners()
14
-
15
- # region properties
16
-
17
- @property
18
- def ID(self):
19
- return self.__news_index
20
-
21
- @property
22
- def Title(self):
23
- return self.__sentences[0]
24
-
25
- # endregion
26
-
27
- # region private methods
28
-
29
- def __set_owners(self):
30
- for sentence in self.__sentences:
31
- assert(isinstance(sentence, RuAttitudesSentence))
32
- sentence.set_owner(self)
33
-
34
- def __cache_objects_declared_before(self):
35
- d = {}
36
- before = 0
37
- for s in self.__sentences:
38
- assert(isinstance(s, RuAttitudesSentence))
39
- d[s.SentenceIndex] = before
40
- before += s.ObjectsCount
41
-
42
- return d
43
-
44
- # endregion
45
-
46
- def get_objects_declared_before(self, sentence_index):
47
- return self.__objects_before_sentence[sentence_index]
48
-
49
- def iter_sentences(self):
50
- for sentence in self.__sentences:
51
- yield sentence
@@ -1,44 +0,0 @@
1
- from arekit.contrib.source.brat.news import BratNews
2
- from arekit.contrib.source.brat.sentence import BratSentence
3
- from arekit.contrib.source.ruattitudes.news import RuAttitudesNews
4
- from arekit.contrib.source.ruattitudes.opinions.base import SentenceOpinion
5
- from arekit.contrib.source.ruattitudes.opinions.converter import RuAttitudesSentenceOpinionConverter
6
- from arekit.contrib.source.ruattitudes.sentence import RuAttitudesSentence
7
- from arekit.common.utils import split_by_whitespaces
8
-
9
-
10
- class RuAttitudesNewsConverter(object):
11
- """ Performs conversion to a brat-based representation.
12
- The latter allows then allows to adopt pipelines for TextOpnion extraction.
13
- """
14
-
15
- @staticmethod
16
- def to_brat_news(news):
17
- assert(isinstance(news, RuAttitudesNews))
18
- text_opinions = RuAttitudesNewsConverter.__iter_text_opinions(news=news)
19
- brat_sentences = RuAttitudesNewsConverter.__to_brat_sentences(news.iter_sentences())
20
- return BratNews(doc_id=news.ID,
21
- sentences=brat_sentences,
22
- text_relations=list(text_opinions))
23
-
24
- @staticmethod
25
- def __to_brat_sentences(sentences_iter):
26
- sentences = []
27
- for s in sentences_iter:
28
- assert(isinstance(s, RuAttitudesSentence))
29
- assert(s.Owner is not None)
30
- brat_entities = [obj.to_entity(s.get_doc_level_text_object_id) for obj in s.iter_objects()]
31
- brat_sentence = BratSentence(text=split_by_whitespaces(s.Text), index_begin=0, entities=brat_entities)
32
- sentences.append(brat_sentence)
33
- return sentences
34
-
35
- @staticmethod
36
- def __iter_text_opinions(news):
37
- assert(isinstance(news, RuAttitudesNews))
38
- for sentence in news.iter_sentences():
39
- assert(isinstance(sentence, RuAttitudesSentence))
40
- for sentence_opinion in sentence.iter_sentence_opins():
41
- assert(isinstance(sentence_opinion, SentenceOpinion))
42
- yield RuAttitudesSentenceOpinionConverter.to_brat_relation(
43
- sentence_opinion=sentence_opinion,
44
- end_to_doc_id_func=sentence.get_doc_level_text_object_id)
File without changes
@@ -1,28 +0,0 @@
1
- class SentenceOpinion(object):
2
- """ Provides an opinion within a sentence.
3
- Specific for RuAttitudes collection, as the latter provides
4
- connections within a sentence.
5
- """
6
-
7
- def __init__(self, source_id, target_id, label_int, tag):
8
- assert(isinstance(label_int, int))
9
- self.__label_int = label_int
10
- self.__source_id = source_id
11
- self.__target_id = target_id
12
- self.__tag = tag
13
-
14
- @property
15
- def SourceID(self):
16
- return self.__source_id
17
-
18
- @property
19
- def TargetID(self):
20
- return self.__target_id
21
-
22
- @property
23
- def Label(self):
24
- return self.__label_int
25
-
26
- @property
27
- def Tag(self):
28
- return self.__tag
@@ -1,37 +0,0 @@
1
- from arekit.common.labels.scaler.base import BaseLabelScaler
2
- from arekit.common.opinions.base import Opinion
3
- from arekit.contrib.source.brat.relation import BratRelation
4
- from arekit.contrib.source.ruattitudes.opinions.base import SentenceOpinion
5
-
6
-
7
- class RuAttitudesSentenceOpinionConverter:
8
-
9
- @staticmethod
10
- def to_brat_relation(sentence_opinion, end_to_doc_id_func):
11
- """ Converts opinion into brat-related relation.
12
- NOTE: for rel_type we just call str() over int-based value.
13
- """
14
- assert(isinstance(sentence_opinion, SentenceOpinion))
15
- return BratRelation(id_in_doc="0",
16
- source_id=end_to_doc_id_func(sentence_opinion.SourceID),
17
- target_id=end_to_doc_id_func(sentence_opinion.TargetID),
18
- rel_type=str(sentence_opinion.Label))
19
-
20
- @staticmethod
21
- def to_opinion(sentence_opinion, source_value, target_value, label_scaler):
22
- """
23
- Converts onto document, non referenced opinion
24
- (non bounded to the text).
25
- """
26
- assert(isinstance(sentence_opinion, SentenceOpinion))
27
- assert(isinstance(label_scaler, BaseLabelScaler))
28
-
29
- opinion = Opinion(source_value=source_value,
30
- target_value=target_value,
31
- sentiment=label_scaler.int_to_label(sentence_opinion.Label))
32
-
33
- # Using this tag allows to perform a revert operation,
34
- # i.e. to find opinion_ref by opinion.
35
- opinion.set_tag(sentence_opinion.Tag)
36
-
37
- return opinion
@@ -1,268 +0,0 @@
1
- from arekit.common.utils import split_by_whitespaces
2
- from arekit.contrib.source.ruattitudes.news import RuAttitudesNews
3
- from arekit.contrib.source.ruattitudes.opinions.base import SentenceOpinion
4
- from arekit.contrib.source.ruattitudes.sentence import RuAttitudesSentence
5
- from arekit.contrib.source.ruattitudes.text_object import TextObject
6
-
7
-
8
- class RuAttitudesFormatReader(object):
9
-
10
- DOC_SEP_KEY = '--------'
11
- FILE_KEY = "File:"
12
- OBJ_KEY = "Object:"
13
- TITLE_KEY = "Title:"
14
- SINDEX_KEY = "Sentence:"
15
- OPINION_KEY = "Attitude:"
16
- STEXT_KEY = "Text:"
17
- TERMS_IN_TITLE = "TermsInTitle:"
18
- TERMS_IN_TEXT = "TermsInText:"
19
- FRAMEVAR_TITLE = "FrameVariant:"
20
-
21
- AUTH_LABEL = '<AUTH>'
22
-
23
- def __iter__(self):
24
- pass
25
-
26
- # region private methods
27
-
28
- @staticmethod
29
- def iter_news_inds(input_file, get_news_index_func):
30
- assert(callable(get_news_index_func))
31
-
32
- title = None
33
- local_news_ind = 0
34
- has_sentences = False
35
-
36
- for line in RuAttitudesFormatReader.__iter_lines(input_file):
37
-
38
- if RuAttitudesFormatReader.__check_is_title(line):
39
- # We use a placeholder, there is no need in actual value out there.
40
- title = "title"
41
- has_sentences = True
42
-
43
- if RuAttitudesFormatReader.__check_is_news_sep(line=line, title=title):
44
- yield RuAttitudesFormatReader.__assign_news_index(news_index_func=get_news_index_func,
45
- local_index=local_news_ind)
46
- local_news_ind += 1
47
- title = None
48
-
49
- if has_sentences:
50
- yield RuAttitudesFormatReader.__assign_news_index(news_index_func=get_news_index_func,
51
- local_index=local_news_ind)
52
-
53
- @staticmethod
54
- def iter_news(input_file, get_news_index_func):
55
- assert(callable(get_news_index_func))
56
-
57
- reset = False
58
- title = None
59
- title_terms_count = None
60
- text_terms_count = None
61
- sentences = []
62
- opinions_list = []
63
- objects_list = []
64
- s_index = 0
65
- objects_in_prior_sentences_count = 0
66
- local_news_ind = 0
67
-
68
- for line in RuAttitudesFormatReader.__iter_lines(input_file):
69
-
70
- if RuAttitudesFormatReader.FILE_KEY in line:
71
- pass
72
-
73
- if RuAttitudesFormatReader.OBJ_KEY in line:
74
- object = RuAttitudesFormatReader.__parse_object(line)
75
- objects_list.append(object)
76
-
77
- if RuAttitudesFormatReader.OPINION_KEY in line:
78
- sentence_opin = RuAttitudesFormatReader.__parse_sentence_opin(line)
79
- opinions_list.append(sentence_opin)
80
-
81
- if RuAttitudesFormatReader.FRAMEVAR_TITLE in line:
82
- # TODO. This information is ommited now.
83
- pass
84
-
85
- if RuAttitudesFormatReader.TERMS_IN_TITLE in line:
86
- title_terms_count = RuAttitudesFormatReader.__parse_terms_in_title_count(line)
87
-
88
- if RuAttitudesFormatReader.SINDEX_KEY in line:
89
- s_index = RuAttitudesFormatReader.__parse_sentence_index(line)
90
-
91
- if RuAttitudesFormatReader.__check_is_title(line):
92
- title = RuAttitudesSentence(is_title=True,
93
- text=RuAttitudesFormatReader.__parse_sentence(line, True),
94
- sentence_opins=opinions_list,
95
- objects_list=objects_list,
96
- sentence_index=-1)
97
- sentences.append(title)
98
- t_len = RuAttitudesFormatReader.__calculate_terms_in_line(line)
99
- assert(title_terms_count == t_len or title_terms_count is None)
100
- reset = True
101
-
102
- if RuAttitudesFormatReader.STEXT_KEY in line and line.index(RuAttitudesFormatReader.STEXT_KEY) == 0:
103
- sentence = RuAttitudesSentence(is_title=False,
104
- text=RuAttitudesFormatReader.__parse_sentence(line, False),
105
- sentence_opins=opinions_list,
106
- objects_list=objects_list,
107
- sentence_index=s_index)
108
- sentences.append(sentence)
109
- objects_in_prior_sentences_count += len(objects_list)
110
- t_len = RuAttitudesFormatReader.__calculate_terms_in_line(line)
111
- assert(text_terms_count == t_len or text_terms_count is None)
112
- reset = True
113
-
114
- if RuAttitudesFormatReader.__check_is_news_sep(line=line, title=title):
115
- news_index = RuAttitudesFormatReader.__assign_news_index(news_index_func=get_news_index_func,
116
- local_index=local_news_ind)
117
- yield RuAttitudesNews(sentences=sentences,
118
- news_index=news_index)
119
- local_news_ind += 1
120
- sentences = []
121
- reset = True
122
-
123
- if RuAttitudesFormatReader.TERMS_IN_TEXT in line:
124
- text_terms_count = RuAttitudesFormatReader.__parse_terms_in_text_count(line)
125
-
126
- if reset:
127
- opinions_list = []
128
- objects_list = []
129
- title_terms_count = None
130
- reset = False
131
-
132
- if len(sentences) > 0:
133
- news_index = RuAttitudesFormatReader.__assign_news_index(news_index_func=get_news_index_func,
134
- local_index=local_news_ind)
135
- yield RuAttitudesNews(sentences=sentences,
136
- news_index=news_index)
137
- sentences = []
138
-
139
- assert(len(sentences) == 0)
140
-
141
- @staticmethod
142
- def __assign_news_index(news_index_func, local_index):
143
- assert(callable(news_index_func))
144
- return news_index_func(local_index)
145
-
146
- @staticmethod
147
- def __check_is_news_sep(line, title):
148
- return RuAttitudesFormatReader.DOC_SEP_KEY in line and title is not None
149
-
150
- @staticmethod
151
- def __check_is_title(line):
152
- return RuAttitudesFormatReader.TITLE_KEY in line
153
-
154
- @staticmethod
155
- def __iter_lines(input_file):
156
- for line in input_file.readlines():
157
- yield line.decode('utf-8')
158
-
159
- @staticmethod
160
- def __calculate_terms_in_line(line):
161
- assert(isinstance(line, str))
162
- return len(split_by_whitespaces(line))
163
-
164
- @staticmethod
165
- def __parse_sentence(line, is_title):
166
- assert(isinstance(is_title, bool))
167
-
168
- key = RuAttitudesFormatReader.STEXT_KEY if not is_title else RuAttitudesFormatReader.TITLE_KEY
169
- text = line[len(key):]
170
- return text.strip()
171
-
172
- @staticmethod
173
- def __parse_sentence_opin(line):
174
- line = line[len(RuAttitudesFormatReader.OPINION_KEY):]
175
-
176
- s_from = line.index('b:(')
177
- s_to = line.index(')', s_from)
178
- label = int(line[s_from + 3:s_to])
179
-
180
- o_from = line.index('oi:[')
181
- o_to = line.index(']', o_from)
182
- source_object_id_in_sentence, target_object_id_in_sentence = line[o_from + 4:o_to].split(',')
183
-
184
- source_object_id_in_sentence = int(source_object_id_in_sentence)
185
- target_object_id_in_sentence = int(target_object_id_in_sentence)
186
-
187
- s_from = line.index('si:{')
188
- s_to = line.index('}', s_from)
189
- opninion_key = line[s_from+4:s_to]
190
-
191
- sentence_opin = SentenceOpinion(source_id=source_object_id_in_sentence,
192
- target_id=target_object_id_in_sentence,
193
- label_int=label,
194
- tag=opninion_key)
195
-
196
- return sentence_opin
197
-
198
- @staticmethod
199
- def __parse_object(line):
200
- assert(isinstance(line, str))
201
-
202
- line = line[len(RuAttitudesFormatReader.OBJ_KEY):]
203
-
204
- obj_ind_begin = line.index('oi:[', 0)
205
- obj_ind_end = line.index(']', obj_ind_begin + 1)
206
-
207
- o_begin = line.index("'", 0)
208
- o_end = line.index("'", o_begin + 1)
209
-
210
- b_from = line.index('b:(')
211
- b_to = line.index(')', b_from)
212
-
213
- id_in_sentence = int(line[obj_ind_begin + 4:obj_ind_end])
214
- term_index, length = line[b_from+3:b_to].split(',')
215
- value = line[o_begin + 1:o_end]
216
-
217
- obj_type = RuAttitudesFormatReader.__try_get_type(line)
218
-
219
- sg_from = line.index('si:{')
220
- sg_to = line.index('}', sg_from)
221
- group_index = int(line[sg_from+4:sg_to])
222
-
223
- is_auth = '<AUTH>' in line
224
-
225
- text_object = TextObject(id_in_sentence=id_in_sentence,
226
- value=value,
227
- obj_type=obj_type,
228
- position=int(term_index),
229
- terms_count=int(length),
230
- syn_group_index=group_index,
231
- is_auth=is_auth)
232
-
233
- return text_object
234
-
235
- @staticmethod
236
- def __parse_terms_in_title_count(line):
237
- line = line[len(RuAttitudesFormatReader.TERMS_IN_TITLE):]
238
- return int(line)
239
-
240
- @staticmethod
241
- def __parse_terms_in_text_count(line):
242
- line = line[len(RuAttitudesFormatReader.TERMS_IN_TEXT):]
243
- return int(line)
244
-
245
- @staticmethod
246
- def __parse_sentence_index(line):
247
- line = line[len(RuAttitudesFormatReader.SINDEX_KEY):]
248
- return int(line)
249
-
250
- @staticmethod
251
- def __try_get_type(line):
252
-
253
- # Tag, utilized in RuAttitudes-2.0 format.
254
- template = 'type:'
255
- if template in line:
256
- is_auth = RuAttitudesFormatReader.AUTH_LABEL in line
257
- t_from = line.index(template)
258
- t_to = line.index(RuAttitudesFormatReader.AUTH_LABEL[0], t_from) if is_auth else len(line)
259
- return line[t_from + len(template):t_to].strip()
260
-
261
- # Tag, utilized in RuAttitudes-1.* format.
262
- template = 't:['
263
- if template in line:
264
- t_from = line.index(template)
265
- t_to = line.index(']', t_from)
266
- return line[t_from + len(template):t_to].strip()
267
-
268
- # endregion
@@ -1,73 +0,0 @@
1
- from arekit.common.news.sentence import BaseNewsSentence
2
- from arekit.contrib.source.ruattitudes.opinions.base import SentenceOpinion
3
-
4
-
5
- class RuAttitudesSentence(BaseNewsSentence):
6
-
7
- def __init__(self, is_title, text, sentence_opins, objects_list, sentence_index):
8
- assert(isinstance(is_title, bool))
9
- assert(isinstance(sentence_opins, list))
10
- assert(isinstance(objects_list, list))
11
- assert(isinstance(sentence_index, int))
12
- super(RuAttitudesSentence, self).__init__(text)
13
-
14
- self.__is_title = is_title
15
- self.__sentence_opins = sentence_opins
16
- self.__objects = objects_list
17
- self.__sentence_index = sentence_index
18
- self.__owner = None
19
-
20
- # region properties
21
-
22
- @property
23
- def SentenceIndex(self):
24
- return self.__sentence_index
25
-
26
- @property
27
- def IsTitle(self):
28
- return self.__is_title
29
-
30
- @property
31
- def Owner(self):
32
- return self.__owner
33
-
34
- @property
35
- def ObjectsCount(self):
36
- return len(self.__objects)
37
-
38
- # endregion
39
-
40
- # region public methods
41
-
42
- def set_owner(self, owner):
43
- if self.__owner is not None:
44
- raise Exception("Owner is already declared")
45
- self.__owner = owner
46
-
47
- def get_objects(self, sentence_opin):
48
- assert(isinstance(sentence_opin, SentenceOpinion))
49
- source_obj = self.__objects[sentence_opin.SourceID]
50
- target_obj = self.__objects[sentence_opin.TargetID]
51
- return source_obj, target_obj
52
-
53
- def get_doc_level_text_object_id(self, text_object_ind):
54
- return text_object_ind + self.__owner.get_objects_declared_before(self.SentenceIndex)
55
-
56
- def iter_objects(self):
57
- for object in self.__objects:
58
- yield object
59
-
60
- def find_sentence_opin_by_key(self, key):
61
- assert(key is not None)
62
-
63
- for opinion in self.__sentence_opins:
64
- if opinion.Tag == key:
65
- return opinion
66
-
67
- return None
68
-
69
- def iter_sentence_opins(self):
70
- for opinion in self.__sentence_opins:
71
- yield opinion
72
-
73
- # endregion
@@ -1,17 +0,0 @@
1
- from arekit.contrib.source.ruattitudes.io_utils import RuAttitudesIOUtils
2
- from arekit.contrib.source.synonyms.utils import iter_synonym_groups
3
-
4
-
5
- class RuAttitudesSynonymsCollectionHelper(object):
6
-
7
- @staticmethod
8
- def iter_groups(version):
9
- it = RuAttitudesIOUtils.iter_from_zip(
10
- inner_path=RuAttitudesIOUtils.get_synonyms_innerpath(),
11
- process_func=lambda input_file: iter_synonym_groups(
12
- input_file,
13
- desc="Loading RuAttitudes SynonymsCollection"),
14
- version=version)
15
-
16
- for group in it:
17
- yield group
@@ -1,57 +0,0 @@
1
- from arekit.common.bound import Bound
2
- from arekit.contrib.source.brat.entities.entity import BratEntity
3
-
4
-
5
- class TextObject(object):
6
- """
7
- Considering any part of text, labeled by 'position', and 'type'
8
- The latter is used to emphasize the entity type.
9
- """
10
-
11
- def __init__(self, id_in_sentence, value, obj_type, position, terms_count, syn_group_index, is_auth):
12
- assert(isinstance(id_in_sentence, int))
13
- assert(isinstance(value, str))
14
- assert(isinstance(position, int))
15
- assert(isinstance(terms_count, int) and terms_count > 0)
16
- assert(isinstance(obj_type, str) or obj_type is None)
17
- assert(isinstance(syn_group_index, int))
18
- assert(isinstance(is_auth, bool))
19
- self.__value = value
20
- self.__type = obj_type
21
- self.__id_in_sentence = id_in_sentence
22
- self.__syn_group_index = syn_group_index
23
- self.__is_auth = is_auth
24
- self.__bound = Bound(pos=position, length=terms_count)
25
-
26
- def to_entity(self, to_doc_id_func):
27
- assert(callable(to_doc_id_func))
28
- return BratEntity(id_in_doc=to_doc_id_func(self.__id_in_sentence),
29
- value=self.__value if len(self.__value) > 0 else '[empty]',
30
- e_type=self.__type,
31
- index_begin=self.__bound.Position,
32
- index_end=self.__bound.Position + self.__bound.Length,
33
- group_index=self.__syn_group_index)
34
-
35
- # region properties
36
-
37
- @property
38
- def Value(self):
39
- return self.__value
40
-
41
- @property
42
- def Type(self):
43
- return self.__type
44
-
45
- @property
46
- def IdInSentence(self):
47
- return self.__id_in_sentence
48
-
49
- @property
50
- def Bound(self):
51
- return self.__bound
52
-
53
- @property
54
- def IsAuthorized(self):
55
- return self.__is_auth
56
-
57
- # endregion
File without changes