arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. arekit/common/data/storages/base.py +4 -15
  2. arekit/common/docs/entities_grouping.py +2 -1
  3. arekit/common/docs/parser.py +27 -22
  4. arekit/common/pipeline/base.py +12 -16
  5. arekit/common/pipeline/batching.py +28 -0
  6. arekit/common/pipeline/context.py +5 -1
  7. arekit/common/pipeline/items/base.py +39 -2
  8. arekit/common/pipeline/items/flatten.py +5 -1
  9. arekit/common/pipeline/items/handle.py +2 -1
  10. arekit/common/pipeline/items/iter.py +2 -1
  11. arekit/common/pipeline/items/map.py +2 -1
  12. arekit/common/pipeline/items/map_nested.py +4 -0
  13. arekit/common/pipeline/utils.py +32 -0
  14. arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
  15. arekit/common/utils.py +11 -52
  16. arekit/contrib/utils/data/contents/opinions.py +13 -3
  17. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  18. arekit/contrib/utils/data/storages/pandas_based.py +2 -17
  19. arekit/contrib/utils/data/storages/row_cache.py +8 -2
  20. arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
  21. arekit/contrib/utils/io_utils/utils.py +1 -18
  22. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  23. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  24. arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
  25. arekit-0.25.1.data/data/logo.png +0 -0
  26. arekit-0.25.1.dist-info/METADATA +81 -0
  27. arekit-0.25.1.dist-info/RECORD +186 -0
  28. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
  29. arekit/common/data/input/repositories/__init__.py +0 -0
  30. arekit/common/data/input/repositories/base.py +0 -68
  31. arekit/common/data/input/repositories/sample.py +0 -22
  32. arekit/common/data/views/__init__.py +0 -0
  33. arekit/common/data/views/samples.py +0 -26
  34. arekit/common/docs/objects_parser.py +0 -37
  35. arekit/common/text/parser.py +0 -12
  36. arekit/common/text/partitioning/__init__.py +0 -0
  37. arekit/common/text/partitioning/base.py +0 -4
  38. arekit/common/text/partitioning/terms.py +0 -35
  39. arekit/contrib/networks/__init__.py +0 -0
  40. arekit/contrib/networks/embedding.py +0 -149
  41. arekit/contrib/networks/embedding_io.py +0 -18
  42. arekit/contrib/networks/input/__init__.py +0 -0
  43. arekit/contrib/networks/input/const.py +0 -6
  44. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  45. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  46. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  47. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  48. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  49. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  50. arekit/contrib/networks/input/providers/__init__.py +0 -0
  51. arekit/contrib/networks/input/providers/sample.py +0 -129
  52. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  53. arekit/contrib/networks/input/providers/text.py +0 -24
  54. arekit/contrib/networks/input/rows_parser.py +0 -47
  55. arekit/contrib/networks/input/term_types.py +0 -13
  56. arekit/contrib/networks/input/terms_mapping.py +0 -60
  57. arekit/contrib/networks/vectorizer.py +0 -6
  58. arekit/contrib/source/__init__.py +0 -0
  59. arekit/contrib/source/brat/__init__.py +0 -0
  60. arekit/contrib/source/brat/annot.py +0 -84
  61. arekit/contrib/source/brat/doc.py +0 -28
  62. arekit/contrib/source/brat/entities/__init__.py +0 -0
  63. arekit/contrib/source/brat/entities/compound.py +0 -13
  64. arekit/contrib/source/brat/entities/entity.py +0 -42
  65. arekit/contrib/source/brat/entities/parser.py +0 -53
  66. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  67. arekit/contrib/source/brat/opinions/converter.py +0 -19
  68. arekit/contrib/source/brat/relation.py +0 -32
  69. arekit/contrib/source/brat/sentence.py +0 -69
  70. arekit/contrib/source/brat/sentences_reader.py +0 -128
  71. arekit/contrib/source/download.py +0 -41
  72. arekit/contrib/source/nerel/__init__.py +0 -0
  73. arekit/contrib/source/nerel/entities.py +0 -55
  74. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  75. arekit/contrib/source/nerel/folding/fixed.py +0 -74
  76. arekit/contrib/source/nerel/io_utils.py +0 -62
  77. arekit/contrib/source/nerel/labels.py +0 -241
  78. arekit/contrib/source/nerel/reader.py +0 -46
  79. arekit/contrib/source/nerel/utils.py +0 -24
  80. arekit/contrib/source/nerel/versions.py +0 -12
  81. arekit/contrib/source/nerelbio/__init__.py +0 -0
  82. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  83. arekit/contrib/source/nerelbio/labels.py +0 -265
  84. arekit/contrib/source/nerelbio/reader.py +0 -8
  85. arekit/contrib/source/nerelbio/versions.py +0 -8
  86. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  87. arekit/contrib/source/ruattitudes/collection.py +0 -36
  88. arekit/contrib/source/ruattitudes/doc.py +0 -51
  89. arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
  90. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  91. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  92. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  93. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  94. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  95. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  96. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  97. arekit/contrib/source/ruattitudes/reader.py +0 -268
  98. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  99. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  100. arekit/contrib/source/ruattitudes/text_object.py +0 -59
  101. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  102. arekit/contrib/source/rusentiframes/collection.py +0 -157
  103. arekit/contrib/source/rusentiframes/effect.py +0 -24
  104. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  105. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  106. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  107. arekit/contrib/source/rusentiframes/role.py +0 -15
  108. arekit/contrib/source/rusentiframes/state.py +0 -24
  109. arekit/contrib/source/rusentiframes/types.py +0 -42
  110. arekit/contrib/source/rusentiframes/value.py +0 -2
  111. arekit/contrib/source/rusentrel/__init__.py +0 -0
  112. arekit/contrib/source/rusentrel/const.py +0 -3
  113. arekit/contrib/source/rusentrel/docs_reader.py +0 -51
  114. arekit/contrib/source/rusentrel/entities.py +0 -26
  115. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  116. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  117. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  118. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  119. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  120. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  121. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  122. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  123. arekit/contrib/source/sentinerel/__init__.py +0 -0
  124. arekit/contrib/source/sentinerel/entities.py +0 -52
  125. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  126. arekit/contrib/source/sentinerel/folding/factory.py +0 -31
  127. arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
  128. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  129. arekit/contrib/source/sentinerel/labels.py +0 -53
  130. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  131. arekit/contrib/source/sentinerel/reader.py +0 -42
  132. arekit/contrib/source/synonyms/__init__.py +0 -0
  133. arekit/contrib/source/synonyms/utils.py +0 -19
  134. arekit/contrib/source/zip_utils.py +0 -47
  135. arekit/contrib/utils/connotations/__init__.py +0 -0
  136. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  137. arekit/contrib/utils/data/readers/__init__.py +0 -0
  138. arekit/contrib/utils/data/readers/base.py +0 -7
  139. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  140. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  141. arekit/contrib/utils/data/service/__init__.py +0 -0
  142. arekit/contrib/utils/data/service/balance.py +0 -50
  143. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  144. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  145. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  146. arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
  147. arekit/contrib/utils/download.py +0 -77
  148. arekit/contrib/utils/embeddings/__init__.py +0 -0
  149. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  150. arekit/contrib/utils/embeddings/tokens.py +0 -30
  151. arekit/contrib/utils/io_utils/embedding.py +0 -72
  152. arekit/contrib/utils/io_utils/opinions.py +0 -37
  153. arekit/contrib/utils/io_utils/samples.py +0 -79
  154. arekit/contrib/utils/lexicons/__init__.py +0 -0
  155. arekit/contrib/utils/lexicons/lexicon.py +0 -41
  156. arekit/contrib/utils/lexicons/relation.py +0 -42
  157. arekit/contrib/utils/lexicons/rusentilex.py +0 -37
  158. arekit/contrib/utils/nn/__init__.py +0 -0
  159. arekit/contrib/utils/nn/rows.py +0 -83
  160. arekit/contrib/utils/np_utils/__init__.py +0 -0
  161. arekit/contrib/utils/np_utils/embedding.py +0 -22
  162. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  163. arekit/contrib/utils/np_utils/vocab.py +0 -20
  164. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  165. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
  166. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
  167. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  168. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
  169. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  170. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
  171. arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
  172. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  173. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  174. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
  175. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
  176. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  177. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  178. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
  179. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
  180. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  181. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  182. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
  183. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
  184. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
  185. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  186. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
  187. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
  188. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  189. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
  190. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  191. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
  192. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  193. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  194. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  195. arekit/contrib/utils/processing/languages/mods.py +0 -12
  196. arekit/contrib/utils/processing/languages/pos.py +0 -23
  197. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  198. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  199. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  200. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  201. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  202. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  203. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  204. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  205. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  206. arekit/contrib/utils/processing/pos/base.py +0 -12
  207. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  208. arekit/contrib/utils/processing/pos/russian.py +0 -10
  209. arekit/contrib/utils/processing/text/__init__.py +0 -0
  210. arekit/contrib/utils/processing/text/tokens.py +0 -127
  211. arekit/contrib/utils/resources.py +0 -25
  212. arekit/contrib/utils/serializer.py +0 -43
  213. arekit/contrib/utils/sources/__init__.py +0 -0
  214. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  215. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  216. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  217. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  218. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  219. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  220. arekit/download_data.py +0 -11
  221. arekit-0.24.0.dist-info/METADATA +0 -23
  222. arekit-0.24.0.dist-info/RECORD +0 -374
  223. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
  224. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
@@ -1,8 +0,0 @@
1
- import enum
2
-
3
-
4
- class NerelBioVersions(enum.Enum):
5
- V1 = "v1_0"
6
-
7
-
8
- DEFAULT_VERSION = NerelBioVersions.V1
File without changes
@@ -1,36 +0,0 @@
1
- from arekit.contrib.source.ruattitudes.io_utils import RuAttitudesVersions, RuAttitudesIOUtils
2
- from arekit.contrib.source.ruattitudes.reader import RuAttitudesFormatReader
3
-
4
-
5
- class RuAttitudesCollection(object):
6
-
7
- @staticmethod
8
- def __get_reading_handler(input_file, read_inds_only, get_doc_inds_func):
9
- assert(isinstance(read_inds_only, bool))
10
-
11
- if read_inds_only:
12
- return RuAttitudesFormatReader.iter_docs_inds(input_file=input_file,
13
- get_doc_index_func=get_doc_inds_func)
14
- else:
15
- return RuAttitudesFormatReader.iter_docs(input_file=input_file,
16
- get_doc_index_func=get_doc_inds_func)
17
-
18
- @staticmethod
19
- def iter_docs(version, get_doc_index_func, return_inds_only):
20
- """
21
- RuAttitudes collection reader from zip archive
22
- """
23
- assert(isinstance(version, RuAttitudesVersions))
24
- assert(callable(get_doc_index_func))
25
- assert(isinstance(return_inds_only, bool))
26
-
27
- it = RuAttitudesIOUtils.iter_from_zip(
28
- inner_path=RuAttitudesIOUtils.get_collection_filepath(),
29
- process_func=lambda input_filepath: RuAttitudesCollection.__get_reading_handler(
30
- input_file=input_filepath,
31
- read_inds_only=return_inds_only,
32
- get_doc_inds_func=get_doc_index_func),
33
- version=version)
34
-
35
- for doc in it:
36
- yield doc
@@ -1,51 +0,0 @@
1
- from arekit.contrib.source.ruattitudes.sentence import RuAttitudesSentence
2
-
3
-
4
- class RuAttitudesDocument(object):
5
-
6
- def __init__(self, sentences, doc_index):
7
- assert(len(sentences) > 0)
8
-
9
- self.__sentences = sentences
10
- self.__objects_before_sentence = self.__cache_objects_declared_before()
11
- self.__doc_index = doc_index
12
-
13
- self.__set_owners()
14
-
15
- # region properties
16
-
17
- @property
18
- def ID(self):
19
- return self.__doc_index
20
-
21
- @property
22
- def Title(self):
23
- return self.__sentences[0]
24
-
25
- # endregion
26
-
27
- # region private methods
28
-
29
- def __set_owners(self):
30
- for sentence in self.__sentences:
31
- assert(isinstance(sentence, RuAttitudesSentence))
32
- sentence.set_owner(self)
33
-
34
- def __cache_objects_declared_before(self):
35
- d = {}
36
- before = 0
37
- for s in self.__sentences:
38
- assert(isinstance(s, RuAttitudesSentence))
39
- d[s.SentenceIndex] = before
40
- before += s.ObjectsCount
41
-
42
- return d
43
-
44
- # endregion
45
-
46
- def get_objects_declared_before(self, sentence_index):
47
- return self.__objects_before_sentence[sentence_index]
48
-
49
- def iter_sentences(self):
50
- for sentence in self.__sentences:
51
- yield sentence
@@ -1,44 +0,0 @@
1
- from arekit.contrib.source.brat.doc import BratDocument
2
- from arekit.contrib.source.brat.sentence import BratSentence
3
- from arekit.contrib.source.ruattitudes.doc import RuAttitudesDocument
4
- from arekit.contrib.source.ruattitudes.opinions.base import SentenceOpinion
5
- from arekit.contrib.source.ruattitudes.opinions.converter import RuAttitudesSentenceOpinionConverter
6
- from arekit.contrib.source.ruattitudes.sentence import RuAttitudesSentence
7
- from arekit.common.utils import split_by_whitespaces
8
-
9
-
10
- class RuAttitudesDocumentsConverter(object):
11
- """ Performs conversion to a brat-based representation.
12
- The latter allows then allows to adopt pipelines for TextOpnion extraction.
13
- """
14
-
15
- @staticmethod
16
- def to_brat_doc(doc):
17
- assert(isinstance(doc, RuAttitudesDocument))
18
- text_opinions = RuAttitudesDocumentsConverter.__iter_text_opinions(doc=doc)
19
- brat_sentences = RuAttitudesDocumentsConverter.__to_brat_sentences(doc.iter_sentences())
20
- return BratDocument(doc_id=doc.ID,
21
- sentences=brat_sentences,
22
- text_relations=list(text_opinions))
23
-
24
- @staticmethod
25
- def __to_brat_sentences(sentences_iter):
26
- sentences = []
27
- for s in sentences_iter:
28
- assert(isinstance(s, RuAttitudesSentence))
29
- assert(s.Owner is not None)
30
- brat_entities = [obj.to_entity(s.get_doc_level_text_object_id) for obj in s.iter_objects()]
31
- brat_sentence = BratSentence(text=split_by_whitespaces(s.Text), index_begin=0, entities=brat_entities)
32
- sentences.append(brat_sentence)
33
- return sentences
34
-
35
- @staticmethod
36
- def __iter_text_opinions(doc):
37
- assert(isinstance(doc, RuAttitudesDocument))
38
- for sentence in doc.iter_sentences():
39
- assert(isinstance(sentence, RuAttitudesSentence))
40
- for sentence_opinion in sentence.iter_sentence_opins():
41
- assert(isinstance(sentence_opinion, SentenceOpinion))
42
- yield RuAttitudesSentenceOpinionConverter.to_brat_relation(
43
- sentence_opinion=sentence_opinion,
44
- end_to_doc_id_func=sentence.get_doc_level_text_object_id)
File without changes
@@ -1,7 +0,0 @@
1
- from arekit.contrib.source.brat.entities.parser import BratTextEntitiesParser
2
-
3
-
4
- class RuAttitudesTextEntitiesParser(BratTextEntitiesParser):
5
-
6
- def __init__(self):
7
- super(RuAttitudesTextEntitiesParser, self).__init__(partitioning="terms")
@@ -1,56 +0,0 @@
1
- from os import path
2
-
3
- from enum import Enum
4
-
5
- from arekit.contrib.source.zip_utils import ZipArchiveUtils
6
-
7
-
8
- class RuAttitudesVersions(Enum):
9
- Debug = "dbg"
10
- V10 = "v1_0"
11
- V11 = "v1_1"
12
- V20Base = 'v2_0_base'
13
- V20Large = 'v2_0_large'
14
- V20BaseNeut = 'v2_0_base_neut'
15
- V20LargeNeut = 'v2_0_large_neut'
16
-
17
-
18
- class RuAttitudesVersionsService:
19
-
20
- @staticmethod
21
- def __iter_type_and_names():
22
- for version_type in RuAttitudesVersions:
23
- yield version_type, version_type.value
24
-
25
- @staticmethod
26
- def find_by_name(name):
27
- for version_type, related_name in RuAttitudesVersionsService.__iter_type_and_names():
28
- if name == related_name:
29
- return version_type
30
- raise Exception("Version `{}` does not supported".format(name))
31
-
32
- @staticmethod
33
- def iter_supported_names():
34
- for _, name in RuAttitudesVersionsService.__iter_type_and_names():
35
- yield name
36
-
37
-
38
- class RuAttitudesIOUtils(ZipArchiveUtils):
39
-
40
- # region internal methods
41
-
42
- @staticmethod
43
- def get_archive_filepath(version):
44
- assert(isinstance(version, str))
45
- return path.join(RuAttitudesIOUtils.get_data_root(),
46
- "ruattitudes-{version}.zip".format(version=version))
47
-
48
- @staticmethod
49
- def get_collection_filepath():
50
- return "collection.txt"
51
-
52
- @classmethod
53
- def get_synonyms_innerpath(cls):
54
- return "synonyms.txt"
55
-
56
- # endregion
@@ -1,12 +0,0 @@
1
- from arekit.common.labels.scaler.base import BaseLabelScaler
2
- from arekit.common.labels.str_fmt import StringLabelsFormatter
3
-
4
-
5
- class RuAttitudesLabelFormatter(StringLabelsFormatter):
6
-
7
- def __init__(self, label_scaler):
8
- assert(isinstance(label_scaler, BaseLabelScaler))
9
- stol = {}
10
- for int_label in [-1, 0, 1]:
11
- stol[str(int_label)] = type(label_scaler.int_to_label(int_label))
12
- super(RuAttitudesLabelFormatter, self).__init__(stol=stol)
File without changes
@@ -1,28 +0,0 @@
1
- class SentenceOpinion(object):
2
- """ Provides an opinion within a sentence.
3
- Specific for RuAttitudes collection, as the latter provides
4
- connections within a sentence.
5
- """
6
-
7
- def __init__(self, source_id, target_id, label_int, tag):
8
- assert(isinstance(label_int, int))
9
- self.__label_int = label_int
10
- self.__source_id = source_id
11
- self.__target_id = target_id
12
- self.__tag = tag
13
-
14
- @property
15
- def SourceID(self):
16
- return self.__source_id
17
-
18
- @property
19
- def TargetID(self):
20
- return self.__target_id
21
-
22
- @property
23
- def Label(self):
24
- return self.__label_int
25
-
26
- @property
27
- def Tag(self):
28
- return self.__tag
@@ -1,37 +0,0 @@
1
- from arekit.common.labels.scaler.base import BaseLabelScaler
2
- from arekit.common.opinions.base import Opinion
3
- from arekit.contrib.source.brat.relation import BratRelation
4
- from arekit.contrib.source.ruattitudes.opinions.base import SentenceOpinion
5
-
6
-
7
- class RuAttitudesSentenceOpinionConverter:
8
-
9
- @staticmethod
10
- def to_brat_relation(sentence_opinion, end_to_doc_id_func):
11
- """ Converts opinion into brat-related relation.
12
- NOTE: for rel_type we just call str() over int-based value.
13
- """
14
- assert(isinstance(sentence_opinion, SentenceOpinion))
15
- return BratRelation(id_in_doc="0",
16
- source_id=end_to_doc_id_func(sentence_opinion.SourceID),
17
- target_id=end_to_doc_id_func(sentence_opinion.TargetID),
18
- rel_type=str(sentence_opinion.Label))
19
-
20
- @staticmethod
21
- def to_opinion(sentence_opinion, source_value, target_value, label_scaler):
22
- """
23
- Converts onto document, non referenced opinion
24
- (non bounded to the text).
25
- """
26
- assert(isinstance(sentence_opinion, SentenceOpinion))
27
- assert(isinstance(label_scaler, BaseLabelScaler))
28
-
29
- opinion = Opinion(source_value=source_value,
30
- target_value=target_value,
31
- label=label_scaler.int_to_label(sentence_opinion.Label))
32
-
33
- # Using this tag allows to perform a revert operation,
34
- # i.e. to find opinion_ref by opinion.
35
- opinion.set_tag(sentence_opinion.Tag)
36
-
37
- return opinion
@@ -1,268 +0,0 @@
1
- from arekit.common.utils import split_by_whitespaces
2
- from arekit.contrib.source.ruattitudes.doc import RuAttitudesDocument
3
- from arekit.contrib.source.ruattitudes.opinions.base import SentenceOpinion
4
- from arekit.contrib.source.ruattitudes.sentence import RuAttitudesSentence
5
- from arekit.contrib.source.ruattitudes.text_object import TextObject
6
-
7
-
8
- class RuAttitudesFormatReader(object):
9
-
10
- DOC_SEP_KEY = '--------'
11
- FILE_KEY = "File:"
12
- OBJ_KEY = "Object:"
13
- TITLE_KEY = "Title:"
14
- SINDEX_KEY = "Sentence:"
15
- OPINION_KEY = "Attitude:"
16
- STEXT_KEY = "Text:"
17
- TERMS_IN_TITLE = "TermsInTitle:"
18
- TERMS_IN_TEXT = "TermsInText:"
19
- FRAMEVAR_TITLE = "FrameVariant:"
20
-
21
- AUTH_LABEL = '<AUTH>'
22
-
23
- def __iter__(self):
24
- pass
25
-
26
- # region private methods
27
-
28
- @staticmethod
29
- def iter_docs_inds(input_file, get_doc_index_func):
30
- assert(callable(get_doc_index_func))
31
-
32
- title = None
33
- local_doc_ind = 0
34
- has_sentences = False
35
-
36
- for line in RuAttitudesFormatReader.__iter_lines(input_file):
37
-
38
- if RuAttitudesFormatReader.__check_is_title(line):
39
- # We use a placeholder, there is no need in actual value out there.
40
- title = "title"
41
- has_sentences = True
42
-
43
- if RuAttitudesFormatReader.__check_is_doc_sep(line=line, title=title):
44
- yield RuAttitudesFormatReader.__assign_doc_index(doc_index_func=get_doc_index_func,
45
- local_index=local_doc_ind)
46
- local_doc_ind += 1
47
- title = None
48
-
49
- if has_sentences:
50
- yield RuAttitudesFormatReader.__assign_doc_index(doc_index_func=get_doc_index_func,
51
- local_index=local_doc_ind)
52
-
53
- @staticmethod
54
- def iter_docs(input_file, get_doc_index_func):
55
- assert(callable(get_doc_index_func))
56
-
57
- reset = False
58
- title = None
59
- title_terms_count = None
60
- text_terms_count = None
61
- sentences = []
62
- opinions_list = []
63
- objects_list = []
64
- s_index = 0
65
- objects_in_prior_sentences_count = 0
66
- local_doc_ind = 0
67
-
68
- for line in RuAttitudesFormatReader.__iter_lines(input_file):
69
-
70
- if RuAttitudesFormatReader.FILE_KEY in line:
71
- pass
72
-
73
- if RuAttitudesFormatReader.OBJ_KEY in line:
74
- object = RuAttitudesFormatReader.__parse_object(line)
75
- objects_list.append(object)
76
-
77
- if RuAttitudesFormatReader.OPINION_KEY in line:
78
- sentence_opin = RuAttitudesFormatReader.__parse_sentence_opin(line)
79
- opinions_list.append(sentence_opin)
80
-
81
- if RuAttitudesFormatReader.FRAMEVAR_TITLE in line:
82
- # TODO. This information is ommited now.
83
- pass
84
-
85
- if RuAttitudesFormatReader.TERMS_IN_TITLE in line:
86
- title_terms_count = RuAttitudesFormatReader.__parse_terms_in_title_count(line)
87
-
88
- if RuAttitudesFormatReader.SINDEX_KEY in line:
89
- s_index = RuAttitudesFormatReader.__parse_sentence_index(line)
90
-
91
- if RuAttitudesFormatReader.__check_is_title(line):
92
- title = RuAttitudesSentence(is_title=True,
93
- text=RuAttitudesFormatReader.__parse_sentence(line, True),
94
- sentence_opins=opinions_list,
95
- objects_list=objects_list,
96
- sentence_index=-1)
97
- sentences.append(title)
98
- t_len = RuAttitudesFormatReader.__calculate_terms_in_line(line)
99
- assert(title_terms_count == t_len or title_terms_count is None)
100
- reset = True
101
-
102
- if RuAttitudesFormatReader.STEXT_KEY in line and line.index(RuAttitudesFormatReader.STEXT_KEY) == 0:
103
- sentence = RuAttitudesSentence(is_title=False,
104
- text=RuAttitudesFormatReader.__parse_sentence(line, False),
105
- sentence_opins=opinions_list,
106
- objects_list=objects_list,
107
- sentence_index=s_index)
108
- sentences.append(sentence)
109
- objects_in_prior_sentences_count += len(objects_list)
110
- t_len = RuAttitudesFormatReader.__calculate_terms_in_line(line)
111
- assert(text_terms_count == t_len or text_terms_count is None)
112
- reset = True
113
-
114
- if RuAttitudesFormatReader.__check_is_doc_sep(line=line, title=title):
115
- doc_index = RuAttitudesFormatReader.__assign_doc_index(doc_index_func=get_doc_index_func,
116
- local_index=local_doc_ind)
117
- yield RuAttitudesDocument(sentences=sentences,
118
- doc_index=doc_index)
119
- local_doc_ind += 1
120
- sentences = []
121
- reset = True
122
-
123
- if RuAttitudesFormatReader.TERMS_IN_TEXT in line:
124
- text_terms_count = RuAttitudesFormatReader.__parse_terms_in_text_count(line)
125
-
126
- if reset:
127
- opinions_list = []
128
- objects_list = []
129
- title_terms_count = None
130
- reset = False
131
-
132
- if len(sentences) > 0:
133
- doc_index = RuAttitudesFormatReader.__assign_doc_index(doc_index_func=get_doc_index_func,
134
- local_index=local_doc_ind)
135
- yield RuAttitudesDocument(sentences=sentences,
136
- doc_index=doc_index)
137
- sentences = []
138
-
139
- assert(len(sentences) == 0)
140
-
141
- @staticmethod
142
- def __assign_doc_index(doc_index_func, local_index):
143
- assert(callable(doc_index_func))
144
- return doc_index_func(local_index)
145
-
146
- @staticmethod
147
- def __check_is_doc_sep(line, title):
148
- return RuAttitudesFormatReader.DOC_SEP_KEY in line and title is not None
149
-
150
- @staticmethod
151
- def __check_is_title(line):
152
- return RuAttitudesFormatReader.TITLE_KEY in line
153
-
154
- @staticmethod
155
- def __iter_lines(input_file):
156
- for line in input_file.readlines():
157
- yield line.decode('utf-8')
158
-
159
- @staticmethod
160
- def __calculate_terms_in_line(line):
161
- assert(isinstance(line, str))
162
- return len(split_by_whitespaces(line))
163
-
164
- @staticmethod
165
- def __parse_sentence(line, is_title):
166
- assert(isinstance(is_title, bool))
167
-
168
- key = RuAttitudesFormatReader.STEXT_KEY if not is_title else RuAttitudesFormatReader.TITLE_KEY
169
- text = line[len(key):]
170
- return text.strip()
171
-
172
- @staticmethod
173
- def __parse_sentence_opin(line):
174
- line = line[len(RuAttitudesFormatReader.OPINION_KEY):]
175
-
176
- s_from = line.index('b:(')
177
- s_to = line.index(')', s_from)
178
- label = int(line[s_from + 3:s_to])
179
-
180
- o_from = line.index('oi:[')
181
- o_to = line.index(']', o_from)
182
- source_object_id_in_sentence, target_object_id_in_sentence = line[o_from + 4:o_to].split(',')
183
-
184
- source_object_id_in_sentence = int(source_object_id_in_sentence)
185
- target_object_id_in_sentence = int(target_object_id_in_sentence)
186
-
187
- s_from = line.index('si:{')
188
- s_to = line.index('}', s_from)
189
- opninion_key = line[s_from+4:s_to]
190
-
191
- sentence_opin = SentenceOpinion(source_id=source_object_id_in_sentence,
192
- target_id=target_object_id_in_sentence,
193
- label_int=label,
194
- tag=opninion_key)
195
-
196
- return sentence_opin
197
-
198
- @staticmethod
199
- def __parse_object(line):
200
- assert(isinstance(line, str))
201
-
202
- line = line[len(RuAttitudesFormatReader.OBJ_KEY):]
203
-
204
- obj_ind_begin = line.index('oi:[', 0)
205
- obj_ind_end = line.index(']', obj_ind_begin + 1)
206
-
207
- o_begin = line.index("'", 0)
208
- o_end = line.index("'", o_begin + 1)
209
-
210
- b_from = line.index('b:(')
211
- b_to = line.index(')', b_from)
212
-
213
- id_in_sentence = int(line[obj_ind_begin + 4:obj_ind_end])
214
- term_index, length = line[b_from+3:b_to].split(',')
215
- value = line[o_begin + 1:o_end]
216
-
217
- obj_type = RuAttitudesFormatReader.__try_get_type(line)
218
-
219
- sg_from = line.index('si:{')
220
- sg_to = line.index('}', sg_from)
221
- group_index = int(line[sg_from+4:sg_to])
222
-
223
- is_auth = '<AUTH>' in line
224
-
225
- text_object = TextObject(id_in_sentence=id_in_sentence,
226
- value=value,
227
- obj_type=obj_type,
228
- position=int(term_index),
229
- terms_count=int(length),
230
- syn_group_index=group_index,
231
- is_auth=is_auth)
232
-
233
- return text_object
234
-
235
- @staticmethod
236
- def __parse_terms_in_title_count(line):
237
- line = line[len(RuAttitudesFormatReader.TERMS_IN_TITLE):]
238
- return int(line)
239
-
240
- @staticmethod
241
- def __parse_terms_in_text_count(line):
242
- line = line[len(RuAttitudesFormatReader.TERMS_IN_TEXT):]
243
- return int(line)
244
-
245
- @staticmethod
246
- def __parse_sentence_index(line):
247
- line = line[len(RuAttitudesFormatReader.SINDEX_KEY):]
248
- return int(line)
249
-
250
- @staticmethod
251
- def __try_get_type(line):
252
-
253
- # Tag, utilized in RuAttitudes-2.0 format.
254
- template = 'type:'
255
- if template in line:
256
- is_auth = RuAttitudesFormatReader.AUTH_LABEL in line
257
- t_from = line.index(template)
258
- t_to = line.index(RuAttitudesFormatReader.AUTH_LABEL[0], t_from) if is_auth else len(line)
259
- return line[t_from + len(template):t_to].strip()
260
-
261
- # Tag, utilized in RuAttitudes-1.* format.
262
- template = 't:['
263
- if template in line:
264
- t_from = line.index(template)
265
- t_to = line.index(']', t_from)
266
- return line[t_from + len(template):t_to].strip()
267
-
268
- # endregion
@@ -1,73 +0,0 @@
1
- from arekit.common.docs.sentence import BaseDocumentSentence
2
- from arekit.contrib.source.ruattitudes.opinions.base import SentenceOpinion
3
-
4
-
5
- class RuAttitudesSentence(BaseDocumentSentence):
6
-
7
- def __init__(self, is_title, text, sentence_opins, objects_list, sentence_index):
8
- assert(isinstance(is_title, bool))
9
- assert(isinstance(sentence_opins, list))
10
- assert(isinstance(objects_list, list))
11
- assert(isinstance(sentence_index, int))
12
- super(RuAttitudesSentence, self).__init__(text)
13
-
14
- self.__is_title = is_title
15
- self.__sentence_opins = sentence_opins
16
- self.__objects = objects_list
17
- self.__sentence_index = sentence_index
18
- self.__owner = None
19
-
20
- # region properties
21
-
22
- @property
23
- def SentenceIndex(self):
24
- return self.__sentence_index
25
-
26
- @property
27
- def IsTitle(self):
28
- return self.__is_title
29
-
30
- @property
31
- def Owner(self):
32
- return self.__owner
33
-
34
- @property
35
- def ObjectsCount(self):
36
- return len(self.__objects)
37
-
38
- # endregion
39
-
40
- # region public methods
41
-
42
- def set_owner(self, owner):
43
- if self.__owner is not None:
44
- raise Exception("Owner is already declared")
45
- self.__owner = owner
46
-
47
- def get_objects(self, sentence_opin):
48
- assert(isinstance(sentence_opin, SentenceOpinion))
49
- source_obj = self.__objects[sentence_opin.SourceID]
50
- target_obj = self.__objects[sentence_opin.TargetID]
51
- return source_obj, target_obj
52
-
53
- def get_doc_level_text_object_id(self, text_object_ind):
54
- return text_object_ind + self.__owner.get_objects_declared_before(self.SentenceIndex)
55
-
56
- def iter_objects(self):
57
- for object in self.__objects:
58
- yield object
59
-
60
- def find_sentence_opin_by_key(self, key):
61
- assert(key is not None)
62
-
63
- for opinion in self.__sentence_opins:
64
- if opinion.Tag == key:
65
- return opinion
66
-
67
- return None
68
-
69
- def iter_sentence_opins(self):
70
- for opinion in self.__sentence_opins:
71
- yield opinion
72
-
73
- # endregion
@@ -1,17 +0,0 @@
1
- from arekit.contrib.source.ruattitudes.io_utils import RuAttitudesIOUtils
2
- from arekit.contrib.source.synonyms.utils import iter_synonym_groups
3
-
4
-
5
- class RuAttitudesSynonymsCollectionHelper(object):
6
-
7
- @staticmethod
8
- def iter_groups(version):
9
- it = RuAttitudesIOUtils.iter_from_zip(
10
- inner_path=RuAttitudesIOUtils.get_synonyms_innerpath(),
11
- process_func=lambda input_file: iter_synonym_groups(
12
- input_file,
13
- desc="Loading RuAttitudes SynonymsCollection"),
14
- version=version)
15
-
16
- for group in it:
17
- yield group