arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. arekit/common/data/storages/base.py +4 -15
  2. arekit/common/docs/entities_grouping.py +2 -1
  3. arekit/common/docs/parser.py +27 -22
  4. arekit/common/pipeline/base.py +12 -16
  5. arekit/common/pipeline/batching.py +28 -0
  6. arekit/common/pipeline/context.py +5 -1
  7. arekit/common/pipeline/items/base.py +39 -2
  8. arekit/common/pipeline/items/flatten.py +5 -1
  9. arekit/common/pipeline/items/handle.py +2 -1
  10. arekit/common/pipeline/items/iter.py +2 -1
  11. arekit/common/pipeline/items/map.py +2 -1
  12. arekit/common/pipeline/items/map_nested.py +4 -0
  13. arekit/common/pipeline/utils.py +32 -0
  14. arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
  15. arekit/common/utils.py +11 -52
  16. arekit/contrib/utils/data/contents/opinions.py +13 -3
  17. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  18. arekit/contrib/utils/data/storages/pandas_based.py +2 -17
  19. arekit/contrib/utils/data/storages/row_cache.py +8 -2
  20. arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
  21. arekit/contrib/utils/io_utils/utils.py +1 -18
  22. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  23. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  24. arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
  25. arekit-0.25.1.data/data/logo.png +0 -0
  26. arekit-0.25.1.dist-info/METADATA +81 -0
  27. arekit-0.25.1.dist-info/RECORD +186 -0
  28. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
  29. arekit/common/data/input/repositories/__init__.py +0 -0
  30. arekit/common/data/input/repositories/base.py +0 -68
  31. arekit/common/data/input/repositories/sample.py +0 -22
  32. arekit/common/data/views/__init__.py +0 -0
  33. arekit/common/data/views/samples.py +0 -26
  34. arekit/common/docs/objects_parser.py +0 -37
  35. arekit/common/text/parser.py +0 -12
  36. arekit/common/text/partitioning/__init__.py +0 -0
  37. arekit/common/text/partitioning/base.py +0 -4
  38. arekit/common/text/partitioning/terms.py +0 -35
  39. arekit/contrib/networks/__init__.py +0 -0
  40. arekit/contrib/networks/embedding.py +0 -149
  41. arekit/contrib/networks/embedding_io.py +0 -18
  42. arekit/contrib/networks/input/__init__.py +0 -0
  43. arekit/contrib/networks/input/const.py +0 -6
  44. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  45. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  46. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  47. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  48. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  49. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  50. arekit/contrib/networks/input/providers/__init__.py +0 -0
  51. arekit/contrib/networks/input/providers/sample.py +0 -129
  52. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  53. arekit/contrib/networks/input/providers/text.py +0 -24
  54. arekit/contrib/networks/input/rows_parser.py +0 -47
  55. arekit/contrib/networks/input/term_types.py +0 -13
  56. arekit/contrib/networks/input/terms_mapping.py +0 -60
  57. arekit/contrib/networks/vectorizer.py +0 -6
  58. arekit/contrib/source/__init__.py +0 -0
  59. arekit/contrib/source/brat/__init__.py +0 -0
  60. arekit/contrib/source/brat/annot.py +0 -84
  61. arekit/contrib/source/brat/doc.py +0 -28
  62. arekit/contrib/source/brat/entities/__init__.py +0 -0
  63. arekit/contrib/source/brat/entities/compound.py +0 -13
  64. arekit/contrib/source/brat/entities/entity.py +0 -42
  65. arekit/contrib/source/brat/entities/parser.py +0 -53
  66. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  67. arekit/contrib/source/brat/opinions/converter.py +0 -19
  68. arekit/contrib/source/brat/relation.py +0 -32
  69. arekit/contrib/source/brat/sentence.py +0 -69
  70. arekit/contrib/source/brat/sentences_reader.py +0 -128
  71. arekit/contrib/source/download.py +0 -41
  72. arekit/contrib/source/nerel/__init__.py +0 -0
  73. arekit/contrib/source/nerel/entities.py +0 -55
  74. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  75. arekit/contrib/source/nerel/folding/fixed.py +0 -74
  76. arekit/contrib/source/nerel/io_utils.py +0 -62
  77. arekit/contrib/source/nerel/labels.py +0 -241
  78. arekit/contrib/source/nerel/reader.py +0 -46
  79. arekit/contrib/source/nerel/utils.py +0 -24
  80. arekit/contrib/source/nerel/versions.py +0 -12
  81. arekit/contrib/source/nerelbio/__init__.py +0 -0
  82. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  83. arekit/contrib/source/nerelbio/labels.py +0 -265
  84. arekit/contrib/source/nerelbio/reader.py +0 -8
  85. arekit/contrib/source/nerelbio/versions.py +0 -8
  86. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  87. arekit/contrib/source/ruattitudes/collection.py +0 -36
  88. arekit/contrib/source/ruattitudes/doc.py +0 -51
  89. arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
  90. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  91. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  92. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  93. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  94. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  95. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  96. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  97. arekit/contrib/source/ruattitudes/reader.py +0 -268
  98. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  99. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  100. arekit/contrib/source/ruattitudes/text_object.py +0 -59
  101. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  102. arekit/contrib/source/rusentiframes/collection.py +0 -157
  103. arekit/contrib/source/rusentiframes/effect.py +0 -24
  104. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  105. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  106. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  107. arekit/contrib/source/rusentiframes/role.py +0 -15
  108. arekit/contrib/source/rusentiframes/state.py +0 -24
  109. arekit/contrib/source/rusentiframes/types.py +0 -42
  110. arekit/contrib/source/rusentiframes/value.py +0 -2
  111. arekit/contrib/source/rusentrel/__init__.py +0 -0
  112. arekit/contrib/source/rusentrel/const.py +0 -3
  113. arekit/contrib/source/rusentrel/docs_reader.py +0 -51
  114. arekit/contrib/source/rusentrel/entities.py +0 -26
  115. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  116. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  117. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  118. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  119. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  120. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  121. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  122. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  123. arekit/contrib/source/sentinerel/__init__.py +0 -0
  124. arekit/contrib/source/sentinerel/entities.py +0 -52
  125. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  126. arekit/contrib/source/sentinerel/folding/factory.py +0 -31
  127. arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
  128. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  129. arekit/contrib/source/sentinerel/labels.py +0 -53
  130. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  131. arekit/contrib/source/sentinerel/reader.py +0 -42
  132. arekit/contrib/source/synonyms/__init__.py +0 -0
  133. arekit/contrib/source/synonyms/utils.py +0 -19
  134. arekit/contrib/source/zip_utils.py +0 -47
  135. arekit/contrib/utils/connotations/__init__.py +0 -0
  136. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  137. arekit/contrib/utils/data/readers/__init__.py +0 -0
  138. arekit/contrib/utils/data/readers/base.py +0 -7
  139. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  140. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  141. arekit/contrib/utils/data/service/__init__.py +0 -0
  142. arekit/contrib/utils/data/service/balance.py +0 -50
  143. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  144. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  145. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  146. arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
  147. arekit/contrib/utils/download.py +0 -77
  148. arekit/contrib/utils/embeddings/__init__.py +0 -0
  149. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  150. arekit/contrib/utils/embeddings/tokens.py +0 -30
  151. arekit/contrib/utils/io_utils/embedding.py +0 -72
  152. arekit/contrib/utils/io_utils/opinions.py +0 -37
  153. arekit/contrib/utils/io_utils/samples.py +0 -79
  154. arekit/contrib/utils/lexicons/__init__.py +0 -0
  155. arekit/contrib/utils/lexicons/lexicon.py +0 -41
  156. arekit/contrib/utils/lexicons/relation.py +0 -42
  157. arekit/contrib/utils/lexicons/rusentilex.py +0 -37
  158. arekit/contrib/utils/nn/__init__.py +0 -0
  159. arekit/contrib/utils/nn/rows.py +0 -83
  160. arekit/contrib/utils/np_utils/__init__.py +0 -0
  161. arekit/contrib/utils/np_utils/embedding.py +0 -22
  162. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  163. arekit/contrib/utils/np_utils/vocab.py +0 -20
  164. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  165. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
  166. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
  167. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  168. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
  169. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  170. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
  171. arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
  172. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  173. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  174. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
  175. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
  176. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  177. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  178. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
  179. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
  180. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  181. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  182. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
  183. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
  184. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
  185. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  186. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
  187. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
  188. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  189. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
  190. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  191. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
  192. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  193. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  194. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  195. arekit/contrib/utils/processing/languages/mods.py +0 -12
  196. arekit/contrib/utils/processing/languages/pos.py +0 -23
  197. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  198. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  199. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  200. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  201. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  202. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  203. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  204. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  205. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  206. arekit/contrib/utils/processing/pos/base.py +0 -12
  207. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  208. arekit/contrib/utils/processing/pos/russian.py +0 -10
  209. arekit/contrib/utils/processing/text/__init__.py +0 -0
  210. arekit/contrib/utils/processing/text/tokens.py +0 -127
  211. arekit/contrib/utils/resources.py +0 -25
  212. arekit/contrib/utils/serializer.py +0 -43
  213. arekit/contrib/utils/sources/__init__.py +0 -0
  214. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  215. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  216. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  217. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  218. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  219. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  220. arekit/download_data.py +0 -11
  221. arekit-0.24.0.dist-info/METADATA +0 -23
  222. arekit-0.24.0.dist-info/RECORD +0 -374
  223. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
  224. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
@@ -1,40 +0,0 @@
1
- from arekit.common.labels.str_fmt import StringLabelsFormatter
2
- from arekit.common.opinions.base import Opinion
3
-
4
-
5
- class OpinionConverter(object):
6
- """ Opinion type <-> string Converter.
7
- """
8
-
9
- @staticmethod
10
- def try_from_string(line, labels_formatter):
11
- assert(isinstance(line, str))
12
-
13
- args = line.strip().split(',')
14
- assert (len(args) >= 3)
15
-
16
- source_value = args[0].strip()
17
- target_value = args[1].strip()
18
- str_label = args[2].strip()
19
-
20
- if not labels_formatter.supports_value(str_label):
21
- return None
22
-
23
- return Opinion(source_value=source_value,
24
- target_value=target_value,
25
- label=labels_formatter.str_to_label(str_label))
26
-
27
- @staticmethod
28
- def try_to_string(opinion, labels_formatter):
29
- assert(isinstance(opinion, Opinion))
30
- assert(isinstance(labels_formatter, StringLabelsFormatter))
31
-
32
- label = opinion.Label
33
-
34
- if not labels_formatter.supports_label(label):
35
- return None
36
-
37
- return "{}, {}, {}, current".format(
38
- opinion.SourceValue,
39
- opinion.TargetValue,
40
- labels_formatter.label_to_str(opinion.Label))
@@ -1,54 +0,0 @@
1
- from arekit.common.opinions.provider import OpinionCollectionsProvider
2
- from arekit.common.labels.str_fmt import StringLabelsFormatter
3
- from arekit.contrib.source.rusentrel.opinions.converter import OpinionConverter
4
-
5
-
6
- class RuSentRelOpinionCollectionProvider(OpinionCollectionsProvider):
7
-
8
- @staticmethod
9
- def _iter_opinions_from_file(input_file, labels_formatter, error_on_non_supported):
10
- assert(isinstance(labels_formatter, StringLabelsFormatter))
11
- assert(isinstance(error_on_non_supported, bool))
12
-
13
- for line in input_file.readlines():
14
-
15
- # Force perform decoding if needed.
16
- if isinstance(line, bytes):
17
- line = line.decode()
18
-
19
- if line == '\n':
20
- continue
21
-
22
- str_opinion = OpinionConverter.try_from_string(
23
- line=line,
24
- labels_formatter=labels_formatter)
25
-
26
- if str_opinion is None:
27
- if error_on_non_supported:
28
- raise Exception("Line '{line}' has non supported label")
29
- else:
30
- continue
31
-
32
- yield str_opinion
33
-
34
- # region public methods
35
-
36
- def iter_opinions(self, source, encoding, labels_formatter, error_on_non_supported=True):
37
- """
38
- Important: For externally saved collections (using save_to_file method) and related usage
39
- """
40
- assert(isinstance(source, str))
41
- assert(isinstance(labels_formatter, StringLabelsFormatter))
42
- assert(isinstance(error_on_non_supported, bool))
43
-
44
- with open(source, 'r', encoding=encoding) as input_file:
45
-
46
- it = RuSentRelOpinionCollectionProvider._iter_opinions_from_file(
47
- input_file=input_file,
48
- labels_formatter=labels_formatter,
49
- error_on_non_supported=error_on_non_supported)
50
-
51
- for opinion in it:
52
- yield opinion
53
-
54
- # endregion
@@ -1,42 +0,0 @@
1
- import io
2
-
3
- from arekit.common.labels.str_fmt import StringLabelsFormatter
4
- from arekit.common.opinions.base import Opinion
5
- from arekit.common.opinions.collection import OpinionCollection
6
- from arekit.common.opinions.writer import OpinionCollectionWriter
7
- from arekit.common.utils import create_dir_if_not_exists
8
- from arekit.contrib.source.rusentrel.opinions.converter import OpinionConverter
9
-
10
-
11
- class RuSentRelOpinionCollectionWriter(OpinionCollectionWriter):
12
-
13
- def serialize(self, collection, target, encoding, labels_formatter, error_on_non_supported=True):
14
- assert(isinstance(collection, OpinionCollection))
15
- assert(isinstance(target, str))
16
- assert(isinstance(labels_formatter, StringLabelsFormatter))
17
- assert(isinstance(error_on_non_supported, bool))
18
-
19
- def __opinion_key(opinion):
20
- assert (isinstance(opinion, Opinion))
21
- return opinion.SourceValue + opinion.TargetValue
22
-
23
- sorted_ops = sorted(collection, key=__opinion_key)
24
-
25
- create_dir_if_not_exists(target)
26
-
27
- with io.open(target, 'w', encoding=encoding) as f:
28
- for o in sorted_ops:
29
-
30
- str_value = OpinionConverter.try_to_string(
31
- opinion=o,
32
- labels_formatter=labels_formatter)
33
-
34
- if str_value is None:
35
- if error_on_non_supported:
36
- raise Exception("Opinion label `{label}` is not supported by formatter".format(
37
- label=o.Label))
38
- else:
39
- continue
40
-
41
- f.write(str_value)
42
- f.write('\n')
@@ -1,17 +0,0 @@
1
- from arekit.contrib.source.synonyms.utils import iter_synonym_groups
2
- from arekit.contrib.source.rusentrel.io_utils import RuSentRelIOUtils
3
-
4
-
5
- class RuSentRelSynonymsCollectionHelper(object):
6
-
7
- @staticmethod
8
- def iter_groups(version):
9
- it = RuSentRelIOUtils.iter_from_zip(
10
- inner_path=RuSentRelIOUtils.get_synonyms_innerpath(),
11
- process_func=lambda input_file: iter_synonym_groups(
12
- input_file,
13
- desc="Loading RuSentRel Collection"),
14
- version=version)
15
-
16
- for group in it:
17
- yield group
File without changes
@@ -1,52 +0,0 @@
1
- from arekit.common.entities.collection import EntityCollection
2
- from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
3
- from arekit.contrib.source.brat.annot import BratAnnotationParser
4
- from arekit.contrib.source.brat.entities.entity import BratEntity
5
- from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils
6
- from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
7
- from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection
8
-
9
-
10
- class SentiNerelEntityCollection(EntityCollection):
11
-
12
- def __init__(self, contents, value_to_group_id_func, entities_to_ignore=None):
13
- """
14
- entities_to_ignore: list or None
15
- this parameter is required because of the simplified implmentation of
16
- the nested objects of the BRAT annotation.
17
- """
18
- assert(isinstance(contents, dict))
19
- assert(BratAnnotationParser.ENTITIES in contents)
20
- assert(isinstance(entities_to_ignore, list) or entities_to_ignore is None)
21
-
22
- self.__dicard_entities = set([] if entities_to_ignore is None else entities_to_ignore)
23
- contents[BratAnnotationParser.ENTITIES] = [e for e in contents[BratAnnotationParser.ENTITIES]
24
- if self.__keep_entity(e)]
25
-
26
- super(SentiNerelEntityCollection, self).__init__(
27
- entities=contents[BratAnnotationParser.ENTITIES],
28
- value_to_group_id_func=value_to_group_id_func)
29
-
30
- self._sort_entities(key=lambda entity: entity.IndexBegin)
31
-
32
- def __keep_entity(self, entity):
33
- assert(isinstance(entity, BratEntity))
34
- return entity.Type not in self.__dicard_entities
35
-
36
- @classmethod
37
- def read_collection(cls, filename, version, entities_to_ignore=None):
38
- assert(isinstance(filename, str))
39
-
40
- # Since this dataset does not provide the synonyms collection by default,
41
- # it is necessary to declare an empty collection to populate so in further.
42
- synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)
43
-
44
- return SentiNerelIOUtils.read_from_zip(
45
- inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
46
- process_func=lambda input_file: cls(
47
- contents=BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig'),
48
- entities_to_ignore=entities_to_ignore,
49
- value_to_group_id_func=lambda value:
50
- SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
51
- synonyms, value)),
52
- version=version)
File without changes
@@ -1,31 +0,0 @@
1
- from arekit.contrib.source.sentinerel.folding.fixed import create_fixed_folding_doc_ids
2
-
3
-
4
- class SentiNERELFoldingFactory:
5
- """ Factory of the variety types of the splits that
6
- are considered within the present experiments.
7
- """
8
-
9
- @staticmethod
10
- def create_fixed_folding(file, limit=None):
11
- """ limit: int
12
- Allows to limit amount of documents (utilized for testing reasons)
13
- """
14
-
15
- train_filenames, test_filenames = SentiNERELFoldingFactory._read_train_test(f=file)
16
- if limit is not None:
17
- train_filenames = train_filenames[:limit]
18
- test_filenames = test_filenames[:limit]
19
- filenames_by_ids, data_folding = create_fixed_folding_doc_ids(train_filenames=train_filenames,
20
- test_filenames=test_filenames)
21
-
22
- return filenames_by_ids, data_folding
23
-
24
- @staticmethod
25
- def _read_train_test(f):
26
- parts = []
27
- for line in f.readlines():
28
- if isinstance(line, bytes):
29
- line = line.decode('utf-8')
30
- parts.append(line.strip().split(','))
31
- return parts[0], parts[1]
@@ -1,70 +0,0 @@
1
- from collections import OrderedDict
2
-
3
- from arekit.common.experiment.data_type import DataType
4
-
5
-
6
- def create_fixed_folding_doc_ids(train_filenames, test_filenames):
7
- """ Create fixed data-folding based on the predefined list of filenames,
8
- written in file.
9
- """
10
- assert(isinstance(train_filenames, list))
11
- assert(isinstance(test_filenames, list))
12
-
13
- filenames_by_ids = __create_filenames_by_ids(filenames=train_filenames + test_filenames)
14
-
15
- ids_by_filenames = {}
16
- for doc_id, filename in filenames_by_ids.items():
17
- ids_by_filenames[filename] = doc_id
18
-
19
- train_doc_ids = [ids_by_filenames[filename] for filename in train_filenames]
20
- test_doc_ids = [ids_by_filenames[filename] for filename in test_filenames]
21
-
22
- return {
23
- DataType.Train: train_doc_ids,
24
- DataType.Test: test_doc_ids,
25
- DataType.Etalon: test_doc_ids,
26
- DataType.Dev: test_doc_ids
27
- }
28
-
29
-
30
- def __create_filenames_by_ids(filenames):
31
- """ Indexing filenames
32
- """
33
-
34
- def __create_new_id(default_id):
35
- new_id = default_id
36
- while new_id in filenames_by_ids:
37
- new_id += 1
38
- return new_id
39
-
40
- default_id = 0
41
-
42
- filenames_by_ids = OrderedDict()
43
- for fname in filenames:
44
-
45
- doc_id = __number_from_string(fname)
46
-
47
- if doc_id is None:
48
- doc_id = __create_new_id(default_id)
49
- default_id = doc_id
50
-
51
- filenames_by_ids[doc_id] = fname
52
-
53
- return filenames_by_ids
54
-
55
-
56
- def __number_from_string(s):
57
- assert(isinstance(s, str))
58
-
59
- digit_chars_prefix = []
60
-
61
- for chr in s:
62
- if chr.isdigit():
63
- digit_chars_prefix.append(chr)
64
- else:
65
- break
66
-
67
- if len(digit_chars_prefix) == 0:
68
- return None
69
-
70
- return int("".join(digit_chars_prefix))
@@ -1,87 +0,0 @@
1
- from enum import Enum
2
- from os import path
3
- from os.path import basename, join
4
-
5
- import enum
6
-
7
- from arekit.contrib.source.sentinerel.folding.factory import SentiNERELFoldingFactory
8
- from arekit.contrib.source.zip_utils import ZipArchiveUtils
9
-
10
-
11
- class SentiNerelVersions(Enum):
12
- """ List of the supported version of this collection
13
- """
14
-
15
- # Initial version.
16
- V1 = "v1_0"
17
- # Updated annotation within the second half of the texts. (September 2022)
18
- V2 = "v2_0"
19
- # Updated annotation within the first half of the texts. (October 2022)
20
- # Become a source of the RuSentNE-2023 competition.
21
- # https://github.com/dialogue-evaluation/RuSentNE-evaluation
22
- V21 = "v2_1"
23
-
24
-
25
- DEFAULT_VERSION = SentiNerelVersions.V21
26
-
27
-
28
- class SentiNerelIOUtils(ZipArchiveUtils):
29
-
30
- inner_root = "sentiment_dataset"
31
-
32
- @staticmethod
33
- def get_archive_filepath(version):
34
- return path.join(SentiNerelIOUtils.get_data_root(), "sentinerel-{}.zip".format(version))
35
-
36
- @staticmethod
37
- def get_annotation_innerpath(filename):
38
- assert(isinstance(filename, str))
39
- return path.join(SentiNerelIOUtils.inner_root, "{}.ann".format(filename))
40
-
41
- @staticmethod
42
- def get_doc_innerpath(filename):
43
- assert(isinstance(filename, str))
44
- return path.join(SentiNerelIOUtils.inner_root, "{}.txt".format(filename))
45
-
46
- @staticmethod
47
- def __iter_filenames_from_dataset(folder_name, version):
48
- assert(isinstance(version, enum.Enum))
49
- assert(isinstance(folder_name, str))
50
-
51
- for filename in SentiNerelIOUtils.iter_filenames_from_zip(version):
52
-
53
- extension = filename[-4:]
54
-
55
- # Crop extension.
56
- filename = filename[:-4]
57
-
58
- if extension != ".txt":
59
- continue
60
-
61
- if not folder_name in filename:
62
- continue
63
-
64
- yield basename(filename)
65
-
66
- # region public methods
67
-
68
- @staticmethod
69
- def iter_collection_filenames(version=DEFAULT_VERSION):
70
- filenames_it = SentiNerelIOUtils.__iter_filenames_from_dataset(
71
- folder_name=SentiNerelIOUtils.inner_root, version=version)
72
-
73
- for doc_id, filename in enumerate(filenames_it):
74
- yield doc_id, filename
75
-
76
- @staticmethod
77
- def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
78
- """ Provides a fixed split of the dataset onto
79
- `test` and `training` part:
80
- https://github.com/nicolay-r/SentiNEREL-attitude-extraction
81
- """
82
- return SentiNerelIOUtils.read_from_zip(
83
- inner_path=join(SentiNerelIOUtils.inner_root, "split_fixed.txt"),
84
- process_func=lambda f: SentiNERELFoldingFactory.create_fixed_folding(file=f, limit=docs_limit),
85
- version=version)
86
-
87
- # endregion
@@ -1,53 +0,0 @@
1
- from arekit.common.labels.base import Label
2
-
3
-
4
- class OpinionBelongsTo(Label):
5
- pass
6
-
7
-
8
- class OpinionRelatesTo(Label):
9
- pass
10
-
11
-
12
- class NegEffectFrom(Label):
13
- pass
14
-
15
-
16
- class NegStateFrom(Label):
17
- pass
18
-
19
-
20
- class PosEffectFrom(Label):
21
- pass
22
-
23
-
24
- class PosAuthorFrom(Label):
25
- pass
26
-
27
-
28
- class NegAuthorFrom(Label):
29
- pass
30
-
31
-
32
- class PosStateFrom(Label):
33
- pass
34
-
35
-
36
- class NegativeTo(Label):
37
- pass
38
-
39
-
40
- class PositiveTo(Label):
41
- pass
42
-
43
-
44
- class AlternativeName(Label):
45
- pass
46
-
47
-
48
- class StateBelongsTo(Label):
49
- pass
50
-
51
-
52
- class OriginsFrom(Label):
53
- pass
@@ -1,30 +0,0 @@
1
- from collections import OrderedDict
2
-
3
- from arekit.common.labels.scaler.base import BaseLabelScaler
4
- from arekit.contrib.source.sentinerel import labels
5
-
6
-
7
- class SentiNerelLabelScaler(BaseLabelScaler):
8
- """ This is a complete label scaler of all the labels supported by NEREL dataset.
9
- """
10
-
11
- def __init__(self):
12
-
13
- self.__uint_to_label_dict = OrderedDict([
14
- (labels.OpinionBelongsTo(), 0),
15
- (labels.OpinionRelatesTo(), 1),
16
- (labels.NegEffectFrom(), 2),
17
- (labels.PosEffectFrom(), 3),
18
- (labels.NegStateFrom(), 4),
19
- (labels.PosStateFrom(), 5),
20
- (labels.NegativeTo(), 6),
21
- (labels.PositiveTo(), 7),
22
- (labels.StateBelongsTo(), 8),
23
- (labels.PosAuthorFrom(), 9),
24
- (labels.NegAuthorFrom(), 10),
25
- (labels.AlternativeName(), 11),
26
- (labels.OriginsFrom(), 12)
27
- ])
28
-
29
- super(SentiNerelLabelScaler, self).__init__(int_dict=self.__uint_to_label_dict,
30
- uint_dict=self.__uint_to_label_dict)
@@ -1,42 +0,0 @@
1
- from arekit.contrib.source.brat.annot import BratAnnotationParser
2
- from arekit.contrib.source.brat.doc import BratDocument
3
- from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
4
- from arekit.contrib.source.sentinerel.entities import SentiNerelEntityCollection
5
- from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils, DEFAULT_VERSION
6
-
7
-
8
- class SentiNerelDocReader(object):
9
-
10
- @staticmethod
11
- def read_text_relations(filename, version):
12
- assert(isinstance(filename, str))
13
-
14
- return SentiNerelIOUtils.read_from_zip(
15
- inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
16
- process_func=lambda input_file: [
17
- relation for relation in BratAnnotationParser.parse_annotations(
18
- input_file=input_file, encoding='utf-8-sig')["relations"]],
19
- version=version)
20
-
21
- @staticmethod
22
- def read_document(filename, doc_id, version=DEFAULT_VERSION, entities_to_ignore=None):
23
- assert(isinstance(filename, str))
24
- assert(isinstance(doc_id, int))
25
-
26
- def file_to_doc(input_file):
27
- sentences = BratDocumentSentencesReader.from_file(input_file=input_file, entities=entities)
28
- return BratDocument(doc_id=doc_id, sentences=sentences, text_relations=text_relations)
29
-
30
- # TODO. #398 issue -- in some cases entities might be nested. Therefore we limit the set
31
- # TODO. of the potential named entities.
32
- eti = ["EFFECT_NEG", "EFFECT_POS", "ARGUMENT_NEG", "ARGUMENT_POS", "EVENT"] \
33
- if entities_to_ignore is None else entities_to_ignore
34
-
35
- entities = SentiNerelEntityCollection.read_collection(
36
- filename=filename, version=version, entities_to_ignore=eti)
37
- text_relations = SentiNerelDocReader.read_text_relations(filename=filename, version=version)
38
-
39
- return SentiNerelIOUtils.read_from_zip(
40
- inner_path=SentiNerelIOUtils.get_doc_innerpath(filename=filename),
41
- process_func=file_to_doc,
42
- version=version)
File without changes
@@ -1,19 +0,0 @@
1
- from arekit.common.utils import progress_bar_defined
2
-
3
-
4
- def iter_synonym_groups(input_file, sep=",", desc=""):
5
- """ All the synonyms groups organized in lines, separated by `sep`
6
- """
7
- lines = input_file.readlines()
8
-
9
- lines_it = progress_bar_defined(lines,
10
- total=len(lines),
11
- desc=desc,
12
- unit="opins")
13
-
14
- for line in lines_it:
15
-
16
- if isinstance(line, bytes):
17
- line = line.decode()
18
-
19
- yield line.split(sep)
@@ -1,47 +0,0 @@
1
- import zipfile
2
-
3
- import enum
4
-
5
- from arekit.common import utils
6
-
7
-
8
- class ZipArchiveUtils(object):
9
-
10
- @staticmethod
11
- def get_archive_filepath(version):
12
- raise NotImplementedError()
13
-
14
- @classmethod
15
- def read_from_zip(cls, inner_path, process_func, version):
16
- """
17
- process_func:
18
- func which receives a file reader
19
- """
20
- assert(isinstance(inner_path, str))
21
- assert(callable(process_func))
22
- assert(isinstance(version, enum.Enum))
23
-
24
- with zipfile.ZipFile(cls.get_archive_filepath(version.value), "r") as zip_ref:
25
- with zip_ref.open(inner_path, mode='r') as c_file:
26
- return process_func(c_file)
27
-
28
- @classmethod
29
- def iter_from_zip(cls, inner_path, process_func, version):
30
- assert(isinstance(inner_path, str))
31
- assert(callable(process_func))
32
- assert(isinstance(version, enum.Enum))
33
-
34
- with zipfile.ZipFile(cls.get_archive_filepath(version.value), "r") as zip_ref:
35
- with zip_ref.open(inner_path, mode='r') as c_file:
36
- for result in process_func(c_file):
37
- yield result
38
-
39
- @classmethod
40
- def iter_filenames_from_zip(cls, version):
41
- assert(isinstance(version, enum.Enum))
42
- with zipfile.ZipFile(cls.get_archive_filepath(version.value), "r") as zip_ref:
43
- return iter(zip_ref.namelist())
44
-
45
- @staticmethod
46
- def get_data_root():
47
- return utils.get_default_download_dir()
File without changes
@@ -1,23 +0,0 @@
1
- from arekit.common.frames.connotations.provider import FrameConnotationProvider
2
- from arekit.contrib.source.rusentiframes.collection import RuSentiFramesCollection
3
-
4
-
5
- class RuSentiFramesConnotationProvider(FrameConnotationProvider):
6
- """ This is a provider based on A0->A1 label type of RuSentiFrames collection.
7
- For a greater details, checkout the related collection at:
8
- https://github.com/nicolay-r/RuSentiFrames
9
-
10
- Papers:
11
- [1] Natalia Loukachevitch, Nicolay Rusnachenko: Sentiment Frames
12
- for Attitude Extraction in Russian, 2020
13
- [2] Distant Supervision for Sentiment Attitude Extraction, 2019
14
- """
15
-
16
- def __init__(self, collection):
17
- assert(isinstance(collection, RuSentiFramesCollection))
18
- self.__collection = collection
19
-
20
- def try_provide(self, frame_id):
21
- return self.__collection.try_get_frame_polarity(frame_id=frame_id,
22
- role_src='a0',
23
- role_dest='a1')
File without changes
@@ -1,7 +0,0 @@
1
- class BaseReader(object):
2
-
3
- def extension(self):
4
- raise NotImplementedError()
5
-
6
- def read(self, target):
7
- raise NotImplementedError()
@@ -1,38 +0,0 @@
1
- import importlib
2
-
3
- from arekit.contrib.utils.data.readers.base import BaseReader
4
- from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
5
-
6
-
7
- class PandasCsvReader(BaseReader):
8
- """ Represents a CSV-based reader, implmented via pandas API.
9
- """
10
-
11
- def __init__(self, sep='\t', header='infer', compression='infer', encoding='utf-8', col_types=None,
12
- custom_extension=None):
13
- self.__sep = sep
14
- self.__compression = compression
15
- self.__encoding = encoding
16
- self.__header = header
17
- self.__custom_extension = custom_extension
18
-
19
- # Special assignation of types for certain columns.
20
- self.__col_types = col_types
21
- if self.__col_types is None:
22
- self.__col_types = dict()
23
-
24
- def extension(self):
25
- return ".tsv.gz" if self.__custom_extension is None else self.__custom_extension
26
-
27
- def __from_csv(self, filepath):
28
- pd = importlib.import_module("pandas")
29
- return pd.read_csv(filepath,
30
- sep=self.__sep,
31
- encoding=self.__encoding,
32
- compression=self.__compression,
33
- dtype=self.__col_types,
34
- header=self.__header)
35
-
36
- def read(self, target):
37
- df = self.__from_csv(filepath=target)
38
- return PandasBasedRowsStorage(df)