arekit 0.24.0__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. arekit/common/docs/entities_grouping.py +2 -1
  2. arekit/common/docs/parser.py +52 -20
  3. arekit/common/pipeline/base.py +12 -16
  4. arekit/common/pipeline/batching.py +28 -0
  5. arekit/common/pipeline/context.py +5 -1
  6. arekit/common/pipeline/items/base.py +38 -1
  7. arekit/common/pipeline/items/flatten.py +5 -1
  8. arekit/common/pipeline/items/handle.py +2 -1
  9. arekit/common/pipeline/items/iter.py +2 -1
  10. arekit/common/pipeline/items/map.py +2 -1
  11. arekit/common/pipeline/items/map_nested.py +4 -0
  12. arekit/common/pipeline/utils.py +32 -0
  13. arekit/common/service/sqlite.py +36 -0
  14. arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
  15. arekit/common/utils.py +0 -44
  16. arekit/contrib/utils/data/contents/opinions.py +13 -3
  17. arekit/contrib/utils/data/readers/sqlite.py +14 -0
  18. arekit/contrib/utils/data/storages/row_cache.py +6 -1
  19. arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
  20. arekit/contrib/utils/data/writers/sqlite_native.py +4 -0
  21. arekit/contrib/utils/io_utils/utils.py +1 -18
  22. arekit/contrib/utils/pipelines/items/sampling/base.py +7 -12
  23. arekit/contrib/utils/pipelines/items/sampling/networks.py +3 -2
  24. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  25. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  26. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +2 -2
  27. arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
  28. arekit/contrib/utils/pipelines/items/text/tokenizer.py +2 -4
  29. arekit/contrib/utils/pipelines/items/text/translator.py +2 -1
  30. arekit/contrib/utils/pipelines/text_opinion/extraction.py +6 -9
  31. arekit/contrib/utils/serializer.py +1 -2
  32. arekit-0.25.0.data/data/logo.png +0 -0
  33. arekit-0.25.0.dist-info/METADATA +82 -0
  34. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/RECORD +38 -153
  35. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
  36. arekit/common/docs/objects_parser.py +0 -37
  37. arekit/common/text/parser.py +0 -12
  38. arekit/common/text/partitioning/base.py +0 -4
  39. arekit/common/text/partitioning/terms.py +0 -35
  40. arekit/contrib/source/__init__.py +0 -0
  41. arekit/contrib/source/brat/__init__.py +0 -0
  42. arekit/contrib/source/brat/annot.py +0 -84
  43. arekit/contrib/source/brat/doc.py +0 -28
  44. arekit/contrib/source/brat/entities/__init__.py +0 -0
  45. arekit/contrib/source/brat/entities/compound.py +0 -13
  46. arekit/contrib/source/brat/entities/entity.py +0 -42
  47. arekit/contrib/source/brat/entities/parser.py +0 -53
  48. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  49. arekit/contrib/source/brat/opinions/converter.py +0 -19
  50. arekit/contrib/source/brat/relation.py +0 -32
  51. arekit/contrib/source/brat/sentence.py +0 -69
  52. arekit/contrib/source/brat/sentences_reader.py +0 -128
  53. arekit/contrib/source/download.py +0 -41
  54. arekit/contrib/source/nerel/__init__.py +0 -0
  55. arekit/contrib/source/nerel/entities.py +0 -55
  56. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  57. arekit/contrib/source/nerel/folding/fixed.py +0 -74
  58. arekit/contrib/source/nerel/io_utils.py +0 -62
  59. arekit/contrib/source/nerel/labels.py +0 -241
  60. arekit/contrib/source/nerel/reader.py +0 -46
  61. arekit/contrib/source/nerel/utils.py +0 -24
  62. arekit/contrib/source/nerel/versions.py +0 -12
  63. arekit/contrib/source/nerelbio/__init__.py +0 -0
  64. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  65. arekit/contrib/source/nerelbio/labels.py +0 -265
  66. arekit/contrib/source/nerelbio/reader.py +0 -8
  67. arekit/contrib/source/nerelbio/versions.py +0 -8
  68. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  69. arekit/contrib/source/ruattitudes/collection.py +0 -36
  70. arekit/contrib/source/ruattitudes/doc.py +0 -51
  71. arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
  72. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  73. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  74. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  75. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  76. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  77. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  78. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  79. arekit/contrib/source/ruattitudes/reader.py +0 -268
  80. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  81. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  82. arekit/contrib/source/ruattitudes/text_object.py +0 -59
  83. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  84. arekit/contrib/source/rusentiframes/collection.py +0 -157
  85. arekit/contrib/source/rusentiframes/effect.py +0 -24
  86. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  87. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  88. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  89. arekit/contrib/source/rusentiframes/role.py +0 -15
  90. arekit/contrib/source/rusentiframes/state.py +0 -24
  91. arekit/contrib/source/rusentiframes/types.py +0 -42
  92. arekit/contrib/source/rusentiframes/value.py +0 -2
  93. arekit/contrib/source/rusentrel/__init__.py +0 -0
  94. arekit/contrib/source/rusentrel/const.py +0 -3
  95. arekit/contrib/source/rusentrel/docs_reader.py +0 -51
  96. arekit/contrib/source/rusentrel/entities.py +0 -26
  97. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  98. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  99. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  100. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  101. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  102. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  103. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  104. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  105. arekit/contrib/source/sentinerel/__init__.py +0 -0
  106. arekit/contrib/source/sentinerel/entities.py +0 -52
  107. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  108. arekit/contrib/source/sentinerel/folding/factory.py +0 -31
  109. arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
  110. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  111. arekit/contrib/source/sentinerel/labels.py +0 -53
  112. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  113. arekit/contrib/source/sentinerel/reader.py +0 -42
  114. arekit/contrib/source/synonyms/__init__.py +0 -0
  115. arekit/contrib/source/synonyms/utils.py +0 -19
  116. arekit/contrib/source/zip_utils.py +0 -47
  117. arekit/contrib/utils/connotations/__init__.py +0 -0
  118. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  119. arekit/contrib/utils/download.py +0 -77
  120. arekit/contrib/utils/io_utils/opinions.py +0 -37
  121. arekit/contrib/utils/io_utils/samples.py +0 -79
  122. arekit/contrib/utils/lexicons/__init__.py +0 -0
  123. arekit/contrib/utils/lexicons/lexicon.py +0 -41
  124. arekit/contrib/utils/lexicons/relation.py +0 -42
  125. arekit/contrib/utils/lexicons/rusentilex.py +0 -37
  126. arekit/contrib/utils/nn/__init__.py +0 -0
  127. arekit/contrib/utils/nn/rows.py +0 -83
  128. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  129. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  130. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  131. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
  132. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
  133. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  134. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  135. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
  136. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
  137. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  138. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  139. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
  140. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
  141. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
  142. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  143. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
  144. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
  145. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  146. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
  147. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  148. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
  149. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  150. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  151. arekit/contrib/utils/resources.py +0 -25
  152. arekit/contrib/utils/sources/__init__.py +0 -0
  153. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  154. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  155. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  156. arekit/download_data.py +0 -11
  157. arekit-0.24.0.dist-info/METADATA +0 -23
  158. /arekit/common/{text/partitioning → service}/__init__.py +0 -0
  159. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
  160. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
@@ -1,15 +0,0 @@
1
class FrameRole(object):
    """ Pairs a source string with a description string and exposes
        both through read-only properties.
    """

    def __init__(self, source, description):
        """ source: str
            description: str
        """
        assert isinstance(source, str)
        assert isinstance(description, str)
        self.__description = description
        self.__source = source

    @property
    def Source(self):
        """ The `source` value given to the constructor. """
        return self.__source

    @property
    def Description(self):
        """ The `description` value given to the constructor. """
        return self.__description
@@ -1,24 +0,0 @@
1
- from arekit.common.labels.base import Label
2
-
3
-
4
class FrameState(object):
    """ Holds a role name, a label and a probability value, each exposed
        through a read-only property.
    """

    def __init__(self, role, label, prob):
        """ role: str
            label: Label
            prob: float
        """
        assert isinstance(role, str)
        assert isinstance(label, Label)
        assert isinstance(prob, float)
        self.__role, self.__label, self.__prob = role, label, prob

    @property
    def Role(self):
        """ The `role` value given to the constructor. """
        return self.__role

    @property
    def Label(self):
        """ The `label` instance given to the constructor. """
        return self.__label

    @property
    def Prob(self):
        """ The `prob` value given to the constructor. """
        return self.__prob
@@ -1,42 +0,0 @@
1
- from enum import Enum
2
-
3
-
4
class RuSentiFramesVersions(Enum):
    """ Supported versions of the RuSentiFrames collection. """

    # Papers for description:
    # Distant Supervision for Sentiment Attitude Extraction (RANLP-2019)
    # Nicolay Rusnachenko, Natalia Loukachevitch, Elena Tutubalina
    # https://www.aclweb.org/anthology/R19-1118/
    # https://github.com/nicolay-r/RuSentiFrames/tree/v1.0
    V10 = "v1_0"

    # Papers for description:
    # Sentiment Frames for Attitude Extraction in Russian (DIALOG-2020)
    # Natalia Loukachevitch, Nicolay Rusnachenko
    # https://github.com/nicolay-r/RuSentiFrames/tree/v2.0
    V20 = "v2_0"


class RuSentiFramesVersionsService:
    """ Conversions between RuSentiFramesVersions members and their string names
        (the enum values).
    """

    @staticmethod
    def __iter_supported_types():
        """ Iterate over every RuSentiFramesVersions member in declaration order. """
        return iter(RuSentiFramesVersions)

    @staticmethod
    def get_name_by_type(version_type):
        """ version_type: RuSentiFramesVersions
            returns: str, the value of the enum member.
        """
        assert isinstance(version_type, RuSentiFramesVersions)
        return version_type.value

    @staticmethod
    def get_type_by_name(name):
        """ name: value of one of the RuSentiFramesVersions members.
            returns: the matching RuSentiFramesVersions member.
            raises: Exception when no member has the given value.
        """
        for version_type in RuSentiFramesVersionsService.__iter_supported_types():
            if version_type.value == name:
                return version_type

        # The message used to read "was hot found!" (typo).
        raise Exception("RuSentiFrames version by name `{}` was not found!".format(name))

    @staticmethod
    def iter_supported_names():
        """ Yield the string value of every supported version. """
        for version_type in RuSentiFramesVersionsService.__iter_supported_types():
            yield version_type.value
@@ -1,2 +0,0 @@
1
class FrameValue(object):
    """ Empty type: declares no attributes and no methods of its own. """
File without changes
@@ -1,3 +0,0 @@
1
- # Default label formatting.
2
- POS_LABEL_STR = "pos"
3
- NEG_LABEL_STR = "neg"
@@ -1,51 +0,0 @@
1
- from arekit.common.synonyms.base import SynonymsCollection
2
- from arekit.contrib.source.brat.doc import BratDocument
3
- from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
4
- from arekit.contrib.source.rusentrel.entities import RuSentRelDocumentEntityCollection
5
- from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions, RuSentRelIOUtils
6
-
7
-
8
class RuSentRelDocumentsReader(object):
    """ Builds BratDocument instances from the text files stored in the
        RuSentRel collection archive.
    """

    # region class methods

    @staticmethod
    def hide_first_entry(line, entry, hide_with=" "):
        """ Replace the first occurrence of `entry` in `line` with `hide_with`
            repeated len(entry) times. The line is returned unchanged when
            `entry` does not occur in it.
        """
        position = line.find(entry)
        if position < 0:
            return line

        tail_start = position + len(entry)
        return line[:position] + hide_with * len(entry) + line[tail_start:]

    @staticmethod
    def read_document(doc_id, synonyms, version=RuSentRelVersions.V11, target_doc_id=None):
        """ doc_id: index of the document inside the collection.
            synonyms: SynonymsCollection
            version: RuSentRelVersions
            target_doc_id: int or None
                when given, it is used as the id of the resulting document
                instead of `doc_id`.
        """
        assert isinstance(synonyms, SynonymsCollection)
        assert isinstance(version, RuSentRelVersions)
        assert target_doc_id is None or isinstance(target_doc_id, int)

        entities = RuSentRelDocumentEntityCollection.read_collection(
            doc_id=doc_id, synonyms=synonyms, version=version)

        result_doc_id = doc_id if target_doc_id is None else target_doc_id

        def mask_author(line):
            # Blank out the "{Author, Unknown}" marker while keeping the line length.
            return RuSentRelDocumentsReader.hide_first_entry(line, entry="{Author, Unknown}")

        def is_skipped(entity):
            return entity.Value in ['author', 'unknown']

        def file_to_doc(input_file):
            sentences = BratDocumentSentencesReader.from_file(
                input_file=input_file,
                entities=entities,
                line_handler=mask_author,
                skip_entity_func=is_skipped)
            return BratDocument(doc_id=result_doc_id, sentences=sentences, text_relations=[])

        inner_path = RuSentRelIOUtils.get_doc_innerpath(index=doc_id, version=version)
        return RuSentRelIOUtils.read_from_zip(
            inner_path=inner_path, process_func=file_to_doc, version=version)
@@ -1,26 +0,0 @@
1
- from arekit.common.entities.collection import EntityCollection
2
- from arekit.common.synonyms.base import SynonymsCollection
3
- from arekit.contrib.source.brat.annot import BratAnnotationParser
4
- from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions, RuSentRelIOUtils
5
-
6
-
7
class RuSentRelDocumentEntityCollection(EntityCollection):
    """ Entities of a single RuSentRel document, sorted by their IndexBegin. """

    def __init__(self, entities, value_to_group_id_func):
        super().__init__(entities=entities,
                         value_to_group_id_func=value_to_group_id_func)

        def begin_of(entity):
            return entity.IndexBegin

        self._sort_entities(key=begin_of)

    @classmethod
    def read_collection(cls, doc_id, synonyms, version=RuSentRelVersions.V11):
        """ Read the annotation file of the document `doc_id` from the
            collection archive and wrap its "entities" into a collection.

            doc_id: int
            synonyms: SynonymsCollection, provides the entity value -> group index mapping.
            version: RuSentRelVersions
        """
        assert isinstance(synonyms, SynonymsCollection)
        assert isinstance(doc_id, int)

        def build(input_file):
            annotations = BratAnnotationParser.parse_annotations(input_file)
            return cls(entities=annotations["entities"],
                       value_to_group_id_func=synonyms.get_synonym_group_index)

        inner_path = RuSentRelIOUtils.get_entity_innerpath(index=doc_id, version=version)
        return RuSentRelIOUtils.read_from_zip(
            inner_path=inner_path, process_func=build, version=version)
@@ -1,125 +0,0 @@
1
- from os import path
2
-
3
- from enum import Enum
4
-
5
- from arekit.contrib.source.zip_utils import ZipArchiveUtils
6
-
7
-
8
class RuSentRelVersions(Enum):
    """ Versions of the RuSentRel collection.
        Original collection repository: https://github.com/nicolay-r/RuSentRel
        Paper: https://arxiv.org/abs/1808.08932
    """

    V11 = "v1_1"
13
-
14
-
15
class RuSentRelIOUtils(ZipArchiveUtils):
    """ Archive path and document index helpers for the zipped RuSentRel collection. """

    TEST_FOLDER = "test"
    TRAIN_FOLDER = "train"
    ETALON_FOLDER = "etalon"

    @staticmethod
    def get_archive_filepath(version):
        """ version: str
                version name that is substituted into the archive filename.
            returns: path to "rusentrel-<version>.zip" inside the data root.
        """
        # This used to be `assert(version, str)`, which asserts a non-empty
        # tuple and therefore could never fail.
        assert isinstance(version, str)
        return path.join(RuSentRelIOUtils.get_data_root(), "rusentrel-{}.zip".format(version))

    # region internal methods

    @staticmethod
    def get_sentiment_opin_filepath(index, version, prefix='art'):
        """ Inner path of the "<prefix><index>.opin.txt" file; documents of the
            test part are looked up in the etalon folder.
        """
        root = RuSentRelIOUtils.__get_root_by_index(index, version=version, keep_etalon=True)
        return path.join(root, "{prefix}{index}.opin.txt".format(prefix=prefix, index=index))

    @staticmethod
    def get_entity_innerpath(index, version):
        """ Inner path of the "art<index>.ann" annotation file. """
        assert isinstance(index, int)
        assert isinstance(version, RuSentRelVersions)
        inner_root = RuSentRelIOUtils.__get_root_by_index(doc_id=index, version=version)
        return path.join(inner_root, "art{}.ann".format(index))

    @staticmethod
    def get_doc_innerpath(index, version):
        """ Inner path of the "art<index>.txt" document file. """
        assert isinstance(index, int)
        assert isinstance(version, RuSentRelVersions)
        inner_root = RuSentRelIOUtils.__get_root_by_index(doc_id=index, version=version)
        return path.join(inner_root, "art{}.txt".format(index))

    @staticmethod
    def get_synonyms_innerpath():
        """ Inner path of the synonyms file. """
        return "synonyms.txt"

    # endregion

    @staticmethod
    def __get_root_by_index(doc_id, version, keep_etalon=False):
        """ Folder that holds the document `doc_id`: TRAIN_FOLDER unless the id
            is among the test indices, in which case it is ETALON_FOLDER when
            `keep_etalon` is set and TEST_FOLDER otherwise.
        """
        assert RuSentRelIOUtils.__is_supported(version)
        assert isinstance(version, RuSentRelVersions)
        assert isinstance(doc_id, int)
        other_dir = RuSentRelIOUtils.ETALON_FOLDER if keep_etalon else RuSentRelIOUtils.TEST_FOLDER
        test_indices = set(RuSentRelIOUtils.__iter_indicies_from_dataset(version, RuSentRelIOUtils.TEST_FOLDER))
        return other_dir if doc_id in test_indices else RuSentRelIOUtils.TRAIN_FOLDER

    @staticmethod
    def __is_supported(version):
        """ True only for RuSentRelVersions.V11. """
        assert isinstance(version, RuSentRelVersions)
        return version == RuSentRelVersions.V11

    @staticmethod
    def __number_from_string(s):
        """ Concatenate every digit character of `s` into an int;
            None when `s` has no digits.
        """
        # `char` rather than `chr`, so the builtin is not shadowed.
        digit_chars = [char for char in s if char.isdigit()]

        if not digit_chars:
            return None

        return int("".join(digit_chars))

    @staticmethod
    def __iter_indicies_from_dataset(version, folder_name):
        """ Yield, once each, the numbers extracted from the archive filenames
            that contain `folder_name`.
        """
        assert isinstance(folder_name, str)
        assert RuSentRelIOUtils.__is_supported(version)

        used = set()

        for filename in RuSentRelIOUtils.iter_filenames_from_zip(version):
            if folder_name not in filename:
                continue

            index = RuSentRelIOUtils.__number_from_string(filename)

            # Skip names without digits and indices that were already reported.
            if index is None or index in used:
                continue

            used.add(index)

            yield index

    # region public methods

    @staticmethod
    def iter_test_indices(version):
        """ Yield document indices found under "<TEST_FOLDER>/". """
        assert RuSentRelIOUtils.__is_supported(version)
        yield from RuSentRelIOUtils.__iter_indicies_from_dataset(
            version=version, folder_name="{}/".format(RuSentRelIOUtils.TEST_FOLDER))

    @staticmethod
    def iter_train_indices(version):
        """ Yield document indices found under "<TRAIN_FOLDER>/". """
        assert RuSentRelIOUtils.__is_supported(version)
        yield from RuSentRelIOUtils.__iter_indicies_from_dataset(
            version=version, folder_name="{}/".format(RuSentRelIOUtils.TRAIN_FOLDER))

    @staticmethod
    def iter_collection_indices(version):
        """ Yield all train indices followed by all test indices. """
        assert RuSentRelIOUtils.__is_supported(version)
        yield from RuSentRelIOUtils.iter_train_indices(version)
        yield from RuSentRelIOUtils.iter_test_indices(version)

    # endregion
@@ -1,12 +0,0 @@
1
- from arekit.common.labels.base import Label
2
- from arekit.common.labels.str_fmt import StringLabelsFormatter
3
- from arekit.contrib.source.rusentrel.const import NEG_LABEL_STR, POS_LABEL_STR
4
-
5
-
6
class RuSentRelLabelsFormatter(StringLabelsFormatter):
    """ String labels formatter that maps NEG_LABEL_STR and POS_LABEL_STR
        onto the given label types.
    """

    def __init__(self, pos_label_type, neg_label_type):
        """ pos_label_type: subclass of Label, bound to POS_LABEL_STR.
            neg_label_type: subclass of Label, bound to NEG_LABEL_STR.
        """
        assert issubclass(pos_label_type, Label)
        assert issubclass(neg_label_type, Label)
        super().__init__(stol={
            NEG_LABEL_STR: neg_label_type,
            POS_LABEL_STR: pos_label_type,
        })
File without changes
@@ -1,30 +0,0 @@
1
- from arekit.contrib.source.rusentrel.const import POS_LABEL_STR, NEG_LABEL_STR
2
- from arekit.contrib.source.rusentrel.io_utils import RuSentRelIOUtils, RuSentRelVersions
3
- from arekit.contrib.source.rusentrel.labels_fmt import RuSentRelLabelsFormatter
4
- from arekit.contrib.source.rusentrel.opinions.provider import RuSentRelOpinionCollectionProvider
5
-
6
-
7
class RuSentRelOpinions:
    """
    Collection of sentiment opinions between entities
    """

    @staticmethod
    def iter_from_doc(doc_id, labels_fmt, version=RuSentRelVersions.V11):
        """ doc_id: index of the document whose opinions file is read.
            labels_fmt: RuSentRelLabelsFormatter
                must support both POS_LABEL_STR and NEG_LABEL_STR.
            version: RuSentRelVersions enum
        """
        assert isinstance(version, RuSentRelVersions)
        assert isinstance(labels_fmt, RuSentRelLabelsFormatter)
        for required_value in (POS_LABEL_STR, NEG_LABEL_STR):
            assert labels_fmt.supports_value(required_value)

        def read_opinions(input_file):
            # Lines with a label the formatter does not support raise an error.
            return RuSentRelOpinionCollectionProvider._iter_opinions_from_file(
                input_file=input_file,
                labels_formatter=labels_fmt,
                error_on_non_supported=True)

        opin_path = RuSentRelIOUtils.get_sentiment_opin_filepath(index=doc_id, version=version)
        return RuSentRelIOUtils.iter_from_zip(
            inner_path=opin_path, process_func=read_opinions, version=version)
@@ -1,40 +0,0 @@
1
- from arekit.common.labels.str_fmt import StringLabelsFormatter
2
- from arekit.common.opinions.base import Opinion
3
-
4
-
5
class OpinionConverter(object):
    """ Opinion type <-> string Converter.
    """

    @staticmethod
    def try_from_string(line, labels_formatter):
        """ Parse a comma separated "source, target, label[, ...]" line.

            returns: Opinion, or None when the formatter does not
                support the label of the line.
        """
        assert isinstance(line, str)

        fields = [field.strip() for field in line.strip().split(',')]
        assert len(fields) >= 3
        source_value, target_value, str_label = fields[:3]

        if labels_formatter.supports_value(str_label):
            return Opinion(source_value=source_value,
                           target_value=target_value,
                           label=labels_formatter.str_to_label(str_label))

        return None

    @staticmethod
    def try_to_string(opinion, labels_formatter):
        """ Format the opinion as "source, target, label, current".

            returns: str, or None when the formatter does not
                support the label of the opinion.
        """
        assert isinstance(opinion, Opinion)
        assert isinstance(labels_formatter, StringLabelsFormatter)

        if not labels_formatter.supports_label(opinion.Label):
            return None

        parts = (opinion.SourceValue,
                 opinion.TargetValue,
                 labels_formatter.label_to_str(opinion.Label))
        return "{}, {}, {}, current".format(*parts)
@@ -1,54 +0,0 @@
1
- from arekit.common.opinions.provider import OpinionCollectionsProvider
2
- from arekit.common.labels.str_fmt import StringLabelsFormatter
3
- from arekit.contrib.source.rusentrel.opinions.converter import OpinionConverter
4
-
5
-
6
class RuSentRelOpinionCollectionProvider(OpinionCollectionsProvider):
    """ Reads opinions written in the RuSentRel text format
        (one comma separated opinion per line).
    """

    @staticmethod
    def _iter_opinions_from_file(input_file, labels_formatter, error_on_non_supported):
        """ Yield an opinion for every non-blank line of `input_file`.

            input_file: opened file-like object; lines may be str or bytes.
            labels_formatter: StringLabelsFormatter
            error_on_non_supported: bool
                when True, a line whose label is not supported by the
                formatter raises an Exception; otherwise it is skipped.
        """
        assert isinstance(labels_formatter, StringLabelsFormatter)
        assert isinstance(error_on_non_supported, bool)

        for line in input_file.readlines():

            # Force perform decoding if needed.
            if isinstance(line, bytes):
                line = line.decode()

            # Skip every blank line, including whitespace-only and "\r\n" ones,
            # which previously reached the converter and failed its assertion.
            if not line.strip():
                continue

            opinion = OpinionConverter.try_from_string(
                line=line,
                labels_formatter=labels_formatter)

            if opinion is None:
                if error_on_non_supported:
                    # The template used to be raised unformatted, so the
                    # message showed a literal "{line}".
                    raise Exception("Line '{line}' has non supported label".format(line=line.strip()))
                continue

            yield opinion

    # region public methods

    def iter_opinions(self, source, encoding, labels_formatter, error_on_non_supported=True):
        """
        Important: For externally saved collections (using save_to_file method) and related usage

        source: str, path of the file to read.
        encoding: encoding used to open the file.
        """
        assert isinstance(source, str)
        assert isinstance(labels_formatter, StringLabelsFormatter)
        assert isinstance(error_on_non_supported, bool)

        with open(source, 'r', encoding=encoding) as input_file:
            yield from RuSentRelOpinionCollectionProvider._iter_opinions_from_file(
                input_file=input_file,
                labels_formatter=labels_formatter,
                error_on_non_supported=error_on_non_supported)

    # endregion
@@ -1,42 +0,0 @@
1
- import io
2
-
3
- from arekit.common.labels.str_fmt import StringLabelsFormatter
4
- from arekit.common.opinions.base import Opinion
5
- from arekit.common.opinions.collection import OpinionCollection
6
- from arekit.common.opinions.writer import OpinionCollectionWriter
7
- from arekit.common.utils import create_dir_if_not_exists
8
- from arekit.contrib.source.rusentrel.opinions.converter import OpinionConverter
9
-
10
-
11
class RuSentRelOpinionCollectionWriter(OpinionCollectionWriter):
    """ Writes an opinion collection as text, one opinion per line. """

    def serialize(self, collection, target, encoding, labels_formatter, error_on_non_supported=True):
        """ collection: OpinionCollection
            target: str, path of the output file.
            encoding: encoding of the output file.
            labels_formatter: StringLabelsFormatter
            error_on_non_supported: bool
                when True, an opinion whose label is not supported by the
                formatter raises an Exception; otherwise it is skipped.
        """
        assert isinstance(collection, OpinionCollection)
        assert isinstance(target, str)
        assert isinstance(labels_formatter, StringLabelsFormatter)
        assert isinstance(error_on_non_supported, bool)

        def sort_key(opinion):
            assert isinstance(opinion, Opinion)
            return opinion.SourceValue + opinion.TargetValue

        # Opinions are written ordered by the concatenated source and target values.
        ordered = list(collection)
        ordered.sort(key=sort_key)

        create_dir_if_not_exists(target)

        with io.open(target, 'w', encoding=encoding) as out:
            for opinion in ordered:
                text = OpinionConverter.try_to_string(
                    opinion=opinion,
                    labels_formatter=labels_formatter)

                if text is not None:
                    out.write(text)
                    out.write('\n')
                elif error_on_non_supported:
                    raise Exception("Opinion label `{label}` is not supported by formatter".format(
                        label=opinion.Label))
@@ -1,17 +0,0 @@
1
- from arekit.contrib.source.synonyms.utils import iter_synonym_groups
2
- from arekit.contrib.source.rusentrel.io_utils import RuSentRelIOUtils
3
-
4
-
5
class RuSentRelSynonymsCollectionHelper(object):
    """ Access to the synonym groups stored in the RuSentRel collection archive. """

    @staticmethod
    def iter_groups(version):
        """ Yield the synonym groups read from the synonyms file of the archive. """

        def read_groups(input_file):
            return iter_synonym_groups(input_file, desc="Loading RuSentRel Collection")

        yield from RuSentRelIOUtils.iter_from_zip(
            inner_path=RuSentRelIOUtils.get_synonyms_innerpath(),
            process_func=read_groups,
            version=version)
File without changes
@@ -1,52 +0,0 @@
1
- from arekit.common.entities.collection import EntityCollection
2
- from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
3
- from arekit.contrib.source.brat.annot import BratAnnotationParser
4
- from arekit.contrib.source.brat.entities.entity import BratEntity
5
- from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils
6
- from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
7
- from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection
8
-
9
-
10
class SentiNerelEntityCollection(EntityCollection):
    """ Entities of a parsed BRAT annotation, without the ignored entity
        types and sorted by IndexBegin.
    """

    def __init__(self, contents, value_to_group_id_func, entities_to_ignore=None):
        """ contents: dict
                parsed annotation; its BratAnnotationParser.ENTITIES entry
                is replaced in place by the filtered list.
            entities_to_ignore: list or None
                this parameter is required because of the simplified implementation of
                the nested objects of the BRAT annotation.
        """
        assert isinstance(contents, dict)
        assert BratAnnotationParser.ENTITIES in contents
        assert entities_to_ignore is None or isinstance(entities_to_ignore, list)

        self.__dicard_entities = set() if entities_to_ignore is None else set(entities_to_ignore)

        kept_entities = list(filter(self.__keep_entity, contents[BratAnnotationParser.ENTITIES]))
        contents[BratAnnotationParser.ENTITIES] = kept_entities

        super().__init__(entities=kept_entities,
                         value_to_group_id_func=value_to_group_id_func)

        self._sort_entities(key=lambda entity: entity.IndexBegin)

    def __keep_entity(self, entity):
        """ True when the type of the entity is not among the ignored ones. """
        assert isinstance(entity, BratEntity)
        return entity.Type not in self.__dicard_entities

    @classmethod
    def read_collection(cls, filename, version, entities_to_ignore=None):
        """ Read the annotation of `filename` from the collection archive.

            filename: str
            version: forwarded to SentiNerelIOUtils.read_from_zip.
            entities_to_ignore: list or None, forwarded to the constructor.
        """
        assert isinstance(filename, str)

        # Since this dataset does not provide the synonyms collection by default,
        # an empty, writable collection is declared here and populated on demand.
        synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)

        def group_id_of(value):
            return SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
                synonyms, value)

        def build(input_file):
            parsed = BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig')
            return cls(contents=parsed,
                       entities_to_ignore=entities_to_ignore,
                       value_to_group_id_func=group_id_of)

        return SentiNerelIOUtils.read_from_zip(
            inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
            process_func=build,
            version=version)
File without changes
@@ -1,31 +0,0 @@
1
- from arekit.contrib.source.sentinerel.folding.fixed import create_fixed_folding_doc_ids
2
-
3
-
4
class SentiNERELFoldingFactory:
    """ Factory of the variety types of the splits that
        are considered within the present experiments.
    """

    @staticmethod
    def create_fixed_folding(file, limit=None):
        """ file: opened file-like object whose first line lists the train
                filenames and whose second line lists the test filenames.
            limit: int
                Allows to limit amount of documents (utilized for testing reasons)
        """
        train_filenames, test_filenames = SentiNERELFoldingFactory._read_train_test(f=file)

        if limit is not None:
            train_filenames, test_filenames = train_filenames[:limit], test_filenames[:limit]

        # NOTE(review): a two-item result is expected here; if
        # create_fixed_folding_doc_ids returns a single dict keyed by DataType,
        # this unpacking fails -- confirm against its implementation.
        filenames_by_ids, data_folding = create_fixed_folding_doc_ids(
            train_filenames=train_filenames,
            test_filenames=test_filenames)

        return filenames_by_ids, data_folding

    @staticmethod
    def _read_train_test(f):
        """ Split every line of `f` by commas (bytes lines are decoded as utf-8)
            and return the parts of the first and of the second line.
        """
        rows = [
            (raw.decode('utf-8') if isinstance(raw, bytes) else raw).strip().split(',')
            for raw in f.readlines()
        ]
        return rows[0], rows[1]
@@ -1,70 +0,0 @@
1
- from collections import OrderedDict
2
-
3
- from arekit.common.experiment.data_type import DataType
4
-
5
-
6
def create_fixed_folding_doc_ids(train_filenames, test_filenames):
    """ Create fixed data-folding based on the predefined lists of filenames.

        train_filenames: list of str
        test_filenames: list of str
        returns: dict, maps DataType.Train onto the train document ids and
            DataType.Test, DataType.Etalon and DataType.Dev onto the very
            same list of test document ids.
    """
    assert isinstance(train_filenames, list)
    assert isinstance(test_filenames, list)

    filenames_by_ids = __create_filenames_by_ids(filenames=train_filenames + test_filenames)

    # Inverse mapping: filename -> document id.
    ids_by_filenames = {filename: doc_id for doc_id, filename in filenames_by_ids.items()}

    train_doc_ids = [ids_by_filenames[name] for name in train_filenames]
    test_doc_ids = [ids_by_filenames[name] for name in test_filenames]

    folding = {DataType.Train: train_doc_ids}
    for data_type in (DataType.Test, DataType.Etalon, DataType.Dev):
        folding[data_type] = test_doc_ids

    return folding
28
-
29
-
30
def __create_filenames_by_ids(filenames):
    """ Indexing filenames: returns an OrderedDict of document id -> filename.
        The id is the number extracted from the filename; a filename without
        such a number receives the first id that is not taken yet.
    """
    filenames_by_ids = OrderedDict()
    last_generated_id = 0

    for fname in filenames:
        doc_id = __number_from_string(fname)

        if doc_id is None:
            # Search for a free id, starting from the previously generated one.
            doc_id = last_generated_id
            while doc_id in filenames_by_ids:
                doc_id += 1
            last_generated_id = doc_id

        filenames_by_ids[doc_id] = fname

    return filenames_by_ids
54
-
55
-
56
def __number_from_string(s):
    """ Return the leading run of digit characters of `s` as an int,
        or None when `s` does not start with a digit.
    """
    assert isinstance(s, str)

    prefix_length = 0
    while prefix_length < len(s) and s[prefix_length].isdigit():
        prefix_length += 1

    if prefix_length == 0:
        return None

    return int(s[:prefix_length])