arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl

This diff covers publicly released versions of the package as they appear in their respective public registries and is provided for informational purposes only.
Files changed (224)
  1. arekit/common/data/storages/base.py +4 -15
  2. arekit/common/docs/entities_grouping.py +2 -1
  3. arekit/common/docs/parser.py +27 -22
  4. arekit/common/pipeline/base.py +12 -16
  5. arekit/common/pipeline/batching.py +28 -0
  6. arekit/common/pipeline/context.py +5 -1
  7. arekit/common/pipeline/items/base.py +39 -2
  8. arekit/common/pipeline/items/flatten.py +5 -1
  9. arekit/common/pipeline/items/handle.py +2 -1
  10. arekit/common/pipeline/items/iter.py +2 -1
  11. arekit/common/pipeline/items/map.py +2 -1
  12. arekit/common/pipeline/items/map_nested.py +4 -0
  13. arekit/common/pipeline/utils.py +32 -0
  14. arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
  15. arekit/common/utils.py +11 -52
  16. arekit/contrib/utils/data/contents/opinions.py +13 -3
  17. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  18. arekit/contrib/utils/data/storages/pandas_based.py +2 -17
  19. arekit/contrib/utils/data/storages/row_cache.py +8 -2
  20. arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
  21. arekit/contrib/utils/io_utils/utils.py +1 -18
  22. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  23. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  24. arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
  25. arekit-0.25.1.data/data/logo.png +0 -0
  26. arekit-0.25.1.dist-info/METADATA +81 -0
  27. arekit-0.25.1.dist-info/RECORD +186 -0
  28. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
  29. arekit/common/data/input/repositories/__init__.py +0 -0
  30. arekit/common/data/input/repositories/base.py +0 -68
  31. arekit/common/data/input/repositories/sample.py +0 -22
  32. arekit/common/data/views/__init__.py +0 -0
  33. arekit/common/data/views/samples.py +0 -26
  34. arekit/common/docs/objects_parser.py +0 -37
  35. arekit/common/text/parser.py +0 -12
  36. arekit/common/text/partitioning/__init__.py +0 -0
  37. arekit/common/text/partitioning/base.py +0 -4
  38. arekit/common/text/partitioning/terms.py +0 -35
  39. arekit/contrib/networks/__init__.py +0 -0
  40. arekit/contrib/networks/embedding.py +0 -149
  41. arekit/contrib/networks/embedding_io.py +0 -18
  42. arekit/contrib/networks/input/__init__.py +0 -0
  43. arekit/contrib/networks/input/const.py +0 -6
  44. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  45. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  46. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  47. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  48. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  49. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  50. arekit/contrib/networks/input/providers/__init__.py +0 -0
  51. arekit/contrib/networks/input/providers/sample.py +0 -129
  52. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  53. arekit/contrib/networks/input/providers/text.py +0 -24
  54. arekit/contrib/networks/input/rows_parser.py +0 -47
  55. arekit/contrib/networks/input/term_types.py +0 -13
  56. arekit/contrib/networks/input/terms_mapping.py +0 -60
  57. arekit/contrib/networks/vectorizer.py +0 -6
  58. arekit/contrib/source/__init__.py +0 -0
  59. arekit/contrib/source/brat/__init__.py +0 -0
  60. arekit/contrib/source/brat/annot.py +0 -84
  61. arekit/contrib/source/brat/doc.py +0 -28
  62. arekit/contrib/source/brat/entities/__init__.py +0 -0
  63. arekit/contrib/source/brat/entities/compound.py +0 -13
  64. arekit/contrib/source/brat/entities/entity.py +0 -42
  65. arekit/contrib/source/brat/entities/parser.py +0 -53
  66. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  67. arekit/contrib/source/brat/opinions/converter.py +0 -19
  68. arekit/contrib/source/brat/relation.py +0 -32
  69. arekit/contrib/source/brat/sentence.py +0 -69
  70. arekit/contrib/source/brat/sentences_reader.py +0 -128
  71. arekit/contrib/source/download.py +0 -41
  72. arekit/contrib/source/nerel/__init__.py +0 -0
  73. arekit/contrib/source/nerel/entities.py +0 -55
  74. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  75. arekit/contrib/source/nerel/folding/fixed.py +0 -74
  76. arekit/contrib/source/nerel/io_utils.py +0 -62
  77. arekit/contrib/source/nerel/labels.py +0 -241
  78. arekit/contrib/source/nerel/reader.py +0 -46
  79. arekit/contrib/source/nerel/utils.py +0 -24
  80. arekit/contrib/source/nerel/versions.py +0 -12
  81. arekit/contrib/source/nerelbio/__init__.py +0 -0
  82. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  83. arekit/contrib/source/nerelbio/labels.py +0 -265
  84. arekit/contrib/source/nerelbio/reader.py +0 -8
  85. arekit/contrib/source/nerelbio/versions.py +0 -8
  86. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  87. arekit/contrib/source/ruattitudes/collection.py +0 -36
  88. arekit/contrib/source/ruattitudes/doc.py +0 -51
  89. arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
  90. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  91. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  92. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  93. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  94. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  95. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  96. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  97. arekit/contrib/source/ruattitudes/reader.py +0 -268
  98. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  99. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  100. arekit/contrib/source/ruattitudes/text_object.py +0 -59
  101. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  102. arekit/contrib/source/rusentiframes/collection.py +0 -157
  103. arekit/contrib/source/rusentiframes/effect.py +0 -24
  104. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  105. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  106. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  107. arekit/contrib/source/rusentiframes/role.py +0 -15
  108. arekit/contrib/source/rusentiframes/state.py +0 -24
  109. arekit/contrib/source/rusentiframes/types.py +0 -42
  110. arekit/contrib/source/rusentiframes/value.py +0 -2
  111. arekit/contrib/source/rusentrel/__init__.py +0 -0
  112. arekit/contrib/source/rusentrel/const.py +0 -3
  113. arekit/contrib/source/rusentrel/docs_reader.py +0 -51
  114. arekit/contrib/source/rusentrel/entities.py +0 -26
  115. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  116. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  117. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  118. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  119. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  120. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  121. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  122. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  123. arekit/contrib/source/sentinerel/__init__.py +0 -0
  124. arekit/contrib/source/sentinerel/entities.py +0 -52
  125. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  126. arekit/contrib/source/sentinerel/folding/factory.py +0 -31
  127. arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
  128. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  129. arekit/contrib/source/sentinerel/labels.py +0 -53
  130. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  131. arekit/contrib/source/sentinerel/reader.py +0 -42
  132. arekit/contrib/source/synonyms/__init__.py +0 -0
  133. arekit/contrib/source/synonyms/utils.py +0 -19
  134. arekit/contrib/source/zip_utils.py +0 -47
  135. arekit/contrib/utils/connotations/__init__.py +0 -0
  136. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  137. arekit/contrib/utils/data/readers/__init__.py +0 -0
  138. arekit/contrib/utils/data/readers/base.py +0 -7
  139. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  140. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  141. arekit/contrib/utils/data/service/__init__.py +0 -0
  142. arekit/contrib/utils/data/service/balance.py +0 -50
  143. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  144. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  145. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  146. arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
  147. arekit/contrib/utils/download.py +0 -77
  148. arekit/contrib/utils/embeddings/__init__.py +0 -0
  149. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  150. arekit/contrib/utils/embeddings/tokens.py +0 -30
  151. arekit/contrib/utils/io_utils/embedding.py +0 -72
  152. arekit/contrib/utils/io_utils/opinions.py +0 -37
  153. arekit/contrib/utils/io_utils/samples.py +0 -79
  154. arekit/contrib/utils/lexicons/__init__.py +0 -0
  155. arekit/contrib/utils/lexicons/lexicon.py +0 -41
  156. arekit/contrib/utils/lexicons/relation.py +0 -42
  157. arekit/contrib/utils/lexicons/rusentilex.py +0 -37
  158. arekit/contrib/utils/nn/__init__.py +0 -0
  159. arekit/contrib/utils/nn/rows.py +0 -83
  160. arekit/contrib/utils/np_utils/__init__.py +0 -0
  161. arekit/contrib/utils/np_utils/embedding.py +0 -22
  162. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  163. arekit/contrib/utils/np_utils/vocab.py +0 -20
  164. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  165. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
  166. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
  167. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  168. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
  169. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  170. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
  171. arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
  172. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  173. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  174. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
  175. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
  176. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  177. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  178. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
  179. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
  180. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  181. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  182. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
  183. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
  184. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
  185. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  186. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
  187. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
  188. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  189. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
  190. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  191. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
  192. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  193. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  194. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  195. arekit/contrib/utils/processing/languages/mods.py +0 -12
  196. arekit/contrib/utils/processing/languages/pos.py +0 -23
  197. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  198. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  199. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  200. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  201. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  202. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  203. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  204. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  205. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  206. arekit/contrib/utils/processing/pos/base.py +0 -12
  207. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  208. arekit/contrib/utils/processing/pos/russian.py +0 -10
  209. arekit/contrib/utils/processing/text/__init__.py +0 -0
  210. arekit/contrib/utils/processing/text/tokens.py +0 -127
  211. arekit/contrib/utils/resources.py +0 -25
  212. arekit/contrib/utils/serializer.py +0 -43
  213. arekit/contrib/utils/sources/__init__.py +0 -0
  214. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  215. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  216. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  217. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  218. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  219. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  220. arekit/download_data.py +0 -11
  221. arekit-0.24.0.dist-info/METADATA +0 -23
  222. arekit-0.24.0.dist-info/RECORD +0 -374
  223. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
  224. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
arekit/contrib/utils/io_utils/opinions.py
@@ -1,37 +0,0 @@
- from os.path import join
-
- from arekit.contrib.utils.data.readers.base import BaseReader
- from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
- from arekit.contrib.utils.io_utils.utils import filename_template
-
-
- class OpinionsIO(BaseSamplesIO):
-
-     def __init__(self, target_dir, reader=None, prefix="opinion"):
-         assert(isinstance(reader, BaseReader))
-         self.__target_dir = target_dir
-         self.__prefix = prefix
-         self.__reader = reader
-         self.__target_extension = reader.extension()
-
-     @property
-     def Reader(self):
-         return self.__reader
-
-     def create_target(self, data_type):
-         return self.__get_input_opinions_target(data_type)
-
-     def __get_input_opinions_target(self, data_type):
-         template = filename_template(data_type=data_type)
-         return self.__get_filepath(out_dir=self.__target_dir,
-                                    template=template,
-                                    prefix=self.__prefix,
-                                    extension=self.__target_extension)
-
-     @staticmethod
-     def __get_filepath(out_dir, template, prefix, extension):
-         assert(isinstance(template, str))
-         assert(isinstance(prefix, str))
-         assert(isinstance(extension, str))
-         return join(out_dir, "{prefix}-{template}{extension}".format(
-             prefix=prefix, template=template, extension=extension))
arekit/contrib/utils/io_utils/samples.py
@@ -1,79 +0,0 @@
- import logging
- from os.path import join
-
- from arekit.contrib.utils.data.readers.base import BaseReader
- from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
- from arekit.contrib.utils.data.writers.base import BaseWriter
- from arekit.contrib.utils.io_utils.utils import filename_template, check_targets_existence
-
- logger = logging.getLogger(__name__)
- logging.basicConfig(level=logging.INFO)
-
-
- class SamplesIO(BaseSamplesIO):
-     """ Samples default IO utils for samples.
-         Sample is a text part which include pair of attitude participants.
-         This class allows to provide saver and loader for such entries, bubbed as samples.
-         Samples required for machine learning training/inferring.
-     """
-
-     def __init__(self, target_dir, writer=None, reader=None, prefix="sample"):
-         assert(isinstance(target_dir, str))
-         assert(isinstance(prefix, str))
-         assert(isinstance(writer, BaseWriter) or writer is None)
-         assert(isinstance(reader, BaseReader) or reader is None)
-         self.__target_dir = target_dir
-         self.__prefix = prefix
-         self.__writer = writer
-         self.__reader = reader
-
-         self.__target_extension = None
-         if writer is not None:
-             self.__target_extension = writer.extension()
-         elif reader is not None:
-             self.__target_extension = reader.extension()
-
-     # region public methods
-
-     @property
-     def Prefix(self):
-         return self.__prefix
-
-     @property
-     def Reader(self):
-         return self.__reader
-
-     @property
-     def Writer(self):
-         return self.__writer
-
-     def create_target(self, data_type):
-         return self.__get_input_sample_target(data_type)
-
-     def check_targets_existed(self, data_types_iter):
-         for data_type in data_types_iter:
-
-             targets = [
-                 self.__get_input_sample_target(data_type=data_type),
-             ]
-
-             if not check_targets_existence(targets=targets):
-                 return False
-         return True
-
-     # endregion
-
-     def __get_input_sample_target(self, data_type):
-         template = filename_template(data_type=data_type)
-         return self.__get_filepath(out_dir=self.__target_dir,
-                                    template=template,
-                                    prefix=self.__prefix,
-                                    extension=self.__target_extension)
-
-     @staticmethod
-     def __get_filepath(out_dir, template, prefix, extension):
-         assert(isinstance(template, str))
-         assert(isinstance(prefix, str))
-         assert(isinstance(extension, str))
-         return join(out_dir, "{prefix}-{template}{extension}".format(
-             prefix=prefix, template=template, extension=extension))
arekit/contrib/utils/lexicons/__init__.py: File without changes
arekit/contrib/utils/lexicons/lexicon.py
@@ -1,41 +0,0 @@
- from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
-
-
- class Lexicon(object):
-
-     @property
-     def ToneKey(self):
-         return 'tone'
-
-     @property
-     def TermKey(self):
-         return 'term'
-
-     def __init__(self, dataframe):
-         self.__lexicon_df = dataframe
-
-     @classmethod
-     def load(cls, filepath, separator=','):
-         reader = PandasCsvReader(compression=None, sep=separator)
-         return cls(reader.read(filepath))
-
-     def get_score(self, lemma):
-         assert(type(lemma) == str)
-         s = self.__lexicon_df[lemma.encode('utf-8') == self.__lexicon_df[self.TermKey]]
-         return s[self.ToneKey].values[0] if len(s) > 0 else 0
-
-     def has_term(self, term):
-         assert(type(term) == str)
-         s = self.__lexicon_df[term.encode('utf-8') == self.__lexicon_df[self.TermKey]]
-         return len(s) > 0
-
-     def __iter__(self):
-         for term in self.__lexicon_df[self.TermKey]:
-             yield term
-
-     def __contains__(self, item):
-         assert(isinstance(item, str))
-         result = self.__lexicon_df[self.__lexicon_df[self.TermKey] == item.encode('utf-8')]
-         return len(result) > 0
-
-
arekit/contrib/utils/lexicons/relation.py
@@ -1,42 +0,0 @@
- from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
-
-
- class RelationLexicon(object):
-
-     def __init__(self, dataframe):
-         self.__check(dataframe)
-         self.__lexicon = dataframe
-
-     @classmethod
-     def load(cls, filepath, separator=','):
-         reader = PandasCsvReader(compression=None, sep=separator)
-         return cls(reader.read(filepath))
-
-     @staticmethod
-     def __check(df):
-         for index in df.index:
-             relation = df.loc[index][0]
-             assert(len(relation.split('<->')) == 2)
-
-     @staticmethod
-     def __create_key(l, r):
-         assert(type(l) == str)
-         assert(type(r) == str)
-         return '<->'.join([l, r])
-
-     def get_score(self, left, right):
-         assert(type(left) == str)
-         assert(type(right) == str)
-
-         lr_key = self.__create_key(left, right)
-         rl_key = self.__create_key(right, left)
-
-         lr_score = self.__lexicon[lr_key == self.__lexicon['relation']]
-         rl_score = self.__lexicon[rl_key == self.__lexicon['relation']]
-
-         if len(lr_score) > 0:
-             return lr_score['tone'].values[0]
-         if len(rl_score) > 0:
-             return rl_score['tone'].values[0]
-
-         return None
arekit/contrib/utils/lexicons/rusentilex.py
@@ -1,37 +0,0 @@
- import importlib
- import zipfile
- from os import path
-
-
- from arekit.contrib.source.zip_utils import ZipArchiveUtils
- from arekit.contrib.utils.lexicons.lexicon import Lexicon
-
-
- class RuSentiLexLexicon(Lexicon):
-     """
-     RuSentiLex Lexicon wrapper for csv file stored in /data folder.
-     """
-
-     __INNER_PATH = 'rusentilex.csv'
-
-     @property
-     def ToneKey(self):
-         return 'tone'
-
-     @property
-     def TermKey(self):
-         return 'term'
-
-     @staticmethod
-     def __get_archive_filepath():
-         return path.join(ZipArchiveUtils.get_data_root(), "rusentilex.zip")
-
-     @classmethod
-     def from_zip(cls):
-         """ Using Pandas API to read lexicon.
-         """
-         pd = importlib.import_module("pandas")
-         with zipfile.ZipFile(cls.__get_archive_filepath(), "r") as zip_ref:
-             with zip_ref.open(cls.__INNER_PATH, mode='r') as csv_file:
-                 df = pd.read_csv(csv_file, sep=',')
-                 return cls(df)
arekit/contrib/utils/nn/__init__.py: File without changes
arekit/contrib/utils/nn/rows.py
@@ -1,83 +0,0 @@
- import collections
-
- from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
- from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
- from arekit.common.entities.str_fmt import StringEntitiesFormatter
- from arekit.contrib.networks.input.ctx_serialization import NetworkSerializationContext
- from arekit.contrib.networks.input.formatters.pos_mapper import PosTermsMapper
- from arekit.contrib.networks.input.providers.sample import NetworkSampleRowProvider
- from arekit.contrib.networks.input.providers.text import NetworkSingleTextProvider
- from arekit.contrib.networks.input.term_types import TermTypes
- from arekit.contrib.networks.input.terms_mapping import VectorizedNetworkTermMapping
- from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
- from arekit.contrib.utils.resources import load_embedding_news_mystem_skipgram_1000_20_2015
- from arekit.contrib.utils.vectorizers.bpe import BPEVectorizer
- from arekit.contrib.utils.vectorizers.random_norm import RandomNormalVectorizer
-
-
- def __add_term_embedding(dict_data, term, emb_vector):
-     if term in dict_data:
-         return
-     dict_data[term] = emb_vector
-
-
- def create_rows_provider(str_entity_fmt, ctx, vectorizers="default"):
-     """ This method is corresponds to the default initialization of
-         the rows provider for data sampling pipeline.
-
-         vectorizers:
-             NONE: no need to vectorize, just provide text (using SingleTextProvider).
-             DEFAULT: we consider an application of stemmer for Russian Language.
-             DICT: in which for every type there is an assigned Vectorizer
-                 vectorization of term types.
-                 {
-                     TermType.Word: Vectorizer,
-                     TermType.Entity: Vectorizer,
-                     ...
-                 }
-     """
-     assert(isinstance(str_entity_fmt, StringEntitiesFormatter))
-     assert(isinstance(ctx, NetworkSerializationContext))
-     assert(isinstance(vectorizers, dict) or vectorizers == "default" or vectorizers is None)
-
-     term_embedding_pairs = None
-
-     if vectorizers is not None:
-
-         if vectorizers == "default":
-             # initialize default vectorizer for Russian language.
-             embedding = load_embedding_news_mystem_skipgram_1000_20_2015(stemmer=MystemWrapper(), auto_download=True)
-             bpe_vectorizer = BPEVectorizer(embedding=embedding, max_part_size=3)
-             norm_vectorizer = RandomNormalVectorizer(vector_size=embedding.VectorSize,
-                                                      token_offset=12345)
-             vectorizers = {
-                 TermTypes.WORD: bpe_vectorizer,
-                 TermTypes.ENTITY: bpe_vectorizer,
-                 TermTypes.FRAME: bpe_vectorizer,
-                 TermTypes.TOKEN: norm_vectorizer
-             }
-
-         # Setup term-embedding pairs collection instance.
-         term_embedding_pairs = collections.OrderedDict()
-
-         # Use text provider with vectorizers.
-         text_provider = NetworkSingleTextProvider(
-             text_terms_mapper=VectorizedNetworkTermMapping(
-                 vectorizers=vectorizers,
-                 string_entities_formatter=str_entity_fmt),
-             pair_handling_func=lambda pair: __add_term_embedding(
-                 dict_data=term_embedding_pairs,
-                 term=pair[0],
-                 emb_vector=pair[1]))
-     else:
-         # Create text provider which without vectorizers.
-         text_provider = BaseSingleTextProvider(
-             text_terms_mapper=OpinionContainingTextTermsMapper(str_entity_fmt))
-
-     return NetworkSampleRowProvider(
-         label_provider=ctx.LabelProvider,
-         text_provider=text_provider,
-         frames_connotation_provider=ctx.FramesConnotationProvider,
-         frame_role_label_scaler=ctx.FrameRolesLabelScaler,
-         pos_terms_mapper=PosTermsMapper(ctx.PosTagger) if ctx.PosTagger is not None else None,
-         term_embedding_pairs=term_embedding_pairs)
arekit/contrib/utils/np_utils/__init__.py: File without changes
arekit/contrib/utils/np_utils/embedding.py
@@ -1,22 +0,0 @@
- import logging
-
- from arekit.contrib.utils.np_utils.npz_utils import NpzRepositoryUtils
-
- logger = logging.getLogger(__name__)
- logging.basicConfig(level=logging.INFO)
-
-
- class NpzEmbeddingHelper:
-
-     @staticmethod
-     def save_embedding(data, target):
-         NpzRepositoryUtils.save(data=data, target=target)
-         logger.info("Saving embedding [size={shape}]: {filepath}".format(shape=data.shape,
-                                                                          filepath=target))
-
-     @staticmethod
-     def load_embedding(source):
-         embedding = NpzRepositoryUtils.load(source)
-         logger.info("Embedding read [size={size}]: {filepath}".format(size=embedding.shape,
-                                                                       filepath=source))
-         return embedding
arekit/contrib/utils/np_utils/npz_utils.py
@@ -1,13 +0,0 @@
- import numpy as np
-
-
- class NpzRepositoryUtils(object):
-
-     @staticmethod
-     def save(data, target):
-         np.savez(target, data)
-
-     @staticmethod
-     def load(source):
-         data = np.load(source)
-         return data['arr_0']
arekit/contrib/utils/np_utils/vocab.py
@@ -1,20 +0,0 @@
- import logging
-
- import numpy as np
-
- logger = logging.getLogger(__name__)
- logging.basicConfig(level=logging.INFO)
-
-
- class VocabRepositoryUtils(object):
-
-     @staticmethod
-     def save(data, target):
-         logger.info("Saving vocabulary [size={size}]: {filepath}".format(size=len(data), filepath=target))
-         np.savetxt(target, data, fmt='%s')
-
-     @staticmethod
-     def load(source):
-         vocab = np.loadtxt(source, dtype=str, comments=None)
-         logger.info("Loading vocabulary [size={size}]: {filepath}".format(size=len(vocab), filepath=source))
-         return vocab
arekit/contrib/utils/pipelines/items/sampling/__init__.py: File without changes
arekit/contrib/utils/pipelines/items/sampling/base.py
@@ -1,99 +0,0 @@
- from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
- from arekit.common.data.storages.base import BaseRowsStorage
- from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
- from arekit.common.experiment.data_type import DataType
- from arekit.common.pipeline.base import BasePipeline
- from arekit.common.pipeline.context import PipelineContext
- from arekit.common.pipeline.items.base import BasePipelineItem
- from arekit.contrib.utils.serializer import InputDataSerializationHelper
-
-
- class BaseSerializerPipelineItem(BasePipelineItem):
-
-     def __init__(self, rows_provider, samples_io, save_labels_func, storage):
-         """ sample_rows_formatter:
-                 how we format input texts for a BERT model, for example:
-                     - single text
-                     - two sequences, separated by [SEP] token
-
-             save_labels_func: function
-                 data_type -> bool
-         """
-         assert(isinstance(rows_provider, BaseSampleRowProvider))
-         assert(isinstance(samples_io, BaseSamplesIO))
-         assert(callable(save_labels_func))
-         assert(isinstance(storage, BaseRowsStorage))
-
-         self._rows_provider = rows_provider
-         self._samples_io = samples_io
-         self._save_labels_func = save_labels_func
-         self._storage = storage
-
-     def _serialize_iteration(self, data_type, pipeline, data_folding, doc_ids):
-         assert(isinstance(data_type, DataType))
-         assert(isinstance(pipeline, BasePipeline))
-         assert(isinstance(data_folding, dict) or data_folding is None)
-         assert(isinstance(doc_ids, list) or doc_ids is None)
-         assert(doc_ids is not None or data_folding is not None)
-
-         repos = {
-             "sample": InputDataSerializationHelper.create_samples_repo(
-                 keep_labels=self._save_labels_func(data_type),
-                 rows_provider=self._rows_provider,
-                 storage=self._storage),
-         }
-
-         writer_and_targets = {
-             "sample": (self._samples_io.Writer,
-                        self._samples_io.create_target(data_type=data_type)),
-         }
-
-         for description, repo in repos.items():
-
-             if data_folding is None:
-                 # Consider only the predefined doc_ids.
-                 doc_ids_iter = doc_ids
-             else:
-                 # Take particular data_type.
-                 doc_ids_iter = data_folding[data_type]
-                 # Consider only predefined doc_ids.
-                 if doc_ids is not None:
-                     doc_ids_iter = set(doc_ids_iter).intersection(doc_ids)
-
-             InputDataSerializationHelper.fill_and_write(
-                 repo=repo,
-                 pipeline=pipeline,
-                 doc_ids_iter=doc_ids_iter,
-                 desc="{desc} [{data_type}]".format(desc=description, data_type=data_type),
-                 writer=writer_and_targets[description][0],
-                 target=writer_and_targets[description][1])
-
-     def _handle_iteration(self, data_type_pipelines, data_folding, doc_ids):
-         """ Performing data serialization for a particular iteration
-         """
-         assert(isinstance(data_type_pipelines, dict))
-         for data_type, pipeline in data_type_pipelines.items():
-             self._serialize_iteration(data_type=data_type, pipeline=pipeline, data_folding=data_folding,
-                                       doc_ids=doc_ids)
-
-     def apply_core(self, input_data, pipeline_ctx):
-         """
-         data_type_pipelines: dict of, for example:
-             {
-                 DataType.Train: BasePipeline,
-                 DataType.Test: BasePipeline
-             }
-
-         data_type_pipelines: doc_id -> parsed_doc -> annot -> opinion linkages
-             for example, function: sentiment_attitude_extraction_default_pipeline
-         doc_ids: optional
-             this parameter allows to limit amount of documents considered for sampling
-         """
-         assert(isinstance(input_data, PipelineContext))
-         assert("data_type_pipelines" in input_data)
-
-         data_folding = input_data.provide_or_none("data_folding")
-
-         self._handle_iteration(data_type_pipelines=input_data.provide("data_type_pipelines"),
-                                doc_ids=input_data.provide_or_none("doc_ids"),
-                                data_folding=data_folding)
arekit/contrib/utils/pipelines/items/sampling/networks.py
@@ -1,54 +0,0 @@
- from arekit.contrib.networks.input.embedding.matrix import create_term_embedding_matrix
- from arekit.contrib.networks.input.embedding.offsets import TermsEmbeddingOffsets
- from arekit.contrib.networks.embedding import Embedding
- from arekit.contrib.networks.input.providers.sample import NetworkSampleRowProvider
- from arekit.contrib.utils.io_utils.embedding import NpEmbeddingIO
- from arekit.contrib.utils.pipelines.items.sampling.base import BaseSerializerPipelineItem
-
-
- class NetworksInputSerializerPipelineItem(BaseSerializerPipelineItem):
-
-     def __init__(self, save_labels_func, rows_provider, samples_io, emb_io, storage, save_embedding=True):
-         """ This pipeline item allows to perform a data preparation for neural network models.
-
-             considering a list of the whole data_types with the related pipelines,
-             which are supported and required in a handler. It is necessary to know
-             data_types in advance as it allows to create a complete vocabulary of input terms,
-             with the related embeddings.
-         """
-         assert(isinstance(emb_io, NpEmbeddingIO))
-         assert(isinstance(rows_provider, NetworkSampleRowProvider))
-         assert(isinstance(save_embedding, bool))
-         super(NetworksInputSerializerPipelineItem, self).__init__(
-             rows_provider=rows_provider,
-             samples_io=samples_io,
-             save_labels_func=save_labels_func,
-             storage=storage)
-
-         self.__emb_io = emb_io
-         self.__save_embedding = save_embedding
-
-     def _handle_iteration(self, data_type_pipelines, data_folding, doc_ids):
-         """ Performing data serialization for a particular iteration
-         """
-         assert(isinstance(data_type_pipelines, dict))
-
-         # Prepare for the present iteration.
-         self._rows_provider.clear_embedding_pairs()
-
-         super(NetworksInputSerializerPipelineItem, self)._handle_iteration(
-             data_type_pipelines=data_type_pipelines, data_folding=data_folding, doc_ids=doc_ids)
-
-         if not (self.__save_embedding and self._rows_provider.HasEmbeddingPairs):
-             return
-
-         # Save embedding information additionally.
-         term_embedding = Embedding.from_word_embedding_pairs_iter(self._rows_provider.iter_term_embedding_pairs())
-         embedding_matrix = create_term_embedding_matrix(term_embedding=term_embedding)
-         vocab = list(TermsEmbeddingOffsets.extract_vocab(words_embedding=term_embedding))
-
-         # Save embedding matrix
-         self.__emb_io.save_embedding(data=embedding_matrix)
-         self.__emb_io.save_vocab(data=vocab)
-
-         del embedding_matrix
arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py
@@ -1,36 +0,0 @@
- from arekit.common.text.stemmer import Stemmer
- from arekit.contrib.utils.pipelines.items.text.frames import FrameVariantsParser
- from arekit.contrib.utils.processing.languages.ru.mods import RussianLanguageMods
-
-
- class LemmasBasedFrameVariantsParser(FrameVariantsParser):
-
-     def __init__(self, frame_variants, stemmer, locale_mods=RussianLanguageMods, save_lemmas=False):
-         assert(isinstance(stemmer, Stemmer))
-         assert(isinstance(save_lemmas, bool))
-         super(LemmasBasedFrameVariantsParser, self).__init__(frame_variants=frame_variants)
-
-         self.__frame_variants = frame_variants
-         self.__stemmer = stemmer
-         self.__save_lemmas = save_lemmas
-         self.__max_variant_len = max([len(variant) for _, variant in frame_variants.iter_variants()])
-         self.__locale_mods = locale_mods
-
-     def __lemmatize_term(self, term):
-         # we first split onto words for lemmatization and then join all of them.
-         lemma = "".join(self.__stemmer.lemmatize_to_list(term))
-         # then we replace certain chars according to the locale restrictions.
-         return self.__locale_mods.replace_specific_word_chars(lemma)
-
-     def __provide_lemmatized_terms(self, terms):
-         """
-         Compose a list of lemmatized versions of parsed_doc
-         PS: Might be significantly slow, depending on stemmer were used.
-         """
-         assert(isinstance(terms, list))
-         return [self.__lemmatize_term(term) if isinstance(term, str) else term for term in terms]
-
-     def apply_core(self, input_data, pipeline_ctx):
-         lemmas = self.__provide_lemmatized_terms(input_data)
-         processed_it = self._iter_processed(terms=lemmas, origin=lemmas if self.__save_lemmas else input_data)
-         return list(processed_it)
arekit/contrib/utils/pipelines/items/text/frames_negation.py
@@ -1,32 +0,0 @@
- from arekit.common.frames.text_variant import TextFrameVariant
- from arekit.common.pipeline.context import PipelineContext
- from arekit.common.pipeline.items.base import BasePipelineItem
- from arekit.contrib.utils.processing.languages.mods import BaseLanguageMods
- from arekit.contrib.utils.processing.languages.ru.mods import RussianLanguageMods
-
-
- class FrameVariantsSentimentNegation(BasePipelineItem):
-
-     def __init__(self, locale_mods=RussianLanguageMods):
-         assert(issubclass(locale_mods, BaseLanguageMods))
-         self._locale_mods = locale_mods
-
-     @staticmethod
-     def __get_preposition(terms, index):
-         return terms[index-1] if index > 0 else None
-
-     def apply_core(self, input_data, pipeline_ctx):
-         assert(isinstance(input_data, list))
-         assert(isinstance(pipeline_ctx, PipelineContext))
-
-         for curr_ind, term in enumerate(input_data):
-
-             if not isinstance(term, TextFrameVariant):
-                 continue
-
-             prep_term = self.__get_preposition(terms=input_data, index=curr_ind)
-             is_negated = self._locale_mods.is_negation_word(prep_term) if prep_term is not None else False
-
-             term.set_is_negated(is_negated)
-
-         return input_data
arekit/contrib/utils/pipelines/items/text/terms_splitter.py
@@ -1,10 +0,0 @@
- from arekit.common.pipeline.context import PipelineContext
- from arekit.common.pipeline.items.base import BasePipelineItem
- from arekit.common.utils import split_by_whitespaces
-
-
- class TermsSplitterParser(BasePipelineItem):
-
-     def apply_core(self, input_data, pipeline_ctx):
-         assert(isinstance(pipeline_ctx, PipelineContext))
-         return split_by_whitespaces(input_data)