arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. arekit/common/data/storages/base.py +4 -15
  2. arekit/common/docs/entities_grouping.py +2 -1
  3. arekit/common/docs/parser.py +27 -22
  4. arekit/common/pipeline/base.py +12 -16
  5. arekit/common/pipeline/batching.py +28 -0
  6. arekit/common/pipeline/context.py +5 -1
  7. arekit/common/pipeline/items/base.py +39 -2
  8. arekit/common/pipeline/items/flatten.py +5 -1
  9. arekit/common/pipeline/items/handle.py +2 -1
  10. arekit/common/pipeline/items/iter.py +2 -1
  11. arekit/common/pipeline/items/map.py +2 -1
  12. arekit/common/pipeline/items/map_nested.py +4 -0
  13. arekit/common/pipeline/utils.py +32 -0
  14. arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
  15. arekit/common/utils.py +11 -52
  16. arekit/contrib/utils/data/contents/opinions.py +13 -3
  17. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  18. arekit/contrib/utils/data/storages/pandas_based.py +2 -17
  19. arekit/contrib/utils/data/storages/row_cache.py +8 -2
  20. arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
  21. arekit/contrib/utils/io_utils/utils.py +1 -18
  22. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  23. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  24. arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
  25. arekit-0.25.1.data/data/logo.png +0 -0
  26. arekit-0.25.1.dist-info/METADATA +81 -0
  27. arekit-0.25.1.dist-info/RECORD +186 -0
  28. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
  29. arekit/common/data/input/repositories/__init__.py +0 -0
  30. arekit/common/data/input/repositories/base.py +0 -68
  31. arekit/common/data/input/repositories/sample.py +0 -22
  32. arekit/common/data/views/__init__.py +0 -0
  33. arekit/common/data/views/samples.py +0 -26
  34. arekit/common/docs/objects_parser.py +0 -37
  35. arekit/common/text/parser.py +0 -12
  36. arekit/common/text/partitioning/__init__.py +0 -0
  37. arekit/common/text/partitioning/base.py +0 -4
  38. arekit/common/text/partitioning/terms.py +0 -35
  39. arekit/contrib/networks/__init__.py +0 -0
  40. arekit/contrib/networks/embedding.py +0 -149
  41. arekit/contrib/networks/embedding_io.py +0 -18
  42. arekit/contrib/networks/input/__init__.py +0 -0
  43. arekit/contrib/networks/input/const.py +0 -6
  44. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  45. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  46. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  47. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  48. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  49. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  50. arekit/contrib/networks/input/providers/__init__.py +0 -0
  51. arekit/contrib/networks/input/providers/sample.py +0 -129
  52. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  53. arekit/contrib/networks/input/providers/text.py +0 -24
  54. arekit/contrib/networks/input/rows_parser.py +0 -47
  55. arekit/contrib/networks/input/term_types.py +0 -13
  56. arekit/contrib/networks/input/terms_mapping.py +0 -60
  57. arekit/contrib/networks/vectorizer.py +0 -6
  58. arekit/contrib/source/__init__.py +0 -0
  59. arekit/contrib/source/brat/__init__.py +0 -0
  60. arekit/contrib/source/brat/annot.py +0 -84
  61. arekit/contrib/source/brat/doc.py +0 -28
  62. arekit/contrib/source/brat/entities/__init__.py +0 -0
  63. arekit/contrib/source/brat/entities/compound.py +0 -13
  64. arekit/contrib/source/brat/entities/entity.py +0 -42
  65. arekit/contrib/source/brat/entities/parser.py +0 -53
  66. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  67. arekit/contrib/source/brat/opinions/converter.py +0 -19
  68. arekit/contrib/source/brat/relation.py +0 -32
  69. arekit/contrib/source/brat/sentence.py +0 -69
  70. arekit/contrib/source/brat/sentences_reader.py +0 -128
  71. arekit/contrib/source/download.py +0 -41
  72. arekit/contrib/source/nerel/__init__.py +0 -0
  73. arekit/contrib/source/nerel/entities.py +0 -55
  74. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  75. arekit/contrib/source/nerel/folding/fixed.py +0 -74
  76. arekit/contrib/source/nerel/io_utils.py +0 -62
  77. arekit/contrib/source/nerel/labels.py +0 -241
  78. arekit/contrib/source/nerel/reader.py +0 -46
  79. arekit/contrib/source/nerel/utils.py +0 -24
  80. arekit/contrib/source/nerel/versions.py +0 -12
  81. arekit/contrib/source/nerelbio/__init__.py +0 -0
  82. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  83. arekit/contrib/source/nerelbio/labels.py +0 -265
  84. arekit/contrib/source/nerelbio/reader.py +0 -8
  85. arekit/contrib/source/nerelbio/versions.py +0 -8
  86. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  87. arekit/contrib/source/ruattitudes/collection.py +0 -36
  88. arekit/contrib/source/ruattitudes/doc.py +0 -51
  89. arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
  90. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  91. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  92. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  93. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  94. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  95. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  96. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  97. arekit/contrib/source/ruattitudes/reader.py +0 -268
  98. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  99. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  100. arekit/contrib/source/ruattitudes/text_object.py +0 -59
  101. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  102. arekit/contrib/source/rusentiframes/collection.py +0 -157
  103. arekit/contrib/source/rusentiframes/effect.py +0 -24
  104. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  105. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  106. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  107. arekit/contrib/source/rusentiframes/role.py +0 -15
  108. arekit/contrib/source/rusentiframes/state.py +0 -24
  109. arekit/contrib/source/rusentiframes/types.py +0 -42
  110. arekit/contrib/source/rusentiframes/value.py +0 -2
  111. arekit/contrib/source/rusentrel/__init__.py +0 -0
  112. arekit/contrib/source/rusentrel/const.py +0 -3
  113. arekit/contrib/source/rusentrel/docs_reader.py +0 -51
  114. arekit/contrib/source/rusentrel/entities.py +0 -26
  115. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  116. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  117. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  118. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  119. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  120. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  121. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  122. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  123. arekit/contrib/source/sentinerel/__init__.py +0 -0
  124. arekit/contrib/source/sentinerel/entities.py +0 -52
  125. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  126. arekit/contrib/source/sentinerel/folding/factory.py +0 -31
  127. arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
  128. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  129. arekit/contrib/source/sentinerel/labels.py +0 -53
  130. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  131. arekit/contrib/source/sentinerel/reader.py +0 -42
  132. arekit/contrib/source/synonyms/__init__.py +0 -0
  133. arekit/contrib/source/synonyms/utils.py +0 -19
  134. arekit/contrib/source/zip_utils.py +0 -47
  135. arekit/contrib/utils/connotations/__init__.py +0 -0
  136. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  137. arekit/contrib/utils/data/readers/__init__.py +0 -0
  138. arekit/contrib/utils/data/readers/base.py +0 -7
  139. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  140. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  141. arekit/contrib/utils/data/service/__init__.py +0 -0
  142. arekit/contrib/utils/data/service/balance.py +0 -50
  143. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  144. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  145. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  146. arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
  147. arekit/contrib/utils/download.py +0 -77
  148. arekit/contrib/utils/embeddings/__init__.py +0 -0
  149. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  150. arekit/contrib/utils/embeddings/tokens.py +0 -30
  151. arekit/contrib/utils/io_utils/embedding.py +0 -72
  152. arekit/contrib/utils/io_utils/opinions.py +0 -37
  153. arekit/contrib/utils/io_utils/samples.py +0 -79
  154. arekit/contrib/utils/lexicons/__init__.py +0 -0
  155. arekit/contrib/utils/lexicons/lexicon.py +0 -41
  156. arekit/contrib/utils/lexicons/relation.py +0 -42
  157. arekit/contrib/utils/lexicons/rusentilex.py +0 -37
  158. arekit/contrib/utils/nn/__init__.py +0 -0
  159. arekit/contrib/utils/nn/rows.py +0 -83
  160. arekit/contrib/utils/np_utils/__init__.py +0 -0
  161. arekit/contrib/utils/np_utils/embedding.py +0 -22
  162. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  163. arekit/contrib/utils/np_utils/vocab.py +0 -20
  164. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  165. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
  166. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
  167. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  168. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
  169. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  170. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
  171. arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
  172. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  173. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  174. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
  175. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
  176. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  177. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  178. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
  179. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
  180. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  181. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  182. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
  183. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
  184. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
  185. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  186. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
  187. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
  188. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  189. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
  190. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  191. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
  192. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  193. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  194. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  195. arekit/contrib/utils/processing/languages/mods.py +0 -12
  196. arekit/contrib/utils/processing/languages/pos.py +0 -23
  197. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  198. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  199. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  200. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  201. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  202. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  203. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  204. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  205. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  206. arekit/contrib/utils/processing/pos/base.py +0 -12
  207. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  208. arekit/contrib/utils/processing/pos/russian.py +0 -10
  209. arekit/contrib/utils/processing/text/__init__.py +0 -0
  210. arekit/contrib/utils/processing/text/tokens.py +0 -127
  211. arekit/contrib/utils/resources.py +0 -25
  212. arekit/contrib/utils/serializer.py +0 -43
  213. arekit/contrib/utils/sources/__init__.py +0 -0
  214. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  215. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  216. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  217. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  218. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  219. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  220. arekit/download_data.py +0 -11
  221. arekit-0.24.0.dist-info/METADATA +0 -23
  222. arekit-0.24.0.dist-info/RECORD +0 -374
  223. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
  224. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
@@ -1,78 +0,0 @@
1
- from enum import Enum
2
-
3
-
4
- class RussianCases(Enum):
5
- """ Падежи русского языка
6
- """
7
-
8
- """ не определено
9
- """
10
- UNKN = 10
11
-
12
- """ именительный
13
- """
14
- NOM = 1
15
-
16
- """ родительный
17
- """
18
- GEN = 2
19
-
20
- """ дательный
21
- """
22
- DAT = 3
23
-
24
- """ винительный
25
- """
26
- ACC = 4
27
-
28
- """ творительный
29
- """
30
- INS = 5
31
-
32
- """ предложный
33
- """
34
- ABL = 6
35
-
36
- """ партитив
37
- """
38
- PART = 7
39
-
40
- """ местный
41
- """
42
- LOC = 8
43
-
44
- """ звательный
45
- """
46
- VOC = 9
47
-
48
-
49
- class RussianCasesService(object):
50
-
51
- __english = {
52
- 'nom': RussianCases.NOM,
53
- 'gen': RussianCases.GEN,
54
- 'dat': RussianCases.DAT,
55
- 'acc': RussianCases.ACC,
56
- 'ins': RussianCases.INS,
57
- 'abl': RussianCases.ABL,
58
- 'part': RussianCases.PART,
59
- 'loc': RussianCases.LOC,
60
- 'voc': RussianCases.VOC,
61
- }
62
-
63
- __mystem_russian = {
64
- 'им': RussianCases.NOM,
65
- 'род': RussianCases.GEN,
66
- 'дат': RussianCases.DAT,
67
- 'вин': RussianCases.ACC,
68
- 'твор': RussianCases.INS,
69
- 'пр': RussianCases.ABL,
70
- 'парт': RussianCases.PART,
71
- 'местн': RussianCases.LOC,
72
- 'зват': RussianCases.VOC,
73
- }
74
-
75
- @staticmethod
76
- def iter_rus_mystem_tags():
77
- for key, value in RussianCasesService.__mystem_russian.items():
78
- yield key, value
@@ -1,6 +0,0 @@
1
- class RussianConstants:
2
-
3
- PrepositionSet = {'к', 'на', 'по', 'с', 'до', 'в', 'во', "у", "а"}
4
-
5
- def __init__(self):
6
- pass
@@ -1,13 +0,0 @@
1
- from arekit.contrib.utils.processing.languages.mods import BaseLanguageMods
2
-
3
-
4
- class RussianLanguageMods(BaseLanguageMods):
5
-
6
- @staticmethod
7
- def replace_specific_word_chars(word):
8
- assert(isinstance(word, str))
9
- return word.replace('ё', 'e')
10
-
11
- @staticmethod
12
- def is_negation_word(word):
13
- return word == 'не'
@@ -1,23 +0,0 @@
1
- from enum import Enum
2
-
3
-
4
- class RussianNumberType(Enum):
5
-
6
- UNKN = 3
7
-
8
- Plural = 1
9
-
10
- Single = 2
11
-
12
-
13
- class RussianNumberTypeService(object):
14
-
15
- __russian = {
16
- 'ед': RussianNumberType.Single,
17
- 'мн': RussianNumberType.Plural
18
- }
19
-
20
- @staticmethod
21
- def iter_rus_mystem_tags():
22
- for key, value in RussianNumberTypeService.__russian.items():
23
- yield key, value
@@ -1,36 +0,0 @@
1
- from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
2
-
3
-
4
- class PartOfSpeechTypesService(object):
5
-
6
- __pos_names = {
7
- "S": PartOfSpeechType.NOUN,
8
- "ADV": PartOfSpeechType.ADV,
9
- "ADVPRO": PartOfSpeechType.ADVPRO,
10
- "ANUM": PartOfSpeechType.ANUM,
11
- "APRO": PartOfSpeechType.APRO,
12
- "COM": PartOfSpeechType.COM,
13
- "CONJ": PartOfSpeechType.CONJ,
14
- "INTJ": PartOfSpeechType.INTJ,
15
- "NUM": PartOfSpeechType.NUM,
16
- "PART": PartOfSpeechType.PART,
17
- "PR": PartOfSpeechType.PR,
18
- "A": PartOfSpeechType.ADJ,
19
- "SPRO": PartOfSpeechType.SPRO,
20
- "V": PartOfSpeechType.VERB,
21
- "UNKN": PartOfSpeechType.Unknown,
22
- "EMPTY": PartOfSpeechType.Empty}
23
-
24
- @staticmethod
25
- def iter_mystem_tags():
26
- for key, value in PartOfSpeechTypesService.__pos_names.items():
27
- yield key, value
28
-
29
- @staticmethod
30
- def get_mystem_from_string(value):
31
- return PartOfSpeechTypesService.__pos_names[value]
32
-
33
- @staticmethod
34
- def get_mystem_pos_count():
35
- return len(PartOfSpeechTypesService.__pos_names)
36
-
File without changes
@@ -1,51 +0,0 @@
1
- from arekit.common.text.stemmer import Stemmer
2
- from arekit.common.utils import filter_whitespaces
3
- from pymystem3 import Mystem
4
-
5
-
6
- class MystemWrapper(Stemmer):
7
- """ Yandex MyStem wrapper
8
-
9
- part of speech description:
10
- https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
11
- """
12
-
13
- def __init__(self, entire_input=False):
14
- """
15
- entire_input: bool
16
- Mystem parameter that allows to keep all information from input (true) or
17
- remove garbage characters
18
- """
19
- self.__mystem = Mystem(entire_input=entire_input)
20
-
21
- # region properties
22
-
23
- @property
24
- def MystemInstance(self):
25
- return self.__mystem
26
-
27
- # endregion
28
-
29
- # region public methods
30
-
31
- def lemmatize_to_list(self, text):
32
- return self.__lemmatize_core(text)
33
-
34
- def lemmatize_to_str(self, text):
35
- result = " ".join(self.__lemmatize_core(text))
36
- return result if len(result) != 0 else self.__process_original_text(text)
37
-
38
- # endregion
39
-
40
- # region private methods
41
-
42
- def __lemmatize_core(self, text):
43
- assert(isinstance(text, str))
44
- result_list = self.__mystem.lemmatize(self.__process_original_text(text))
45
- return filter_whitespaces(result_list)
46
-
47
- @staticmethod
48
- def __process_original_text(text):
49
- return text.lower()
50
-
51
- # endregion
File without changes
@@ -1,12 +0,0 @@
1
- # TODO. Move to base class.
2
- class POSTagger:
3
-
4
- def get_term_pos(self, term):
5
- raise NotImplementedError()
6
-
7
- def get_term_number(self, term):
8
- raise NotImplementedError()
9
-
10
- def pos_to_int(self, pos):
11
- raise NotImplementedError()
12
-
@@ -1,134 +0,0 @@
1
- from pymystem3 import Mystem
2
-
3
- from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
4
- from arekit.contrib.utils.processing.languages.ru.cases import RussianCases, RussianCasesService
5
- from arekit.contrib.utils.processing.languages.ru.number import RussianNumberType, RussianNumberTypeService
6
- from arekit.contrib.utils.processing.languages.ru.pos_service import PartOfSpeechTypesService
7
- from arekit.contrib.utils.processing.pos.russian import RussianPOSTagger
8
-
9
-
10
- class POSMystemWrapper(RussianPOSTagger):
11
-
12
- _ArgsSeparator = ','
13
- _GrammarKey = 'gr'
14
-
15
- def __init__(self, mystem):
16
- assert(isinstance(mystem, Mystem))
17
- self.__mystem = mystem
18
-
19
- # region private methods
20
-
21
- @staticmethod
22
- def __extract_from_analysis(analysis, func):
23
- """
24
- part of speech description:
25
- https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
26
- func: f(args) -> out
27
- returns: str or None
28
- """
29
- assert(callable(func))
30
-
31
- if 'analysis' not in analysis:
32
- return func(None)
33
-
34
- info = analysis['analysis']
35
- if len(info) == 0:
36
- return func(None)
37
-
38
- return func(info[0])
39
-
40
- @staticmethod
41
- def __get_pos(arguments):
42
- if arguments is None:
43
- return PartOfSpeechType.Unknown
44
-
45
- pos = arguments[POSMystemWrapper._GrammarKey].split(POSMystemWrapper._ArgsSeparator)[0]
46
- if '=' in pos:
47
- pos = pos.split('=')[0]
48
-
49
- return PartOfSpeechTypesService.get_mystem_from_string(pos)
50
-
51
- @staticmethod
52
- def __get_russian_case(arguments):
53
- if arguments is None:
54
- return RussianCases.UNKN
55
-
56
- all_params = set(POSMystemWrapper.__iter_params(arguments))
57
-
58
- for key, case in RussianCasesService.iter_rus_mystem_tags():
59
- if key in all_params:
60
- return case
61
-
62
- return RussianCases.UNKN
63
-
64
- @staticmethod
65
- def __get_number(arguments):
66
- if arguments is None:
67
- return RussianNumberType.UNKN
68
-
69
- all_params = set(POSMystemWrapper.__iter_params(arguments))
70
-
71
- for key, case in RussianNumberTypeService.iter_rus_mystem_tags():
72
- if key in all_params:
73
- return case
74
-
75
- return RussianNumberType.UNKN
76
-
77
- @staticmethod
78
- def __iter_params(arguments):
79
- params = arguments[POSMystemWrapper._GrammarKey].split(POSMystemWrapper._ArgsSeparator)
80
- for optionally_combined in params:
81
- for param in optionally_combined.split('='):
82
- yield param
83
-
84
- # endregion
85
-
86
- def get_term_pos(self, term):
87
- assert(isinstance(term, str))
88
- analyzed = self.__mystem.analyze(term)
89
- return self.__extract_from_analysis(analyzed[0], self.__get_pos) \
90
- if len(analyzed) > 0 else PartOfSpeechType.Unknown
91
-
92
- def get_term_case(self, term):
93
- assert(isinstance(term, str))
94
- analyzed = self.__mystem.analyze(term)
95
- return self.__extract_from_analysis(analyzed[0], self.__get_russian_case) \
96
- if len(analyzed) > 0 else RussianCases.UNKN
97
-
98
- def get_term_number(self, term):
99
- assert(isinstance(term, str))
100
- analyzed = self.__mystem.analyze(term)
101
- return self.__extract_from_analysis(analyzed[0], self.__get_number) \
102
- if len(analyzed) > 0 else RussianNumberType.UNKN
103
-
104
- def get_terms_russian_cases(self, text):
105
- """ list of part of speech according to the certain word in text
106
- """
107
- assert(isinstance(text, str))
108
- cases = []
109
-
110
- analyzed = self.__mystem.analyze(text)
111
- for a in analyzed:
112
- pos = self.__extract_from_analysis(a, self.__get_russian_case) if len(analyzed) > 0 else RussianCases.UNKN
113
- cases.append(pos)
114
-
115
- return cases
116
-
117
- def pos_to_int(self, pos):
118
- assert(isinstance(pos, PartOfSpeechType))
119
- return int(pos)
120
-
121
- @staticmethod
122
- def is_adjective(pos_type):
123
- assert(isinstance(pos_type, PartOfSpeechType))
124
- return pos_type == PartOfSpeechType.ADJ
125
-
126
- @staticmethod
127
- def is_noun(pos_type):
128
- assert(isinstance(pos_type, PartOfSpeechType))
129
- return pos_type == PartOfSpeechType.NOUN
130
-
131
- @staticmethod
132
- def is_verb(pos_type):
133
- assert(isinstance(pos_type, PartOfSpeechType))
134
- return pos_type == PartOfSpeechType.VERB
@@ -1,10 +0,0 @@
1
- from arekit.contrib.utils.processing.pos.base import POSTagger
2
-
3
-
4
- class RussianPOSTagger(POSTagger):
5
- """ Provides cases support ('падежи')
6
- """
7
-
8
- def get_term_case(self, term):
9
- raise NotImplementedError()
10
-
File without changes
@@ -1,127 +0,0 @@
1
- from urllib.parse import urlparse
2
- from arekit.common.context.token import Token
3
-
4
-
5
- # TODO. Provide the base (BaseTokens) type.
6
- # TODO. With the related API at BaseTokens.
7
- class Tokens:
8
- """
9
- Tokens used to describe a non-word text units, such as punctuation,
10
- uknown words/chars, smiles, etc.
11
- """
12
-
13
- _wrapper = "<[{}]>"
14
- COMMA = _wrapper.format(',')
15
- SEMICOLON = _wrapper.format(';')
16
- COLON = _wrapper.format(':')
17
- QUOTE = _wrapper.format('QUOTE')
18
- DASH = _wrapper.format('-')
19
- LONG_DASH = _wrapper.format('long_dash')
20
- DOT = _wrapper.format('.')
21
- TRIPLE_DOTS = _wrapper.format('…')
22
- EXC_SIGN = _wrapper.format('!')
23
- QUESTION_SIGN = _wrapper.format('?')
24
- OPEN_BRACKET = _wrapper.format('OPEN_BRACKET')
25
- CLOSED_BRACKET = _wrapper.format('CLOSED_BRACKET')
26
- NUMBER = _wrapper.format('NUMBER')
27
- NEW_LINE = _wrapper.format("NEW_LINE")
28
- UNKNOWN_CHAR = _wrapper.format('UNKNOWN_CHAR')
29
- UNKNOWN_WORD = _wrapper.format('UNKNOWN_WORD')
30
- URL = _wrapper.format("URL")
31
-
32
- __token_mapping = {
33
- ',': COMMA,
34
- '.': DOT,
35
- '…': TRIPLE_DOTS,
36
- ':': COLON,
37
- ';': SEMICOLON,
38
- '-': DASH,
39
- '—': LONG_DASH,
40
- '?': QUESTION_SIGN,
41
- '!': EXC_SIGN,
42
- '(': OPEN_BRACKET,
43
- ')': CLOSED_BRACKET,
44
- '{': OPEN_BRACKET,
45
- '}': CLOSED_BRACKET,
46
- '[': OPEN_BRACKET,
47
- ']': CLOSED_BRACKET,
48
- '\n': NEW_LINE,
49
- '«': QUOTE,
50
- '»': QUOTE,
51
- '"': QUOTE,
52
- }
53
-
54
- __supported_tokens = {
55
- COMMA,
56
- SEMICOLON,
57
- COLON,
58
- QUOTE,
59
- DASH,
60
- DOT,
61
- LONG_DASH,
62
- TRIPLE_DOTS,
63
- EXC_SIGN,
64
- QUESTION_SIGN,
65
- OPEN_BRACKET,
66
- CLOSED_BRACKET,
67
- NUMBER,
68
- URL,
69
- NEW_LINE,
70
- UNKNOWN_CHAR,
71
- UNKNOWN_WORD}
72
-
73
- @staticmethod
74
- def try_create(subterm):
75
- """
76
- Trying create a token by given 'term' parameter
77
- subterm: unicode
78
- I.e. term ending, so means a part of original term
79
- """
80
- assert(isinstance(subterm, str))
81
- if subterm not in Tokens.__token_mapping:
82
- return None
83
- return Token(term=subterm, token_value=Tokens.__token_mapping[subterm])
84
-
85
- @staticmethod
86
- def try_parse(term):
87
- assert(isinstance(term, str))
88
- for origin, token_value in Tokens.__token_mapping.items():
89
- if term == token_value:
90
- return Token(term=origin, token_value=token_value)
91
-
92
- @staticmethod
93
- def try_create_number(term):
94
- assert(isinstance(term, str))
95
- if not term.isdigit():
96
- return None
97
- return Token(term=term, token_value=Tokens.NUMBER)
98
-
99
- @staticmethod
100
- def try_create_url(term):
101
- assert(isinstance(term, str))
102
- result = urlparse(term)
103
- is_correct = result.scheme and result.netloc and result.path
104
- if not is_correct:
105
- return None
106
- return Token(term=term, token_value=Tokens.URL)
107
-
108
- @staticmethod
109
- def is_token(term):
110
- assert(isinstance(term, str))
111
- return term in Tokens.__supported_tokens
112
-
113
- @staticmethod
114
- def iter_chars_by_token(term):
115
- """
116
- Iterate through charts that is related to term
117
- token: char
118
- """
119
- assert(isinstance(term, str))
120
- for char, token in Tokens.__token_mapping.items():
121
- if term == token:
122
- yield char
123
-
124
- @staticmethod
125
- def iter_supported_tokens():
126
- for token in Tokens.__supported_tokens:
127
- yield token
@@ -1,25 +0,0 @@
1
- from arekit.common.text.stemmer import Stemmer
2
- from arekit.contrib.utils.download import NEWS_MYSTEM_SKIPGRAM_1000_20_2015, load_embedding_and_vocab
3
- from arekit.contrib.utils.embeddings.rusvectores import RusvectoresEmbedding
4
-
5
-
6
- def load_embedding_news_mystem_skipgram_1000_20_2015(stemmer, auto_download=False):
7
- """ Embedding from https://rusvectores.org/ru/models/
8
- Description: Russian news, from 2013 till the october 2015
9
- Corpora size: 2.5 milliard words
10
- Vocabulary volume: 147 358
11
- Frequency bound: 200
12
- Tagset: Mystem
13
- Algorithm: Continuous Skip-Gram
14
- Vector size: 1000
15
-
16
- stemmer: Stemmer
17
- It is expected to adopt MystemWrapper.
18
- auto_download: bool
19
- Whether try to download if the resource was missed.
20
- """
21
- assert(isinstance(stemmer, Stemmer) or stemmer is None)
22
- embedding, vocab = load_embedding_and_vocab(local_name=NEWS_MYSTEM_SKIPGRAM_1000_20_2015, check_existance=True,
23
- download_if_missed=auto_download)
24
- embedding = RusvectoresEmbedding(matrix=embedding, words=vocab, stemmer=stemmer)
25
- return embedding
@@ -1,43 +0,0 @@
1
- import logging
2
-
3
- from collections.abc import Iterable
4
-
5
- from arekit.common.data.input.providers.columns.sample import SampleColumnsProvider
6
- from arekit.common.data.input.providers.rows.base import BaseRowProvider
7
- from arekit.common.data.input.repositories.base import BaseInputRepository
8
- from arekit.common.data.input.repositories.sample import BaseInputSamplesRepository
9
- from arekit.common.data.storages.base import BaseRowsStorage
10
- from arekit.common.pipeline.base import BasePipeline
11
- from arekit.contrib.utils.data.contents.opinions import InputTextOpinionProvider
12
-
13
- logger = logging.getLogger(__name__)
14
- logging.basicConfig(level=logging.INFO)
15
-
16
-
17
- class InputDataSerializationHelper(object):
18
-
19
- @staticmethod
20
- def create_samples_repo(keep_labels, rows_provider, storage):
21
- assert(isinstance(rows_provider, BaseRowProvider))
22
- assert(isinstance(keep_labels, bool))
23
- assert(isinstance(storage, BaseRowsStorage))
24
- return BaseInputSamplesRepository(
25
- columns_provider=SampleColumnsProvider(store_labels=keep_labels),
26
- rows_provider=rows_provider,
27
- storage=storage)
28
-
29
- @staticmethod
30
- def fill_and_write(pipeline, repo, target, writer, doc_ids_iter, desc=""):
31
- assert(isinstance(pipeline, BasePipeline))
32
- assert(isinstance(doc_ids_iter, Iterable))
33
- assert(isinstance(repo, BaseInputRepository))
34
-
35
- doc_ids = list(doc_ids_iter)
36
-
37
- repo.populate(contents_provider=InputTextOpinionProvider(pipeline),
38
- doc_ids=doc_ids,
39
- desc=desc,
40
- writer=writer,
41
- target=target)
42
-
43
- repo.push(writer=writer, target=target)
File without changes
File without changes
@@ -1,63 +0,0 @@
1
- from arekit.common.entities.base import Entity
2
- from arekit.common.docs.parsed.base import ParsedDocument
3
- from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider
4
- from arekit.common.docs.parsed.term_position import TermPositionTypes, TermPosition
5
- from arekit.common.text.enums import TermFormat
6
- from arekit.common.text.parsed import BaseParsedText
7
- from arekit.common.text_opinions.base import TextOpinion
8
- from arekit.contrib.utils.pipelines.text_opinion.filters.base import TextOpinionFilter
9
-
10
-
11
- class ProfessionAsCharacteristicSentimentTextOpinionFilter(TextOpinionFilter):
12
- """ This is a filter, based on the PROFESSION type prefixed entity for
13
- the SentiNEREL collection.
14
-
15
- In this case, profession acts as a characteristics of the Person, and
16
- therefore there is no need to consider these attitudes in annotation.
17
-
18
- For a greater details, see:
19
- https://github.com/nicolay-r/AREkit/issues/404
20
- """
21
-
22
- def __init__(self, char_type="PROFESSION"):
23
- self.__char_type = char_type
24
- self.__next_entity_types = ["PERSON"]
25
-
26
- def filter(self, text_opinion, parsed_doc, entity_service_provider):
27
- assert(isinstance(text_opinion, TextOpinion))
28
- assert(isinstance(parsed_doc, ParsedDocument))
29
- assert(isinstance(entity_service_provider, EntityServiceProvider))
30
-
31
- # Picking up entity.
32
- target_entity = entity_service_provider._doc_entities[text_opinion.TargetId]
33
- assert(isinstance(target_entity, Entity))
34
-
35
- if target_entity.Type != self.__char_type:
36
- # This is not our case.
37
- return True
38
-
39
- # Picking up the related target entity position.
40
- target_pos = entity_service_provider.get_entity_position(text_opinion.TargetId)
41
- assert(isinstance(target_pos, TermPosition))
42
-
43
- # Picking up the related sentence of target.
44
- t_sent = target_pos.get_index(TermPositionTypes.SentenceIndex)
45
- sentence = parsed_doc.get_sentence(t_sent)
46
- assert(isinstance(sentence, BaseParsedText))
47
-
48
- # Picking up the entity position in sentence.
49
- target_term_ind = target_pos.get_index(TermPositionTypes.IndexInSentence)
50
-
51
- # We pick up the next term within the parsed sentece.
52
- next_term = sentence.get_term(target_term_ind + 1, term_format=TermFormat.Raw) \
53
- if len(sentence) > target_term_ind + 1 else None
54
-
55
- if next_term is None:
56
- # This is not our case.
57
- return True
58
-
59
- if isinstance(next_term, Entity) and next_term.Type in self.__next_entity_types:
60
- # We reject this opinion from the annotation, since this is not expected to be a sentiment one.
61
- return False
62
-
63
- return True
File without changes