arekit 0.25.0__py3-none-any.whl → 0.25.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. arekit/common/context/terms_mapper.py +5 -2
  2. arekit/common/data/input/providers/rows/samples.py +8 -12
  3. arekit/common/data/input/providers/sample/cropped.py +4 -3
  4. arekit/common/data/input/terms_mapper.py +4 -8
  5. arekit/common/data/storages/base.py +4 -18
  6. arekit/common/docs/entities_grouping.py +5 -3
  7. arekit/common/docs/parsed/base.py +3 -3
  8. arekit/common/docs/parsed/providers/base.py +3 -5
  9. arekit/common/docs/parsed/providers/entity_service.py +7 -28
  10. arekit/common/docs/parsed/providers/opinion_pairs.py +6 -6
  11. arekit/common/docs/parsed/providers/text_opinion_pairs.py +4 -4
  12. arekit/common/docs/parsed/service.py +2 -2
  13. arekit/common/docs/parser.py +3 -30
  14. arekit/common/model/labeling/single.py +7 -3
  15. arekit/common/opinions/annot/algo/pair_based.py +9 -5
  16. arekit/common/pipeline/base.py +0 -2
  17. arekit/common/pipeline/batching.py +0 -3
  18. arekit/common/pipeline/items/base.py +1 -1
  19. arekit/common/utils.py +11 -8
  20. arekit/contrib/bert/input/providers/cropped_sample.py +2 -5
  21. arekit/contrib/bert/terms/mapper.py +2 -2
  22. arekit/contrib/prompt/sample.py +2 -6
  23. arekit/contrib/utils/bert/samplers.py +4 -2
  24. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  25. arekit/contrib/utils/data/storages/row_cache.py +2 -1
  26. arekit/contrib/utils/data/storages/sqlite_based.py +2 -1
  27. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +8 -5
  28. arekit/contrib/utils/pipelines/text_opinion/extraction.py +16 -8
  29. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/METADATA +10 -8
  30. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/RECORD +34 -115
  31. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/WHEEL +1 -1
  32. arekit/common/data/input/repositories/__init__.py +0 -0
  33. arekit/common/data/input/repositories/base.py +0 -68
  34. arekit/common/data/input/repositories/sample.py +0 -22
  35. arekit/common/data/views/__init__.py +0 -0
  36. arekit/common/data/views/samples.py +0 -26
  37. arekit/common/experiment/__init__.py +0 -0
  38. arekit/common/experiment/api/__init__.py +0 -0
  39. arekit/common/experiment/api/base_samples_io.py +0 -20
  40. arekit/common/experiment/data_type.py +0 -17
  41. arekit/common/service/__init__.py +0 -0
  42. arekit/common/service/sqlite.py +0 -36
  43. arekit/contrib/networks/__init__.py +0 -0
  44. arekit/contrib/networks/embedding.py +0 -149
  45. arekit/contrib/networks/embedding_io.py +0 -18
  46. arekit/contrib/networks/input/__init__.py +0 -0
  47. arekit/contrib/networks/input/const.py +0 -6
  48. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  49. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  50. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  51. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  52. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  53. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  54. arekit/contrib/networks/input/providers/__init__.py +0 -0
  55. arekit/contrib/networks/input/providers/sample.py +0 -129
  56. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  57. arekit/contrib/networks/input/providers/text.py +0 -24
  58. arekit/contrib/networks/input/rows_parser.py +0 -47
  59. arekit/contrib/networks/input/term_types.py +0 -13
  60. arekit/contrib/networks/input/terms_mapping.py +0 -60
  61. arekit/contrib/networks/vectorizer.py +0 -6
  62. arekit/contrib/utils/data/readers/__init__.py +0 -0
  63. arekit/contrib/utils/data/readers/base.py +0 -7
  64. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  65. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  66. arekit/contrib/utils/data/readers/sqlite.py +0 -14
  67. arekit/contrib/utils/data/service/__init__.py +0 -0
  68. arekit/contrib/utils/data/service/balance.py +0 -50
  69. arekit/contrib/utils/data/storages/pandas_based.py +0 -123
  70. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  71. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  72. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  73. arekit/contrib/utils/data/writers/sqlite_native.py +0 -114
  74. arekit/contrib/utils/embeddings/__init__.py +0 -0
  75. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  76. arekit/contrib/utils/embeddings/tokens.py +0 -30
  77. arekit/contrib/utils/entities/formatters/str_display.py +0 -11
  78. arekit/contrib/utils/io_utils/embedding.py +0 -72
  79. arekit/contrib/utils/np_utils/__init__.py +0 -0
  80. arekit/contrib/utils/np_utils/embedding.py +0 -22
  81. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  82. arekit/contrib/utils/np_utils/vocab.py +0 -20
  83. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  84. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -94
  85. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -55
  86. arekit/contrib/utils/pipelines/items/text/entities_default.py +0 -23
  87. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  88. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -33
  89. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -105
  90. arekit/contrib/utils/pipelines/items/text/translator.py +0 -136
  91. arekit/contrib/utils/processing/__init__.py +0 -0
  92. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  93. arekit/contrib/utils/processing/languages/mods.py +0 -12
  94. arekit/contrib/utils/processing/languages/pos.py +0 -23
  95. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  96. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  97. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  98. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  99. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  100. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  101. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  102. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  103. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  104. arekit/contrib/utils/processing/pos/base.py +0 -12
  105. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  106. arekit/contrib/utils/processing/pos/russian.py +0 -10
  107. arekit/contrib/utils/processing/text/__init__.py +0 -0
  108. arekit/contrib/utils/processing/text/tokens.py +0 -127
  109. arekit/contrib/utils/serializer.py +0 -42
  110. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  111. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  112. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  113. {arekit-0.25.0.data → arekit-0.25.2.data}/data/logo.png +0 -0
  114. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/LICENSE +0 -0
  115. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/top_level.txt +0 -0
arekit/contrib/utils/pipelines/items/text/tokenizer.py
@@ -1,105 +0,0 @@
- import logging
-
- from arekit.common.context.token import Token
- from arekit.common.pipeline.items.base import BasePipelineItem
- from arekit.common.utils import split_by_whitespaces
- from arekit.contrib.utils.processing.text.tokens import Tokens
-
- logger = logging.getLogger(__name__)
- logger.setLevel(logging.INFO)
-
-
- class DefaultTextTokenizer(BasePipelineItem):
-     """ Default parser implementation.
-     """
-
-     def __init__(self, keep_tokens=True, **kwargs):
-         super(DefaultTextTokenizer, self).__init__(**kwargs)
-         self.__keep_tokens = keep_tokens
-
-     # region protected methods
-
-     def apply_core(self, input_data, pipeline_ctx):
-         output_data = self.__process_parts(input_data)
-         if not self.__keep_tokens:
-             output_data = [word for word in output_data if not isinstance(word, Token)]
-         return output_data
-
-     # endregion
-
-     # region private static methods
-
-     def __process_parts(self, parts):
-         assert(isinstance(parts, list))
-
-         parsed = []
-         for part in parts:
-
-             if part is None:
-                 continue
-
-             # Keep non str words as it is and try to parse str-based words.
-             processed = [part] if not isinstance(part, str) else \
-                 self.__iter_processed_part(part=part)
-
-             parsed.extend(processed)
-
-         return parsed
-
-     def __iter_processed_part(self, part):
-         for word in split_by_whitespaces(part):
-             for term in self.__process_word(word):
-                 yield term
-
-     def __process_word(self, word):
-         assert(isinstance(word, str))
-         return self.__split_tokens(word)
-
-     @staticmethod
-     def __split_tokens(term):
-         """
-         Splitting off tokens from parsed_doc ending, i.e. for example:
-             term: "сказать,-" -> "(term: "сказать", ["COMMA_TOKEN", "DASH_TOKEN"])
-         return: (unicode or None, list)
-             modified term and list of extracted tokens.
-         """
-
-         url = Tokens.try_create_url(term)
-         if url is not None:
-             return [url]
-
-         l = 0
-         words_and_tokens = []
-         while l < len(term):
-
-             # Token.
-             token = Tokens.try_create(term[l])
-             if token is not None:
-                 if token.get_token_value() != Tokens.NEW_LINE:
-                     words_and_tokens.append(token)
-                 l += 1
-
-             # Number.
-             elif str.isdigit(term[l]):
-                 k = l + 1
-                 while k < len(term) and str.isdigit(term[k]):
-                     k += 1
-                 token = Tokens.try_create_number(term[l:k])
-                 assert(token is not None)
-                 words_and_tokens.append(token)
-                 l = k
-
-             # Term.
-             else:
-                 k = l + 1
-                 while k < len(term):
-                     token = Tokens.try_create(term[k])
-                     if token is not None and token.get_token_value() != Tokens.DASH:
-                         break
-                     k += 1
-                 words_and_tokens.append(term[l:k])
-                 l = k
-
-         return words_and_tokens
-
-     # endregion
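The removed tokenizer above is self-contained enough to be exercised directly through its apply_core hook. The following is a hypothetical usage sketch against arekit 0.25.0, where this module still exists; it assumes BasePipelineItem can be constructed without extra arguments and, as the code above suggests, that pipeline_ctx is not consumed by this item.

from arekit.contrib.utils.pipelines.items.text.tokenizer import DefaultTextTokenizer

# Hypothetical sketch: feed a list of raw text parts through the removed item.
tokenizer = DefaultTextTokenizer(keep_tokens=False)
terms = tokenizer.apply_core(input_data=["сказать,- и 123 раза"], pipeline_ctx=None)
print(terms)  # with keep_tokens=False, punctuation and number Token objects are filtered out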
arekit/contrib/utils/pipelines/items/text/translator.py
@@ -1,136 +0,0 @@
- from arekit.common.data.input.providers.const import IDLE_MODE
- from arekit.common.pipeline.conts import PARENT_CTX
- from arekit.common.entities.base import Entity
- from arekit.common.pipeline.context import PipelineContext
- from arekit.common.pipeline.items.base import BasePipelineItem
-
-
- class MLTextTranslatorPipelineItem(BasePipelineItem):
-     """ Machine learning based translator pipeline item.
-     """
-
-     def __init__(self, batch_translate_model, do_translate_entity=True, **kwargs):
-         """ Model, which is based on translation of the text,
-             represented as a list of words.
-         """
-         super(MLTextTranslatorPipelineItem, self).__init__(**kwargs)
-         self.__do_translate_entity = do_translate_entity
-         self.__translate = batch_translate_model
-
-     def fast_most_accurate_approach(self, input_data, entity_placeholder_template="<entityTag={}/>"):
-         """ This approach assumes that the translation won't corrupt the original
-             meta-annotation for entities and objects mentioned in text.
-         """
-
-         def __optionally_register(prts):
-             if len(prts) > 0:
-                 content.append(" ".join(prts))
-                 parts_to_join.clear()
-
-         content = []
-         origin_entities = []
-         parts_to_join = []
-
-         for part in input_data:
-             if isinstance(part, str) and part.strip():
-                 parts_to_join.append(part)
-             elif isinstance(part, Entity):
-                 entity_index = len(origin_entities)
-                 parts_to_join.append(entity_placeholder_template.format(entity_index))
-                 # Register entities information for further restoration.
-                 origin_entities.append(part)
-
-         # Register original text with masked named entities.
-         __optionally_register(parts_to_join)
-         # Register all named entities in order of their appearance in text.
-         content.extend([e.Value for e in origin_entities])
-
-         # Compose text parts.
-         translated_parts = self.__translate(content)
-
-         if len(translated_parts) == 0:
-             return None
-
-         # Take the original text.
-         text = translated_parts[0]
-         for entity_index in range(len(origin_entities)):
-             if entity_placeholder_template.format(entity_index) not in text:
-                 return None
-
-         # Enumerate entities.
-         from_ind = 0
-         text_parts = []
-         for entity_index, translated_value in enumerate(translated_parts[1:]):
-             entity_placeholder_instance = entity_placeholder_template.format(entity_index)
-             # Cropping text part.
-             to_ind = text.index(entity_placeholder_instance)
-
-             if self.__do_translate_entity:
-                 origin_entities[entity_index].set_display_value(translated_value.strip())
-
-             # Register entities.
-             text_parts.append(text[from_ind:to_ind])
-             text_parts.append(origin_entities[entity_index])
-             # Update from index.
-             from_ind = to_ind + len(entity_placeholder_instance)
-
-         # Consider the remaining part.
-         text_parts.append(text[from_ind:])
-         return text_parts
-
-     def default_pre_part_splitting_approach(self, input_data):
-         """ This is the original strategy, based on the manually cropped named entities
-             before the actual translation call.
-         """
-
-         def __optionally_register(prts):
-             if len(prts) > 0:
-                 content.append(" ".join(prts))
-                 parts_to_join.clear()
-
-         content = []
-         origin_entities = []
-         origin_entity_ind = []
-         parts_to_join = []
-
-         for _, part in enumerate(input_data):
-             if isinstance(part, str) and part.strip():
-                 parts_to_join.append(part)
-             elif isinstance(part, Entity):
-                 # Register first the prior parts were merged.
-                 __optionally_register(parts_to_join)
-                 # Register entities information for further restoration.
-                 origin_entity_ind.append(len(content))
-                 origin_entities.append(part)
-                 content.append(part.Value)
-
-         __optionally_register(parts_to_join)
-
-         # Compose text parts.
-         translated_parts = self.__translate(content)
-
-         for entity_ind, entity_part_ind in enumerate(origin_entity_ind):
-             entity = origin_entities[entity_ind]
-             if self.__do_translate_entity:
-                 entity.set_display_value(translated_parts[entity_part_ind].strip())
-             translated_parts[entity_part_ind] = entity
-
-         return translated_parts
-
-     def apply_core(self, input_data, pipeline_ctx):
-         assert(isinstance(pipeline_ctx, PipelineContext))
-         assert(isinstance(input_data, list))
-
-         # Check the pipeline state whether is an idle mode or not.
-         parent_ctx = pipeline_ctx.provide(PARENT_CTX)
-         idle_mode = parent_ctx.provide(IDLE_MODE)
-
-         # When pipeline utilized only for the assessing the expected amount
-         # of rows (common case of idle_mode), there is no need to perform
-         # translation.
-         if idle_mode:
-             return
-
-         fast_accurate = self.fast_most_accurate_approach(input_data)
-         return self.default_pre_part_splitting_approach(input_data) \
-             if fast_accurate is None else fast_accurate
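As the removed code above shows, batch_translate_model is treated as a plain callable: it receives a list of strings (the masked text and/or entity values) and is expected to return translated strings in the same order. A hypothetical stub satisfying that contract, shown only to illustrate how the item was constructed in 0.25.0:

from arekit.contrib.utils.pipelines.items.text.translator import MLTextTranslatorPipelineItem

def batch_translate_stub(texts):
    # Stand-in for a real batch translation model: list[str] -> list[str], order preserved.
    return [t for t in texts]

item = MLTextTranslatorPipelineItem(batch_translate_model=batch_translate_stub,
                                    do_translate_entity=False)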
File without changes
File without changes
arekit/contrib/utils/processing/languages/mods.py
@@ -1,12 +0,0 @@
- class BaseLanguageMods(object):
-     """
-     Represents a class with static methods that provides text/word modifies.
-     """
-
-     @staticmethod
-     def replace_specific_word_chars(word):
-         pass
-
-     @staticmethod
-     def is_negation_word(word):
-         pass
arekit/contrib/utils/processing/languages/pos.py
@@ -1,23 +0,0 @@
- from enum import IntEnum
-
-
- class PartOfSpeechType(IntEnum):
-
-     NOUN = 1
-     ADV = 2
-     ADVPRO = 3
-     ANUM = 4
-     APRO = 5
-     COM = 6
-     CONJ = 7
-     INTJ = 8
-     NUM = 9
-     PART = 10
-     PR = 11
-     ADJ = 12
-     SPRO = 13
-     VERB = 14
-
-     Unknown = 15
-
-     Empty = 16
File without changes
arekit/contrib/utils/processing/languages/ru/cases.py
@@ -1,78 +0,0 @@
- from enum import Enum
-
-
- class RussianCases(Enum):
-     """ Падежи русского языка (Russian grammatical cases)
-     """
-
-     """ не определено (undefined)
-     """
-     UNKN = 10
-
-     """ именительный (nominative)
-     """
-     NOM = 1
-
-     """ родительный (genitive)
-     """
-     GEN = 2
-
-     """ дательный (dative)
-     """
-     DAT = 3
-
-     """ винительный (accusative)
-     """
-     ACC = 4
-
-     """ творительный (instrumental)
-     """
-     INS = 5
-
-     """ предложный (prepositional)
-     """
-     ABL = 6
-
-     """ партитив (partitive)
-     """
-     PART = 7
-
-     """ местный (locative)
-     """
-     LOC = 8
-
-     """ звательный (vocative)
-     """
-     VOC = 9
-
-
- class RussianCasesService(object):
-
-     __english = {
-         'nom': RussianCases.NOM,
-         'gen': RussianCases.GEN,
-         'dat': RussianCases.DAT,
-         'acc': RussianCases.ACC,
-         'ins': RussianCases.INS,
-         'abl': RussianCases.ABL,
-         'part': RussianCases.PART,
-         'loc': RussianCases.LOC,
-         'voc': RussianCases.VOC,
-     }
-
-     __mystem_russian = {
-         'им': RussianCases.NOM,
-         'род': RussianCases.GEN,
-         'дат': RussianCases.DAT,
-         'вин': RussianCases.ACC,
-         'твор': RussianCases.INS,
-         'пр': RussianCases.ABL,
-         'парт': RussianCases.PART,
-         'местн': RussianCases.LOC,
-         'зват': RussianCases.VOC,
-     }
-
-     @staticmethod
-     def iter_rus_mystem_tags():
-         for key, value in RussianCasesService.__mystem_russian.items():
-             yield key, value
arekit/contrib/utils/processing/languages/ru/constants.py
@@ -1,6 +0,0 @@
- class RussianConstants:
-
-     PrepositionSet = {'к', 'на', 'по', 'с', 'до', 'в', 'во', "у", "а"}
-
-     def __init__(self):
-         pass
arekit/contrib/utils/processing/languages/ru/mods.py
@@ -1,13 +0,0 @@
- from arekit.contrib.utils.processing.languages.mods import BaseLanguageMods
-
-
- class RussianLanguageMods(BaseLanguageMods):
-
-     @staticmethod
-     def replace_specific_word_chars(word):
-         assert(isinstance(word, str))
-         return word.replace('ё', 'e')
-
-     @staticmethod
-     def is_negation_word(word):
-         return word == 'не'
arekit/contrib/utils/processing/languages/ru/number.py
@@ -1,23 +0,0 @@
- from enum import Enum
-
-
- class RussianNumberType(Enum):
-
-     UNKN = 3
-
-     Plural = 1
-
-     Single = 2
-
-
- class RussianNumberTypeService(object):
-
-     __russian = {
-         'ед': RussianNumberType.Single,
-         'мн': RussianNumberType.Plural
-     }
-
-     @staticmethod
-     def iter_rus_mystem_tags():
-         for key, value in RussianNumberTypeService.__russian.items():
-             yield key, value
arekit/contrib/utils/processing/languages/ru/pos_service.py
@@ -1,36 +0,0 @@
- from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
-
-
- class PartOfSpeechTypesService(object):
-
-     __pos_names = {
-         "S": PartOfSpeechType.NOUN,
-         "ADV": PartOfSpeechType.ADV,
-         "ADVPRO": PartOfSpeechType.ADVPRO,
-         "ANUM": PartOfSpeechType.ANUM,
-         "APRO": PartOfSpeechType.APRO,
-         "COM": PartOfSpeechType.COM,
-         "CONJ": PartOfSpeechType.CONJ,
-         "INTJ": PartOfSpeechType.INTJ,
-         "NUM": PartOfSpeechType.NUM,
-         "PART": PartOfSpeechType.PART,
-         "PR": PartOfSpeechType.PR,
-         "A": PartOfSpeechType.ADJ,
-         "SPRO": PartOfSpeechType.SPRO,
-         "V": PartOfSpeechType.VERB,
-         "UNKN": PartOfSpeechType.Unknown,
-         "EMPTY": PartOfSpeechType.Empty}
-
-     @staticmethod
-     def iter_mystem_tags():
-         for key, value in PartOfSpeechTypesService.__pos_names.items():
-             yield key, value
-
-     @staticmethod
-     def get_mystem_from_string(value):
-         return PartOfSpeechTypesService.__pos_names[value]
-
-     @staticmethod
-     def get_mystem_pos_count():
-         return len(PartOfSpeechTypesService.__pos_names)
-
File without changes
arekit/contrib/utils/processing/lemmatization/mystem.py
@@ -1,51 +0,0 @@
- from arekit.common.text.stemmer import Stemmer
- from arekit.common.utils import filter_whitespaces
- from pymystem3 import Mystem
-
-
- class MystemWrapper(Stemmer):
-     """ Yandex MyStem wrapper
-
-         part of speech description:
-         https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
-     """
-
-     def __init__(self, entire_input=False):
-         """
-         entire_input: bool
-             Mystem parameter that allows to keep all information from input (true) or
-             remove garbage characters
-         """
-         self.__mystem = Mystem(entire_input=entire_input)
-
-     # region properties
-
-     @property
-     def MystemInstance(self):
-         return self.__mystem
-
-     # endregion
-
-     # region public methods
-
-     def lemmatize_to_list(self, text):
-         return self.__lemmatize_core(text)
-
-     def lemmatize_to_str(self, text):
-         result = " ".join(self.__lemmatize_core(text))
-         return result if len(result) != 0 else self.__process_original_text(text)
-
-     # endregion
-
-     # region private methods
-
-     def __lemmatize_core(self, text):
-         assert(isinstance(text, str))
-         result_list = self.__mystem.lemmatize(self.__process_original_text(text))
-         return filter_whitespaces(result_list)
-
-     @staticmethod
-     def __process_original_text(text):
-         return text.lower()
-
-     # endregion
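A short usage sketch for the removed wrapper, assuming arekit 0.25.0 and that the pymystem3 package is installed (as the import above requires):

from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper

stemmer = MystemWrapper(entire_input=False)
print(stemmer.lemmatize_to_str("Мама мыла раму"))   # e.g. "мама мыть рама"
print(stemmer.lemmatize_to_list("Мама мыла раму"))  # the same lemmas, as a list of strings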
File without changes
arekit/contrib/utils/processing/pos/base.py
@@ -1,12 +0,0 @@
- # TODO. Move to base class.
- class POSTagger:
-
-     def get_term_pos(self, term):
-         raise NotImplementedError()
-
-     def get_term_number(self, term):
-         raise NotImplementedError()
-
-     def pos_to_int(self, pos):
-         raise NotImplementedError()
-
arekit/contrib/utils/processing/pos/mystem_wrap.py
@@ -1,134 +0,0 @@
- from pymystem3 import Mystem
-
- from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
- from arekit.contrib.utils.processing.languages.ru.cases import RussianCases, RussianCasesService
- from arekit.contrib.utils.processing.languages.ru.number import RussianNumberType, RussianNumberTypeService
- from arekit.contrib.utils.processing.languages.ru.pos_service import PartOfSpeechTypesService
- from arekit.contrib.utils.processing.pos.russian import RussianPOSTagger
-
-
- class POSMystemWrapper(RussianPOSTagger):
-
-     _ArgsSeparator = ','
-     _GrammarKey = 'gr'
-
-     def __init__(self, mystem):
-         assert(isinstance(mystem, Mystem))
-         self.__mystem = mystem
-
-     # region private methods
-
-     @staticmethod
-     def __extract_from_analysis(analysis, func):
-         """
-         part of speech description:
-         https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
-         func: f(args) -> out
-         returns: str or None
-         """
-         assert(callable(func))
-
-         if 'analysis' not in analysis:
-             return func(None)
-
-         info = analysis['analysis']
-         if len(info) == 0:
-             return func(None)
-
-         return func(info[0])
-
-     @staticmethod
-     def __get_pos(arguments):
-         if arguments is None:
-             return PartOfSpeechType.Unknown
-
-         pos = arguments[POSMystemWrapper._GrammarKey].split(POSMystemWrapper._ArgsSeparator)[0]
-         if '=' in pos:
-             pos = pos.split('=')[0]
-
-         return PartOfSpeechTypesService.get_mystem_from_string(pos)
-
-     @staticmethod
-     def __get_russian_case(arguments):
-         if arguments is None:
-             return RussianCases.UNKN
-
-         all_params = set(POSMystemWrapper.__iter_params(arguments))
-
-         for key, case in RussianCasesService.iter_rus_mystem_tags():
-             if key in all_params:
-                 return case
-
-         return RussianCases.UNKN
-
-     @staticmethod
-     def __get_number(arguments):
-         if arguments is None:
-             return RussianNumberType.UNKN
-
-         all_params = set(POSMystemWrapper.__iter_params(arguments))
-
-         for key, case in RussianNumberTypeService.iter_rus_mystem_tags():
-             if key in all_params:
-                 return case
-
-         return RussianNumberType.UNKN
-
-     @staticmethod
-     def __iter_params(arguments):
-         params = arguments[POSMystemWrapper._GrammarKey].split(POSMystemWrapper._ArgsSeparator)
-         for optionally_combined in params:
-             for param in optionally_combined.split('='):
-                 yield param
-
-     # endregion
-
-     def get_term_pos(self, term):
-         assert(isinstance(term, str))
-         analyzed = self.__mystem.analyze(term)
-         return self.__extract_from_analysis(analyzed[0], self.__get_pos) \
-             if len(analyzed) > 0 else PartOfSpeechType.Unknown
-
-     def get_term_case(self, term):
-         assert(isinstance(term, str))
-         analyzed = self.__mystem.analyze(term)
-         return self.__extract_from_analysis(analyzed[0], self.__get_russian_case) \
-             if len(analyzed) > 0 else RussianCases.UNKN
-
-     def get_term_number(self, term):
-         assert(isinstance(term, str))
-         analyzed = self.__mystem.analyze(term)
-         return self.__extract_from_analysis(analyzed[0], self.__get_number) \
-             if len(analyzed) > 0 else RussianNumberType.UNKN
-
-     def get_terms_russian_cases(self, text):
-         """ list of part of speech according to the certain word in text
-         """
-         assert(isinstance(text, str))
-         cases = []
-
-         analyzed = self.__mystem.analyze(text)
-         for a in analyzed:
-             pos = self.__extract_from_analysis(a, self.__get_russian_case) if len(analyzed) > 0 else RussianCases.UNKN
-             cases.append(pos)
-
-         return cases
-
-     def pos_to_int(self, pos):
-         assert(isinstance(pos, PartOfSpeechType))
-         return int(pos)
-
-     @staticmethod
-     def is_adjective(pos_type):
-         assert(isinstance(pos_type, PartOfSpeechType))
-         return pos_type == PartOfSpeechType.ADJ
-
-     @staticmethod
-     def is_noun(pos_type):
-         assert(isinstance(pos_type, PartOfSpeechType))
-         return pos_type == PartOfSpeechType.NOUN
-
-     @staticmethod
-     def is_verb(pos_type):
-         assert(isinstance(pos_type, PartOfSpeechType))
-         return pos_type == PartOfSpeechType.VERB
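A matching sketch for the removed POS wrapper, which simply wraps a shared Mystem instance (again a hypothetical example that assumes pymystem3 is available):

from pymystem3 import Mystem
from arekit.contrib.utils.processing.pos.mystem_wrap import POSMystemWrapper

tagger = POSMystemWrapper(Mystem())
pos = tagger.get_term_pos("дом")
print(pos, POSMystemWrapper.is_noun(pos))  # expected: PartOfSpeechType.NOUN True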
arekit/contrib/utils/processing/pos/russian.py
@@ -1,10 +0,0 @@
- from arekit.contrib.utils.processing.pos.base import POSTagger
-
-
- class RussianPOSTagger(POSTagger):
-     """ Provides cases support ('падежи')
-     """
-
-     def get_term_case(self, term):
-         raise NotImplementedError()
-
File without changes