arekit 0.25.0__py3-none-any.whl → 0.25.1__py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- arekit/common/data/storages/base.py +4 -15
- arekit/common/docs/parser.py +3 -30
- arekit/common/pipeline/items/base.py +1 -1
- arekit/common/utils.py +11 -8
- arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- arekit/contrib/utils/data/storages/pandas_based.py +2 -17
- arekit/contrib/utils/data/storages/row_cache.py +2 -1
- arekit/contrib/utils/data/storages/sqlite_based.py +2 -1
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +5 -4
- {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/METADATA +4 -5
- {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/RECORD +15 -88
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +0 -68
- arekit/common/data/input/repositories/sample.py +0 -22
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +0 -26
- arekit/common/service/__init__.py +0 -0
- arekit/common/service/sqlite.py +0 -36
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +0 -149
- arekit/contrib/networks/embedding_io.py +0 -18
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +0 -6
- arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit/contrib/networks/input/providers/text.py +0 -24
- arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit/contrib/networks/input/term_types.py +0 -13
- arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit/contrib/networks/vectorizer.py +0 -6
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +0 -7
- arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit/contrib/utils/data/readers/sqlite.py +0 -14
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +0 -50
- arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit/contrib/utils/data/writers/sqlite_native.py +0 -114
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +0 -94
- arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -55
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -33
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -105
- arekit/contrib/utils/pipelines/items/text/translator.py +0 -136
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit/contrib/utils/serializer.py +0 -42
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- {arekit-0.25.0.data → arekit-0.25.1.data}/data/logo.png +0 -0
- {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
- {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +0 -0
- {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
arekit/contrib/utils/processing/languages/ru/pos_service.py

@@ -1,36 +0,0 @@
-from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
-
-
-class PartOfSpeechTypesService(object):
-
-    __pos_names = {
-        "S": PartOfSpeechType.NOUN,
-        "ADV": PartOfSpeechType.ADV,
-        "ADVPRO": PartOfSpeechType.ADVPRO,
-        "ANUM": PartOfSpeechType.ANUM,
-        "APRO": PartOfSpeechType.APRO,
-        "COM": PartOfSpeechType.COM,
-        "CONJ": PartOfSpeechType.CONJ,
-        "INTJ": PartOfSpeechType.INTJ,
-        "NUM": PartOfSpeechType.NUM,
-        "PART": PartOfSpeechType.PART,
-        "PR": PartOfSpeechType.PR,
-        "A": PartOfSpeechType.ADJ,
-        "SPRO": PartOfSpeechType.SPRO,
-        "V": PartOfSpeechType.VERB,
-        "UNKN": PartOfSpeechType.Unknown,
-        "EMPTY": PartOfSpeechType.Empty}
-
-    @staticmethod
-    def iter_mystem_tags():
-        for key, value in PartOfSpeechTypesService.__pos_names.items():
-            yield key, value
-
-    @staticmethod
-    def get_mystem_from_string(value):
-        return PartOfSpeechTypesService.__pos_names[value]
-
-    @staticmethod
-    def get_mystem_pos_count():
-        return len(PartOfSpeechTypesService.__pos_names)
-
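For context, a minimal usage sketch of the removed lookup; it runs only against arekit 0.25.0, where the class still exists:

from arekit.contrib.utils.processing.languages.ru.pos_service import PartOfSpeechTypesService

# "V" maps to PartOfSpeechType.VERB in the table above.
pos = PartOfSpeechTypesService.get_mystem_from_string("V")
# The table defines 16 Mystem tags in total.
print(pos, PartOfSpeechTypesService.get_mystem_pos_count())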
File without changes
arekit/contrib/utils/processing/lemmatization/mystem.py

@@ -1,51 +0,0 @@
-from arekit.common.text.stemmer import Stemmer
-from arekit.common.utils import filter_whitespaces
-from pymystem3 import Mystem
-
-
-class MystemWrapper(Stemmer):
-    """ Yandex MyStem wrapper
-
-        part of speech description:
-        https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
-    """
-
-    def __init__(self, entire_input=False):
-        """
-        entire_input: bool
-            Mystem parameter that allows to keep all information from input (true) or
-            remove garbage characters
-        """
-        self.__mystem = Mystem(entire_input=entire_input)
-
-    # region properties
-
-    @property
-    def MystemInstance(self):
-        return self.__mystem
-
-    # endregion
-
-    # region public methods
-
-    def lemmatize_to_list(self, text):
-        return self.__lemmatize_core(text)
-
-    def lemmatize_to_str(self, text):
-        result = " ".join(self.__lemmatize_core(text))
-        return result if len(result) != 0 else self.__process_original_text(text)
-
-    # endregion
-
-    # region private methods
-
-    def __lemmatize_core(self, text):
-        assert(isinstance(text, str))
-        result_list = self.__mystem.lemmatize(self.__process_original_text(text))
-        return filter_whitespaces(result_list)
-
-    @staticmethod
-    def __process_original_text(text):
-        return text.lower()
-
-    # endregion
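A minimal sketch of how the removed wrapper was driven, assuming arekit 0.25.0 and pymystem3 are installed (pymystem3 fetches the Mystem binary on first use):

from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper

stemmer = MystemWrapper(entire_input=False)
# Lowercases the input, lemmatizes via Mystem, and drops whitespace tokens.
print(stemmer.lemmatize_to_list("Мама мыла раму"))  # expected: ['мама', 'мыть', 'рама']
print(stemmer.lemmatize_to_str("Мама мыла раму"))   # expected: 'мама мыть рама'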
File without changes
arekit/contrib/utils/processing/pos/mystem_wrap.py

@@ -1,134 +0,0 @@
-from pymystem3 import Mystem
-
-from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
-from arekit.contrib.utils.processing.languages.ru.cases import RussianCases, RussianCasesService
-from arekit.contrib.utils.processing.languages.ru.number import RussianNumberType, RussianNumberTypeService
-from arekit.contrib.utils.processing.languages.ru.pos_service import PartOfSpeechTypesService
-from arekit.contrib.utils.processing.pos.russian import RussianPOSTagger
-
-
-class POSMystemWrapper(RussianPOSTagger):
-
-    _ArgsSeparator = ','
-    _GrammarKey = 'gr'
-
-    def __init__(self, mystem):
-        assert(isinstance(mystem, Mystem))
-        self.__mystem = mystem
-
-    # region private methods
-
-    @staticmethod
-    def __extract_from_analysis(analysis, func):
-        """
-        part of speech description:
-        https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
-        func: f(args) -> out
-        returns: str or None
-        """
-        assert(callable(func))
-
-        if 'analysis' not in analysis:
-            return func(None)
-
-        info = analysis['analysis']
-        if len(info) == 0:
-            return func(None)
-
-        return func(info[0])
-
-    @staticmethod
-    def __get_pos(arguments):
-        if arguments is None:
-            return PartOfSpeechType.Unknown
-
-        pos = arguments[POSMystemWrapper._GrammarKey].split(POSMystemWrapper._ArgsSeparator)[0]
-        if '=' in pos:
-            pos = pos.split('=')[0]
-
-        return PartOfSpeechTypesService.get_mystem_from_string(pos)
-
-    @staticmethod
-    def __get_russian_case(arguments):
-        if arguments is None:
-            return RussianCases.UNKN
-
-        all_params = set(POSMystemWrapper.__iter_params(arguments))
-
-        for key, case in RussianCasesService.iter_rus_mystem_tags():
-            if key in all_params:
-                return case
-
-        return RussianCases.UNKN
-
-    @staticmethod
-    def __get_number(arguments):
-        if arguments is None:
-            return RussianNumberType.UNKN
-
-        all_params = set(POSMystemWrapper.__iter_params(arguments))
-
-        for key, case in RussianNumberTypeService.iter_rus_mystem_tags():
-            if key in all_params:
-                return case
-
-        return RussianNumberType.UNKN
-
-    @staticmethod
-    def __iter_params(arguments):
-        params = arguments[POSMystemWrapper._GrammarKey].split(POSMystemWrapper._ArgsSeparator)
-        for optionally_combined in params:
-            for param in optionally_combined.split('='):
-                yield param
-
-    # endregion
-
-    def get_term_pos(self, term):
-        assert(isinstance(term, str))
-        analyzed = self.__mystem.analyze(term)
-        return self.__extract_from_analysis(analyzed[0], self.__get_pos) \
-            if len(analyzed) > 0 else PartOfSpeechType.Unknown
-
-    def get_term_case(self, term):
-        assert(isinstance(term, str))
-        analyzed = self.__mystem.analyze(term)
-        return self.__extract_from_analysis(analyzed[0], self.__get_russian_case) \
-            if len(analyzed) > 0 else RussianCases.UNKN
-
-    def get_term_number(self, term):
-        assert(isinstance(term, str))
-        analyzed = self.__mystem.analyze(term)
-        return self.__extract_from_analysis(analyzed[0], self.__get_number) \
-            if len(analyzed) > 0 else RussianNumberType.UNKN
-
-    def get_terms_russian_cases(self, text):
-        """ list of part of speech according to the certain word in text
-        """
-        assert(isinstance(text, str))
-        cases = []
-
-        analyzed = self.__mystem.analyze(text)
-        for a in analyzed:
-            pos = self.__extract_from_analysis(a, self.__get_russian_case) if len(analyzed) > 0 else RussianCases.UNKN
-            cases.append(pos)
-
-        return cases
-
-    def pos_to_int(self, pos):
-        assert(isinstance(pos, PartOfSpeechType))
-        return int(pos)
-
-    @staticmethod
-    def is_adjective(pos_type):
-        assert(isinstance(pos_type, PartOfSpeechType))
-        return pos_type == PartOfSpeechType.ADJ
-
-    @staticmethod
-    def is_noun(pos_type):
-        assert(isinstance(pos_type, PartOfSpeechType))
-        return pos_type == PartOfSpeechType.NOUN
-
-    @staticmethod
-    def is_verb(pos_type):
-        assert(isinstance(pos_type, PartOfSpeechType))
-        return pos_type == PartOfSpeechType.VERB
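A short sketch of how the removed tagger was used, again assuming arekit 0.25.0 and pymystem3:

from pymystem3 import Mystem
from arekit.contrib.utils.processing.pos.mystem_wrap import POSMystemWrapper

tagger = POSMystemWrapper(Mystem(entire_input=False))
# Mystem tags "книга" as "S", which the service maps to PartOfSpeechType.NOUN.
pos = tagger.get_term_pos("книга")
print(pos, POSMystemWrapper.is_noun(pos))  # expected: True for is_noun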
File without changes
arekit/contrib/utils/processing/text/tokens.py

@@ -1,127 +0,0 @@
-from urllib.parse import urlparse
-from arekit.common.context.token import Token
-
-
-# TODO. Provide the base (BaseTokens) type.
-# TODO. With the related API at BaseTokens.
-class Tokens:
-    """
-    Tokens used to describe a non-word text units, such as punctuation,
-    uknown words/chars, smiles, etc.
-    """
-
-    _wrapper = "<[{}]>"
-    COMMA = _wrapper.format(',')
-    SEMICOLON = _wrapper.format(';')
-    COLON = _wrapper.format(':')
-    QUOTE = _wrapper.format('QUOTE')
-    DASH = _wrapper.format('-')
-    LONG_DASH = _wrapper.format('long_dash')
-    DOT = _wrapper.format('.')
-    TRIPLE_DOTS = _wrapper.format('…')
-    EXC_SIGN = _wrapper.format('!')
-    QUESTION_SIGN = _wrapper.format('?')
-    OPEN_BRACKET = _wrapper.format('OPEN_BRACKET')
-    CLOSED_BRACKET = _wrapper.format('CLOSED_BRACKET')
-    NUMBER = _wrapper.format('NUMBER')
-    NEW_LINE = _wrapper.format("NEW_LINE")
-    UNKNOWN_CHAR = _wrapper.format('UNKNOWN_CHAR')
-    UNKNOWN_WORD = _wrapper.format('UNKNOWN_WORD')
-    URL = _wrapper.format("URL")
-
-    __token_mapping = {
-        ',': COMMA,
-        '.': DOT,
-        '…': TRIPLE_DOTS,
-        ':': COLON,
-        ';': SEMICOLON,
-        '-': DASH,
-        '—': LONG_DASH,
-        '?': QUESTION_SIGN,
-        '!': EXC_SIGN,
-        '(': OPEN_BRACKET,
-        ')': CLOSED_BRACKET,
-        '{': OPEN_BRACKET,
-        '}': CLOSED_BRACKET,
-        '[': OPEN_BRACKET,
-        ']': CLOSED_BRACKET,
-        '\n': NEW_LINE,
-        '«': QUOTE,
-        '»': QUOTE,
-        '"': QUOTE,
-    }
-
-    __supported_tokens = {
-        COMMA,
-        SEMICOLON,
-        COLON,
-        QUOTE,
-        DASH,
-        DOT,
-        LONG_DASH,
-        TRIPLE_DOTS,
-        EXC_SIGN,
-        QUESTION_SIGN,
-        OPEN_BRACKET,
-        CLOSED_BRACKET,
-        NUMBER,
-        URL,
-        NEW_LINE,
-        UNKNOWN_CHAR,
-        UNKNOWN_WORD}
-
-    @staticmethod
-    def try_create(subterm):
-        """
-        Trying create a token by given 'term' parameter
-        subterm: unicode
-            I.e. term ending, so means a part of original term
-        """
-        assert(isinstance(subterm, str))
-        if subterm not in Tokens.__token_mapping:
-            return None
-        return Token(term=subterm, token_value=Tokens.__token_mapping[subterm])
-
-    @staticmethod
-    def try_parse(term):
-        assert(isinstance(term, str))
-        for origin, token_value in Tokens.__token_mapping.items():
-            if term == token_value:
-                return Token(term=origin, token_value=token_value)
-
-    @staticmethod
-    def try_create_number(term):
-        assert(isinstance(term, str))
-        if not term.isdigit():
-            return None
-        return Token(term=term, token_value=Tokens.NUMBER)
-
-    @staticmethod
-    def try_create_url(term):
-        assert(isinstance(term, str))
-        result = urlparse(term)
-        is_correct = result.scheme and result.netloc and result.path
-        if not is_correct:
-            return None
-        return Token(term=term, token_value=Tokens.URL)
-
-    @staticmethod
-    def is_token(term):
-        assert(isinstance(term, str))
-        return term in Tokens.__supported_tokens
-
-    @staticmethod
-    def iter_chars_by_token(term):
-        """
-        Iterate through charts that is related to term
-        token: char
-        """
-        assert(isinstance(term, str))
-        for char, token in Tokens.__token_mapping.items():
-            if term == token:
-                yield char
-
-    @staticmethod
-    def iter_supported_tokens():
-        for token in Tokens.__supported_tokens:
-            yield token
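A minimal sketch of the removed Tokens API against arekit 0.25.0 (expected results shown as comments follow from the mapping above):

from arekit.contrib.utils.processing.text.tokens import Tokens

print(Tokens.is_token(Tokens.COMMA))                    # True
print(Tokens.try_create(",") is not None)               # True: ',' wraps to '<[,]>'
print(Tokens.try_create_number("42") is not None)       # True: all-digit terms
print(Tokens.try_create_url("https://example.com/page") is not None)  # True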
arekit/contrib/utils/serializer.py

@@ -1,42 +0,0 @@
-import logging
-
-from collections.abc import Iterable
-
-from arekit.common.data.input.providers.columns.sample import SampleColumnsProvider
-from arekit.common.data.input.providers.rows.base import BaseRowProvider
-from arekit.common.data.input.repositories.base import BaseInputRepository
-from arekit.common.data.input.repositories.sample import BaseInputSamplesRepository
-from arekit.common.data.storages.base import BaseRowsStorage
-from arekit.contrib.utils.data.contents.opinions import InputTextOpinionProvider
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-
-
-class InputDataSerializationHelper(object):
-
-    @staticmethod
-    def create_samples_repo(keep_labels, rows_provider, storage):
-        assert(isinstance(rows_provider, BaseRowProvider))
-        assert(isinstance(keep_labels, bool))
-        assert(isinstance(storage, BaseRowsStorage))
-        return BaseInputSamplesRepository(
-            columns_provider=SampleColumnsProvider(store_labels=keep_labels),
-            rows_provider=rows_provider,
-            storage=storage)
-
-    @staticmethod
-    def fill_and_write(pipeline, repo, target, writer, doc_ids_iter, desc=""):
-        assert(isinstance(pipeline, list))
-        assert(isinstance(doc_ids_iter, Iterable))
-        assert(isinstance(repo, BaseInputRepository))
-
-        doc_ids = list(doc_ids_iter)
-
-        repo.populate(contents_provider=InputTextOpinionProvider(pipeline),
-                      doc_ids=doc_ids,
-                      desc=desc,
-                      writer=writer,
-                      target=target)
-
-        repo.push(writer=writer, target=target)
File without changes
arekit/contrib/utils/vectorizers/bpe.py

@@ -1,93 +0,0 @@
-import numpy as np
-
-from arekit.common.log_utils import logger
-from arekit.contrib.networks.embedding import Embedding
-from arekit.contrib.networks.vectorizer import BaseVectorizer
-
-
-class BPEVectorizer(BaseVectorizer):
-    """ Embedding algorithm based on parts (trigrams originally)
-    """
-
-    def __init__(self, embedding, max_part_size=3, word_separator=' '):
-        assert(isinstance(embedding, Embedding))
-        assert(isinstance(max_part_size, int))
-        self.__embedding = embedding
-        self.__max_part_size = max_part_size
-        self.__word_separator = word_separator
-
-    def create_term_embedding(self, term):
-        """ Note: returns modified term value in a form of the `word` returning parameter.
-        """
-        assert(isinstance(term, str))
-
-        word, word_embedding = self.__get_from_embedding(term=term) \
-            if term in self.__embedding else self.__compose_from_parts(term=term)
-
-        # In order to prevent a problem of the further separations during reading process.
-        # it is necessary to replace the separators with the other chars.
-        word = word.replace(self.__word_separator, '-')
-
-        return word, word_embedding
-
-    def __compose_from_parts(self, term, do_lowercase=True):
-        # remove empty spaces before and after.
-        term = term.strip()
-
-        # perform lowercasing
-        if do_lowercase:
-            term = term.lower()
-
-        # Calculating vector from term parts
-        count = 0
-        vector = np.zeros(self.__embedding.VectorSize)
-        for word in term.split(self.__word_separator):
-            v, c = self.__create_embedding_for_word(word=word, embedding=self.__embedding)
-            count += c
-            vector = vector + v
-
-        return term, vector / count if count > 0 else vector
-
-    def __get_from_embedding(self, term):
-        return self.__embedding.try_get_related_word(term), self.__embedding[term]
-
-    def __create_embedding_for_word(self, word, embedding):
-        assert(isinstance(word, str))
-        assert(isinstance(embedding, Embedding))
-
-        if word in embedding:
-            return embedding[word], 1
-
-        c_i = 0
-        c_l = self.__max_part_size
-        count = 0
-        vector = np.zeros(embedding.VectorSize)
-        missings = []
-
-        while c_i < len(word):
-
-            if c_l == 0:
-                missings.append(c_i)
-                c_i += 1
-                c_l = self.__max_part_size
-                continue
-
-            right_b = min(len(word), c_i + c_l)
-
-            s_i = embedding.try_find_index_by_plain_word(word=word[c_i:right_b])
-
-            if s_i is None:
-                c_l -= 1
-                continue
-
-            vector += embedding.get_vector_by_index(s_i)
-            c_i += c_l
-            count += 1
-
-        debug = False
-        if debug:
-            w_debug = ''.join(['?' if i in missings else ch
-                               for i, ch in enumerate(word)])
-            logger.debug('Embedded: {}'.format(w_debug).encode('utf-8'))
-
-        return vector, count
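The core of the removed BPEVectorizer is the greedy longest-match loop in __create_embedding_for_word. Below is a self-contained re-implementation of just that loop over a toy vocabulary; the names and vectors are illustrative, not part of AREkit:

import numpy as np

vocab = {"cat": np.array([1.0, 0.0]), "s": np.array([0.0, 1.0])}

def embed_word(word, vocab, max_part_size=3, dim=2):
    c_i, c_l, count = 0, max_part_size, 0
    vector = np.zeros(dim)
    while c_i < len(word):
        if c_l == 0:
            # No part of any size matched here: skip one character and reset.
            c_i, c_l = c_i + 1, max_part_size
            continue
        part = word[c_i:min(len(word), c_i + c_l)]
        if part not in vocab:
            c_l -= 1          # shrink the match window and retry
            continue
        vector += vocab[part]
        c_i += c_l            # advances by the window size, as in the original
        count += 1
    return vector / count if count > 0 else vector

print(embed_word("cats", vocab))  # mean of vocab["cat"] and vocab["s"] -> [0.5 0.5]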
arekit/contrib/utils/vectorizers/random_norm.py

@@ -1,39 +0,0 @@
-import numpy as np
-
-from arekit.contrib.networks.vectorizer import BaseVectorizer
-
-
-class RandomNormalVectorizer(BaseVectorizer):
-
-    def __init__(self, vector_size, token_offset=12345, max_tokens_count=100):
-        assert(isinstance(vector_size, int))
-        self.__vector_size = vector_size
-        self.__seed_token_offset = token_offset
-        self.__max_tokens_count = max_tokens_count
-
-    def create_term_embedding(self, term):
-        """ term: is its index.
-        """
-        embedding = self.__get_random_normal_distribution(
-            vector_size=self.__vector_size,
-            seed=(self.__string_to_int(term) % self.__max_tokens_count) + self.__seed_token_offset,
-            loc=0.05,
-            scale=0.025)
-        return term, embedding
-
-    # region private methods
-
-    def __string_to_int(self, s):
-        # Originally taken from here:
-        # https://stackoverflow.com/questions/2511058/persistent-hashing-of-strings-in-python
-        ord3 = lambda x: '%.3d' % ord(x)
-        return int(''.join(map(ord3, s)))
-
-    @staticmethod
-    def __get_random_normal_distribution(vector_size, seed, loc, scale):
-        assert (isinstance(vector_size, int))
-        assert (isinstance(seed, int))
-        np.random.seed(seed)
-        return np.random.normal(loc=loc, scale=scale, size=vector_size)
-
-    # endregion
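The removed RandomNormalVectorizer boils down to seeding a normal distribution from a persistent string hash, so each token gets the same pseudo-random vector on every run. A standalone sketch of the same idea, using a local RandomState instead of the original's global np.random.seed:

import numpy as np

def string_to_int(s):
    # Persistent string hash: concatenate 3-digit ordinals of each character.
    return int("".join("%.3d" % ord(ch) for ch in s))

def term_embedding(term, vector_size=5, token_offset=12345, max_tokens_count=100):
    seed = (string_to_int(term) % max_tokens_count) + token_offset
    rng = np.random.RandomState(seed)  # local RNG, avoids touching global state
    return term, rng.normal(loc=0.05, scale=0.025, size=vector_size)

print(term_embedding("<[NUMBER]>")[1])  # identical vector on every run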
File without changes
File without changes
File without changes
File without changes