arekit 0.25.0__py3-none-any.whl → 0.25.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. arekit/common/data/storages/base.py +4 -15
  2. arekit/common/docs/parser.py +3 -30
  3. arekit/common/pipeline/items/base.py +1 -1
  4. arekit/common/utils.py +11 -8
  5. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  6. arekit/contrib/utils/data/storages/pandas_based.py +2 -17
  7. arekit/contrib/utils/data/storages/row_cache.py +2 -1
  8. arekit/contrib/utils/data/storages/sqlite_based.py +2 -1
  9. arekit/contrib/utils/pipelines/text_opinion/extraction.py +5 -4
  10. {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/METADATA +4 -5
  11. {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/RECORD +15 -88
  12. arekit/common/data/input/repositories/__init__.py +0 -0
  13. arekit/common/data/input/repositories/base.py +0 -68
  14. arekit/common/data/input/repositories/sample.py +0 -22
  15. arekit/common/data/views/__init__.py +0 -0
  16. arekit/common/data/views/samples.py +0 -26
  17. arekit/common/service/__init__.py +0 -0
  18. arekit/common/service/sqlite.py +0 -36
  19. arekit/contrib/networks/__init__.py +0 -0
  20. arekit/contrib/networks/embedding.py +0 -149
  21. arekit/contrib/networks/embedding_io.py +0 -18
  22. arekit/contrib/networks/input/__init__.py +0 -0
  23. arekit/contrib/networks/input/const.py +0 -6
  24. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  25. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  26. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  27. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  28. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  29. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  30. arekit/contrib/networks/input/providers/__init__.py +0 -0
  31. arekit/contrib/networks/input/providers/sample.py +0 -129
  32. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  33. arekit/contrib/networks/input/providers/text.py +0 -24
  34. arekit/contrib/networks/input/rows_parser.py +0 -47
  35. arekit/contrib/networks/input/term_types.py +0 -13
  36. arekit/contrib/networks/input/terms_mapping.py +0 -60
  37. arekit/contrib/networks/vectorizer.py +0 -6
  38. arekit/contrib/utils/data/readers/__init__.py +0 -0
  39. arekit/contrib/utils/data/readers/base.py +0 -7
  40. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  41. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  42. arekit/contrib/utils/data/readers/sqlite.py +0 -14
  43. arekit/contrib/utils/data/service/__init__.py +0 -0
  44. arekit/contrib/utils/data/service/balance.py +0 -50
  45. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  46. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  47. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  48. arekit/contrib/utils/data/writers/sqlite_native.py +0 -114
  49. arekit/contrib/utils/embeddings/__init__.py +0 -0
  50. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  51. arekit/contrib/utils/embeddings/tokens.py +0 -30
  52. arekit/contrib/utils/io_utils/embedding.py +0 -72
  53. arekit/contrib/utils/np_utils/__init__.py +0 -0
  54. arekit/contrib/utils/np_utils/embedding.py +0 -22
  55. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  56. arekit/contrib/utils/np_utils/vocab.py +0 -20
  57. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  58. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -94
  59. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -55
  60. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  61. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -33
  62. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -105
  63. arekit/contrib/utils/pipelines/items/text/translator.py +0 -136
  64. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  65. arekit/contrib/utils/processing/languages/mods.py +0 -12
  66. arekit/contrib/utils/processing/languages/pos.py +0 -23
  67. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  68. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  69. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  70. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  71. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  72. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  73. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  74. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  75. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  76. arekit/contrib/utils/processing/pos/base.py +0 -12
  77. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  78. arekit/contrib/utils/processing/pos/russian.py +0 -10
  79. arekit/contrib/utils/processing/text/__init__.py +0 -0
  80. arekit/contrib/utils/processing/text/tokens.py +0 -127
  81. arekit/contrib/utils/serializer.py +0 -42
  82. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  83. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  84. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  85. {arekit-0.25.0.data → arekit-0.25.1.data}/data/logo.png +0 -0
  86. {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
  87. {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +0 -0
  88. {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
@@ -1,36 +0,0 @@
- from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
-
-
- class PartOfSpeechTypesService(object):
-
-     __pos_names = {
-         "S": PartOfSpeechType.NOUN,
-         "ADV": PartOfSpeechType.ADV,
-         "ADVPRO": PartOfSpeechType.ADVPRO,
-         "ANUM": PartOfSpeechType.ANUM,
-         "APRO": PartOfSpeechType.APRO,
-         "COM": PartOfSpeechType.COM,
-         "CONJ": PartOfSpeechType.CONJ,
-         "INTJ": PartOfSpeechType.INTJ,
-         "NUM": PartOfSpeechType.NUM,
-         "PART": PartOfSpeechType.PART,
-         "PR": PartOfSpeechType.PR,
-         "A": PartOfSpeechType.ADJ,
-         "SPRO": PartOfSpeechType.SPRO,
-         "V": PartOfSpeechType.VERB,
-         "UNKN": PartOfSpeechType.Unknown,
-         "EMPTY": PartOfSpeechType.Empty}
-
-     @staticmethod
-     def iter_mystem_tags():
-         for key, value in PartOfSpeechTypesService.__pos_names.items():
-             yield key, value
-
-     @staticmethod
-     def get_mystem_from_string(value):
-         return PartOfSpeechTypesService.__pos_names[value]
-
-     @staticmethod
-     def get_mystem_pos_count():
-         return len(PartOfSpeechTypesService.__pos_names)
-
@@ -1,51 +0,0 @@
- from arekit.common.text.stemmer import Stemmer
- from arekit.common.utils import filter_whitespaces
- from pymystem3 import Mystem
-
-
- class MystemWrapper(Stemmer):
-     """ Yandex MyStem wrapper
-
-         part of speech description:
-         https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
-     """
-
-     def __init__(self, entire_input=False):
-         """
-         entire_input: bool
-             Mystem parameter that allows to keep all information from input (true) or
-             remove garbage characters
-         """
-         self.__mystem = Mystem(entire_input=entire_input)
-
-     # region properties
-
-     @property
-     def MystemInstance(self):
-         return self.__mystem
-
-     # endregion
-
-     # region public methods
-
-     def lemmatize_to_list(self, text):
-         return self.__lemmatize_core(text)
-
-     def lemmatize_to_str(self, text):
-         result = " ".join(self.__lemmatize_core(text))
-         return result if len(result) != 0 else self.__process_original_text(text)
-
-     # endregion
-
-     # region private methods
-
-     def __lemmatize_core(self, text):
-         assert(isinstance(text, str))
-         result_list = self.__mystem.lemmatize(self.__process_original_text(text))
-         return filter_whitespaces(result_list)
-
-     @staticmethod
-     def __process_original_text(text):
-         return text.lower()
-
-     # endregion
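The class above delegated lemmatization to pymystem3. A minimal standalone sketch of that behaviour (assuming pymystem3 is installed; the sample sentence is invented for illustration):

from pymystem3 import Mystem

# entire_input=False asks Mystem to drop auxiliary characters, mirroring the removed wrapper's default.
mystem = Mystem(entire_input=False)

# lemmatize() may return whitespace entries alongside lemmas; the removed wrapper
# lowercased the input first and filtered those entries via filter_whitespaces().
lemmas = [t for t in mystem.lemmatize("Мама мыла раму".lower()) if t.strip()]
print(lemmas)  # e.g. ['мама', 'мыть', 'рама']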
@@ -1,12 +0,0 @@
- # TODO. Move to base class.
- class POSTagger:
-
-     def get_term_pos(self, term):
-         raise NotImplementedError()
-
-     def get_term_number(self, term):
-         raise NotImplementedError()
-
-     def pos_to_int(self, pos):
-         raise NotImplementedError()
-
@@ -1,134 +0,0 @@
- from pymystem3 import Mystem
-
- from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
- from arekit.contrib.utils.processing.languages.ru.cases import RussianCases, RussianCasesService
- from arekit.contrib.utils.processing.languages.ru.number import RussianNumberType, RussianNumberTypeService
- from arekit.contrib.utils.processing.languages.ru.pos_service import PartOfSpeechTypesService
- from arekit.contrib.utils.processing.pos.russian import RussianPOSTagger
-
-
- class POSMystemWrapper(RussianPOSTagger):
-
-     _ArgsSeparator = ','
-     _GrammarKey = 'gr'
-
-     def __init__(self, mystem):
-         assert(isinstance(mystem, Mystem))
-         self.__mystem = mystem
-
-     # region private methods
-
-     @staticmethod
-     def __extract_from_analysis(analysis, func):
-         """
-         part of speech description:
-         https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
-         func: f(args) -> out
-         returns: str or None
-         """
-         assert(callable(func))
-
-         if 'analysis' not in analysis:
-             return func(None)
-
-         info = analysis['analysis']
-         if len(info) == 0:
-             return func(None)
-
-         return func(info[0])
-
-     @staticmethod
-     def __get_pos(arguments):
-         if arguments is None:
-             return PartOfSpeechType.Unknown
-
-         pos = arguments[POSMystemWrapper._GrammarKey].split(POSMystemWrapper._ArgsSeparator)[0]
-         if '=' in pos:
-             pos = pos.split('=')[0]
-
-         return PartOfSpeechTypesService.get_mystem_from_string(pos)
-
-     @staticmethod
-     def __get_russian_case(arguments):
-         if arguments is None:
-             return RussianCases.UNKN
-
-         all_params = set(POSMystemWrapper.__iter_params(arguments))
-
-         for key, case in RussianCasesService.iter_rus_mystem_tags():
-             if key in all_params:
-                 return case
-
-         return RussianCases.UNKN
-
-     @staticmethod
-     def __get_number(arguments):
-         if arguments is None:
-             return RussianNumberType.UNKN
-
-         all_params = set(POSMystemWrapper.__iter_params(arguments))
-
-         for key, case in RussianNumberTypeService.iter_rus_mystem_tags():
-             if key in all_params:
-                 return case
-
-         return RussianNumberType.UNKN
-
-     @staticmethod
-     def __iter_params(arguments):
-         params = arguments[POSMystemWrapper._GrammarKey].split(POSMystemWrapper._ArgsSeparator)
-         for optionally_combined in params:
-             for param in optionally_combined.split('='):
-                 yield param
-
-     # endregion
-
-     def get_term_pos(self, term):
-         assert(isinstance(term, str))
-         analyzed = self.__mystem.analyze(term)
-         return self.__extract_from_analysis(analyzed[0], self.__get_pos) \
-             if len(analyzed) > 0 else PartOfSpeechType.Unknown
-
-     def get_term_case(self, term):
-         assert(isinstance(term, str))
-         analyzed = self.__mystem.analyze(term)
-         return self.__extract_from_analysis(analyzed[0], self.__get_russian_case) \
-             if len(analyzed) > 0 else RussianCases.UNKN
-
-     def get_term_number(self, term):
-         assert(isinstance(term, str))
-         analyzed = self.__mystem.analyze(term)
-         return self.__extract_from_analysis(analyzed[0], self.__get_number) \
-             if len(analyzed) > 0 else RussianNumberType.UNKN
-
-     def get_terms_russian_cases(self, text):
-         """ list of part of speech according to the certain word in text
-         """
-         assert(isinstance(text, str))
-         cases = []
-
-         analyzed = self.__mystem.analyze(text)
-         for a in analyzed:
-             pos = self.__extract_from_analysis(a, self.__get_russian_case) if len(analyzed) > 0 else RussianCases.UNKN
-             cases.append(pos)
-
-         return cases
-
-     def pos_to_int(self, pos):
-         assert(isinstance(pos, PartOfSpeechType))
-         return int(pos)
-
-     @staticmethod
-     def is_adjective(pos_type):
-         assert(isinstance(pos_type, PartOfSpeechType))
-         return pos_type == PartOfSpeechType.ADJ
-
-     @staticmethod
-     def is_noun(pos_type):
-         assert(isinstance(pos_type, PartOfSpeechType))
-         return pos_type == PartOfSpeechType.NOUN
-
-     @staticmethod
-     def is_verb(pos_type):
-         assert(isinstance(pos_type, PartOfSpeechType))
-         return pos_type == PartOfSpeechType.VERB
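For reference, a rough self-contained sketch of the grammeme parsing performed above, using pymystem3 directly (the input phrase is an arbitrary example; the 'analysis' and 'gr' fields follow Mystem's documented output format):

from pymystem3 import Mystem

for item in Mystem().analyze("красивая мама"):
    info = item.get('analysis') or []            # whitespace tokens carry no analysis and are skipped
    if not info:
        continue
    gr = info[0]['gr']                           # e.g. 'A=им,ед,полн,жен' or 'S,жен,од=им,ед'
    pos_tag = gr.split(',')[0].split('=')[0]     # leading grammeme encodes the part of speech ('A', 'S', ...)
    print(item['text'].strip(), '->', pos_tag)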
@@ -1,10 +0,0 @@
- from arekit.contrib.utils.processing.pos.base import POSTagger
-
-
- class RussianPOSTagger(POSTagger):
-     """ Provides cases support ('падежи')
-     """
-
-     def get_term_case(self, term):
-         raise NotImplementedError()
-
@@ -1,127 +0,0 @@
- from urllib.parse import urlparse
- from arekit.common.context.token import Token
-
-
- # TODO. Provide the base (BaseTokens) type.
- # TODO. With the related API at BaseTokens.
- class Tokens:
-     """
-     Tokens used to describe a non-word text units, such as punctuation,
-     uknown words/chars, smiles, etc.
-     """
-
-     _wrapper = "<[{}]>"
-     COMMA = _wrapper.format(',')
-     SEMICOLON = _wrapper.format(';')
-     COLON = _wrapper.format(':')
-     QUOTE = _wrapper.format('QUOTE')
-     DASH = _wrapper.format('-')
-     LONG_DASH = _wrapper.format('long_dash')
-     DOT = _wrapper.format('.')
-     TRIPLE_DOTS = _wrapper.format('…')
-     EXC_SIGN = _wrapper.format('!')
-     QUESTION_SIGN = _wrapper.format('?')
-     OPEN_BRACKET = _wrapper.format('OPEN_BRACKET')
-     CLOSED_BRACKET = _wrapper.format('CLOSED_BRACKET')
-     NUMBER = _wrapper.format('NUMBER')
-     NEW_LINE = _wrapper.format("NEW_LINE")
-     UNKNOWN_CHAR = _wrapper.format('UNKNOWN_CHAR')
-     UNKNOWN_WORD = _wrapper.format('UNKNOWN_WORD')
-     URL = _wrapper.format("URL")
-
-     __token_mapping = {
-         ',': COMMA,
-         '.': DOT,
-         '…': TRIPLE_DOTS,
-         ':': COLON,
-         ';': SEMICOLON,
-         '-': DASH,
-         '—': LONG_DASH,
-         '?': QUESTION_SIGN,
-         '!': EXC_SIGN,
-         '(': OPEN_BRACKET,
-         ')': CLOSED_BRACKET,
-         '{': OPEN_BRACKET,
-         '}': CLOSED_BRACKET,
-         '[': OPEN_BRACKET,
-         ']': CLOSED_BRACKET,
-         '\n': NEW_LINE,
-         '«': QUOTE,
-         '»': QUOTE,
-         '"': QUOTE,
-     }
-
-     __supported_tokens = {
-         COMMA,
-         SEMICOLON,
-         COLON,
-         QUOTE,
-         DASH,
-         DOT,
-         LONG_DASH,
-         TRIPLE_DOTS,
-         EXC_SIGN,
-         QUESTION_SIGN,
-         OPEN_BRACKET,
-         CLOSED_BRACKET,
-         NUMBER,
-         URL,
-         NEW_LINE,
-         UNKNOWN_CHAR,
-         UNKNOWN_WORD}
-
-     @staticmethod
-     def try_create(subterm):
-         """
-         Trying create a token by given 'term' parameter
-         subterm: unicode
-             I.e. term ending, so means a part of original term
-         """
-         assert(isinstance(subterm, str))
-         if subterm not in Tokens.__token_mapping:
-             return None
-         return Token(term=subterm, token_value=Tokens.__token_mapping[subterm])
-
-     @staticmethod
-     def try_parse(term):
-         assert(isinstance(term, str))
-         for origin, token_value in Tokens.__token_mapping.items():
-             if term == token_value:
-                 return Token(term=origin, token_value=token_value)
-
-     @staticmethod
-     def try_create_number(term):
-         assert(isinstance(term, str))
-         if not term.isdigit():
-             return None
-         return Token(term=term, token_value=Tokens.NUMBER)
-
-     @staticmethod
-     def try_create_url(term):
-         assert(isinstance(term, str))
-         result = urlparse(term)
-         is_correct = result.scheme and result.netloc and result.path
-         if not is_correct:
-             return None
-         return Token(term=term, token_value=Tokens.URL)
-
-     @staticmethod
-     def is_token(term):
-         assert(isinstance(term, str))
-         return term in Tokens.__supported_tokens
-
-     @staticmethod
-     def iter_chars_by_token(term):
-         """
-         Iterate through charts that is related to term
-         token: char
-         """
-         assert(isinstance(term, str))
-         for char, token in Tokens.__token_mapping.items():
-             if term == token:
-                 yield char
-
-     @staticmethod
-     def iter_supported_tokens():
-         for token in Tokens.__supported_tokens:
-             yield token
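A short standalone sketch of the same placeholder idea (the mapping below is a trimmed, hypothetical subset of the removed table; the sample terms are invented):

from urllib.parse import urlparse

_wrapper = "<[{}]>"
token_mapping = {',': _wrapper.format(','), '?': _wrapper.format('?'), '"': _wrapper.format('QUOTE')}

def to_token(subterm):
    # Known punctuation maps to its placeholder; digit-only and URL-like terms get generic placeholders.
    if subterm in token_mapping:
        return token_mapping[subterm]
    if subterm.isdigit():
        return _wrapper.format('NUMBER')
    parsed = urlparse(subterm)
    if parsed.scheme and parsed.netloc and parsed.path:
        return _wrapper.format('URL')
    return None

print([to_token(t) for t in ['"', 'мама', '42', 'https://example.com/page', '?']])
# e.g. ['<[QUOTE]>', None, '<[NUMBER]>', '<[URL]>', '<[?]>']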
@@ -1,42 +0,0 @@
- import logging
-
- from collections.abc import Iterable
-
- from arekit.common.data.input.providers.columns.sample import SampleColumnsProvider
- from arekit.common.data.input.providers.rows.base import BaseRowProvider
- from arekit.common.data.input.repositories.base import BaseInputRepository
- from arekit.common.data.input.repositories.sample import BaseInputSamplesRepository
- from arekit.common.data.storages.base import BaseRowsStorage
- from arekit.contrib.utils.data.contents.opinions import InputTextOpinionProvider
-
- logger = logging.getLogger(__name__)
- logging.basicConfig(level=logging.INFO)
-
-
- class InputDataSerializationHelper(object):
-
-     @staticmethod
-     def create_samples_repo(keep_labels, rows_provider, storage):
-         assert(isinstance(rows_provider, BaseRowProvider))
-         assert(isinstance(keep_labels, bool))
-         assert(isinstance(storage, BaseRowsStorage))
-         return BaseInputSamplesRepository(
-             columns_provider=SampleColumnsProvider(store_labels=keep_labels),
-             rows_provider=rows_provider,
-             storage=storage)
-
-     @staticmethod
-     def fill_and_write(pipeline, repo, target, writer, doc_ids_iter, desc=""):
-         assert(isinstance(pipeline, list))
-         assert(isinstance(doc_ids_iter, Iterable))
-         assert(isinstance(repo, BaseInputRepository))
-
-         doc_ids = list(doc_ids_iter)
-
-         repo.populate(contents_provider=InputTextOpinionProvider(pipeline),
-                       doc_ids=doc_ids,
-                       desc=desc,
-                       writer=writer,
-                       target=target)
-
-         repo.push(writer=writer, target=target)
@@ -1,93 +0,0 @@
- import numpy as np
-
- from arekit.common.log_utils import logger
- from arekit.contrib.networks.embedding import Embedding
- from arekit.contrib.networks.vectorizer import BaseVectorizer
-
-
- class BPEVectorizer(BaseVectorizer):
-     """ Embedding algorithm based on parts (trigrams originally)
-     """
-
-     def __init__(self, embedding, max_part_size=3, word_separator=' '):
-         assert(isinstance(embedding, Embedding))
-         assert(isinstance(max_part_size, int))
-         self.__embedding = embedding
-         self.__max_part_size = max_part_size
-         self.__word_separator = word_separator
-
-     def create_term_embedding(self, term):
-         """ Note: returns modified term value in a form of the `word` returning parameter.
-         """
-         assert(isinstance(term, str))
-
-         word, word_embedding = self.__get_from_embedding(term=term) \
-             if term in self.__embedding else self.__compose_from_parts(term=term)
-
-         # In order to prevent a problem of the further separations during reading process.
-         # it is necessary to replace the separators with the other chars.
-         word = word.replace(self.__word_separator, '-')
-
-         return word, word_embedding
-
-     def __compose_from_parts(self, term, do_lowercase=True):
-         # remove empty spaces before and after.
-         term = term.strip()
-
-         # perform lowercasing
-         if do_lowercase:
-             term = term.lower()
-
-         # Calculating vector from term parts
-         count = 0
-         vector = np.zeros(self.__embedding.VectorSize)
-         for word in term.split(self.__word_separator):
-             v, c = self.__create_embedding_for_word(word=word, embedding=self.__embedding)
-             count += c
-             vector = vector + v
-
-         return term, vector / count if count > 0 else vector
-
-     def __get_from_embedding(self, term):
-         return self.__embedding.try_get_related_word(term), self.__embedding[term]
-
-     def __create_embedding_for_word(self, word, embedding):
-         assert(isinstance(word, str))
-         assert(isinstance(embedding, Embedding))
-
-         if word in embedding:
-             return embedding[word], 1
-
-         c_i = 0
-         c_l = self.__max_part_size
-         count = 0
-         vector = np.zeros(embedding.VectorSize)
-         missings = []
-
-         while c_i < len(word):
-
-             if c_l == 0:
-                 missings.append(c_i)
-                 c_i += 1
-                 c_l = self.__max_part_size
-                 continue
-
-             right_b = min(len(word), c_i + c_l)
-
-             s_i = embedding.try_find_index_by_plain_word(word=word[c_i:right_b])
-
-             if s_i is None:
-                 c_l -= 1
-                 continue
-
-             vector += embedding.get_vector_by_index(s_i)
-             c_i += c_l
-             count += 1
-
-         debug = False
-         if debug:
-             w_debug = ''.join(['?' if i in missings else ch
-                                for i, ch in enumerate(word)])
-             logger.debug('Embedded: {}'.format(w_debug).encode('utf-8'))
-
-         return vector, count
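The fallback branch above averages the vectors of greedily matched character pieces. A self-contained sketch of that idea, with a plain dict standing in for the removed Embedding class (the toy vocabulary and vector size are invented):

import numpy as np

vocab = {"мам": np.array([1.0, 0.0]), "а": np.array([0.5, 0.5])}

def compose(word, max_part_size=3):
    # Greedy longest-match over character pieces, averaging whatever was found.
    i, size, count = 0, max_part_size, 0
    vector = np.zeros(2)
    while i < len(word):
        if size == 0:                  # nothing matched at this position: skip one character
            i, size = i + 1, max_part_size
            continue
        piece = word[i:i + size]
        if piece in vocab:
            vector += vocab[piece]
            i, count, size = i + size, count + 1, max_part_size
        else:
            size -= 1                  # try a shorter piece
    return vector / count if count else vector

print(compose("мама"))  # 'мам' then 'а' are matched -> [0.75, 0.25]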
@@ -1,39 +0,0 @@
- import numpy as np
-
- from arekit.contrib.networks.vectorizer import BaseVectorizer
-
-
- class RandomNormalVectorizer(BaseVectorizer):
-
-     def __init__(self, vector_size, token_offset=12345, max_tokens_count=100):
-         assert(isinstance(vector_size, int))
-         self.__vector_size = vector_size
-         self.__seed_token_offset = token_offset
-         self.__max_tokens_count = max_tokens_count
-
-     def create_term_embedding(self, term):
-         """ term: is its index.
-         """
-         embedding = self.__get_random_normal_distribution(
-             vector_size=self.__vector_size,
-             seed=(self.__string_to_int(term) % self.__max_tokens_count) + self.__seed_token_offset,
-             loc=0.05,
-             scale=0.025)
-         return term, embedding
-
-     # region private methods
-
-     def __string_to_int(self, s):
-         # Originally taken from here:
-         # https://stackoverflow.com/questions/2511058/persistent-hashing-of-strings-in-python
-         ord3 = lambda x: '%.3d' % ord(x)
-         return int(''.join(map(ord3, s)))
-
-     @staticmethod
-     def __get_random_normal_distribution(vector_size, seed, loc, scale):
-         assert (isinstance(vector_size, int))
-         assert (isinstance(seed, int))
-         np.random.seed(seed)
-         return np.random.normal(loc=loc, scale=scale, size=vector_size)
-
-     # endregion
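A compact sketch of the trick above without the removed base class: the token string is hashed to a deterministic seed, so each token always receives the same "random" vector (the vector size and offsets below are arbitrary; a local RandomState is used here instead of reseeding the global generator):

import numpy as np

def random_normal_embedding(term, vector_size=5, token_offset=12345, max_tokens_count=100):
    # Deterministic string hash, same idea as the removed __string_to_int helper.
    term_id = int(''.join('%.3d' % ord(ch) for ch in term))
    rng = np.random.RandomState((term_id % max_tokens_count) + token_offset)
    return rng.normal(loc=0.05, scale=0.025, size=vector_size)

# The same token always maps to the same vector.
assert np.allclose(random_normal_embedding("<[URL]>"), random_normal_embedding("<[URL]>"))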