arekit 0.25.0-py3-none-any.whl → 0.25.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. arekit/common/context/terms_mapper.py +5 -2
  2. arekit/common/data/input/providers/rows/samples.py +8 -12
  3. arekit/common/data/input/providers/sample/cropped.py +4 -3
  4. arekit/common/data/input/terms_mapper.py +4 -8
  5. arekit/common/data/storages/base.py +4 -18
  6. arekit/common/docs/entities_grouping.py +5 -3
  7. arekit/common/docs/parsed/base.py +3 -3
  8. arekit/common/docs/parsed/providers/base.py +3 -5
  9. arekit/common/docs/parsed/providers/entity_service.py +7 -28
  10. arekit/common/docs/parsed/providers/opinion_pairs.py +6 -6
  11. arekit/common/docs/parsed/providers/text_opinion_pairs.py +4 -4
  12. arekit/common/docs/parsed/service.py +2 -2
  13. arekit/common/docs/parser.py +3 -30
  14. arekit/common/model/labeling/single.py +7 -3
  15. arekit/common/opinions/annot/algo/pair_based.py +9 -5
  16. arekit/common/pipeline/base.py +0 -2
  17. arekit/common/pipeline/batching.py +0 -3
  18. arekit/common/pipeline/items/base.py +1 -1
  19. arekit/common/utils.py +11 -8
  20. arekit/contrib/bert/input/providers/cropped_sample.py +2 -5
  21. arekit/contrib/bert/terms/mapper.py +2 -2
  22. arekit/contrib/prompt/sample.py +2 -6
  23. arekit/contrib/utils/bert/samplers.py +4 -2
  24. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  25. arekit/contrib/utils/data/storages/row_cache.py +2 -1
  26. arekit/contrib/utils/data/storages/sqlite_based.py +2 -1
  27. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +8 -5
  28. arekit/contrib/utils/pipelines/text_opinion/extraction.py +16 -8
  29. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/METADATA +10 -8
  30. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/RECORD +34 -115
  31. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/WHEEL +1 -1
  32. arekit/common/data/input/repositories/__init__.py +0 -0
  33. arekit/common/data/input/repositories/base.py +0 -68
  34. arekit/common/data/input/repositories/sample.py +0 -22
  35. arekit/common/data/views/__init__.py +0 -0
  36. arekit/common/data/views/samples.py +0 -26
  37. arekit/common/experiment/__init__.py +0 -0
  38. arekit/common/experiment/api/__init__.py +0 -0
  39. arekit/common/experiment/api/base_samples_io.py +0 -20
  40. arekit/common/experiment/data_type.py +0 -17
  41. arekit/common/service/__init__.py +0 -0
  42. arekit/common/service/sqlite.py +0 -36
  43. arekit/contrib/networks/__init__.py +0 -0
  44. arekit/contrib/networks/embedding.py +0 -149
  45. arekit/contrib/networks/embedding_io.py +0 -18
  46. arekit/contrib/networks/input/__init__.py +0 -0
  47. arekit/contrib/networks/input/const.py +0 -6
  48. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  49. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  50. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  51. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  52. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  53. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  54. arekit/contrib/networks/input/providers/__init__.py +0 -0
  55. arekit/contrib/networks/input/providers/sample.py +0 -129
  56. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  57. arekit/contrib/networks/input/providers/text.py +0 -24
  58. arekit/contrib/networks/input/rows_parser.py +0 -47
  59. arekit/contrib/networks/input/term_types.py +0 -13
  60. arekit/contrib/networks/input/terms_mapping.py +0 -60
  61. arekit/contrib/networks/vectorizer.py +0 -6
  62. arekit/contrib/utils/data/readers/__init__.py +0 -0
  63. arekit/contrib/utils/data/readers/base.py +0 -7
  64. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  65. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  66. arekit/contrib/utils/data/readers/sqlite.py +0 -14
  67. arekit/contrib/utils/data/service/__init__.py +0 -0
  68. arekit/contrib/utils/data/service/balance.py +0 -50
  69. arekit/contrib/utils/data/storages/pandas_based.py +0 -123
  70. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  71. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  72. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  73. arekit/contrib/utils/data/writers/sqlite_native.py +0 -114
  74. arekit/contrib/utils/embeddings/__init__.py +0 -0
  75. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  76. arekit/contrib/utils/embeddings/tokens.py +0 -30
  77. arekit/contrib/utils/entities/formatters/str_display.py +0 -11
  78. arekit/contrib/utils/io_utils/embedding.py +0 -72
  79. arekit/contrib/utils/np_utils/__init__.py +0 -0
  80. arekit/contrib/utils/np_utils/embedding.py +0 -22
  81. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  82. arekit/contrib/utils/np_utils/vocab.py +0 -20
  83. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  84. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -94
  85. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -55
  86. arekit/contrib/utils/pipelines/items/text/entities_default.py +0 -23
  87. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  88. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -33
  89. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -105
  90. arekit/contrib/utils/pipelines/items/text/translator.py +0 -136
  91. arekit/contrib/utils/processing/__init__.py +0 -0
  92. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  93. arekit/contrib/utils/processing/languages/mods.py +0 -12
  94. arekit/contrib/utils/processing/languages/pos.py +0 -23
  95. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  96. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  97. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  98. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  99. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  100. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  101. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  102. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  103. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  104. arekit/contrib/utils/processing/pos/base.py +0 -12
  105. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  106. arekit/contrib/utils/processing/pos/russian.py +0 -10
  107. arekit/contrib/utils/processing/text/__init__.py +0 -0
  108. arekit/contrib/utils/processing/text/tokens.py +0 -127
  109. arekit/contrib/utils/serializer.py +0 -42
  110. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  111. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  112. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  113. {arekit-0.25.0.data → arekit-0.25.2.data}/data/logo.png +0 -0
  114. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/LICENSE +0 -0
  115. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/top_level.txt +0 -0
arekit/contrib/utils/processing/text/tokens.py
@@ -1,127 +0,0 @@
- from urllib.parse import urlparse
- from arekit.common.context.token import Token
-
-
- # TODO. Provide the base (BaseTokens) type.
- # TODO. With the related API at BaseTokens.
- class Tokens:
-     """
-     Tokens used to describe non-word text units, such as punctuation,
-     unknown words/chars, smiles, etc.
-     """
-
-     _wrapper = "<[{}]>"
-     COMMA = _wrapper.format(',')
-     SEMICOLON = _wrapper.format(';')
-     COLON = _wrapper.format(':')
-     QUOTE = _wrapper.format('QUOTE')
-     DASH = _wrapper.format('-')
-     LONG_DASH = _wrapper.format('long_dash')
-     DOT = _wrapper.format('.')
-     TRIPLE_DOTS = _wrapper.format('…')
-     EXC_SIGN = _wrapper.format('!')
-     QUESTION_SIGN = _wrapper.format('?')
-     OPEN_BRACKET = _wrapper.format('OPEN_BRACKET')
-     CLOSED_BRACKET = _wrapper.format('CLOSED_BRACKET')
-     NUMBER = _wrapper.format('NUMBER')
-     NEW_LINE = _wrapper.format("NEW_LINE")
-     UNKNOWN_CHAR = _wrapper.format('UNKNOWN_CHAR')
-     UNKNOWN_WORD = _wrapper.format('UNKNOWN_WORD')
-     URL = _wrapper.format("URL")
-
-     __token_mapping = {
-         ',': COMMA,
-         '.': DOT,
-         '…': TRIPLE_DOTS,
-         ':': COLON,
-         ';': SEMICOLON,
-         '-': DASH,
-         '—': LONG_DASH,
-         '?': QUESTION_SIGN,
-         '!': EXC_SIGN,
-         '(': OPEN_BRACKET,
-         ')': CLOSED_BRACKET,
-         '{': OPEN_BRACKET,
-         '}': CLOSED_BRACKET,
-         '[': OPEN_BRACKET,
-         ']': CLOSED_BRACKET,
-         '\n': NEW_LINE,
-         '«': QUOTE,
-         '»': QUOTE,
-         '"': QUOTE,
-     }
-
-     __supported_tokens = {
-         COMMA,
-         SEMICOLON,
-         COLON,
-         QUOTE,
-         DASH,
-         DOT,
-         LONG_DASH,
-         TRIPLE_DOTS,
-         EXC_SIGN,
-         QUESTION_SIGN,
-         OPEN_BRACKET,
-         CLOSED_BRACKET,
-         NUMBER,
-         URL,
-         NEW_LINE,
-         UNKNOWN_CHAR,
-         UNKNOWN_WORD}
-
-     @staticmethod
-     def try_create(subterm):
-         """
-         Try to create a token from the given 'subterm' parameter.
-         subterm: str
-             I.e. a term ending, meaning a part of the original term.
-         """
-         assert(isinstance(subterm, str))
-         if subterm not in Tokens.__token_mapping:
-             return None
-         return Token(term=subterm, token_value=Tokens.__token_mapping[subterm])
-
-     @staticmethod
-     def try_parse(term):
-         assert(isinstance(term, str))
-         for origin, token_value in Tokens.__token_mapping.items():
-             if term == token_value:
-                 return Token(term=origin, token_value=token_value)
-
-     @staticmethod
-     def try_create_number(term):
-         assert(isinstance(term, str))
-         if not term.isdigit():
-             return None
-         return Token(term=term, token_value=Tokens.NUMBER)
-
-     @staticmethod
-     def try_create_url(term):
-         assert(isinstance(term, str))
-         result = urlparse(term)
-         is_correct = result.scheme and result.netloc and result.path
-         if not is_correct:
-             return None
-         return Token(term=term, token_value=Tokens.URL)
-
-     @staticmethod
-     def is_token(term):
-         assert(isinstance(term, str))
-         return term in Tokens.__supported_tokens
-
-     @staticmethod
-     def iter_chars_by_token(term):
-         """
-         Iterate through the chars that are related to the given token value.
-         """
-         assert(isinstance(term, str))
-         for char, token in Tokens.__token_mapping.items():
-             if term == token:
-                 yield char
-
-     @staticmethod
-     def iter_supported_tokens():
-         for token in Tokens.__supported_tokens:
-             yield token
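
For orientation, a minimal usage sketch of the removed `Tokens` API (the inputs below are illustrative; the `try_*` helpers return a `Token` instance on success and `None` otherwise):

```python
from arekit.contrib.utils.processing.text.tokens import Tokens  # present in 0.25.0 only

# Single punctuation chars map onto wrapped token values such as "<[,]>".
assert Tokens.try_create(',') is not None
assert Tokens.try_create('x') is None            # ordinary chars yield no token

# Digit-only terms become NUMBER; URLs need scheme, netloc and path.
assert Tokens.try_create_number('2024') is not None
assert Tokens.try_create_url('https://example.com/page') is not None

# is_token() recognizes token *values*, not the raw chars behind them.
assert Tokens.is_token(Tokens.COMMA) and not Tokens.is_token(',')
```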
arekit/contrib/utils/serializer.py
@@ -1,42 +0,0 @@
- import logging
-
- from collections.abc import Iterable
-
- from arekit.common.data.input.providers.columns.sample import SampleColumnsProvider
- from arekit.common.data.input.providers.rows.base import BaseRowProvider
- from arekit.common.data.input.repositories.base import BaseInputRepository
- from arekit.common.data.input.repositories.sample import BaseInputSamplesRepository
- from arekit.common.data.storages.base import BaseRowsStorage
- from arekit.contrib.utils.data.contents.opinions import InputTextOpinionProvider
-
- logger = logging.getLogger(__name__)
- logging.basicConfig(level=logging.INFO)
-
-
- class InputDataSerializationHelper(object):
-
-     @staticmethod
-     def create_samples_repo(keep_labels, rows_provider, storage):
-         assert(isinstance(rows_provider, BaseRowProvider))
-         assert(isinstance(keep_labels, bool))
-         assert(isinstance(storage, BaseRowsStorage))
-         return BaseInputSamplesRepository(
-             columns_provider=SampleColumnsProvider(store_labels=keep_labels),
-             rows_provider=rows_provider,
-             storage=storage)
-
-     @staticmethod
-     def fill_and_write(pipeline, repo, target, writer, doc_ids_iter, desc=""):
-         assert(isinstance(pipeline, list))
-         assert(isinstance(doc_ids_iter, Iterable))
-         assert(isinstance(repo, BaseInputRepository))
-
-         doc_ids = list(doc_ids_iter)
-
-         repo.populate(contents_provider=InputTextOpinionProvider(pipeline),
-                       doc_ids=doc_ids,
-                       desc=desc,
-                       writer=writer,
-                       target=target)
-
-         repo.push(writer=writer, target=target)
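
The removed helper wrapped a two-phase flow: populate a repository from a contents provider, then push the buffered rows to the target via a writer. A self-contained toy analogue of that pattern, assuming nothing from the AREkit API (every name below is an illustrative placeholder):

```python
# Toy analogue of the populate-then-push pattern used above.
class ToyRepo:
    def __init__(self):
        self.rows = []

    def populate(self, contents_provider, doc_ids):
        # Phase 1: buffer the rows produced for every requested document.
        for doc_id in doc_ids:
            self.rows.extend(contents_provider(doc_id))

    def push(self, writer, target):
        # Phase 2: flush the buffered rows to the target in one shot.
        writer(target, self.rows)

repo = ToyRepo()
repo.populate(contents_provider=lambda d: ["doc-%d-sample" % d], doc_ids=[0, 1])
repo.push(writer=lambda tgt, rows: print(tgt, rows), target="out/samples")
```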
arekit/contrib/utils/vectorizers/bpe.py
@@ -1,93 +0,0 @@
- import numpy as np
-
- from arekit.common.log_utils import logger
- from arekit.contrib.networks.embedding import Embedding
- from arekit.contrib.networks.vectorizer import BaseVectorizer
-
-
- class BPEVectorizer(BaseVectorizer):
-     """ Embedding algorithm based on parts (trigrams originally).
-     """
-
-     def __init__(self, embedding, max_part_size=3, word_separator=' '):
-         assert(isinstance(embedding, Embedding))
-         assert(isinstance(max_part_size, int))
-         self.__embedding = embedding
-         self.__max_part_size = max_part_size
-         self.__word_separator = word_separator
-
-     def create_term_embedding(self, term):
-         """ Note: returns a modified term value as the `word` element of the returned pair.
-         """
-         assert(isinstance(term, str))
-
-         word, word_embedding = self.__get_from_embedding(term=term) \
-             if term in self.__embedding else self.__compose_from_parts(term=term)
-
-         # In order to prevent a problem of further separations during the reading
-         # process, it is necessary to replace the separators with other chars.
-         word = word.replace(self.__word_separator, '-')
-
-         return word, word_embedding
-
-     def __compose_from_parts(self, term, do_lowercase=True):
-         # Remove empty spaces before and after.
-         term = term.strip()
-
-         # Perform lowercasing.
-         if do_lowercase:
-             term = term.lower()
-
-         # Calculate the vector from term parts.
-         count = 0
-         vector = np.zeros(self.__embedding.VectorSize)
-         for word in term.split(self.__word_separator):
-             v, c = self.__create_embedding_for_word(word=word, embedding=self.__embedding)
-             count += c
-             vector = vector + v
-
-         return term, vector / count if count > 0 else vector
-
-     def __get_from_embedding(self, term):
-         return self.__embedding.try_get_related_word(term), self.__embedding[term]
-
-     def __create_embedding_for_word(self, word, embedding):
-         assert(isinstance(word, str))
-         assert(isinstance(embedding, Embedding))
-
-         if word in embedding:
-             return embedding[word], 1
-
-         c_i = 0                         # current char index within the word
-         c_l = self.__max_part_size     # current part (window) length to try
-         count = 0
-         vector = np.zeros(embedding.VectorSize)
-         missings = []
-
-         while c_i < len(word):
-
-             if c_l == 0:
-                 # No part matched at this position: mark it and skip one char.
-                 missings.append(c_i)
-                 c_i += 1
-                 c_l = self.__max_part_size
-                 continue
-
-             right_b = min(len(word), c_i + c_l)
-
-             s_i = embedding.try_find_index_by_plain_word(word=word[c_i:right_b])
-
-             if s_i is None:
-                 # Shrink the window and retry at the same position.
-                 c_l -= 1
-                 continue
-
-             vector += embedding.get_vector_by_index(s_i)
-             c_i += c_l
-             count += 1
-
-         debug = False
-         if debug:
-             w_debug = ''.join(['?' if i in missings else ch
-                                for i, ch in enumerate(word)])
-             logger.debug('Embedded: {}'.format(w_debug).encode('utf-8'))
-
-         return vector, count
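
The core of `__create_embedding_for_word` is a greedy shrinking-window lookup over character n-grams. A self-contained sketch of the same technique, with a plain dict standing in for the `Embedding` class (all vocabulary entries below are made up):

```python
import numpy as np

vocab = {"cat": np.ones(4), "dog": np.full(4, 2.0), "do": np.full(4, 0.5)}

def embed_by_parts(word, vocab, max_part_size=3, dim=4):
    c_i, c_l = 0, max_part_size       # char index, current window length
    vector, count = np.zeros(dim), 0
    while c_i < len(word):
        if c_l == 0:                  # nothing matched at this position: skip a char
            c_i, c_l = c_i + 1, max_part_size
            continue
        part = word[c_i:min(len(word), c_i + c_l)]
        if part not in vocab:         # shrink the window and retry
            c_l -= 1
            continue
        vector += vocab[part]         # accumulate the matched part's vector
        c_i += c_l                    # advance past the matched part
        count += 1
    return vector / count if count else vector

print(embed_by_parts("dogcatx", vocab))  # averages the "dog" and "cat" vectors
```

Note one quirk preserved from the original: the window length is not reset to `max_part_size` after a successful match, so a shrunken window carries over to the next position.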
arekit/contrib/utils/vectorizers/random_norm.py
@@ -1,39 +0,0 @@
- import numpy as np
-
- from arekit.contrib.networks.vectorizer import BaseVectorizer
-
-
- class RandomNormalVectorizer(BaseVectorizer):
-
-     def __init__(self, vector_size, token_offset=12345, max_tokens_count=100):
-         assert(isinstance(vector_size, int))
-         self.__vector_size = vector_size
-         self.__seed_token_offset = token_offset
-         self.__max_tokens_count = max_tokens_count
-
-     def create_term_embedding(self, term):
-         """ term: used as the source of a deterministic random seed.
-         """
-         embedding = self.__get_random_normal_distribution(
-             vector_size=self.__vector_size,
-             seed=(self.__string_to_int(term) % self.__max_tokens_count) + self.__seed_token_offset,
-             loc=0.05,
-             scale=0.025)
-         return term, embedding
-
-     # region private methods
-
-     def __string_to_int(self, s):
-         # Originally taken from here:
-         # https://stackoverflow.com/questions/2511058/persistent-hashing-of-strings-in-python
-         ord3 = lambda x: '%.3d' % ord(x)
-         return int(''.join(map(ord3, s)))
-
-     @staticmethod
-     def __get_random_normal_distribution(vector_size, seed, loc, scale):
-         assert(isinstance(vector_size, int))
-         assert(isinstance(seed, int))
-         np.random.seed(seed)
-         return np.random.normal(loc=loc, scale=scale, size=vector_size)
-
-     # endregion
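
The seeding trick is the interesting part: the term is hashed into an integer by concatenating zero-padded character ordinals, so repeated runs yield identical vectors. A runnable distillation (parameter values copied from the class defaults above):

```python
import numpy as np

def string_to_int(s):
    # Each char becomes a zero-padded 3-digit ordinal; the concatenation
    # is parsed as one integer, giving a persistent string hash.
    return int(''.join('%.3d' % ord(ch) for ch in s))

def term_vector(term, vector_size=8, token_offset=12345, max_tokens=100):
    np.random.seed((string_to_int(term) % max_tokens) + token_offset)
    return np.random.normal(loc=0.05, scale=0.025, size=vector_size)

# Identical terms always map to identical vectors.
assert np.array_equal(term_vector("<[NUMBER]>"), term_vector("<[NUMBER]>"))
```

Since the hash is taken modulo `max_tokens_count`, only that many distinct seeds exist, so collisions between different terms are expected by design.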