arekit 0.25.0__py3-none-any.whl → 0.25.2__py3-none-any.whl
This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- arekit/common/context/terms_mapper.py +5 -2
- arekit/common/data/input/providers/rows/samples.py +8 -12
- arekit/common/data/input/providers/sample/cropped.py +4 -3
- arekit/common/data/input/terms_mapper.py +4 -8
- arekit/common/data/storages/base.py +4 -18
- arekit/common/docs/entities_grouping.py +5 -3
- arekit/common/docs/parsed/base.py +3 -3
- arekit/common/docs/parsed/providers/base.py +3 -5
- arekit/common/docs/parsed/providers/entity_service.py +7 -28
- arekit/common/docs/parsed/providers/opinion_pairs.py +6 -6
- arekit/common/docs/parsed/providers/text_opinion_pairs.py +4 -4
- arekit/common/docs/parsed/service.py +2 -2
- arekit/common/docs/parser.py +3 -30
- arekit/common/model/labeling/single.py +7 -3
- arekit/common/opinions/annot/algo/pair_based.py +9 -5
- arekit/common/pipeline/base.py +0 -2
- arekit/common/pipeline/batching.py +0 -3
- arekit/common/pipeline/items/base.py +1 -1
- arekit/common/utils.py +11 -8
- arekit/contrib/bert/input/providers/cropped_sample.py +2 -5
- arekit/contrib/bert/terms/mapper.py +2 -2
- arekit/contrib/prompt/sample.py +2 -6
- arekit/contrib/utils/bert/samplers.py +4 -2
- arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- arekit/contrib/utils/data/storages/row_cache.py +2 -1
- arekit/contrib/utils/data/storages/sqlite_based.py +2 -1
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +8 -5
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +16 -8
- {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/METADATA +10 -8
- {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/RECORD +34 -115
- {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/WHEEL +1 -1
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +0 -68
- arekit/common/data/input/repositories/sample.py +0 -22
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +0 -26
- arekit/common/experiment/__init__.py +0 -0
- arekit/common/experiment/api/__init__.py +0 -0
- arekit/common/experiment/api/base_samples_io.py +0 -20
- arekit/common/experiment/data_type.py +0 -17
- arekit/common/service/__init__.py +0 -0
- arekit/common/service/sqlite.py +0 -36
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +0 -149
- arekit/contrib/networks/embedding_io.py +0 -18
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +0 -6
- arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit/contrib/networks/input/providers/text.py +0 -24
- arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit/contrib/networks/input/term_types.py +0 -13
- arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit/contrib/networks/vectorizer.py +0 -6
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +0 -7
- arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit/contrib/utils/data/readers/sqlite.py +0 -14
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +0 -50
- arekit/contrib/utils/data/storages/pandas_based.py +0 -123
- arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit/contrib/utils/data/writers/sqlite_native.py +0 -114
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit/contrib/utils/entities/formatters/str_display.py +0 -11
- arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +0 -94
- arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -55
- arekit/contrib/utils/pipelines/items/text/entities_default.py +0 -23
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -33
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -105
- arekit/contrib/utils/pipelines/items/text/translator.py +0 -136
- arekit/contrib/utils/processing/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit/contrib/utils/serializer.py +0 -42
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- {arekit-0.25.0.data → arekit-0.25.2.data}/data/logo.png +0 -0
- {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/LICENSE +0 -0
- {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/top_level.txt +0 -0
arekit/contrib/utils/processing/text/tokens.py (deleted)

```diff
@@ -1,127 +0,0 @@
-from urllib.parse import urlparse
-from arekit.common.context.token import Token
-
-
-# TODO. Provide the base (BaseTokens) type.
-# TODO. With the related API at BaseTokens.
-class Tokens:
-    """
-    Tokens used to describe a non-word text units, such as punctuation,
-    uknown words/chars, smiles, etc.
-    """
-
-    _wrapper = "<[{}]>"
-    COMMA = _wrapper.format(',')
-    SEMICOLON = _wrapper.format(';')
-    COLON = _wrapper.format(':')
-    QUOTE = _wrapper.format('QUOTE')
-    DASH = _wrapper.format('-')
-    LONG_DASH = _wrapper.format('long_dash')
-    DOT = _wrapper.format('.')
-    TRIPLE_DOTS = _wrapper.format('…')
-    EXC_SIGN = _wrapper.format('!')
-    QUESTION_SIGN = _wrapper.format('?')
-    OPEN_BRACKET = _wrapper.format('OPEN_BRACKET')
-    CLOSED_BRACKET = _wrapper.format('CLOSED_BRACKET')
-    NUMBER = _wrapper.format('NUMBER')
-    NEW_LINE = _wrapper.format("NEW_LINE")
-    UNKNOWN_CHAR = _wrapper.format('UNKNOWN_CHAR')
-    UNKNOWN_WORD = _wrapper.format('UNKNOWN_WORD')
-    URL = _wrapper.format("URL")
-
-    __token_mapping = {
-        ',': COMMA,
-        '.': DOT,
-        '…': TRIPLE_DOTS,
-        ':': COLON,
-        ';': SEMICOLON,
-        '-': DASH,
-        '—': LONG_DASH,
-        '?': QUESTION_SIGN,
-        '!': EXC_SIGN,
-        '(': OPEN_BRACKET,
-        ')': CLOSED_BRACKET,
-        '{': OPEN_BRACKET,
-        '}': CLOSED_BRACKET,
-        '[': OPEN_BRACKET,
-        ']': CLOSED_BRACKET,
-        '\n': NEW_LINE,
-        '«': QUOTE,
-        '»': QUOTE,
-        '"': QUOTE,
-    }
-
-    __supported_tokens = {
-        COMMA,
-        SEMICOLON,
-        COLON,
-        QUOTE,
-        DASH,
-        DOT,
-        LONG_DASH,
-        TRIPLE_DOTS,
-        EXC_SIGN,
-        QUESTION_SIGN,
-        OPEN_BRACKET,
-        CLOSED_BRACKET,
-        NUMBER,
-        URL,
-        NEW_LINE,
-        UNKNOWN_CHAR,
-        UNKNOWN_WORD}
-
-    @staticmethod
-    def try_create(subterm):
-        """
-        Trying create a token by given 'term' parameter
-        subterm: unicode
-            I.e. term ending, so means a part of original term
-        """
-        assert(isinstance(subterm, str))
-        if subterm not in Tokens.__token_mapping:
-            return None
-        return Token(term=subterm, token_value=Tokens.__token_mapping[subterm])
-
-    @staticmethod
-    def try_parse(term):
-        assert(isinstance(term, str))
-        for origin, token_value in Tokens.__token_mapping.items():
-            if term == token_value:
-                return Token(term=origin, token_value=token_value)
-
-    @staticmethod
-    def try_create_number(term):
-        assert(isinstance(term, str))
-        if not term.isdigit():
-            return None
-        return Token(term=term, token_value=Tokens.NUMBER)
-
-    @staticmethod
-    def try_create_url(term):
-        assert(isinstance(term, str))
-        result = urlparse(term)
-        is_correct = result.scheme and result.netloc and result.path
-        if not is_correct:
-            return None
-        return Token(term=term, token_value=Tokens.URL)
-
-    @staticmethod
-    def is_token(term):
-        assert(isinstance(term, str))
-        return term in Tokens.__supported_tokens
-
-    @staticmethod
-    def iter_chars_by_token(term):
-        """
-        Iterate through charts that is related to term
-        token: char
-        """
-        assert(isinstance(term, str))
-        for char, token in Tokens.__token_mapping.items():
-            if term == token:
-                yield char
-
-    @staticmethod
-    def iter_supported_tokens():
-        for token in Tokens.__supported_tokens:
-            yield token
```
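For reference, a minimal usage sketch of the removed `Tokens` API, mirroring the static methods in the hunk above. It assumes an environment pinned to arekit==0.25.0, where this module still exists.

```python
# Hedged sketch: requires arekit==0.25.0, where this module is still present.
from arekit.contrib.utils.processing.text.tokens import Tokens

# Punctuation sub-terms map onto wrapped placeholders such as "<[,]>".
assert Tokens.try_create(',') is not None
assert Tokens.try_create('abc') is None            # not a known punctuation sub-term

# Digit-only and URL-like terms get dedicated placeholders.
assert Tokens.try_create_number('2024') is not None
assert Tokens.try_create_number('20x4') is None
assert Tokens.try_create_url('https://example.com/page') is not None

# is_token() recognizes the wrapped placeholder values themselves, not raw chars.
assert Tokens.is_token(Tokens.COMMA)
assert not Tokens.is_token(',')
```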
arekit/contrib/utils/serializer.py (deleted)

```diff
@@ -1,42 +0,0 @@
-import logging
-
-from collections.abc import Iterable
-
-from arekit.common.data.input.providers.columns.sample import SampleColumnsProvider
-from arekit.common.data.input.providers.rows.base import BaseRowProvider
-from arekit.common.data.input.repositories.base import BaseInputRepository
-from arekit.common.data.input.repositories.sample import BaseInputSamplesRepository
-from arekit.common.data.storages.base import BaseRowsStorage
-from arekit.contrib.utils.data.contents.opinions import InputTextOpinionProvider
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-
-
-class InputDataSerializationHelper(object):
-
-    @staticmethod
-    def create_samples_repo(keep_labels, rows_provider, storage):
-        assert(isinstance(rows_provider, BaseRowProvider))
-        assert(isinstance(keep_labels, bool))
-        assert(isinstance(storage, BaseRowsStorage))
-        return BaseInputSamplesRepository(
-            columns_provider=SampleColumnsProvider(store_labels=keep_labels),
-            rows_provider=rows_provider,
-            storage=storage)
-
-    @staticmethod
-    def fill_and_write(pipeline, repo, target, writer, doc_ids_iter, desc=""):
-        assert(isinstance(pipeline, list))
-        assert(isinstance(doc_ids_iter, Iterable))
-        assert(isinstance(repo, BaseInputRepository))
-
-        doc_ids = list(doc_ids_iter)
-
-        repo.populate(contents_provider=InputTextOpinionProvider(pipeline),
-                      doc_ids=doc_ids,
-                      desc=desc,
-                      writer=writer,
-                      target=target)
-
-        repo.push(writer=writer, target=target)
```
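The removed helper encapsulated a two-phase flow: `populate` the repository's storage from a contents provider, then `push` it through a writer to a target. A toy, self-contained sketch of that flow follows; `ToyRepository` and `ToyWriter` are illustrative stand-ins, not arekit's API.

```python
# Toy stand-ins illustrating the populate-then-push flow the removed
# helper wrapped; illustrative only, not arekit classes.
class ToyWriter:
    def write(self, rows, target):
        with open(target, "w") as f:
            f.writelines(row + "\n" for row in rows)

class ToyRepository:
    def __init__(self):
        self._rows = []

    def populate(self, contents_provider, doc_ids):
        # Phase 1: fill the in-memory storage from the provider.
        self._rows = [contents_provider(doc_id) for doc_id in doc_ids]

    def push(self, writer, target):
        # Phase 2: flush the accumulated rows to the target.
        writer.write(self._rows, target)

repo = ToyRepository()
repo.populate(contents_provider=lambda i: f"doc-{i}", doc_ids=[0, 1, 2])
repo.push(writer=ToyWriter(), target="samples.txt")
```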
arekit/contrib/utils/vectorizers/__init__.py (file without changes)
arekit/contrib/utils/vectorizers/bpe.py (deleted)

```diff
@@ -1,93 +0,0 @@
-import numpy as np
-
-from arekit.common.log_utils import logger
-from arekit.contrib.networks.embedding import Embedding
-from arekit.contrib.networks.vectorizer import BaseVectorizer
-
-
-class BPEVectorizer(BaseVectorizer):
-    """ Embedding algorithm based on parts (trigrams originally)
-    """
-
-    def __init__(self, embedding, max_part_size=3, word_separator=' '):
-        assert(isinstance(embedding, Embedding))
-        assert(isinstance(max_part_size, int))
-        self.__embedding = embedding
-        self.__max_part_size = max_part_size
-        self.__word_separator = word_separator
-
-    def create_term_embedding(self, term):
-        """ Note: returns modified term value in a form of the `word` returning parameter.
-        """
-        assert(isinstance(term, str))
-
-        word, word_embedding = self.__get_from_embedding(term=term) \
-            if term in self.__embedding else self.__compose_from_parts(term=term)
-
-        # In order to prevent a problem of the further separations during reading process.
-        # it is necessary to replace the separators with the other chars.
-        word = word.replace(self.__word_separator, '-')
-
-        return word, word_embedding
-
-    def __compose_from_parts(self, term, do_lowercase=True):
-        # remove empty spaces before and after.
-        term = term.strip()
-
-        # perform lowercasing
-        if do_lowercase:
-            term = term.lower()
-
-        # Calculating vector from term parts
-        count = 0
-        vector = np.zeros(self.__embedding.VectorSize)
-        for word in term.split(self.__word_separator):
-            v, c = self.__create_embedding_for_word(word=word, embedding=self.__embedding)
-            count += c
-            vector = vector + v
-
-        return term, vector / count if count > 0 else vector
-
-    def __get_from_embedding(self, term):
-        return self.__embedding.try_get_related_word(term), self.__embedding[term]
-
-    def __create_embedding_for_word(self, word, embedding):
-        assert(isinstance(word, str))
-        assert(isinstance(embedding, Embedding))
-
-        if word in embedding:
-            return embedding[word], 1
-
-        c_i = 0
-        c_l = self.__max_part_size
-        count = 0
-        vector = np.zeros(embedding.VectorSize)
-        missings = []
-
-        while c_i < len(word):
-
-            if c_l == 0:
-                missings.append(c_i)
-                c_i += 1
-                c_l = self.__max_part_size
-                continue
-
-            right_b = min(len(word), c_i + c_l)
-
-            s_i = embedding.try_find_index_by_plain_word(word=word[c_i:right_b])
-
-            if s_i is None:
-                c_l -= 1
-                continue
-
-            vector += embedding.get_vector_by_index(s_i)
-            c_i += c_l
-            count += 1
-
-        debug = False
-        if debug:
-            w_debug = ''.join(['?' if i in missings else ch
-                               for i, ch in enumerate(word)])
-            logger.debug('Embedded: {}'.format(w_debug).encode('utf-8'))
-
-        return vector, count
```
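The core of `__create_embedding_for_word` above is a greedy sliding-window decomposition: try a window of up to `max_part_size` characters, shrink it on a miss, skip one character once the window collapses, and average the vectors of the parts that were found. A self-contained re-illustration of that loop, with a plain dict standing in for arekit's `Embedding` class:

```python
import numpy as np

# A plain dict stands in for arekit's Embedding class; 4-dim toy vectors.
PARTS = {"cat": np.ones(4), "er": np.full(4, 2.0), "s": np.full(4, 3.0)}

def embed_by_parts(word, parts, max_part_size=3, dim=4):
    """Greedy decomposition mirroring the removed method: shrink the
    window on a miss, skip a char when it collapses, average found parts."""
    c_i, c_l, count = 0, max_part_size, 0
    vector = np.zeros(dim)
    while c_i < len(word):
        if c_l == 0:
            c_i += 1                 # nothing matched at this offset: skip char
            c_l = max_part_size
            continue
        part = word[c_i:c_i + c_l]   # slice clamps at the word end
        if part in parts:
            vector += parts[part]
            c_i += c_l               # advance past the matched window
            count += 1
        else:
            c_l -= 1                 # shrink the window and retry
    return vector / count if count else vector

print(embed_by_parts("caters", PARTS))  # averages "cat", "er", "s" -> [2. 2. 2. 2.]
```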
arekit/contrib/utils/vectorizers/random_norm.py (deleted)

```diff
@@ -1,39 +0,0 @@
-import numpy as np
-
-from arekit.contrib.networks.vectorizer import BaseVectorizer
-
-
-class RandomNormalVectorizer(BaseVectorizer):
-
-    def __init__(self, vector_size, token_offset=12345, max_tokens_count=100):
-        assert(isinstance(vector_size, int))
-        self.__vector_size = vector_size
-        self.__seed_token_offset = token_offset
-        self.__max_tokens_count = max_tokens_count
-
-    def create_term_embedding(self, term):
-        """ term: is its index.
-        """
-        embedding = self.__get_random_normal_distribution(
-            vector_size=self.__vector_size,
-            seed=(self.__string_to_int(term) % self.__max_tokens_count) + self.__seed_token_offset,
-            loc=0.05,
-            scale=0.025)
-        return term, embedding
-
-    # region private methods
-
-    def __string_to_int(self, s):
-        # Originally taken from here:
-        # https://stackoverflow.com/questions/2511058/persistent-hashing-of-strings-in-python
-        ord3 = lambda x: '%.3d' % ord(x)
-        return int(''.join(map(ord3, s)))
-
-    @staticmethod
-    def __get_random_normal_distribution(vector_size, seed, loc, scale):
-        assert (isinstance(vector_size, int))
-        assert (isinstance(seed, int))
-        np.random.seed(seed)
-        return np.random.normal(loc=loc, scale=scale, size=vector_size)
-
-    # endregion
```
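The vectorizer above is deterministic per term: the term is hashed to a seed (modulo `max_tokens_count`, so distinct terms may collide), and the seed fixes the sampled normal vector. A standalone sketch of that property, re-implementing the two private helpers rather than importing arekit:

```python
import numpy as np

def string_to_int(s):
    # Same persistent string hash as the removed class: zero-padded ordinals.
    return int(''.join('%.3d' % ord(ch) for ch in s))

def term_embedding(term, vector_size=5, token_offset=12345, max_tokens_count=100):
    # Seed the generator from the term, then sample a fixed normal vector.
    np.random.seed((string_to_int(term) % max_tokens_count) + token_offset)
    return np.random.normal(loc=0.05, scale=0.025, size=vector_size)

# Same term -> same seed -> same vector on every run.
assert np.array_equal(term_embedding("<[URL]>"), term_embedding("<[URL]>"))
```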
{arekit-0.25.0.data → arekit-0.25.2.data}/data/logo.png (file without changes)
{arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/LICENSE (file without changes)
{arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/top_level.txt (file without changes)