arekit 0.25.0__py3-none-any.whl → 0.25.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/context/terms_mapper.py +5 -2
- arekit/common/data/input/providers/rows/samples.py +8 -12
- arekit/common/data/input/providers/sample/cropped.py +4 -3
- arekit/common/data/input/terms_mapper.py +4 -8
- arekit/common/data/storages/base.py +4 -18
- arekit/common/docs/entities_grouping.py +5 -3
- arekit/common/docs/parsed/base.py +3 -3
- arekit/common/docs/parsed/providers/base.py +3 -5
- arekit/common/docs/parsed/providers/entity_service.py +7 -28
- arekit/common/docs/parsed/providers/opinion_pairs.py +6 -6
- arekit/common/docs/parsed/providers/text_opinion_pairs.py +4 -4
- arekit/common/docs/parsed/service.py +2 -2
- arekit/common/docs/parser.py +3 -30
- arekit/common/model/labeling/single.py +7 -3
- arekit/common/opinions/annot/algo/pair_based.py +9 -5
- arekit/common/pipeline/base.py +0 -2
- arekit/common/pipeline/batching.py +0 -3
- arekit/common/pipeline/items/base.py +1 -1
- arekit/common/utils.py +11 -8
- arekit/contrib/bert/input/providers/cropped_sample.py +2 -5
- arekit/contrib/bert/terms/mapper.py +2 -2
- arekit/contrib/prompt/sample.py +2 -6
- arekit/contrib/utils/bert/samplers.py +4 -2
- arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- arekit/contrib/utils/data/storages/row_cache.py +2 -1
- arekit/contrib/utils/data/storages/sqlite_based.py +2 -1
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +8 -5
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +16 -8
- {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/METADATA +10 -8
- {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/RECORD +34 -115
- {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/WHEEL +1 -1
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +0 -68
- arekit/common/data/input/repositories/sample.py +0 -22
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +0 -26
- arekit/common/experiment/__init__.py +0 -0
- arekit/common/experiment/api/__init__.py +0 -0
- arekit/common/experiment/api/base_samples_io.py +0 -20
- arekit/common/experiment/data_type.py +0 -17
- arekit/common/service/__init__.py +0 -0
- arekit/common/service/sqlite.py +0 -36
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +0 -149
- arekit/contrib/networks/embedding_io.py +0 -18
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +0 -6
- arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit/contrib/networks/input/providers/text.py +0 -24
- arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit/contrib/networks/input/term_types.py +0 -13
- arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit/contrib/networks/vectorizer.py +0 -6
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +0 -7
- arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit/contrib/utils/data/readers/sqlite.py +0 -14
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +0 -50
- arekit/contrib/utils/data/storages/pandas_based.py +0 -123
- arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit/contrib/utils/data/writers/sqlite_native.py +0 -114
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit/contrib/utils/entities/formatters/str_display.py +0 -11
- arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +0 -94
- arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -55
- arekit/contrib/utils/pipelines/items/text/entities_default.py +0 -23
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -33
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -105
- arekit/contrib/utils/pipelines/items/text/translator.py +0 -136
- arekit/contrib/utils/processing/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit/contrib/utils/serializer.py +0 -42
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- {arekit-0.25.0.data → arekit-0.25.2.data}/data/logo.png +0 -0
- {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/LICENSE +0 -0
- {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/top_level.txt +0 -0
arekit/contrib/utils/data/writers/sqlite_native.py
@@ -1,114 +0,0 @@
-import os
-import sqlite3
-from os.path import dirname
-
-from arekit.common.data import const
-from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
-from arekit.contrib.utils.data.writers.base import BaseWriter
-
-
-class SQliteWriter(BaseWriter):
-    """ TODO. This implementation is dedicated for the writing concepts of the data
-        serialization pipeline. However we add the SQLite3 service, it would be
-        right to refactor and utlize some core functionality from the core/service/sqlite.py
-    """
-
-    def __init__(self, table_name="contents", index_column_names=None, skip_existed=False, clear_table=True):
-        """ index_column_names: list or None
-            column names should be considered to build a unique index;
-            if None, the default 'const.ID' will be considered for row indexation.
-        """
-        assert (isinstance(index_column_names, list) or index_column_names is None)
-        self.__index_column_names = index_column_names if index_column_names is not None else [const.ID]
-        self.__table_name = table_name
-        self.__conn = None
-        self.__cur = None
-        self.__need_init_table = True
-        self.__origin_column_names = None
-        self.__skip_existed = skip_existed
-        self.__clear_table = clear_table
-
-    def extension(self):
-        return ".sqlite"
-
-    @staticmethod
-    def __iter_storage_column_names(storage):
-        """ Iter only those columns that existed in storage.
-        """
-        assert (isinstance(storage, RowCacheStorage))
-        for col_name, col_type in zip(storage.iter_column_names(), storage.iter_column_types()):
-            if col_name in storage.RowCache:
-                yield col_name, col_type
-
-    def __init_table(self, column_data):
-        # Compose column name with the related SQLITE type.
-        column_types = ",".join([" ".join([col_name, self.type_to_sqlite(col_type)])
-                                 for col_name, col_type in column_data])
-        # Create table if not exists.
-        self.__cur.execute(f"CREATE TABLE IF NOT EXISTS {self.__table_name}({column_types})")
-        # Table exists, however we may optionally remove the content from it.
-        if self.__clear_table:
-            self.__cur.execute(f"DELETE FROM {self.__table_name};")
-        # Create index.
-        index_name = f"i_{self.__table_name}_id"
-        self.__cur.execute(f"DROP INDEX IF EXISTS {index_name};")
-        self.__cur.execute("CREATE INDEX IF NOT EXISTS {index} ON {table}({columns})".format(
-            index=index_name,
-            table=self.__table_name,
-            columns=", ".join(self.__index_column_names)
-        ))
-        self.__origin_column_names = [col_name for col_name, _ in column_data]
-
-    @staticmethod
-    def type_to_sqlite(col_type):
-        """ This is a simple function that provides conversion from the
-            base numpy types to SQLITE.
-            NOTE: this method represent a quick implementation for supporting
-            types, however it is far away from the generalized implementation.
-        """
-        if isinstance(col_type, str):
-            if 'int' in col_type:
-                return 'INTEGER'
-
-        return "TEXT"
-
-    def open_target(self, target):
-        os.makedirs(dirname(target), exist_ok=True)
-        self.__conn = sqlite3.connect(target)
-        self.__cur = self.__conn.cursor()
-
-    def commit_line(self, storage):
-        assert (isinstance(storage, RowCacheStorage))
-
-        column_data = list(self.__iter_storage_column_names(storage))
-
-        if self.__need_init_table:
-            self.__init_table(column_data)
-            self.__need_init_table = False
-
-        # Check whether the related row is already exist in SQLITE database.
-        row_id = storage.RowCache[const.ID]
-        top_row = self.__cur.execute(f"SELECT EXISTS(SELECT 1 FROM {self.__table_name} WHERE id='{row_id}');")
-        is_exists = top_row.fetchone()[0]
-        if is_exists == 1 and self.__skip_existed:
-            return
-
-        line_data = [storage.RowCache[col_name] for col_name, _ in column_data]
-        parameters = ",".join(["?"] * len(line_data))
-
-        assert (len(self.__origin_column_names) == len(line_data))
-
-        self.__cur.execute(
-            f"INSERT OR REPLACE INTO {self.__table_name} VALUES ({parameters})",
-            tuple(line_data))
-
-        self.__conn.commit()
-
-    def close_target(self):
-        self.__cur = None
-        self.__origin_column_names = None
-        self.__need_init_table = True
-        self.__conn.close()
-
-    def write_all(self, storage, target):
-        pass
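The removed SQliteWriter above reduces to a create-table / build-index / upsert loop over the standard sqlite3 module. The snippet below is a standalone sketch of that pattern, not arekit code: the table name, columns and rows are illustrative, and a UNIQUE index is used here so that INSERT OR REPLACE behaves as an upsert on the id column (the original writer instead checks row existence explicitly).

import sqlite3

# Standalone sketch of the write pattern above; names and rows are made up.
conn = sqlite3.connect(":memory:")
cur = conn.cursor()

# Create the table on first write and (re)build an index over the id column.
columns = [("id", "TEXT"), ("text_a", "TEXT"), ("label", "INTEGER")]
col_spec = ", ".join(f"{name} {sql_type}" for name, sql_type in columns)
cur.execute(f"CREATE TABLE IF NOT EXISTS contents({col_spec})")
# A UNIQUE index makes INSERT OR REPLACE act as an upsert keyed by "id".
cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS i_contents_id ON contents(id)")

# Commit rows one by one with "?" placeholders, as commit_line() does.
rows = [("s0", "first sample", 0), ("s1", "second sample", 1), ("s0", "first sample", 2)]
for row in rows:
    placeholders = ",".join("?" * len(row))
    cur.execute(f"INSERT OR REPLACE INTO contents VALUES ({placeholders})", row)
conn.commit()

print(cur.execute("SELECT * FROM contents ORDER BY id").fetchall())
# [('s0', 'first sample', 2), ('s1', 'second sample', 1)]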
File without changes
arekit/contrib/utils/embeddings/rusvectores.py
@@ -1,58 +0,0 @@
-from arekit.common.text.stemmer import Stemmer
-from arekit.contrib.networks.embedding import Embedding
-
-
-class RusvectoresEmbedding(Embedding):
-    """ Wrapper over models from the following resource.
-        https://rusvectores.org/ru/models/
-
-        NOTE: Usually these are embeddings for texts written in Russian.
-        for the better performance it is expected that we adopt stemmer.
-    """
-
-    def __init__(self, matrix, words, stemmer):
-        assert(isinstance(stemmer, Stemmer) or stemmer is None)
-        super(RusvectoresEmbedding, self).__init__(matrix=matrix, words=words)
-        self.__index_without_pos = self.__create_terms_without_pos()
-        self.__stemmer = stemmer
-        self.__lemmatize_by_default = stemmer is not None
-
-    def try_find_index_by_plain_word(self, word):
-        assert(isinstance(word, str))
-
-        temp = self.__lemmatize_by_default
-        self.__lemmatize_by_default = False
-        index = super(RusvectoresEmbedding, self).try_find_index_by_plain_word(word)
-        self.__lemmatize_by_default = temp
-
-        return index
-
-    def _handler(self, word):
-        return self.__try_find_word_index_pair_lemmatized(word, self.__lemmatize_by_default)
-
-    # region private methods
-
-    def __try_find_word_index_pair_lemmatized(self, term, lemmatize):
-        assert(isinstance(term, str))
-        assert(isinstance(lemmatize, bool))
-
-        if lemmatize:
-            term = self.__stemmer.lemmatize_to_str(term)
-
-        index = self.__index_without_pos[term] \
-            if term in self.__index_without_pos else None
-
-        return term, index
-
-    def __create_terms_without_pos(self):
-        d = {}
-        for word_with_pos, index in self.iter_vocabulary():
-            assert(isinstance(word_with_pos, str))
-            word = word_with_pos.split(u'_')[0]
-            if word in d:
-                continue
-            d[word] = index
-
-        return d
-
-    # endregion
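For context on the class above: RusVectores models typically key their vectors by "lemma_POS" strings, which is why a second index with the POS suffix stripped is built. The snippet below is a standalone sketch of that plain-word lookup, not arekit code, over a made-up toy vocabulary.

# Toy vocabulary in the "lemma_POS" form used by RusVectores models.
vocab = {"печь_NOUN": 0, "печь_VERB": 1, "дом_NOUN": 2}

# Keep the first index seen for every bare word, mirroring __create_terms_without_pos.
index_without_pos = {}
for word_with_pos, index in vocab.items():
    word = word_with_pos.split("_")[0]
    if word not in index_without_pos:
        index_without_pos[word] = index

print(index_without_pos.get("печь"))   # 0 -- lookup without knowing the POS tag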
arekit/contrib/utils/embeddings/tokens.py
@@ -1,30 +0,0 @@
-import numpy as np
-
-from arekit.contrib.networks.embedding import Embedding
-from arekit.contrib.utils.processing.text.tokens import Tokens
-
-
-class TokenEmbedding(Embedding):
-    """ Embedding vectors for text punctuation, based on Tokens in parsed text
-    """
-
-    @classmethod
-    def from_supported_tokens(cls, vector_size, random_vector_func):
-        """
-        random_vector_func: func
-            function with parameters (vector_size, seed)
-        """
-        assert(isinstance(vector_size, int))
-        assert(callable(random_vector_func))
-
-        matrix = []
-        tokens_list = list(Tokens.iter_supported_tokens())
-
-        for token_index, _ in enumerate(tokens_list):
-
-            vector = random_vector_func(vector_size, token_index)
-
-            matrix.append(vector)
-
-        return cls(matrix=np.array(matrix),
-                   words=tokens_list)
arekit/contrib/utils/entities/formatters/str_display.py
@@ -1,11 +0,0 @@
-from arekit.common.entities.base import Entity
-from arekit.common.entities.str_fmt import StringEntitiesFormatter
-
-
-class StringEntitiesDisplayValueFormatter(StringEntitiesFormatter):
-    """ Provides the contents of the DisplayValue property.
-    """
-
-    def to_string(self, original_value, entity_type):
-        assert(isinstance(original_value, Entity))
-        return original_value.DisplayValue
arekit/contrib/utils/io_utils/embedding.py
@@ -1,72 +0,0 @@
-from os.path import join
-
-from arekit.contrib.networks.embedding_io import BaseEmbeddingIO
-from arekit.contrib.utils.io_utils.utils import check_targets_existence
-from arekit.contrib.utils.np_utils.embedding import NpzEmbeddingHelper
-from arekit.contrib.utils.np_utils.vocab import VocabRepositoryUtils
-
-
-class NpEmbeddingIO(BaseEmbeddingIO):
-    """ Npz-based IO utils for embedding and text-based for vocabulary.
-        This format represents a archived version of the numpy math data, i.e. vectors, numbers, etc.
-
-        Provides additional Input/Output paths generation functions for:
-        - embedding matrix;
-        - embedding vocabulary.
-    """
-
-    def __init__(self, target_dir, prefix_name="sample"):
-        assert(isinstance(target_dir, str))
-
-        self.__target_dir = target_dir
-        self.__term_emb_fn_template = "-".join([prefix_name, "term_embedding"])
-        self.__vocab_fn_template = "-".join([prefix_name, "term_embedding"])
-
-    # region Embedding-related data
-
-    def save_vocab(self, data):
-        target = self.__get_default_vocab_filepath()
-        return VocabRepositoryUtils.save(data=data, target=target)
-
-    def load_vocab(self):
-        source = self.___get_vocab_source()
-        return dict(VocabRepositoryUtils.load(source))
-
-    def save_embedding(self, data):
-        target = self.__get_default_embedding_filepath()
-        NpzEmbeddingHelper.save_embedding(data=data, target=target)
-
-    def load_embedding(self):
-        source = self.__get_term_embedding_source()
-        return NpzEmbeddingHelper.load_embedding(source)
-
-    def check_targets_existed(self):
-        targets = [
-            self.__get_default_vocab_filepath(),
-            self.__get_term_embedding_target()
-        ]
-        return check_targets_existence(targets=targets)
-
-    # endregion
-
-    # region embedding-related data
-
-    def ___get_vocab_source(self):
-        """ It is possible to load a predefined embedding from another experiment
-            using the related filepath provided by model_io.
-        """
-        return self.__get_default_vocab_filepath()
-
-    def __get_term_embedding_target(self):
-        return self.__get_default_embedding_filepath()
-
-    def __get_term_embedding_source(self):
-        return self.__get_default_embedding_filepath()
-
-    def __get_default_vocab_filepath(self):
-        return join(self.__target_dir, self.__vocab_fn_template)
-
-    def __get_default_embedding_filepath(self):
-        return join(self.__target_dir, self.__term_emb_fn_template)
-
-    # endregion
File without changes
arekit/contrib/utils/np_utils/embedding.py
@@ -1,22 +0,0 @@
-import logging
-
-from arekit.contrib.utils.np_utils.npz_utils import NpzRepositoryUtils
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-
-
-class NpzEmbeddingHelper:
-
-    @staticmethod
-    def save_embedding(data, target):
-        NpzRepositoryUtils.save(data=data, target=target)
-        logger.info("Saving embedding [size={shape}]: {filepath}".format(shape=data.shape,
-                                                                         filepath=target))
-
-    @staticmethod
-    def load_embedding(source):
-        embedding = NpzRepositoryUtils.load(source)
-        logger.info("Embedding read [size={size}]: {filepath}".format(size=embedding.shape,
-                                                                      filepath=source))
-        return embedding
arekit/contrib/utils/np_utils/vocab.py
@@ -1,20 +0,0 @@
-import logging
-
-import numpy as np
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-
-
-class VocabRepositoryUtils(object):
-
-    @staticmethod
-    def save(data, target):
-        logger.info("Saving vocabulary [size={size}]: {filepath}".format(size=len(data), filepath=target))
-        np.savetxt(target, data, fmt='%s')
-
-    @staticmethod
-    def load(source):
-        vocab = np.loadtxt(source, dtype=str, comments=None)
-        logger.info("Loading vocabulary [size={size}]: {filepath}".format(size=len(vocab), filepath=source))
-        return vocab
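The two helpers above amount to a plain-text round trip of "word index" rows through numpy. Below is a standalone sketch of that round trip, not arekit code; the file name and vocabulary are made up.

import numpy as np

# Save a toy vocabulary as "word index" rows, as VocabRepositoryUtils.save does.
vocab = [("hello", 0), ("world", 1), ("#hash", 2)]
np.savetxt("vocab.txt", vocab, fmt="%s")

# comments=None keeps tokens such as "#hash" from being treated as comment markers.
loaded = np.loadtxt("vocab.txt", dtype=str, comments=None)

# Indices come back as strings, hence the int() cast when rebuilding the mapping.
as_dict = {str(word): int(index) for word, index in loaded}
print(as_dict)   # {'hello': 0, 'world': 1, '#hash': 2}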
File without changes
arekit/contrib/utils/pipelines/items/sampling/base.py
@@ -1,94 +0,0 @@
-from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
-from arekit.common.data.storages.base import BaseRowsStorage
-from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
-from arekit.common.experiment.data_type import DataType
-from arekit.common.pipeline.items.base import BasePipelineItem
-from arekit.contrib.utils.serializer import InputDataSerializationHelper
-
-
-class BaseSerializerPipelineItem(BasePipelineItem):
-
-    def __init__(self, rows_provider, samples_io, save_labels_func, storage, **kwargs):
-        """ sample_rows_formatter:
-                how we format input texts for a BERT model, for example:
-                - single text
-                - two sequences, separated by [SEP] token
-
-            save_labels_func: function
-                data_type -> bool
-        """
-        assert(isinstance(rows_provider, BaseSampleRowProvider))
-        assert(isinstance(samples_io, BaseSamplesIO))
-        assert(callable(save_labels_func))
-        assert(isinstance(storage, BaseRowsStorage))
-        super(BaseSerializerPipelineItem, self).__init__(**kwargs)
-
-        self._rows_provider = rows_provider
-        self._samples_io = samples_io
-        self._save_labels_func = save_labels_func
-        self._storage = storage
-
-    def _serialize_iteration(self, data_type, pipeline, data_folding, doc_ids):
-        assert(isinstance(data_type, DataType))
-        assert(isinstance(pipeline, list))
-        assert(isinstance(data_folding, dict) or data_folding is None)
-        assert(isinstance(doc_ids, list) or doc_ids is None)
-        assert(doc_ids is not None or data_folding is not None)
-
-        repos = {
-            "sample": InputDataSerializationHelper.create_samples_repo(
-                keep_labels=self._save_labels_func(data_type),
-                rows_provider=self._rows_provider,
-                storage=self._storage),
-        }
-
-        writer_and_targets = {
-            "sample": (self._samples_io.Writer,
-                       self._samples_io.create_target(data_type=data_type)),
-        }
-
-        for description, repo in repos.items():
-
-            if data_folding is None:
-                # Consider only the predefined doc_ids.
-                doc_ids_iter = doc_ids
-            else:
-                # Take particular data_type.
-                doc_ids_iter = data_folding[data_type]
-                # Consider only predefined doc_ids.
-                if doc_ids is not None:
-                    doc_ids_iter = set(doc_ids_iter).intersection(doc_ids)
-
-            InputDataSerializationHelper.fill_and_write(
-                repo=repo,
-                pipeline=pipeline,
-                doc_ids_iter=doc_ids_iter,
-                desc="{desc} [{data_type}]".format(desc=description, data_type=data_type),
-                writer=writer_and_targets[description][0],
-                target=writer_and_targets[description][1])
-
-    def _handle_iteration(self, data_type_pipelines, data_folding, doc_ids):
-        """ Performing data serialization for a particular iteration
-        """
-        assert(isinstance(data_type_pipelines, dict))
-        for data_type, pipeline in data_type_pipelines.items():
-            self._serialize_iteration(data_type=data_type, pipeline=pipeline, data_folding=data_folding,
-                                      doc_ids=doc_ids)
-
-    def apply_core(self, input_data, pipeline_ctx):
-        """
-        data_type_pipelines: dict of, for example:
-            {
-                DataType.Train: BasePipeline,
-                DataType.Test: BasePipeline
-            }
-
-        data_type_pipelines: doc_id -> parsed_doc -> annot -> opinion linkages
-            for example, function: sentiment_attitude_extraction_default_pipeline
-        doc_ids: optional
-            this parameter allows to limit amount of documents considered for sampling
-        """
-        assert("data_type_pipelines" in pipeline_ctx)
-        self._handle_iteration(data_type_pipelines=pipeline_ctx.provide("data_type_pipelines"),
-                               doc_ids=pipeline_ctx.provide_or_none("doc_ids"),
-                               data_folding=pipeline_ctx.provide_or_none("data_folding"))
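The branching in _serialize_iteration above is essentially a document-id selection rule per data type. The function below is a standalone sketch of that rule, not arekit code; select_doc_ids and the example split are hypothetical names and values.

def select_doc_ids(data_type, data_folding=None, doc_ids=None):
    # Without a folding, the explicitly passed doc_ids are used as-is.
    assert doc_ids is not None or data_folding is not None
    if data_folding is None:
        return set(doc_ids)
    # With a folding, take the split for the given data_type and optionally
    # narrow it down to the explicitly passed doc_ids.
    selected = set(data_folding[data_type])
    return selected if doc_ids is None else selected.intersection(doc_ids)

print(select_doc_ids("train", data_folding={"train": [0, 1, 2], "test": [3]}, doc_ids=[1, 2, 5]))
# {1, 2}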
arekit/contrib/utils/pipelines/items/sampling/networks.py
@@ -1,55 +0,0 @@
-from arekit.contrib.networks.input.embedding.matrix import create_term_embedding_matrix
-from arekit.contrib.networks.input.embedding.offsets import TermsEmbeddingOffsets
-from arekit.contrib.networks.embedding import Embedding
-from arekit.contrib.networks.input.providers.sample import NetworkSampleRowProvider
-from arekit.contrib.utils.io_utils.embedding import NpEmbeddingIO
-from arekit.contrib.utils.pipelines.items.sampling.base import BaseSerializerPipelineItem
-
-
-class NetworksInputSerializerPipelineItem(BaseSerializerPipelineItem):
-
-    def __init__(self, save_labels_func, rows_provider, samples_io, emb_io, storage, save_embedding=True, **kwargs):
-        """ This pipeline item allows to perform a data preparation for neural network models.
-
-            considering a list of the whole data_types with the related pipelines,
-            which are supported and required in a handler. It is necessary to know
-            data_types in advance as it allows to create a complete vocabulary of input terms,
-            with the related embeddings.
-        """
-        assert(isinstance(emb_io, NpEmbeddingIO))
-        assert(isinstance(rows_provider, NetworkSampleRowProvider))
-        assert(isinstance(save_embedding, bool))
-        super(NetworksInputSerializerPipelineItem, self).__init__(
-            rows_provider=rows_provider,
-            samples_io=samples_io,
-            save_labels_func=save_labels_func,
-            storage=storage,
-            **kwargs)
-
-        self.__emb_io = emb_io
-        self.__save_embedding = save_embedding
-
-    def _handle_iteration(self, data_type_pipelines, data_folding, doc_ids):
-        """ Performing data serialization for a particular iteration
-        """
-        assert(isinstance(data_type_pipelines, dict))
-
-        # Prepare for the present iteration.
-        self._rows_provider.clear_embedding_pairs()
-
-        super(NetworksInputSerializerPipelineItem, self)._handle_iteration(
-            data_type_pipelines=data_type_pipelines, data_folding=data_folding, doc_ids=doc_ids)
-
-        if not (self.__save_embedding and self._rows_provider.HasEmbeddingPairs):
-            return
-
-        # Save embedding information additionally.
-        term_embedding = Embedding.from_word_embedding_pairs_iter(self._rows_provider.iter_term_embedding_pairs())
-        embedding_matrix = create_term_embedding_matrix(term_embedding=term_embedding)
-        vocab = list(TermsEmbeddingOffsets.extract_vocab(words_embedding=term_embedding))
-
-        # Save embedding matrix
-        self.__emb_io.save_embedding(data=embedding_matrix)
-        self.__emb_io.save_vocab(data=vocab)
-
-        del embedding_matrix
arekit/contrib/utils/pipelines/items/text/entities_default.py
@@ -1,23 +0,0 @@
-from arekit.common.entities.base import Entity
-from arekit.common.pipeline.items.base import BasePipelineItem
-
-
-class TextEntitiesParser(BasePipelineItem):
-
-    def __init__(self, **kwargs):
-        super(TextEntitiesParser, self).__init__(**kwargs)
-
-    @staticmethod
-    def __process_word(word):
-        assert(isinstance(word, str))
-
-        # If this is a special word which is related to the [entity] mention.
-        if word[0] == "[" and word[-1] == "]":
-            entity = Entity(value=word[1:-1], e_type=None)
-            return entity
-
-        return word
-
-    def apply_core(self, input_data, pipeline_ctx):
-        assert(isinstance(input_data, list))
-        return [self.__process_word(w) for w in input_data]
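TextEntitiesParser above relies on a simple convention: a token wrapped in square brackets is promoted to an entity, everything else stays a plain string. Below is a standalone sketch of that rule, not arekit code; the tuple stands in for arekit's Entity object.

def process_word(word):
    # Words written as "[value]" denote entity mentions.
    if word.startswith("[") and word.endswith("]"):
        return ("ENTITY", word[1:-1])   # stand-in for Entity(value=..., e_type=None)
    return word

print([process_word(w) for w in "the [USA] signed a deal with the [EU]".split()])
# ['the', ('ENTITY', 'USA'), 'signed', 'a', 'deal', 'with', 'the', ('ENTITY', 'EU')]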
arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py
@@ -1,36 +0,0 @@
-from arekit.common.text.stemmer import Stemmer
-from arekit.contrib.utils.pipelines.items.text.frames import FrameVariantsParser
-from arekit.contrib.utils.processing.languages.ru.mods import RussianLanguageMods
-
-
-class LemmasBasedFrameVariantsParser(FrameVariantsParser):
-
-    def __init__(self, frame_variants, stemmer, locale_mods=RussianLanguageMods, save_lemmas=False, **kwargs):
-        assert(isinstance(stemmer, Stemmer))
-        assert(isinstance(save_lemmas, bool))
-        super(LemmasBasedFrameVariantsParser, self).__init__(frame_variants=frame_variants, **kwargs)
-
-        self.__frame_variants = frame_variants
-        self.__stemmer = stemmer
-        self.__save_lemmas = save_lemmas
-        self.__max_variant_len = max([len(variant) for _, variant in frame_variants.iter_variants()])
-        self.__locale_mods = locale_mods
-
-    def __lemmatize_term(self, term):
-        # we first split onto words for lemmatization and then join all of them.
-        lemma = "".join(self.__stemmer.lemmatize_to_list(term))
-        # then we replace certain chars according to the locale restrictions.
-        return self.__locale_mods.replace_specific_word_chars(lemma)
-
-    def __provide_lemmatized_terms(self, terms):
-        """
-        Compose a list of lemmatized versions of parsed_doc
-        PS: Might be significantly slow, depending on stemmer were used.
-        """
-        assert(isinstance(terms, list))
-        return [self.__lemmatize_term(term) if isinstance(term, str) else term for term in terms]
-
-    def apply_core(self, input_data, pipeline_ctx):
-        lemmas = self.__provide_lemmatized_terms(input_data)
-        processed_it = self._iter_processed(terms=lemmas, origin=lemmas if self.__save_lemmas else input_data)
-        return list(processed_it)
arekit/contrib/utils/pipelines/items/text/frames_negation.py
@@ -1,33 +0,0 @@
-from arekit.common.frames.text_variant import TextFrameVariant
-from arekit.common.pipeline.context import PipelineContext
-from arekit.common.pipeline.items.base import BasePipelineItem
-from arekit.contrib.utils.processing.languages.mods import BaseLanguageMods
-from arekit.contrib.utils.processing.languages.ru.mods import RussianLanguageMods
-
-
-class FrameVariantsSentimentNegation(BasePipelineItem):
-
-    def __init__(self, locale_mods=RussianLanguageMods, **kwargs):
-        assert(issubclass(locale_mods, BaseLanguageMods))
-        super(FrameVariantsSentimentNegation, self).__init__(**kwargs)
-        self._locale_mods = locale_mods
-
-    @staticmethod
-    def __get_preposition(terms, index):
-        return terms[index-1] if index > 0 else None
-
-    def apply_core(self, input_data, pipeline_ctx):
-        assert(isinstance(input_data, list))
-        assert(isinstance(pipeline_ctx, PipelineContext))
-
-        for curr_ind, term in enumerate(input_data):
-
-            if not isinstance(term, TextFrameVariant):
-                continue
-
-            prep_term = self.__get_preposition(terms=input_data, index=curr_ind)
-            is_negated = self._locale_mods.is_negation_word(prep_term) if prep_term is not None else False
-
-            term.set_is_negated(is_negated)
-
-        return input_data
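The negation rule in FrameVariantsSentimentNegation above only inspects the token immediately preceding a frame variant. Below is a standalone sketch of that check, not arekit code; the negation set and the example sentence are made up (the original takes negation words from the language mods class).

NEGATION_WORDS = {"не"}   # Russian "not"

def is_negated(terms, index):
    # A frame term counts as negated when the previous token is a negation word.
    return index > 0 and terms[index - 1] in NEGATION_WORDS

terms = ["он", "не", "одобряет", "решение"]
print(is_negated(terms, terms.index("одобряет")))   # True
print(is_negated(terms, terms.index("решение")))    # False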