arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/data/storages/base.py +4 -15
- arekit/common/docs/entities_grouping.py +2 -1
- arekit/common/docs/parser.py +27 -22
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +39 -2
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
- arekit/common/utils.py +11 -52
- arekit/contrib/utils/data/contents/opinions.py +13 -3
- arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- arekit/contrib/utils/data/storages/pandas_based.py +2 -17
- arekit/contrib/utils/data/storages/row_cache.py +8 -2
- arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
- arekit/contrib/utils/io_utils/utils.py +1 -18
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
- arekit-0.25.1.data/data/logo.png +0 -0
- arekit-0.25.1.dist-info/METADATA +81 -0
- arekit-0.25.1.dist-info/RECORD +186 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +0 -68
- arekit/common/data/input/repositories/sample.py +0 -22
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +0 -26
- arekit/common/docs/objects_parser.py +0 -37
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +0 -149
- arekit/contrib/networks/embedding_io.py +0 -18
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +0 -6
- arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit/contrib/networks/input/providers/text.py +0 -24
- arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit/contrib/networks/input/term_types.py +0 -13
- arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit/contrib/networks/vectorizer.py +0 -6
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -84
- arekit/contrib/source/brat/doc.py +0 -28
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -13
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -74
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/doc.py +0 -51
- arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -59
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/docs_reader.py +0 -51
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -31
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +0 -7
- arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +0 -50
- arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
- arekit/contrib/utils/download.py +0 -77
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit/contrib/utils/io_utils/opinions.py +0 -37
- arekit/contrib/utils/io_utils/samples.py +0 -79
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -41
- arekit/contrib/utils/lexicons/relation.py +0 -42
- arekit/contrib/utils/lexicons/rusentilex.py +0 -37
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
- arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
- arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit/contrib/utils/resources.py +0 -25
- arekit/contrib/utils/serializer.py +0 -43
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- arekit/download_data.py +0 -11
- arekit-0.24.0.dist-info/METADATA +0 -23
- arekit-0.24.0.dist-info/RECORD +0 -374
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
|
@@ -1,149 +0,0 @@
|
|
|
1
|
-
from collections.abc import Iterable
|
|
2
|
-
import numpy as np
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class Embedding(object):
|
|
6
|
-
"""
|
|
7
|
-
Represents default wrapper over W2V API.
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
def __init__(self, matrix, words):
|
|
11
|
-
assert(isinstance(matrix, np.ndarray) and len(matrix.shape) == 2)
|
|
12
|
-
assert(isinstance(words, np.ndarray))
|
|
13
|
-
assert(len(words) == matrix.shape[0])
|
|
14
|
-
self._matrix = matrix
|
|
15
|
-
self.__words = words
|
|
16
|
-
self.__index_by_word = self.__create_index(words)
|
|
17
|
-
|
|
18
|
-
# region properties
|
|
19
|
-
|
|
20
|
-
@property
|
|
21
|
-
def VectorSize(self):
|
|
22
|
-
return self._matrix.shape[1]
|
|
23
|
-
|
|
24
|
-
@property
|
|
25
|
-
def VocabularySize(self):
|
|
26
|
-
return self._matrix.shape[0]
|
|
27
|
-
|
|
28
|
-
# endregion
|
|
29
|
-
|
|
30
|
-
# region classmethods
|
|
31
|
-
|
|
32
|
-
@classmethod
|
|
33
|
-
def from_word_embedding_pairs_iter(cls, word_embedding_pairs):
|
|
34
|
-
assert(isinstance(word_embedding_pairs, Iterable))
|
|
35
|
-
|
|
36
|
-
matrix = []
|
|
37
|
-
words = []
|
|
38
|
-
used = set()
|
|
39
|
-
for word, vector in word_embedding_pairs:
|
|
40
|
-
|
|
41
|
-
if word in used:
|
|
42
|
-
continue
|
|
43
|
-
|
|
44
|
-
used.add(word)
|
|
45
|
-
|
|
46
|
-
matrix.append(vector)
|
|
47
|
-
words.append(word)
|
|
48
|
-
|
|
49
|
-
return cls(matrix=np.array(matrix) if len(matrix) > 0 else np.empty(shape=(0, 0)),
|
|
50
|
-
words=np.array(words))
|
|
51
|
-
|
|
52
|
-
@classmethod
|
|
53
|
-
def from_list_with_embedding_func(cls, words_iter, embedding_func):
|
|
54
|
-
assert(isinstance(words_iter, Iterable))
|
|
55
|
-
assert(callable(embedding_func))
|
|
56
|
-
|
|
57
|
-
matrix = []
|
|
58
|
-
words = []
|
|
59
|
-
used = set()
|
|
60
|
-
for word in words_iter:
|
|
61
|
-
|
|
62
|
-
if word in used:
|
|
63
|
-
continue
|
|
64
|
-
used.add(word)
|
|
65
|
-
|
|
66
|
-
vector = embedding_func(word)
|
|
67
|
-
matrix.append(vector)
|
|
68
|
-
words.append(word)
|
|
69
|
-
|
|
70
|
-
return cls(matrix=np.array(matrix),
|
|
71
|
-
words=words)
|
|
72
|
-
|
|
73
|
-
# endregion
|
|
74
|
-
|
|
75
|
-
# region private methods
|
|
76
|
-
|
|
77
|
-
@staticmethod
|
|
78
|
-
def __create_index(words):
|
|
79
|
-
index = {}
|
|
80
|
-
for i, word in enumerate(words):
|
|
81
|
-
index[word] = i
|
|
82
|
-
return index
|
|
83
|
-
|
|
84
|
-
def __try_find_word_index_pair(self, word):
|
|
85
|
-
"""
|
|
86
|
-
Assumes to pefrom term transformation (optional)
|
|
87
|
-
in order to find a term in an inner vocabulary
|
|
88
|
-
|
|
89
|
-
returns: pair
|
|
90
|
-
(processed_term, index)
|
|
91
|
-
"""
|
|
92
|
-
assert(isinstance(word, str))
|
|
93
|
-
|
|
94
|
-
has_index = self.__index_by_word[word] if word in self.__index_by_word else None
|
|
95
|
-
word = word if has_index else None
|
|
96
|
-
return word, has_index
|
|
97
|
-
|
|
98
|
-
def __hadler_core(self, word):
|
|
99
|
-
"""
|
|
100
|
-
Core word handler.
|
|
101
|
-
Assumes to perform word stripping.
|
|
102
|
-
"""
|
|
103
|
-
stripped_word = word.strip()
|
|
104
|
-
return self._handler(stripped_word)
|
|
105
|
-
|
|
106
|
-
# endregion
|
|
107
|
-
|
|
108
|
-
def iter_vocabulary(self):
|
|
109
|
-
for word in self.__words:
|
|
110
|
-
yield word, self.__index_by_word[word]
|
|
111
|
-
|
|
112
|
-
def get_vector_by_index(self, index):
|
|
113
|
-
assert(isinstance(index, int))
|
|
114
|
-
return self._matrix[index]
|
|
115
|
-
|
|
116
|
-
def get_word_by_index(self, index):
|
|
117
|
-
assert(isinstance(index, int))
|
|
118
|
-
return self.__words[index]
|
|
119
|
-
|
|
120
|
-
def try_find_index_by_word(self, word):
|
|
121
|
-
assert(isinstance(word, str))
|
|
122
|
-
_, index = self.__hadler_core(word)
|
|
123
|
-
return index
|
|
124
|
-
|
|
125
|
-
def try_find_index_by_plain_word(self, word):
|
|
126
|
-
assert(isinstance(word, str))
|
|
127
|
-
_, index = self.__hadler_core(word)
|
|
128
|
-
return index
|
|
129
|
-
|
|
130
|
-
def try_get_related_word(self, word):
|
|
131
|
-
word, _ = self.__hadler_core(word)
|
|
132
|
-
return word
|
|
133
|
-
|
|
134
|
-
def _handler(self, word):
|
|
135
|
-
return self.__try_find_word_index_pair(word)
|
|
136
|
-
|
|
137
|
-
# region overriden methods
|
|
138
|
-
|
|
139
|
-
def __contains__(self, word):
|
|
140
|
-
assert(isinstance(word, str))
|
|
141
|
-
_, index = self.__hadler_core(word)
|
|
142
|
-
return index is not None
|
|
143
|
-
|
|
144
|
-
def __getitem__(self, word):
|
|
145
|
-
assert(isinstance(word, str))
|
|
146
|
-
_, index = self.__hadler_core(word)
|
|
147
|
-
return self._matrix[index]
|
|
148
|
-
|
|
149
|
-
# endregion
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
class BaseEmbeddingIO(object):
|
|
2
|
-
""" API for loading and saving embedding and vocabulary related data.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
def save_vocab(self, data):
|
|
6
|
-
raise NotImplementedError()
|
|
7
|
-
|
|
8
|
-
def load_vocab(self,):
|
|
9
|
-
raise NotImplementedError()
|
|
10
|
-
|
|
11
|
-
def save_embedding(self, data):
|
|
12
|
-
raise NotImplementedError()
|
|
13
|
-
|
|
14
|
-
def load_embedding(self):
|
|
15
|
-
raise NotImplementedError()
|
|
16
|
-
|
|
17
|
-
def check_targets_existed(self):
|
|
18
|
-
raise NotImplementedError()
|
|
File without changes
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.input.providers.label.multiple import MultipleLabelProvider
|
|
2
|
-
from arekit.contrib.utils.processing.pos.base import POSTagger
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class NetworkSerializationContext(object):
|
|
6
|
-
|
|
7
|
-
def __init__(self, labels_scaler, frame_roles_label_scaler, frames_connotation_provider, pos_tagger=None):
|
|
8
|
-
assert(isinstance(pos_tagger, POSTagger) or pos_tagger is None)
|
|
9
|
-
self.__label_provider = MultipleLabelProvider(labels_scaler)
|
|
10
|
-
self.__frame_roles_label_scaler = frame_roles_label_scaler
|
|
11
|
-
self.__frames_connotation_provider = frames_connotation_provider
|
|
12
|
-
self.__pos_tagger = pos_tagger
|
|
13
|
-
|
|
14
|
-
@property
|
|
15
|
-
def LabelProvider(self):
|
|
16
|
-
return self.__label_provider
|
|
17
|
-
|
|
18
|
-
@property
|
|
19
|
-
def FrameRolesLabelScaler(self):
|
|
20
|
-
return self.__frame_roles_label_scaler
|
|
21
|
-
|
|
22
|
-
@property
|
|
23
|
-
def FramesConnotationProvider(self):
|
|
24
|
-
return self.__frames_connotation_provider
|
|
25
|
-
|
|
26
|
-
@property
|
|
27
|
-
def PosTagger(self):
|
|
28
|
-
return self.__pos_tagger
|
|
File without changes
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import numpy as np
|
|
3
|
-
|
|
4
|
-
from arekit.contrib.networks.input.embedding.offsets import TermsEmbeddingOffsets
|
|
5
|
-
from arekit.contrib.networks.embedding import Embedding
|
|
6
|
-
|
|
7
|
-
logger = logging.getLogger(__name__)
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def create_term_embedding_matrix(term_embedding):
|
|
11
|
-
"""
|
|
12
|
-
Compose complete embedding matrix, which includes:
|
|
13
|
-
- word embeddings
|
|
14
|
-
- entity embeddings
|
|
15
|
-
- token embeddings
|
|
16
|
-
|
|
17
|
-
returns: np.ndarray(words_count, embedding_size)
|
|
18
|
-
embedding matrix which includes embedding both for words and
|
|
19
|
-
entities
|
|
20
|
-
"""
|
|
21
|
-
assert(isinstance(term_embedding, Embedding))
|
|
22
|
-
|
|
23
|
-
embedding_offsets = TermsEmbeddingOffsets(words_count=term_embedding.VocabularySize)
|
|
24
|
-
matrix = np.zeros((embedding_offsets.TotalCount, term_embedding.VectorSize))
|
|
25
|
-
|
|
26
|
-
for word, index in term_embedding.iter_vocabulary():
|
|
27
|
-
matrix[embedding_offsets.get_word_index(index)] = term_embedding.get_vector_by_index(index)
|
|
28
|
-
|
|
29
|
-
return matrix
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
|
|
3
|
-
from arekit.contrib.networks.embedding import Embedding
|
|
4
|
-
|
|
5
|
-
logger = logging.getLogger(__name__)
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class TermsEmbeddingOffsets(object):
|
|
9
|
-
"""
|
|
10
|
-
Describes indices distribution within a further TermsEmbedding.
|
|
11
|
-
|
|
12
|
-
All parameters shifted by 1 because of a empty placeholder.
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
def __init__(self, words_count):
|
|
16
|
-
assert(isinstance(words_count, int))
|
|
17
|
-
self.__words_count = words_count
|
|
18
|
-
|
|
19
|
-
# region properties
|
|
20
|
-
|
|
21
|
-
@property
|
|
22
|
-
def TotalCount(self):
|
|
23
|
-
return 1 + self.__words_count
|
|
24
|
-
|
|
25
|
-
# endregion
|
|
26
|
-
|
|
27
|
-
# region 'get' methods
|
|
28
|
-
|
|
29
|
-
@staticmethod
|
|
30
|
-
def get_word_index(index):
|
|
31
|
-
return 1 + index
|
|
32
|
-
|
|
33
|
-
# endregion
|
|
34
|
-
|
|
35
|
-
@staticmethod
|
|
36
|
-
def extract_vocab(words_embedding):
|
|
37
|
-
"""
|
|
38
|
-
returns:
|
|
39
|
-
enumeration of pairs (word, key)
|
|
40
|
-
"""
|
|
41
|
-
|
|
42
|
-
assert(isinstance(words_embedding, Embedding))
|
|
43
|
-
|
|
44
|
-
offsets = TermsEmbeddingOffsets(words_count=words_embedding.VocabularySize)
|
|
45
|
-
|
|
46
|
-
all_words = [(0, 'PADDING')]
|
|
47
|
-
|
|
48
|
-
for word, index in words_embedding.iter_vocabulary():
|
|
49
|
-
assert(isinstance(word, str))
|
|
50
|
-
all_words.append((offsets.get_word_index(index), word))
|
|
51
|
-
|
|
52
|
-
assert(len(all_words) == offsets.TotalCount)
|
|
53
|
-
|
|
54
|
-
for key, word in sorted(all_words, key=lambda item: item[0]):
|
|
55
|
-
yield word, key
|
|
File without changes
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
from arekit.common.context.terms_mapper import TextTermsMapper
|
|
2
|
-
from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
|
|
3
|
-
from arekit.contrib.utils.processing.pos.base import POSTagger
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class PosTermsMapper(TextTermsMapper):
|
|
7
|
-
|
|
8
|
-
def __init__(self, pos_tagger):
|
|
9
|
-
assert(isinstance(pos_tagger, POSTagger))
|
|
10
|
-
self.__pos_tagger = pos_tagger
|
|
11
|
-
|
|
12
|
-
def map_word(self, w_ind, word):
|
|
13
|
-
return self.__pos_tagger.get_term_pos(word)
|
|
14
|
-
|
|
15
|
-
def map_token(self, t_ind, token):
|
|
16
|
-
return PartOfSpeechType.Unknown
|
|
17
|
-
|
|
18
|
-
def map_text_frame_variant(self, fv_ind, text_frame_variant):
|
|
19
|
-
return self.__pos_tagger.get_term_pos(text_frame_variant.Variant.get_value())
|
|
20
|
-
|
|
21
|
-
def map_entity(self, e_ind, entity):
|
|
22
|
-
return PartOfSpeechType.Unknown
|
|
File without changes
|
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
import collections
|
|
2
|
-
|
|
3
|
-
from arekit.common.data.input.providers.label.base import LabelProvider
|
|
4
|
-
from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
|
|
5
|
-
from arekit.common.entities.base import Entity
|
|
6
|
-
from arekit.common.frames.text_variant import TextFrameVariant
|
|
7
|
-
from arekit.common.labels.scaler.sentiment import SentimentLabelScaler
|
|
8
|
-
from arekit.common.docs.parsed.base import ParsedDocument
|
|
9
|
-
from arekit.contrib.networks.input.formatters.pos_mapper import PosTermsMapper
|
|
10
|
-
from arekit.contrib.networks.input import const
|
|
11
|
-
from arekit.contrib.networks.input.providers.term_connotation import extract_uint_frame_variant_connotation
|
|
12
|
-
from arekit.contrib.networks.input.rows_parser import create_nn_val_writer_fmt
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class NetworkSampleRowProvider(BaseSampleRowProvider):
|
|
16
|
-
|
|
17
|
-
def __init__(self,
|
|
18
|
-
label_provider,
|
|
19
|
-
text_provider,
|
|
20
|
-
frames_connotation_provider,
|
|
21
|
-
frame_role_label_scaler,
|
|
22
|
-
term_embedding_pairs=None,
|
|
23
|
-
pos_terms_mapper=None):
|
|
24
|
-
""" term_embedding_pairs: dict or None
|
|
25
|
-
additional structure, utilized to collect all the embedding pairs during the
|
|
26
|
-
rows providing stage.
|
|
27
|
-
"""
|
|
28
|
-
assert(isinstance(label_provider, LabelProvider))
|
|
29
|
-
assert(isinstance(frame_role_label_scaler, SentimentLabelScaler))
|
|
30
|
-
assert(isinstance(pos_terms_mapper, PosTermsMapper) or pos_terms_mapper is None)
|
|
31
|
-
assert(isinstance(term_embedding_pairs, collections.OrderedDict) or term_embedding_pairs is None)
|
|
32
|
-
|
|
33
|
-
super(NetworkSampleRowProvider, self).__init__(label_provider=label_provider,
|
|
34
|
-
text_provider=text_provider)
|
|
35
|
-
|
|
36
|
-
self.__frames_connotation_provider = frames_connotation_provider
|
|
37
|
-
self.__frame_role_label_scaler = frame_role_label_scaler
|
|
38
|
-
self.__pos_terms_mapper = pos_terms_mapper
|
|
39
|
-
self.__term_embedding_pairs = term_embedding_pairs
|
|
40
|
-
self.__nn_val_fmt = create_nn_val_writer_fmt(fmt_type="writer")
|
|
41
|
-
|
|
42
|
-
@property
|
|
43
|
-
def HasEmbeddingPairs(self):
|
|
44
|
-
return self.__term_embedding_pairs is not None
|
|
45
|
-
|
|
46
|
-
def iter_term_embedding_pairs(self):
|
|
47
|
-
""" Provide the contents of the embedded pairs.
|
|
48
|
-
"""
|
|
49
|
-
return iter(self.__term_embedding_pairs.items())
|
|
50
|
-
|
|
51
|
-
def clear_embedding_pairs(self):
|
|
52
|
-
""" Release the contents of the collected embedding pairs.
|
|
53
|
-
"""
|
|
54
|
-
|
|
55
|
-
# Check whether we actually collect embedding pairs.
|
|
56
|
-
if self.__term_embedding_pairs is None:
|
|
57
|
-
return
|
|
58
|
-
|
|
59
|
-
self.__term_embedding_pairs.clear()
|
|
60
|
-
|
|
61
|
-
def _fill_row_core(self, row, text_opinion_linkage, index_in_linked, etalon_label,
|
|
62
|
-
parsed_doc, sentence_ind, s_ind, t_ind):
|
|
63
|
-
assert(isinstance(parsed_doc, ParsedDocument))
|
|
64
|
-
|
|
65
|
-
super(NetworkSampleRowProvider, self)._fill_row_core(
|
|
66
|
-
row=row,
|
|
67
|
-
text_opinion_linkage=text_opinion_linkage,
|
|
68
|
-
index_in_linked=index_in_linked,
|
|
69
|
-
etalon_label=etalon_label,
|
|
70
|
-
parsed_doc=parsed_doc,
|
|
71
|
-
sentence_ind=sentence_ind,
|
|
72
|
-
s_ind=s_ind, t_ind=t_ind)
|
|
73
|
-
|
|
74
|
-
# Extracting list of terms, utilized in further.
|
|
75
|
-
terms_iter, actual_s_ind, actual_t_ind = self._provide_sentence_terms(
|
|
76
|
-
parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
|
|
77
|
-
terms = list(terms_iter)
|
|
78
|
-
|
|
79
|
-
# Compose frame indices.
|
|
80
|
-
uint_frame_inds = list(self.__iter_indices(terms=terms, filter=lambda t: isinstance(t, TextFrameVariant)))
|
|
81
|
-
|
|
82
|
-
# Compose frame connotations.
|
|
83
|
-
uint_frame_connotations = list(
|
|
84
|
-
map(lambda variant: extract_uint_frame_variant_connotation(
|
|
85
|
-
text_frame_variant=variant,
|
|
86
|
-
frames_connotation_provider=self.__frames_connotation_provider,
|
|
87
|
-
three_label_scaler=self.__frame_role_label_scaler),
|
|
88
|
-
[terms[frame_ind] for frame_ind in uint_frame_inds]))
|
|
89
|
-
|
|
90
|
-
vm = {
|
|
91
|
-
const.FrameVariantIndices: uint_frame_inds,
|
|
92
|
-
const.FrameConnotations: uint_frame_connotations,
|
|
93
|
-
const.SynonymSubject: self.__create_synonyms_set(terms=terms, term_ind=actual_s_ind),
|
|
94
|
-
const.SynonymObject: self.__create_synonyms_set(terms=terms, term_ind=actual_t_ind),
|
|
95
|
-
const.PosTags: None if self.__pos_terms_mapper is None else [int(pos_tag) for pos_tag in self.__pos_terms_mapper.iter_mapped(terms)]
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
self._apply_row_data(row=row, vm=vm, val_fmt=self.__nn_val_fmt)
|
|
99
|
-
|
|
100
|
-
# region private methods
|
|
101
|
-
|
|
102
|
-
def __create_synonyms_set(self, terms, term_ind):
|
|
103
|
-
entity = terms[term_ind]
|
|
104
|
-
assert(isinstance(entity, Entity))
|
|
105
|
-
|
|
106
|
-
# Searching for other synonyms among all the terms.
|
|
107
|
-
group_ind = entity.GroupIndex
|
|
108
|
-
it = self.__iter_indices(terms=terms, filter=lambda t: self.__syn_check(term=t, group_ind=group_ind))
|
|
109
|
-
inds_set = set(it)
|
|
110
|
-
|
|
111
|
-
# Guarantee the presence of the term_ind
|
|
112
|
-
inds_set.add(term_ind)
|
|
113
|
-
|
|
114
|
-
return sorted(inds_set)
|
|
115
|
-
|
|
116
|
-
@staticmethod
|
|
117
|
-
def __iter_indices(terms, filter):
|
|
118
|
-
for t_ind, term in enumerate(terms):
|
|
119
|
-
if filter(term):
|
|
120
|
-
yield t_ind
|
|
121
|
-
|
|
122
|
-
def __syn_check(self, term, group_ind):
|
|
123
|
-
if not isinstance(term, Entity):
|
|
124
|
-
return False
|
|
125
|
-
if group_ind is None:
|
|
126
|
-
return False
|
|
127
|
-
return term.GroupIndex == group_ind
|
|
128
|
-
|
|
129
|
-
# endregion
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
from arekit.common.frames.connotations.descriptor import FrameConnotationDescriptor
|
|
2
|
-
from arekit.common.frames.connotations.provider import FrameConnotationProvider
|
|
3
|
-
from arekit.common.frames.text_variant import TextFrameVariant
|
|
4
|
-
from arekit.common.labels.scaler.sentiment import SentimentLabelScaler
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def extract_uint_frame_variant_connotation(text_frame_variant, frames_connotation_provider, three_label_scaler):
|
|
8
|
-
assert (isinstance(text_frame_variant, TextFrameVariant))
|
|
9
|
-
assert (isinstance(frames_connotation_provider, FrameConnotationProvider))
|
|
10
|
-
assert (isinstance(three_label_scaler, SentimentLabelScaler))
|
|
11
|
-
|
|
12
|
-
frame_id = text_frame_variant.Variant.FrameID
|
|
13
|
-
connot_descriptor = frames_connotation_provider.try_provide(frame_id)
|
|
14
|
-
|
|
15
|
-
if connot_descriptor is None:
|
|
16
|
-
return three_label_scaler.label_to_uint(label=three_label_scaler.get_no_label_instance())
|
|
17
|
-
|
|
18
|
-
assert (isinstance(connot_descriptor, FrameConnotationDescriptor))
|
|
19
|
-
|
|
20
|
-
target_label = three_label_scaler.invert_label(connot_descriptor.Label) \
|
|
21
|
-
if text_frame_variant.IsNegated else connot_descriptor.Label
|
|
22
|
-
|
|
23
|
-
return three_label_scaler.label_to_uint(target_label)
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
|
|
2
|
-
from arekit.contrib.networks.input.terms_mapping import VectorizedNetworkTermMapping
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class NetworkSingleTextProvider(BaseSingleTextProvider):
|
|
6
|
-
"""
|
|
7
|
-
Performs iteration process over (string, embedding) term pairs.
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
def __init__(self, text_terms_mapper, pair_handling_func):
|
|
11
|
-
assert(isinstance(text_terms_mapper, VectorizedNetworkTermMapping))
|
|
12
|
-
assert(callable(pair_handling_func))
|
|
13
|
-
super(NetworkSingleTextProvider, self).__init__(text_terms_mapper=text_terms_mapper)
|
|
14
|
-
self.__write_embedding_pair_func = pair_handling_func
|
|
15
|
-
|
|
16
|
-
def _mapped_data_to_str(self, m_data):
|
|
17
|
-
# In this case, m_term consist of
|
|
18
|
-
# 1. Term;
|
|
19
|
-
# 2. Embedding.
|
|
20
|
-
term, _ = m_data
|
|
21
|
-
return term
|
|
22
|
-
|
|
23
|
-
def _handle_mapped_data(self, m_data):
|
|
24
|
-
self.__write_embedding_pair_func(m_data)
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
import arekit.contrib.networks.input.const as const
|
|
2
|
-
from arekit.common.data.rows_fmt import process_indices_list
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
def create_nn_column_formatters(no_value_func=lambda: None, args_sep=","):
    """Build the "writer"/"parser" formatter pair for every list-valued column.

    Args:
        no_value_func: zero-argument callable forwarded unchanged to
            `process_indices_list` (its exact role is defined by that helper).
        args_sep: separator placed between items when writing, and forwarded
            to `process_indices_list` when parsing.

    Returns:
        dict: column name -> {"writer": callable, "parser": callable}, where
        each callable takes a single `value` argument. Writers join the
        `str()` of every item with `args_sep`. Parsers delegate to
        `process_indices_list`; for the FrameVariantIndices and
        FrameConnotations columns a non-string value yields an empty list
        instead.
    """
    assert callable(no_value_func)

    def list_to_str(value):
        return args_sep.join(str(item) for item in value)

    def str_to_list(value):
        return process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)

    def str_to_list_or_empty(value):
        # A new list is created for every non-string value: handing out one
        # shared list object would let a caller that mutates its result
        # silently alter what every later parse returns.
        return str_to_list(value) if isinstance(value, str) else []

    return {
        const.FrameVariantIndices: {
            "writer": list_to_str,
            "parser": str_to_list_or_empty,
        },
        const.FrameConnotations: {
            "writer": list_to_str,
            "parser": str_to_list_or_empty,
        },
        const.SynonymObject: {
            "writer": list_to_str,
            "parser": str_to_list,
        },
        const.SynonymSubject: {
            "writer": list_to_str,
            "parser": str_to_list,
        },
        const.PosTags: {
            "writer": list_to_str,
            "parser": str_to_list,
        },
    }
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def create_nn_val_writer_fmt(fmt_type, args_sep=","):
    """Select one kind of formatter for every column.

    Args:
        fmt_type (str): key to pick from each per-column formatter dict built
            by `create_nn_column_formatters` (that function defines the
            "writer" and "parser" keys).
        args_sep: list separator passed on to `create_nn_column_formatters`.

    Returns:
        dict: column name -> the formatter stored under `fmt_type`.

    Raises:
        KeyError: if a column has no formatter registered under `fmt_type`.
    """
    assert isinstance(fmt_type, str)
    formatters = create_nn_column_formatters(args_sep=args_sep)
    return {column: by_kind[fmt_type] for column, by_kind in formatters.items()}
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
class TermTypes:
    """Kinds of input terms that may appear in the input sequence
    of the neural network models.
    """

    WORD = "word"
    ENTITY = "entity"
    FRAME = "frame"
    TOKEN = "token"

    @staticmethod
    def iter_types():
        """Return a new list of all term types: word, entity, frame, token."""
        ordered = (
            TermTypes.WORD,
            TermTypes.ENTITY,
            TermTypes.FRAME,
            TermTypes.TOKEN,
        )
        return list(ordered)
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
|
|
2
|
-
from arekit.common.entities.base import Entity
|
|
3
|
-
from arekit.common.frames.text_variant import TextFrameVariant
|
|
4
|
-
from arekit.contrib.networks.input.term_types import TermTypes
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class VectorizedNetworkTermMapping(OpinionContainingTextTermsMapper):
    """Terms mapper that routes every term to the vectorizer of its type.

    Per the original note, each mapped element is a (word, embedded vector)
    pair; the actual shape is whatever the vectorizer's
    `create_term_embedding` returns.
    """

    def __init__(self, string_entities_formatter, vectorizers):
        """Set up the mapper.

        Args:
            string_entities_formatter: handed to the base mapper as its
                `entity_formatter`; the resulting entity string is what gets
                vectorized in `map_entity`.
            vectorizers (dict): must contain an entry for every value listed
                by `TermTypes.iter_types()`.
        """
        assert isinstance(vectorizers, dict)

        missing_types = [t for t in TermTypes.iter_types() if t not in vectorizers]
        assert not missing_types

        super().__init__(entity_formatter=string_entities_formatter)

        self.__vectorizers = vectorizers

    def map_term(self, term_type, term):
        """Universal term mapping method.

        Args:
            term_type: one of the TermTypes constants; selects the vectorizer.
            term: the value passed to the vectorizer as `term`.

        Returns:
            The result of the selected vectorizer's `create_term_embedding`.
        """
        vectorizer = self.__vectorizers[term_type]
        return vectorizer.create_term_embedding(term=term)

    def map_word(self, w_ind, word):
        """Vectorize a plain word; `w_ind` is not used here."""
        return self.map_term(TermTypes.WORD, word)

    def map_text_frame_variant(self, fv_ind, text_frame_variant):
        """Vectorize the value of the frame variant; `fv_ind` is not used here."""
        assert isinstance(text_frame_variant, TextFrameVariant)
        variant_value = text_frame_variant.Variant.get_value()
        return self.map_term(TermTypes.FRAME, variant_value)

    def map_token(self, t_ind, token):
        """Vectorize the token value; `t_ind` is not used here.

        It assumes to be composed for all the supported types.
        """
        token_value = token.get_token_value()
        return self.map_term(TermTypes.TOKEN, token_value)

    def map_entity(self, e_ind, entity):
        """Vectorize the entity's string form as produced by the base mapper."""
        assert isinstance(entity, Entity)

        # Value extraction: the base class produces the formatted entity string.
        formatted = super().map_entity(e_ind=e_ind, entity=entity)

        # Vector extraction.
        return self.map_term(TermTypes.ENTITY, formatted)
|
|
File without changes
|
|
File without changes
|