arekit 0.25.0__py3-none-any.whl → 0.25.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. arekit/common/data/storages/base.py +4 -15
  2. arekit/common/docs/parser.py +3 -30
  3. arekit/common/pipeline/items/base.py +1 -1
  4. arekit/common/utils.py +11 -8
  5. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  6. arekit/contrib/utils/data/storages/pandas_based.py +2 -17
  7. arekit/contrib/utils/data/storages/row_cache.py +2 -1
  8. arekit/contrib/utils/data/storages/sqlite_based.py +2 -1
  9. arekit/contrib/utils/pipelines/text_opinion/extraction.py +5 -4
  10. {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/METADATA +4 -5
  11. {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/RECORD +15 -88
  12. arekit/common/data/input/repositories/__init__.py +0 -0
  13. arekit/common/data/input/repositories/base.py +0 -68
  14. arekit/common/data/input/repositories/sample.py +0 -22
  15. arekit/common/data/views/__init__.py +0 -0
  16. arekit/common/data/views/samples.py +0 -26
  17. arekit/common/service/__init__.py +0 -0
  18. arekit/common/service/sqlite.py +0 -36
  19. arekit/contrib/networks/__init__.py +0 -0
  20. arekit/contrib/networks/embedding.py +0 -149
  21. arekit/contrib/networks/embedding_io.py +0 -18
  22. arekit/contrib/networks/input/__init__.py +0 -0
  23. arekit/contrib/networks/input/const.py +0 -6
  24. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  25. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  26. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  27. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  28. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  29. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  30. arekit/contrib/networks/input/providers/__init__.py +0 -0
  31. arekit/contrib/networks/input/providers/sample.py +0 -129
  32. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  33. arekit/contrib/networks/input/providers/text.py +0 -24
  34. arekit/contrib/networks/input/rows_parser.py +0 -47
  35. arekit/contrib/networks/input/term_types.py +0 -13
  36. arekit/contrib/networks/input/terms_mapping.py +0 -60
  37. arekit/contrib/networks/vectorizer.py +0 -6
  38. arekit/contrib/utils/data/readers/__init__.py +0 -0
  39. arekit/contrib/utils/data/readers/base.py +0 -7
  40. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  41. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  42. arekit/contrib/utils/data/readers/sqlite.py +0 -14
  43. arekit/contrib/utils/data/service/__init__.py +0 -0
  44. arekit/contrib/utils/data/service/balance.py +0 -50
  45. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  46. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  47. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  48. arekit/contrib/utils/data/writers/sqlite_native.py +0 -114
  49. arekit/contrib/utils/embeddings/__init__.py +0 -0
  50. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  51. arekit/contrib/utils/embeddings/tokens.py +0 -30
  52. arekit/contrib/utils/io_utils/embedding.py +0 -72
  53. arekit/contrib/utils/np_utils/__init__.py +0 -0
  54. arekit/contrib/utils/np_utils/embedding.py +0 -22
  55. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  56. arekit/contrib/utils/np_utils/vocab.py +0 -20
  57. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  58. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -94
  59. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -55
  60. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  61. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -33
  62. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -105
  63. arekit/contrib/utils/pipelines/items/text/translator.py +0 -136
  64. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  65. arekit/contrib/utils/processing/languages/mods.py +0 -12
  66. arekit/contrib/utils/processing/languages/pos.py +0 -23
  67. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  68. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  69. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  70. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  71. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  72. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  73. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  74. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  75. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  76. arekit/contrib/utils/processing/pos/base.py +0 -12
  77. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  78. arekit/contrib/utils/processing/pos/russian.py +0 -10
  79. arekit/contrib/utils/processing/text/__init__.py +0 -0
  80. arekit/contrib/utils/processing/text/tokens.py +0 -127
  81. arekit/contrib/utils/serializer.py +0 -42
  82. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  83. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  84. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  85. {arekit-0.25.0.data → arekit-0.25.1.data}/data/logo.png +0 -0
  86. {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
  87. {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +0 -0
  88. {arekit-0.25.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
@@ -1,36 +0,0 @@
1
- import sqlite3
2
-
3
-
4
class SQLiteProvider(object):
    """ Writes (uid, data) pairs into an SQLite table, creating the table
        and an index over the ID column on demand.
        Pairs whose ID is already present in the table are skipped.
    """

    @staticmethod
    def write(data_it, target, data2col_func, table_name, columns, sqlite3_column_types,
              id_column_name="id", id_column_type="TEXT"):
        """ data_it: iterable of (uid, data) pairs.
            target: path to the SQLite database file.
            data2col_func: maps `data` onto the list of column values.
            columns / sqlite3_column_types: parallel lists that describe the schema.
        """
        assert(callable(data2col_func))
        assert(isinstance(columns, list))
        assert(isinstance(sqlite3_column_types, list))
        assert(len(columns) == len(sqlite3_column_types))

        with sqlite3.connect(target) as con:
            cur = con.cursor()

            # Provide the ID column.
            columns = [id_column_name] + columns
            sqlite3_column_types = [id_column_type] + sqlite3_column_types

            # Compose the whole columns list.
            content = ", ".join([" ".join(item) for item in zip(columns, sqlite3_column_types)])
            cur.execute(f"CREATE TABLE IF NOT EXISTS {table_name}({content})")
            cur.execute(f"CREATE INDEX IF NOT EXISTS i_id ON {table_name}({id_column_name})")

            # Placeholders are loop-invariant; compose them once.
            params = ", ".join(['?'] * len(columns))

            for uid, data in data_it:
                # BUGFIX: use a parameterized query instead of interpolating
                # `uid` into the SQL text; the f-string version broke on IDs
                # containing quotes and was open to SQL injection.
                r = cur.execute(f"SELECT EXISTS(SELECT 1 FROM {table_name} WHERE {id_column_name}=?);",
                                (str(uid),))
                if r.fetchone()[0] == 1:
                    continue

                cur.execute(f"INSERT INTO {table_name} VALUES ({params})", [str(uid)] + data2col_func(data))
                con.commit()

            cur.close()
File without changes
@@ -1,149 +0,0 @@
1
- from collections.abc import Iterable
2
- import numpy as np
3
-
4
-
5
class Embedding(object):
    """
    Represents default wrapper over W2V API.
    Stores a (vocabulary_size, vector_size) matrix together with the word
    list and a word -> row-index lookup table.
    """

    def __init__(self, matrix, words):
        assert(isinstance(matrix, np.ndarray) and len(matrix.shape) == 2)
        assert(isinstance(words, np.ndarray))
        assert(len(words) == matrix.shape[0])
        self._matrix = matrix
        self.__words = words
        self.__index_by_word = self.__create_index(words)

    # region properties

    @property
    def VectorSize(self):
        return self._matrix.shape[1]

    @property
    def VocabularySize(self):
        return self._matrix.shape[0]

    # endregion

    # region classmethods

    @classmethod
    def from_word_embedding_pairs_iter(cls, word_embedding_pairs):
        """ Builds an embedding from (word, vector) pairs.
            Duplicated words are kept only once (first occurrence wins).
        """
        assert(isinstance(word_embedding_pairs, Iterable))

        matrix = []
        words = []
        used = set()
        for word, vector in word_embedding_pairs:

            if word in used:
                continue

            used.add(word)

            matrix.append(vector)
            words.append(word)

        return cls(matrix=np.array(matrix) if len(matrix) > 0 else np.empty(shape=(0, 0)),
                   words=np.array(words))

    @classmethod
    def from_list_with_embedding_func(cls, words_iter, embedding_func):
        """ Builds an embedding by applying `embedding_func` to every
            unique word of `words_iter`.
        """
        assert(isinstance(words_iter, Iterable))
        assert(callable(embedding_func))

        matrix = []
        words = []
        used = set()
        for word in words_iter:

            if word in used:
                continue
            used.add(word)

            vector = embedding_func(word)
            matrix.append(vector)
            words.append(word)

        # BUGFIX: `words` must be wrapped into an np.ndarray, otherwise the
        # constructor assertion (isinstance(words, np.ndarray)) always failed.
        return cls(matrix=np.array(matrix) if len(matrix) > 0 else np.empty(shape=(0, 0)),
                   words=np.array(words))

    # endregion

    # region private methods

    @staticmethod
    def __create_index(words):
        index = {}
        for i, word in enumerate(words):
            index[word] = i
        return index

    def __try_find_word_index_pair(self, word):
        """
        Assumes to perform term transformation (optional)
        in order to find a term in an inner vocabulary

        returns: pair
            (processed_term, index)
        """
        assert(isinstance(word, str))

        index = self.__index_by_word.get(word, None)
        # BUGFIX: compare against None explicitly; index 0 is a valid hit
        # and must not be treated as "missing" (the old truthiness check
        # dropped the very first vocabulary word).
        word = word if index is not None else None
        return word, index

    def __handler_core(self, word):
        """
        Core word handler.
        Assumes to perform word stripping.
        """
        stripped_word = word.strip()
        return self._handler(stripped_word)

    # endregion

    def iter_vocabulary(self):
        for word in self.__words:
            yield word, self.__index_by_word[word]

    def get_vector_by_index(self, index):
        assert(isinstance(index, int))
        return self._matrix[index]

    def get_word_by_index(self, index):
        assert(isinstance(index, int))
        return self.__words[index]

    def try_find_index_by_word(self, word):
        assert(isinstance(word, str))
        _, index = self.__handler_core(word)
        return index

    def try_find_index_by_plain_word(self, word):
        assert(isinstance(word, str))
        _, index = self.__handler_core(word)
        return index

    def try_get_related_word(self, word):
        word, _ = self.__handler_core(word)
        return word

    def _handler(self, word):
        # Subclasses may override to apply extra term transformations.
        return self.__try_find_word_index_pair(word)

    # region overriden methods

    def __contains__(self, word):
        assert(isinstance(word, str))
        _, index = self.__handler_core(word)
        return index is not None

    def __getitem__(self, word):
        assert(isinstance(word, str))
        _, index = self.__handler_core(word)
        return self._matrix[index]

    # endregion
@@ -1,18 +0,0 @@
1
class BaseEmbeddingIO(object):
    """ Interface that covers saving and loading of embeddings
        and the vocabularies related to them.
    """

    def save_vocab(self, data):
        """ Persist the vocabulary `data`. """
        raise NotImplementedError()

    def load_vocab(self):
        """ Restore a previously saved vocabulary. """
        raise NotImplementedError()

    def save_embedding(self, data):
        """ Persist the embedding `data`. """
        raise NotImplementedError()

    def load_embedding(self):
        """ Restore a previously saved embedding. """
        raise NotImplementedError()

    def check_targets_existed(self):
        """ Whether the expected targets are already present. """
        raise NotImplementedError()
File without changes
@@ -1,6 +0,0 @@
1
# Additional input columns, provided on top of the base sample columns
# by the network sample row provider.
FrameVariantIndices = "frames"            # indices of frame-variant terms within the terms sequence
FrameConnotations = "frame_connots_uint"  # uint-encoded connotation per frame variant above
SynonymObject = "syn_objs"                # indices of terms synonymous to the opinion object
SynonymSubject = "syn_subjs"              # indices of terms synonymous to the opinion subject
PosTags = "pos_tags"                      # int-encoded part-of-speech tag per term
@@ -1,28 +0,0 @@
1
- from arekit.common.data.input.providers.label.multiple import MultipleLabelProvider
2
- from arekit.contrib.utils.processing.pos.base import POSTagger
3
-
4
-
5
class NetworkSerializationContext(object):
    """ Aggregates everything required to serialize network input:
        a label provider, the frame-roles label scaler, the frames
        connotation provider, and an optional POS tagger.
    """

    def __init__(self, labels_scaler, frame_roles_label_scaler, frames_connotation_provider, pos_tagger=None):
        assert(pos_tagger is None or isinstance(pos_tagger, POSTagger))
        self.__label_provider = MultipleLabelProvider(labels_scaler)
        self.__frame_roles_label_scaler = frame_roles_label_scaler
        self.__frames_connotation_provider = frames_connotation_provider
        self.__pos_tagger = pos_tagger

    @property
    def PosTagger(self):
        return self.__pos_tagger

    @property
    def FramesConnotationProvider(self):
        return self.__frames_connotation_provider

    @property
    def FrameRolesLabelScaler(self):
        return self.__frame_roles_label_scaler

    @property
    def LabelProvider(self):
        return self.__label_provider
File without changes
@@ -1,29 +0,0 @@
1
- import logging
2
- import numpy as np
3
-
4
- from arekit.contrib.networks.input.embedding.offsets import TermsEmbeddingOffsets
5
- from arekit.contrib.networks.embedding import Embedding
6
-
7
- logger = logging.getLogger(__name__)
8
-
9
-
10
def create_term_embedding_matrix(term_embedding):
    """
    Compose complete embedding matrix, which includes:
        - word embeddings
        - entity embeddings
        - token embeddings

    returns: np.ndarray(words_count, embedding_size)
        embedding matrix which includes embedding both for words and
        entities
    """
    assert(isinstance(term_embedding, Embedding))

    offsets = TermsEmbeddingOffsets(words_count=term_embedding.VocabularySize)

    # Row 0 stays zero (placeholder); every vocabulary vector is copied
    # into its shifted row.
    matrix = np.zeros((offsets.TotalCount, term_embedding.VectorSize))
    for _, index in term_embedding.iter_vocabulary():
        matrix[offsets.get_word_index(index)] = term_embedding.get_vector_by_index(index)

    return matrix
@@ -1,55 +0,0 @@
1
- import logging
2
-
3
- from arekit.contrib.networks.embedding import Embedding
4
-
5
- logger = logging.getLogger(__name__)
6
-
7
-
8
class TermsEmbeddingOffsets(object):
    """
    Describes how indices are distributed within a further TermsEmbedding.

    Every index is shifted by 1 in order to keep slot 0 for an empty
    placeholder.
    """

    def __init__(self, words_count):
        assert(isinstance(words_count, int))
        self.__words_count = words_count

    # region properties

    @property
    def TotalCount(self):
        # One extra slot for the placeholder.
        return self.__words_count + 1

    # endregion

    # region 'get' methods

    @staticmethod
    def get_word_index(index):
        # Shifted by one; slot 0 is the placeholder.
        return index + 1

    # endregion

    @staticmethod
    def extract_vocab(words_embedding):
        """
        returns:
            enumeration of pairs (word, key), ordered by key.
        """
        assert(isinstance(words_embedding, Embedding))

        offsets = TermsEmbeddingOffsets(words_count=words_embedding.VocabularySize)

        indexed_words = [(0, 'PADDING')]
        for word, index in words_embedding.iter_vocabulary():
            assert(isinstance(word, str))
            indexed_words.append((offsets.get_word_index(index), word))

        assert(len(indexed_words) == offsets.TotalCount)

        indexed_words.sort(key=lambda pair: pair[0])
        for key, word in indexed_words:
            yield word, key
File without changes
@@ -1,22 +0,0 @@
1
- from arekit.common.context.terms_mapper import TextTermsMapper
2
- from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
3
- from arekit.contrib.utils.processing.pos.base import POSTagger
4
-
5
-
6
class PosTermsMapper(TextTermsMapper):
    """ Maps every term of a text onto a part-of-speech tag:
        words and frame variants are tagged by the supplied tagger,
        while tokens and entities always map to Unknown.
    """

    def __init__(self, pos_tagger):
        assert(isinstance(pos_tagger, POSTagger))
        self.__pos_tagger = pos_tagger

    def map_word(self, w_ind, word):
        return self.__pos_tagger.get_term_pos(word)

    def map_text_frame_variant(self, fv_ind, text_frame_variant):
        return self.__pos_tagger.get_term_pos(text_frame_variant.Variant.get_value())

    def map_token(self, t_ind, token):
        return PartOfSpeechType.Unknown

    def map_entity(self, e_ind, entity):
        return PartOfSpeechType.Unknown
File without changes
@@ -1,129 +0,0 @@
1
- import collections
2
-
3
- from arekit.common.data.input.providers.label.base import LabelProvider
4
- from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
5
- from arekit.common.entities.base import Entity
6
- from arekit.common.frames.text_variant import TextFrameVariant
7
- from arekit.common.labels.scaler.sentiment import SentimentLabelScaler
8
- from arekit.common.docs.parsed.base import ParsedDocument
9
- from arekit.contrib.networks.input.formatters.pos_mapper import PosTermsMapper
10
- from arekit.contrib.networks.input import const
11
- from arekit.contrib.networks.input.providers.term_connotation import extract_uint_frame_variant_connotation
12
- from arekit.contrib.networks.input.rows_parser import create_nn_val_writer_fmt
13
-
14
-
15
class NetworkSampleRowProvider(BaseSampleRowProvider):
    """ Extends the base sample row provider with network-specific columns:
        frame-variant indices, their uint connotations, synonym index sets
        for the opinion subject/object, and (optionally) POS tags.
    """

    def __init__(self,
                 label_provider,
                 text_provider,
                 frames_connotation_provider,
                 frame_role_label_scaler,
                 term_embedding_pairs=None,
                 pos_terms_mapper=None):
        """ term_embedding_pairs: dict or None
            additional structure, utilized to collect all the embedding pairs during the
            rows providing stage.
        """
        assert(isinstance(label_provider, LabelProvider))
        assert(isinstance(frame_role_label_scaler, SentimentLabelScaler))
        assert(isinstance(pos_terms_mapper, PosTermsMapper) or pos_terms_mapper is None)
        assert(isinstance(term_embedding_pairs, collections.OrderedDict) or term_embedding_pairs is None)

        super(NetworkSampleRowProvider, self).__init__(label_provider=label_provider,
                                                       text_provider=text_provider)

        self.__frames_connotation_provider = frames_connotation_provider
        self.__frame_role_label_scaler = frame_role_label_scaler
        self.__pos_terms_mapper = pos_terms_mapper
        self.__term_embedding_pairs = term_embedding_pairs
        # Column-value formatters used when writing the extra columns.
        self.__nn_val_fmt = create_nn_val_writer_fmt(fmt_type="writer")

    @property
    def HasEmbeddingPairs(self):
        return self.__term_embedding_pairs is not None

    def iter_term_embedding_pairs(self):
        """ Provide the contents of the embedded pairs.
        """
        return iter(self.__term_embedding_pairs.items())

    def clear_embedding_pairs(self):
        """ Release the contents of the collected embedding pairs.
        """

        # Check whether we actually collect embedding pairs.
        if self.__term_embedding_pairs is None:
            return

        self.__term_embedding_pairs.clear()

    def _fill_row_core(self, row, text_opinion_linkage, index_in_linked, etalon_label,
                       parsed_doc, sentence_ind, s_ind, t_ind):
        """ Fills the base columns via super(), then appends the
            network-specific columns described in `const`.
        """
        assert(isinstance(parsed_doc, ParsedDocument))

        super(NetworkSampleRowProvider, self)._fill_row_core(
            row=row,
            text_opinion_linkage=text_opinion_linkage,
            index_in_linked=index_in_linked,
            etalon_label=etalon_label,
            parsed_doc=parsed_doc,
            sentence_ind=sentence_ind,
            s_ind=s_ind, t_ind=t_ind)

        # Extracting list of terms, utilized in further.
        terms_iter, actual_s_ind, actual_t_ind = self._provide_sentence_terms(
            parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
        terms = list(terms_iter)

        # Compose frame indices.
        uint_frame_inds = list(self.__iter_indices(terms=terms, filter=lambda t: isinstance(t, TextFrameVariant)))

        # Compose frame connotations.
        uint_frame_connotations = list(
            map(lambda variant: extract_uint_frame_variant_connotation(
                text_frame_variant=variant,
                frames_connotation_provider=self.__frames_connotation_provider,
                three_label_scaler=self.__frame_role_label_scaler),
                [terms[frame_ind] for frame_ind in uint_frame_inds]))

        # Values for the extra columns; POS tags are emitted only when a
        # POS terms mapper has been supplied.
        vm = {
            const.FrameVariantIndices: uint_frame_inds,
            const.FrameConnotations: uint_frame_connotations,
            const.SynonymSubject: self.__create_synonyms_set(terms=terms, term_ind=actual_s_ind),
            const.SynonymObject: self.__create_synonyms_set(terms=terms, term_ind=actual_t_ind),
            const.PosTags: None if self.__pos_terms_mapper is None else [int(pos_tag) for pos_tag in self.__pos_terms_mapper.iter_mapped(terms)]
        }

        self._apply_row_data(row=row, vm=vm, val_fmt=self.__nn_val_fmt)

    # region private methods

    def __create_synonyms_set(self, terms, term_ind):
        """ Returns the sorted indices of all entities that share the
            synonym group of the entity located at `term_ind`.
        """
        entity = terms[term_ind]
        assert(isinstance(entity, Entity))

        # Searching for other synonyms among all the terms.
        group_ind = entity.GroupIndex
        it = self.__iter_indices(terms=terms, filter=lambda t: self.__syn_check(term=t, group_ind=group_ind))
        inds_set = set(it)

        # Guarantee the presence of the term_ind
        inds_set.add(term_ind)

        return sorted(inds_set)

    @staticmethod
    def __iter_indices(terms, filter):
        # Yields positions of terms accepted by `filter`.
        for t_ind, term in enumerate(terms):
            if filter(term):
                yield t_ind

    def __syn_check(self, term, group_ind):
        # An entity belongs to the synonyms set only when it carries the
        # same (non-None) group index.
        if not isinstance(term, Entity):
            return False
        if group_ind is None:
            return False
        return term.GroupIndex == group_ind

    # endregion
@@ -1,23 +0,0 @@
1
- from arekit.common.frames.connotations.descriptor import FrameConnotationDescriptor
2
- from arekit.common.frames.connotations.provider import FrameConnotationProvider
3
- from arekit.common.frames.text_variant import TextFrameVariant
4
- from arekit.common.labels.scaler.sentiment import SentimentLabelScaler
5
-
6
-
7
def extract_uint_frame_variant_connotation(text_frame_variant, frames_connotation_provider, three_label_scaler):
    """ Returns the uint-encoded connotation label of the given frame
        variant. Falls back to the neutral ("no") label when the provider
        knows nothing about the frame, and inverts the label when the
        variant is negated.
    """
    assert (isinstance(text_frame_variant, TextFrameVariant))
    assert (isinstance(frames_connotation_provider, FrameConnotationProvider))
    assert (isinstance(three_label_scaler, SentimentLabelScaler))

    descriptor = frames_connotation_provider.try_provide(text_frame_variant.Variant.FrameID)

    # Unknown frame: encode the neutral label.
    if descriptor is None:
        return three_label_scaler.label_to_uint(label=three_label_scaler.get_no_label_instance())

    assert (isinstance(descriptor, FrameConnotationDescriptor))

    # Negated variants flip the connotation.
    if text_frame_variant.IsNegated:
        target_label = three_label_scaler.invert_label(descriptor.Label)
    else:
        target_label = descriptor.Label

    return three_label_scaler.label_to_uint(target_label)
@@ -1,24 +0,0 @@
1
- from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
2
- from arekit.contrib.networks.input.terms_mapping import VectorizedNetworkTermMapping
3
-
4
-
5
class NetworkSingleTextProvider(BaseSingleTextProvider):
    """
    Performs iteration process over (string, embedding) term pairs.
    Every mapped pair is additionally forwarded into the user-supplied
    handler (e.g. to accumulate the embedding vectors).
    """

    def __init__(self, text_terms_mapper, pair_handling_func):
        assert(isinstance(text_terms_mapper, VectorizedNetworkTermMapping))
        assert(callable(pair_handling_func))
        super(NetworkSingleTextProvider, self).__init__(text_terms_mapper=text_terms_mapper)
        self.__write_embedding_pair_func = pair_handling_func

    def _mapped_data_to_str(self, m_data):
        # m_data is a (term, embedding) pair; only the term part
        # contributes to the textual representation.
        term, _ = m_data
        return term

    def _handle_mapped_data(self, m_data):
        self.__write_embedding_pair_func(m_data)
@@ -1,47 +0,0 @@
1
- import arekit.contrib.networks.input.const as const
2
- from arekit.common.data.rows_fmt import process_indices_list
3
-
4
-
5
def create_nn_column_formatters(no_value_func=lambda: None, args_sep=","):
    """ Returns {column: {"writer": func, "parser": func}} converters for
        every network-specific input column.
    """
    assert(callable(no_value_func))

    empty_list = []

    def str_to_list(value):
        return process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)

    def list_to_str(inds_iter):
        return args_sep.join([str(i) for i in inds_iter])

    def str_to_list_or_empty(value):
        # Non-string cells (e.g. missed values) are treated as empty.
        return str_to_list(value) if isinstance(value, str) else empty_list

    return {
        const.FrameVariantIndices: {"writer": list_to_str, "parser": str_to_list_or_empty},
        const.FrameConnotations: {"writer": list_to_str, "parser": str_to_list_or_empty},
        const.SynonymObject: {"writer": list_to_str, "parser": str_to_list},
        const.SynonymSubject: {"writer": list_to_str, "parser": str_to_list},
        const.PosTags: {"writer": list_to_str, "parser": str_to_list},
    }
40
-
41
-
42
def create_nn_val_writer_fmt(fmt_type, args_sep=","):
    """ Flattens the column formatters down to a single callable per
        column, selected by `fmt_type` (e.g. "writer").
    """
    assert(isinstance(fmt_type, str))
    return {column: fmt_pair[fmt_type]
            for column, fmt_pair in create_nn_column_formatters(args_sep=args_sep).items()}
@@ -1,13 +0,0 @@
1
class TermTypes(object):
    """ Types of input terms that may occur within the input
        sequence of the neural network models.
    """

    WORD = "word"
    ENTITY = "entity"
    FRAME = "frame"
    TOKEN = "token"

    @staticmethod
    def iter_types():
        """ Enumerates every supported term type. """
        return [TermTypes.WORD, TermTypes.ENTITY, TermTypes.FRAME, TermTypes.TOKEN]
@@ -1,60 +0,0 @@
1
- from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
2
- from arekit.common.entities.base import Entity
3
- from arekit.common.frames.text_variant import TextFrameVariant
4
- from arekit.contrib.networks.input.term_types import TermTypes
5
-
6
-
7
class VectorizedNetworkTermMapping(OpinionContainingTextTermsMapper):
    """ For every element returns: (word, embedded vector).
    """

    def __init__(self, string_entities_formatter, vectorizers):
        """ string_entities_formatter:
                utilized in order to obtain the string value for entities,
                which is then vectorized.
            vectorizers: dict
                per-term-type vectorizer instances.
        """
        assert(isinstance(vectorizers, dict))

        # Every supported term type must come with its own vectorizer.
        for term_type in TermTypes.iter_types():
            assert(term_type in vectorizers)

        super(VectorizedNetworkTermMapping, self).__init__(
            entity_formatter=string_entities_formatter)

        self.__vectorizers = vectorizers

    def map_term(self, term_type, term):
        """ Universal term mapping method: delegates to the vectorizer
            registered for `term_type` and returns its embedding pair.
        """
        return self.__vectorizers[term_type].create_term_embedding(term=term)

    def map_word(self, w_ind, word):
        return self.map_term(TermTypes.WORD, word)

    def map_token(self, t_ind, token):
        # Tokens are vectorized by their underlying token value.
        return self.map_term(TermTypes.TOKEN, token.get_token_value())

    def map_text_frame_variant(self, fv_ind, text_frame_variant):
        assert(isinstance(text_frame_variant, TextFrameVariant))
        return self.map_term(TermTypes.FRAME, text_frame_variant.Variant.get_value())

    def map_entity(self, e_ind, entity):
        assert(isinstance(entity, Entity))

        # The entity is first rendered as a string by the base mapper,
        # and that string is what gets vectorized.
        str_formatted_entity = super(VectorizedNetworkTermMapping, self).map_entity(
            e_ind=e_ind,
            entity=entity)

        return self.map_term(TermTypes.ENTITY, str_formatted_entity)
@@ -1,6 +0,0 @@
1
class BaseVectorizer(object):
    """ Custom API for vectorization: implementations turn a term into
        its embedding representation.
    """

    def create_term_embedding(self, term):
        """ Returns the embedding for the given term. """
        raise NotImplementedError()
File without changes
@@ -1,7 +0,0 @@
1
class BaseReader(object):
    """ Base API for readers; concrete readers implement both methods. """

    def extension(self):
        """ Abstract; implemented by concrete readers. """
        raise NotImplementedError()

    def read(self, target):
        """ Abstract; reads data from `target`. """
        raise NotImplementedError()
- raise NotImplementedError()