arekit 0.25.0__py3-none-any.whl → 0.25.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. arekit/common/context/terms_mapper.py +5 -2
  2. arekit/common/data/input/providers/rows/samples.py +8 -12
  3. arekit/common/data/input/providers/sample/cropped.py +4 -3
  4. arekit/common/data/input/terms_mapper.py +4 -8
  5. arekit/common/data/storages/base.py +4 -18
  6. arekit/common/docs/entities_grouping.py +5 -3
  7. arekit/common/docs/parsed/base.py +3 -3
  8. arekit/common/docs/parsed/providers/base.py +3 -5
  9. arekit/common/docs/parsed/providers/entity_service.py +7 -28
  10. arekit/common/docs/parsed/providers/opinion_pairs.py +6 -6
  11. arekit/common/docs/parsed/providers/text_opinion_pairs.py +4 -4
  12. arekit/common/docs/parsed/service.py +2 -2
  13. arekit/common/docs/parser.py +3 -30
  14. arekit/common/model/labeling/single.py +7 -3
  15. arekit/common/opinions/annot/algo/pair_based.py +9 -5
  16. arekit/common/pipeline/base.py +0 -2
  17. arekit/common/pipeline/batching.py +0 -3
  18. arekit/common/pipeline/items/base.py +1 -1
  19. arekit/common/utils.py +11 -8
  20. arekit/contrib/bert/input/providers/cropped_sample.py +2 -5
  21. arekit/contrib/bert/terms/mapper.py +2 -2
  22. arekit/contrib/prompt/sample.py +2 -6
  23. arekit/contrib/utils/bert/samplers.py +4 -2
  24. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  25. arekit/contrib/utils/data/storages/row_cache.py +2 -1
  26. arekit/contrib/utils/data/storages/sqlite_based.py +2 -1
  27. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +8 -5
  28. arekit/contrib/utils/pipelines/text_opinion/extraction.py +16 -8
  29. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/METADATA +10 -8
  30. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/RECORD +34 -115
  31. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/WHEEL +1 -1
  32. arekit/common/data/input/repositories/__init__.py +0 -0
  33. arekit/common/data/input/repositories/base.py +0 -68
  34. arekit/common/data/input/repositories/sample.py +0 -22
  35. arekit/common/data/views/__init__.py +0 -0
  36. arekit/common/data/views/samples.py +0 -26
  37. arekit/common/experiment/__init__.py +0 -0
  38. arekit/common/experiment/api/__init__.py +0 -0
  39. arekit/common/experiment/api/base_samples_io.py +0 -20
  40. arekit/common/experiment/data_type.py +0 -17
  41. arekit/common/service/__init__.py +0 -0
  42. arekit/common/service/sqlite.py +0 -36
  43. arekit/contrib/networks/__init__.py +0 -0
  44. arekit/contrib/networks/embedding.py +0 -149
  45. arekit/contrib/networks/embedding_io.py +0 -18
  46. arekit/contrib/networks/input/__init__.py +0 -0
  47. arekit/contrib/networks/input/const.py +0 -6
  48. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  49. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  50. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  51. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  52. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  53. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  54. arekit/contrib/networks/input/providers/__init__.py +0 -0
  55. arekit/contrib/networks/input/providers/sample.py +0 -129
  56. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  57. arekit/contrib/networks/input/providers/text.py +0 -24
  58. arekit/contrib/networks/input/rows_parser.py +0 -47
  59. arekit/contrib/networks/input/term_types.py +0 -13
  60. arekit/contrib/networks/input/terms_mapping.py +0 -60
  61. arekit/contrib/networks/vectorizer.py +0 -6
  62. arekit/contrib/utils/data/readers/__init__.py +0 -0
  63. arekit/contrib/utils/data/readers/base.py +0 -7
  64. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  65. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  66. arekit/contrib/utils/data/readers/sqlite.py +0 -14
  67. arekit/contrib/utils/data/service/__init__.py +0 -0
  68. arekit/contrib/utils/data/service/balance.py +0 -50
  69. arekit/contrib/utils/data/storages/pandas_based.py +0 -123
  70. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  71. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  72. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  73. arekit/contrib/utils/data/writers/sqlite_native.py +0 -114
  74. arekit/contrib/utils/embeddings/__init__.py +0 -0
  75. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  76. arekit/contrib/utils/embeddings/tokens.py +0 -30
  77. arekit/contrib/utils/entities/formatters/str_display.py +0 -11
  78. arekit/contrib/utils/io_utils/embedding.py +0 -72
  79. arekit/contrib/utils/np_utils/__init__.py +0 -0
  80. arekit/contrib/utils/np_utils/embedding.py +0 -22
  81. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  82. arekit/contrib/utils/np_utils/vocab.py +0 -20
  83. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  84. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -94
  85. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -55
  86. arekit/contrib/utils/pipelines/items/text/entities_default.py +0 -23
  87. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  88. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -33
  89. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -105
  90. arekit/contrib/utils/pipelines/items/text/translator.py +0 -136
  91. arekit/contrib/utils/processing/__init__.py +0 -0
  92. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  93. arekit/contrib/utils/processing/languages/mods.py +0 -12
  94. arekit/contrib/utils/processing/languages/pos.py +0 -23
  95. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  96. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  97. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  98. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  99. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  100. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  101. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  102. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  103. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  104. arekit/contrib/utils/processing/pos/base.py +0 -12
  105. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  106. arekit/contrib/utils/processing/pos/russian.py +0 -10
  107. arekit/contrib/utils/processing/text/__init__.py +0 -0
  108. arekit/contrib/utils/processing/text/tokens.py +0 -127
  109. arekit/contrib/utils/serializer.py +0 -42
  110. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  111. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  112. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  113. {arekit-0.25.0.data → arekit-0.25.2.data}/data/logo.png +0 -0
  114. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/LICENSE +0 -0
  115. {arekit-0.25.0.dist-info → arekit-0.25.2.dist-info}/top_level.txt +0 -0
arekit/contrib/utils/data/writers/sqlite_native.py
@@ -1,114 +0,0 @@
- import os
- import sqlite3
- from os.path import dirname
-
- from arekit.common.data import const
- from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
- from arekit.contrib.utils.data.writers.base import BaseWriter
-
-
- class SQliteWriter(BaseWriter):
-     """ TODO. This implementation is dedicated for the writing concepts of the data
-         serialization pipeline. However we add the SQLite3 service, it would be
-         right to refactor and utlize some core functionality from the core/service/sqlite.py
-     """
-
-     def __init__(self, table_name="contents", index_column_names=None, skip_existed=False, clear_table=True):
-         """ index_column_names: list or None
-                 column names should be considered to build a unique index;
-                 if None, the default 'const.ID' will be considered for row indexation.
-         """
-         assert (isinstance(index_column_names, list) or index_column_names is None)
-         self.__index_column_names = index_column_names if index_column_names is not None else [const.ID]
-         self.__table_name = table_name
-         self.__conn = None
-         self.__cur = None
-         self.__need_init_table = True
-         self.__origin_column_names = None
-         self.__skip_existed = skip_existed
-         self.__clear_table = clear_table
-
-     def extension(self):
-         return ".sqlite"
-
-     @staticmethod
-     def __iter_storage_column_names(storage):
-         """ Iter only those columns that existed in storage.
-         """
-         assert (isinstance(storage, RowCacheStorage))
-         for col_name, col_type in zip(storage.iter_column_names(), storage.iter_column_types()):
-             if col_name in storage.RowCache:
-                 yield col_name, col_type
-
-     def __init_table(self, column_data):
-         # Compose column name with the related SQLITE type.
-         column_types = ",".join([" ".join([col_name, self.type_to_sqlite(col_type)])
-                                  for col_name, col_type in column_data])
-         # Create table if not exists.
-         self.__cur.execute(f"CREATE TABLE IF NOT EXISTS {self.__table_name}({column_types})")
-         # Table exists, however we may optionally remove the content from it.
-         if self.__clear_table:
-             self.__cur.execute(f"DELETE FROM {self.__table_name};")
-         # Create index.
-         index_name = f"i_{self.__table_name}_id"
-         self.__cur.execute(f"DROP INDEX IF EXISTS {index_name};")
-         self.__cur.execute("CREATE INDEX IF NOT EXISTS {index} ON {table}({columns})".format(
-             index=index_name,
-             table=self.__table_name,
-             columns=", ".join(self.__index_column_names)
-         ))
-         self.__origin_column_names = [col_name for col_name, _ in column_data]
-
-     @staticmethod
-     def type_to_sqlite(col_type):
-         """ This is a simple function that provides conversion from the
-             base numpy types to SQLITE.
-             NOTE: this method represent a quick implementation for supporting
-             types, however it is far away from the generalized implementation.
-         """
-         if isinstance(col_type, str):
-             if 'int' in col_type:
-                 return 'INTEGER'
-
-         return "TEXT"
-
-     def open_target(self, target):
-         os.makedirs(dirname(target), exist_ok=True)
-         self.__conn = sqlite3.connect(target)
-         self.__cur = self.__conn.cursor()
-
-     def commit_line(self, storage):
-         assert (isinstance(storage, RowCacheStorage))
-
-         column_data = list(self.__iter_storage_column_names(storage))
-
-         if self.__need_init_table:
-             self.__init_table(column_data)
-             self.__need_init_table = False
-
-         # Check whether the related row is already exist in SQLITE database.
-         row_id = storage.RowCache[const.ID]
-         top_row = self.__cur.execute(f"SELECT EXISTS(SELECT 1 FROM {self.__table_name} WHERE id='{row_id}');")
-         is_exists = top_row.fetchone()[0]
-         if is_exists == 1 and self.__skip_existed:
-             return
-
-         line_data = [storage.RowCache[col_name] for col_name, _ in column_data]
-         parameters = ",".join(["?"] * len(line_data))
-
-         assert (len(self.__origin_column_names) == len(line_data))
-
-         self.__cur.execute(
-             f"INSERT OR REPLACE INTO {self.__table_name} VALUES ({parameters})",
-             tuple(line_data))
-
-         self.__conn.commit()
-
-     def close_target(self):
-         self.__cur = None
-         self.__origin_column_names = None
-         self.__need_init_table = True
-         self.__conn.close()
-
-     def write_all(self, storage, target):
-         pass
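For readers skimming the removed writer, its behaviour boils down to a standard sqlite3 recipe: create the table lazily from the first committed row, optionally clear it, keep an index over the id column, probe for an existing row, and insert with INSERT OR REPLACE. A minimal standalone sketch of that recipe, using an in-memory database and illustrative column names rather than AREkit's own:

    import sqlite3

    # Lazy table creation, id index, existence probe, INSERT OR REPLACE:
    # the same steps the removed SQliteWriter performed per committed row.
    conn = sqlite3.connect(":memory:")
    cur = conn.cursor()

    rows = [{"id": "doc0_s0", "text_a": "sample text", "label": 1},
            {"id": "doc0_s1", "text_a": "another sample", "label": 0}]

    columns = list(rows[0].keys())
    col_types = {"id": "TEXT", "text_a": "TEXT", "label": "INTEGER"}
    schema = ", ".join(f"{c} {col_types[c]}" for c in columns)

    cur.execute(f"CREATE TABLE IF NOT EXISTS contents({schema})")
    cur.execute("CREATE INDEX IF NOT EXISTS i_contents_id ON contents(id)")

    for row in rows:
        # skip_existed=True behaviour: leave already stored rows untouched.
        exists = cur.execute("SELECT EXISTS(SELECT 1 FROM contents WHERE id=?)",
                             (row["id"],)).fetchone()[0]
        if exists:
            continue
        placeholders = ",".join("?" * len(columns))
        cur.execute(f"INSERT OR REPLACE INTO contents VALUES ({placeholders})",
                    tuple(row[c] for c in columns))

    conn.commit()
    conn.close()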
arekit/contrib/utils/embeddings/__init__.py: File without changes
arekit/contrib/utils/embeddings/rusvectores.py
@@ -1,58 +0,0 @@
- from arekit.common.text.stemmer import Stemmer
- from arekit.contrib.networks.embedding import Embedding
-
-
- class RusvectoresEmbedding(Embedding):
-     """ Wrapper over models from the following resource.
-         https://rusvectores.org/ru/models/
-
-         NOTE: Usually these are embeddings for texts written in Russian.
-         for the better performance it is expected that we adopt stemmer.
-     """
-
-     def __init__(self, matrix, words, stemmer):
-         assert(isinstance(stemmer, Stemmer) or stemmer is None)
-         super(RusvectoresEmbedding, self).__init__(matrix=matrix, words=words)
-         self.__index_without_pos = self.__create_terms_without_pos()
-         self.__stemmer = stemmer
-         self.__lemmatize_by_default = stemmer is not None
-
-     def try_find_index_by_plain_word(self, word):
-         assert(isinstance(word, str))
-
-         temp = self.__lemmatize_by_default
-         self.__lemmatize_by_default = False
-         index = super(RusvectoresEmbedding, self).try_find_index_by_plain_word(word)
-         self.__lemmatize_by_default = temp
-
-         return index
-
-     def _handler(self, word):
-         return self.__try_find_word_index_pair_lemmatized(word, self.__lemmatize_by_default)
-
-     # region private methods
-
-     def __try_find_word_index_pair_lemmatized(self, term, lemmatize):
-         assert(isinstance(term, str))
-         assert(isinstance(lemmatize, bool))
-
-         if lemmatize:
-             term = self.__stemmer.lemmatize_to_str(term)
-
-         index = self.__index_without_pos[term] \
-             if term in self.__index_without_pos else None
-
-         return term, index
-
-     def __create_terms_without_pos(self):
-         d = {}
-         for word_with_pos, index in self.iter_vocabulary():
-             assert(isinstance(word_with_pos, str))
-             word = word_with_pos.split(u'_')[0]
-             if word in d:
-                 continue
-             d[word] = index
-
-         return d
-
-     # endregion
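The part of the removed RusvectoresEmbedding worth remembering is the POS-free lookup built by __create_terms_without_pos: RusVectores vocabularies key vectors by "word_POS", so the wrapper maps each bare word to the first index it encounters. A self-contained sketch of that mapping, with made-up vocabulary entries:

    # Toy vocabulary in the RusVectores "word_POS" format (entries are illustrative).
    vocabulary = [("хороший_ADJ", 0), ("дом_NOUN", 1), ("дом_VERB", 2)]

    index_without_pos = {}
    for word_with_pos, index in vocabulary:
        word = word_with_pos.split("_")[0]
        # Keep only the first occurrence, as the removed implementation did.
        if word not in index_without_pos:
            index_without_pos[word] = index

    print(index_without_pos)  # {'хороший': 0, 'дом': 1}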
arekit/contrib/utils/embeddings/tokens.py
@@ -1,30 +0,0 @@
- import numpy as np
-
- from arekit.contrib.networks.embedding import Embedding
- from arekit.contrib.utils.processing.text.tokens import Tokens
-
-
- class TokenEmbedding(Embedding):
-     """ Embedding vectors for text punctuation, based on Tokens in parsed text
-     """
-
-     @classmethod
-     def from_supported_tokens(cls, vector_size, random_vector_func):
-         """
-             random_vector_func: func
-                 function with parameters (vector_size, seed)
-         """
-         assert(isinstance(vector_size, int))
-         assert(callable(random_vector_func))
-
-         matrix = []
-         tokens_list = list(Tokens.iter_supported_tokens())
-
-         for token_index, _ in enumerate(tokens_list):
-
-             vector = random_vector_func(vector_size, token_index)
-
-             matrix.append(vector)
-
-         return cls(matrix=np.array(matrix),
-                    words=tokens_list)
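TokenEmbedding.from_supported_tokens simply stacks one deterministic vector per supported token, seeding the caller-supplied random_vector_func with the token index. Roughly, with a placeholder token list and a hypothetical random function (neither taken from AREkit):

    import numpy as np

    tokens_list = ["<url>", "<num>", "<unknown>"]  # placeholder token set

    def random_vector_func(vector_size, seed):
        # Deterministic per-token vector, matching the (vector_size, seed) contract.
        return np.random.RandomState(seed).normal(size=vector_size)

    matrix = np.array([random_vector_func(50, i) for i, _ in enumerate(tokens_list)])
    print(matrix.shape)  # (3, 50)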
arekit/contrib/utils/entities/formatters/str_display.py
@@ -1,11 +0,0 @@
- from arekit.common.entities.base import Entity
- from arekit.common.entities.str_fmt import StringEntitiesFormatter
-
-
- class StringEntitiesDisplayValueFormatter(StringEntitiesFormatter):
-     """ Provides the contents of the DisplayValue property.
-     """
-
-     def to_string(self, original_value, entity_type):
-         assert(isinstance(original_value, Entity))
-         return original_value.DisplayValue
arekit/contrib/utils/io_utils/embedding.py
@@ -1,72 +0,0 @@
- from os.path import join
-
- from arekit.contrib.networks.embedding_io import BaseEmbeddingIO
- from arekit.contrib.utils.io_utils.utils import check_targets_existence
- from arekit.contrib.utils.np_utils.embedding import NpzEmbeddingHelper
- from arekit.contrib.utils.np_utils.vocab import VocabRepositoryUtils
-
-
- class NpEmbeddingIO(BaseEmbeddingIO):
-     """ Npz-based IO utils for embedding and text-based for vocabulary.
-         This format represents a archived version of the numpy math data, i.e. vectors, numbers, etc.
-
-         Provides additional Input/Output paths generation functions for:
-             - embedding matrix;
-             - embedding vocabulary.
-     """
-
-     def __init__(self, target_dir, prefix_name="sample"):
-         assert(isinstance(target_dir, str))
-
-         self.__target_dir = target_dir
-         self.__term_emb_fn_template = "-".join([prefix_name, "term_embedding"])
-         self.__vocab_fn_template = "-".join([prefix_name, "term_embedding"])
-
-     # region Embedding-related data
-
-     def save_vocab(self, data):
-         target = self.__get_default_vocab_filepath()
-         return VocabRepositoryUtils.save(data=data, target=target)
-
-     def load_vocab(self):
-         source = self.___get_vocab_source()
-         return dict(VocabRepositoryUtils.load(source))
-
-     def save_embedding(self, data):
-         target = self.__get_default_embedding_filepath()
-         NpzEmbeddingHelper.save_embedding(data=data, target=target)
-
-     def load_embedding(self):
-         source = self.__get_term_embedding_source()
-         return NpzEmbeddingHelper.load_embedding(source)
-
-     def check_targets_existed(self):
-         targets = [
-             self.__get_default_vocab_filepath(),
-             self.__get_term_embedding_target()
-         ]
-         return check_targets_existence(targets=targets)
-
-     # endregion
-
-     # region embedding-related data
-
-     def ___get_vocab_source(self):
-         """ It is possible to load a predefined embedding from another experiment
-             using the related filepath provided by model_io.
-         """
-         return self.__get_default_vocab_filepath()
-
-     def __get_term_embedding_target(self):
-         return self.__get_default_embedding_filepath()
-
-     def __get_term_embedding_source(self):
-         return self.__get_default_embedding_filepath()
-
-     def __get_default_vocab_filepath(self):
-         return join(self.__target_dir, self.__vocab_fn_template)
-
-     def __get_default_embedding_filepath(self):
-         return join(self.__target_dir, self.__term_emb_fn_template)
-
-     # endregion
arekit/contrib/utils/np_utils/__init__.py: File without changes
arekit/contrib/utils/np_utils/embedding.py
@@ -1,22 +0,0 @@
- import logging
-
- from arekit.contrib.utils.np_utils.npz_utils import NpzRepositoryUtils
-
- logger = logging.getLogger(__name__)
- logging.basicConfig(level=logging.INFO)
-
-
- class NpzEmbeddingHelper:
-
-     @staticmethod
-     def save_embedding(data, target):
-         NpzRepositoryUtils.save(data=data, target=target)
-         logger.info("Saving embedding [size={shape}]: {filepath}".format(shape=data.shape,
-                                                                          filepath=target))
-
-     @staticmethod
-     def load_embedding(source):
-         embedding = NpzRepositoryUtils.load(source)
-         logger.info("Embedding read [size={size}]: {filepath}".format(size=embedding.shape,
-                                                                       filepath=source))
-         return embedding
arekit/contrib/utils/np_utils/npz_utils.py
@@ -1,13 +0,0 @@
- import numpy as np
-
-
- class NpzRepositoryUtils(object):
-
-     @staticmethod
-     def save(data, target):
-         np.savez(target, data)
-
-     @staticmethod
-     def load(source):
-         data = np.load(source)
-         return data['arr_0']
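NpzRepositoryUtils is a thin wrapper over numpy's npz serialization; because np.savez receives the array positionally, it is stored and later retrieved under the default 'arr_0' key. A quick round-trip check of the same pattern (the file name here is arbitrary):

    import numpy as np

    embedding = np.random.rand(10, 300)           # toy embedding matrix
    np.savez("term_embedding.npz", embedding)     # positional arg is stored as 'arr_0'

    restored = np.load("term_embedding.npz")["arr_0"]
    assert np.allclose(embedding, restored)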
arekit/contrib/utils/np_utils/vocab.py
@@ -1,20 +0,0 @@
- import logging
-
- import numpy as np
-
- logger = logging.getLogger(__name__)
- logging.basicConfig(level=logging.INFO)
-
-
- class VocabRepositoryUtils(object):
-
-     @staticmethod
-     def save(data, target):
-         logger.info("Saving vocabulary [size={size}]: {filepath}".format(size=len(data), filepath=target))
-         np.savetxt(target, data, fmt='%s')
-
-     @staticmethod
-     def load(source):
-         vocab = np.loadtxt(source, dtype=str, comments=None)
-         logger.info("Loading vocabulary [size={size}]: {filepath}".format(size=len(vocab), filepath=source))
-         return vocab
arekit/contrib/utils/pipelines/items/sampling/__init__.py: File without changes
arekit/contrib/utils/pipelines/items/sampling/base.py
@@ -1,94 +0,0 @@
- from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
- from arekit.common.data.storages.base import BaseRowsStorage
- from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
- from arekit.common.experiment.data_type import DataType
- from arekit.common.pipeline.items.base import BasePipelineItem
- from arekit.contrib.utils.serializer import InputDataSerializationHelper
-
-
- class BaseSerializerPipelineItem(BasePipelineItem):
-
-     def __init__(self, rows_provider, samples_io, save_labels_func, storage, **kwargs):
-         """ sample_rows_formatter:
-                 how we format input texts for a BERT model, for example:
-                     - single text
-                     - two sequences, separated by [SEP] token
-
-             save_labels_func: function
-                 data_type -> bool
-         """
-         assert(isinstance(rows_provider, BaseSampleRowProvider))
-         assert(isinstance(samples_io, BaseSamplesIO))
-         assert(callable(save_labels_func))
-         assert(isinstance(storage, BaseRowsStorage))
-         super(BaseSerializerPipelineItem, self).__init__(**kwargs)
-
-         self._rows_provider = rows_provider
-         self._samples_io = samples_io
-         self._save_labels_func = save_labels_func
-         self._storage = storage
-
-     def _serialize_iteration(self, data_type, pipeline, data_folding, doc_ids):
-         assert(isinstance(data_type, DataType))
-         assert(isinstance(pipeline, list))
-         assert(isinstance(data_folding, dict) or data_folding is None)
-         assert(isinstance(doc_ids, list) or doc_ids is None)
-         assert(doc_ids is not None or data_folding is not None)
-
-         repos = {
-             "sample": InputDataSerializationHelper.create_samples_repo(
-                 keep_labels=self._save_labels_func(data_type),
-                 rows_provider=self._rows_provider,
-                 storage=self._storage),
-         }
-
-         writer_and_targets = {
-             "sample": (self._samples_io.Writer,
-                        self._samples_io.create_target(data_type=data_type)),
-         }
-
-         for description, repo in repos.items():
-
-             if data_folding is None:
-                 # Consider only the predefined doc_ids.
-                 doc_ids_iter = doc_ids
-             else:
-                 # Take particular data_type.
-                 doc_ids_iter = data_folding[data_type]
-                 # Consider only predefined doc_ids.
-                 if doc_ids is not None:
-                     doc_ids_iter = set(doc_ids_iter).intersection(doc_ids)
-
-             InputDataSerializationHelper.fill_and_write(
-                 repo=repo,
-                 pipeline=pipeline,
-                 doc_ids_iter=doc_ids_iter,
-                 desc="{desc} [{data_type}]".format(desc=description, data_type=data_type),
-                 writer=writer_and_targets[description][0],
-                 target=writer_and_targets[description][1])
-
-     def _handle_iteration(self, data_type_pipelines, data_folding, doc_ids):
-         """ Performing data serialization for a particular iteration
-         """
-         assert(isinstance(data_type_pipelines, dict))
-         for data_type, pipeline in data_type_pipelines.items():
-             self._serialize_iteration(data_type=data_type, pipeline=pipeline, data_folding=data_folding,
-                                       doc_ids=doc_ids)
-
-     def apply_core(self, input_data, pipeline_ctx):
-         """
-             data_type_pipelines: dict of, for example:
-                 {
-                     DataType.Train: BasePipeline,
-                     DataType.Test: BasePipeline
-                 }
-
-             data_type_pipelines: doc_id -> parsed_doc -> annot -> opinion linkages
-                 for example, function: sentiment_attitude_extraction_default_pipeline
-             doc_ids: optional
-                 this parameter allows to limit amount of documents considered for sampling
-         """
-         assert("data_type_pipelines" in pipeline_ctx)
-         self._handle_iteration(data_type_pipelines=pipeline_ctx.provide("data_type_pipelines"),
-                                doc_ids=pipeline_ctx.provide_or_none("doc_ids"),
-                                data_folding=pipeline_ctx.provide_or_none("data_folding"))
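The only non-obvious branch in the removed _serialize_iteration is how the document set is chosen: with no folding, the explicit doc_ids list is used as-is; with a folding, the split assigned to the data type is taken and, if doc_ids is also given, intersected with it. Condensed into a plain function (names follow the removed code, the example values are made up):

    def select_doc_ids(data_type, data_folding, doc_ids):
        # Mirrors the doc_ids_iter branch of the removed _serialize_iteration.
        if data_folding is None:
            return doc_ids
        selected = data_folding[data_type]
        if doc_ids is not None:
            selected = set(selected).intersection(doc_ids)
        return selected

    folding = {"train": [0, 1, 2, 3], "test": [4, 5]}
    print(select_doc_ids("train", folding, doc_ids=[1, 3, 5]))  # {1, 3}
    print(select_doc_ids("test", None, doc_ids=[4]))            # [4]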
arekit/contrib/utils/pipelines/items/sampling/networks.py
@@ -1,55 +0,0 @@
- from arekit.contrib.networks.input.embedding.matrix import create_term_embedding_matrix
- from arekit.contrib.networks.input.embedding.offsets import TermsEmbeddingOffsets
- from arekit.contrib.networks.embedding import Embedding
- from arekit.contrib.networks.input.providers.sample import NetworkSampleRowProvider
- from arekit.contrib.utils.io_utils.embedding import NpEmbeddingIO
- from arekit.contrib.utils.pipelines.items.sampling.base import BaseSerializerPipelineItem
-
-
- class NetworksInputSerializerPipelineItem(BaseSerializerPipelineItem):
-
-     def __init__(self, save_labels_func, rows_provider, samples_io, emb_io, storage, save_embedding=True, **kwargs):
-         """ This pipeline item allows to perform a data preparation for neural network models.
-
-             considering a list of the whole data_types with the related pipelines,
-             which are supported and required in a handler. It is necessary to know
-             data_types in advance as it allows to create a complete vocabulary of input terms,
-             with the related embeddings.
-         """
-         assert(isinstance(emb_io, NpEmbeddingIO))
-         assert(isinstance(rows_provider, NetworkSampleRowProvider))
-         assert(isinstance(save_embedding, bool))
-         super(NetworksInputSerializerPipelineItem, self).__init__(
-             rows_provider=rows_provider,
-             samples_io=samples_io,
-             save_labels_func=save_labels_func,
-             storage=storage,
-             **kwargs)
-
-         self.__emb_io = emb_io
-         self.__save_embedding = save_embedding
-
-     def _handle_iteration(self, data_type_pipelines, data_folding, doc_ids):
-         """ Performing data serialization for a particular iteration
-         """
-         assert(isinstance(data_type_pipelines, dict))
-
-         # Prepare for the present iteration.
-         self._rows_provider.clear_embedding_pairs()
-
-         super(NetworksInputSerializerPipelineItem, self)._handle_iteration(
-             data_type_pipelines=data_type_pipelines, data_folding=data_folding, doc_ids=doc_ids)
-
-         if not (self.__save_embedding and self._rows_provider.HasEmbeddingPairs):
-             return
-
-         # Save embedding information additionally.
-         term_embedding = Embedding.from_word_embedding_pairs_iter(self._rows_provider.iter_term_embedding_pairs())
-         embedding_matrix = create_term_embedding_matrix(term_embedding=term_embedding)
-         vocab = list(TermsEmbeddingOffsets.extract_vocab(words_embedding=term_embedding))
-
-         # Save embedding matrix
-         self.__emb_io.save_embedding(data=embedding_matrix)
-         self.__emb_io.save_vocab(data=vocab)
-
-         del embedding_matrix
arekit/contrib/utils/pipelines/items/text/entities_default.py
@@ -1,23 +0,0 @@
- from arekit.common.entities.base import Entity
- from arekit.common.pipeline.items.base import BasePipelineItem
-
-
- class TextEntitiesParser(BasePipelineItem):
-
-     def __init__(self, **kwargs):
-         super(TextEntitiesParser, self).__init__(**kwargs)
-
-     @staticmethod
-     def __process_word(word):
-         assert(isinstance(word, str))
-
-         # If this is a special word which is related to the [entity] mention.
-         if word[0] == "[" and word[-1] == "]":
-             entity = Entity(value=word[1:-1], e_type=None)
-             return entity
-
-         return word
-
-     def apply_core(self, input_data, pipeline_ctx):
-         assert(isinstance(input_data, list))
-         return [self.__process_word(w) for w in input_data]
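TextEntitiesParser treats any term wrapped in square brackets as an entity mention, unwrapping its value; every other term passes through untouched. The same check without AREkit types, using a plain tuple as a stand-in for the Entity class:

    def process_word(word):
        # "[United States]" -> ("entity", "United States"); plain words pass through.
        if word[0] == "[" and word[-1] == "]":
            return ("entity", word[1:-1])
        return word

    terms = ["president", "of", "[United States]"]
    print([process_word(w) for w in terms])
    # ['president', 'of', ('entity', 'United States')]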
arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py
@@ -1,36 +0,0 @@
- from arekit.common.text.stemmer import Stemmer
- from arekit.contrib.utils.pipelines.items.text.frames import FrameVariantsParser
- from arekit.contrib.utils.processing.languages.ru.mods import RussianLanguageMods
-
-
- class LemmasBasedFrameVariantsParser(FrameVariantsParser):
-
-     def __init__(self, frame_variants, stemmer, locale_mods=RussianLanguageMods, save_lemmas=False, **kwargs):
-         assert(isinstance(stemmer, Stemmer))
-         assert(isinstance(save_lemmas, bool))
-         super(LemmasBasedFrameVariantsParser, self).__init__(frame_variants=frame_variants, **kwargs)
-
-         self.__frame_variants = frame_variants
-         self.__stemmer = stemmer
-         self.__save_lemmas = save_lemmas
-         self.__max_variant_len = max([len(variant) for _, variant in frame_variants.iter_variants()])
-         self.__locale_mods = locale_mods
-
-     def __lemmatize_term(self, term):
-         # we first split onto words for lemmatization and then join all of them.
-         lemma = "".join(self.__stemmer.lemmatize_to_list(term))
-         # then we replace certain chars according to the locale restrictions.
-         return self.__locale_mods.replace_specific_word_chars(lemma)
-
-     def __provide_lemmatized_terms(self, terms):
-         """
-             Compose a list of lemmatized versions of parsed_doc
-             PS: Might be significantly slow, depending on stemmer were used.
-         """
-         assert(isinstance(terms, list))
-         return [self.__lemmatize_term(term) if isinstance(term, str) else term for term in terms]
-
-     def apply_core(self, input_data, pipeline_ctx):
-         lemmas = self.__provide_lemmatized_terms(input_data)
-         processed_it = self._iter_processed(terms=lemmas, origin=lemmas if self.__save_lemmas else input_data)
-         return list(processed_it)
arekit/contrib/utils/pipelines/items/text/frames_negation.py
@@ -1,33 +0,0 @@
- from arekit.common.frames.text_variant import TextFrameVariant
- from arekit.common.pipeline.context import PipelineContext
- from arekit.common.pipeline.items.base import BasePipelineItem
- from arekit.contrib.utils.processing.languages.mods import BaseLanguageMods
- from arekit.contrib.utils.processing.languages.ru.mods import RussianLanguageMods
-
-
- class FrameVariantsSentimentNegation(BasePipelineItem):
-
-     def __init__(self, locale_mods=RussianLanguageMods, **kwargs):
-         assert(issubclass(locale_mods, BaseLanguageMods))
-         super(FrameVariantsSentimentNegation, self).__init__(**kwargs)
-         self._locale_mods = locale_mods
-
-     @staticmethod
-     def __get_preposition(terms, index):
-         return terms[index-1] if index > 0 else None
-
-     def apply_core(self, input_data, pipeline_ctx):
-         assert(isinstance(input_data, list))
-         assert(isinstance(pipeline_ctx, PipelineContext))
-
-         for curr_ind, term in enumerate(input_data):
-
-             if not isinstance(term, TextFrameVariant):
-                 continue
-
-             prep_term = self.__get_preposition(terms=input_data, index=curr_ind)
-             is_negated = self._locale_mods.is_negation_word(prep_term) if prep_term is not None else False
-
-             term.set_is_negated(is_negated)
-
-         return input_data
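FrameVariantsSentimentNegation looks exactly one term back: a frame variant is marked as negated when the immediately preceding term is a negation word for the current locale. Stripped of AREkit types, and with a toy word set standing in for RussianLanguageMods, the rule reduces to:

    NEGATION_WORDS = {"не", "нет"}  # toy stand-in for the locale's negation words

    def is_negated(terms, frame_index):
        # A frame variant is negated iff the term right before it is a negation word.
        prev = terms[frame_index - 1] if frame_index > 0 else None
        return prev is not None and prev in NEGATION_WORDS

    terms = ["он", "не", "одобрил"]
    print(is_negated(terms, 2))  # True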