arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. arekit/common/data/storages/base.py +4 -15
  2. arekit/common/docs/entities_grouping.py +2 -1
  3. arekit/common/docs/parser.py +27 -22
  4. arekit/common/pipeline/base.py +12 -16
  5. arekit/common/pipeline/batching.py +28 -0
  6. arekit/common/pipeline/context.py +5 -1
  7. arekit/common/pipeline/items/base.py +39 -2
  8. arekit/common/pipeline/items/flatten.py +5 -1
  9. arekit/common/pipeline/items/handle.py +2 -1
  10. arekit/common/pipeline/items/iter.py +2 -1
  11. arekit/common/pipeline/items/map.py +2 -1
  12. arekit/common/pipeline/items/map_nested.py +4 -0
  13. arekit/common/pipeline/utils.py +32 -0
  14. arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
  15. arekit/common/utils.py +11 -52
  16. arekit/contrib/utils/data/contents/opinions.py +13 -3
  17. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  18. arekit/contrib/utils/data/storages/pandas_based.py +2 -17
  19. arekit/contrib/utils/data/storages/row_cache.py +8 -2
  20. arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
  21. arekit/contrib/utils/io_utils/utils.py +1 -18
  22. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  23. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  24. arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
  25. arekit-0.25.1.data/data/logo.png +0 -0
  26. arekit-0.25.1.dist-info/METADATA +81 -0
  27. arekit-0.25.1.dist-info/RECORD +186 -0
  28. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
  29. arekit/common/data/input/repositories/__init__.py +0 -0
  30. arekit/common/data/input/repositories/base.py +0 -68
  31. arekit/common/data/input/repositories/sample.py +0 -22
  32. arekit/common/data/views/__init__.py +0 -0
  33. arekit/common/data/views/samples.py +0 -26
  34. arekit/common/docs/objects_parser.py +0 -37
  35. arekit/common/text/parser.py +0 -12
  36. arekit/common/text/partitioning/__init__.py +0 -0
  37. arekit/common/text/partitioning/base.py +0 -4
  38. arekit/common/text/partitioning/terms.py +0 -35
  39. arekit/contrib/networks/__init__.py +0 -0
  40. arekit/contrib/networks/embedding.py +0 -149
  41. arekit/contrib/networks/embedding_io.py +0 -18
  42. arekit/contrib/networks/input/__init__.py +0 -0
  43. arekit/contrib/networks/input/const.py +0 -6
  44. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  45. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  46. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  47. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  48. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  49. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  50. arekit/contrib/networks/input/providers/__init__.py +0 -0
  51. arekit/contrib/networks/input/providers/sample.py +0 -129
  52. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  53. arekit/contrib/networks/input/providers/text.py +0 -24
  54. arekit/contrib/networks/input/rows_parser.py +0 -47
  55. arekit/contrib/networks/input/term_types.py +0 -13
  56. arekit/contrib/networks/input/terms_mapping.py +0 -60
  57. arekit/contrib/networks/vectorizer.py +0 -6
  58. arekit/contrib/source/__init__.py +0 -0
  59. arekit/contrib/source/brat/__init__.py +0 -0
  60. arekit/contrib/source/brat/annot.py +0 -84
  61. arekit/contrib/source/brat/doc.py +0 -28
  62. arekit/contrib/source/brat/entities/__init__.py +0 -0
  63. arekit/contrib/source/brat/entities/compound.py +0 -13
  64. arekit/contrib/source/brat/entities/entity.py +0 -42
  65. arekit/contrib/source/brat/entities/parser.py +0 -53
  66. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  67. arekit/contrib/source/brat/opinions/converter.py +0 -19
  68. arekit/contrib/source/brat/relation.py +0 -32
  69. arekit/contrib/source/brat/sentence.py +0 -69
  70. arekit/contrib/source/brat/sentences_reader.py +0 -128
  71. arekit/contrib/source/download.py +0 -41
  72. arekit/contrib/source/nerel/__init__.py +0 -0
  73. arekit/contrib/source/nerel/entities.py +0 -55
  74. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  75. arekit/contrib/source/nerel/folding/fixed.py +0 -74
  76. arekit/contrib/source/nerel/io_utils.py +0 -62
  77. arekit/contrib/source/nerel/labels.py +0 -241
  78. arekit/contrib/source/nerel/reader.py +0 -46
  79. arekit/contrib/source/nerel/utils.py +0 -24
  80. arekit/contrib/source/nerel/versions.py +0 -12
  81. arekit/contrib/source/nerelbio/__init__.py +0 -0
  82. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  83. arekit/contrib/source/nerelbio/labels.py +0 -265
  84. arekit/contrib/source/nerelbio/reader.py +0 -8
  85. arekit/contrib/source/nerelbio/versions.py +0 -8
  86. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  87. arekit/contrib/source/ruattitudes/collection.py +0 -36
  88. arekit/contrib/source/ruattitudes/doc.py +0 -51
  89. arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
  90. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  91. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  92. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  93. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  94. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  95. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  96. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  97. arekit/contrib/source/ruattitudes/reader.py +0 -268
  98. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  99. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  100. arekit/contrib/source/ruattitudes/text_object.py +0 -59
  101. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  102. arekit/contrib/source/rusentiframes/collection.py +0 -157
  103. arekit/contrib/source/rusentiframes/effect.py +0 -24
  104. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  105. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  106. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  107. arekit/contrib/source/rusentiframes/role.py +0 -15
  108. arekit/contrib/source/rusentiframes/state.py +0 -24
  109. arekit/contrib/source/rusentiframes/types.py +0 -42
  110. arekit/contrib/source/rusentiframes/value.py +0 -2
  111. arekit/contrib/source/rusentrel/__init__.py +0 -0
  112. arekit/contrib/source/rusentrel/const.py +0 -3
  113. arekit/contrib/source/rusentrel/docs_reader.py +0 -51
  114. arekit/contrib/source/rusentrel/entities.py +0 -26
  115. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  116. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  117. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  118. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  119. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  120. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  121. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  122. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  123. arekit/contrib/source/sentinerel/__init__.py +0 -0
  124. arekit/contrib/source/sentinerel/entities.py +0 -52
  125. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  126. arekit/contrib/source/sentinerel/folding/factory.py +0 -31
  127. arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
  128. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  129. arekit/contrib/source/sentinerel/labels.py +0 -53
  130. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  131. arekit/contrib/source/sentinerel/reader.py +0 -42
  132. arekit/contrib/source/synonyms/__init__.py +0 -0
  133. arekit/contrib/source/synonyms/utils.py +0 -19
  134. arekit/contrib/source/zip_utils.py +0 -47
  135. arekit/contrib/utils/connotations/__init__.py +0 -0
  136. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  137. arekit/contrib/utils/data/readers/__init__.py +0 -0
  138. arekit/contrib/utils/data/readers/base.py +0 -7
  139. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  140. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  141. arekit/contrib/utils/data/service/__init__.py +0 -0
  142. arekit/contrib/utils/data/service/balance.py +0 -50
  143. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  144. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  145. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  146. arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
  147. arekit/contrib/utils/download.py +0 -77
  148. arekit/contrib/utils/embeddings/__init__.py +0 -0
  149. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  150. arekit/contrib/utils/embeddings/tokens.py +0 -30
  151. arekit/contrib/utils/io_utils/embedding.py +0 -72
  152. arekit/contrib/utils/io_utils/opinions.py +0 -37
  153. arekit/contrib/utils/io_utils/samples.py +0 -79
  154. arekit/contrib/utils/lexicons/__init__.py +0 -0
  155. arekit/contrib/utils/lexicons/lexicon.py +0 -41
  156. arekit/contrib/utils/lexicons/relation.py +0 -42
  157. arekit/contrib/utils/lexicons/rusentilex.py +0 -37
  158. arekit/contrib/utils/nn/__init__.py +0 -0
  159. arekit/contrib/utils/nn/rows.py +0 -83
  160. arekit/contrib/utils/np_utils/__init__.py +0 -0
  161. arekit/contrib/utils/np_utils/embedding.py +0 -22
  162. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  163. arekit/contrib/utils/np_utils/vocab.py +0 -20
  164. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  165. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
  166. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
  167. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  168. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
  169. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  170. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
  171. arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
  172. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  173. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  174. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
  175. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
  176. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  177. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  178. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
  179. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
  180. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  181. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  182. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
  183. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
  184. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
  185. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  186. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
  187. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
  188. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  189. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
  190. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  191. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
  192. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  193. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  194. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  195. arekit/contrib/utils/processing/languages/mods.py +0 -12
  196. arekit/contrib/utils/processing/languages/pos.py +0 -23
  197. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  198. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  199. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  200. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  201. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  202. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  203. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  204. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  205. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  206. arekit/contrib/utils/processing/pos/base.py +0 -12
  207. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  208. arekit/contrib/utils/processing/pos/russian.py +0 -10
  209. arekit/contrib/utils/processing/text/__init__.py +0 -0
  210. arekit/contrib/utils/processing/text/tokens.py +0 -127
  211. arekit/contrib/utils/resources.py +0 -25
  212. arekit/contrib/utils/serializer.py +0 -43
  213. arekit/contrib/utils/sources/__init__.py +0 -0
  214. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  215. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  216. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  217. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  218. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  219. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  220. arekit/download_data.py +0 -11
  221. arekit-0.24.0.dist-info/METADATA +0 -23
  222. arekit-0.24.0.dist-info/RECORD +0 -374
  223. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
  224. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
@@ -10,6 +10,9 @@ logger = logging.getLogger(__name__)
10
10
 
11
11
  class BaseRowsStorage(object):
12
12
 
13
+ def __init__(self, log_out=None):
14
+ self.__log_out = log_out
15
+
13
16
  # region protected methods
14
17
 
15
18
  def _begin_filling_row(self, row_ind):
@@ -31,27 +34,12 @@ class BaseRowsStorage(object):
31
34
  def _get_rows_count(self):
32
35
  raise NotImplemented()
33
36
 
34
- def find_by_value(self, column_name, value):
35
- raise NotImplemented()
36
-
37
- def find_first_by_value(self, column_name, value):
38
- raise NotImplemented()
39
-
40
- def iter_column_values(self, column_name, dtype=None):
41
- raise NotImplemented()
42
-
43
37
  def get_row(self, row_index):
44
38
  raise NotImplemented()
45
39
 
46
- def get_cell(self, row_index, column_name):
47
- raise NotImplemented()
48
-
49
40
  def init_empty(self, columns_provider):
50
41
  raise NotImplemented()
51
42
 
52
- def iter_shuffled(self):
53
- raise NotImplemented()
54
-
55
43
  def iter_column_names(self):
56
44
  raise NotImplemented()
57
45
 
@@ -81,6 +69,7 @@ class BaseRowsStorage(object):
81
69
  condition_func=lambda item: not isinstance(item[1], MetaEmptyLinkedDataWrapper),
82
70
  postfix_func=postfix_func,
83
71
  desc="{fmt}".format(fmt=desc),
72
+ file=self.__log_out,
84
73
  total=rows_count)
85
74
 
86
75
  for row_index, item in enumerate(pbar_it):
@@ -4,8 +4,9 @@ from arekit.common.pipeline.items.base import BasePipelineItem
4
4
 
5
5
  class EntitiesGroupingPipelineItem(BasePipelineItem):
6
6
 
7
- def __init__(self, value_to_group_id_func):
7
+ def __init__(self, value_to_group_id_func, **kwargs):
8
8
  assert(callable(value_to_group_id_func))
9
+ super(EntitiesGroupingPipelineItem, self).__init__(**kwargs)
9
10
  self.__value_to_group_id_func = value_to_group_id_func
10
11
 
11
12
  def apply_core(self, input_data, pipeline_ctx):
@@ -1,34 +1,39 @@
1
1
  from arekit.common.docs.base import Document
2
2
  from arekit.common.docs.parsed.base import ParsedDocument
3
+ from arekit.common.pipeline.batching import BatchingPipelineLauncher
3
4
  from arekit.common.pipeline.context import PipelineContext
4
- from arekit.common.text.parser import BaseTextParser
5
+ from arekit.common.pipeline.utils import BatchIterator
6
+ from arekit.common.text.parsed import BaseParsedText
7
+ from arekit.common.utils import progress_bar_defined
5
8
 
6
9
 
7
- class DocumentParser(object):
10
+ class DocumentParsers(object):
8
11
 
9
12
  @staticmethod
10
- def __get_sent(doc, sent_ind):
11
- return doc.get_sentence(sent_ind)
12
-
13
- @staticmethod
14
- def parse(doc, text_parser, parent_ppl_ctx=None):
13
+ def parse_batch(doc, pipeline_items, batch_size, parent_ppl_ctx=None, src_key="input", show_progress=False):
14
+ """ This document parser is based on batch of sentences.
15
+ """
16
+ assert(isinstance(batch_size, int) and batch_size > 0)
15
17
  assert(isinstance(doc, Document))
16
- assert(isinstance(text_parser, BaseTextParser))
18
+ assert(isinstance(pipeline_items, list))
17
19
  assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None)
18
20
 
19
- parsed_sentences = [text_parser.run(input_data=DocumentParser.__get_sent(doc, sent_ind).Text,
20
- params_dict=DocumentParser.__create_ppl_params(doc=doc, sent_ind=sent_ind),
21
- parent_ctx=parent_ppl_ctx)
22
- for sent_ind in range(doc.SentencesCount)]
21
+ parsed_sentences = []
23
22
 
24
- return ParsedDocument(doc_id=doc.ID,
25
- parsed_sentences=parsed_sentences)
23
+ data_it = BatchIterator(data_iter=iter(range(doc.SentencesCount)), batch_size=batch_size)
24
+ progress_it = progress_bar_defined(data_it, total=round(doc.SentencesCount / batch_size),
25
+ disable=not show_progress)
26
26
 
27
- @staticmethod
28
- def __create_ppl_params(doc, sent_ind):
29
- assert(isinstance(doc, Document))
30
- return {
31
- "s_ind": sent_ind, # sentence index. (as Metadata)
32
- "doc_id": doc.ID, # document index. (as Metadata)
33
- "sentence": DocumentParser.__get_sent(doc, sent_ind), # Required for special sources.
34
- }
27
+ for batch in progress_it:
28
+
29
+ # Composing the context from a single sentence.
30
+ ctx = PipelineContext({src_key: [doc.get_sentence(s_ind) for s_ind in batch]},
31
+ parent_ctx=parent_ppl_ctx)
32
+
33
+ # Apply all the operations.
34
+ BatchingPipelineLauncher.run(pipeline=pipeline_items, pipeline_ctx=ctx, src_key=src_key)
35
+
36
+ # Collecting the result.
37
+ parsed_sentences += [BaseParsedText(terms=result) for result in ctx.provide("result")]
38
+
39
+ return ParsedDocument(doc_id=doc.ID, parsed_sentences=parsed_sentences)
@@ -2,24 +2,20 @@ from arekit.common.pipeline.context import PipelineContext
2
2
  from arekit.common.pipeline.items.base import BasePipelineItem
3
3
 
4
4
 
5
- class BasePipeline(object):
5
+ class BasePipelineLauncher:
6
6
 
7
- def __init__(self, pipeline):
7
+ @staticmethod
8
+ def run(pipeline, pipeline_ctx, src_key=None, has_input=True):
8
9
  assert(isinstance(pipeline, list))
9
- self.__pipeline = pipeline
10
+ assert(isinstance(pipeline_ctx, PipelineContext))
11
+ assert(isinstance(src_key, str) or src_key is None)
10
12
 
11
- def run(self, input_data, params_dict=None, parent_ctx=None):
12
- assert(isinstance(params_dict, dict) or params_dict is None)
13
-
14
- pipeline_ctx = PipelineContext(d=params_dict if params_dict is not None else dict(),
15
- parent_ctx=parent_ctx)
16
-
17
- for item in filter(lambda itm: itm is not None, self.__pipeline):
13
+ for ind, item in enumerate(filter(lambda itm: itm is not None, pipeline)):
18
14
  assert(isinstance(item, BasePipelineItem))
19
- input_data = item.apply(input_data=input_data, pipeline_ctx=pipeline_ctx)
20
-
21
- return input_data
15
+ do_force_key = src_key is not None and ind == 0
16
+ input_data = item.get_source(pipeline_ctx, force_key=src_key if do_force_key else None) \
17
+ if has_input or ind > 0 else None
18
+ item_result = item.apply(input_data=input_data, pipeline_ctx=pipeline_ctx)
19
+ pipeline_ctx.update(param=item.ResultKey, value=item_result, is_new_key=False)
22
20
 
23
- def append(self, item):
24
- assert(isinstance(item, BasePipelineItem))
25
- self.__pipeline.append(item)
21
+ return pipeline_ctx
@@ -0,0 +1,28 @@
1
+ from arekit.common.pipeline.context import PipelineContext
2
+ from arekit.common.pipeline.items.base import BasePipelineItem
3
+
4
+
5
+ class BatchingPipelineLauncher:
6
+
7
+ @staticmethod
8
+ def run(pipeline, pipeline_ctx, src_key=None):
9
+ assert(isinstance(pipeline, list))
10
+ assert(isinstance(pipeline_ctx, PipelineContext))
11
+ assert(isinstance(src_key, str) or src_key is None)
12
+
13
+ for ind, item in enumerate(filter(lambda itm: itm is not None, pipeline)):
14
+ assert (isinstance(item, BasePipelineItem))
15
+
16
+ # Handle the content of the batch or batch itself.
17
+ content = item.get_source(pipeline_ctx, call_func=False, force_key=src_key if ind == 0 else None)
18
+ handled_batch = [item._src_func(i) if item._src_func is not None else i for i in content]
19
+
20
+ if item.SupportBatching:
21
+ batch_result = list(item.apply(input_data=handled_batch, pipeline_ctx=pipeline_ctx))
22
+ else:
23
+ batch_result = [item.apply(input_data=input_data, pipeline_ctx=pipeline_ctx)
24
+ for input_data in handled_batch]
25
+
26
+ pipeline_ctx.update(param=item.ResultKey, value=batch_result, is_new_key=False)
27
+
28
+ return pipeline_ctx
@@ -13,6 +13,8 @@ class PipelineContext(object):
13
13
  self._d[PARENT_CTX] = parent_ctx
14
14
 
15
15
  def __provide(self, param):
16
+ if param not in self._d:
17
+ raise Exception(f"Key `{param}` is not in dictionary.\n{self._d}")
16
18
  return self._d[param]
17
19
 
18
20
  # region public
@@ -23,7 +25,9 @@ class PipelineContext(object):
23
25
  def provide_or_none(self, param):
24
26
  return self.__provide(param) if param in self._d else None
25
27
 
26
- def update(self, param, value):
28
+ def update(self, param, value, is_new_key=False):
29
+ if is_new_key and param in self._d:
30
+ raise Exception(f"Key `{param}` is already presented in pipeline context dictionary.")
27
31
  self._d[param] = value
28
32
 
29
33
  # endregion
@@ -1,9 +1,46 @@
1
+ from arekit.common.pipeline.context import PipelineContext
2
+
3
+
1
4
  class BasePipelineItem(object):
2
- """ Single pipeline item that might be instatiated and embedded into pipeline.
5
+ """ Single pipeline item that might be instantiated and embedded into pipeline.
3
6
  """
4
7
 
8
+ def __init__(self, src_key="result", result_key="result", src_func=None):
9
+ assert(isinstance(src_key, str) or src_key is None)
10
+ assert(callable(src_func) or src_func is None)
11
+ self.__src_key = src_key
12
+ self._src_func = src_func
13
+ self.__result_key = result_key
14
+
15
+ @property
16
+ def ResultKey(self):
17
+ return self.__result_key
18
+
19
+ @property
20
+ def SupportBatching(self):
21
+ """ By default pipeline item is not designed for batching.
22
+ """
23
+ return False
24
+
25
+ def get_source(self, src_ctx, call_func=True, force_key=None):
26
+ """ Extract input element for processing.
27
+ """
28
+ assert(isinstance(src_ctx, PipelineContext))
29
+
30
+ # If there is no information about key, then we consider absence of the source.
31
+ if self.__src_key is None:
32
+ return None
33
+
34
+ # Extracting actual source.
35
+ src_data = src_ctx.provide(self.__src_key if force_key is None else force_key)
36
+ if self._src_func is not None and call_func:
37
+ src_data = self._src_func(src_data)
38
+
39
+ return src_data
40
+
5
41
  def apply_core(self, input_data, pipeline_ctx):
6
- raise NotImplementedError()
42
+ """By default we do nothing."""
43
+ pass
7
44
 
8
45
  def apply(self, input_data, pipeline_ctx=None):
9
46
  """ Performs input processing an update it for a further pipeline items.
@@ -5,10 +5,14 @@ class FlattenIterPipelineItem(BasePipelineItem):
5
5
  """ Considered to flat iterations of items that represent iterations.
6
6
  """
7
7
 
8
+ def __init__(self, **kwargs):
9
+ super(FlattenIterPipelineItem, self).__init__(**kwargs)
10
+ pass
11
+
8
12
  def __flat_iter(self, iter_data):
9
13
  for iter_item in iter_data:
10
14
  for item in iter_item:
11
15
  yield item
12
16
 
13
17
  def apply_core(self, input_data, pipeline_ctx):
14
- return self.__flat_iter(input_data)
18
+ return self.__flat_iter(input_data)
@@ -3,8 +3,9 @@ from arekit.common.pipeline.items.base import BasePipelineItem
3
3
 
4
4
  class HandleIterPipelineItem(BasePipelineItem):
5
5
 
6
- def __init__(self, handle_func=None):
6
+ def __init__(self, handle_func=None, **kwargs):
7
7
  assert(callable(handle_func))
8
+ super(HandleIterPipelineItem, self).__init__(**kwargs)
8
9
  self.__handle_func = handle_func
9
10
 
10
11
  def __updated_data(self, items_iter):
@@ -3,8 +3,9 @@ from arekit.common.pipeline.items.base import BasePipelineItem
3
3
 
4
4
  class FilterPipelineItem(BasePipelineItem):
5
5
 
6
- def __init__(self, filter_func=None):
6
+ def __init__(self, filter_func=None, **kwargs):
7
7
  assert(callable(filter_func))
8
+ super(FilterPipelineItem, self).__init__(**kwargs)
8
9
  self.__filter_func = filter_func
9
10
 
10
11
  def apply_core(self, input_data, pipeline_ctx):
@@ -3,8 +3,9 @@ from arekit.common.pipeline.items.base import BasePipelineItem
3
3
 
4
4
  class MapPipelineItem(BasePipelineItem):
5
5
 
6
- def __init__(self, map_func=None):
6
+ def __init__(self, map_func=None, **kwargs):
7
7
  assert(callable(map_func))
8
+ super(MapPipelineItem, self).__init__(**kwargs)
8
9
  self._map_func = map_func
9
10
 
10
11
  def apply_core(self, input_data, pipeline_ctx):
@@ -9,5 +9,9 @@ class MapNestedPipelineItem(MapPipelineItem):
9
9
  suppose to be mapped with the passed pipeline context.
10
10
  """
11
11
 
12
+ def __init__(self, **kwargs):
13
+ super(MapNestedPipelineItem, self).__init__(**kwargs)
14
+ pass
15
+
12
16
  def apply_core(self, input_data, pipeline_ctx):
13
17
  return map(lambda item: self._map_func(item, pipeline_ctx), input_data)
@@ -0,0 +1,32 @@
1
+ class BatchIterator:
2
+
3
+ def __init__(self, data_iter, batch_size, end_value=None):
4
+ assert(isinstance(batch_size, int) and batch_size > 0)
5
+ assert(callable(end_value) or end_value is None)
6
+ self.__data_iter = data_iter
7
+ self.__index = 0
8
+ self.__batch_size = batch_size
9
+ self.__end_value = end_value
10
+
11
+ def __iter__(self):
12
+ return self
13
+
14
+ def __next__(self):
15
+ buffer = []
16
+ while True:
17
+ try:
18
+ data = next(self.__data_iter)
19
+ except StopIteration:
20
+ break
21
+ buffer.append(data)
22
+ if len(buffer) == self.__batch_size:
23
+ break
24
+
25
+ if len(buffer) > 0:
26
+ self.__index += 1
27
+ return buffer
28
+
29
+ if self.__end_value is None:
30
+ raise StopIteration
31
+ else:
32
+ return self.__end_value()
@@ -1,28 +1,34 @@
1
1
  from collections.abc import Iterable
2
2
 
3
3
  from arekit.common.bound import Bound
4
- from arekit.common.text.partitioning.base import BasePartitioning
5
4
 
6
5
 
7
- class StringPartitioning(BasePartitioning):
6
+ class Partitioning(object):
8
7
  """ NOTE: considering that provided parts
9
8
  has no intersections between each other
10
9
  """
11
10
 
11
+ list_reg_types = {
12
+ "str": lambda p, item: p.append(item),
13
+ "list": lambda p, item: p.extend(item)
14
+ }
15
+
16
+ def __init__(self, text_fmt):
17
+ assert(isinstance(text_fmt, str) and text_fmt in self.list_reg_types)
18
+ self.__reg_part = self.list_reg_types[text_fmt]
19
+
12
20
  def provide(self, text, parts_it):
13
- assert(isinstance(text, str))
14
21
  assert(isinstance(parts_it, Iterable))
15
22
 
16
- start = 0
17
23
  parts = []
24
+ start = 0
25
+
18
26
  for value, bound in parts_it:
19
27
  assert(isinstance(bound, Bound))
20
28
  assert(bound.Position >= start)
21
29
 
22
30
  # Release everything till the current value position.
23
- part = text[start:bound.Position]
24
-
25
- parts.append(part)
31
+ self.__reg_part(p=parts, item=text[start:bound.Position])
26
32
 
27
33
  # Release the entity value.
28
34
  parts.extend([value])
@@ -30,7 +36,6 @@ class StringPartitioning(BasePartitioning):
30
36
  start = bound.Position + bound.Length
31
37
 
32
38
  # Release everything after the last entity.
33
- last_part = text[start:len(text)]
34
- parts.extend([last_part])
39
+ self.__reg_part(p=parts, item=text[start:len(text)])
35
40
 
36
41
  return parts
arekit/common/utils.py CHANGED
@@ -1,6 +1,4 @@
1
- import sys
2
1
  import os
3
- import requests
4
2
  from tqdm import tqdm
5
3
 
6
4
 
@@ -28,14 +26,14 @@ def split_by_whitespaces(text):
28
26
  return text.split()
29
27
 
30
28
 
31
- def progress_bar(iterable, total, desc="", unit="it"):
29
+ def progress_bar(iterable, total, desc="", unit="it", file=None, disable=False):
32
30
  if total is not None:
33
- return progress_bar_defined(iterable=iterable, total=total, desc=desc, unit=unit)
31
+ return progress_bar_defined(iterable=iterable, total=total, desc=desc, unit=unit, file=file, disable=disable)
34
32
  else:
35
- return progress_bar_iter(iterable=iterable, desc=desc, unit=unit)
33
+ return progress_bar_iter(iterable=iterable, desc=desc, unit=unit, file=file, disable=disable)
36
34
 
37
35
 
38
- def progress_bar_conditional(iterable, condition_func, total, postfix_func=None, desc="", unit="it"):
36
+ def progress_bar_conditional(iterable, condition_func, total, postfix_func=None, desc="", unit="it", file=None):
39
37
  """ This progress-bar updates only on the
40
38
  specific conditions during the iteration process.
41
39
  """
@@ -48,7 +46,7 @@ def progress_bar_conditional(iterable, condition_func, total, postfix_func=None,
48
46
  yield 0
49
47
 
50
48
  pbar_it = progress_bar(iterable=__iter_infinite_placeholder(),
51
- desc=desc, unit=unit, total=total)
49
+ desc=desc, unit=unit, total=total, file=file)
52
50
  element = iter(pbar_it)
53
51
 
54
52
  # Initialize with 0.
@@ -66,7 +64,7 @@ def progress_bar_conditional(iterable, condition_func, total, postfix_func=None,
66
64
  pbar_it.set_postfix(postfix_func(item))
67
65
 
68
66
 
69
- def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it"):
67
+ def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it", file=None, disable=False):
70
68
  return tqdm(iterable=iterable,
71
69
  total=total,
72
70
  desc=desc,
@@ -74,56 +72,17 @@ def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it"):
74
72
  position=0,
75
73
  leave=True,
76
74
  unit=unit,
75
+ file=file,
76
+ disable=disable,
77
77
  miniters=total / miniters if total is not None else total)
78
78
 
79
79
 
80
- def progress_bar_iter(iterable, desc="", unit='it'):
80
+ def progress_bar_iter(iterable, desc="", unit='it', file=None, disable=False):
81
81
  return tqdm(iterable=iterable,
82
82
  desc=desc,
83
83
  position=0,
84
84
  leave=True,
85
85
  ncols=120,
86
+ file=file,
87
+ disable=disable,
86
88
  unit=unit)
87
-
88
-
89
- def get_default_download_dir():
90
- """ Refered to NLTK toolkit approach
91
- https://github.com/nltk/nltk/blob/8e771679cee1b4a9540633cc3ea17f4421ffd6c0/nltk/downloader.py#L1051
92
- """
93
-
94
- # On Windows, use %APPDATA%
95
- if sys.platform == "win32" and "APPDATA" in os.environ:
96
- homedir = os.environ["APPDATA"]
97
-
98
- # Otherwise, install in the user's home directory.
99
- else:
100
- homedir = os.path.expanduser("~/")
101
- if homedir == "~/":
102
- raise ValueError("Could not find a default download directory")
103
-
104
- return os.path.join(homedir, ".arekit")
105
-
106
-
107
- def download(dest_file_path, source_url):
108
- """ Refered to https://github.com/nicolay-r/ner-bilstm-crf-tensorflow/blob/master/ner/utils.py
109
- Simple http file downloader
110
- """
111
- print(('Downloading from {src} to {dest}'.format(src=source_url, dest=dest_file_path)))
112
-
113
- sys.stdout.flush()
114
- datapath = os.path.dirname(dest_file_path)
115
-
116
- if not os.path.exists(datapath):
117
- os.makedirs(datapath, mode=0o755)
118
-
119
- dest_file_path = os.path.abspath(dest_file_path)
120
-
121
- r = requests.get(source_url, stream=True)
122
- total_length = int(r.headers.get('content-length', 0))
123
-
124
- with open(dest_file_path, 'wb') as f:
125
- pbar = tqdm(total=total_length, unit='B', unit_scale=True)
126
- for chunk in r.iter_content(chunk_size=32 * 1024):
127
- if chunk: # filter out keep-alive new chunks
128
- pbar.update(len(chunk))
129
- f.write(chunk)
@@ -2,7 +2,8 @@ from arekit.common.data.input.providers.const import IDLE_MODE
2
2
  from arekit.common.data.input.providers.contents import ContentsProvider
3
3
  from arekit.common.linkage.base import LinkedDataWrapper
4
4
  from arekit.common.linkage.text_opinions import TextOpinionsLinkage
5
- from arekit.common.pipeline.base import BasePipeline
5
+ from arekit.common.pipeline.base import BasePipelineLauncher
6
+ from arekit.common.pipeline.context import PipelineContext
6
7
  from arekit.common.text_opinions.base import TextOpinion
7
8
 
8
9
 
@@ -13,7 +14,7 @@ class InputTextOpinionProvider(ContentsProvider):
13
14
  results in a TextOpinionLinkage instances.
14
15
  pipeline: id -> ... -> TextOpinionLinkage[]
15
16
  """
16
- assert(isinstance(pipeline, BasePipeline))
17
+ assert(isinstance(pipeline, list))
17
18
  self.__pipeline = pipeline
18
19
  self.__current_id = None
19
20
 
@@ -30,7 +31,16 @@ class InputTextOpinionProvider(ContentsProvider):
30
31
 
31
32
  def from_doc_ids(self, doc_ids, idle_mode=False):
32
33
  self.__current_id = 0
33
- for linkage in self.__pipeline.run(doc_ids, params_dict={IDLE_MODE: idle_mode}):
34
+
35
+ ctx = PipelineContext(d={
36
+ "result": doc_ids,
37
+ IDLE_MODE: idle_mode
38
+ })
39
+
40
+ # Launching pipeline with the passed context
41
+ BasePipelineLauncher.run(pipeline=self.__pipeline, pipeline_ctx=ctx)
42
+
43
+ for linkage in ctx.provide("result"):
34
44
  assert(isinstance(linkage, LinkedDataWrapper))
35
45
  if isinstance(linkage, TextOpinionsLinkage):
36
46
  self.__assign_ids(linkage)
@@ -5,8 +5,9 @@ from arekit.common.data.storages.base import BaseRowsStorage
5
5
 
6
6
  class JsonlBasedRowsStorage(BaseRowsStorage):
7
7
 
8
- def __init__(self, rows):
8
+ def __init__(self, rows, **kwargs):
9
9
  assert(isinstance(rows, list))
10
+ super(JsonlBasedRowsStorage, self).__init__(**kwargs)
10
11
  self.__rows = rows
11
12
 
12
13
  def _iter_rows(self):
@@ -12,7 +12,8 @@ class PandasBasedRowsStorage(BaseRowsStorage):
12
12
  based on the pandas DataFrames.
13
13
  """
14
14
 
15
- def __init__(self, df=None):
15
+ def __init__(self, df=None, **kwargs):
16
+ super(PandasBasedRowsStorage, self).__init__(**kwargs)
16
17
  self._df = df
17
18
 
18
19
  @property
@@ -96,26 +97,10 @@ class PandasBasedRowsStorage(BaseRowsStorage):
96
97
  def get_row(self, row_index):
97
98
  return self._df.iloc[row_index]
98
99
 
99
- def get_cell(self, row_index, column_name):
100
- return self._df.iloc[row_index][column_name]
101
-
102
- def iter_column_values(self, column_name, dtype=None):
103
- values = self._df[column_name]
104
- if dtype is None:
105
- return values
106
- return values.astype(dtype)
107
-
108
- def find_by_value(self, column_name, value):
109
- return self.__filter(column_name=column_name, value=value)
110
-
111
100
  def init_empty(self, columns_provider):
112
101
  cols_with_types = columns_provider.get_columns_list_with_types()
113
102
  self._df = self.__create_empty(cols_with_types)
114
103
 
115
- def iter_shuffled(self):
116
- shuffled_df = self._df.sample(frac=1)
117
- return self.__iter_rows_core(shuffled_df)
118
-
119
104
  def free(self):
120
105
  del self._df
121
106
  super(PandasBasedRowsStorage, self).free()
@@ -6,13 +6,14 @@ class RowCacheStorage(BaseRowsStorage):
6
6
  """ Row Caching storage kernel, based on python dictionary.
7
7
  """
8
8
 
9
- def __init__(self, force_collect_columns=None):
9
+ def __init__(self, force_collect_columns=None, **kwargs):
10
10
  """ This is a particular/related solution for the following issue:
11
11
  https://github.com/nicolay-r/AREkit/issues/464
12
12
  force_collect_columns: list
13
13
  columns that supposed to be additionally considered in output.
14
14
  """
15
15
  assert(isinstance(force_collect_columns, list) or force_collect_columns is None)
16
+ super(RowCacheStorage, self).__init__(**kwargs)
16
17
  self.__f = None
17
18
  self.__row_cache = {}
18
19
  self.__column_names = []
@@ -33,7 +34,12 @@ class RowCacheStorage(BaseRowsStorage):
33
34
 
34
35
  # Expand with columns that are forced to be provided.
35
36
  existed_set = set(self.__column_names)
36
- self.__column_names += [c for c in self.__force_collect_columns if c not in existed_set]
37
+
38
+ # Calculate extension: columns that were not mentioned in column names list.
39
+ extension = [c for c in self.__force_collect_columns if c not in existed_set]
40
+
41
+ self.__column_names += extension
42
+ self.__column_types += [str] * len(extension)
37
43
 
38
44
  def iter_column_names(self):
39
45
  return iter(self.__column_names)
@@ -0,0 +1,18 @@
1
+ import sqlite3
2
+ from arekit.common.data.storages.base import BaseRowsStorage
3
+
4
+
5
+ class SQliteBasedRowsStorage(BaseRowsStorage):
6
+
7
+ def __init__(self, path, table_name, **kwargs):
8
+ super(SQliteBasedRowsStorage, self).__init__(**kwargs)
9
+ self.__path = path
10
+ self.__table_name = table_name
11
+ self.__conn = None
12
+
13
+ def _iter_rows(self):
14
+ with sqlite3.connect(self.__path) as conn:
15
+ cursor = conn.execute(f"select * from {self.__table_name}")
16
+ for row_index, row in enumerate(cursor.fetchall()):
17
+ row_dict = {cursor.description[i][0]: value for i, value in enumerate(row)}
18
+ yield row_index, row_dict