arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. arekit/common/data/storages/base.py +4 -15
  2. arekit/common/docs/entities_grouping.py +2 -1
  3. arekit/common/docs/parser.py +27 -22
  4. arekit/common/pipeline/base.py +12 -16
  5. arekit/common/pipeline/batching.py +28 -0
  6. arekit/common/pipeline/context.py +5 -1
  7. arekit/common/pipeline/items/base.py +39 -2
  8. arekit/common/pipeline/items/flatten.py +5 -1
  9. arekit/common/pipeline/items/handle.py +2 -1
  10. arekit/common/pipeline/items/iter.py +2 -1
  11. arekit/common/pipeline/items/map.py +2 -1
  12. arekit/common/pipeline/items/map_nested.py +4 -0
  13. arekit/common/pipeline/utils.py +32 -0
  14. arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
  15. arekit/common/utils.py +11 -52
  16. arekit/contrib/utils/data/contents/opinions.py +13 -3
  17. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  18. arekit/contrib/utils/data/storages/pandas_based.py +2 -17
  19. arekit/contrib/utils/data/storages/row_cache.py +8 -2
  20. arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
  21. arekit/contrib/utils/io_utils/utils.py +1 -18
  22. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  23. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  24. arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
  25. arekit-0.25.1.data/data/logo.png +0 -0
  26. arekit-0.25.1.dist-info/METADATA +81 -0
  27. arekit-0.25.1.dist-info/RECORD +186 -0
  28. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
  29. arekit/common/data/input/repositories/__init__.py +0 -0
  30. arekit/common/data/input/repositories/base.py +0 -68
  31. arekit/common/data/input/repositories/sample.py +0 -22
  32. arekit/common/data/views/__init__.py +0 -0
  33. arekit/common/data/views/samples.py +0 -26
  34. arekit/common/docs/objects_parser.py +0 -37
  35. arekit/common/text/parser.py +0 -12
  36. arekit/common/text/partitioning/__init__.py +0 -0
  37. arekit/common/text/partitioning/base.py +0 -4
  38. arekit/common/text/partitioning/terms.py +0 -35
  39. arekit/contrib/networks/__init__.py +0 -0
  40. arekit/contrib/networks/embedding.py +0 -149
  41. arekit/contrib/networks/embedding_io.py +0 -18
  42. arekit/contrib/networks/input/__init__.py +0 -0
  43. arekit/contrib/networks/input/const.py +0 -6
  44. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  45. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  46. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  47. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  48. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  49. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  50. arekit/contrib/networks/input/providers/__init__.py +0 -0
  51. arekit/contrib/networks/input/providers/sample.py +0 -129
  52. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  53. arekit/contrib/networks/input/providers/text.py +0 -24
  54. arekit/contrib/networks/input/rows_parser.py +0 -47
  55. arekit/contrib/networks/input/term_types.py +0 -13
  56. arekit/contrib/networks/input/terms_mapping.py +0 -60
  57. arekit/contrib/networks/vectorizer.py +0 -6
  58. arekit/contrib/source/__init__.py +0 -0
  59. arekit/contrib/source/brat/__init__.py +0 -0
  60. arekit/contrib/source/brat/annot.py +0 -84
  61. arekit/contrib/source/brat/doc.py +0 -28
  62. arekit/contrib/source/brat/entities/__init__.py +0 -0
  63. arekit/contrib/source/brat/entities/compound.py +0 -13
  64. arekit/contrib/source/brat/entities/entity.py +0 -42
  65. arekit/contrib/source/brat/entities/parser.py +0 -53
  66. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  67. arekit/contrib/source/brat/opinions/converter.py +0 -19
  68. arekit/contrib/source/brat/relation.py +0 -32
  69. arekit/contrib/source/brat/sentence.py +0 -69
  70. arekit/contrib/source/brat/sentences_reader.py +0 -128
  71. arekit/contrib/source/download.py +0 -41
  72. arekit/contrib/source/nerel/__init__.py +0 -0
  73. arekit/contrib/source/nerel/entities.py +0 -55
  74. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  75. arekit/contrib/source/nerel/folding/fixed.py +0 -74
  76. arekit/contrib/source/nerel/io_utils.py +0 -62
  77. arekit/contrib/source/nerel/labels.py +0 -241
  78. arekit/contrib/source/nerel/reader.py +0 -46
  79. arekit/contrib/source/nerel/utils.py +0 -24
  80. arekit/contrib/source/nerel/versions.py +0 -12
  81. arekit/contrib/source/nerelbio/__init__.py +0 -0
  82. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  83. arekit/contrib/source/nerelbio/labels.py +0 -265
  84. arekit/contrib/source/nerelbio/reader.py +0 -8
  85. arekit/contrib/source/nerelbio/versions.py +0 -8
  86. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  87. arekit/contrib/source/ruattitudes/collection.py +0 -36
  88. arekit/contrib/source/ruattitudes/doc.py +0 -51
  89. arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
  90. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  91. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  92. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  93. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  94. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  95. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  96. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  97. arekit/contrib/source/ruattitudes/reader.py +0 -268
  98. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  99. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  100. arekit/contrib/source/ruattitudes/text_object.py +0 -59
  101. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  102. arekit/contrib/source/rusentiframes/collection.py +0 -157
  103. arekit/contrib/source/rusentiframes/effect.py +0 -24
  104. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  105. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  106. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  107. arekit/contrib/source/rusentiframes/role.py +0 -15
  108. arekit/contrib/source/rusentiframes/state.py +0 -24
  109. arekit/contrib/source/rusentiframes/types.py +0 -42
  110. arekit/contrib/source/rusentiframes/value.py +0 -2
  111. arekit/contrib/source/rusentrel/__init__.py +0 -0
  112. arekit/contrib/source/rusentrel/const.py +0 -3
  113. arekit/contrib/source/rusentrel/docs_reader.py +0 -51
  114. arekit/contrib/source/rusentrel/entities.py +0 -26
  115. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  116. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  117. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  118. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  119. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  120. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  121. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  122. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  123. arekit/contrib/source/sentinerel/__init__.py +0 -0
  124. arekit/contrib/source/sentinerel/entities.py +0 -52
  125. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  126. arekit/contrib/source/sentinerel/folding/factory.py +0 -31
  127. arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
  128. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  129. arekit/contrib/source/sentinerel/labels.py +0 -53
  130. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  131. arekit/contrib/source/sentinerel/reader.py +0 -42
  132. arekit/contrib/source/synonyms/__init__.py +0 -0
  133. arekit/contrib/source/synonyms/utils.py +0 -19
  134. arekit/contrib/source/zip_utils.py +0 -47
  135. arekit/contrib/utils/connotations/__init__.py +0 -0
  136. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  137. arekit/contrib/utils/data/readers/__init__.py +0 -0
  138. arekit/contrib/utils/data/readers/base.py +0 -7
  139. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  140. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  141. arekit/contrib/utils/data/service/__init__.py +0 -0
  142. arekit/contrib/utils/data/service/balance.py +0 -50
  143. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  144. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  145. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  146. arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
  147. arekit/contrib/utils/download.py +0 -77
  148. arekit/contrib/utils/embeddings/__init__.py +0 -0
  149. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  150. arekit/contrib/utils/embeddings/tokens.py +0 -30
  151. arekit/contrib/utils/io_utils/embedding.py +0 -72
  152. arekit/contrib/utils/io_utils/opinions.py +0 -37
  153. arekit/contrib/utils/io_utils/samples.py +0 -79
  154. arekit/contrib/utils/lexicons/__init__.py +0 -0
  155. arekit/contrib/utils/lexicons/lexicon.py +0 -41
  156. arekit/contrib/utils/lexicons/relation.py +0 -42
  157. arekit/contrib/utils/lexicons/rusentilex.py +0 -37
  158. arekit/contrib/utils/nn/__init__.py +0 -0
  159. arekit/contrib/utils/nn/rows.py +0 -83
  160. arekit/contrib/utils/np_utils/__init__.py +0 -0
  161. arekit/contrib/utils/np_utils/embedding.py +0 -22
  162. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  163. arekit/contrib/utils/np_utils/vocab.py +0 -20
  164. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  165. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
  166. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
  167. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  168. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
  169. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  170. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
  171. arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
  172. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  173. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  174. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
  175. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
  176. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  177. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  178. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
  179. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
  180. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  181. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  182. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
  183. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
  184. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
  185. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  186. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
  187. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
  188. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  189. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
  190. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  191. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
  192. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  193. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  194. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  195. arekit/contrib/utils/processing/languages/mods.py +0 -12
  196. arekit/contrib/utils/processing/languages/pos.py +0 -23
  197. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  198. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  199. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  200. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  201. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  202. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  203. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  204. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  205. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  206. arekit/contrib/utils/processing/pos/base.py +0 -12
  207. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  208. arekit/contrib/utils/processing/pos/russian.py +0 -10
  209. arekit/contrib/utils/processing/text/__init__.py +0 -0
  210. arekit/contrib/utils/processing/text/tokens.py +0 -127
  211. arekit/contrib/utils/resources.py +0 -25
  212. arekit/contrib/utils/serializer.py +0 -43
  213. arekit/contrib/utils/sources/__init__.py +0 -0
  214. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  215. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  216. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  217. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  218. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  219. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  220. arekit/download_data.py +0 -11
  221. arekit-0.24.0.dist-info/METADATA +0 -23
  222. arekit-0.24.0.dist-info/RECORD +0 -374
  223. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
  224. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,12 @@
1
1
  from collections.abc import Iterable
2
2
  import logging
3
- from os.path import join, exists
4
-
5
- from arekit.common.experiment.data_type import DataType
3
+ from os.path import exists
6
4
 
7
5
 
8
6
  logger = logging.getLogger(__name__)
9
7
  logging.basicConfig(level=logging.INFO)
10
8
 
11
9
 
12
- def join_dir_with_subfolder_name(subfolder_name, dir):
13
- """ Returns subfolder in in directory
14
- """
15
- assert(isinstance(subfolder_name, str))
16
- assert(isinstance(dir, str))
17
-
18
- target_dir = join(dir, "{}/".format(subfolder_name))
19
- return target_dir
20
-
21
-
22
- def filename_template(data_type):
23
- assert(isinstance(data_type, DataType))
24
- return "{data_type}-0".format(data_type=data_type.name.lower())
25
-
26
-
27
10
  def check_targets_existence(targets):
28
11
  assert (isinstance(targets, Iterable))
29
12
 
@@ -4,8 +4,8 @@ from arekit.common.pipeline.items.base import BasePipelineItem
4
4
 
5
5
  class TextEntitiesParser(BasePipelineItem):
6
6
 
7
- def __init__(self):
8
- super(TextEntitiesParser, self).__init__()
7
+ def __init__(self, **kwargs):
8
+ super(TextEntitiesParser, self).__init__(**kwargs)
9
9
 
10
10
  @staticmethod
11
11
  def __process_word(word):
@@ -6,11 +6,10 @@ from arekit.common.pipeline.items.base import BasePipelineItem
6
6
 
7
7
  class FrameVariantsParser(BasePipelineItem):
8
8
 
9
- def __init__(self, frame_variants):
9
+ def __init__(self, frame_variants, **kwargs):
10
10
  assert(isinstance(frame_variants, FrameVariantsCollection))
11
11
  assert(len(frame_variants) > 0)
12
-
13
- super(FrameVariantsParser, self).__init__()
12
+ super(FrameVariantsParser, self).__init__(**kwargs)
14
13
 
15
14
  self.__frame_variants = frame_variants
16
15
  self.__max_variant_len = max([len(variant) for _, variant in frame_variants.iter_variants()])
@@ -3,12 +3,10 @@ from arekit.common.linkage.text_opinions import TextOpinionsLinkage
3
3
  from arekit.common.docs.parsed.base import ParsedDocument
4
4
  from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider
5
5
  from arekit.common.docs.parsed.service import ParsedDocumentService
6
- from arekit.common.docs.parser import DocumentParser
7
- from arekit.common.pipeline.base import BasePipeline
6
+ from arekit.common.docs.parser import DocumentParsers
8
7
  from arekit.common.pipeline.items.flatten import FlattenIterPipelineItem
9
8
  from arekit.common.pipeline.items.map import MapPipelineItem
10
9
  from arekit.common.pipeline.items.map_nested import MapNestedPipelineItem
11
- from arekit.common.text.parser import BaseTextParser
12
10
  from arekit.common.text_opinions.base import TextOpinion
13
11
  from arekit.contrib.utils.pipelines.text_opinion.filters.base import TextOpinionFilter
14
12
  from arekit.contrib.utils.pipelines.text_opinion.filters.limitation import FrameworkLimitationsTextOpinionFilter
@@ -17,7 +15,7 @@ from arekit.contrib.utils.pipelines.text_opinion.filters.limitation import Frame
17
15
  def __iter_text_opinion_linkages(parsed_doc, annotators, entity_index_func,
18
16
  text_opinion_filters, use_meta):
19
17
  """ use_meta: bool
20
- this is mainly for tqdm and other console parameters to stay up-to-date
18
+ this is mainly for the progress-bar and other console parameters to stay up-to-date
21
19
  with the state in the case we do not have that much output results
22
20
  across multiple amount of documents.
23
21
  """
@@ -64,24 +62,24 @@ def __iter_text_opinion_linkages(parsed_doc, annotators, entity_index_func,
64
62
  yield MetaEmptyLinkedDataWrapper(doc_id=parsed_doc.RelatedDocID)
65
63
 
66
64
 
67
- def text_opinion_extraction_pipeline(text_parser, get_doc_by_id_func, annotators, entity_index_func,
65
+ def text_opinion_extraction_pipeline(pipeline_items, get_doc_by_id_func, annotators, entity_index_func, batch_size,
68
66
  text_opinion_filters=None, use_meta_between_docs=True):
69
- assert(isinstance(text_parser, BaseTextParser))
70
67
  assert(callable(get_doc_by_id_func))
71
68
  assert(isinstance(annotators, list))
72
69
  assert(isinstance(text_opinion_filters, list) or text_opinion_filters is None)
73
70
  assert(isinstance(use_meta_between_docs, bool))
71
+ assert(isinstance(batch_size, int) and batch_size > 0)
74
72
 
75
73
  extra_filters = [] if text_opinion_filters is None else text_opinion_filters
76
74
  actual_text_opinion_filters = [FrameworkLimitationsTextOpinionFilter()] + extra_filters
77
75
 
78
- return BasePipeline([
76
+ return [
79
77
  # (doc_id) -> (doc)
80
78
  MapPipelineItem(map_func=lambda doc_id: get_doc_by_id_func(doc_id)),
81
79
 
82
80
  # (doc, ppl_ctx) -> (parsed_doc)
83
- MapNestedPipelineItem(map_func=lambda doc, ppl_ctx: DocumentParser.parse(
84
- doc=doc, text_parser=text_parser, parent_ppl_ctx=ppl_ctx)),
81
+ MapNestedPipelineItem(map_func=lambda doc, ppl_ctx: DocumentParsers.parse_batch(
82
+ doc=doc, pipeline_items=pipeline_items, parent_ppl_ctx=ppl_ctx, batch_size=batch_size)),
85
83
 
86
84
  # (parsed_doc) -> (text_opinions)
87
85
  MapPipelineItem(map_func=lambda parsed_doc: __iter_text_opinion_linkages(
@@ -90,4 +88,4 @@ def text_opinion_extraction_pipeline(text_parser, get_doc_by_id_func, annotators
90
88
 
91
89
  # linkages[] -> linkages
92
90
  FlattenIterPipelineItem()
93
- ])
91
+ ]
Binary file
@@ -0,0 +1,81 @@
1
+ Metadata-Version: 2.1
2
+ Name: arekit
3
+ Version: 0.25.1
4
+ Summary: Document level Attitude and Relation Extraction toolkit (AREkit) for sampling and prompting mass-media news into datasets for ML-model training
5
+ Home-page: https://github.com/nicolay-r/AREkit
6
+ Author: Nicolay Rusnachenko
7
+ Author-email: rusnicolay@gmail.com
8
+ License: MIT License
9
+ Keywords: natural language processing,relation extraction,sentiment analysis
10
+ Classifier: Programming Language :: Python
11
+ Classifier: Programming Language :: Python :: 3.6
12
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
13
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
14
+ Classifier: Topic :: Text Processing :: Linguistic
15
+ Requires-Python: >=3.6
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: tqdm
19
+ Requires-Dist: enum34==1.1.10
20
+ Requires-Dist: numpy>=1.14.5
21
+
22
+ # AREkit 0.25.1
23
+
24
+ ![](https://img.shields.io/badge/Python-3.9+-brightgreen.svg)
25
+
26
+ <p align="center">
27
+ <img src="logo.png"/>
28
+ </p>
29
+
30
+ **AREkit** (Attitude and Relation Extraction Toolkit) --
31
+ is a Python toolkit devoted to document-level Attitude and Relation Extraction between text objects from mass-media news.
32
+
33
+ ## Description
34
+
35
+
36
+ This toolkit aims at memory-efficient data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
37
+
38
+ <p align="center">
39
+ <img src="docs/arekit-pipeline-concept.png"/>
40
+ </p>
41
+
42
+ > Figure: AREkit pipelines design. More on
43
+ > **[ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction](https://link.springer.com/chapter/10.1007/978-3-031-56069-9_23)** paper
44
+
45
+ In particular, this framework provides the following features:
46
+ * ➿ [pipelines](https://github.com/nicolay-r/AREkit/wiki/Pipelines:-Text-Opinion-Annotation) and iterators for handling large-scale collections serialization without out-of-memory issues.
47
+ * 🔗 EL (entity-linking) API support for objects,
48
+ * ➰ avoidance of cyclic connections,
49
+ * :straight_ruler: distance consideration between relation participants (in `terms` or `sentences`),
50
+ * 📑 relations annotations and filtering rules,
51
+ * *️⃣ entities formatting or masking, and more.
52
+
53
+ The core functionality includes:
54
+ * API for document presentation with EL (Entity Linking, i.e. Object Synonymy) support
55
+ for sentence level relations preparation (dubbed as contexts);
56
+ * API for contexts extraction;
57
+ * Relations transferring from sentence-level onto document-level, and more.
58
+
59
+ ## Installation
60
+
61
+ ```bash
62
+ pip install git+https://github.com/nicolay-r/AREkit.git@0.25.1-rc
63
+ ```
64
+
65
+ ## Usage
66
+
67
+ Please follow the **[tutorial section on project Wiki](https://github.com/nicolay-r/AREkit/wiki/Tutorials)** for more details.
68
+
69
+ ## How to cite
70
+ Great research is also accompanied by faithful references.
71
+ If you use or extend our work, please cite as follows:
72
+
73
+ ```bibtex
74
+ @inproceedings{rusnachenko2024arelight,
75
+ title={ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction},
76
+ author={Rusnachenko, Nicolay and Liang, Huizhi and Kolomeets, Maxim and Shi, Lei},
77
+ booktitle={European Conference on Information Retrieval},
78
+ year={2024},
79
+ organization={Springer}
80
+ }
81
+ ```
@@ -0,0 +1,186 @@
1
+ arekit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ arekit/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ arekit/common/bound.py,sha256=lPpHY6ct_CU9e4qXeYjhJfWbTj6Sb_NVtZ1CJheQPNE,1402
4
+ arekit/common/log_utils.py,sha256=OfEQxbExkuRAl9dxlgFEqcFhI4HHoMYT7WE8ud0IPOM,924
5
+ arekit/common/utils.py,sha256=N061ENJJgvsB338Q9cixc6RWyuikSPQq4Tc8mmgwy9s,2659
6
+ arekit/common/context/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ arekit/common/context/terms_mapper.py,sha256=QA02Cv7D2JKTlXkez_0w0J8HuvNziNF2vrqLgy4Bwc8,1447
8
+ arekit/common/context/token.py,sha256=CpWAlvprUnJfCtYvO8lwdfU_ofSKAOGOudXTwppyzSk,459
9
+ arekit/common/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ arekit/common/data/const.py,sha256=J74zim3CGJlLJp-AVn5z9TTuBfmttjiM_8sRW1Pc-iE,457
11
+ arekit/common/data/doc_provider.py,sha256=KU6Q2-B8_cUuFhSBHYp-cDI8OCwFk3fwOahv2QLIR2c,149
12
+ arekit/common/data/rows_fmt.py,sha256=klq9HdzSnhbRBhOw7O4ctp3PZ5L6ZVy-0eIV2vLLYY8,2694
13
+ arekit/common/data/rows_parser.py,sha256=qYSEETvhX_0_JuAqm0bjK_V28_53qq7OY9JAnBdRC78,1513
14
+ arekit/common/data/input/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ arekit/common/data/input/sample.py,sha256=6JeGxsLbEUXVKPWA1hIlkTDNOaYg4bHCJWw0ULrLByg,2143
16
+ arekit/common/data/input/terms_mapper.py,sha256=DUOMbGwiQETY7qhztoU8uU30d1cQPsIsgNLldpjcufg,3197
17
+ arekit/common/data/input/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ arekit/common/data/input/providers/const.py,sha256=GDvPkgP7hllHW3QiueMBQgQyu2CtNFI4JYNNja2Im6Q,187
19
+ arekit/common/data/input/providers/contents.py,sha256=jT1LJE_5Igw5H2e1jKsWWciHSbPVg649phT177SzhEA,261
20
+ arekit/common/data/input/providers/columns/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
+ arekit/common/data/input/providers/columns/base.py,sha256=Ar4GkC1L8YFMgeVNM-pIkOOUvKqf2CgIIdh5DA0V8uI,225
22
+ arekit/common/data/input/providers/columns/sample.py,sha256=3onDT6LGkFwU3GOAm6M1MvgjD3fEgapTslAV6-9gvIE,1756
23
+ arekit/common/data/input/providers/instances/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
+ arekit/common/data/input/providers/instances/base.py,sha256=ybaHQNRpuebdHGU937yzkgZ0E7mO-S7Dm41NwFj44ew,420
25
+ arekit/common/data/input/providers/instances/multiple.py,sha256=6agaTA3srLiLEhBTU0RnD01GUFqMcsITV5NjVkUgR10,1144
26
+ arekit/common/data/input/providers/instances/single.py,sha256=bZKIn_Kw79c8pH1a3aUq1dmOsDu__BoFwQDLGjEtg5I,253
27
+ arekit/common/data/input/providers/label/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
+ arekit/common/data/input/providers/label/base.py,sha256=1MOCKw_OP_IbYT5OR5C3b9VZdYnLGg-TxPc_qHpuZJs,620
29
+ arekit/common/data/input/providers/label/binary.py,sha256=jPD6Jn8DYMrdI3jN8ueoWvuGMouUKbelmI07sP9Wau4,337
30
+ arekit/common/data/input/providers/label/multiple.py,sha256=HWbHF_CwwbiLQbYm5dgvnXAm0b6tJOyFYFEUBxuWAqI,492
31
+ arekit/common/data/input/providers/rows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
+ arekit/common/data/input/providers/rows/base.py,sha256=syH7ZEW3Agwfb1IR0G7n_Amy3Kkg0EZk2V7kH3r7ADg,2517
33
+ arekit/common/data/input/providers/rows/samples.py,sha256=uqLTP8fnz-0wC7ALLlIDUYtXTG4OpnRqp70Fgv_1Iiw,9427
34
+ arekit/common/data/input/providers/sample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
+ arekit/common/data/input/providers/sample/cropped.py,sha256=jJSos4Si-qy-wb-QmomXxxgURR1UhJnvY0tZoowlfVc,1885
36
+ arekit/common/data/input/providers/text/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ arekit/common/data/input/providers/text/single.py,sha256=vm3sShIYZcmses-hmZX9cOfveWXCYGwvKLgQ0qs3VXQ,1604
38
+ arekit/common/data/storages/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
+ arekit/common/data/storages/base.py,sha256=psxo5uIc3hUDi5Cgf4j3Cm-935Fy1VQBYzcBzCcCFZE,2661
40
+ arekit/common/docs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
+ arekit/common/docs/base.py,sha256=uXUOtpR9BEsDBfDHg4eLqOjfSVOV_o9VPii3nSxLZuY,734
42
+ arekit/common/docs/entities_grouping.py,sha256=_r254fNr0j6BjHuLZBLjj21yWm4_k__5aOcBXcAaQUQ,704
43
+ arekit/common/docs/entity.py,sha256=TxrZMdIEgjk-PgCyskCkVis2KAw_M7vTBp3ppP6G05M,662
44
+ arekit/common/docs/parser.py,sha256=dzWjpbbYt-C9UU9sSy_Holnm0kQxJqtz1_6va6kS_L4,1780
45
+ arekit/common/docs/sentence.py,sha256=nZCCFj2yk71POoXCBfEMN3pteM2qQdj60eEzxMVY_3k,302
46
+ arekit/common/docs/parsed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
+ arekit/common/docs/parsed/base.py,sha256=WPstqOpBuLKjtz6UO_bI0DpOPF3Sm0wYEVwjtldbPXE,3175
48
+ arekit/common/docs/parsed/service.py,sha256=fSzwtRcSvmvlW8LyK6XPf7wJAx66GWlbRgH_3oQf-BU,1029
49
+ arekit/common/docs/parsed/term_position.py,sha256=H9eQQeanLxwP6og30TQUnpcXymGEPwXClRpaE8VnpLs,1040
50
+ arekit/common/docs/parsed/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
+ arekit/common/docs/parsed/providers/base.py,sha256=IjnG7c6Q78cYYAPTrwuZCOiMQDfMaujDQ6U0gK7JCcw,2587
52
+ arekit/common/docs/parsed/providers/base_pairs.py,sha256=RDYjspkENPQU2pn7Jp5mFrL9566eVWgXMEzWBQlMdRo,2195
53
+ arekit/common/docs/parsed/providers/entity_service.py,sha256=oaBfferpkDXfAFL17vpecSZUsV1Pjvq6lqgHDHsIEZY,6657
54
+ arekit/common/docs/parsed/providers/opinion_pairs.py,sha256=ibeFmvpMBBARtqQ3EKEocIOulgzavv0DeYxePGQK5-U,633
55
+ arekit/common/docs/parsed/providers/text_opinion_pairs.py,sha256=BC4uVgFxy3oZTkCq9VgOlqoqhODia2Z3anoGyGoy0ao,3139
56
+ arekit/common/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
57
+ arekit/common/entities/base.py,sha256=kpJFo4pCRVBQX6T8PibLKspp9UwoIrkHDoFMTM9KkUs,1646
58
+ arekit/common/entities/collection.py,sha256=ySSriMYP6zzdto1mC0V9VPXmkAqyJN3mmGoqoNValGI,1931
59
+ arekit/common/entities/str_fmt.py,sha256=gAPeS8RXdhh8Px_u5eOAPbtLREiiyMueid0lQoa4EbQ,250
60
+ arekit/common/entities/types.py,sha256=pxFB0gsevdsmnduN_Ffk7_P2TRiMt6NAHyrutuKOFvs,145
61
+ arekit/common/experiment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
+ arekit/common/experiment/data_type.py,sha256=DezUkfwLTf6XLYheqPiaWyx3ZwcldsJ8wDV8aNgJtDk,227
63
+ arekit/common/experiment/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
+ arekit/common/experiment/api/base_samples_io.py,sha256=SN8CnbEYaazE3SldvnENfjoNRHsTejtrg4jJfqfZLMs,516
65
+ arekit/common/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
+ arekit/common/frames/text_variant.py,sha256=TlWR4jnuF7HW9BMHhOTKkr768V_Ub0wd0E5A4YTwD0c,875
67
+ arekit/common/frames/connotations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
68
+ arekit/common/frames/connotations/descriptor.py,sha256=yow1Wo-Hf52rx2hiQlpeSkpP4WFFcFB25ewgXtwm588,408
69
+ arekit/common/frames/connotations/provider.py,sha256=Zm-NFL-aVKJM_NhvTWizIAiNENt6B1tegTrj0k2afoc,114
70
+ arekit/common/frames/variants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
+ arekit/common/frames/variants/base.py,sha256=PhFxJZl-g9bGLfg1OlPKIUETAsTx4wwSPuBS5yOEPg8,489
72
+ arekit/common/frames/variants/collection.py,sha256=28_DRBny_iAWMdHpupdCnLvBp0FtF2tjz-uUctyrmhY,1935
73
+ arekit/common/labels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
+ arekit/common/labels/base.py,sha256=m7EjvPcQPHtzZ0txVqNXIQPUzgNuaU2FmDyND7K4yTE,412
75
+ arekit/common/labels/str_fmt.py,sha256=ecDsP1-7NNHk_aEaBlPaNaNoA_aqy28QBOHoIxtEnDk,1707
76
+ arekit/common/labels/provider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
+ arekit/common/labels/provider/base.py,sha256=KIWvRwXGWNWYhrzEV8A0g9r0Yk7N2E0qQpf9-UpVnbw,151
78
+ arekit/common/labels/provider/constant.py,sha256=bU6DCm1iuk_W2fMkg-NxABMJqgS9DtwxnoHpD_vSnLc,462
79
+ arekit/common/labels/scaler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
80
+ arekit/common/labels/scaler/base.py,sha256=FTZ7eTPTVK9IXLcZaXbpidsTTqTjX0-l1Qt-N1bpqWg,2349
81
+ arekit/common/labels/scaler/sentiment.py,sha256=TbYdM9mdtFTQL_fgh9rS9TEc-7U4Fpskp8JvnvN8TAA,180
82
+ arekit/common/labels/scaler/single.py,sha256=tybF3-fO4CHd_QUFnDCEmTbfbljfJA9aZEv9MtpM5Ss,308
83
+ arekit/common/linkage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
84
+ arekit/common/linkage/base.py,sha256=toZmKjTr444eHhvNLMSNU23KXtfH5DtOGtS99qGNcOo,1014
85
+ arekit/common/linkage/meta.py,sha256=LFHHhAkzQzym8rha4uuXb0BKwIb61SVGtxnU4iF_Nuk,692
86
+ arekit/common/linkage/opinions.py,sha256=8OQscnh1-5JJL3KX_lCm_6ayGCezDuFnvidfuwkjClI,255
87
+ arekit/common/linkage/text_opinions.py,sha256=qR1-zGEYaVPSpNISnGGXnABpdP6Qx8tc1i5DsEyn9wo,571
88
+ arekit/common/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
89
+ arekit/common/model/labeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
+ arekit/common/model/labeling/base.py,sha256=uj7_igCWEU23OjnzabNy0LyxoZ6S_qSfCA-ZaoL1erA,727
91
+ arekit/common/model/labeling/modes.py,sha256=DiwC6Aomke-ojwwpR2pcd4qgQSwmRdGCvQlyHHhN3YY,127
92
+ arekit/common/model/labeling/single.py,sha256=Eggi0obocjiT9ofv_U0zLiFoEIeUQhaMCqjCWn14Fh8,773
93
+ arekit/common/opinions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
+ arekit/common/opinions/base.py,sha256=eIx1RzsngCkpnF2Utju5i_Qp7gqF_rDIe_UDeMGXtmo,2112
95
+ arekit/common/opinions/collection.py,sha256=bdx-CIYYdE-DrjyB1mRTGtkLb-lrGPTSLl25xv5EHnM,4938
96
+ arekit/common/opinions/enums.py,sha256=TE5AGN_xb0NdZ636UtHuYFRMNl24iwXzmyf8WUfvr6w,83
97
+ arekit/common/opinions/provider.py,sha256=q4hXRFDuGoo9fGOf_L9CM048YBtel1v3__ZqfSXL8Xc,168
98
+ arekit/common/opinions/writer.py,sha256=-IbWTIVlX2rhLpSP_8iuQ3_WyzzGwhto7ujfnNL6jhA,173
99
+ arekit/common/opinions/annot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
100
+ arekit/common/opinions/annot/algo_based.py,sha256=cvDGDmUoUaQ1Xcbyouxrjs0CkHRfRogW8Mfs5O5cOlc,2240
101
+ arekit/common/opinions/annot/base.py,sha256=IvwrwT8O3s6b2_R0arpMR4Uog7kuWQZUAyRP5cq_27A,382
102
+ arekit/common/opinions/annot/algo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
+ arekit/common/opinions/annot/algo/base.py,sha256=ymll-4-SplCY7CLswjOZEC1vsVHIEzUP0JMYgvL8hbo,124
104
+ arekit/common/opinions/annot/algo/pair_based.py,sha256=HbYn1mAsn5g11NiC9pfrMqNtJn_GzvqPFGpafMqqB2o,4419
105
+ arekit/common/opinions/annot/algo/predefined.py,sha256=zU39SADPKnykHCNB-Bmn_0bvd6gYWWYmfgfi-68hHSs,741
106
+ arekit/common/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
107
+ arekit/common/pipeline/base.py,sha256=8TgWNy5QrnKEp1bq3lhyGSgIfYe5ZIZU3c-DYBJ9LPA,957
108
+ arekit/common/pipeline/batching.py,sha256=DdOvOladOo2aEv3JZ8NQnCvsNGcWk4TFzENrZqTGyXk,1239
109
+ arekit/common/pipeline/context.py,sha256=Fw25lBVakHNAXjtkdEqopR-Jh59cDKGWD2jCJxBrj7Y,1126
110
+ arekit/common/pipeline/conts.py,sha256=NAQNsHt1kK3HnxWv3M6yXi0c7C6Mx6ZZ6KZc0yE0eas,70
111
+ arekit/common/pipeline/utils.py,sha256=5VqH1LtRa4tYUbyiRvWdBmP4biFhTKq9vhr8QiRFFkY,882
112
+ arekit/common/pipeline/items/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
113
+ arekit/common/pipeline/items/base.py,sha256=15-z8ERQ0QxaRszs7sHQduU0KIBJIm8B0V2nwCva6d0,1695
114
+ arekit/common/pipeline/items/flatten.py,sha256=9T4jWqPGv4UDxajlM0Nm0-gvwUgqqYB8XH0efTum9a0,542
115
+ arekit/common/pipeline/items/handle.py,sha256=QS5Byj7-o5jmFi0ag58NE3zm2-JzVIunIgc3Pn1ij6g,578
116
+ arekit/common/pipeline/items/iter.py,sha256=Tk9WdUMPOq20s7jEWEpU4PmillnVtQ8nIa2ct7iw-3s,406
117
+ arekit/common/pipeline/items/map.py,sha256=G5wBdjaaxePD0pijrxsfpJACeP7kzj7HerjCkNIhmII,381
118
+ arekit/common/pipeline/items/map_nested.py,sha256=vs0GdJNr3qSF9p2yd1nWji5E1HGzECbvOfN2MqoHc2A,630
119
+ arekit/common/synonyms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
+ arekit/common/synonyms/base.py,sha256=YxD-CKCjlEtar1zTdumnfC3vKgbP2wLODR9mMEwbbnA,4237
121
+ arekit/common/synonyms/grouping.py,sha256=fi7QQbBvsTvvP2CPTesSPEsPNmGfc6euqj-HPhVvtlg,698
122
+ arekit/common/text/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
123
+ arekit/common/text/enums.py,sha256=nelEI7B-szLUtl8xds8Kw_vgK5JWg_Aj7IadEj2q_1Y,141
124
+ arekit/common/text/parsed.py,sha256=YxGRHtozDd3sDVI3hMT_hOO7Wmsy7_zLkblfnSXeJ9g,1104
125
+ arekit/common/text/partitioning.py,sha256=OL8r3-xaMafnT7FuPXDHINlA-BQgx6cLaMqm366WKCU,1153
126
+ arekit/common/text/stemmer.py,sha256=OJ5XelxLN-7m3uLPDU9C7CWdkXDeK-xieexQN6RYLXc,341
127
+ arekit/common/text_opinions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
+ arekit/common/text_opinions/base.py,sha256=KootNvGAbUVCV5uFgLjK-bm9bbQSIvZUz0q9CBToGa8,3447
129
+ arekit/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
130
+ arekit/contrib/bert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
131
+ arekit/contrib/bert/input/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
132
+ arekit/contrib/bert/input/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
133
+ arekit/contrib/bert/input/providers/cropped_sample.py,sha256=46uHHhAe8cGxV2JlfO3thog5XV6T2niUIflFghfUSBM,866
134
+ arekit/contrib/bert/input/providers/text_pair.py,sha256=_1d-he0n42y3ksj8RjJlNHgHnaQUEq0aQhUdTPRMKgg,2817
135
+ arekit/contrib/bert/terms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
136
+ arekit/contrib/bert/terms/mapper.py,sha256=oHX-lsaZYjBFLjngzSKT5z_JPJCHbclUsEe4i4fup_8,992
137
+ arekit/contrib/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
138
+ arekit/contrib/prompt/sample.py,sha256=MxpbDR0ww7WmdtuPu74B8R6QKVXeuzO0CKGOJIYwbRk,3164
139
+ arekit/contrib/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
140
+ arekit/contrib/utils/bert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
141
+ arekit/contrib/utils/bert/samplers.py,sha256=ZVe3rbUAH0Jw1xR_yHE1DoUJf3CI0pDgbBQQzlLWevc,989
142
+ arekit/contrib/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
143
+ arekit/contrib/utils/data/contents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
144
+ arekit/contrib/utils/data/contents/opinions.py,sha256=MSV7NytEe15adKhhHCq5KiCj6ZBq31nV-u2rcSfFCgE,1738
145
+ arekit/contrib/utils/data/doc_provider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
+ arekit/contrib/utils/data/doc_provider/dict_based.py,sha256=zUOiiIbj5zby4xqMb0m9N-a6enavJJ7wFmPaGErykWU,371
147
+ arekit/contrib/utils/data/doc_provider/dir_based.py,sha256=FTw3kLV_CYtPoUoHl39IrP6RjLvTecCno9May95jVXw,1916
148
+ arekit/contrib/utils/data/storages/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
149
+ arekit/contrib/utils/data/storages/jsonl_based.py,sha256=dz8uizu9t1C215o0HEL8y4LiDKR4aC_-OwDu_xF0xIM,522
150
+ arekit/contrib/utils/data/storages/pandas_based.py,sha256=gMkWUFHZE9Oe1Uy04vEBcUfTIAdh46r5zpjlPAwwG2g,3842
151
+ arekit/contrib/utils/data/storages/row_cache.py,sha256=MRK0uJFvw6O99k2aFb3JLZhLUBo2JUO-WYQ4EeRRu6M,2051
152
+ arekit/contrib/utils/data/storages/sqlite_based.py,sha256=cIYAHyiB4CMftKgrgLqw-L4F1WnhbspjwWLSPqH5NHk,682
153
+ arekit/contrib/utils/data/writers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
154
+ arekit/contrib/utils/data/writers/base.py,sha256=JLwf5WVl_U319sdMev8YOn4OoCcrgNIUZtrOuG1JLjI,766
155
+ arekit/contrib/utils/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
156
+ arekit/contrib/utils/entities/filter.py,sha256=aHTExIMFaMdy4QL8iYE23eiby3qLImAakXR6gNqG6fs,145
157
+ arekit/contrib/utils/entities/formatters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
158
+ arekit/contrib/utils/entities/formatters/str_display.py,sha256=N8igv7EVaTFayvLXkyBGtm67KwHaeP-M-L8d7oqBG9Q,401
159
+ arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py,sha256=rEUIma9O3kOBWIguGtJ69JH-00Dhm0vUBOd5yNcKweY,653
160
+ arekit/contrib/utils/io_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
161
+ arekit/contrib/utils/io_utils/utils.py,sha256=310SIJTsNLn2OZrGPer9W4ZP52PHkjBK3zsyqxVs3h0,537
162
+ arekit/contrib/utils/pipelines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
163
+ arekit/contrib/utils/pipelines/opinion_collections.py,sha256=y9-klVJGCN9mPd7t1ECllAiCnAb3MKVXC1PnYddp5sQ,3195
164
+ arekit/contrib/utils/pipelines/items/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
165
+ arekit/contrib/utils/pipelines/items/text/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
166
+ arekit/contrib/utils/pipelines/items/text/entities_default.py,sha256=vNx5ir2mf7a1gg_OeqUsf_p1Fu2k7QIFxVpe-CuwZ84,727
167
+ arekit/contrib/utils/pipelines/items/text/frames.py,sha256=pZQybYfgEQB1DM3PtmsgrtB2Xl0HejmP4rhT0nR_YKE,2586
168
+ arekit/contrib/utils/pipelines/text_opinion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
169
+ arekit/contrib/utils/pipelines/text_opinion/extraction.py,sha256=MT1WMlvVI25JRL0g7W83bV8BGUr7_MNOQBj7ZAHgrnU,4245
170
+ arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
+ arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py,sha256=bwS-UR2x3rgp_xqnf6z-73T-eIZE_kltRSGYxgd_WpU,1751
172
+ arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
173
+ arekit/contrib/utils/pipelines/text_opinion/filters/base.py,sha256=GnKnJB4MKqiMSJny3a9Na7l7Csm7abbt6GADBCY18Mw,143
174
+ arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py,sha256=3Pjq4IJJMT7dYpK266lN66WQJUnQO3P0rG6wcAvJOOA,649
175
+ arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py,sha256=pdWFJaKh4kKIsUuBNp3WNy5Rj80CjWEy2wp-0axFnrI,1254
176
+ arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py,sha256=4AFS5zhocJuYphGO2ZMWmYTtIhGItKDTkB0--AmjgnA,1151
177
+ arekit/contrib/utils/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
178
+ arekit/contrib/utils/synonyms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
179
+ arekit/contrib/utils/synonyms/simple.py,sha256=ST9EwuWP88FzbyV8Gi0-biTPgGOsZ7OWyaBWHL_U_eo,557
180
+ arekit/contrib/utils/synonyms/stemmer_based.py,sha256=q19P_XOCWN2_JrBtybAt7ToMIr1ambw4ahr0fSEEHmQ,1400
181
+ arekit-0.25.1.data/data/logo.png,sha256=S8OZ4MGGD72Pf5co7ngYbXKkJH1EUhbErUXv1ZjUWiU,45718
182
+ arekit-0.25.1.dist-info/LICENSE,sha256=JO9tIbxAvhwDv73cX-gUStr9yA-TY7wusUeLHRx7JuY,1076
183
+ arekit-0.25.1.dist-info/METADATA,sha256=ryWGTL4fYqR36z2qh1UuYBg6UIU6n7_U9Y09KPRS6xk,3177
184
+ arekit-0.25.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
185
+ arekit-0.25.1.dist-info/top_level.txt,sha256=4pXuFE8IE0lBsqi6ZsR7figx0H939VIX4_-76YIbkOQ,7
186
+ arekit-0.25.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.37.1)
2
+ Generator: bdist_wheel (0.44.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
File without changes
@@ -1,68 +0,0 @@
1
- from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
2
- from arekit.common.data.input.providers.contents import ContentsProvider
3
- from arekit.common.data.input.providers.rows.base import BaseRowProvider
4
- from arekit.common.data.storages.base import BaseRowsStorage
5
- from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
6
- from arekit.contrib.utils.data.writers.base import BaseWriter
7
-
8
-
9
- class BaseInputRepository(object):
10
-
11
- def __init__(self, columns_provider, rows_provider, storage):
12
- assert(isinstance(columns_provider, BaseColumnsProvider))
13
- assert(isinstance(rows_provider, BaseRowProvider))
14
- assert(isinstance(storage, BaseRowsStorage))
15
-
16
- self._columns_provider = columns_provider
17
- self._rows_provider = rows_provider
18
- self._storage = storage
19
-
20
- # Do setup operations.
21
- self._setup_columns_provider()
22
- self._setup_rows_provider()
23
-
24
- # region protected methods
25
-
26
- def _setup_columns_provider(self):
27
- pass
28
-
29
- def _setup_rows_provider(self):
30
- pass
31
-
32
- # endregion
33
-
34
- def populate(self, contents_provider, doc_ids, desc="", writer=None, target=None):
35
- assert(isinstance(contents_provider, ContentsProvider))
36
- assert(isinstance(self._storage, BaseRowsStorage))
37
- assert(isinstance(doc_ids, list))
38
- assert(isinstance(writer, BaseWriter) or writer is None)
39
- assert(isinstance(target, str) or target is None)
40
-
41
- def iter_rows(idle_mode):
42
- return self._rows_provider.iter_by_rows(
43
- contents_provider=contents_provider,
44
- doc_ids_iter=doc_ids,
45
- idle_mode=idle_mode)
46
-
47
- self._storage.init_empty(columns_provider=self._columns_provider)
48
-
49
- is_async_write_mode_on = writer is not None and target is not None
50
-
51
- if is_async_write_mode_on:
52
- writer.open_target(target)
53
-
54
- self._storage.fill(lambda idle_mode: iter_rows(idle_mode),
55
- columns_provider=self._columns_provider,
56
- row_handler=lambda: writer.commit_line(self._storage) if is_async_write_mode_on else None,
57
- desc=desc)
58
-
59
- if is_async_write_mode_on:
60
- writer.close_target()
61
-
62
- def push(self, writer, target, free_storage=True):
63
- if not isinstance(self._storage, RowCacheStorage):
64
- writer.write_all(self._storage, target)
65
-
66
- # After writing we free the contents of the storage.
67
- if free_storage:
68
- self._storage.free()
@@ -1,22 +0,0 @@
1
- import logging
2
-
3
- from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
4
- from arekit.common.data.input.repositories.base import BaseInputRepository
5
-
6
- logger = logging.getLogger(__name__)
7
- logging.basicConfig(level=logging.INFO)
8
-
9
-
10
- class BaseInputSamplesRepository(BaseInputRepository):
11
-
12
- def _setup_rows_provider(self):
13
- """ Setup store labels.
14
- """
15
- assert(isinstance(self._rows_provider, BaseSampleRowProvider))
16
- self._rows_provider.set_store_labels(self._columns_provider.StoreLabels)
17
-
18
- def _setup_columns_provider(self):
19
- """ Setup text column names.
20
- """
21
- text_column_names = list(self._rows_provider.TextProvider.iter_columns())
22
- self._columns_provider.set_text_column_names(text_column_names)
File without changes
@@ -1,26 +0,0 @@
1
- from arekit.common.data import const
2
- from arekit.common.data.storages.base import BaseRowsStorage
3
-
4
-
5
- # TODO. This is a particular type of view, and expected to be off the core.
6
- class LinkedSamplesStorageView(object):
7
-
8
- def iter_from_storage(self, storage):
9
- assert(isinstance(storage, BaseRowsStorage))
10
- undefined = -1
11
-
12
- linked = []
13
- current_opinion_id = undefined
14
- for row_index, opinion_id in enumerate(storage.iter_column_values(const.OPINION_ID)):
15
- if current_opinion_id != undefined:
16
- if opinion_id != current_opinion_id:
17
- yield linked
18
- linked = []
19
- current_opinion_id = opinion_id
20
- else:
21
- current_opinion_id = opinion_id
22
-
23
- linked.append(storage.get_row(row_index))
24
-
25
- if len(linked) > 0:
26
- yield linked
@@ -1,37 +0,0 @@
1
- from arekit.common.pipeline.items.base import BasePipelineItem
2
- from arekit.common.text.partitioning.base import BasePartitioning
3
- from arekit.common.pipeline.context import PipelineContext
4
-
5
-
6
- class SentenceObjectsParserPipelineItem(BasePipelineItem):
7
-
8
- def __init__(self, partitioning):
9
- assert(isinstance(partitioning, BasePartitioning))
10
- self.__partitioning = partitioning
11
-
12
- # region protected
13
-
14
- def _get_text(self, pipeline_ctx):
15
- return None
16
-
17
- def _get_parts_provider_func(self, input_data, pipeline_ctx):
18
- raise NotImplementedError()
19
-
20
- # endregion
21
-
22
- def apply_core(self, input_data, pipeline_ctx):
23
- assert(isinstance(pipeline_ctx, PipelineContext))
24
- external_input = self._get_text(pipeline_ctx)
25
- actual_input = input_data if external_input is None else external_input
26
- parts_it = self._get_parts_provider_func(input_data=actual_input, pipeline_ctx=pipeline_ctx)
27
- return self.__partitioning.provide(text=actual_input, parts_it=parts_it)
28
-
29
- # region base
30
-
31
- def __enter__(self):
32
- return self
33
-
34
- def __exit__(self, exc_type, exc_val, exc_tb):
35
- pass
36
-
37
- # endregion
@@ -1,12 +0,0 @@
1
- from arekit.common.pipeline.base import BasePipeline
2
- from arekit.common.text.parsed import BaseParsedText
3
-
4
-
5
- class BaseTextParser(BasePipeline):
6
-
7
- def run(self, input_data, params_dict=None, parent_ctx=None):
8
- output_data = super(BaseTextParser, self).run(input_data=input_data,
9
- params_dict=params_dict,
10
- parent_ctx=parent_ctx)
11
-
12
- return BaseParsedText(terms=output_data)
File without changes
@@ -1,4 +0,0 @@
1
- class BasePartitioning(object):
2
-
3
- def provide(self, text, parts_it):
4
- raise NotImplementedError()
@@ -1,35 +0,0 @@
1
- from collections.abc import Iterable
2
-
3
- from arekit.common.bound import Bound
4
- from arekit.common.text.partitioning.base import BasePartitioning
5
-
6
-
7
- class TermsPartitioning(BasePartitioning):
8
- """ NOTE: considering that provided parts
9
- has no intersections between each other
10
- """
11
-
12
- def provide(self, text, parts_it):
13
- assert(isinstance(text, list))
14
- assert(isinstance(parts_it, Iterable))
15
-
16
- start = 0
17
- parts = []
18
- for value, bound in parts_it:
19
- assert(isinstance(bound, Bound))
20
- assert(bound.Position >= start)
21
-
22
- # Release everythig till the current value position.
23
- part = text[start:bound.Position]
24
-
25
- parts.extend(part)
26
-
27
- # Release the entity value.
28
- parts.extend([value])
29
-
30
- start = bound.Position + bound.Length
31
-
32
- # Release everything after the last entity.
33
- parts.extend(text[start:len(text)])
34
-
35
- return parts
File without changes