arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. arekit/common/data/storages/base.py +4 -15
  2. arekit/common/docs/entities_grouping.py +2 -1
  3. arekit/common/docs/parser.py +27 -22
  4. arekit/common/pipeline/base.py +12 -16
  5. arekit/common/pipeline/batching.py +28 -0
  6. arekit/common/pipeline/context.py +5 -1
  7. arekit/common/pipeline/items/base.py +39 -2
  8. arekit/common/pipeline/items/flatten.py +5 -1
  9. arekit/common/pipeline/items/handle.py +2 -1
  10. arekit/common/pipeline/items/iter.py +2 -1
  11. arekit/common/pipeline/items/map.py +2 -1
  12. arekit/common/pipeline/items/map_nested.py +4 -0
  13. arekit/common/pipeline/utils.py +32 -0
  14. arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
  15. arekit/common/utils.py +11 -52
  16. arekit/contrib/utils/data/contents/opinions.py +13 -3
  17. arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  18. arekit/contrib/utils/data/storages/pandas_based.py +2 -17
  19. arekit/contrib/utils/data/storages/row_cache.py +8 -2
  20. arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
  21. arekit/contrib/utils/io_utils/utils.py +1 -18
  22. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  23. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  24. arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
  25. arekit-0.25.1.data/data/logo.png +0 -0
  26. arekit-0.25.1.dist-info/METADATA +81 -0
  27. arekit-0.25.1.dist-info/RECORD +186 -0
  28. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
  29. arekit/common/data/input/repositories/__init__.py +0 -0
  30. arekit/common/data/input/repositories/base.py +0 -68
  31. arekit/common/data/input/repositories/sample.py +0 -22
  32. arekit/common/data/views/__init__.py +0 -0
  33. arekit/common/data/views/samples.py +0 -26
  34. arekit/common/docs/objects_parser.py +0 -37
  35. arekit/common/text/parser.py +0 -12
  36. arekit/common/text/partitioning/__init__.py +0 -0
  37. arekit/common/text/partitioning/base.py +0 -4
  38. arekit/common/text/partitioning/terms.py +0 -35
  39. arekit/contrib/networks/__init__.py +0 -0
  40. arekit/contrib/networks/embedding.py +0 -149
  41. arekit/contrib/networks/embedding_io.py +0 -18
  42. arekit/contrib/networks/input/__init__.py +0 -0
  43. arekit/contrib/networks/input/const.py +0 -6
  44. arekit/contrib/networks/input/ctx_serialization.py +0 -28
  45. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  46. arekit/contrib/networks/input/embedding/matrix.py +0 -29
  47. arekit/contrib/networks/input/embedding/offsets.py +0 -55
  48. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  49. arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  50. arekit/contrib/networks/input/providers/__init__.py +0 -0
  51. arekit/contrib/networks/input/providers/sample.py +0 -129
  52. arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  53. arekit/contrib/networks/input/providers/text.py +0 -24
  54. arekit/contrib/networks/input/rows_parser.py +0 -47
  55. arekit/contrib/networks/input/term_types.py +0 -13
  56. arekit/contrib/networks/input/terms_mapping.py +0 -60
  57. arekit/contrib/networks/vectorizer.py +0 -6
  58. arekit/contrib/source/__init__.py +0 -0
  59. arekit/contrib/source/brat/__init__.py +0 -0
  60. arekit/contrib/source/brat/annot.py +0 -84
  61. arekit/contrib/source/brat/doc.py +0 -28
  62. arekit/contrib/source/brat/entities/__init__.py +0 -0
  63. arekit/contrib/source/brat/entities/compound.py +0 -13
  64. arekit/contrib/source/brat/entities/entity.py +0 -42
  65. arekit/contrib/source/brat/entities/parser.py +0 -53
  66. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  67. arekit/contrib/source/brat/opinions/converter.py +0 -19
  68. arekit/contrib/source/brat/relation.py +0 -32
  69. arekit/contrib/source/brat/sentence.py +0 -69
  70. arekit/contrib/source/brat/sentences_reader.py +0 -128
  71. arekit/contrib/source/download.py +0 -41
  72. arekit/contrib/source/nerel/__init__.py +0 -0
  73. arekit/contrib/source/nerel/entities.py +0 -55
  74. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  75. arekit/contrib/source/nerel/folding/fixed.py +0 -74
  76. arekit/contrib/source/nerel/io_utils.py +0 -62
  77. arekit/contrib/source/nerel/labels.py +0 -241
  78. arekit/contrib/source/nerel/reader.py +0 -46
  79. arekit/contrib/source/nerel/utils.py +0 -24
  80. arekit/contrib/source/nerel/versions.py +0 -12
  81. arekit/contrib/source/nerelbio/__init__.py +0 -0
  82. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  83. arekit/contrib/source/nerelbio/labels.py +0 -265
  84. arekit/contrib/source/nerelbio/reader.py +0 -8
  85. arekit/contrib/source/nerelbio/versions.py +0 -8
  86. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  87. arekit/contrib/source/ruattitudes/collection.py +0 -36
  88. arekit/contrib/source/ruattitudes/doc.py +0 -51
  89. arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
  90. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  91. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  92. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  93. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  94. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  95. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  96. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  97. arekit/contrib/source/ruattitudes/reader.py +0 -268
  98. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  99. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  100. arekit/contrib/source/ruattitudes/text_object.py +0 -59
  101. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  102. arekit/contrib/source/rusentiframes/collection.py +0 -157
  103. arekit/contrib/source/rusentiframes/effect.py +0 -24
  104. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  105. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  106. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  107. arekit/contrib/source/rusentiframes/role.py +0 -15
  108. arekit/contrib/source/rusentiframes/state.py +0 -24
  109. arekit/contrib/source/rusentiframes/types.py +0 -42
  110. arekit/contrib/source/rusentiframes/value.py +0 -2
  111. arekit/contrib/source/rusentrel/__init__.py +0 -0
  112. arekit/contrib/source/rusentrel/const.py +0 -3
  113. arekit/contrib/source/rusentrel/docs_reader.py +0 -51
  114. arekit/contrib/source/rusentrel/entities.py +0 -26
  115. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  116. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  117. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  118. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  119. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  120. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  121. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  122. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  123. arekit/contrib/source/sentinerel/__init__.py +0 -0
  124. arekit/contrib/source/sentinerel/entities.py +0 -52
  125. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  126. arekit/contrib/source/sentinerel/folding/factory.py +0 -31
  127. arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
  128. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  129. arekit/contrib/source/sentinerel/labels.py +0 -53
  130. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  131. arekit/contrib/source/sentinerel/reader.py +0 -42
  132. arekit/contrib/source/synonyms/__init__.py +0 -0
  133. arekit/contrib/source/synonyms/utils.py +0 -19
  134. arekit/contrib/source/zip_utils.py +0 -47
  135. arekit/contrib/utils/connotations/__init__.py +0 -0
  136. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  137. arekit/contrib/utils/data/readers/__init__.py +0 -0
  138. arekit/contrib/utils/data/readers/base.py +0 -7
  139. arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  140. arekit/contrib/utils/data/readers/jsonl.py +0 -15
  141. arekit/contrib/utils/data/service/__init__.py +0 -0
  142. arekit/contrib/utils/data/service/balance.py +0 -50
  143. arekit/contrib/utils/data/writers/csv_native.py +0 -63
  144. arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  145. arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  146. arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
  147. arekit/contrib/utils/download.py +0 -77
  148. arekit/contrib/utils/embeddings/__init__.py +0 -0
  149. arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  150. arekit/contrib/utils/embeddings/tokens.py +0 -30
  151. arekit/contrib/utils/io_utils/embedding.py +0 -72
  152. arekit/contrib/utils/io_utils/opinions.py +0 -37
  153. arekit/contrib/utils/io_utils/samples.py +0 -79
  154. arekit/contrib/utils/lexicons/__init__.py +0 -0
  155. arekit/contrib/utils/lexicons/lexicon.py +0 -41
  156. arekit/contrib/utils/lexicons/relation.py +0 -42
  157. arekit/contrib/utils/lexicons/rusentilex.py +0 -37
  158. arekit/contrib/utils/nn/__init__.py +0 -0
  159. arekit/contrib/utils/nn/rows.py +0 -83
  160. arekit/contrib/utils/np_utils/__init__.py +0 -0
  161. arekit/contrib/utils/np_utils/embedding.py +0 -22
  162. arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  163. arekit/contrib/utils/np_utils/vocab.py +0 -20
  164. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  165. arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
  166. arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
  167. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  168. arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
  169. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  170. arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
  171. arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
  172. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  173. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  174. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
  175. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
  176. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  177. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  178. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
  179. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
  180. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  181. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  182. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
  183. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
  184. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
  185. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  186. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
  187. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
  188. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  189. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
  190. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  191. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
  192. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  193. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  194. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  195. arekit/contrib/utils/processing/languages/mods.py +0 -12
  196. arekit/contrib/utils/processing/languages/pos.py +0 -23
  197. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  198. arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  199. arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  200. arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  201. arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  202. arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  203. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  204. arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  205. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  206. arekit/contrib/utils/processing/pos/base.py +0 -12
  207. arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  208. arekit/contrib/utils/processing/pos/russian.py +0 -10
  209. arekit/contrib/utils/processing/text/__init__.py +0 -0
  210. arekit/contrib/utils/processing/text/tokens.py +0 -127
  211. arekit/contrib/utils/resources.py +0 -25
  212. arekit/contrib/utils/serializer.py +0 -43
  213. arekit/contrib/utils/sources/__init__.py +0 -0
  214. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  215. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  216. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  217. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  218. arekit/contrib/utils/vectorizers/bpe.py +0 -93
  219. arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  220. arekit/download_data.py +0 -11
  221. arekit-0.24.0.dist-info/METADATA +0 -23
  222. arekit-0.24.0.dist-info/RECORD +0 -374
  223. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
  224. {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
@@ -1,107 +0,0 @@
1
- import logging
2
-
3
- from arekit.common.context.token import Token
4
- from arekit.common.pipeline.context import PipelineContext
5
- from arekit.common.pipeline.items.base import BasePipelineItem
6
- from arekit.common.utils import split_by_whitespaces
7
- from arekit.contrib.utils.processing.text.tokens import Tokens
8
-
9
- logger = logging.getLogger(__name__)
10
- logger.setLevel(logging.INFO)
11
-
12
-
13
- class DefaultTextTokenizer(BasePipelineItem):
14
- """ Default parser implementation.
15
- """
16
-
17
- def __init__(self, keep_tokens=True):
18
- super(DefaultTextTokenizer, self).__init__()
19
- self.__keep_tokens = keep_tokens
20
-
21
- # region protected methods
22
-
23
- def apply_core(self, input_data, pipeline_ctx):
24
- assert(isinstance(pipeline_ctx, PipelineContext))
25
- output_data = self.__process_parts(input_data)
26
- if not self.__keep_tokens:
27
- output_data = [word for word in output_data if not isinstance(word, Token)]
28
- return output_data
29
-
30
- # endregion
31
-
32
- # region private static methods
33
-
34
- def __process_parts(self, parts):
35
- assert(isinstance(parts, list))
36
-
37
- parsed = []
38
- for part in parts:
39
-
40
- if part is None:
41
- continue
42
-
43
- # Keep non str words as it is and try to parse str-based words.
44
- processed = [part] if not isinstance(part, str) else \
45
- self.__iter_processed_part(part=part)
46
-
47
- parsed.extend(processed)
48
-
49
- return parsed
50
-
51
- def __iter_processed_part(self, part):
52
- for word in split_by_whitespaces(part):
53
- for term in self.__process_word(word):
54
- yield term
55
-
56
- def __process_word(self, word):
57
- assert(isinstance(word, str))
58
- return self.__split_tokens(word)
59
-
60
- @staticmethod
61
- def __split_tokens(term):
62
- """
63
- Splitting off tokens from parsed_doc ending, i.e. for example:
64
- term: "сказать,-" -> "(term: "сказать", ["COMMA_TOKEN", "DASH_TOKEN"])
65
- return: (unicode or None, list)
66
- modified term and list of extracted tokens.
67
- """
68
-
69
- url = Tokens.try_create_url(term)
70
- if url is not None:
71
- return [url]
72
-
73
- l = 0
74
- words_and_tokens = []
75
- while l < len(term):
76
-
77
- # Token.
78
- token = Tokens.try_create(term[l])
79
- if token is not None:
80
- if token.get_token_value() != Tokens.NEW_LINE:
81
- words_and_tokens.append(token)
82
- l += 1
83
-
84
- # Number.
85
- elif str.isdigit(term[l]):
86
- k = l + 1
87
- while k < len(term) and str.isdigit(term[k]):
88
- k += 1
89
- token = Tokens.try_create_number(term[l:k])
90
- assert(token is not None)
91
- words_and_tokens.append(token)
92
- l = k
93
-
94
- # Term.
95
- else:
96
- k = l + 1
97
- while k < len(term):
98
- token = Tokens.try_create(term[k])
99
- if token is not None and token.get_token_value() != Tokens.DASH:
100
- break
101
- k += 1
102
- words_and_tokens.append(term[l:k])
103
- l = k
104
-
105
- return words_and_tokens
106
-
107
- # endregion
@@ -1,135 +0,0 @@
1
- from arekit.common.data.input.providers.const import IDLE_MODE
2
- from arekit.common.pipeline.conts import PARENT_CTX
3
- from arekit.common.entities.base import Entity
4
- from arekit.common.pipeline.context import PipelineContext
5
- from arekit.common.pipeline.items.base import BasePipelineItem
6
-
7
-
8
- class MLTextTranslatorPipelineItem(BasePipelineItem):
9
- """ Machine learning based translator pipeline item.
10
- """
11
-
12
- def __init__(self, batch_translate_model, do_translate_entity=True):
13
- """ Model, which is based on translation of the text,
14
- represented as a list of words.
15
- """
16
- self.__do_translate_entity = do_translate_entity
17
- self.__translate = batch_translate_model
18
-
19
- def fast_most_accurate_approach(self, input_data, entity_placeholder_template="<entityTag={}/>"):
20
- """ This approach assumes that the translation won't corrupt the original
21
- meta-annotation for entities and objects mentioned in text.
22
- """
23
-
24
- def __optionally_register(prts):
25
- if len(prts) > 0:
26
- content.append(" ".join(prts))
27
- parts_to_join.clear()
28
-
29
- content = []
30
- origin_entities = []
31
- parts_to_join = []
32
-
33
- for part in input_data:
34
- if isinstance(part, str) and part.strip():
35
- parts_to_join.append(part)
36
- elif isinstance(part, Entity):
37
- entity_index = len(origin_entities)
38
- parts_to_join.append(entity_placeholder_template.format(entity_index))
39
- # Register entities information for further restoration.
40
- origin_entities.append(part)
41
-
42
- # Register original text with masked named entities.
43
- __optionally_register(parts_to_join)
44
- # Register all named entities in order of their appearance in text.
45
- content.extend([e.Value for e in origin_entities])
46
-
47
- # Compose text parts.
48
- translated_parts = self.__translate(content)
49
-
50
- if len(translated_parts) == 0:
51
- return None
52
-
53
- # Take the original text.
54
- text = translated_parts[0]
55
- for entity_index in range(len(origin_entities)):
56
- if entity_placeholder_template.format(entity_index) not in text:
57
- return None
58
-
59
- # Enumerate entities.
60
- from_ind = 0
61
- text_parts = []
62
- for entity_index, translated_value in enumerate(translated_parts[1:]):
63
- entity_placeholder_instance = entity_placeholder_template.format(entity_index)
64
- # Cropping text part.
65
- to_ind = text.index(entity_placeholder_instance)
66
-
67
- if self.__do_translate_entity:
68
- origin_entities[entity_index].set_display_value(translated_value.strip())
69
-
70
- # Register entities.
71
- text_parts.append(text[from_ind:to_ind])
72
- text_parts.append(origin_entities[entity_index])
73
- # Update from index.
74
- from_ind = to_ind + len(entity_placeholder_instance)
75
-
76
- # Consider the remaining part.
77
- text_parts.append(text[from_ind:])
78
- return text_parts
79
-
80
- def default_pre_part_splitting_approach(self, input_data):
81
- """ This is the original strategy, based on the manually cropped named entities
82
- before the actual translation call.
83
- """
84
-
85
- def __optionally_register(prts):
86
- if len(prts) > 0:
87
- content.append(" ".join(prts))
88
- parts_to_join.clear()
89
-
90
- content = []
91
- origin_entities = []
92
- origin_entity_ind = []
93
- parts_to_join = []
94
-
95
- for _, part in enumerate(input_data):
96
- if isinstance(part, str) and part.strip():
97
- parts_to_join.append(part)
98
- elif isinstance(part, Entity):
99
- # Register first the prior parts were merged.
100
- __optionally_register(parts_to_join)
101
- # Register entities information for further restoration.
102
- origin_entity_ind.append(len(content))
103
- origin_entities.append(part)
104
- content.append(part.Value)
105
-
106
- __optionally_register(parts_to_join)
107
-
108
- # Compose text parts.
109
- translated_parts = self.__translate(content)
110
-
111
- for entity_ind, entity_part_ind in enumerate(origin_entity_ind):
112
- entity = origin_entities[entity_ind]
113
- if self.__do_translate_entity:
114
- entity.set_display_value(translated_parts[entity_part_ind].strip())
115
- translated_parts[entity_part_ind] = entity
116
-
117
- return translated_parts
118
-
119
- def apply_core(self, input_data, pipeline_ctx):
120
- assert(isinstance(pipeline_ctx, PipelineContext))
121
- assert(isinstance(input_data, list))
122
-
123
- # Check the pipeline state whether is an idle mode or not.
124
- parent_ctx = pipeline_ctx.provide(PARENT_CTX)
125
- idle_mode = parent_ctx.provide(IDLE_MODE)
126
-
127
- # When pipeline utilized only for the assessing the expected amount
128
- # of rows (common case of idle_mode), there is no need to perform
129
- # translation.
130
- if idle_mode:
131
- return
132
-
133
- fast_accurate = self.fast_most_accurate_approach(input_data)
134
- return self.default_pre_part_splitting_approach(input_data) \
135
- if fast_accurate is None else fast_accurate
File without changes
File without changes
@@ -1,27 +0,0 @@
1
- from arekit.common.data.doc_provider import DocumentProvider
2
- from arekit.contrib.source.nerel.reader import NerelDocReader
3
- from arekit.contrib.source.nerel.versions import NerelVersions
4
-
5
-
6
- class NERELDocProvider(DocumentProvider):
7
- """ A Russian dataset with nested named entities, relations, events and linked entities.
8
- https://github.com/nerel-ds/NEREL
9
- """
10
-
11
- def __init__(self, filename_by_id, version):
12
- """ filename_ids: dict
13
- Dictionary of {id: filename}, where
14
- - id: int
15
- - filename: str
16
- version: NerelVersions
17
- Specify the appropriate version of teh NEREL collection.
18
- """
19
- assert(isinstance(filename_by_id, dict))
20
- assert(isinstance(version, NerelVersions))
21
- super(NERELDocProvider, self).__init__()
22
- self.__filename_by_id = filename_by_id
23
- self.__version = version
24
- self.__doc_reader = NerelDocReader(version)
25
-
26
- def by_id(self, doc_id):
27
- return self.__doc_reader.read_document(doc_id=doc_id, filename=self.__filename_by_id[doc_id])
@@ -1,65 +0,0 @@
1
- from arekit.common.data.doc_provider import DocumentProvider
2
- from arekit.common.experiment.data_type import DataType
3
- from arekit.contrib.source.nerel.io_utils import NerelIOUtils
4
- from arekit.contrib.source.nerel.versions import NerelVersions
5
- from arekit.contrib.utils.pipelines.sources.nerel.doc_provider import NERELDocProvider
6
- from arekit.contrib.utils.pipelines.sources.nerel.labels_fmt import NerelAnyLabelFormatter
7
- from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
8
- from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
9
- from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
10
-
11
-
12
- def create_text_relation_extraction_pipeline(nerel_version,
13
- text_parser,
14
- label_formatter=NerelAnyLabelFormatter(),
15
- terms_per_context=50,
16
- doc_ops=None,
17
- docs_limit=None,
18
- custom_text_opinion_filters=None):
19
- assert(isinstance(nerel_version, NerelVersions))
20
- assert(isinstance(doc_ops, DocumentProvider) or doc_ops is None)
21
- assert(isinstance(custom_text_opinion_filters, list) or custom_text_opinion_filters is None)
22
-
23
- data_folding = None
24
-
25
- if doc_ops is None:
26
- # Default Initialization.
27
- filenames_by_ids, data_folding = NerelIOUtils.read_dataset_split(version=nerel_version,
28
- docs_limit=docs_limit)
29
- doc_ops = NERELDocProvider(filename_by_id=filenames_by_ids, version=nerel_version)
30
-
31
- # Default text opinion filters.
32
- text_opinion_filters = [
33
- DistanceLimitedTextOpinionFilter(terms_per_context)
34
- ]
35
-
36
- # Append with the custom filters afterwards.
37
- if custom_text_opinion_filters is not None:
38
- text_opinion_filters += custom_text_opinion_filters
39
-
40
- predefined_annot = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)
41
-
42
- pipelines = {
43
- DataType.Train: text_opinion_extraction_pipeline(text_parser=text_parser,
44
- get_doc_by_id_func=doc_ops.by_id,
45
- annotators=[predefined_annot],
46
- entity_index_func=lambda brat_entity: brat_entity.ID,
47
- text_opinion_filters=text_opinion_filters),
48
- DataType.Test: text_opinion_extraction_pipeline(text_parser=text_parser,
49
- get_doc_by_id_func=doc_ops.by_id,
50
- annotators=[predefined_annot],
51
- entity_index_func=lambda brat_entity: brat_entity.ID,
52
- text_opinion_filters=text_opinion_filters),
53
- DataType.Dev: text_opinion_extraction_pipeline(text_parser=text_parser,
54
- get_doc_by_id_func=doc_ops.by_id,
55
- annotators=[predefined_annot],
56
- entity_index_func=lambda brat_entity: brat_entity.ID,
57
- text_opinion_filters=text_opinion_filters),
58
- }
59
-
60
- # In the case when we setup a default data-folding.
61
- # There is a need to provide it, due to the needs in further.
62
- if data_folding is not None:
63
- return pipelines, data_folding
64
-
65
- return pipelines
@@ -1,60 +0,0 @@
1
- from arekit.common.labels.str_fmt import StringLabelsFormatter
2
- from arekit.contrib.source.nerel import labels
3
-
4
-
5
- class NerelAnyLabelFormatter(StringLabelsFormatter):
6
-
7
- def __init__(self):
8
-
9
- stol = {
10
- "OPINION_BELONGS_TO": labels.OpinionBelongsTo,
11
- "OPINION_RELATES_TO": labels.OpinionRelatesTo,
12
- "NEG_EFFECT_FROM": labels.NegEffectFrom,
13
- "POS_EFFECT_FROM": labels.PosEffectFrom,
14
- "NEG_STATE_FROM": labels.NegStateFrom,
15
- "POS_STATE_FROM": labels.PosStateFrom,
16
- "NEGATIVE_TO": labels.NegativeTo,
17
- "POSITIVE_TO": labels.PositiveTo,
18
- "STATE_BELONGS_TO": labels.STATE_BELONGS_TO,
19
- "POS_AUTHOR_FROM": labels.PosAuthorFrom,
20
- "NEG_AUTHOR_FROM": labels.NegAuthorFrom,
21
- "ALTERNATIVE_NAME": labels.ALTERNATIVE_NAME,
22
- "ORIGINS_FROM": labels.ORIGINS_FROM,
23
- "START_TIME": labels.START_TIME,
24
- "OWNER_OF": labels.OWNER_OF,
25
- "SUBEVENT_OF": labels.SUBEVENT_OF,
26
- "PARENT_OF": labels.PARENT_OF,
27
- "SUBORDINATE_OF": labels.SUBORDINATE_OF,
28
- "PART_OF": labels.PART_OF,
29
- "TAKES_PLACE_IN": labels.TAKES_PLACE_IN,
30
- "PARTICIPANT_IN": labels.PARTICIPANT_IN,
31
- "WORKPLACE": labels.WORKPLACE,
32
- "PENALIZED_AS": labels.PENALIZED_AS,
33
- "WORKS_AS": labels.WORKS_AS,
34
- "PLACE_OF_DEATH": labels.PLACE_OF_DEATH,
35
- "PLACE_OF_BIRTH": labels.PLACE_OF_BIRTH,
36
- "HAS_CAUSE": labels.HAS_CAUSE,
37
- "AWARDED_WITH": labels.AWARDED_WITH,
38
- "CAUSE_OF_DEATH": labels.CAUSE_OF_DEATH,
39
- "CONVICTED_OF": labels.CONVICTED_OF,
40
- "DATE_DEFUNCT_IN": labels.DATE_DEFUNCT_IN,
41
- "DATE_FOUNDED_IN": labels.DATE_FOUNDED_IN,
42
- "DATE_OF_BIRTH": labels.DATE_OF_BIRTH,
43
- "DATE_OF_CREATION": labels.DATE_OF_CREATION,
44
- "DATE_OF_DEATH": labels.DATE_OF_DEATH,
45
- "END_TIME": labels.END_TIME,
46
- "EXPENDITURE": labels.EXPENDITURE,
47
- "FOUNDED_BY": labels.FOUNDED_BY,
48
- "KNOWS": labels.KNOWS,
49
- "RELATIVE": labels.RELATIVE,
50
- "LOCATED_IN": labels.LOCATED_IN,
51
- "RELIGION_OF": labels.RELIGION_OF,
52
- "MEDICAL_CONDITION": labels.MEDICAL_CONDITION,
53
- "SCHOOLS_ATTENDED": labels.SCHOOLS_ATTENDED,
54
- "MEMBER_OF": labels.MEMBER_OF,
55
- "SIBLING": labels.SIBLING,
56
- "ORGANIZES": labels.ORGANIZES,
57
- "SPOUSE": labels.SPOUSE
58
- }
59
-
60
- super(NerelAnyLabelFormatter, self).__init__(stol=stol)
@@ -1,29 +0,0 @@
1
- from arekit.common.data.doc_provider import DocumentProvider
2
- from arekit.contrib.source.nerelbio.reader import NerelBioDocReader
3
- from arekit.contrib.source.nerelbio.versions import NerelBioVersions
4
-
5
-
6
- class NERELBioDocProvider(DocumentProvider):
7
- """ NEREL-BIO extends the general domain dataset NEREL.
8
- NEREL-BIO annotation scheme covers both general and biomedical
9
- domains making it suitable for domain transfer experiments.
10
- https://github.com/nerel-ds/NEREL-BIO
11
- """
12
-
13
- def __init__(self, filename_by_id, version):
14
- """ filename_ids: dict
15
- Dictionary of {id: filename}, where
16
- - id: int
17
- - filename: str
18
- version: NerelBioVersions
19
- Specify the appropriate version of the NEREL-BIO collection.
20
- """
21
- assert(isinstance(filename_by_id, dict))
22
- assert(isinstance(version, NerelBioVersions))
23
- super(NERELBioDocProvider, self).__init__()
24
- self.__filename_by_id = filename_by_id
25
- self.__version = version
26
- self.__doc_reader = NerelBioDocReader(version)
27
-
28
- def by_id(self, doc_id):
29
- return self.__doc_reader.read_document(doc_id=doc_id, filename=self.__filename_by_id[doc_id])
@@ -1,64 +0,0 @@
1
- from arekit.common.data.doc_provider import DocumentProvider
2
- from arekit.common.experiment.data_type import DataType
3
- from arekit.contrib.source.nerelbio.io_utils import NerelBioIOUtils
4
- from arekit.contrib.source.nerelbio.versions import NerelBioVersions
5
- from arekit.contrib.utils.pipelines.sources.nerel_bio.doc_provider import NERELBioDocProvider
6
- from arekit.contrib.utils.pipelines.sources.nerel_bio.labels_fmt import NerelBioAnyLabelFormatter
7
- from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
8
- from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
9
- from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
10
-
11
-
12
- def create_text_relation_extraction_pipeline(nerel_bio_version,
13
- text_parser,
14
- label_formatter=NerelBioAnyLabelFormatter(),
15
- terms_per_context=50,
16
- doc_ops=None,
17
- docs_limit=None,
18
- custom_text_opinion_filters=None):
19
- assert(isinstance(nerel_bio_version, NerelBioVersions))
20
- assert(isinstance(doc_ops, DocumentProvider) or doc_ops is None)
21
- assert(isinstance(custom_text_opinion_filters, list) or custom_text_opinion_filters is None)
22
-
23
- data_folding = None
24
-
25
- if doc_ops is None:
26
- # Default Initialization.
27
- filenames_by_ids, data_folding = NerelBioIOUtils.read_dataset_split(version=nerel_bio_version,
28
- docs_limit=docs_limit)
29
- doc_ops = NERELBioDocProvider(filename_by_id=filenames_by_ids, version=nerel_bio_version)
30
-
31
- text_opinion_filters = [
32
- DistanceLimitedTextOpinionFilter(terms_per_context)
33
- ]
34
-
35
- # Append with the custom filters afterwards.
36
- if custom_text_opinion_filters is not None:
37
- text_opinion_filters += custom_text_opinion_filters
38
-
39
- predefined_annot = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)
40
-
41
- pipelines = {
42
- DataType.Train: text_opinion_extraction_pipeline(text_parser=text_parser,
43
- get_doc_by_id_func=doc_ops.by_id,
44
- annotators=[predefined_annot],
45
- entity_index_func=lambda brat_entity: brat_entity.ID,
46
- text_opinion_filters=text_opinion_filters),
47
- DataType.Test: text_opinion_extraction_pipeline(text_parser=text_parser,
48
- get_doc_by_id_func=doc_ops.by_id,
49
- annotators=[predefined_annot],
50
- entity_index_func=lambda brat_entity: brat_entity.ID,
51
- text_opinion_filters=text_opinion_filters),
52
- DataType.Dev: text_opinion_extraction_pipeline(text_parser=text_parser,
53
- get_doc_by_id_func=doc_ops.by_id,
54
- annotators=[predefined_annot],
55
- entity_index_func=lambda brat_entity: brat_entity.ID,
56
- text_opinion_filters=text_opinion_filters),
57
- }
58
-
59
- # In the case when we setup a default data-folding.
60
- # There is a need to provide it, due to the needs in further.
61
- if data_folding is not None:
62
- return pipelines, data_folding
63
-
64
- return pipelines
@@ -1,79 +0,0 @@
1
- from arekit.common.labels.str_fmt import StringLabelsFormatter
2
- from arekit.contrib.source.nerelbio import labels
3
-
4
-
5
class NerelBioAnyLabelFormatter(StringLabelsFormatter):
    """String formatter covering every relation label of the NEREL-BIO collection.

    Each supported relation-type name is mapped onto the label class of the
    same name declared in the `labels` module of the NEREL-BIO contribution.
    """

    # All relation-type names supported by NEREL-BIO. Every name matches an
    # attribute of the `labels` module one-to-one, which lets us build the
    # string-to-label table with getattr instead of a hand-written literal.
    _RELATION_NAMES = (
        "ABBREVIATION", "ALTERNATIVE_NAME", "KNOWS", "AGE_IS", "AGE_DIED_AT",
        "AWARDED_WITH", "PLACE_OF_BIRTH", "DATE_DEFUNCT_IN", "DATE_FOUNDED_IN",
        "DATE_OF_BIRTH", "DATE_OF_CREATION", "DATE_OF_DEATH", "POINT_IN_TIME",
        "PLACE_OF_DEATH", "FOUNDED_BY", "HEADQUARTERED_IN", "IDEOLOGY_OF",
        "SPOUSE", "MEMBER_OF", "ORGANIZES", "OWNER_OF", "PARENT_OF",
        "PARTICIPANT_IN", "PLACE_RESIDES_IN", "PRICE_OF", "PRODUCES",
        "RELATIVE", "RELIGION_OF", "SCHOOLS_ATTENDED", "SIBLING",
        "SUBEVENT_OF", "SUBORDINATE_OF", "TAKES_PLACE_IN", "WORKPLACE",
        "WORKS_AS", "CONVICTED_OF", "PENALIZED_AS", "START_TIME", "END_TIME",
        "EXPENDITURE", "AGENT", "INANIMATE_INVOLVED", "INCOME", "SUBCLASS_OF",
        "PART_OF", "LOCATED_IN", "TREATED_USING", "ORIGINS_FROM",
        "TO_DETECT_OR_STUDY", "AFFECTS", "HAS_CAUSE", "APPLIED_TO", "USED_IN",
        "ASSOCIATED_WITH", "HAS_ADMINISTRATION_ROUTE", "HAS_STRENGTH",
        "DURATION_OF", "VALUE_IS", "PHYSIOLOGY_OF", "PROCEDURE_PERFORMED",
        "MENTAL_PROCESS_OF", "MEDICAL_CONDITION", "DOSE_IS", "FINDING_OF",
        "CAUSE_OF_DEATH", "CONSUME",
    )

    def __init__(self):
        # Resolve every relation name to its label class at construction time.
        stol = {name: getattr(labels, name) for name in self._RELATION_NAMES}
        super(NerelBioAnyLabelFormatter, self).__init__(stol=stol)
@@ -1,56 +0,0 @@
1
- from arekit.common.utils import progress_bar_iter
2
- from arekit.contrib.source.ruattitudes.collection import RuAttitudesCollection
3
- from arekit.contrib.source.ruattitudes.io_utils import RuAttitudesVersions
4
- from arekit.contrib.source.ruattitudes.doc import RuAttitudesDocument
5
- from arekit.contrib.source.ruattitudes.doc_brat import RuAttitudesDocumentsConverter
6
- from arekit.contrib.utils.data.doc_provider.dict_based import DictionaryBasedDocumentProvider
7
-
8
-
9
class RuAttitudesDocumentProvider(DictionaryBasedDocumentProvider):
    """Document provider backed by an in-memory, BRAT-converted RuAttitudes dump."""

    def __init__(self, version, keep_doc_ids_only, doc_id_func, limit):
        docs = self.read_ruattitudes_to_brat_in_memory(version=version,
                                                       keep_doc_ids_only=keep_doc_ids_only,
                                                       doc_id_func=doc_id_func,
                                                       limit=limit)
        super(RuAttitudesDocumentProvider, self).__init__(docs)

    @staticmethod
    def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, limit=None):
        """ Performs reading of RuAttitude formatted documents and
            selection according to 'doc_ids_set' parameter.

            When `keep_doc_ids_only` is True only identifiers are read and the
            resulting dictionary maps every id onto None; otherwise each
            document is converted into its BRAT representation.
            `limit` (optional) caps the number of entries read.
        """
        assert isinstance(version, RuAttitudesVersions)
        assert isinstance(keep_doc_ids_only, bool)
        assert callable(doc_id_func)

        raw_docs_it = RuAttitudesCollection.iter_docs(version=version,
                                                      get_doc_index_func=doc_id_func,
                                                      return_inds_only=keep_doc_ids_only)

        mode_hint = "doc ids only" if keep_doc_ids_only else "fully"
        logged_it = progress_bar_iter(
            iterable=RuAttitudesDocumentProvider.__iter_id_with_doc(
                docs_it=raw_docs_it, keep_doc_ids_only=keep_doc_ids_only),
            desc="Loading RuAttitudes Collection [{}]".format(mode_hint),
            unit='docs')

        collected = {}
        # enumerate(start=1) tracks how many entries were read so the optional
        # `limit` can stop the iteration early.
        for read_count, (doc_id, doc) in enumerate(logged_it, start=1):
            assert isinstance(doc, RuAttitudesDocument) or doc is None
            collected[doc_id] = None if doc is None else RuAttitudesDocumentsConverter.to_brat_doc(doc)
            if limit is not None and read_count >= limit:
                break

        return collected

    @staticmethod
    def __iter_id_with_doc(docs_it, keep_doc_ids_only):
        # Normalize both reading modes to a single (id, doc-or-None) stream.
        if keep_doc_ids_only:
            yield from ((doc_id, None) for doc_id in docs_it)
            return
        for doc in docs_it:
            assert isinstance(doc, RuAttitudesDocument)
            yield doc.ID, doc
@@ -1,20 +0,0 @@
1
- from arekit.common.entities.types import OpinionEntityType
2
- from arekit.contrib.utils.entities.filter import EntityFilter
3
-
4
-
5
class RuAttitudesEntityFilter(EntityFilter):
    """ This is a task-specific filter, which is applicable of entity types proposed
    by the OntoNotesV5 resource: https://catalog.ldc.upenn.edu/LDC2013T19
    We consider only a short list related to the sentiment attitude extraction task.
    """

    # Entity types allowed to act as either side of a sentiment attitude.
    supported = ["GPE", "PERSON", "LOCAL", "GEO", "ORG"]

    def is_ignored(self, entity, e_type):
        """Return True when `entity` must be excluded for the given role.

        Subject and Object roles share the same whitelist of entity types;
        every other role is always ignored.

        Fix: the original duplicated the Subject/Object branches as two
        separate `if` statements, with the dangling `else` attached only to
        the second one — correct at runtime but misleading to read. The two
        branches are merged into a single membership test.
        """
        if e_type in (OpinionEntityType.Subject, OpinionEntityType.Object):
            return entity.Type not in RuAttitudesEntityFilter.supported
        return True