arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267) hide show
  1. arekit/common/context/terms_mapper.py +2 -2
  2. arekit/common/data/const.py +5 -4
  3. arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
  4. arekit/common/data/input/providers/columns/sample.py +6 -1
  5. arekit/common/data/input/providers/instances/base.py +1 -1
  6. arekit/common/data/input/providers/rows/base.py +36 -13
  7. arekit/common/data/input/providers/rows/samples.py +57 -55
  8. arekit/common/data/input/providers/sample/cropped.py +2 -2
  9. arekit/common/data/input/sample.py +1 -1
  10. arekit/common/data/rows_fmt.py +82 -0
  11. arekit/common/data/rows_parser.py +43 -0
  12. arekit/common/data/storages/base.py +23 -18
  13. arekit/common/data/views/samples.py +2 -8
  14. arekit/common/{news → docs}/base.py +2 -2
  15. arekit/common/{news → docs}/entities_grouping.py +2 -1
  16. arekit/common/{news → docs}/entity.py +2 -1
  17. arekit/common/{news → docs}/parsed/base.py +5 -5
  18. arekit/common/docs/parsed/providers/base.py +68 -0
  19. arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
  20. arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
  21. arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
  22. arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
  23. arekit/common/docs/parsed/service.py +31 -0
  24. arekit/common/docs/parser.py +66 -0
  25. arekit/common/{news → docs}/sentence.py +1 -1
  26. arekit/common/entities/base.py +11 -2
  27. arekit/common/experiment/api/base_samples_io.py +1 -1
  28. arekit/common/frames/variants/collection.py +2 -2
  29. arekit/common/linkage/base.py +2 -2
  30. arekit/common/linkage/meta.py +23 -0
  31. arekit/common/linkage/opinions.py +1 -1
  32. arekit/common/linkage/text_opinions.py +2 -2
  33. arekit/common/opinions/annot/algo/base.py +1 -1
  34. arekit/common/opinions/annot/algo/pair_based.py +15 -13
  35. arekit/common/opinions/annot/algo/predefined.py +4 -4
  36. arekit/common/opinions/annot/algo_based.py +5 -5
  37. arekit/common/opinions/annot/base.py +3 -3
  38. arekit/common/opinions/base.py +7 -7
  39. arekit/common/opinions/collection.py +3 -3
  40. arekit/common/pipeline/base.py +12 -16
  41. arekit/common/pipeline/batching.py +28 -0
  42. arekit/common/pipeline/context.py +5 -1
  43. arekit/common/pipeline/items/base.py +38 -1
  44. arekit/common/pipeline/items/flatten.py +5 -1
  45. arekit/common/pipeline/items/handle.py +2 -1
  46. arekit/common/pipeline/items/iter.py +2 -1
  47. arekit/common/pipeline/items/map.py +2 -1
  48. arekit/common/pipeline/items/map_nested.py +4 -0
  49. arekit/common/pipeline/utils.py +32 -0
  50. arekit/common/service/sqlite.py +36 -0
  51. arekit/common/synonyms/base.py +2 -2
  52. arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
  53. arekit/common/text_opinions/base.py +11 -11
  54. arekit/common/utils.py +33 -46
  55. arekit/contrib/networks/embedding.py +3 -3
  56. arekit/contrib/networks/embedding_io.py +5 -5
  57. arekit/contrib/networks/input/const.py +0 -2
  58. arekit/contrib/networks/input/providers/sample.py +15 -29
  59. arekit/contrib/networks/input/rows_parser.py +47 -134
  60. arekit/contrib/prompt/sample.py +18 -16
  61. arekit/contrib/utils/data/contents/opinions.py +17 -5
  62. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  63. arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
  64. arekit/contrib/utils/data/readers/base.py +3 -0
  65. arekit/contrib/utils/data/readers/csv_pd.py +10 -4
  66. arekit/contrib/utils/data/readers/jsonl.py +3 -0
  67. arekit/contrib/utils/data/readers/sqlite.py +14 -0
  68. arekit/contrib/utils/data/service/balance.py +0 -1
  69. arekit/contrib/utils/data/storages/pandas_based.py +3 -5
  70. arekit/contrib/utils/data/storages/row_cache.py +18 -6
  71. arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
  72. arekit/contrib/utils/data/writers/base.py +5 -0
  73. arekit/contrib/utils/data/writers/csv_native.py +3 -0
  74. arekit/contrib/utils/data/writers/csv_pd.py +3 -0
  75. arekit/contrib/utils/data/writers/json_opennre.py +31 -13
  76. arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
  77. arekit/contrib/utils/io_utils/embedding.py +25 -33
  78. arekit/contrib/utils/io_utils/utils.py +3 -24
  79. arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
  80. arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
  81. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  82. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  83. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
  84. arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
  85. arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
  86. arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
  87. arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
  88. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
  89. arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
  90. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
  91. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
  92. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
  93. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
  94. arekit/contrib/utils/serializer.py +4 -23
  95. arekit-0.25.0.data/data/logo.png +0 -0
  96. arekit-0.25.0.dist-info/METADATA +82 -0
  97. arekit-0.25.0.dist-info/RECORD +259 -0
  98. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
  99. arekit/common/data/row_ids/base.py +0 -79
  100. arekit/common/data/row_ids/binary.py +0 -38
  101. arekit/common/data/row_ids/multiple.py +0 -14
  102. arekit/common/folding/base.py +0 -36
  103. arekit/common/folding/fixed.py +0 -42
  104. arekit/common/folding/nofold.py +0 -15
  105. arekit/common/folding/united.py +0 -46
  106. arekit/common/news/objects_parser.py +0 -37
  107. arekit/common/news/parsed/providers/base.py +0 -48
  108. arekit/common/news/parsed/service.py +0 -31
  109. arekit/common/news/parser.py +0 -34
  110. arekit/common/text/parser.py +0 -12
  111. arekit/common/text/partitioning/__init__.py +0 -0
  112. arekit/common/text/partitioning/base.py +0 -4
  113. arekit/common/text/partitioning/terms.py +0 -35
  114. arekit/contrib/source/__init__.py +0 -0
  115. arekit/contrib/source/brat/__init__.py +0 -0
  116. arekit/contrib/source/brat/annot.py +0 -83
  117. arekit/contrib/source/brat/entities/__init__.py +0 -0
  118. arekit/contrib/source/brat/entities/compound.py +0 -33
  119. arekit/contrib/source/brat/entities/entity.py +0 -42
  120. arekit/contrib/source/brat/entities/parser.py +0 -53
  121. arekit/contrib/source/brat/news.py +0 -28
  122. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  123. arekit/contrib/source/brat/opinions/converter.py +0 -19
  124. arekit/contrib/source/brat/relation.py +0 -32
  125. arekit/contrib/source/brat/sentence.py +0 -69
  126. arekit/contrib/source/brat/sentences_reader.py +0 -128
  127. arekit/contrib/source/download.py +0 -41
  128. arekit/contrib/source/nerel/__init__.py +0 -0
  129. arekit/contrib/source/nerel/entities.py +0 -55
  130. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  131. arekit/contrib/source/nerel/folding/fixed.py +0 -75
  132. arekit/contrib/source/nerel/io_utils.py +0 -62
  133. arekit/contrib/source/nerel/labels.py +0 -241
  134. arekit/contrib/source/nerel/reader.py +0 -46
  135. arekit/contrib/source/nerel/utils.py +0 -24
  136. arekit/contrib/source/nerel/versions.py +0 -12
  137. arekit/contrib/source/nerelbio/__init__.py +0 -0
  138. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  139. arekit/contrib/source/nerelbio/labels.py +0 -265
  140. arekit/contrib/source/nerelbio/reader.py +0 -8
  141. arekit/contrib/source/nerelbio/versions.py +0 -8
  142. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  143. arekit/contrib/source/ruattitudes/collection.py +0 -36
  144. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  145. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  146. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  147. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  148. arekit/contrib/source/ruattitudes/news.py +0 -51
  149. arekit/contrib/source/ruattitudes/news_brat.py +0 -44
  150. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  151. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  152. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  153. arekit/contrib/source/ruattitudes/reader.py +0 -268
  154. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  155. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  156. arekit/contrib/source/ruattitudes/text_object.py +0 -57
  157. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  158. arekit/contrib/source/rusentiframes/collection.py +0 -157
  159. arekit/contrib/source/rusentiframes/effect.py +0 -24
  160. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  161. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  162. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  163. arekit/contrib/source/rusentiframes/role.py +0 -15
  164. arekit/contrib/source/rusentiframes/state.py +0 -24
  165. arekit/contrib/source/rusentiframes/types.py +0 -42
  166. arekit/contrib/source/rusentiframes/value.py +0 -2
  167. arekit/contrib/source/rusentrel/__init__.py +0 -0
  168. arekit/contrib/source/rusentrel/const.py +0 -3
  169. arekit/contrib/source/rusentrel/entities.py +0 -26
  170. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  171. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  172. arekit/contrib/source/rusentrel/news_reader.py +0 -51
  173. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  174. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  175. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  176. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  177. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  178. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  179. arekit/contrib/source/sentinerel/__init__.py +0 -0
  180. arekit/contrib/source/sentinerel/entities.py +0 -52
  181. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  182. arekit/contrib/source/sentinerel/folding/factory.py +0 -32
  183. arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
  184. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  185. arekit/contrib/source/sentinerel/labels.py +0 -53
  186. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  187. arekit/contrib/source/sentinerel/reader.py +0 -42
  188. arekit/contrib/source/synonyms/__init__.py +0 -0
  189. arekit/contrib/source/synonyms/utils.py +0 -19
  190. arekit/contrib/source/zip_utils.py +0 -47
  191. arekit/contrib/utils/bert/rows.py +0 -0
  192. arekit/contrib/utils/bert/text_b_rus.py +0 -18
  193. arekit/contrib/utils/connotations/__init__.py +0 -0
  194. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  195. arekit/contrib/utils/cv/__init__.py +0 -0
  196. arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
  197. arekit/contrib/utils/cv/doc_stat/base.py +0 -37
  198. arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
  199. arekit/contrib/utils/cv/splitters/__init__.py +0 -0
  200. arekit/contrib/utils/cv/splitters/base.py +0 -4
  201. arekit/contrib/utils/cv/splitters/default.py +0 -53
  202. arekit/contrib/utils/cv/splitters/statistical.py +0 -57
  203. arekit/contrib/utils/cv/two_class.py +0 -77
  204. arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
  205. arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
  206. arekit/contrib/utils/data/ext.py +0 -31
  207. arekit/contrib/utils/data/views/__init__.py +0 -0
  208. arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
  209. arekit/contrib/utils/data/views/linkages/base.py +0 -58
  210. arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
  211. arekit/contrib/utils/data/views/linkages/utils.py +0 -24
  212. arekit/contrib/utils/data/views/opinions.py +0 -14
  213. arekit/contrib/utils/download.py +0 -78
  214. arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
  215. arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
  216. arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
  217. arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
  218. arekit/contrib/utils/io_utils/opinions.py +0 -39
  219. arekit/contrib/utils/io_utils/samples.py +0 -78
  220. arekit/contrib/utils/lexicons/__init__.py +0 -0
  221. arekit/contrib/utils/lexicons/lexicon.py +0 -43
  222. arekit/contrib/utils/lexicons/relation.py +0 -45
  223. arekit/contrib/utils/lexicons/rusentilex.py +0 -34
  224. arekit/contrib/utils/nn/__init__.py +0 -0
  225. arekit/contrib/utils/nn/rows.py +0 -83
  226. arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
  227. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  228. arekit/contrib/utils/pipelines/items/to_output.py +0 -101
  229. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  230. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  231. arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
  232. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
  233. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  234. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  235. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
  236. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
  237. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  238. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  239. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
  240. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
  241. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
  242. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  243. arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
  244. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
  245. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  246. arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
  247. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  248. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
  249. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  250. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  251. arekit/contrib/utils/resources.py +0 -26
  252. arekit/contrib/utils/sources/__init__.py +0 -0
  253. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  254. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  255. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  256. arekit/contrib/utils/utils_folding.py +0 -19
  257. arekit/download_data.py +0 -11
  258. arekit-0.23.1.dist-info/METADATA +0 -23
  259. arekit-0.23.1.dist-info/RECORD +0 -403
  260. /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
  261. /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
  262. /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
  263. /arekit/common/{news → docs}/parsed/term_position.py +0 -0
  264. /arekit/common/{news/parsed → service}/__init__.py +0 -0
  265. /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
  266. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
  267. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
@@ -1,78 +0,0 @@
1
- import logging
2
- from os.path import join
3
-
4
- from arekit.contrib.utils.data.ext import create_writer_extension, create_reader_extension
5
- from arekit.contrib.utils.data.readers.base import BaseReader
6
- from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
7
- from arekit.contrib.utils.data.writers.base import BaseWriter
8
- from arekit.contrib.utils.io_utils.utils import filename_template, check_targets_existence
9
-
10
- logger = logging.getLogger(__name__)
11
- logging.basicConfig(level=logging.INFO)
12
-
13
-
14
- class SamplesIO(BaseSamplesIO):
15
- """ Samples default IO utils for samples.
16
- Sample is a text part which include pair of attitude participants.
17
- This class allows to provide saver and loader for such entries, bubbed as samples.
18
- Samples required for machine learning training/inferring.
19
- """
20
-
21
- def __init__(self, target_dir, writer=None, reader=None, prefix="sample", target_extension=None):
22
- assert(isinstance(target_dir, str))
23
- assert(isinstance(prefix, str))
24
- assert(isinstance(writer, BaseWriter) or writer is None)
25
- assert(isinstance(reader, BaseReader) or reader is None)
26
- assert(isinstance(target_extension, str) or target_extension is None)
27
- self.__target_dir = target_dir
28
- self.__prefix = prefix
29
- self.__writer = writer
30
- self.__reader = reader
31
- self.__target_extension = target_extension
32
-
33
- if target_extension is None:
34
- if writer is not None:
35
- self.__target_extension = create_writer_extension(writer)
36
- elif reader is not None:
37
- self.__target_extension = create_reader_extension(reader)
38
-
39
- # region public methods
40
-
41
- @property
42
- def Reader(self):
43
- return self.__reader
44
-
45
- @property
46
- def Writer(self):
47
- return self.__writer
48
-
49
- def create_target(self, data_type, data_folding):
50
- return self.__get_input_sample_target(data_type, data_folding=data_folding)
51
-
52
- def check_targets_existed(self, data_types_iter, data_folding):
53
- for data_type in data_types_iter:
54
-
55
- targets = [
56
- self.__get_input_sample_target(data_type=data_type, data_folding=data_folding),
57
- ]
58
-
59
- if not check_targets_existence(targets=targets):
60
- return False
61
- return True
62
-
63
- # endregion
64
-
65
- def __get_input_sample_target(self, data_type, data_folding):
66
- template = filename_template(data_type=data_type, data_folding=data_folding)
67
- return self.__get_filepath(out_dir=self.__target_dir,
68
- template=template,
69
- prefix=self.__prefix,
70
- extension=self.__target_extension)
71
-
72
- @staticmethod
73
- def __get_filepath(out_dir, template, prefix, extension):
74
- assert(isinstance(template, str))
75
- assert(isinstance(prefix, str))
76
- assert(isinstance(extension, str))
77
- return join(out_dir, "{prefix}-{template}{extension}".format(
78
- prefix=prefix, template=template, extension=extension))
File without changes
@@ -1,43 +0,0 @@
1
- import pandas as pd
2
- from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
3
-
4
-
5
- class Lexicon(object):
6
-
7
- @property
8
- def ToneKey(self):
9
- return 'tone'
10
-
11
- @property
12
- def TermKey(self):
13
- return 'term'
14
-
15
- def __init__(self, dataframe):
16
- assert(isinstance(dataframe, pd.DataFrame))
17
- self.__lexicon_df = dataframe
18
-
19
- @classmethod
20
- def load(cls, filepath, separator=','):
21
- reader = PandasCsvReader(compression=None, sep=separator)
22
- return cls(reader.read(filepath))
23
-
24
- def get_score(self, lemma):
25
- assert(type(lemma) == str)
26
- s = self.__lexicon_df[lemma.encode('utf-8') == self.__lexicon_df[self.TermKey]]
27
- return s[self.ToneKey].values[0] if len(s) > 0 else 0
28
-
29
- def has_term(self, term):
30
- assert(type(term) == str)
31
- s = self.__lexicon_df[term.encode('utf-8') == self.__lexicon_df[self.TermKey]]
32
- return len(s) > 0
33
-
34
- def __iter__(self):
35
- for term in self.__lexicon_df[self.TermKey]:
36
- yield term
37
-
38
- def __contains__(self, item):
39
- assert(isinstance(item, str))
40
- result = self.__lexicon_df[self.__lexicon_df[self.TermKey] == item.encode('utf-8')]
41
- return len(result) > 0
42
-
43
-
@@ -1,45 +0,0 @@
1
- import pandas as pd
2
-
3
- from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
4
-
5
-
6
- class RelationLexicon(object):
7
-
8
- def __init__(self, dataframe):
9
- assert(isinstance(dataframe, pd.DataFrame))
10
- self.__check(dataframe)
11
- self.__lexicon = dataframe
12
-
13
- @classmethod
14
- def load(cls, filepath, separator=','):
15
- reader = PandasCsvReader(compression=None, sep=separator)
16
- return cls(reader.read(filepath))
17
-
18
- @staticmethod
19
- def __check(df):
20
- for index in df.index:
21
- relation = df.loc[index][0]
22
- assert(len(relation.split('<->')) == 2)
23
-
24
- @staticmethod
25
- def __create_key(l, r):
26
- assert(type(l) == str)
27
- assert(type(r) == str)
28
- return '<->'.join([l, r])
29
-
30
- def get_score(self, left, right):
31
- assert(type(left) == str)
32
- assert(type(right) == str)
33
-
34
- lr_key = self.__create_key(left, right)
35
- rl_key = self.__create_key(right, left)
36
-
37
- lr_score = self.__lexicon[lr_key == self.__lexicon['relation']]
38
- rl_score = self.__lexicon[rl_key == self.__lexicon['relation']]
39
-
40
- if len(lr_score) > 0:
41
- return lr_score['tone'].values[0]
42
- if len(rl_score) > 0:
43
- return rl_score['tone'].values[0]
44
-
45
- return None
@@ -1,34 +0,0 @@
1
- import zipfile
2
- from os import path
3
-
4
- import pandas as pd
5
-
6
- from arekit.contrib.source.zip_utils import ZipArchiveUtils
7
- from arekit.contrib.utils.lexicons.lexicon import Lexicon
8
-
9
-
10
- class RuSentiLexLexicon(Lexicon):
11
- """
12
- RuSentiLex Lexicon wrapper for csv file stored in /data folder.
13
- """
14
-
15
- __INNER_PATH = 'rusentilex.csv'
16
-
17
- @property
18
- def ToneKey(self):
19
- return 'tone'
20
-
21
- @property
22
- def TermKey(self):
23
- return 'term'
24
-
25
- @staticmethod
26
- def __get_archive_filepath():
27
- return path.join(ZipArchiveUtils.get_data_root(), "rusentilex.zip")
28
-
29
- @classmethod
30
- def from_zip(cls):
31
- with zipfile.ZipFile(cls.__get_archive_filepath(), "r") as zip_ref:
32
- with zip_ref.open(cls.__INNER_PATH, mode='r') as csv_file:
33
- df = pd.read_csv(csv_file, sep=',')
34
- return cls(df)
File without changes
@@ -1,83 +0,0 @@
1
- import collections
2
-
3
- from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
4
- from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
5
- from arekit.common.entities.str_fmt import StringEntitiesFormatter
6
- from arekit.contrib.networks.input.ctx_serialization import NetworkSerializationContext
7
- from arekit.contrib.networks.input.formatters.pos_mapper import PosTermsMapper
8
- from arekit.contrib.networks.input.providers.sample import NetworkSampleRowProvider
9
- from arekit.contrib.networks.input.providers.text import NetworkSingleTextProvider
10
- from arekit.contrib.networks.input.term_types import TermTypes
11
- from arekit.contrib.networks.input.terms_mapping import VectorizedNetworkTermMapping
12
- from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
13
- from arekit.contrib.utils.resources import load_embedding_news_mystem_skipgram_1000_20_2015
14
- from arekit.contrib.utils.vectorizers.bpe import BPEVectorizer
15
- from arekit.contrib.utils.vectorizers.random_norm import RandomNormalVectorizer
16
-
17
-
18
- def __add_term_embedding(dict_data, term, emb_vector):
19
- if term in dict_data:
20
- return
21
- dict_data[term] = emb_vector
22
-
23
-
24
- def create_rows_provider(str_entity_fmt, ctx, vectorizers="default"):
25
- """ This method is corresponds to the default initialization of
26
- the rows provider for data sampling pipeline.
27
-
28
- vectorizers:
29
- NONE: no need to vectorize, just provide text (using SingleTextProvider).
30
- DEFAULT: we consider an application of stemmer for Russian Language.
31
- DICT: in which for every type there is an assigned Vectorizer
32
- vectorization of term types.
33
- {
34
- TermType.Word: Vectorizer,
35
- TermType.Entity: Vectorizer,
36
- ...
37
- }
38
- """
39
- assert(isinstance(str_entity_fmt, StringEntitiesFormatter))
40
- assert(isinstance(ctx, NetworkSerializationContext))
41
- assert(isinstance(vectorizers, dict) or vectorizers == "default" or vectorizers is None)
42
-
43
- term_embedding_pairs = None
44
-
45
- if vectorizers is not None:
46
-
47
- if vectorizers == "default":
48
- # initialize default vectorizer for Russian language.
49
- embedding = load_embedding_news_mystem_skipgram_1000_20_2015(stemmer=MystemWrapper(), auto_download=True)
50
- bpe_vectorizer = BPEVectorizer(embedding=embedding, max_part_size=3)
51
- norm_vectorizer = RandomNormalVectorizer(vector_size=embedding.VectorSize,
52
- token_offset=12345)
53
- vectorizers = {
54
- TermTypes.WORD: bpe_vectorizer,
55
- TermTypes.ENTITY: bpe_vectorizer,
56
- TermTypes.FRAME: bpe_vectorizer,
57
- TermTypes.TOKEN: norm_vectorizer
58
- }
59
-
60
- # Setup term-embedding pairs collection instance.
61
- term_embedding_pairs = collections.OrderedDict()
62
-
63
- # Use text provider with vectorizers.
64
- text_provider = NetworkSingleTextProvider(
65
- text_terms_mapper=VectorizedNetworkTermMapping(
66
- vectorizers=vectorizers,
67
- string_entities_formatter=str_entity_fmt),
68
- pair_handling_func=lambda pair: __add_term_embedding(
69
- dict_data=term_embedding_pairs,
70
- term=pair[0],
71
- emb_vector=pair[1]))
72
- else:
73
- # Create text provider which without vectorizers.
74
- text_provider = BaseSingleTextProvider(
75
- text_terms_mapper=OpinionContainingTextTermsMapper(str_entity_fmt))
76
-
77
- return NetworkSampleRowProvider(
78
- label_provider=ctx.LabelProvider,
79
- text_provider=text_provider,
80
- frames_connotation_provider=ctx.FramesConnotationProvider,
81
- frame_role_label_scaler=ctx.FrameRolesLabelScaler,
82
- pos_terms_mapper=PosTermsMapper(ctx.PosTagger) if ctx.PosTagger is not None else None,
83
- term_embedding_pairs=term_embedding_pairs)
@@ -1,5 +0,0 @@
1
- from arekit.contrib.utils.pipelines.items.sampling.base import BaseSerializerPipelineItem
2
-
3
-
4
- class BertExperimentInputSerializerPipelineItem(BaseSerializerPipelineItem):
5
- pass
@@ -1,10 +0,0 @@
1
- from arekit.common.pipeline.context import PipelineContext
2
- from arekit.common.pipeline.items.base import BasePipelineItem
3
- from arekit.common.utils import split_by_whitespaces
4
-
5
-
6
- class TermsSplitterParser(BasePipelineItem):
7
-
8
- def apply_core(self, input_data, pipeline_ctx):
9
- assert(isinstance(pipeline_ctx, PipelineContext))
10
- return split_by_whitespaces(input_data)
@@ -1,101 +0,0 @@
1
- from arekit.common.data import const
2
- from arekit.common.data.storages.base import BaseRowsStorage
3
- from arekit.common.experiment.data_type import DataType
4
- from arekit.common.folding.base import BaseDataFolding
5
- from arekit.common.labels.scaler.base import BaseLabelScaler
6
- from arekit.common.labels.str_fmt import StringLabelsFormatter
7
- from arekit.common.model.labeling.modes import LabelCalculationMode
8
- from arekit.common.opinions.writer import OpinionCollectionWriter
9
- from arekit.common.pipeline.base import BasePipeline
10
- from arekit.common.pipeline.context import PipelineContext
11
- from arekit.common.pipeline.items.base import BasePipelineItem
12
- from arekit.common.pipeline.items.handle import HandleIterPipelineItem
13
- from arekit.contrib.utils.data.views.linkages.multilabel import MultilableOpinionLinkagesView
14
- from arekit.contrib.utils.data.views.opinions import BaseOpinionStorageView
15
- from arekit.contrib.utils.io_utils.opinions import OpinionsIO
16
- from arekit.contrib.utils.utils_folding import folding_iter_states, experiment_iter_index
17
- from arekit.contrib.utils.pipelines.opinion_collections import \
18
- text_opinion_linkages_to_opinion_collections_pipeline_part
19
-
20
-
21
- class TextOpinionLinkagesToOpinionConverterPipelineItem(BasePipelineItem):
22
-
23
- def __init__(self, opinions_io, create_opinion_collection_func,
24
- opinion_collection_writer, label_scaler, labels_formatter):
25
- """ create_opinion_collection_func: func
26
- func () -> OpinionCollection (empty)
27
- """
28
- assert(isinstance(opinions_io, OpinionsIO))
29
- assert(callable(create_opinion_collection_func))
30
- assert(isinstance(label_scaler, BaseLabelScaler))
31
- assert(isinstance(labels_formatter, StringLabelsFormatter))
32
- assert(isinstance(opinion_collection_writer, OpinionCollectionWriter))
33
- super(TextOpinionLinkagesToOpinionConverterPipelineItem, self).__init__()
34
-
35
- self.__opinions_io = opinions_io
36
- self.__labels_formatter = labels_formatter
37
- self.__label_scaler = label_scaler
38
- self.__create_opinion_collection_func = create_opinion_collection_func
39
- self.__opinion_collection_writer = opinion_collection_writer
40
-
41
- def __convert(self, data_folding, output_storage, target_func, data_type, pipeline_ctx):
42
- """ From `output_storage` to `target` conversion.
43
- output_storage: BaseRowsStorage
44
- target_func: func(doc_id) -- considered to provide a target for the particular document.
45
- """
46
- assert(isinstance(data_folding, BaseDataFolding))
47
- assert(isinstance(output_storage, BaseRowsStorage))
48
- assert(isinstance(data_type, DataType))
49
- assert(callable(target_func))
50
-
51
- # We utilize google bert format, where every row
52
- # consist of label probabilities per every class
53
- linkages_view = MultilableOpinionLinkagesView(labels_scaler=self.__label_scaler,
54
- storage=output_storage)
55
- target = self.__opinions_io.create_target(data_type=data_type, data_folding=data_folding)
56
- storage = self.__opinions_io.Reader.read(target)
57
-
58
- converter_part = text_opinion_linkages_to_opinion_collections_pipeline_part(
59
- iter_opinion_linkages_func=lambda doc_id: linkages_view.iter_opinion_linkages(
60
- doc_id=doc_id, opinions_view=BaseOpinionStorageView(storage)),
61
- doc_ids_set=set(data_folding.fold_doc_ids_set()[data_type]),
62
- create_opinion_collection_func=self.__create_opinion_collection_func,
63
- labels_scaler=self.__label_scaler,
64
- label_calc_mode=LabelCalculationMode.AVERAGE)
65
-
66
- pipeline = BasePipeline(
67
- converter_part +
68
- [HandleIterPipelineItem(lambda data: self.__opinion_collection_writer.serialize(
69
- collection=data[1],
70
- encoding='utf-8',
71
- labels_formatter=self.__labels_formatter,
72
- error_on_non_supported=True,
73
- target=target_func(data[0])))
74
- ])
75
-
76
- input_data = set(output_storage.iter_column_values(column_name=const.DOC_ID))
77
-
78
- # iterate over the result.
79
- for _ in pipeline.run(input_data, parent_ctx=pipeline_ctx):
80
- pass
81
-
82
- def _iter_output_and_target_pairs(self, iter_index, data_type):
83
- raise NotImplementedError()
84
-
85
- def apply_core(self, input_data, pipeline_ctx):
86
- assert(isinstance(pipeline_ctx, PipelineContext))
87
- assert("data_folding" in pipeline_ctx)
88
- assert("data_type" in pipeline_ctx)
89
-
90
- data_folding = pipeline_ctx.provide("data_folding")
91
- data_type = pipeline_ctx.provide("data_type")
92
-
93
- for _ in folding_iter_states(data_folding):
94
- iter_index = experiment_iter_index(data_folding)
95
- pairs_it = self._iter_output_and_target_pairs(iter_index=iter_index, data_type=data_type)
96
- for output_storage, target in pairs_it:
97
- self.__convert(output_storage=output_storage,
98
- target_func=target,
99
- data_type=data_type,
100
- data_folding=data_folding,
101
- pipeline_ctx=pipeline_ctx)
File without changes
File without changes
@@ -1,27 +0,0 @@
1
- from arekit.common.experiment.api.ops_doc import DocumentOperations
2
- from arekit.contrib.source.nerel.reader import NerelDocReader
3
- from arekit.contrib.source.nerel.versions import NerelVersions
4
-
5
-
6
- class NERELDocOperation(DocumentOperations):
7
- """ A Russian dataset with nested named entities, relations, events and linked entities.
8
- https://github.com/nerel-ds/NEREL
9
- """
10
-
11
- def __init__(self, filename_by_id, version):
12
- """ filename_by_id: dict
13
- Dictionary of {id: filename}, where
14
- - id: int
15
- - filename: str
16
- version: NerelVersions
17
- Specify the appropriate version of the NEREL collection.
18
- """
19
- assert(isinstance(filename_by_id, dict))
20
- assert(isinstance(version, NerelVersions))
21
- super(NERELDocOperation, self).__init__()
22
- self.__filename_by_id = filename_by_id
23
- self.__version = version
24
- self.__doc_reader = NerelDocReader(version)
25
-
26
- def by_id(self, doc_id):
27
- return self.__doc_reader.read_document(doc_id=doc_id, filename=self.__filename_by_id[doc_id])
@@ -1,59 +0,0 @@
1
- from arekit.common.experiment.api.ops_doc import DocumentOperations
2
- from arekit.common.experiment.data_type import DataType
3
- from arekit.contrib.source.nerel.io_utils import NerelIOUtils
4
- from arekit.contrib.source.nerel.versions import NerelVersions
5
- from arekit.contrib.utils.pipelines.sources.nerel.doc_ops import NERELDocOperation
6
- from arekit.contrib.utils.pipelines.sources.nerel.labels_fmt import NerelAnyLabelFormatter
7
- from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
8
- from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
9
- from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
10
- from arekit.contrib.utils.pipelines.text_opinion.filters.entity_based import EntityBasedTextOpinionFilter
11
-
12
-
13
- def create_text_relation_extraction_pipeline(nerel_version,
14
- text_parser,
15
- label_formatter=NerelAnyLabelFormatter(),
16
- terms_per_context=50,
17
- doc_ops=None,
18
- docs_limit=None,
19
- entity_filter=None):
20
- assert(isinstance(nerel_version, NerelVersions))
21
- assert(isinstance(doc_ops, DocumentOperations) or doc_ops is None)
22
-
23
- data_folding = None
24
-
25
- if doc_ops is None:
26
- # Default Initialization.
27
- filenames_by_ids, data_folding = NerelIOUtils.read_dataset_split(version=nerel_version,
28
- docs_limit=docs_limit)
29
- doc_ops = NERELDocOperation(filename_by_id=filenames_by_ids,
30
- version=nerel_version)
31
-
32
- text_opinion_filters = [
33
- EntityBasedTextOpinionFilter(entity_filter=entity_filter),
34
- DistanceLimitedTextOpinionFilter(terms_per_context)
35
- ]
36
-
37
- predefined_annot = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)
38
-
39
- pipelines = {
40
- DataType.Train: text_opinion_extraction_pipeline(text_parser=text_parser,
41
- get_doc_by_id_func=doc_ops.by_id,
42
- annotators=[predefined_annot],
43
- text_opinion_filters=text_opinion_filters),
44
- DataType.Test: text_opinion_extraction_pipeline(text_parser=text_parser,
45
- get_doc_by_id_func=doc_ops.by_id,
46
- annotators=[predefined_annot],
47
- text_opinion_filters=text_opinion_filters),
48
- DataType.Dev: text_opinion_extraction_pipeline(text_parser=text_parser,
49
- get_doc_by_id_func=doc_ops.by_id,
50
- annotators=[predefined_annot],
51
- text_opinion_filters=text_opinion_filters),
52
- }
53
-
54
- # When the default data-folding was set up above (i.e. no doc_ops was passed),
55
- # return it together with the pipelines, since it is needed further on.
56
- if data_folding is not None:
57
- return pipelines, data_folding
58
-
59
- return pipelines
@@ -1,60 +0,0 @@
1
- from arekit.common.labels.str_fmt import StringLabelsFormatter
2
- from arekit.contrib.source.nerel import labels
3
-
4
-
5
- class NerelAnyLabelFormatter(StringLabelsFormatter):
6
-
7
- def __init__(self):
8
-
9
- stol = {
10
- "OPINION_BELONGS_TO": labels.OpinionBelongsTo,
11
- "OPINION_RELATES_TO": labels.OpinionRelatesTo,
12
- "NEG_EFFECT_FROM": labels.NegEffectFrom,
13
- "POS_EFFECT_FROM": labels.PosEffectFrom,
14
- "NEG_STATE_FROM": labels.NegStateFrom,
15
- "POS_STATE_FROM": labels.PosStateFrom,
16
- "NEGATIVE_TO": labels.NegativeTo,
17
- "POSITIVE_TO": labels.PositiveTo,
18
- "STATE_BELONGS_TO": labels.STATE_BELONGS_TO,
19
- "POS_AUTHOR_FROM": labels.PosAuthorFrom,
20
- "NEG_AUTHOR_FROM": labels.NegAuthorFrom,
21
- "ALTERNATIVE_NAME": labels.ALTERNATIVE_NAME,
22
- "ORIGINS_FROM": labels.ORIGINS_FROM,
23
- "START_TIME": labels.START_TIME,
24
- "OWNER_OF": labels.OWNER_OF,
25
- "SUBEVENT_OF": labels.SUBEVENT_OF,
26
- "PARENT_OF": labels.PARENT_OF,
27
- "SUBORDINATE_OF": labels.SUBORDINATE_OF,
28
- "PART_OF": labels.PART_OF,
29
- "TAKES_PLACE_IN": labels.TAKES_PLACE_IN,
30
- "PARTICIPANT_IN": labels.PARTICIPANT_IN,
31
- "WORKPLACE": labels.WORKPLACE,
32
- "PENALIZED_AS": labels.PENALIZED_AS,
33
- "WORKS_AS": labels.WORKS_AS,
34
- "PLACE_OF_DEATH": labels.PLACE_OF_DEATH,
35
- "PLACE_OF_BIRTH": labels.PLACE_OF_BIRTH,
36
- "HAS_CAUSE": labels.HAS_CAUSE,
37
- "AWARDED_WITH": labels.AWARDED_WITH,
38
- "CAUSE_OF_DEATH": labels.CAUSE_OF_DEATH,
39
- "CONVICTED_OF": labels.CONVICTED_OF,
40
- "DATE_DEFUNCT_IN": labels.DATE_DEFUNCT_IN,
41
- "DATE_FOUNDED_IN": labels.DATE_FOUNDED_IN,
42
- "DATE_OF_BIRTH": labels.DATE_OF_BIRTH,
43
- "DATE_OF_CREATION": labels.DATE_OF_CREATION,
44
- "DATE_OF_DEATH": labels.DATE_OF_DEATH,
45
- "END_TIME": labels.END_TIME,
46
- "EXPENDITURE": labels.EXPENDITURE,
47
- "FOUNDED_BY": labels.FOUNDED_BY,
48
- "KNOWS": labels.KNOWS,
49
- "RELATIVE": labels.RELATIVE,
50
- "LOCATED_IN": labels.LOCATED_IN,
51
- "RELIGION_OF": labels.RELIGION_OF,
52
- "MEDICAL_CONDITION": labels.MEDICAL_CONDITION,
53
- "SCHOOLS_ATTENDED": labels.SCHOOLS_ATTENDED,
54
- "MEMBER_OF": labels.MEMBER_OF,
55
- "SIBLING": labels.SIBLING,
56
- "ORGANIZES": labels.ORGANIZES,
57
- "SPOUSE": labels.SPOUSE
58
- }
59
-
60
- super(NerelAnyLabelFormatter, self).__init__(stol=stol)
@@ -1,29 +0,0 @@
1
- from arekit.common.experiment.api.ops_doc import DocumentOperations
2
- from arekit.contrib.source.nerelbio.reader import NerelBioDocReader
3
- from arekit.contrib.source.nerelbio.versions import NerelBioVersions
4
-
5
-
6
- class NERELBioDocOperation(DocumentOperations):
7
- """ NEREL-BIO extends the general domain dataset NEREL.
8
- NEREL-BIO annotation scheme covers both general and biomedical
9
- domains making it suitable for domain transfer experiments.
10
- https://github.com/nerel-ds/NEREL-BIO
11
- """
12
-
13
- def __init__(self, filename_by_id, version):
14
- """ filename_by_id: dict
15
- Dictionary of {id: filename}, where
16
- - id: int
17
- - filename: str
18
- version: NerelBioVersions
19
- Specify the appropriate version of the NEREL-BIO collection.
20
- """
21
- assert(isinstance(filename_by_id, dict))
22
- assert(isinstance(version, NerelBioVersions))
23
- super(NERELBioDocOperation, self).__init__()
24
- self.__filename_by_id = filename_by_id
25
- self.__version = version
26
- self.__doc_reader = NerelBioDocReader(version)
27
-
28
- def by_id(self, doc_id):
29
- return self.__doc_reader.read_document(doc_id=doc_id, filename=self.__filename_by_id[doc_id])
@@ -1,59 +0,0 @@
1
- from arekit.common.experiment.api.ops_doc import DocumentOperations
2
- from arekit.common.experiment.data_type import DataType
3
- from arekit.contrib.source.nerelbio.io_utils import NerelBioIOUtils
4
- from arekit.contrib.source.nerelbio.versions import NerelBioVersions
5
- from arekit.contrib.utils.pipelines.sources.nerel_bio.doc_ops import NERELBioDocOperation
6
- from arekit.contrib.utils.pipelines.sources.nerel_bio.labels_fmt import NerelBioAnyLabelFormatter
7
- from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
8
- from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
9
- from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
10
- from arekit.contrib.utils.pipelines.text_opinion.filters.entity_based import EntityBasedTextOpinionFilter
11
-
12
-
13
- def create_text_relation_extraction_pipeline(nerel_bio_version,
14
- text_parser,
15
- label_formatter=NerelBioAnyLabelFormatter(),
16
- terms_per_context=50,
17
- doc_ops=None,
18
- docs_limit=None,
19
- entity_filter=None):
20
- assert(isinstance(nerel_bio_version, NerelBioVersions))
21
- assert(isinstance(doc_ops, DocumentOperations) or doc_ops is None)
22
-
23
- data_folding = None
24
-
25
- if doc_ops is None:
26
- # Default Initialization.
27
- filenames_by_ids, data_folding = NerelBioIOUtils.read_dataset_split(version=nerel_bio_version,
28
- docs_limit=docs_limit)
29
- doc_ops = NERELBioDocOperation(filename_by_id=filenames_by_ids,
30
- version=nerel_bio_version)
31
-
32
- text_opinion_filters = [
33
- EntityBasedTextOpinionFilter(entity_filter=entity_filter),
34
- DistanceLimitedTextOpinionFilter(terms_per_context)
35
- ]
36
-
37
- predefined_annot = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)
38
-
39
- pipelines = {
40
- DataType.Train: text_opinion_extraction_pipeline(text_parser=text_parser,
41
- get_doc_by_id_func=doc_ops.by_id,
42
- annotators=[predefined_annot],
43
- text_opinion_filters=text_opinion_filters),
44
- DataType.Test: text_opinion_extraction_pipeline(text_parser=text_parser,
45
- get_doc_by_id_func=doc_ops.by_id,
46
- annotators=[predefined_annot],
47
- text_opinion_filters=text_opinion_filters),
48
- DataType.Dev: text_opinion_extraction_pipeline(text_parser=text_parser,
49
- get_doc_by_id_func=doc_ops.by_id,
50
- annotators=[predefined_annot],
51
- text_opinion_filters=text_opinion_filters),
52
- }
53
-
54
- # When the default data-folding was set up above (i.e. no doc_ops was passed),
55
- # return it together with the pipelines, since it is needed further on.
56
- if data_folding is not None:
57
- return pipelines, data_folding
58
-
59
- return pipelines