arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267)
  1. arekit/common/context/terms_mapper.py +2 -2
  2. arekit/common/data/const.py +5 -4
  3. arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
  4. arekit/common/data/input/providers/columns/sample.py +6 -1
  5. arekit/common/data/input/providers/instances/base.py +1 -1
  6. arekit/common/data/input/providers/rows/base.py +36 -13
  7. arekit/common/data/input/providers/rows/samples.py +57 -55
  8. arekit/common/data/input/providers/sample/cropped.py +2 -2
  9. arekit/common/data/input/sample.py +1 -1
  10. arekit/common/data/rows_fmt.py +82 -0
  11. arekit/common/data/rows_parser.py +43 -0
  12. arekit/common/data/storages/base.py +23 -18
  13. arekit/common/data/views/samples.py +2 -8
  14. arekit/common/{news → docs}/base.py +2 -2
  15. arekit/common/{news → docs}/entities_grouping.py +2 -1
  16. arekit/common/{news → docs}/entity.py +2 -1
  17. arekit/common/{news → docs}/parsed/base.py +5 -5
  18. arekit/common/docs/parsed/providers/base.py +68 -0
  19. arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
  20. arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
  21. arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
  22. arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
  23. arekit/common/docs/parsed/service.py +31 -0
  24. arekit/common/docs/parser.py +66 -0
  25. arekit/common/{news → docs}/sentence.py +1 -1
  26. arekit/common/entities/base.py +11 -2
  27. arekit/common/experiment/api/base_samples_io.py +1 -1
  28. arekit/common/frames/variants/collection.py +2 -2
  29. arekit/common/linkage/base.py +2 -2
  30. arekit/common/linkage/meta.py +23 -0
  31. arekit/common/linkage/opinions.py +1 -1
  32. arekit/common/linkage/text_opinions.py +2 -2
  33. arekit/common/opinions/annot/algo/base.py +1 -1
  34. arekit/common/opinions/annot/algo/pair_based.py +15 -13
  35. arekit/common/opinions/annot/algo/predefined.py +4 -4
  36. arekit/common/opinions/annot/algo_based.py +5 -5
  37. arekit/common/opinions/annot/base.py +3 -3
  38. arekit/common/opinions/base.py +7 -7
  39. arekit/common/opinions/collection.py +3 -3
  40. arekit/common/pipeline/base.py +12 -16
  41. arekit/common/pipeline/batching.py +28 -0
  42. arekit/common/pipeline/context.py +5 -1
  43. arekit/common/pipeline/items/base.py +38 -1
  44. arekit/common/pipeline/items/flatten.py +5 -1
  45. arekit/common/pipeline/items/handle.py +2 -1
  46. arekit/common/pipeline/items/iter.py +2 -1
  47. arekit/common/pipeline/items/map.py +2 -1
  48. arekit/common/pipeline/items/map_nested.py +4 -0
  49. arekit/common/pipeline/utils.py +32 -0
  50. arekit/common/service/sqlite.py +36 -0
  51. arekit/common/synonyms/base.py +2 -2
  52. arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
  53. arekit/common/text_opinions/base.py +11 -11
  54. arekit/common/utils.py +33 -46
  55. arekit/contrib/networks/embedding.py +3 -3
  56. arekit/contrib/networks/embedding_io.py +5 -5
  57. arekit/contrib/networks/input/const.py +0 -2
  58. arekit/contrib/networks/input/providers/sample.py +15 -29
  59. arekit/contrib/networks/input/rows_parser.py +47 -134
  60. arekit/contrib/prompt/sample.py +18 -16
  61. arekit/contrib/utils/data/contents/opinions.py +17 -5
  62. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  63. arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
  64. arekit/contrib/utils/data/readers/base.py +3 -0
  65. arekit/contrib/utils/data/readers/csv_pd.py +10 -4
  66. arekit/contrib/utils/data/readers/jsonl.py +3 -0
  67. arekit/contrib/utils/data/readers/sqlite.py +14 -0
  68. arekit/contrib/utils/data/service/balance.py +0 -1
  69. arekit/contrib/utils/data/storages/pandas_based.py +3 -5
  70. arekit/contrib/utils/data/storages/row_cache.py +18 -6
  71. arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
  72. arekit/contrib/utils/data/writers/base.py +5 -0
  73. arekit/contrib/utils/data/writers/csv_native.py +3 -0
  74. arekit/contrib/utils/data/writers/csv_pd.py +3 -0
  75. arekit/contrib/utils/data/writers/json_opennre.py +31 -13
  76. arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
  77. arekit/contrib/utils/io_utils/embedding.py +25 -33
  78. arekit/contrib/utils/io_utils/utils.py +3 -24
  79. arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
  80. arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
  81. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  82. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  83. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
  84. arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
  85. arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
  86. arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
  87. arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
  88. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
  89. arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
  90. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
  91. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
  92. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
  93. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
  94. arekit/contrib/utils/serializer.py +4 -23
  95. arekit-0.25.0.data/data/logo.png +0 -0
  96. arekit-0.25.0.dist-info/METADATA +82 -0
  97. arekit-0.25.0.dist-info/RECORD +259 -0
  98. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
  99. arekit/common/data/row_ids/base.py +0 -79
  100. arekit/common/data/row_ids/binary.py +0 -38
  101. arekit/common/data/row_ids/multiple.py +0 -14
  102. arekit/common/folding/base.py +0 -36
  103. arekit/common/folding/fixed.py +0 -42
  104. arekit/common/folding/nofold.py +0 -15
  105. arekit/common/folding/united.py +0 -46
  106. arekit/common/news/objects_parser.py +0 -37
  107. arekit/common/news/parsed/providers/base.py +0 -48
  108. arekit/common/news/parsed/service.py +0 -31
  109. arekit/common/news/parser.py +0 -34
  110. arekit/common/text/parser.py +0 -12
  111. arekit/common/text/partitioning/__init__.py +0 -0
  112. arekit/common/text/partitioning/base.py +0 -4
  113. arekit/common/text/partitioning/terms.py +0 -35
  114. arekit/contrib/source/__init__.py +0 -0
  115. arekit/contrib/source/brat/__init__.py +0 -0
  116. arekit/contrib/source/brat/annot.py +0 -83
  117. arekit/contrib/source/brat/entities/__init__.py +0 -0
  118. arekit/contrib/source/brat/entities/compound.py +0 -33
  119. arekit/contrib/source/brat/entities/entity.py +0 -42
  120. arekit/contrib/source/brat/entities/parser.py +0 -53
  121. arekit/contrib/source/brat/news.py +0 -28
  122. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  123. arekit/contrib/source/brat/opinions/converter.py +0 -19
  124. arekit/contrib/source/brat/relation.py +0 -32
  125. arekit/contrib/source/brat/sentence.py +0 -69
  126. arekit/contrib/source/brat/sentences_reader.py +0 -128
  127. arekit/contrib/source/download.py +0 -41
  128. arekit/contrib/source/nerel/__init__.py +0 -0
  129. arekit/contrib/source/nerel/entities.py +0 -55
  130. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  131. arekit/contrib/source/nerel/folding/fixed.py +0 -75
  132. arekit/contrib/source/nerel/io_utils.py +0 -62
  133. arekit/contrib/source/nerel/labels.py +0 -241
  134. arekit/contrib/source/nerel/reader.py +0 -46
  135. arekit/contrib/source/nerel/utils.py +0 -24
  136. arekit/contrib/source/nerel/versions.py +0 -12
  137. arekit/contrib/source/nerelbio/__init__.py +0 -0
  138. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  139. arekit/contrib/source/nerelbio/labels.py +0 -265
  140. arekit/contrib/source/nerelbio/reader.py +0 -8
  141. arekit/contrib/source/nerelbio/versions.py +0 -8
  142. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  143. arekit/contrib/source/ruattitudes/collection.py +0 -36
  144. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  145. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  146. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  147. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  148. arekit/contrib/source/ruattitudes/news.py +0 -51
  149. arekit/contrib/source/ruattitudes/news_brat.py +0 -44
  150. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  151. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  152. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  153. arekit/contrib/source/ruattitudes/reader.py +0 -268
  154. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  155. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  156. arekit/contrib/source/ruattitudes/text_object.py +0 -57
  157. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  158. arekit/contrib/source/rusentiframes/collection.py +0 -157
  159. arekit/contrib/source/rusentiframes/effect.py +0 -24
  160. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  161. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  162. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  163. arekit/contrib/source/rusentiframes/role.py +0 -15
  164. arekit/contrib/source/rusentiframes/state.py +0 -24
  165. arekit/contrib/source/rusentiframes/types.py +0 -42
  166. arekit/contrib/source/rusentiframes/value.py +0 -2
  167. arekit/contrib/source/rusentrel/__init__.py +0 -0
  168. arekit/contrib/source/rusentrel/const.py +0 -3
  169. arekit/contrib/source/rusentrel/entities.py +0 -26
  170. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  171. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  172. arekit/contrib/source/rusentrel/news_reader.py +0 -51
  173. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  174. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  175. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  176. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  177. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  178. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  179. arekit/contrib/source/sentinerel/__init__.py +0 -0
  180. arekit/contrib/source/sentinerel/entities.py +0 -52
  181. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  182. arekit/contrib/source/sentinerel/folding/factory.py +0 -32
  183. arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
  184. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  185. arekit/contrib/source/sentinerel/labels.py +0 -53
  186. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  187. arekit/contrib/source/sentinerel/reader.py +0 -42
  188. arekit/contrib/source/synonyms/__init__.py +0 -0
  189. arekit/contrib/source/synonyms/utils.py +0 -19
  190. arekit/contrib/source/zip_utils.py +0 -47
  191. arekit/contrib/utils/bert/rows.py +0 -0
  192. arekit/contrib/utils/bert/text_b_rus.py +0 -18
  193. arekit/contrib/utils/connotations/__init__.py +0 -0
  194. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  195. arekit/contrib/utils/cv/__init__.py +0 -0
  196. arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
  197. arekit/contrib/utils/cv/doc_stat/base.py +0 -37
  198. arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
  199. arekit/contrib/utils/cv/splitters/__init__.py +0 -0
  200. arekit/contrib/utils/cv/splitters/base.py +0 -4
  201. arekit/contrib/utils/cv/splitters/default.py +0 -53
  202. arekit/contrib/utils/cv/splitters/statistical.py +0 -57
  203. arekit/contrib/utils/cv/two_class.py +0 -77
  204. arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
  205. arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
  206. arekit/contrib/utils/data/ext.py +0 -31
  207. arekit/contrib/utils/data/views/__init__.py +0 -0
  208. arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
  209. arekit/contrib/utils/data/views/linkages/base.py +0 -58
  210. arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
  211. arekit/contrib/utils/data/views/linkages/utils.py +0 -24
  212. arekit/contrib/utils/data/views/opinions.py +0 -14
  213. arekit/contrib/utils/download.py +0 -78
  214. arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
  215. arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
  216. arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
  217. arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
  218. arekit/contrib/utils/io_utils/opinions.py +0 -39
  219. arekit/contrib/utils/io_utils/samples.py +0 -78
  220. arekit/contrib/utils/lexicons/__init__.py +0 -0
  221. arekit/contrib/utils/lexicons/lexicon.py +0 -43
  222. arekit/contrib/utils/lexicons/relation.py +0 -45
  223. arekit/contrib/utils/lexicons/rusentilex.py +0 -34
  224. arekit/contrib/utils/nn/__init__.py +0 -0
  225. arekit/contrib/utils/nn/rows.py +0 -83
  226. arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
  227. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  228. arekit/contrib/utils/pipelines/items/to_output.py +0 -101
  229. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  230. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  231. arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
  232. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
  233. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  234. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  235. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
  236. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
  237. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  238. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  239. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
  240. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
  241. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
  242. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  243. arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
  244. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
  245. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  246. arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
  247. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  248. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
  249. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  250. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  251. arekit/contrib/utils/resources.py +0 -26
  252. arekit/contrib/utils/sources/__init__.py +0 -0
  253. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  254. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  255. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  256. arekit/contrib/utils/utils_folding.py +0 -19
  257. arekit/download_data.py +0 -11
  258. arekit-0.23.1.dist-info/METADATA +0 -23
  259. arekit-0.23.1.dist-info/RECORD +0 -403
  260. /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
  261. /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
  262. /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
  263. /arekit/common/{news → docs}/parsed/term_position.py +0 -0
  264. /arekit/common/{news/parsed → service}/__init__.py +0 -0
  265. /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
  266. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
  267. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
1
- import collections
1
+ from collections.abc import Iterable
2
2
 
3
3
  from arekit.common.context.token import Token
4
4
  from arekit.common.entities.base import Entity
@@ -10,7 +10,7 @@ class TextTermsMapper(object):
10
10
  def iter_mapped(self, terms):
11
11
  """ Performs mapping operation of each terms in a sequence
12
12
  """
13
- assert(isinstance(terms, collections.Iterable))
13
+ assert(isinstance(terms, Iterable))
14
14
 
15
15
  self._before_mapping()
16
16
 
@@ -1,11 +1,12 @@
1
1
  ID = 'id'
2
2
  DOC_ID = 'doc_id'
3
3
  TEXT = 'text_a'
4
- LABEL = 'label'
4
+ LABEL_UINT = 'label_uint'
5
+ LABEL_STR = 'label_str'
5
6
 
6
- # Corresponds to fields with attitude ends. (values, STRING)
7
- SOURCE = 'source'
8
- TARGET = 'target'
7
+ # Global identifier of the opinion in the sampled data.
8
+ OPINION_ID = "opinion_id"
9
+ OPINION_LINKAGE_ID = "linkage_id"
9
10
 
10
11
  # Corresponds to fields with attitude ends. (indices, INT)
11
12
  S_IND = 's_ind'
@@ -1,4 +1,4 @@
1
- class DocumentOperations(object):
1
+ class DocumentProvider(object):
2
2
  """ Provides operations with documents
3
3
  """
4
4
 
@@ -37,7 +37,8 @@ class SampleColumnsProvider(BaseColumnsProvider):
37
37
 
38
38
  # insert labels
39
39
  if self.__store_labels:
40
- dtypes_list.append((const.LABEL, 'int32'))
40
+ dtypes_list.append((const.LABEL_UINT, 'int32'))
41
+ dtypes_list.append((const.LABEL_STR, str))
41
42
 
42
43
  # insert text columns
43
44
  for col_name in self.__text_column_names:
@@ -47,6 +48,10 @@ class SampleColumnsProvider(BaseColumnsProvider):
47
48
  dtypes_list.append((const.S_IND, 'int32'))
48
49
  dtypes_list.append((const.T_IND, 'int32'))
49
50
 
51
+ # opinion-extraction task related fields
52
+ dtypes_list.append((const.OPINION_ID, 'int32'))
53
+ dtypes_list.append((const.OPINION_LINKAGE_ID, 'int32'))
54
+
50
55
  return dtypes_list
51
56
 
52
57
  def set_text_column_names(self, text_column_names):
@@ -11,4 +11,4 @@ class BaseLinkedDataInstancesProvider(object):
11
11
  """ Implementation based on the first element of the linkage.
12
12
  """
13
13
  assert(isinstance(linked_data, LinkedDataWrapper))
14
- return linked_data.First.Sentiment
14
+ return linked_data.First.Label
@@ -1,10 +1,11 @@
1
- import collections
1
+ from collections import Counter
2
+ from collections.abc import Iterable
2
3
  import logging
3
4
 
4
5
  from arekit.common.data.input.providers.contents import ContentsProvider
5
6
  from arekit.common.linkage.base import LinkedDataWrapper
6
- from arekit.common.news.parsed.providers.entity_service import EntityServiceProvider
7
- from arekit.common.news.parsed.service import ParsedNewsService
7
+ from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider
8
+ from arekit.common.linkage.meta import MetaEmptyLinkedDataWrapper
8
9
 
9
10
  logger = logging.getLogger(__name__)
10
11
 
@@ -13,29 +14,51 @@ class BaseRowProvider(object):
13
14
  """ Base provider for rows that suppose to be filled into BaseRowsStorage.
14
15
  """
15
16
 
17
+ def __init__(self):
18
+ self.__rows_counter = None
19
+
16
20
  # region protected methods
17
21
 
18
22
  # TODO. This might be also generalized.
19
23
  # TODO. Idle-mode is also a implementation and task specific parameter, i.e. might be removed from here.
20
- def _provide_rows(self, parsed_news, entity_service, text_opinion_linkage, idle_mode):
24
+ def _provide_rows(self, parsed_doc, entity_service, text_opinion_linkage, idle_mode):
21
25
  raise NotImplementedError()
22
26
 
27
+ def _count_row(self):
28
+ index = self.__rows_counter["rows_iterated"]
29
+ self.__rows_counter["rows_iterated"] += 1
30
+ return index
31
+
23
32
  # endregion
24
33
 
34
+ def __iter_rows(self, linked_data, idle_mode):
35
+ parsed_doc_service = linked_data.Tag
36
+ return self._provide_rows(parsed_doc=parsed_doc_service.ParsedDocument,
37
+ entity_service=parsed_doc_service.get_provider(EntityServiceProvider.NAME),
38
+ text_opinion_linkage=linked_data,
39
+ idle_mode=idle_mode)
40
+
25
41
  def iter_by_rows(self, contents_provider, doc_ids_iter, idle_mode):
26
42
  assert(isinstance(contents_provider, ContentsProvider))
27
- assert(isinstance(doc_ids_iter, collections.Iterable))
43
+ assert(isinstance(doc_ids_iter, Iterable))
44
+
45
+ self.__rows_counter = Counter()
28
46
 
29
47
  for linked_data in contents_provider.from_doc_ids(doc_ids=doc_ids_iter, idle_mode=idle_mode):
30
48
  assert(isinstance(linked_data, LinkedDataWrapper))
31
- assert(isinstance(linked_data.Tag, ParsedNewsService))
32
49
 
33
- parsed_news_service = linked_data.Tag
50
+ if isinstance(linked_data, MetaEmptyLinkedDataWrapper):
51
+ if idle_mode:
52
+ # In the case of the IDLE mode we do not consider the meta-data.
53
+ data_it = []
54
+ else:
55
+ # Consider the actual linked data instance.
56
+ data_it = [linked_data]
57
+ else:
58
+ # Consider the actual rows of the related linked data.
59
+ data_it = self.__iter_rows(linked_data=linked_data, idle_mode=idle_mode)
34
60
 
35
- rows_it = self._provide_rows(parsed_news=parsed_news_service.ParsedNews,
36
- entity_service=parsed_news_service.get_provider(EntityServiceProvider.NAME),
37
- text_opinion_linkage=linked_data,
38
- idle_mode=idle_mode)
61
+ for data in data_it:
62
+ yield linked_data.RelatedDocID, data
39
63
 
40
- for row in rows_it:
41
- yield linked_data.RelatedDocID, row
64
+ self.__rows_counter = None
@@ -8,15 +8,14 @@ from arekit.common.data.input.providers.label.binary import BinaryLabelProvider
8
8
  from arekit.common.data.input.providers.label.multiple import MultipleLabelProvider
9
9
  from arekit.common.data.input.providers.rows.base import BaseRowProvider
10
10
  from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
11
- from arekit.common.data.row_ids.binary import BinaryIDProvider
12
- from arekit.common.data.row_ids.multiple import MultipleIDProvider
11
+ from arekit.common.data.rows_fmt import create_base_column_fmt
13
12
  from arekit.common.entities.base import Entity
14
13
  from arekit.common.labels.base import Label
15
14
 
16
15
  from arekit.common.linkage.text_opinions import TextOpinionsLinkage
17
- from arekit.common.news.parsed.base import ParsedNews
18
- from arekit.common.news.parsed.providers.entity_service import EntityEndType, EntityServiceProvider
19
- from arekit.common.news.parsed.term_position import TermPositionTypes
16
+ from arekit.common.docs.parsed.base import ParsedDocument
17
+ from arekit.common.docs.parsed.providers.entity_service import EntityEndType, EntityServiceProvider
18
+ from arekit.common.docs.parsed.term_position import TermPositionTypes
20
19
  from arekit.common.text_opinions.base import TextOpinion
21
20
 
22
21
 
@@ -34,9 +33,9 @@ class BaseSampleRowProvider(BaseRowProvider):
34
33
 
35
34
  self._label_provider = label_provider
36
35
  self.__text_provider = text_provider
37
- self.__row_ids_provider = self.__create_row_ids_provider(label_provider)
38
36
  self.__instances_provider = self.__create_instances_provider(label_provider)
39
37
  self.__store_labels = None
38
+ self._val_fmt = create_base_column_fmt(fmt_type="writer")
40
39
 
41
40
  # region properties
42
41
 
@@ -52,56 +51,67 @@ class BaseSampleRowProvider(BaseRowProvider):
52
51
 
53
52
  # region protected methods
54
53
 
55
- def _provide_sentence_terms(self, parsed_news, sentence_ind, s_ind, t_ind):
56
- terms_iter = parsed_news.iter_sentence_terms(sentence_index=sentence_ind, return_id=False)
54
+ def _provide_sentence_terms(self, parsed_doc, sentence_ind, s_ind, t_ind):
55
+ terms_iter = parsed_doc.iter_sentence_terms(sentence_index=sentence_ind, return_id=False)
57
56
  return list(terms_iter), s_ind, t_ind
58
57
 
59
58
  # TODO. This is a very task-specific description, too many data provided.
60
59
  # TODO. Switch this API to dict of params
61
60
  def _fill_row_core(self, row, text_opinion_linkage, index_in_linked, etalon_label,
62
- parsed_news, sentence_ind, s_ind, t_ind):
61
+ parsed_doc, sentence_ind, s_ind, t_ind):
63
62
  assert(isinstance(self.__store_labels, bool))
64
63
 
65
- def __assign_value(column, value):
66
- row[column] = value
67
-
68
- row[const.ID] = self.__row_ids_provider.create_sample_id(
69
- linked_opinions=text_opinion_linkage,
70
- index_in_linked=index_in_linked,
71
- label_scaler=self._label_provider.LabelScaler)
64
+ sentence_terms, actual_s_ind, actual_t_ind = self._provide_sentence_terms(
65
+ parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
72
66
 
73
- row[const.DOC_ID] = text_opinion_linkage.First.DocID
67
+ # Entity indices from the related context.
68
+ entities = list(filter(lambda term: isinstance(term, Entity), sentence_terms))
74
69
 
75
- row[const.SENT_IND] = sentence_ind
70
+ # Values mapping.
71
+ vm = {
72
+ const.ID: self._count_row(),
73
+ const.OPINION_ID: text_opinion_linkage.First.TextOpinionID,
74
+ const.OPINION_LINKAGE_ID: index_in_linked,
75
+ const.DOC_ID: text_opinion_linkage.First.DocID,
76
+ const.SENT_IND: sentence_ind,
77
+ const.ENTITY_VALUES: entities,
78
+ const.ENTITY_TYPES: entities,
79
+ const.ENTITIES: [str(i) for i, t in enumerate(sentence_terms) if isinstance(t, Entity)],
80
+ const.S_IND: actual_s_ind,
81
+ const.T_IND: actual_t_ind,
82
+ const.LABEL_UINT: None,
83
+ const.LABEL_STR: None
84
+ }
85
+
86
+ # Compose text value.
87
+ def __assign_value(column, value):
88
+ vm[column] = value
76
89
 
77
90
  expected_label = text_opinion_linkage.get_linked_label()
78
91
 
79
- if self.__store_labels:
80
- row[const.LABEL] = self._label_provider.calculate_output_uint_label(
81
- expected_uint_label=self._label_provider.LabelScaler.label_to_uint(expected_label),
82
- etalon_uint_label=self._label_provider.LabelScaler.label_to_uint(etalon_label))
83
-
84
- sentence_terms, actual_s_ind, actual_t_ind = self._provide_sentence_terms(
85
- parsed_news=parsed_news, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
86
-
87
92
  self.__text_provider.add_text_in_row(
88
- set_text_func=lambda column, value: __assign_value(column, value),
89
- sentence_terms=sentence_terms,
90
- s_ind=actual_s_ind,
91
- t_ind=actual_t_ind,
93
+ set_text_func=__assign_value, sentence_terms=sentence_terms,
94
+ s_ind=actual_s_ind, t_ind=actual_t_ind,
92
95
  expected_label=expected_label)
93
96
 
94
- # Entity indicies from the related context.
95
- entities = list(filter(lambda term: isinstance(term, Entity), sentence_terms))
96
- entity_inds = [str(i) for i, t in enumerate(sentence_terms) if isinstance(t, Entity)]
97
- row[const.ENTITY_VALUES] = ",".join([e.DisplayValue.replace(',', '') for e in entities])
98
- row[const.ENTITY_TYPES] = ",".join([e.Type.replace(',', '') for e in entities])
99
- row[const.ENTITIES] = ",".join(entity_inds)
97
+ if self.__store_labels:
98
+ l2i = self._label_provider.LabelScaler.label_to_uint
99
+ ui2l = self._label_provider.LabelScaler.uint_to_label
100
+ uint_label = self._label_provider.calculate_output_uint_label(
101
+ expected_uint_label=l2i(expected_label), etalon_uint_label=l2i(etalon_label))
102
+ vm[const.LABEL_UINT] = uint_label
103
+ vm[const.LABEL_STR] = type(ui2l(uint_label)).__name__
100
104
 
101
- row[const.S_IND] = actual_s_ind
102
- row[const.T_IND] = actual_t_ind
105
+ self._apply_row_data(row=row, vm=vm, val_fmt=self._val_fmt)
106
+
107
+ @staticmethod
108
+ def _apply_row_data(row, vm, val_fmt):
109
+ for k, v in vm.items():
110
+ if v is None:
111
+ continue
112
+ row[k] = v if k not in val_fmt else val_fmt[k](v)
103
113
 
104
- def _provide_rows(self, parsed_news, entity_service, text_opinion_linkage, idle_mode):
114
+ def _provide_rows(self, parsed_doc, entity_service, text_opinion_linkage, idle_mode):
105
115
  assert(isinstance(idle_mode, bool))
106
116
 
107
117
  row_dict = OrderedDict()
@@ -109,7 +119,7 @@ class BaseSampleRowProvider(BaseRowProvider):
109
119
  for index_in_linked in range(len(text_opinion_linkage)):
110
120
 
111
121
  rows_it = self.__provide_rows(
112
- parsed_news=parsed_news,
122
+ parsed_doc=parsed_doc,
113
123
  entity_service=entity_service,
114
124
  row_dict=row_dict,
115
125
  text_opinion_linkage=text_opinion_linkage,
@@ -123,36 +133,28 @@ class BaseSampleRowProvider(BaseRowProvider):
123
133
 
124
134
  # region private methods
125
135
 
126
- @staticmethod
127
- def __create_row_ids_provider(label_provider):
128
- # TODO. #376 related. This should be removed after refactoring, because
129
- # TODO. we consider an ordinary IDs, that not based on the other data.
130
- if isinstance(label_provider, BinaryLabelProvider):
131
- return BinaryIDProvider()
132
- if isinstance(label_provider, MultipleLabelProvider):
133
- return MultipleIDProvider()
134
-
135
136
  @staticmethod
136
137
  def __create_instances_provider(label_provider):
137
- # TODO. #473 related: thiese label providers are based on text opinion extraction task!
138
+ # TODO. #473 related: these label providers are based on text opinion extraction task!
138
139
  if isinstance(label_provider, BinaryLabelProvider):
139
140
  return MultipleInstancesLinkedTextOpinionsProvider(label_provider.SupportedLabels)
140
141
  if isinstance(label_provider, MultipleLabelProvider):
141
142
  return SingleInstanceLinkedDataProvider()
142
143
 
143
- def __provide_rows(self, row_dict, parsed_news, entity_service,
144
+ def __provide_rows(self, row_dict, parsed_doc, entity_service,
144
145
  text_opinion_linkage, index_in_linked, idle_mode):
145
146
  """
146
147
  Providing Rows depending on row_id_formatter type
147
148
  """
148
- assert(isinstance(parsed_news, ParsedNews))
149
+ assert(isinstance(parsed_doc, ParsedDocument))
149
150
  assert(isinstance(row_dict, OrderedDict))
150
151
  assert(isinstance(text_opinion_linkage, TextOpinionsLinkage))
151
152
 
152
153
  etalon_label = self.__instances_provider.provide_label(text_opinion_linkage)
153
154
  for instance in self.__instances_provider.iter_instances(text_opinion_linkage):
154
155
  yield self.__create_row(row=row_dict,
155
- parsed_news=parsed_news,
156
+ row_id=0,
157
+ parsed_doc=parsed_doc,
156
158
  entity_service=entity_service,
157
159
  text_opinions_linkage=instance,
158
160
  index_in_linked=index_in_linked,
@@ -160,7 +162,7 @@ class BaseSampleRowProvider(BaseRowProvider):
160
162
  etalon_label=etalon_label,
161
163
  idle_mode=idle_mode)
162
164
 
163
- def __create_row(self, row, parsed_news, entity_service, text_opinions_linkage,
165
+ def __create_row(self, row, row_id, parsed_doc, entity_service, text_opinions_linkage,
164
166
  index_in_linked, etalon_label, idle_mode):
165
167
  """
166
168
  Composing row in following format:
@@ -196,7 +198,7 @@ class BaseSampleRowProvider(BaseRowProvider):
196
198
  raise Exception("Limitation: Multi-Sentence text_opinions are not supported.")
197
199
 
198
200
  self._fill_row_core(row=row,
199
- parsed_news=parsed_news,
201
+ parsed_doc=parsed_doc,
200
202
  sentence_ind=source_s_ind,
201
203
  text_opinion_linkage=text_opinions_linkage,
202
204
  index_in_linked=index_in_linked,
@@ -34,9 +34,9 @@ class CroppedSampleRowProvider(BaseSampleRowProvider):
34
34
 
35
35
  return _from, _to
36
36
 
37
- def _provide_sentence_terms(self, parsed_news, sentence_ind, s_ind, t_ind):
37
+ def _provide_sentence_terms(self, parsed_doc, sentence_ind, s_ind, t_ind):
38
38
  terms_iter, src_ind, tgt_ind = super(CroppedSampleRowProvider, self)._provide_sentence_terms(
39
- parsed_news=parsed_news, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
39
+ parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
40
40
  terms = list(terms_iter)
41
41
  _from, _to = self.__calc_window_bounds(window_size=self.__crop_window_size,
42
42
  s_ind=s_ind, t_ind=t_ind, input_length=len(terms))
@@ -1,6 +1,6 @@
1
1
  from collections import OrderedDict
2
2
 
3
- from arekit.common.news.parsed.providers.entity_service import EntityServiceProvider, DistanceType
3
+ from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider, DistanceType
4
4
  from arekit.common.text_opinions.base import TextOpinion
5
5
 
6
6
 
@@ -0,0 +1,82 @@
1
+ from arekit.common.data import const
2
+ from arekit.common.utils import filter_whitespaces, split_by_whitespaces
3
+
4
+
5
def process_values_list(value, args_sep):
    """ Splits the serialized `value` into a list of string items.

        value: str
            the serialized list.
        args_sep: str
            delimiter placed between the items of the serialized list.

        returns: list of str
    """
    items = value.split(args_sep)
    return items
7
+
8
+
9
def process_indices_list(value, no_value_func, args_sep):
    """ Parses a serialized list of integer indices.

        value: str, int or None
            the cell content; non-string values are converted with `str`
            before splitting.
        no_value_func: callable
            called without arguments to obtain the result for an empty cell.
        args_sep: str
            delimiter placed between the serialized indices.

        returns: list of int, or the result of `no_value_func()` for an empty cell.
        raises: ValueError when an item cannot be converted to int.
    """
    import numbers

    # An integer zero is falsy, yet it is a legitimate single index
    # (for instance when the cell was read as a number rather than as text),
    # so it must not be mistaken for an absent value.
    is_integer = isinstance(value, numbers.Integral) and not isinstance(value, bool)
    if not value and not is_integer:
        return no_value_func()

    return [int(v) for v in str(value).split(args_sep)]
11
+
12
+
13
def process_text(value):
    """ The core method of the input text processing.

        value: str or list
            a string is first split into terms with `split_by_whitespaces`;
            a list is considered to be already split into terms.

        returns: the result of `filter_whitespaces` applied to the terms.
    """
    assert(isinstance(value, (str, list)))

    if isinstance(value, str):
        terms = list(split_by_whitespaces(value))
    else:
        terms = value

    return filter_whitespaces(terms)
19
+
20
+
21
def create_base_column_value_fmt(no_value_func=lambda: None, args_sep=","):
    """ Composes the table of value converters for the base sample columns.

        no_value_func: callable
            called without arguments by the `const.ENTITIES` parser
            to obtain the result for an empty cell.
        args_sep: str
            delimiter used for the list-like cells (entity values, entity types
            and entity indices).

        returns: dict
            column name -> {"writer": func, "parser": func}, where the "writer"
            converts a value into its cell representation and the "parser"
            converts the cell content back.
    """

    def as_is(value):
        return value

    def join_cleaned(items):
        # The separator is removed from the items, so that it may
        # appear in the resulting cell only as a delimiter.
        return args_sep.join([item.replace(args_sep, '') for item in items])

    return {
        const.ID: {
            "writer": as_is,
            "parser": as_is
        },
        const.DOC_ID: {
            "writer": as_is,
            "parser": as_is,
        },
        const.S_IND: {
            "writer": as_is,
            "parser": int
        },
        const.T_IND: {
            "writer": as_is,
            "parser": int
        },
        const.SENT_IND: {
            "writer": as_is,
            "parser": int
        },
        const.OPINION_ID: {
            "writer": as_is,
            "parser": int
        },
        const.OPINION_LINKAGE_ID: {
            "writer": as_is,
            "parser": int
        },
        const.ENTITY_VALUES: {
            "writer": lambda entities: join_cleaned([e.DisplayValue for e in entities]),
            "parser": lambda value: process_values_list(value, args_sep=args_sep),
        },
        const.ENTITY_TYPES: {
            "writer": lambda entities: join_cleaned([e.Type for e in entities]),
            "parser": lambda value: process_values_list(value, args_sep=args_sep)
        },
        const.ENTITIES: {
            # Items are stringified before joining: the parser of this column
            # yields integers, and `str.join` accepts strings only. Indices that
            # are already strings are written exactly as before.
            "writer": lambda entity_inds: args_sep.join([str(ind) for ind in entity_inds]),
            "parser": lambda value: process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
        },
        const.TEXT: {
            "writer": as_is,
            "parser": lambda value: process_text(value)
        },
        const.LABEL_UINT: {
            "writer": as_is,
            "parser": int
        }
    }
75
+
76
+
77
def create_base_column_fmt(fmt_type, args_sep=","):
    """ Provides the base columns formatters of a single kind.

        fmt_type: str
            key of the converter to pick for every column
            out of the table built by `create_base_column_value_fmt`.
        args_sep: str
            delimiter forwarded to `create_base_column_value_fmt`.

        returns: dict
            column name -> converter function of the requested kind.
        raises: KeyError when a column has no converter under `fmt_type`.
    """
    assert(isinstance(fmt_type, str))
    value_fmts = create_base_column_value_fmt(args_sep=args_sep)
    return {column: converters[fmt_type] for column, converters in value_fmts.items()}
@@ -0,0 +1,43 @@
1
class ParsedSampleRow(object):
    """ Gives access to the parsed values of a single sample row.
    """

    def __init__(self, row, columns_fmts, no_value_func):
        """ row: dict
                dict of the pairs ("field_name", value)
            columns_fmts: list
                list of the formatters, where every formatter is a dictionary
                ("field_name" -> parsing function). A field is parsed with the
                first formatter that mentions it; fields that are not mentioned
                by any formatter are not stored.
            no_value_func: func
                called without arguments to obtain the value
                that conveys the absence of the requested field.
        """
        assert(isinstance(row, dict))
        assert(isinstance(columns_fmts, list))
        assert(callable(no_value_func))

        self.__uint_label = None
        self.__params = {}
        self.__no_value = no_value_func

        for field_name, raw_value in row.items():
            for parsers in columns_fmts:
                assert(isinstance(parsers, dict))
                if field_name in parsers:
                    # The first matching formatter wins, the remaining ones are skipped.
                    self.__params[field_name] = parsers[field_name](raw_value)
                    break

    def __value_or_none(self, key):
        if key in self.__params:
            return self.__params[key]
        return self.__no_value()

    def __getitem__(self, item):
        """ Provides the parsed value of the field `item`, or the result
            of the no-value function when `item` is None or was not parsed.
        """
        assert (item is None or isinstance(item, str))
        if item is None or item not in self.__params:
            return self.__no_value()
        return self.__params[item]

    @classmethod
    def parse(cls, row, columns_fmts, no_value_func):
        """ Alternative constructor; the parameters are the same as of `__init__`.
        """
        return cls(row=row, columns_fmts=columns_fmts, no_value_func=no_value_func)
@@ -2,7 +2,8 @@ import gc
2
2
  import logging
3
3
 
4
4
  from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
5
- from arekit.common.utils import progress_bar
5
+ from arekit.common.linkage.meta import MetaEmptyLinkedDataWrapper
6
+ from arekit.common.utils import progress_bar_conditional
6
7
 
7
8
  logger = logging.getLogger(__name__)
8
9
 
@@ -54,6 +55,9 @@ class BaseRowsStorage(object):
54
55
def iter_column_names(self):
    """ Placeholder that always raises: this class provides no implementation.
        NOTE(review): presumably derived storages override it in order
        to iterate the names of the columns -- confirm against the subclasses.

        raises: NotImplementedError
    """
    # `NotImplemented` is a non-callable constant intended for the rich
    # comparison methods; calling it ends up with a misleading TypeError.
    raise NotImplementedError()
56
57
 
58
def iter_column_types(self):
    """ Placeholder that always raises: this class provides no implementation.
        NOTE(review): presumably derived storages override it in order
        to iterate the types of the columns -- confirm against the subclasses.

        raises: NotImplementedError
    """
    # `NotImplemented` is a non-callable constant intended for the rich
    # comparison methods; calling it ends up with a misleading TypeError.
    raise NotImplementedError()
60
+
57
61
  # endregion
58
62
 
59
63
  def fill(self, iter_rows_func, columns_provider, row_handler=None, rows_count=None, desc=""):
@@ -61,30 +65,31 @@ class BaseRowsStorage(object):
61
65
  assert(isinstance(columns_provider, BaseColumnsProvider))
62
66
  assert(callable(row_handler) or row_handler is None)
63
67
 
64
- pbar_it = progress_bar(iterable=iter_rows_func(False),
65
- desc="{fmt}".format(fmt=desc),
66
- total=rows_count)
67
-
68
68
  doc_ids_seen = set()
69
69
 
70
- for row_index, row in enumerate(pbar_it):
71
-
72
- doc_id, row_values = row
73
-
70
+ def postfix_func(item):
71
+ doc_id, _ = item
72
+ doc_ids_seen.add(doc_id)
73
+ return {
74
+ "docs_seen": len(doc_ids_seen),
75
+ "doc_now": str(doc_id)
76
+ }
77
+
78
+ pbar_it = progress_bar_conditional(
79
+ iterable=iter_rows_func(False),
80
+ # We skip meta information data.
81
+ condition_func=lambda item: not isinstance(item[1], MetaEmptyLinkedDataWrapper),
82
+ postfix_func=postfix_func,
83
+ desc="{fmt}".format(fmt=desc),
84
+ total=rows_count)
85
+
86
+ for row_index, item in enumerate(pbar_it):
87
+ _, row_values = item
74
88
  self._begin_filling_row(row_index)
75
-
76
89
  for column, value in row_values.items():
77
90
  self._set_row_value(row_ind=row_index,
78
91
  column=column,
79
92
  value=value)
80
-
81
- # Provide information about amount of processed documents.
82
- doc_ids_seen.add(doc_id)
83
- pbar_it.set_postfix({
84
- "docs_seen": len(doc_ids_seen),
85
- "doc_now": doc_id,
86
- })
87
-
88
93
  if row_handler is not None:
89
94
  row_handler()
90
95
 
@@ -1,23 +1,17 @@
1
1
  from arekit.common.data import const
2
- from arekit.common.data.row_ids.base import BaseIDProvider
3
2
  from arekit.common.data.storages.base import BaseRowsStorage
4
3
 
5
4
 
5
+ # TODO. This is a particular type of view, and expected to be off the core.
6
6
  class LinkedSamplesStorageView(object):
7
7
 
8
- def __init__(self, row_ids_provider):
9
- assert(isinstance(row_ids_provider, BaseIDProvider))
10
- self.__row_ids_provider = row_ids_provider
11
-
12
8
  def iter_from_storage(self, storage):
13
9
  assert(isinstance(storage, BaseRowsStorage))
14
10
  undefined = -1
15
11
 
16
12
  linked = []
17
13
  current_opinion_id = undefined
18
- for row_index, sample_id in enumerate(storage.iter_column_values(const.ID)):
19
- sample_id = str(sample_id)
20
- opinion_id = self.__row_ids_provider.parse_opinion_in_sample_id(sample_id)
14
+ for row_index, opinion_id in enumerate(storage.iter_column_values(const.OPINION_ID)):
21
15
  if current_opinion_id != undefined:
22
16
  if opinion_id != current_opinion_id:
23
17
  yield linked
@@ -1,4 +1,4 @@
1
- class News(object):
1
+ class Document(object):
2
2
 
3
3
  def __init__(self, doc_id, sentences):
4
4
  assert(isinstance(sentences, list))
@@ -13,7 +13,7 @@ class News(object):
13
13
 
14
14
  @property
15
15
  def SentencesCount(self):
16
- """ Provides total amount of sentences within a news
16
+ """ Provides total amount of sentences within a doc
17
17
  At present is useful for:
18
18
  - CV-splitters, which may rely on sentences count.
19
19
  - Text parsing.
@@ -4,8 +4,9 @@ from arekit.common.pipeline.items.base import BasePipelineItem
4
4
 
5
5
  class EntitiesGroupingPipelineItem(BasePipelineItem):
6
6
 
7
- def __init__(self, value_to_group_id_func):
7
+ def __init__(self, value_to_group_id_func, **kwargs):
8
8
  assert(callable(value_to_group_id_func))
9
+ super(EntitiesGroupingPipelineItem, self).__init__(**kwargs)
9
10
  self.__value_to_group_id_func = value_to_group_id_func
10
11
 
11
12
  def apply_core(self, input_data, pipeline_ctx):
@@ -3,12 +3,13 @@ from arekit.common.entities.base import Entity
3
3
 
4
4
  class DocumentEntity(Entity):
5
5
 
6
- def __init__(self, value, display_value, e_type, id_in_doc, group_index):
6
+ def __init__(self, value, display_value, e_type, childs, id_in_doc, group_index):
7
7
  """ id_in_doc: Id, utilized witin the internal services
8
8
  """
9
9
  super(DocumentEntity, self).__init__(value=value,
10
10
  e_type=e_type,
11
11
  display_value=display_value,
12
+ childs=childs,
12
13
  group_index=group_index)
13
14
  self.__id = id_in_doc
14
15
 
@@ -1,14 +1,14 @@
1
- import collections
1
+ from collections.abc import Iterable
2
2
 
3
3
  from arekit.common.entities.base import Entity
4
4
  from arekit.common.text.enums import TermFormat
5
5
  from arekit.common.text.parsed import BaseParsedText
6
6
 
7
7
 
8
- class ParsedNews(object):
8
+ class ParsedDocument(object):
9
9
  """
10
- This class represents an information of the processed news in following directions:
11
- - news words
10
+ This class represents an information of the processed doc in following directions:
11
+ - doc words
12
12
  - tokens
13
13
  - entities (positions).
14
14
  - frames (FrameVariants)
@@ -25,7 +25,7 @@ class ParsedNews(object):
25
25
  parsed_sentences: iterable of ParsedSentence type
26
26
  NOTE: Considered sentences with labeled Entities in it!
27
27
  """
28
- assert(isinstance(parsed_sentences, collections.Iterable))
28
+ assert(isinstance(parsed_sentences, Iterable))
29
29
 
30
30
  self.__doc_id = doc_id
31
31
  self.__parsed_sentences = list(parsed_sentences)