arekit 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. arekit/__init__.py +0 -0
  2. arekit/common/__init__.py +0 -0
  3. arekit/common/bound.py +48 -0
  4. arekit/common/context/__init__.py +0 -0
  5. arekit/common/context/terms_mapper.py +51 -0
  6. arekit/common/context/token.py +16 -0
  7. arekit/common/data/__init__.py +0 -0
  8. arekit/common/data/const.py +21 -0
  9. arekit/common/data/doc_provider.py +6 -0
  10. arekit/common/data/input/__init__.py +0 -0
  11. arekit/common/data/input/providers/__init__.py +0 -0
  12. arekit/common/data/input/providers/columns/__init__.py +0 -0
  13. arekit/common/data/input/providers/columns/base.py +9 -0
  14. arekit/common/data/input/providers/columns/sample.py +59 -0
  15. arekit/common/data/input/providers/const.py +3 -0
  16. arekit/common/data/input/providers/contents.py +9 -0
  17. arekit/common/data/input/providers/instances/__init__.py +0 -0
  18. arekit/common/data/input/providers/instances/base.py +14 -0
  19. arekit/common/data/input/providers/instances/multiple.py +27 -0
  20. arekit/common/data/input/providers/instances/single.py +8 -0
  21. arekit/common/data/input/providers/label/__init__.py +0 -0
  22. arekit/common/data/input/providers/label/base.py +24 -0
  23. arekit/common/data/input/providers/label/binary.py +11 -0
  24. arekit/common/data/input/providers/label/multiple.py +15 -0
  25. arekit/common/data/input/providers/rows/__init__.py +0 -0
  26. arekit/common/data/input/providers/rows/base.py +64 -0
  27. arekit/common/data/input/providers/rows/samples.py +227 -0
  28. arekit/common/data/input/providers/sample/__init__.py +0 -0
  29. arekit/common/data/input/providers/sample/cropped.py +43 -0
  30. arekit/common/data/input/providers/text/__init__.py +0 -0
  31. arekit/common/data/input/providers/text/single.py +49 -0
  32. arekit/common/data/input/repositories/__init__.py +0 -0
  33. arekit/common/data/input/repositories/base.py +68 -0
  34. arekit/common/data/input/repositories/sample.py +22 -0
  35. arekit/common/data/input/sample.py +66 -0
  36. arekit/common/data/input/terms_mapper.py +88 -0
  37. arekit/common/data/rows_fmt.py +82 -0
  38. arekit/common/data/rows_parser.py +43 -0
  39. arekit/common/data/storages/__init__.py +0 -0
  40. arekit/common/data/storages/base.py +109 -0
  41. arekit/common/data/views/__init__.py +0 -0
  42. arekit/common/data/views/samples.py +26 -0
  43. arekit/common/docs/__init__.py +0 -0
  44. arekit/common/docs/base.py +30 -0
  45. arekit/common/docs/entities_grouping.py +16 -0
  46. arekit/common/docs/entity.py +18 -0
  47. arekit/common/docs/objects_parser.py +37 -0
  48. arekit/common/docs/parsed/__init__.py +0 -0
  49. arekit/common/docs/parsed/base.py +101 -0
  50. arekit/common/docs/parsed/providers/__init__.py +0 -0
  51. arekit/common/docs/parsed/providers/base.py +68 -0
  52. arekit/common/docs/parsed/providers/base_pairs.py +51 -0
  53. arekit/common/docs/parsed/providers/entity_service.py +175 -0
  54. arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
  55. arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
  56. arekit/common/docs/parsed/service.py +31 -0
  57. arekit/common/docs/parsed/term_position.py +42 -0
  58. arekit/common/docs/parser.py +34 -0
  59. arekit/common/docs/sentence.py +14 -0
  60. arekit/common/entities/__init__.py +0 -0
  61. arekit/common/entities/base.py +51 -0
  62. arekit/common/entities/collection.py +72 -0
  63. arekit/common/entities/str_fmt.py +8 -0
  64. arekit/common/entities/types.py +9 -0
  65. arekit/common/experiment/__init__.py +0 -0
  66. arekit/common/experiment/api/__init__.py +0 -0
  67. arekit/common/experiment/api/base_samples_io.py +20 -0
  68. arekit/common/experiment/data_type.py +17 -0
  69. arekit/common/frames/__init__.py +0 -0
  70. arekit/common/frames/connotations/__init__.py +0 -0
  71. arekit/common/frames/connotations/descriptor.py +17 -0
  72. arekit/common/frames/connotations/provider.py +4 -0
  73. arekit/common/frames/text_variant.py +43 -0
  74. arekit/common/frames/variants/__init__.py +0 -0
  75. arekit/common/frames/variants/base.py +21 -0
  76. arekit/common/frames/variants/collection.py +60 -0
  77. arekit/common/labels/__init__.py +0 -0
  78. arekit/common/labels/base.py +19 -0
  79. arekit/common/labels/provider/__init__.py +0 -0
  80. arekit/common/labels/provider/base.py +7 -0
  81. arekit/common/labels/provider/constant.py +14 -0
  82. arekit/common/labels/scaler/__init__.py +0 -0
  83. arekit/common/labels/scaler/base.py +85 -0
  84. arekit/common/labels/scaler/sentiment.py +7 -0
  85. arekit/common/labels/scaler/single.py +10 -0
  86. arekit/common/labels/str_fmt.py +55 -0
  87. arekit/common/linkage/__init__.py +0 -0
  88. arekit/common/linkage/base.py +44 -0
  89. arekit/common/linkage/meta.py +23 -0
  90. arekit/common/linkage/opinions.py +9 -0
  91. arekit/common/linkage/text_opinions.py +22 -0
  92. arekit/common/log_utils.py +29 -0
  93. arekit/common/model/__init__.py +0 -0
  94. arekit/common/model/labeling/__init__.py +0 -0
  95. arekit/common/model/labeling/base.py +24 -0
  96. arekit/common/model/labeling/modes.py +8 -0
  97. arekit/common/model/labeling/single.py +24 -0
  98. arekit/common/opinions/__init__.py +0 -0
  99. arekit/common/opinions/annot/__init__.py +0 -0
  100. arekit/common/opinions/annot/algo/__init__.py +0 -0
  101. arekit/common/opinions/annot/algo/base.py +4 -0
  102. arekit/common/opinions/annot/algo/pair_based.py +99 -0
  103. arekit/common/opinions/annot/algo/predefined.py +16 -0
  104. arekit/common/opinions/annot/algo_based.py +55 -0
  105. arekit/common/opinions/annot/base.py +15 -0
  106. arekit/common/opinions/base.py +74 -0
  107. arekit/common/opinions/collection.py +150 -0
  108. arekit/common/opinions/enums.py +6 -0
  109. arekit/common/opinions/provider.py +4 -0
  110. arekit/common/opinions/writer.py +4 -0
  111. arekit/common/pipeline/__init__.py +0 -0
  112. arekit/common/pipeline/base.py +25 -0
  113. arekit/common/pipeline/context.py +36 -0
  114. arekit/common/pipeline/conts.py +2 -0
  115. arekit/common/pipeline/items/__init__.py +0 -0
  116. arekit/common/pipeline/items/base.py +12 -0
  117. arekit/common/pipeline/items/flatten.py +14 -0
  118. arekit/common/pipeline/items/handle.py +17 -0
  119. arekit/common/pipeline/items/iter.py +11 -0
  120. arekit/common/pipeline/items/map.py +11 -0
  121. arekit/common/pipeline/items/map_nested.py +13 -0
  122. arekit/common/synonyms/__init__.py +0 -0
  123. arekit/common/synonyms/base.py +151 -0
  124. arekit/common/synonyms/grouping.py +21 -0
  125. arekit/common/text/__init__.py +0 -0
  126. arekit/common/text/enums.py +12 -0
  127. arekit/common/text/parsed.py +42 -0
  128. arekit/common/text/parser.py +12 -0
  129. arekit/common/text/partitioning/__init__.py +0 -0
  130. arekit/common/text/partitioning/base.py +4 -0
  131. arekit/common/text/partitioning/str.py +36 -0
  132. arekit/common/text/partitioning/terms.py +35 -0
  133. arekit/common/text/stemmer.py +16 -0
  134. arekit/common/text_opinions/__init__.py +0 -0
  135. arekit/common/text_opinions/base.py +105 -0
  136. arekit/common/utils.py +129 -0
  137. arekit/contrib/__init__.py +0 -0
  138. arekit/contrib/bert/__init__.py +0 -0
  139. arekit/contrib/bert/input/__init__.py +0 -0
  140. arekit/contrib/bert/input/providers/__init__.py +0 -0
  141. arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
  142. arekit/contrib/bert/input/providers/text_pair.py +62 -0
  143. arekit/contrib/bert/terms/__init__.py +0 -0
  144. arekit/contrib/bert/terms/mapper.py +20 -0
  145. arekit/contrib/networks/__init__.py +0 -0
  146. arekit/contrib/networks/embedding.py +149 -0
  147. arekit/contrib/networks/embedding_io.py +18 -0
  148. arekit/contrib/networks/input/__init__.py +0 -0
  149. arekit/contrib/networks/input/const.py +6 -0
  150. arekit/contrib/networks/input/ctx_serialization.py +28 -0
  151. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  152. arekit/contrib/networks/input/embedding/matrix.py +29 -0
  153. arekit/contrib/networks/input/embedding/offsets.py +55 -0
  154. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  155. arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
  156. arekit/contrib/networks/input/providers/__init__.py +0 -0
  157. arekit/contrib/networks/input/providers/sample.py +129 -0
  158. arekit/contrib/networks/input/providers/term_connotation.py +23 -0
  159. arekit/contrib/networks/input/providers/text.py +24 -0
  160. arekit/contrib/networks/input/rows_parser.py +47 -0
  161. arekit/contrib/networks/input/term_types.py +13 -0
  162. arekit/contrib/networks/input/terms_mapping.py +60 -0
  163. arekit/contrib/networks/vectorizer.py +6 -0
  164. arekit/contrib/prompt/__init__.py +0 -0
  165. arekit/contrib/prompt/sample.py +61 -0
  166. arekit/contrib/source/__init__.py +0 -0
  167. arekit/contrib/source/brat/__init__.py +0 -0
  168. arekit/contrib/source/brat/annot.py +84 -0
  169. arekit/contrib/source/brat/doc.py +28 -0
  170. arekit/contrib/source/brat/entities/__init__.py +0 -0
  171. arekit/contrib/source/brat/entities/compound.py +13 -0
  172. arekit/contrib/source/brat/entities/entity.py +42 -0
  173. arekit/contrib/source/brat/entities/parser.py +53 -0
  174. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  175. arekit/contrib/source/brat/opinions/converter.py +19 -0
  176. arekit/contrib/source/brat/relation.py +32 -0
  177. arekit/contrib/source/brat/sentence.py +69 -0
  178. arekit/contrib/source/brat/sentences_reader.py +128 -0
  179. arekit/contrib/source/download.py +41 -0
  180. arekit/contrib/source/nerel/__init__.py +0 -0
  181. arekit/contrib/source/nerel/entities.py +55 -0
  182. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  183. arekit/contrib/source/nerel/folding/fixed.py +74 -0
  184. arekit/contrib/source/nerel/io_utils.py +62 -0
  185. arekit/contrib/source/nerel/labels.py +241 -0
  186. arekit/contrib/source/nerel/reader.py +46 -0
  187. arekit/contrib/source/nerel/utils.py +24 -0
  188. arekit/contrib/source/nerel/versions.py +12 -0
  189. arekit/contrib/source/nerelbio/__init__.py +0 -0
  190. arekit/contrib/source/nerelbio/io_utils.py +62 -0
  191. arekit/contrib/source/nerelbio/labels.py +265 -0
  192. arekit/contrib/source/nerelbio/reader.py +8 -0
  193. arekit/contrib/source/nerelbio/versions.py +8 -0
  194. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  195. arekit/contrib/source/ruattitudes/collection.py +36 -0
  196. arekit/contrib/source/ruattitudes/doc.py +51 -0
  197. arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
  198. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  199. arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
  200. arekit/contrib/source/ruattitudes/io_utils.py +56 -0
  201. arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
  202. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  203. arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
  204. arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
  205. arekit/contrib/source/ruattitudes/reader.py +268 -0
  206. arekit/contrib/source/ruattitudes/sentence.py +73 -0
  207. arekit/contrib/source/ruattitudes/synonyms.py +17 -0
  208. arekit/contrib/source/ruattitudes/text_object.py +59 -0
  209. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  210. arekit/contrib/source/rusentiframes/collection.py +157 -0
  211. arekit/contrib/source/rusentiframes/effect.py +24 -0
  212. arekit/contrib/source/rusentiframes/io_utils.py +19 -0
  213. arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
  214. arekit/contrib/source/rusentiframes/polarity.py +35 -0
  215. arekit/contrib/source/rusentiframes/role.py +15 -0
  216. arekit/contrib/source/rusentiframes/state.py +24 -0
  217. arekit/contrib/source/rusentiframes/types.py +42 -0
  218. arekit/contrib/source/rusentiframes/value.py +2 -0
  219. arekit/contrib/source/rusentrel/__init__.py +0 -0
  220. arekit/contrib/source/rusentrel/const.py +3 -0
  221. arekit/contrib/source/rusentrel/docs_reader.py +51 -0
  222. arekit/contrib/source/rusentrel/entities.py +26 -0
  223. arekit/contrib/source/rusentrel/io_utils.py +125 -0
  224. arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
  225. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  226. arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
  227. arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
  228. arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
  229. arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
  230. arekit/contrib/source/rusentrel/synonyms.py +17 -0
  231. arekit/contrib/source/sentinerel/__init__.py +0 -0
  232. arekit/contrib/source/sentinerel/entities.py +52 -0
  233. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  234. arekit/contrib/source/sentinerel/folding/factory.py +31 -0
  235. arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
  236. arekit/contrib/source/sentinerel/io_utils.py +87 -0
  237. arekit/contrib/source/sentinerel/labels.py +53 -0
  238. arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
  239. arekit/contrib/source/sentinerel/reader.py +42 -0
  240. arekit/contrib/source/synonyms/__init__.py +0 -0
  241. arekit/contrib/source/synonyms/utils.py +19 -0
  242. arekit/contrib/source/zip_utils.py +47 -0
  243. arekit/contrib/utils/__init__.py +0 -0
  244. arekit/contrib/utils/bert/__init__.py +0 -0
  245. arekit/contrib/utils/bert/samplers.py +17 -0
  246. arekit/contrib/utils/connotations/__init__.py +0 -0
  247. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
  248. arekit/contrib/utils/data/__init__.py +0 -0
  249. arekit/contrib/utils/data/contents/__init__.py +0 -0
  250. arekit/contrib/utils/data/contents/opinions.py +37 -0
  251. arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
  252. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  253. arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
  254. arekit/contrib/utils/data/readers/__init__.py +0 -0
  255. arekit/contrib/utils/data/readers/base.py +7 -0
  256. arekit/contrib/utils/data/readers/csv_pd.py +38 -0
  257. arekit/contrib/utils/data/readers/jsonl.py +15 -0
  258. arekit/contrib/utils/data/service/__init__.py +0 -0
  259. arekit/contrib/utils/data/service/balance.py +50 -0
  260. arekit/contrib/utils/data/storages/__init__.py +0 -0
  261. arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
  262. arekit/contrib/utils/data/storages/pandas_based.py +123 -0
  263. arekit/contrib/utils/data/storages/row_cache.py +48 -0
  264. arekit/contrib/utils/data/writers/__init__.py +0 -0
  265. arekit/contrib/utils/data/writers/base.py +27 -0
  266. arekit/contrib/utils/data/writers/csv_native.py +63 -0
  267. arekit/contrib/utils/data/writers/csv_pd.py +40 -0
  268. arekit/contrib/utils/data/writers/json_opennre.py +132 -0
  269. arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
  270. arekit/contrib/utils/download.py +77 -0
  271. arekit/contrib/utils/embeddings/__init__.py +0 -0
  272. arekit/contrib/utils/embeddings/rusvectores.py +58 -0
  273. arekit/contrib/utils/embeddings/tokens.py +30 -0
  274. arekit/contrib/utils/entities/__init__.py +0 -0
  275. arekit/contrib/utils/entities/filter.py +7 -0
  276. arekit/contrib/utils/entities/formatters/__init__.py +0 -0
  277. arekit/contrib/utils/entities/formatters/str_display.py +11 -0
  278. arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
  279. arekit/contrib/utils/io_utils/__init__.py +0 -0
  280. arekit/contrib/utils/io_utils/embedding.py +72 -0
  281. arekit/contrib/utils/io_utils/opinions.py +37 -0
  282. arekit/contrib/utils/io_utils/samples.py +79 -0
  283. arekit/contrib/utils/io_utils/utils.py +39 -0
  284. arekit/contrib/utils/lexicons/__init__.py +0 -0
  285. arekit/contrib/utils/lexicons/lexicon.py +41 -0
  286. arekit/contrib/utils/lexicons/relation.py +42 -0
  287. arekit/contrib/utils/lexicons/rusentilex.py +37 -0
  288. arekit/contrib/utils/nn/__init__.py +0 -0
  289. arekit/contrib/utils/nn/rows.py +83 -0
  290. arekit/contrib/utils/np_utils/__init__.py +0 -0
  291. arekit/contrib/utils/np_utils/embedding.py +22 -0
  292. arekit/contrib/utils/np_utils/npz_utils.py +13 -0
  293. arekit/contrib/utils/np_utils/vocab.py +20 -0
  294. arekit/contrib/utils/pipelines/__init__.py +0 -0
  295. arekit/contrib/utils/pipelines/items/__init__.py +0 -0
  296. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  297. arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
  298. arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
  299. arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
  300. arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
  301. arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
  302. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
  303. arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
  304. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
  305. arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
  306. arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
  307. arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
  308. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  309. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  310. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
  311. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
  312. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
  313. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  314. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
  315. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
  316. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
  317. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  318. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
  319. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
  320. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
  321. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  322. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
  323. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
  324. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  325. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
  326. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
  327. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
  328. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
  329. arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
  330. arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
  331. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
  332. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
  333. arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
  334. arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
  335. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
  336. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
  337. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
  338. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
  339. arekit/contrib/utils/processing/__init__.py +0 -0
  340. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  341. arekit/contrib/utils/processing/languages/mods.py +12 -0
  342. arekit/contrib/utils/processing/languages/pos.py +23 -0
  343. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  344. arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
  345. arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
  346. arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
  347. arekit/contrib/utils/processing/languages/ru/number.py +23 -0
  348. arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
  349. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  350. arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
  351. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  352. arekit/contrib/utils/processing/pos/base.py +12 -0
  353. arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
  354. arekit/contrib/utils/processing/pos/russian.py +10 -0
  355. arekit/contrib/utils/processing/text/__init__.py +0 -0
  356. arekit/contrib/utils/processing/text/tokens.py +127 -0
  357. arekit/contrib/utils/resources.py +25 -0
  358. arekit/contrib/utils/serializer.py +43 -0
  359. arekit/contrib/utils/sources/__init__.py +0 -0
  360. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  361. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  362. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
  363. arekit/contrib/utils/synonyms/__init__.py +0 -0
  364. arekit/contrib/utils/synonyms/simple.py +15 -0
  365. arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
  366. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  367. arekit/contrib/utils/vectorizers/bpe.py +93 -0
  368. arekit/contrib/utils/vectorizers/random_norm.py +39 -0
  369. arekit/download_data.py +11 -0
  370. arekit-0.24.0.dist-info/LICENSE +21 -0
  371. arekit-0.24.0.dist-info/METADATA +23 -0
  372. arekit-0.24.0.dist-info/RECORD +374 -0
  373. arekit-0.24.0.dist-info/WHEEL +5 -0
  374. arekit-0.24.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,43 @@
1
+ from arekit.common.data.input.providers.label.multiple import MultipleLabelProvider
2
+ from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
3
+
4
+
5
class CroppedSampleRowProvider(BaseSampleRowProvider):
    """ Sample provider which has `crop_window` that allows to slice
        the potentially large samples and guarantee the presence of
        attitude inside.
    """

    def __init__(self, crop_window_size, label_scaler, text_provider):
        """ crop_window_size: int
                positive amount of terms kept in every cropped sample.
            label_scaler:
                wrapped into MultipleLabelProvider and passed to the base class.
            text_provider:
                forwarded to the base class as is.
        """
        assert(isinstance(crop_window_size, int) and crop_window_size > 0)
        super(CroppedSampleRowProvider, self).__init__(label_provider=MultipleLabelProvider(label_scaler),
                                                       text_provider=text_provider)
        self.__crop_window_size = crop_window_size

    @staticmethod
    def __calc_window_bounds(window_size, s_ind, t_ind, input_length):
        """ Finds the leftmost window of `window_size` terms which
            contains both `s_ind` and `t_ind`.

            returns: [_from, _to)
            raises: ValueError when no such window exists, i.e. when one of
                the indices is negative or the indices are `window_size`
                or more positions apart.
        """
        assert(isinstance(s_ind, int))
        assert(isinstance(t_ind, int))
        assert(isinstance(input_length, int))
        assert(input_length >= s_ind and input_length >= t_ind)

        first_ind = min(s_ind, t_ind)
        last_ind = max(s_ind, t_ind)

        # The step-by-step sliding of the window never terminates for such
        # input, so the impossible case is reported explicitly instead.
        if first_ind < 0 or last_ind - first_ind >= window_size:
            raise ValueError(
                "Indices s_ind={s} and t_ind={t} could not be fitted in a window of size {w}".format(
                    s=s_ind, t=t_ind, w=window_size))

        # Leftmost window start (beginning from 0) at which
        # the rightmost index is still covered by the window.
        _from = max(0, last_ind - window_size + 1)
        return _from, _from + window_size

    def _provide_sentence_terms(self, parsed_doc, sentence_ind, s_ind, t_ind):
        """ Crops terms provided by the base implementation to the window
            and shifts the indices returned by the base implementation
            by the beginning of that window.
        """
        terms_iter, src_ind, tgt_ind = super(CroppedSampleRowProvider, self)._provide_sentence_terms(
            parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
        terms = list(terms_iter)
        # NOTE(review): the window is calculated for the passed `s_ind` / `t_ind`,
        # while the shifted values are those returned by the base implementation;
        # presumably both pairs coincide -- confirm against the base class.
        _from, _to = self.__calc_window_bounds(window_size=self.__crop_window_size,
                                               s_ind=s_ind, t_ind=t_ind, input_length=len(terms))
        return terms[_from:_to], src_ind - _from, tgt_ind - _from
File without changes
@@ -0,0 +1,49 @@
1
+ from arekit.common.data import const
2
+ from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
3
+ from arekit.common.labels.base import Label
4
+
5
+
6
class BaseSingleTextProvider(object):
    """ Provides a single text column (TEXT_A), composed out of
        the sentence terms converted by the terms mapper.
    """

    TEXT_A = const.TEXT
    TERMS_SEPARATOR = " "

    def __init__(self, text_terms_mapper):
        """ text_terms_mapper: OpinionContainingTextTermsMapper
                mapper utilized to convert every term of a sentence.
        """
        assert(isinstance(text_terms_mapper, OpinionContainingTextTermsMapper))
        self._mapper = text_terms_mapper

    def iter_columns(self):
        """ Iterates over the names of the columns filled by this provider.
        """
        yield BaseSingleTextProvider.TEXT_A

    @staticmethod
    def _process_text(text):
        """ Returns the text without leading and trailing whitespaces.
        """
        assert(isinstance(text, str))
        return text.strip()

    def _mapped_data_to_str(self, m_data):
        """ Conversion of the mapped data; returns the data as is.
        """
        return m_data

    def _handle_mapped_data(self, m_data):
        """ Hook for an optional handling of the mapped data; does nothing.
        """
        pass

    def _handle_terms_and_compose_text(self, sentence_terms):
        """ Maps every term, converts and handles the result one by one,
            and joins the converted terms with TERMS_SEPARATOR.
        """
        assert(isinstance(sentence_terms, list))

        def iter_str_terms():
            # Conversion goes first, the handler is called right after it.
            for mapped in self._mapper.iter_mapped(sentence_terms):
                yield self._mapped_data_to_str(m_data=mapped)
                self._handle_mapped_data(m_data=mapped)

        pieces = list(iter_str_terms())
        return self.TERMS_SEPARATOR.join(pieces)

    def add_text_in_row(self, set_text_func, sentence_terms, s_ind, t_ind, expected_label):
        """ Declares indices in mapper and passes the composed and processed
            text of the TEXT_A column into `set_text_func`.
        """
        assert(callable(set_text_func))
        assert(isinstance(sentence_terms, list))
        assert(isinstance(expected_label, Label))

        # Indices are declared before the mapping is performed.
        self._mapper.set_s_ind(s_ind)
        self._mapper.set_t_ind(t_ind)

        composed = self._handle_terms_and_compose_text(sentence_terms)
        set_text_func(column=self.TEXT_A, value=self._process_text(text=composed))
File without changes
@@ -0,0 +1,68 @@
1
+ from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
2
+ from arekit.common.data.input.providers.contents import ContentsProvider
3
+ from arekit.common.data.input.providers.rows.base import BaseRowProvider
4
+ from arekit.common.data.storages.base import BaseRowsStorage
5
+ from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
6
+ from arekit.contrib.utils.data.writers.base import BaseWriter
7
+
8
+
9
class BaseInputRepository(object):
    """ Binds columns provider, rows provider and storage together
        in order to populate the storage with rows and push it to a writer.
    """

    def __init__(self, columns_provider, rows_provider, storage):
        """ columns_provider: BaseColumnsProvider
            rows_provider: BaseRowProvider
            storage: BaseRowsStorage
        """
        assert(isinstance(columns_provider, BaseColumnsProvider))
        assert(isinstance(rows_provider, BaseRowProvider))
        assert(isinstance(storage, BaseRowsStorage))

        self._columns_provider = columns_provider
        self._rows_provider = rows_provider
        self._storage = storage

        # Do setup operations.
        self._setup_columns_provider()
        self._setup_rows_provider()

    # region protected methods

    def _setup_columns_provider(self):
        """ Hook called from the constructor; does nothing by default.
        """
        pass

    def _setup_rows_provider(self):
        """ Hook called from the constructor right after
            the columns provider setup; does nothing by default.
        """
        pass

    # endregion

    def populate(self, contents_provider, doc_ids, desc="", writer=None, target=None):
        """ Initializes storage and fills it with the rows of the rows provider.

            contents_provider: ContentsProvider
            doc_ids: list
                identifiers passed to the rows provider.
            desc: str
                passed to the storage `fill` method as is.
            writer: BaseWriter or None
            target: str or None
                when both `writer` and `target` are declared, target is opened
                before the filling, the row handler commits a line of the storage
                to the writer, and target is closed once the filling is over,
                including the case when the filling fails with an exception.
        """
        assert(isinstance(contents_provider, ContentsProvider))
        assert(isinstance(self._storage, BaseRowsStorage))
        assert(isinstance(doc_ids, list))
        assert(isinstance(writer, BaseWriter) or writer is None)
        assert(isinstance(target, str) or target is None)

        def iter_rows(idle_mode):
            return self._rows_provider.iter_by_rows(
                contents_provider=contents_provider,
                doc_ids_iter=doc_ids,
                idle_mode=idle_mode)

        self._storage.init_empty(columns_provider=self._columns_provider)

        is_async_write_mode_on = writer is not None and target is not None

        if is_async_write_mode_on:
            writer.open_target(target)

        try:
            self._storage.fill(lambda idle_mode: iter_rows(idle_mode),
                               columns_provider=self._columns_provider,
                               row_handler=lambda: writer.commit_line(self._storage) if is_async_write_mode_on else None,
                               desc=desc)
        finally:
            # Opened target is released even when the filling has failed.
            if is_async_write_mode_on:
                writer.close_target()

    def push(self, writer, target, free_storage=True):
        """ Writes the whole storage into target, except the case of
            the RowCacheStorage, for which nothing is written
            (presumably its rows are committed during `populate` -- confirm).

            free_storage: bool
                whether to free the storage contents afterwards.
        """
        if not isinstance(self._storage, RowCacheStorage):
            writer.write_all(self._storage, target)

        # After writing we free the contents of the storage.
        if free_storage:
            self._storage.free()
@@ -0,0 +1,22 @@
1
+ import logging
2
+
3
+ from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
4
+ from arekit.common.data.input.repositories.base import BaseInputRepository
5
+
6
+ logger = logging.getLogger(__name__)
7
+ logging.basicConfig(level=logging.INFO)
8
+
9
+
10
class BaseInputSamplesRepository(BaseInputRepository):
    """ Input repository of samples, which synchronizes
        the columns provider with the rows provider on setup.
    """

    def _setup_columns_provider(self):
        """ Setup text column names, declared by
            the text provider of the rows provider.
        """
        names_iter = self._rows_provider.TextProvider.iter_columns()
        self._columns_provider.set_text_column_names(list(names_iter))

    def _setup_rows_provider(self):
        """ Setup store labels, according to the columns provider.
        """
        assert(isinstance(self._rows_provider, BaseSampleRowProvider))
        store_labels = self._columns_provider.StoreLabels
        self._rows_provider.set_store_labels(store_labels)
@@ -0,0 +1,66 @@
1
+ from collections import OrderedDict
2
+
3
+ from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider, DistanceType
4
+ from arekit.common.text_opinions.base import TextOpinion
5
+
6
+
7
class InputSampleBase(object):
    """
    Description of a single sample (context) of a model
    """

    def __init__(self, shift_index_dbg, input_sample_id, values):
        """ shift_index_dbg: int
            input_sample_id: str
            values: list
                list of (key, value) pairs, kept in the given order.
        """
        assert(isinstance(shift_index_dbg, int))
        assert(isinstance(input_sample_id, str))
        assert(isinstance(values, list))
        self._shift_index_dbg = shift_index_dbg
        self.__input_sample_id = input_sample_id
        self.__values = OrderedDict(values)

    # region properties

    @property
    def ID(self):
        """ Identifier of the sample, declared in the constructor.
        """
        return self.__input_sample_id

    # endregion

    @staticmethod
    def check_ability_to_create_sample(entity_service, window_size, text_opinion):
        """
        Main text_opinion filtering rules: ends of the opinion differ,
        the distance in terms between the ends is less than the window size,
        and both ends belong to the same sentence.
        """
        assert(isinstance(entity_service, EntityServiceProvider))
        assert(isinstance(text_opinion, TextOpinion))
        assert(isinstance(window_size, int) and window_size > 0)

        has_distinct_ends = bool(text_opinion.SourceId != text_opinion.TargetId)

        # Both distances are always requested, whatever the other checks are.
        terms_dist = entity_service.calc_dist_between_text_opinion_ends(
            text_opinion=text_opinion,
            distance_type=DistanceType.InTerms)
        fits_window = bool(InputSampleBase._check_ends_could_be_fitted_in_window(terms_dist, window_size))

        sents_dist = entity_service.calc_dist_between_text_opinion_ends(
            text_opinion=text_opinion,
            distance_type=DistanceType.InSentences)
        same_sentence = bool(sents_dist == 0)

        return all((has_distinct_ends, fits_window, same_sentence))

    @staticmethod
    def _check_ends_could_be_fitted_in_window(actual_dist, window):
        """ The distance is expected to be strictly less than the window.
        """
        return actual_dist < window

    def __iter__(self):
        """ Iterates over (key, value) pairs of the sample.
        """
        yield from self.__values.items()
@@ -0,0 +1,88 @@
1
+ from arekit.common.context.terms_mapper import TextTermsMapper
2
+ from arekit.common.context.token import Token
3
+ from arekit.common.entities.base import Entity
4
+ from arekit.common.entities.str_fmt import StringEntitiesFormatter
5
+ from arekit.common.entities.types import OpinionEntityType
6
+ from arekit.common.frames.text_variant import TextFrameVariant
7
+
8
+
9
class OpinionContainingTextTermsMapper(TextTermsMapper):
    """ Provides an ability to setup s_obj, t_obj
        The latter might be utilized with synonyms collection
    """

    def __init__(self, entity_formatter):
        """ entity_formatter: StringEntitiesFormatter
                converts an entity and its opinion-related type into a string.
        """
        assert(isinstance(entity_formatter, StringEntitiesFormatter))
        self.__entities_formatter = entity_formatter
        self.__s_ind = None
        self.__t_ind = None
        self.__s_group = None
        self.__t_group = None

    @property
    def StringEntitiesFormatter(self):
        return self.__entities_formatter

    def __syn_group(self, entity):
        """ Note: here we guarantee that entity has GroupIndex.
        """
        # NOTE(review): the assertion rejects None, while the expression
        # below (and `iter_mapped`) is written to tolerate None.
        # Behaviour is intentionally left unchanged here; confirm the intent.
        assert(isinstance(entity, Entity))
        return entity.GroupIndex if entity is not None else None

    def set_s_ind(self, s_ind):
        """ Declares the index of the term that is the opinion subject.
        """
        assert(isinstance(s_ind, int))
        self.__s_ind = s_ind

    def set_t_ind(self, t_ind):
        """ Declares the index of the term that is the opinion object.
        """
        assert(isinstance(t_ind, int))
        self.__t_ind = t_ind

    def _after_mapping(self):
        """ In order to prevent bugs.
            Every index should be declared before mapping.
        """
        self.__s_ind = None
        self.__t_ind = None

    def iter_mapped(self, terms):
        """ Remembers the groups of the subject / object terms
            and then delegates the mapping to the parent implementation.
        """
        terms_list = list(terms)
        self.__s_group = self.__syn_group(terms_list[self.__s_ind] if self.__s_ind is not None else None)
        self.__t_group = self.__syn_group(terms_list[self.__t_ind] if self.__t_ind is not None else None)
        # The materialized list is passed further: `terms` might be a one-shot
        # iterator which is already exhausted by the `list(...)` call above.
        return super(OpinionContainingTextTermsMapper, self).iter_mapped(terms_list)

    def map_entity(self, e_ind, entity):
        """ Formats the entity with respect to its role in the opinion:
            subject, object, synonym of one of them, or other.
        """
        entity_type = OpinionEntityType.Other
        if e_ind == self.__s_ind:
            entity_type = OpinionEntityType.Subject
        elif e_ind == self.__t_ind:
            entity_type = OpinionEntityType.Object
        elif self.__is_in_same_group(self.__syn_group(entity), self.__s_group):
            entity_type = OpinionEntityType.SynonymSubject
        elif self.__is_in_same_group(self.__syn_group(entity), self.__t_group):
            entity_type = OpinionEntityType.SynonymObject

        return self.__entities_formatter.to_string(original_value=entity,
                                                   entity_type=entity_type)

    @staticmethod
    def __is_in_same_group(g1, g2):

        if g1 is None or g2 is None:
            # In such scenario we cannot guarantee
            # that g1 and g2 belong to the same group.
            return False

        return g1 == g2

    def map_word(self, w_ind, word):
        return word.strip()

    def map_text_frame_variant(self, fv_ind, text_frame_variant):
        assert(isinstance(text_frame_variant, TextFrameVariant))
        return text_frame_variant.Variant.get_value().strip()

    def map_token(self, t_ind, token):
        assert(isinstance(token, Token))
        return token.get_meta_value()
@@ -0,0 +1,82 @@
1
+ from arekit.common.data import const
2
+ from arekit.common.utils import filter_whitespaces, split_by_whitespaces
3
+
4
+
5
def process_values_list(value, args_sep):
    """ Splits the serialized `value` by `args_sep` into a list of string items.
    """
    items = value.split(args_sep)
    return items
7
+
8
+
9
def process_indices_list(value, no_value_func, args_sep):
    """ Parses the serialized list of indices.

        value: str, int or None
            indices joined by `args_sep`; a bare integer is treated
            as a list of a single index.
        no_value_func: func
            called (without arguments) in order to provide the result
            when `value` is absent (None, empty string, etc.)
        args_sep: str
            separator of the indices.

        returns: list of int, or the result of `no_value_func()`.
    """
    import numbers

    # A bare integer is a complete value by itself: in particular, index `0`
    # is falsy but must not be confused with the absence of the value.
    is_single_index = isinstance(value, numbers.Integral) and not isinstance(value, bool)

    if not value and not is_single_index:
        return no_value_func()

    return [int(v) for v in str(value).split(args_sep)]
11
+
12
+
13
def process_text(value):
    """ The core method of the input text processing.

        value: str or list
            a string is first split by whitespaces into terms;
            a list is considered to be already split.

        returns: result of `filter_whitespaces` applied to the terms.
    """
    assert(isinstance(value, str) or isinstance(value, list))

    if isinstance(value, str):
        terms = list(split_by_whitespaces(value))
    else:
        terms = value

    return filter_whitespaces(terms)
19
+
20
+
21
def create_base_column_value_fmt(no_value_func=lambda: None, args_sep=","):
    """ Composes the formatters of the base columns.

        no_value_func: func
            provides the value for the absent list of entity indices.
        args_sep: str
            separator utilized for the list-like values.

        returns: dict
            column name -> {"writer": func, "parser": func}, where
            "writer" serializes the value and "parser" restores it.
    """

    def keep(value):
        return value

    def to_int(value):
        return int(value)

    def split_values(value):
        return process_values_list(value, args_sep=args_sep)

    return {
        const.ID: {
            "writer": keep,
            "parser": keep
        },
        const.DOC_ID: {
            "writer": keep,
            "parser": keep,
        },
        const.S_IND: {
            "writer": keep,
            "parser": to_int
        },
        const.T_IND: {
            "writer": keep,
            "parser": to_int
        },
        const.SENT_IND: {
            "writer": keep,
            "parser": to_int
        },
        const.OPINION_ID: {
            "writer": keep,
            "parser": to_int
        },
        const.OPINION_LINKAGE_ID: {
            "writer": keep,
            "parser": to_int
        },
        const.ENTITY_VALUES: {
            # The separator is dropped from every value to keep the list parsable.
            "writer": lambda entities: args_sep.join([e.DisplayValue.replace(args_sep, '') for e in entities]),
            "parser": split_values,
        },
        const.ENTITY_TYPES: {
            "writer": lambda entities: args_sep.join([e.Type.replace(args_sep, '') for e in entities]),
            "parser": split_values
        },
        const.ENTITIES: {
            # Every index is converted to string: the related parser results in
            # a list of integers, which `str.join` is unable to handle directly.
            "writer": lambda entity_inds: args_sep.join([str(ind) for ind in entity_inds]),
            "parser": lambda value: process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
        },
        const.TEXT: {
            "writer": keep,
            "parser": lambda value: process_text(value)
        },
        const.LABEL_UINT: {
            "writer": keep,
            "parser": to_int
        }
    }
75
+
76
+
77
def create_base_column_fmt(fmt_type, args_sep=","):
    """ Provides the formatters of a single kind for the base columns.

        fmt_type: str
            key of the formatter to pick for every column
            (the base formatters declare "writer" and "parser").
        args_sep: str
            separator utilized for the list-like values.

        returns: dict
            column name -> formatter function of the requested kind.
    """
    assert(isinstance(fmt_type, str))
    fmts_by_column = create_base_column_value_fmt(args_sep=args_sep)
    return {column: fmts[fmt_type] for column, fmts in fmts_by_column.items()}
@@ -0,0 +1,43 @@
1
class ParsedSampleRow(object):
    """ Provides a parsed information for a sample row.
    """

    def __init__(self, row, columns_fmts, no_value_func):
        """ row: dict
                dict of the pairs ("field_name", value)
            columns_fmts: list
                list of the formatters, where every formatter is a dictionary
                ("field_name" -> parsing function); for every field the first
                formatter that declares it is applied.
            no_value_func: func
                provides the default value that conveys the absence of the parameter-value.
        """
        assert(isinstance(row, dict))
        assert(isinstance(columns_fmts, list))
        assert(callable(no_value_func))

        self.__uint_label = None
        self.__params = {}
        self.__no_value = no_value_func

        for field_name, raw_value in row.items():
            for fmt in columns_fmts:
                assert(isinstance(fmt, dict))
                if field_name in fmt:
                    # Fields that none of the formatters declares are skipped.
                    self.__params[field_name] = fmt[field_name](raw_value)
                    break

    def __value_or_none(self, key):
        if key in self.__params:
            return self.__params[key]
        return self.__no_value()

    def __getitem__(self, item):
        """ Parsed value of the field, or the "no value" default
            when the field is absent or `item` is None.
        """
        assert (isinstance(item, str) or item is None)
        if item is None:
            return self.__no_value()
        return self.__value_or_none(item)

    @classmethod
    def parse(cls, row, columns_fmts, no_value_func):
        """ Alternative constructor with the same arguments as `__init__`.
        """
        return cls(row=row, columns_fmts=columns_fmts, no_value_func=no_value_func)
File without changes
@@ -0,0 +1,109 @@
1
+ import gc
2
+ import logging
3
+
4
+ from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
5
+ from arekit.common.linkage.meta import MetaEmptyLinkedDataWrapper
6
+ from arekit.common.utils import progress_bar_conditional
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class BaseRowsStorage(object):
    """ Base class of the rows storage.

        Declares the storage API (to be implemented by the nested classes)
        and provides the common `fill` procedure on top of it.
    """

    # region protected methods

    def _begin_filling_row(self, row_ind):
        """ Hook that is called by `fill` before the values of the row are set.
            Does nothing by default.
        """
        pass

    # endregion

    # region abstract methods

    def _set_row_value(self, row_ind, column, value):
        raise NotImplementedError()

    def _iter_rows(self):
        """ returns: tuple(int, dict)
                provides the index (int) and the related content of the row (dict)
        """
        raise NotImplementedError()

    def _get_rows_count(self):
        raise NotImplementedError()

    def find_by_value(self, column_name, value):
        raise NotImplementedError()

    def find_first_by_value(self, column_name, value):
        raise NotImplementedError()

    def iter_column_values(self, column_name, dtype=None):
        raise NotImplementedError()

    def get_row(self, row_index):
        raise NotImplementedError()

    def get_cell(self, row_index, column_name):
        raise NotImplementedError()

    def init_empty(self, columns_provider):
        raise NotImplementedError()

    def iter_shuffled(self):
        raise NotImplementedError()

    def iter_column_names(self):
        raise NotImplementedError()

    def iter_column_types(self):
        raise NotImplementedError()

    # endregion

    # region public methods

    def fill(self, iter_rows_func, columns_provider, row_handler=None, rows_count=None, desc=""):
        """ Fills the storage with the rows provided by `iter_rows_func`.

            iter_rows_func: func
                called with a single `False` argument; expected to provide
                an iterable of (doc_id, row_values) pairs, where row_values
                is a dict (column -> value).
            columns_provider: BaseColumnsProvider
            row_handler: func or None
                optional callback without arguments.
            rows_count: int or None
                passed as `total` into the progress bar.
            desc: str
                description of the progress bar.
        """
        assert(callable(iter_rows_func))
        assert(isinstance(columns_provider, BaseColumnsProvider))
        assert(callable(row_handler) or row_handler is None)

        doc_ids_seen = set()

        def postfix_func(item):
            doc_id, _ = item
            doc_ids_seen.add(doc_id)
            return {
                "docs_seen": len(doc_ids_seen),
                "doc_now": str(doc_id)
            }

        pbar_it = progress_bar_conditional(
            iterable=iter_rows_func(False),
            # We skip meta information data.
            condition_func=lambda item: not isinstance(item[1], MetaEmptyLinkedDataWrapper),
            postfix_func=postfix_func,
            desc="{fmt}".format(fmt=desc),
            total=rows_count)

        for row_index, item in enumerate(pbar_it):
            _, row_values = item
            self._begin_filling_row(row_index)
            for column, value in row_values.items():
                self._set_row_value(row_ind=row_index,
                                    column=column,
                                    value=value)
            # NOTE(review): assumed to be invoked once per every filled row;
            # confirm the nesting level against the upstream layout.
            if row_handler is not None:
                row_handler()

    def free(self):
        """ Forces the garbage collection.
        """
        gc.collect()

    # endregion

    # region base methods

    def __iter__(self):
        return self._iter_rows()

    def __len__(self):
        return self._get_rows_count()

    # endregion
File without changes
@@ -0,0 +1,26 @@
1
+ from arekit.common.data import const
2
+ from arekit.common.data.storages.base import BaseRowsStorage
3
+
4
+
5
+ # TODO. This is a particular type of view, and expected to be off the core.
6
class LinkedSamplesStorageView(object):
    """ Groups the rows of the storage by the opinion identifier.
    """

    def iter_from_storage(self, storage):
        """ Iterates over lists of rows, where every list is composed of
            the consecutive rows that share the same value of the
            opinion identifier column.

            storage: BaseRowsStorage
        """
        assert(isinstance(storage, BaseRowsStorage))

        # Marker of the "no opinion has been met yet" state.
        not_set = -1

        group = []
        active_opinion_id = not_set
        opinion_ids = storage.iter_column_values(const.OPINION_ID)

        for row_ind, row_opinion_id in enumerate(opinion_ids):

            if active_opinion_id == not_set:
                active_opinion_id = row_opinion_id
            elif row_opinion_id != active_opinion_id:
                # Opinion has been changed: releasing the collected rows.
                yield group
                group = []
                active_opinion_id = row_opinion_id

            group.append(storage.get_row(row_ind))

        if group:
            yield group
File without changes
@@ -0,0 +1,30 @@
1
class Document(object):
    """ Document, represented by its identifier and the list of sentences.
    """

    def __init__(self, doc_id, sentences):
        """ doc_id: identifier of the document.
            sentences: list
                sentences of the document.
        """
        assert(isinstance(sentences, list))
        self._sentences = sentences
        self.__id = doc_id

    # region properties

    @property
    def ID(self):
        return self.__id

    @property
    def SentencesCount(self):
        """ Provides total amount of sentences within a doc
            At present is useful for:
                - CV-splitters, which may rely on sentences count.
                - Text parsing.
        """
        total = len(self._sentences)
        return total

    # endregion

    def iter_sentences(self):
        """ Iterates over the sentences in the order of their appearance.
        """
        yield from self._sentences

    def get_sentence(self, s_ind):
        """ Provides the sentence by its index.
        """
        return self._sentences[s_ind]
@@ -0,0 +1,16 @@
1
+ from arekit.common.entities.base import Entity
2
+ from arekit.common.pipeline.items.base import BasePipelineItem
3
+
4
+
5
class EntitiesGroupingPipelineItem(BasePipelineItem):
    """ Pipeline item that assigns the group index to every entity of the input.
    """

    def __init__(self, value_to_group_id_func):
        """ value_to_group_id_func: func
                provides the group index by the entity value.
        """
        assert(callable(value_to_group_id_func))
        self.__value_to_group_id_func = value_to_group_id_func

    def apply_core(self, input_data, pipeline_ctx):
        """ input_data: list
                list of terms; only the Entity instances are affected,
                and they are modified in place.
        """
        assert(isinstance(input_data, list))

        for term in input_data:
            if not isinstance(term, Entity):
                continue
            term.set_group_index(self.__value_to_group_id_func(term.Value))
@@ -0,0 +1,18 @@
1
+ from arekit.common.entities.base import Entity
2
+
3
+
4
class DocumentEntity(Entity):
    """ Entity that additionally keeps its identifier within the document.
    """

    def __init__(self, value, display_value, e_type, childs, id_in_doc, group_index):
        """ id_in_doc: Id, utilized within the internal services
        """
        super().__init__(value=value,
                         display_value=display_value,
                         e_type=e_type,
                         childs=childs,
                         group_index=group_index)
        self.__id = id_in_doc

    @property
    def IdInDocument(self):
        """ Identifier passed as `id_in_doc` to the constructor.
        """
        return self.__id