arekit 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. arekit/__init__.py +0 -0
  2. arekit/common/__init__.py +0 -0
  3. arekit/common/bound.py +48 -0
  4. arekit/common/context/__init__.py +0 -0
  5. arekit/common/context/terms_mapper.py +51 -0
  6. arekit/common/context/token.py +16 -0
  7. arekit/common/data/__init__.py +0 -0
  8. arekit/common/data/const.py +21 -0
  9. arekit/common/data/doc_provider.py +6 -0
  10. arekit/common/data/input/__init__.py +0 -0
  11. arekit/common/data/input/providers/__init__.py +0 -0
  12. arekit/common/data/input/providers/columns/__init__.py +0 -0
  13. arekit/common/data/input/providers/columns/base.py +9 -0
  14. arekit/common/data/input/providers/columns/sample.py +59 -0
  15. arekit/common/data/input/providers/const.py +3 -0
  16. arekit/common/data/input/providers/contents.py +9 -0
  17. arekit/common/data/input/providers/instances/__init__.py +0 -0
  18. arekit/common/data/input/providers/instances/base.py +14 -0
  19. arekit/common/data/input/providers/instances/multiple.py +27 -0
  20. arekit/common/data/input/providers/instances/single.py +8 -0
  21. arekit/common/data/input/providers/label/__init__.py +0 -0
  22. arekit/common/data/input/providers/label/base.py +24 -0
  23. arekit/common/data/input/providers/label/binary.py +11 -0
  24. arekit/common/data/input/providers/label/multiple.py +15 -0
  25. arekit/common/data/input/providers/rows/__init__.py +0 -0
  26. arekit/common/data/input/providers/rows/base.py +64 -0
  27. arekit/common/data/input/providers/rows/samples.py +227 -0
  28. arekit/common/data/input/providers/sample/__init__.py +0 -0
  29. arekit/common/data/input/providers/sample/cropped.py +43 -0
  30. arekit/common/data/input/providers/text/__init__.py +0 -0
  31. arekit/common/data/input/providers/text/single.py +49 -0
  32. arekit/common/data/input/repositories/__init__.py +0 -0
  33. arekit/common/data/input/repositories/base.py +68 -0
  34. arekit/common/data/input/repositories/sample.py +22 -0
  35. arekit/common/data/input/sample.py +66 -0
  36. arekit/common/data/input/terms_mapper.py +88 -0
  37. arekit/common/data/rows_fmt.py +82 -0
  38. arekit/common/data/rows_parser.py +43 -0
  39. arekit/common/data/storages/__init__.py +0 -0
  40. arekit/common/data/storages/base.py +109 -0
  41. arekit/common/data/views/__init__.py +0 -0
  42. arekit/common/data/views/samples.py +26 -0
  43. arekit/common/docs/__init__.py +0 -0
  44. arekit/common/docs/base.py +30 -0
  45. arekit/common/docs/entities_grouping.py +16 -0
  46. arekit/common/docs/entity.py +18 -0
  47. arekit/common/docs/objects_parser.py +37 -0
  48. arekit/common/docs/parsed/__init__.py +0 -0
  49. arekit/common/docs/parsed/base.py +101 -0
  50. arekit/common/docs/parsed/providers/__init__.py +0 -0
  51. arekit/common/docs/parsed/providers/base.py +68 -0
  52. arekit/common/docs/parsed/providers/base_pairs.py +51 -0
  53. arekit/common/docs/parsed/providers/entity_service.py +175 -0
  54. arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
  55. arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
  56. arekit/common/docs/parsed/service.py +31 -0
  57. arekit/common/docs/parsed/term_position.py +42 -0
  58. arekit/common/docs/parser.py +34 -0
  59. arekit/common/docs/sentence.py +14 -0
  60. arekit/common/entities/__init__.py +0 -0
  61. arekit/common/entities/base.py +51 -0
  62. arekit/common/entities/collection.py +72 -0
  63. arekit/common/entities/str_fmt.py +8 -0
  64. arekit/common/entities/types.py +9 -0
  65. arekit/common/experiment/__init__.py +0 -0
  66. arekit/common/experiment/api/__init__.py +0 -0
  67. arekit/common/experiment/api/base_samples_io.py +20 -0
  68. arekit/common/experiment/data_type.py +17 -0
  69. arekit/common/frames/__init__.py +0 -0
  70. arekit/common/frames/connotations/__init__.py +0 -0
  71. arekit/common/frames/connotations/descriptor.py +17 -0
  72. arekit/common/frames/connotations/provider.py +4 -0
  73. arekit/common/frames/text_variant.py +43 -0
  74. arekit/common/frames/variants/__init__.py +0 -0
  75. arekit/common/frames/variants/base.py +21 -0
  76. arekit/common/frames/variants/collection.py +60 -0
  77. arekit/common/labels/__init__.py +0 -0
  78. arekit/common/labels/base.py +19 -0
  79. arekit/common/labels/provider/__init__.py +0 -0
  80. arekit/common/labels/provider/base.py +7 -0
  81. arekit/common/labels/provider/constant.py +14 -0
  82. arekit/common/labels/scaler/__init__.py +0 -0
  83. arekit/common/labels/scaler/base.py +85 -0
  84. arekit/common/labels/scaler/sentiment.py +7 -0
  85. arekit/common/labels/scaler/single.py +10 -0
  86. arekit/common/labels/str_fmt.py +55 -0
  87. arekit/common/linkage/__init__.py +0 -0
  88. arekit/common/linkage/base.py +44 -0
  89. arekit/common/linkage/meta.py +23 -0
  90. arekit/common/linkage/opinions.py +9 -0
  91. arekit/common/linkage/text_opinions.py +22 -0
  92. arekit/common/log_utils.py +29 -0
  93. arekit/common/model/__init__.py +0 -0
  94. arekit/common/model/labeling/__init__.py +0 -0
  95. arekit/common/model/labeling/base.py +24 -0
  96. arekit/common/model/labeling/modes.py +8 -0
  97. arekit/common/model/labeling/single.py +24 -0
  98. arekit/common/opinions/__init__.py +0 -0
  99. arekit/common/opinions/annot/__init__.py +0 -0
  100. arekit/common/opinions/annot/algo/__init__.py +0 -0
  101. arekit/common/opinions/annot/algo/base.py +4 -0
  102. arekit/common/opinions/annot/algo/pair_based.py +99 -0
  103. arekit/common/opinions/annot/algo/predefined.py +16 -0
  104. arekit/common/opinions/annot/algo_based.py +55 -0
  105. arekit/common/opinions/annot/base.py +15 -0
  106. arekit/common/opinions/base.py +74 -0
  107. arekit/common/opinions/collection.py +150 -0
  108. arekit/common/opinions/enums.py +6 -0
  109. arekit/common/opinions/provider.py +4 -0
  110. arekit/common/opinions/writer.py +4 -0
  111. arekit/common/pipeline/__init__.py +0 -0
  112. arekit/common/pipeline/base.py +25 -0
  113. arekit/common/pipeline/context.py +36 -0
  114. arekit/common/pipeline/conts.py +2 -0
  115. arekit/common/pipeline/items/__init__.py +0 -0
  116. arekit/common/pipeline/items/base.py +12 -0
  117. arekit/common/pipeline/items/flatten.py +14 -0
  118. arekit/common/pipeline/items/handle.py +17 -0
  119. arekit/common/pipeline/items/iter.py +11 -0
  120. arekit/common/pipeline/items/map.py +11 -0
  121. arekit/common/pipeline/items/map_nested.py +13 -0
  122. arekit/common/synonyms/__init__.py +0 -0
  123. arekit/common/synonyms/base.py +151 -0
  124. arekit/common/synonyms/grouping.py +21 -0
  125. arekit/common/text/__init__.py +0 -0
  126. arekit/common/text/enums.py +12 -0
  127. arekit/common/text/parsed.py +42 -0
  128. arekit/common/text/parser.py +12 -0
  129. arekit/common/text/partitioning/__init__.py +0 -0
  130. arekit/common/text/partitioning/base.py +4 -0
  131. arekit/common/text/partitioning/str.py +36 -0
  132. arekit/common/text/partitioning/terms.py +35 -0
  133. arekit/common/text/stemmer.py +16 -0
  134. arekit/common/text_opinions/__init__.py +0 -0
  135. arekit/common/text_opinions/base.py +105 -0
  136. arekit/common/utils.py +129 -0
  137. arekit/contrib/__init__.py +0 -0
  138. arekit/contrib/bert/__init__.py +0 -0
  139. arekit/contrib/bert/input/__init__.py +0 -0
  140. arekit/contrib/bert/input/providers/__init__.py +0 -0
  141. arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
  142. arekit/contrib/bert/input/providers/text_pair.py +62 -0
  143. arekit/contrib/bert/terms/__init__.py +0 -0
  144. arekit/contrib/bert/terms/mapper.py +20 -0
  145. arekit/contrib/networks/__init__.py +0 -0
  146. arekit/contrib/networks/embedding.py +149 -0
  147. arekit/contrib/networks/embedding_io.py +18 -0
  148. arekit/contrib/networks/input/__init__.py +0 -0
  149. arekit/contrib/networks/input/const.py +6 -0
  150. arekit/contrib/networks/input/ctx_serialization.py +28 -0
  151. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  152. arekit/contrib/networks/input/embedding/matrix.py +29 -0
  153. arekit/contrib/networks/input/embedding/offsets.py +55 -0
  154. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  155. arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
  156. arekit/contrib/networks/input/providers/__init__.py +0 -0
  157. arekit/contrib/networks/input/providers/sample.py +129 -0
  158. arekit/contrib/networks/input/providers/term_connotation.py +23 -0
  159. arekit/contrib/networks/input/providers/text.py +24 -0
  160. arekit/contrib/networks/input/rows_parser.py +47 -0
  161. arekit/contrib/networks/input/term_types.py +13 -0
  162. arekit/contrib/networks/input/terms_mapping.py +60 -0
  163. arekit/contrib/networks/vectorizer.py +6 -0
  164. arekit/contrib/prompt/__init__.py +0 -0
  165. arekit/contrib/prompt/sample.py +61 -0
  166. arekit/contrib/source/__init__.py +0 -0
  167. arekit/contrib/source/brat/__init__.py +0 -0
  168. arekit/contrib/source/brat/annot.py +84 -0
  169. arekit/contrib/source/brat/doc.py +28 -0
  170. arekit/contrib/source/brat/entities/__init__.py +0 -0
  171. arekit/contrib/source/brat/entities/compound.py +13 -0
  172. arekit/contrib/source/brat/entities/entity.py +42 -0
  173. arekit/contrib/source/brat/entities/parser.py +53 -0
  174. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  175. arekit/contrib/source/brat/opinions/converter.py +19 -0
  176. arekit/contrib/source/brat/relation.py +32 -0
  177. arekit/contrib/source/brat/sentence.py +69 -0
  178. arekit/contrib/source/brat/sentences_reader.py +128 -0
  179. arekit/contrib/source/download.py +41 -0
  180. arekit/contrib/source/nerel/__init__.py +0 -0
  181. arekit/contrib/source/nerel/entities.py +55 -0
  182. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  183. arekit/contrib/source/nerel/folding/fixed.py +74 -0
  184. arekit/contrib/source/nerel/io_utils.py +62 -0
  185. arekit/contrib/source/nerel/labels.py +241 -0
  186. arekit/contrib/source/nerel/reader.py +46 -0
  187. arekit/contrib/source/nerel/utils.py +24 -0
  188. arekit/contrib/source/nerel/versions.py +12 -0
  189. arekit/contrib/source/nerelbio/__init__.py +0 -0
  190. arekit/contrib/source/nerelbio/io_utils.py +62 -0
  191. arekit/contrib/source/nerelbio/labels.py +265 -0
  192. arekit/contrib/source/nerelbio/reader.py +8 -0
  193. arekit/contrib/source/nerelbio/versions.py +8 -0
  194. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  195. arekit/contrib/source/ruattitudes/collection.py +36 -0
  196. arekit/contrib/source/ruattitudes/doc.py +51 -0
  197. arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
  198. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  199. arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
  200. arekit/contrib/source/ruattitudes/io_utils.py +56 -0
  201. arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
  202. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  203. arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
  204. arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
  205. arekit/contrib/source/ruattitudes/reader.py +268 -0
  206. arekit/contrib/source/ruattitudes/sentence.py +73 -0
  207. arekit/contrib/source/ruattitudes/synonyms.py +17 -0
  208. arekit/contrib/source/ruattitudes/text_object.py +59 -0
  209. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  210. arekit/contrib/source/rusentiframes/collection.py +157 -0
  211. arekit/contrib/source/rusentiframes/effect.py +24 -0
  212. arekit/contrib/source/rusentiframes/io_utils.py +19 -0
  213. arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
  214. arekit/contrib/source/rusentiframes/polarity.py +35 -0
  215. arekit/contrib/source/rusentiframes/role.py +15 -0
  216. arekit/contrib/source/rusentiframes/state.py +24 -0
  217. arekit/contrib/source/rusentiframes/types.py +42 -0
  218. arekit/contrib/source/rusentiframes/value.py +2 -0
  219. arekit/contrib/source/rusentrel/__init__.py +0 -0
  220. arekit/contrib/source/rusentrel/const.py +3 -0
  221. arekit/contrib/source/rusentrel/docs_reader.py +51 -0
  222. arekit/contrib/source/rusentrel/entities.py +26 -0
  223. arekit/contrib/source/rusentrel/io_utils.py +125 -0
  224. arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
  225. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  226. arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
  227. arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
  228. arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
  229. arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
  230. arekit/contrib/source/rusentrel/synonyms.py +17 -0
  231. arekit/contrib/source/sentinerel/__init__.py +0 -0
  232. arekit/contrib/source/sentinerel/entities.py +52 -0
  233. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  234. arekit/contrib/source/sentinerel/folding/factory.py +31 -0
  235. arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
  236. arekit/contrib/source/sentinerel/io_utils.py +87 -0
  237. arekit/contrib/source/sentinerel/labels.py +53 -0
  238. arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
  239. arekit/contrib/source/sentinerel/reader.py +42 -0
  240. arekit/contrib/source/synonyms/__init__.py +0 -0
  241. arekit/contrib/source/synonyms/utils.py +19 -0
  242. arekit/contrib/source/zip_utils.py +47 -0
  243. arekit/contrib/utils/__init__.py +0 -0
  244. arekit/contrib/utils/bert/__init__.py +0 -0
  245. arekit/contrib/utils/bert/samplers.py +17 -0
  246. arekit/contrib/utils/connotations/__init__.py +0 -0
  247. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
  248. arekit/contrib/utils/data/__init__.py +0 -0
  249. arekit/contrib/utils/data/contents/__init__.py +0 -0
  250. arekit/contrib/utils/data/contents/opinions.py +37 -0
  251. arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
  252. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  253. arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
  254. arekit/contrib/utils/data/readers/__init__.py +0 -0
  255. arekit/contrib/utils/data/readers/base.py +7 -0
  256. arekit/contrib/utils/data/readers/csv_pd.py +38 -0
  257. arekit/contrib/utils/data/readers/jsonl.py +15 -0
  258. arekit/contrib/utils/data/service/__init__.py +0 -0
  259. arekit/contrib/utils/data/service/balance.py +50 -0
  260. arekit/contrib/utils/data/storages/__init__.py +0 -0
  261. arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
  262. arekit/contrib/utils/data/storages/pandas_based.py +123 -0
  263. arekit/contrib/utils/data/storages/row_cache.py +48 -0
  264. arekit/contrib/utils/data/writers/__init__.py +0 -0
  265. arekit/contrib/utils/data/writers/base.py +27 -0
  266. arekit/contrib/utils/data/writers/csv_native.py +63 -0
  267. arekit/contrib/utils/data/writers/csv_pd.py +40 -0
  268. arekit/contrib/utils/data/writers/json_opennre.py +132 -0
  269. arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
  270. arekit/contrib/utils/download.py +77 -0
  271. arekit/contrib/utils/embeddings/__init__.py +0 -0
  272. arekit/contrib/utils/embeddings/rusvectores.py +58 -0
  273. arekit/contrib/utils/embeddings/tokens.py +30 -0
  274. arekit/contrib/utils/entities/__init__.py +0 -0
  275. arekit/contrib/utils/entities/filter.py +7 -0
  276. arekit/contrib/utils/entities/formatters/__init__.py +0 -0
  277. arekit/contrib/utils/entities/formatters/str_display.py +11 -0
  278. arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
  279. arekit/contrib/utils/io_utils/__init__.py +0 -0
  280. arekit/contrib/utils/io_utils/embedding.py +72 -0
  281. arekit/contrib/utils/io_utils/opinions.py +37 -0
  282. arekit/contrib/utils/io_utils/samples.py +79 -0
  283. arekit/contrib/utils/io_utils/utils.py +39 -0
  284. arekit/contrib/utils/lexicons/__init__.py +0 -0
  285. arekit/contrib/utils/lexicons/lexicon.py +41 -0
  286. arekit/contrib/utils/lexicons/relation.py +42 -0
  287. arekit/contrib/utils/lexicons/rusentilex.py +37 -0
  288. arekit/contrib/utils/nn/__init__.py +0 -0
  289. arekit/contrib/utils/nn/rows.py +83 -0
  290. arekit/contrib/utils/np_utils/__init__.py +0 -0
  291. arekit/contrib/utils/np_utils/embedding.py +22 -0
  292. arekit/contrib/utils/np_utils/npz_utils.py +13 -0
  293. arekit/contrib/utils/np_utils/vocab.py +20 -0
  294. arekit/contrib/utils/pipelines/__init__.py +0 -0
  295. arekit/contrib/utils/pipelines/items/__init__.py +0 -0
  296. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  297. arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
  298. arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
  299. arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
  300. arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
  301. arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
  302. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
  303. arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
  304. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
  305. arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
  306. arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
  307. arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
  308. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  309. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  310. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
  311. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
  312. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
  313. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  314. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
  315. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
  316. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
  317. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  318. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
  319. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
  320. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
  321. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  322. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
  323. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
  324. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  325. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
  326. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
  327. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
  328. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
  329. arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
  330. arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
  331. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
  332. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
  333. arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
  334. arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
  335. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
  336. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
  337. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
  338. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
  339. arekit/contrib/utils/processing/__init__.py +0 -0
  340. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  341. arekit/contrib/utils/processing/languages/mods.py +12 -0
  342. arekit/contrib/utils/processing/languages/pos.py +23 -0
  343. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  344. arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
  345. arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
  346. arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
  347. arekit/contrib/utils/processing/languages/ru/number.py +23 -0
  348. arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
  349. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  350. arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
  351. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  352. arekit/contrib/utils/processing/pos/base.py +12 -0
  353. arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
  354. arekit/contrib/utils/processing/pos/russian.py +10 -0
  355. arekit/contrib/utils/processing/text/__init__.py +0 -0
  356. arekit/contrib/utils/processing/text/tokens.py +127 -0
  357. arekit/contrib/utils/resources.py +25 -0
  358. arekit/contrib/utils/serializer.py +43 -0
  359. arekit/contrib/utils/sources/__init__.py +0 -0
  360. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  361. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  362. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
  363. arekit/contrib/utils/synonyms/__init__.py +0 -0
  364. arekit/contrib/utils/synonyms/simple.py +15 -0
  365. arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
  366. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  367. arekit/contrib/utils/vectorizers/bpe.py +93 -0
  368. arekit/contrib/utils/vectorizers/random_norm.py +39 -0
  369. arekit/download_data.py +11 -0
  370. arekit-0.24.0.dist-info/LICENSE +21 -0
  371. arekit-0.24.0.dist-info/METADATA +23 -0
  372. arekit-0.24.0.dist-info/RECORD +374 -0
  373. arekit-0.24.0.dist-info/WHEEL +5 -0
  374. arekit-0.24.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,37 @@
1
+ from arekit.common.pipeline.items.base import BasePipelineItem
2
+ from arekit.common.text.partitioning.base import BasePartitioning
3
+ from arekit.common.pipeline.context import PipelineContext
4
+
5
+
6
class SentenceObjectsParserPipelineItem(BasePipelineItem):
    """ Pipeline item that splits a text into parts by means of the given partitioning.

        Subclasses are expected to implement `_get_parts_provider_func` and may
        override `_get_text` to take the text from the pipeline context instead
        of the pipeline input.
    """

    def __init__(self, partitioning):
        """ partitioning: BasePartitioning
                instance used by `apply_core` to compose the result.
        """
        assert isinstance(partitioning, BasePartitioning)
        self.__partitioning = partitioning

    # region protected

    def _get_text(self, pipeline_ctx):
        """ Hook for providing the text from outside of the pipeline input.
            Returning None (default) makes `apply_core` use its `input_data`.
        """
        return None

    def _get_parts_provider_func(self, input_data, pipeline_ctx):
        """ Must be implemented by subclasses; the result is passed
            as `parts_it` into the partitioning.
        """
        raise NotImplementedError()

    # endregion

    def apply_core(self, input_data, pipeline_ctx):
        """ Selects the text to process (external one, if provided, otherwise
            `input_data`) and returns the result of partitioning for it.
        """
        assert isinstance(pipeline_ctx, PipelineContext)

        # External text, when available, takes priority over the pipeline input.
        text = self._get_text(pipeline_ctx)
        if text is None:
            text = input_data

        parts = self._get_parts_provider_func(input_data=text, pipeline_ctx=pipeline_ctx)
        return self.__partitioning.provide(text=text, parts_it=parts)

    # region base

    def __enter__(self):
        # No resources to acquire: the item itself is the managed object.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; returning None does not suppress exceptions.
        return None

    # endregion
File without changes
@@ -0,0 +1,101 @@
1
+ from collections.abc import Iterable
2
+
3
+ from arekit.common.entities.base import Entity
4
+ from arekit.common.text.enums import TermFormat
5
+ from arekit.common.text.parsed import BaseParsedText
6
+
7
+
8
class ParsedDocument(object):
    """ In-memory container of the parsed sentences of a single document.

        Provides:
            - access to a sentence by its index;
            - iteration over sentences;
            - iteration over raw terms of the whole document or of a single
              sentence, optionally filtered (for example, entities only).

        Limitations:
            all the sentences are kept IN MEMORY (stored as a list).
    """

    def __init__(self, doc_id, parsed_sentences):
        """ doc_id: identifier of the related document.
            parsed_sentences: iterable of parsed sentences;
                it is materialized into a list by the constructor.
        """
        assert isinstance(parsed_sentences, Iterable)

        self.__parsed_sentences = list(parsed_sentences)
        self.__doc_id = doc_id

    # region properties

    @property
    def RelatedDocID(self):
        return self.__doc_id

    # endregion

    # region private methods

    def __iter_all_raw_terms(self, filter_func=None, term_only=False):
        """ Iterates raw terms of every sentence, in order of sentences.
            Yields terms when `term_only` is set, otherwise tuples of
            (sentence index, term index in sentence, term).
        """
        assert filter_func is None or callable(filter_func)
        assert isinstance(term_only, bool)

        for sentence_ind, parsed_sentence in enumerate(self.__parsed_sentences):
            indexed_terms = self.__iter_sentence_raw_terms(parsed_sentence, filter_func=filter_func)
            for term_ind, raw_term in indexed_terms:
                yield raw_term if term_only else (sentence_ind, term_ind, raw_term)

    @staticmethod
    def __iter_sentence_raw_terms(sentence, filter_func):
        """ Yields (term index in sentence, term) pairs for the raw terms of
            the sentence, skipping terms rejected by `filter_func` (if given).
            Indices refer to positions before filtering.
        """
        assert isinstance(sentence, BaseParsedText)
        assert filter_func is None or callable(filter_func)

        for term_ind, raw_term in enumerate(sentence.iter_terms(TermFormat.Raw)):
            if filter_func is None or filter_func(raw_term):
                yield term_ind, raw_term

    # endregion

    # region public 'iter' methods

    def get_sentence(self, s_ind):
        """ Returns the parsed sentence stored at the `s_ind` position.
        """
        assert isinstance(s_ind, int)
        return self.__parsed_sentences[s_ind]

    def iter_entities(self):
        """ Iterates over the raw terms of the document which are Entity instances.
        """
        def is_entity(term):
            return isinstance(term, Entity)

        yield from self.__iter_all_raw_terms(term_only=True, filter_func=is_entity)

    def iter_terms(self, filter_func=None, term_only=True):
        """ Iterates over raw terms of the whole document.
            filter_func: optional predicate, terms it rejects are skipped.
            term_only: when False, yields
                (sentence index, term index in sentence, term) tuples.
        """
        yield from self.__iter_all_raw_terms(term_only=term_only, filter_func=filter_func)

    def iter_sentence_terms(self, sentence_index, return_id, filter_func=None):
        """ Iterates over raw terms of the sentence with the given index.
            return_id: when set, yields (term index in sentence, term) pairs
                instead of terms.
        """
        assert isinstance(sentence_index, int)
        assert isinstance(return_id, bool)
        assert filter_func is None or callable(filter_func)

        target_sentence = self.__parsed_sentences[sentence_index]
        indexed_terms = self.__iter_sentence_raw_terms(sentence=target_sentence,
                                                       filter_func=filter_func)

        for term_ind, raw_term in indexed_terms:
            yield (term_ind, raw_term) if return_id else raw_term

    # endregion

    def __iter__(self):
        # Iterating the document means iterating its parsed sentences.
        yield from self.__parsed_sentences
File without changes
@@ -0,0 +1,68 @@
1
+ from arekit.common.entities.base import Entity
2
+ from arekit.common.docs.entity import DocumentEntity
3
+ from arekit.common.docs.parsed.base import ParsedDocument
4
+
5
+
6
class BaseParsedDocumentServiceProvider(object):
    """ Base class of the services that operate on entities of a parsed document.

        On `init_parsed_doc` every entity of the document (together with its
        childs) is wrapped into DocumentEntity and stored in `_doc_entities`;
        identifiers are assigned sequentially, so the identifier of a document
        entity equals its position in `_doc_entities`.
    """

    def __init__(self, entity_index_func=None):
        """ Outside entity indexing function
            entity_index_func: provides id for a given entity, i.e.
                func(entity) -> int (id)
                When None, no entity -> DocumentEntity mapping is kept.
        """
        assert(callable(entity_index_func) or entity_index_func is None)
        self._doc_entities = None
        self.__entity_map = {}
        self.__entity_index_func = entity_index_func

    @property
    def Name(self):
        raise NotImplementedError()

    def init_parsed_doc(self, parsed_doc):
        """ (Re)builds the list of document entities, and the entity mapping
            when the indexing function is provided, for the given parsed document.
        """
        assert(isinstance(parsed_doc, ParsedDocument))

        def iter_childs_and_root_node(entity):
            """ Note: Entity has childs and we would like to iterate over childs
                to consider them as well as keep the root Node.
            """
            # We first add childs.
            for child_entity in entity.iter_childs():
                yield child_entity, True

            # Return Root node.
            yield entity, False

        self._doc_entities = []
        self.__entity_map.clear()

        current_id = 0
        for entity in parsed_doc.iter_entities():

            # Filled while iterating childs; the root (yielded last) receives
            # this list once it is complete.
            child_doc_entities = []
            for tree_entity, is_child in iter_childs_and_root_node(entity):

                doc_entity = DocumentEntity(id_in_doc=current_id,
                                            value=tree_entity.Value,
                                            e_type=tree_entity.Type,
                                            display_value=tree_entity.DisplayValue,
                                            childs=None if is_child else child_doc_entities,
                                            group_index=tree_entity.GroupIndex)
                current_id += 1

                if is_child:
                    child_doc_entities.append(doc_entity)

                self._doc_entities.append(doc_entity)

                if self.__entity_index_func is not None:
                    self.__entity_map[self.__entity_index_func(tree_entity)] = doc_entity

    def get_document_entity(self, entity):
        """ Maps entity to the related one with DocumentEntity type

            Raises KeyError when the entity is not registered, including the
            case of a provider created without the indexing function
            (nothing could have been registered then).
        """
        assert(isinstance(entity, Entity))

        if self.__entity_index_func is None:
            # Without the indexing function the mapping is always empty;
            # report it as a missing key rather than failing on calling None.
            raise KeyError(entity)

        return self.__entity_map[self.__entity_index_func(entity)]

    def contains_entity(self, entity):
        """ Returns True when the entity has the related DocumentEntity.
            Always False for a provider created without the indexing function.
        """
        if self.__entity_index_func is None:
            return False

        return self.__entity_index_func(entity) in self.__entity_map
@@ -0,0 +1,51 @@
1
+ from arekit.common.labels.provider.base import BasePairLabelProvider
2
+ from arekit.common.docs.parsed.providers.base import BaseParsedDocumentServiceProvider
3
+
4
+
5
class BasePairProvider(BaseParsedDocumentServiceProvider):
    """ Base provider of labeled pairs composed from the document entities.
        Subclasses define the pair type by implementing `_create_pair`.
    """

    @property
    def Name(self):
        raise NotImplementedError()

    def _create_pair(self, source_entity, target_entity, label):
        """ Must be implemented by subclasses: composes the pair instance.
        """
        raise NotImplementedError()

    # region private methods

    def _iter_from_entities(self, src_entity_doc_ids, tgt_entity_doc_ids, label_provider, filter_func=None):
        """ Yields pairs for every (source, target) combination of the given
            document entity ids, skipping combinations rejected by
            `filter_func` (if given) and those with equal source and target.
        """
        assert isinstance(src_entity_doc_ids, list)
        assert isinstance(tgt_entity_doc_ids, list)
        assert isinstance(label_provider, BasePairLabelProvider)
        assert filter_func is None or callable(filter_func)

        for src_id in src_entity_doc_ids:
            for tgt_id in tgt_entity_doc_ids:
                assert isinstance(src_id, int)
                assert isinstance(tgt_id, int)

                # Document ids serve as positions in the list of document entities.
                src = self._doc_entities[src_id]
                tgt = self._doc_entities[tgt_id]

                # The filter is consulted first; equality is checked only
                # for the pairs which passed it.
                rejected = filter_func is not None and not filter_func(src, tgt)
                if rejected or src == tgt:
                    continue

                yield self._create_pair(
                    source_entity=src,
                    target_entity=tgt,
                    label=label_provider.provide(source=src, target=tgt))

    # endregion

    def iter_from_all(self, label_provider, filter_func):
        """ Iterates pairs composed from all the document entities,
            each of them acting both as a source and as a target.
        """
        assert isinstance(label_provider, BasePairLabelProvider)

        def all_doc_ids():
            return [doc_entity.IdInDocument for doc_entity in self._doc_entities]

        return self._iter_from_entities(src_entity_doc_ids=all_doc_ids(),
                                        tgt_entity_doc_ids=all_doc_ids(),
                                        label_provider=label_provider,
                                        filter_func=filter_func)
@@ -0,0 +1,175 @@
1
+ from enum import Enum
2
+
3
+ from arekit.common.entities.base import Entity
4
+ from arekit.common.docs.entity import DocumentEntity
5
+ from arekit.common.docs.parsed.base import ParsedDocument
6
+ from arekit.common.docs.parsed.providers.base import BaseParsedDocumentServiceProvider
7
+ from arekit.common.docs.parsed.term_position import TermPositionTypes, TermPosition
8
+ from arekit.common.text_opinions.base import TextOpinion
9
+
10
+
11
class EntityEndType(Enum):
    """ Denotes which end of a pair is addressed.
    """
    # Source end of a pair.
    Source = 1
    # Target end of a pair.
    Target = 2
16
+
17
+
18
class DistanceType(Enum):
    """ Units in which a distance is measured.
    """
    InTerms = 1
    InSentences = 2

    @staticmethod
    def to_position_type(dist_type):
        """ Converts the distance type into the related TermPositionTypes value.
        """
        assert(isinstance(dist_type, DistanceType))

        # The table is created here, since a dict declared in the body
        # of the enumeration would be treated as one more member.
        position_types = {
            DistanceType.InTerms: TermPositionTypes.IndexInDocument,
            DistanceType.InSentences: TermPositionTypes.SentenceIndex,
        }

        return position_types.get(dist_type)
31
+
32
+
33
class EntityServiceProvider(BaseParsedDocumentServiceProvider):
    """ Helper service over a parsed document that resolves values and positions
        of the entities which the ends of text opinions (TextOpinion) refer to.

        Methods with the 'extract' prefix accept a text opinion and the type of its end;
        methods with the 'calc' prefix measure distances between entity positions.

        Entity positions are calculated once, when the parsed document is attached
        (see `init_parsed_doc`).
    """

    NAME = "entity-service-provider"

    def __init__(self, entity_index_func):
        assert(callable(entity_index_func))
        super(EntityServiceProvider, self).__init__(entity_index_func=entity_index_func)
        # Both fields are assigned in `init_parsed_doc`.
        self.__iter_raw_terms_func = None
        self.__entity_positions = None

    @property
    def Name(self):
        return self.NAME

    def init_parsed_doc(self, parsed_doc):
        """ Attaches the parsed document and calculates positions of its entities.
        """
        super(EntityServiceProvider, self).init_parsed_doc(parsed_doc)
        assert(isinstance(parsed_doc, ParsedDocument))

        def iter_raw_terms():
            return parsed_doc.iter_terms(filter_func=None, term_only=False)

        self.__iter_raw_terms_func = iter_raw_terms
        self.__entity_positions = self.__calculate_entity_positions()

    # region public 'extract' methods

    def extract_entity_value(self, text_opinion, end_type):
        """ returns: value of the entity at the given end of the text opinion.
        """
        assert(isinstance(text_opinion, TextOpinion))
        entity_id = self.__get_end_id(text_opinion=text_opinion, end_type=end_type)
        return self.get_entity_value(entity_id)

    def extract_entity_position(self, text_opinion, end_type, position_type=None):
        """ returns: position of the entity at the given end of the text opinion,
            in the same form as `get_entity_position` provides.
        """
        assert(isinstance(text_opinion, TextOpinion))
        entity_id = self.__get_end_id(text_opinion=text_opinion, end_type=end_type)
        return self.get_entity_position(entity_id, position_type)

    # endregion

    # region public 'calculate' methods

    @staticmethod
    def calc_dist_between_text_opinion_end_indices(pos1_ind, pos2_ind):
        """ returns: absolute difference between two indices.
        """
        return EntityServiceProvider.__calc_distance_by_inds(pos1_ind=pos1_ind, pos2_ind=pos2_ind)

    def calc_dist_between_text_opinion_ends(self, text_opinion, distance_type):
        """ returns: distance between the source and target ends of the text opinion,
            measured according to the distance type.
        """
        assert(isinstance(text_opinion, TextOpinion))
        assert(isinstance(distance_type, DistanceType))

        source_id = self.__get_end_id(text_opinion=text_opinion, end_type=EntityEndType.Source)
        target_id = self.__get_end_id(text_opinion=text_opinion, end_type=EntityEndType.Target)

        source_position = self.get_entity_position(id_in_document=source_id)
        target_position = self.get_entity_position(id_in_document=target_id)

        return self.__calc_distance(pos1=source_position,
                                    pos2=target_position,
                                    position_type=DistanceType.to_position_type(distance_type))

    def calc_dist_between_entities(self, e1, e2, distance_type):
        """ returns: distance between two document entities,
            measured according to the distance type.
        """
        assert(isinstance(e1, DocumentEntity))
        assert(isinstance(e2, DocumentEntity))
        assert(isinstance(distance_type, DistanceType))

        first_position = self.get_entity_position(e1.IdInDocument)
        second_position = self.get_entity_position(e2.IdInDocument)

        return self.__calc_distance(pos1=first_position,
                                    pos2=second_position,
                                    position_type=DistanceType.to_position_type(distance_type))

    def get_entity_position(self, id_in_document, position_type=None):
        """ returns: TermPosition when the position type is not specified,
            otherwise the index of the requested type.
        """
        assert(position_type is None or isinstance(position_type, TermPositionTypes))

        position = self.__entity_positions[id_in_document]
        assert(isinstance(position, TermPosition))

        return position if position_type is None else position.get_index(position_type)

    def get_entity_value(self, id_in_document):
        """ returns: value of the entity with the given id in document.
        """
        found = self._doc_entities[id_in_document]
        assert(isinstance(found, Entity))
        return found.Value

    # endregion

    # region private methods

    def __calc_distance(self, pos1, pos2, position_type=TermPositionTypes.IndexInDocument):
        assert(isinstance(pos1, TermPosition))
        assert(isinstance(pos2, TermPosition))
        first_ind = pos1.get_index(position_type)
        second_ind = pos2.get_index(position_type)
        return self.__calc_distance_by_inds(pos1_ind=first_ind, pos2_ind=second_ind)

    @staticmethod
    def __calc_distance_by_inds(pos1_ind, pos2_ind):
        return abs(pos1_ind - pos2_ind)

    @staticmethod
    def __get_end_id(text_opinion, end_type):
        assert(isinstance(text_opinion, TextOpinion))
        assert(end_type == EntityEndType.Source or end_type == EntityEndType.Target)

        if end_type == EntityEndType.Source:
            return text_opinion.SourceId

        return text_opinion.TargetId

    def __calculate_entity_positions(self):
        """ Walks through all the raw terms of the document and registers a position
            for every entity, keyed by the entity id in document.
        """
        found_positions = {}

        # The index in document counts every raw term, not only the entities.
        for t_ind_in_doc, (s_ind, t_ind_in_sent, term) in enumerate(self.__iter_raw_terms_func()):

            if not isinstance(term, Entity):
                continue

            # All the entities of a single tree share the position of the tree root term.
            related_entities = list(term.iter_childs()) + [term]

            for related_entity in related_entities:
                entity_id = self.get_document_entity(related_entity).IdInDocument
                assert(entity_id not in found_positions)

                found_positions[entity_id] = TermPosition(term_ind_in_doc=t_ind_in_doc,
                                                          term_ind_in_sent=t_ind_in_sent,
                                                          s_ind=s_ind)

        return found_positions

    # endregion
@@ -0,0 +1,20 @@
1
+ from arekit.common.entities.base import Entity
2
+ from arekit.common.docs.parsed.providers.base_pairs import BasePairProvider
3
+ from arekit.common.opinions.base import Opinion
4
+
5
+
6
class OpinionPairsProvider(BasePairProvider):
    """ Pair provider which represents every pair as a document-level Opinion,
        composed from the values of the source and target entities.
    """

    NAME = "opinion-pairs-provider"

    @property
    def Name(self):
        return self.NAME

    def _create_pair(self, source_entity, target_entity, label):
        """ returns: Opinion built from the entity values and the label.
        """
        assert(isinstance(source_entity, Entity))
        assert(isinstance(target_entity, Entity))

        source_value = source_entity.Value
        target_value = target_entity.Value

        return Opinion(source_value=source_value, target_value=target_value, label=label)
@@ -0,0 +1,78 @@
1
+ import logging
2
+
3
+ from arekit.common.entities.collection import EntityCollection
4
+ from arekit.common.docs.entity import DocumentEntity
5
+ from arekit.common.docs.parsed.providers.base_pairs import BasePairProvider
6
+ from arekit.common.opinions.base import Opinion
7
+ from arekit.common.text_opinions.base import TextOpinion
8
+ from arekit.common.labels.provider.constant import ConstantLabelProvider
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class TextOpinionPairsProvider(BasePairProvider):
    """ Pair provider which represents every pair as a TextOpinion
        related to the attached parsed document.
    """

    NAME = "text-opinion-pairs-provider"

    def __init__(self, value_to_group_id_func):
        super(TextOpinionPairsProvider, self).__init__()
        self.__value_to_group_id_func = value_to_group_id_func
        # Both fields are assigned once a parsed document is attached (see `init_parsed_doc`).
        self.__doc_id = None
        self.__entities_collection = None

    @property
    def Name(self):
        return self.NAME

    def _create_pair(self, source_entity, target_entity, label):
        """ returns: TextOpinion which refers to the entities by their ids in document;
            the id of the text opinion itself is left unassigned (None).
        """
        assert(isinstance(source_entity, DocumentEntity))
        assert(isinstance(target_entity, DocumentEntity))

        source_id = source_entity.IdInDocument
        target_id = target_entity.IdInDocument

        return TextOpinion(doc_id=self.__doc_id,
                           source_id=source_id,
                           target_id=target_id,
                           label=label,
                           text_opinion_id=None)

    def init_parsed_doc(self, parsed_doc):
        """ Attaches the parsed document: keeps the id of the related document
            and composes the collection of the document entities.
        """
        super(TextOpinionPairsProvider, self).init_parsed_doc(parsed_doc)
        self.__doc_id = parsed_doc.RelatedDocID
        doc_entities = list(self._doc_entities)
        self.__entities_collection = EntityCollection(entities=doc_entities,
                                                      value_to_group_id_func=self.__value_to_group_id_func)

    def iter_from_opinion(self, opinion, debug=False):
        """ Converts a document-level opinion (Opinion instance) into text-level opinions
            of the attached document, using the collection of the document entities.

            Entities are looked up by synonyms of the opinion source and target values.
            Nothing is yielded when entities for either of the opinion ends are not found;
            with `debug` enabled, such a case is reported into the log.
        """
        assert(isinstance(opinion, Opinion))

        group_key = EntityCollection.KeyType.BY_SYNONYMS
        collection = self.__entities_collection
        sources = collection.try_get_entities(opinion.SourceValue, group_key=group_key)
        targets = collection.try_get_entities(opinion.TargetValue, group_key=group_key)

        if sources is None:
            if debug:
                message = "Appropriate entity for '{}'->'...' has not been found".format(opinion.SourceValue)
                logger.info(message)
            return

        if targets is None:
            if debug:
                message = "Appropriate entity for '...'->'{}' has not been found".format(opinion.TargetValue)
                logger.info(message)
            return

        # Every resulting pair receives the label of the original opinion.
        constant_label_provider = ConstantLabelProvider(label_instance=opinion.Label)

        source_ids = [entity.IdInDocument for entity in sources]
        target_ids = [entity.IdInDocument for entity in targets]

        text_opinions_it = self._iter_from_entities(src_entity_doc_ids=source_ids,
                                                    tgt_entity_doc_ids=target_ids,
                                                    label_provider=constant_label_provider)

        for text_opinion in text_opinions_it:
            yield text_opinion
@@ -0,0 +1,31 @@
1
+ from arekit.common.docs.parsed.base import ParsedDocument
2
+ from arekit.common.docs.parsed.providers.base import BaseParsedDocumentServiceProvider
3
+
4
+
5
class ParsedDocumentService(object):
    """ Couples a parsed document with a set of service providers,
        each of which is accessible by its name.
    """

    def __init__(self, parsed_doc, providers):
        """ parsed_doc: ParsedDocument
            providers: list of BaseParsedDocumentServiceProvider
                names of the providers are expected to be unique.
        """
        assert(isinstance(parsed_doc, ParsedDocument))
        assert(isinstance(providers, list))

        self.__parsed_doc = parsed_doc
        self.__providers = {}

        registered = self.__providers
        for service_provider in providers:
            assert(isinstance(service_provider, BaseParsedDocumentServiceProvider))
            assert(service_provider.Name not in registered)

            # Register the provider under its name first ...
            registered[service_provider.Name] = service_provider

            # ... and only then attach the parsed document to it.
            service_provider.init_parsed_doc(parsed_doc)

    @property
    def ParsedDocument(self):
        """ returns: the parsed document the service has been created for.
        """
        return self.__parsed_doc

    def get_provider(self, name):
        """ returns: provider registered under the given name;
            a KeyError is raised for an unknown name.
        """
        return self.__providers[name]
@@ -0,0 +1,42 @@
1
+ from enum import Enum
2
+
3
+
4
class TermPosition:
    """ Position of a term, described by three indices:
        index of the term in a whole document, index of the term in its sentence
        and index of the sentence in the document.
    """

    def __init__(self, term_ind_in_doc, term_ind_in_sent, s_ind):
        self.__t_ind_in_doc = term_ind_in_doc
        self.__t_ind_in_sent = term_ind_in_sent
        self.__s_ind = s_ind

    def get_index(self, position_type):
        """ position_type: TermPositionTypes
            returns: the index which corresponds to the requested position type.
        """
        assert(isinstance(position_type, TermPositionTypes))

        if position_type == TermPositionTypes.IndexInDocument:
            return self.__t_ind_in_doc
        if position_type == TermPositionTypes.IndexInSentence:
            return self.__t_ind_in_sent
        if position_type == TermPositionTypes.SentenceIndex:
            return self.__s_ind

        # Not reachable for the declared position types; kept explicit.
        return None


class TermPositionTypes(Enum):
    """ Kinds of indices that describe a position of a term (see TermPosition).
    """

    # Members are documented with comments: a bare string placed in the body would
    # either become the docstring of the whole class (the first one) or be a no-op.

    # Corresponds to an index of a related term in a whole document
    # (document considered as a sequence of terms).
    IndexInDocument = 1

    # Corresponds to an index of a related term in a certain sentence.
    IndexInSentence = 2

    # Corresponds to an index of a sentence in a whole document.
    SentenceIndex = 3
39
+
40
+
41
+
42
+
@@ -0,0 +1,34 @@
1
+ from arekit.common.docs.base import Document
2
+ from arekit.common.docs.parsed.base import ParsedDocument
3
+ from arekit.common.pipeline.context import PipelineContext
4
+ from arekit.common.text.parser import BaseTextParser
5
+
6
+
7
class DocumentParser(object):
    """ Converts a document into a parsed document by passing
        every sentence of the document through the text parser.
    """

    @staticmethod
    def __get_sent(doc, sent_ind):
        return doc.get_sentence(sent_ind)

    @staticmethod
    def parse(doc, text_parser, parent_ppl_ctx=None):
        """ doc: Document
            text_parser: BaseTextParser
                launched once per every sentence of the document, in the order of sentences.
            parent_ppl_ctx: PipelineContext or None
                passed into every launch of the parser as the parent context.
            returns: ParsedDocument
        """
        assert(isinstance(doc, Document))
        assert(isinstance(text_parser, BaseTextParser))
        assert(parent_ppl_ctx is None or isinstance(parent_ppl_ctx, PipelineContext))

        parsed = []
        for sent_ind in range(doc.SentencesCount):
            sentence_text = DocumentParser.__get_sent(doc, sent_ind).Text
            ppl_params = DocumentParser.__create_ppl_params(doc=doc, sent_ind=sent_ind)
            parsed.append(text_parser.run(input_data=sentence_text,
                                          params_dict=ppl_params,
                                          parent_ctx=parent_ppl_ctx))

        return ParsedDocument(doc_id=doc.ID, parsed_sentences=parsed)

    @staticmethod
    def __create_ppl_params(doc, sent_ind):
        """ Composes parameters of the text parser launch for a particular sentence.
        """
        assert(isinstance(doc, Document))

        params = dict()
        params["s_ind"] = sent_ind                                          # sentence index. (as Metadata)
        params["doc_id"] = doc.ID                                           # document index. (as Metadata)
        params["sentence"] = DocumentParser.__get_sent(doc, sent_ind)       # Required for special sources.
        return params
@@ -0,0 +1,14 @@
1
+
2
class BaseDocumentSentence(object):
    """ Sentence of a document; keeps the text passed on creation as it is.
    """

    def __init__(self, text):
        # The text is stored without any conversion or copying.
        self.__text = text

    @property
    def Text(self):
        """ Contents of the sentence, exactly as passed on creation.
            It might be of any type, for example:
                - str: original text as a string;
                - list of words: text separated into words/tokens.
        """
        return self.__text
File without changes