arekit 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. arekit/__init__.py +0 -0
  2. arekit/common/__init__.py +0 -0
  3. arekit/common/bound.py +48 -0
  4. arekit/common/context/__init__.py +0 -0
  5. arekit/common/context/terms_mapper.py +51 -0
  6. arekit/common/context/token.py +16 -0
  7. arekit/common/data/__init__.py +0 -0
  8. arekit/common/data/const.py +21 -0
  9. arekit/common/data/doc_provider.py +6 -0
  10. arekit/common/data/input/__init__.py +0 -0
  11. arekit/common/data/input/providers/__init__.py +0 -0
  12. arekit/common/data/input/providers/columns/__init__.py +0 -0
  13. arekit/common/data/input/providers/columns/base.py +9 -0
  14. arekit/common/data/input/providers/columns/sample.py +59 -0
  15. arekit/common/data/input/providers/const.py +3 -0
  16. arekit/common/data/input/providers/contents.py +9 -0
  17. arekit/common/data/input/providers/instances/__init__.py +0 -0
  18. arekit/common/data/input/providers/instances/base.py +14 -0
  19. arekit/common/data/input/providers/instances/multiple.py +27 -0
  20. arekit/common/data/input/providers/instances/single.py +8 -0
  21. arekit/common/data/input/providers/label/__init__.py +0 -0
  22. arekit/common/data/input/providers/label/base.py +24 -0
  23. arekit/common/data/input/providers/label/binary.py +11 -0
  24. arekit/common/data/input/providers/label/multiple.py +15 -0
  25. arekit/common/data/input/providers/rows/__init__.py +0 -0
  26. arekit/common/data/input/providers/rows/base.py +64 -0
  27. arekit/common/data/input/providers/rows/samples.py +227 -0
  28. arekit/common/data/input/providers/sample/__init__.py +0 -0
  29. arekit/common/data/input/providers/sample/cropped.py +43 -0
  30. arekit/common/data/input/providers/text/__init__.py +0 -0
  31. arekit/common/data/input/providers/text/single.py +49 -0
  32. arekit/common/data/input/repositories/__init__.py +0 -0
  33. arekit/common/data/input/repositories/base.py +68 -0
  34. arekit/common/data/input/repositories/sample.py +22 -0
  35. arekit/common/data/input/sample.py +66 -0
  36. arekit/common/data/input/terms_mapper.py +88 -0
  37. arekit/common/data/rows_fmt.py +82 -0
  38. arekit/common/data/rows_parser.py +43 -0
  39. arekit/common/data/storages/__init__.py +0 -0
  40. arekit/common/data/storages/base.py +109 -0
  41. arekit/common/data/views/__init__.py +0 -0
  42. arekit/common/data/views/samples.py +26 -0
  43. arekit/common/docs/__init__.py +0 -0
  44. arekit/common/docs/base.py +30 -0
  45. arekit/common/docs/entities_grouping.py +16 -0
  46. arekit/common/docs/entity.py +18 -0
  47. arekit/common/docs/objects_parser.py +37 -0
  48. arekit/common/docs/parsed/__init__.py +0 -0
  49. arekit/common/docs/parsed/base.py +101 -0
  50. arekit/common/docs/parsed/providers/__init__.py +0 -0
  51. arekit/common/docs/parsed/providers/base.py +68 -0
  52. arekit/common/docs/parsed/providers/base_pairs.py +51 -0
  53. arekit/common/docs/parsed/providers/entity_service.py +175 -0
  54. arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
  55. arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
  56. arekit/common/docs/parsed/service.py +31 -0
  57. arekit/common/docs/parsed/term_position.py +42 -0
  58. arekit/common/docs/parser.py +34 -0
  59. arekit/common/docs/sentence.py +14 -0
  60. arekit/common/entities/__init__.py +0 -0
  61. arekit/common/entities/base.py +51 -0
  62. arekit/common/entities/collection.py +72 -0
  63. arekit/common/entities/str_fmt.py +8 -0
  64. arekit/common/entities/types.py +9 -0
  65. arekit/common/experiment/__init__.py +0 -0
  66. arekit/common/experiment/api/__init__.py +0 -0
  67. arekit/common/experiment/api/base_samples_io.py +20 -0
  68. arekit/common/experiment/data_type.py +17 -0
  69. arekit/common/frames/__init__.py +0 -0
  70. arekit/common/frames/connotations/__init__.py +0 -0
  71. arekit/common/frames/connotations/descriptor.py +17 -0
  72. arekit/common/frames/connotations/provider.py +4 -0
  73. arekit/common/frames/text_variant.py +43 -0
  74. arekit/common/frames/variants/__init__.py +0 -0
  75. arekit/common/frames/variants/base.py +21 -0
  76. arekit/common/frames/variants/collection.py +60 -0
  77. arekit/common/labels/__init__.py +0 -0
  78. arekit/common/labels/base.py +19 -0
  79. arekit/common/labels/provider/__init__.py +0 -0
  80. arekit/common/labels/provider/base.py +7 -0
  81. arekit/common/labels/provider/constant.py +14 -0
  82. arekit/common/labels/scaler/__init__.py +0 -0
  83. arekit/common/labels/scaler/base.py +85 -0
  84. arekit/common/labels/scaler/sentiment.py +7 -0
  85. arekit/common/labels/scaler/single.py +10 -0
  86. arekit/common/labels/str_fmt.py +55 -0
  87. arekit/common/linkage/__init__.py +0 -0
  88. arekit/common/linkage/base.py +44 -0
  89. arekit/common/linkage/meta.py +23 -0
  90. arekit/common/linkage/opinions.py +9 -0
  91. arekit/common/linkage/text_opinions.py +22 -0
  92. arekit/common/log_utils.py +29 -0
  93. arekit/common/model/__init__.py +0 -0
  94. arekit/common/model/labeling/__init__.py +0 -0
  95. arekit/common/model/labeling/base.py +24 -0
  96. arekit/common/model/labeling/modes.py +8 -0
  97. arekit/common/model/labeling/single.py +24 -0
  98. arekit/common/opinions/__init__.py +0 -0
  99. arekit/common/opinions/annot/__init__.py +0 -0
  100. arekit/common/opinions/annot/algo/__init__.py +0 -0
  101. arekit/common/opinions/annot/algo/base.py +4 -0
  102. arekit/common/opinions/annot/algo/pair_based.py +99 -0
  103. arekit/common/opinions/annot/algo/predefined.py +16 -0
  104. arekit/common/opinions/annot/algo_based.py +55 -0
  105. arekit/common/opinions/annot/base.py +15 -0
  106. arekit/common/opinions/base.py +74 -0
  107. arekit/common/opinions/collection.py +150 -0
  108. arekit/common/opinions/enums.py +6 -0
  109. arekit/common/opinions/provider.py +4 -0
  110. arekit/common/opinions/writer.py +4 -0
  111. arekit/common/pipeline/__init__.py +0 -0
  112. arekit/common/pipeline/base.py +25 -0
  113. arekit/common/pipeline/context.py +36 -0
  114. arekit/common/pipeline/conts.py +2 -0
  115. arekit/common/pipeline/items/__init__.py +0 -0
  116. arekit/common/pipeline/items/base.py +12 -0
  117. arekit/common/pipeline/items/flatten.py +14 -0
  118. arekit/common/pipeline/items/handle.py +17 -0
  119. arekit/common/pipeline/items/iter.py +11 -0
  120. arekit/common/pipeline/items/map.py +11 -0
  121. arekit/common/pipeline/items/map_nested.py +13 -0
  122. arekit/common/synonyms/__init__.py +0 -0
  123. arekit/common/synonyms/base.py +151 -0
  124. arekit/common/synonyms/grouping.py +21 -0
  125. arekit/common/text/__init__.py +0 -0
  126. arekit/common/text/enums.py +12 -0
  127. arekit/common/text/parsed.py +42 -0
  128. arekit/common/text/parser.py +12 -0
  129. arekit/common/text/partitioning/__init__.py +0 -0
  130. arekit/common/text/partitioning/base.py +4 -0
  131. arekit/common/text/partitioning/str.py +36 -0
  132. arekit/common/text/partitioning/terms.py +35 -0
  133. arekit/common/text/stemmer.py +16 -0
  134. arekit/common/text_opinions/__init__.py +0 -0
  135. arekit/common/text_opinions/base.py +105 -0
  136. arekit/common/utils.py +129 -0
  137. arekit/contrib/__init__.py +0 -0
  138. arekit/contrib/bert/__init__.py +0 -0
  139. arekit/contrib/bert/input/__init__.py +0 -0
  140. arekit/contrib/bert/input/providers/__init__.py +0 -0
  141. arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
  142. arekit/contrib/bert/input/providers/text_pair.py +62 -0
  143. arekit/contrib/bert/terms/__init__.py +0 -0
  144. arekit/contrib/bert/terms/mapper.py +20 -0
  145. arekit/contrib/networks/__init__.py +0 -0
  146. arekit/contrib/networks/embedding.py +149 -0
  147. arekit/contrib/networks/embedding_io.py +18 -0
  148. arekit/contrib/networks/input/__init__.py +0 -0
  149. arekit/contrib/networks/input/const.py +6 -0
  150. arekit/contrib/networks/input/ctx_serialization.py +28 -0
  151. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  152. arekit/contrib/networks/input/embedding/matrix.py +29 -0
  153. arekit/contrib/networks/input/embedding/offsets.py +55 -0
  154. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  155. arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
  156. arekit/contrib/networks/input/providers/__init__.py +0 -0
  157. arekit/contrib/networks/input/providers/sample.py +129 -0
  158. arekit/contrib/networks/input/providers/term_connotation.py +23 -0
  159. arekit/contrib/networks/input/providers/text.py +24 -0
  160. arekit/contrib/networks/input/rows_parser.py +47 -0
  161. arekit/contrib/networks/input/term_types.py +13 -0
  162. arekit/contrib/networks/input/terms_mapping.py +60 -0
  163. arekit/contrib/networks/vectorizer.py +6 -0
  164. arekit/contrib/prompt/__init__.py +0 -0
  165. arekit/contrib/prompt/sample.py +61 -0
  166. arekit/contrib/source/__init__.py +0 -0
  167. arekit/contrib/source/brat/__init__.py +0 -0
  168. arekit/contrib/source/brat/annot.py +84 -0
  169. arekit/contrib/source/brat/doc.py +28 -0
  170. arekit/contrib/source/brat/entities/__init__.py +0 -0
  171. arekit/contrib/source/brat/entities/compound.py +13 -0
  172. arekit/contrib/source/brat/entities/entity.py +42 -0
  173. arekit/contrib/source/brat/entities/parser.py +53 -0
  174. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  175. arekit/contrib/source/brat/opinions/converter.py +19 -0
  176. arekit/contrib/source/brat/relation.py +32 -0
  177. arekit/contrib/source/brat/sentence.py +69 -0
  178. arekit/contrib/source/brat/sentences_reader.py +128 -0
  179. arekit/contrib/source/download.py +41 -0
  180. arekit/contrib/source/nerel/__init__.py +0 -0
  181. arekit/contrib/source/nerel/entities.py +55 -0
  182. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  183. arekit/contrib/source/nerel/folding/fixed.py +74 -0
  184. arekit/contrib/source/nerel/io_utils.py +62 -0
  185. arekit/contrib/source/nerel/labels.py +241 -0
  186. arekit/contrib/source/nerel/reader.py +46 -0
  187. arekit/contrib/source/nerel/utils.py +24 -0
  188. arekit/contrib/source/nerel/versions.py +12 -0
  189. arekit/contrib/source/nerelbio/__init__.py +0 -0
  190. arekit/contrib/source/nerelbio/io_utils.py +62 -0
  191. arekit/contrib/source/nerelbio/labels.py +265 -0
  192. arekit/contrib/source/nerelbio/reader.py +8 -0
  193. arekit/contrib/source/nerelbio/versions.py +8 -0
  194. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  195. arekit/contrib/source/ruattitudes/collection.py +36 -0
  196. arekit/contrib/source/ruattitudes/doc.py +51 -0
  197. arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
  198. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  199. arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
  200. arekit/contrib/source/ruattitudes/io_utils.py +56 -0
  201. arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
  202. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  203. arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
  204. arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
  205. arekit/contrib/source/ruattitudes/reader.py +268 -0
  206. arekit/contrib/source/ruattitudes/sentence.py +73 -0
  207. arekit/contrib/source/ruattitudes/synonyms.py +17 -0
  208. arekit/contrib/source/ruattitudes/text_object.py +59 -0
  209. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  210. arekit/contrib/source/rusentiframes/collection.py +157 -0
  211. arekit/contrib/source/rusentiframes/effect.py +24 -0
  212. arekit/contrib/source/rusentiframes/io_utils.py +19 -0
  213. arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
  214. arekit/contrib/source/rusentiframes/polarity.py +35 -0
  215. arekit/contrib/source/rusentiframes/role.py +15 -0
  216. arekit/contrib/source/rusentiframes/state.py +24 -0
  217. arekit/contrib/source/rusentiframes/types.py +42 -0
  218. arekit/contrib/source/rusentiframes/value.py +2 -0
  219. arekit/contrib/source/rusentrel/__init__.py +0 -0
  220. arekit/contrib/source/rusentrel/const.py +3 -0
  221. arekit/contrib/source/rusentrel/docs_reader.py +51 -0
  222. arekit/contrib/source/rusentrel/entities.py +26 -0
  223. arekit/contrib/source/rusentrel/io_utils.py +125 -0
  224. arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
  225. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  226. arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
  227. arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
  228. arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
  229. arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
  230. arekit/contrib/source/rusentrel/synonyms.py +17 -0
  231. arekit/contrib/source/sentinerel/__init__.py +0 -0
  232. arekit/contrib/source/sentinerel/entities.py +52 -0
  233. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  234. arekit/contrib/source/sentinerel/folding/factory.py +31 -0
  235. arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
  236. arekit/contrib/source/sentinerel/io_utils.py +87 -0
  237. arekit/contrib/source/sentinerel/labels.py +53 -0
  238. arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
  239. arekit/contrib/source/sentinerel/reader.py +42 -0
  240. arekit/contrib/source/synonyms/__init__.py +0 -0
  241. arekit/contrib/source/synonyms/utils.py +19 -0
  242. arekit/contrib/source/zip_utils.py +47 -0
  243. arekit/contrib/utils/__init__.py +0 -0
  244. arekit/contrib/utils/bert/__init__.py +0 -0
  245. arekit/contrib/utils/bert/samplers.py +17 -0
  246. arekit/contrib/utils/connotations/__init__.py +0 -0
  247. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
  248. arekit/contrib/utils/data/__init__.py +0 -0
  249. arekit/contrib/utils/data/contents/__init__.py +0 -0
  250. arekit/contrib/utils/data/contents/opinions.py +37 -0
  251. arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
  252. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  253. arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
  254. arekit/contrib/utils/data/readers/__init__.py +0 -0
  255. arekit/contrib/utils/data/readers/base.py +7 -0
  256. arekit/contrib/utils/data/readers/csv_pd.py +38 -0
  257. arekit/contrib/utils/data/readers/jsonl.py +15 -0
  258. arekit/contrib/utils/data/service/__init__.py +0 -0
  259. arekit/contrib/utils/data/service/balance.py +50 -0
  260. arekit/contrib/utils/data/storages/__init__.py +0 -0
  261. arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
  262. arekit/contrib/utils/data/storages/pandas_based.py +123 -0
  263. arekit/contrib/utils/data/storages/row_cache.py +48 -0
  264. arekit/contrib/utils/data/writers/__init__.py +0 -0
  265. arekit/contrib/utils/data/writers/base.py +27 -0
  266. arekit/contrib/utils/data/writers/csv_native.py +63 -0
  267. arekit/contrib/utils/data/writers/csv_pd.py +40 -0
  268. arekit/contrib/utils/data/writers/json_opennre.py +132 -0
  269. arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
  270. arekit/contrib/utils/download.py +77 -0
  271. arekit/contrib/utils/embeddings/__init__.py +0 -0
  272. arekit/contrib/utils/embeddings/rusvectores.py +58 -0
  273. arekit/contrib/utils/embeddings/tokens.py +30 -0
  274. arekit/contrib/utils/entities/__init__.py +0 -0
  275. arekit/contrib/utils/entities/filter.py +7 -0
  276. arekit/contrib/utils/entities/formatters/__init__.py +0 -0
  277. arekit/contrib/utils/entities/formatters/str_display.py +11 -0
  278. arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
  279. arekit/contrib/utils/io_utils/__init__.py +0 -0
  280. arekit/contrib/utils/io_utils/embedding.py +72 -0
  281. arekit/contrib/utils/io_utils/opinions.py +37 -0
  282. arekit/contrib/utils/io_utils/samples.py +79 -0
  283. arekit/contrib/utils/io_utils/utils.py +39 -0
  284. arekit/contrib/utils/lexicons/__init__.py +0 -0
  285. arekit/contrib/utils/lexicons/lexicon.py +41 -0
  286. arekit/contrib/utils/lexicons/relation.py +42 -0
  287. arekit/contrib/utils/lexicons/rusentilex.py +37 -0
  288. arekit/contrib/utils/nn/__init__.py +0 -0
  289. arekit/contrib/utils/nn/rows.py +83 -0
  290. arekit/contrib/utils/np_utils/__init__.py +0 -0
  291. arekit/contrib/utils/np_utils/embedding.py +22 -0
  292. arekit/contrib/utils/np_utils/npz_utils.py +13 -0
  293. arekit/contrib/utils/np_utils/vocab.py +20 -0
  294. arekit/contrib/utils/pipelines/__init__.py +0 -0
  295. arekit/contrib/utils/pipelines/items/__init__.py +0 -0
  296. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  297. arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
  298. arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
  299. arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
  300. arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
  301. arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
  302. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
  303. arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
  304. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
  305. arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
  306. arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
  307. arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
  308. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  309. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  310. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
  311. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
  312. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
  313. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  314. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
  315. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
  316. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
  317. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  318. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
  319. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
  320. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
  321. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  322. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
  323. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
  324. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  325. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
  326. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
  327. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
  328. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
  329. arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
  330. arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
  331. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
  332. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
  333. arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
  334. arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
  335. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
  336. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
  337. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
  338. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
  339. arekit/contrib/utils/processing/__init__.py +0 -0
  340. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  341. arekit/contrib/utils/processing/languages/mods.py +12 -0
  342. arekit/contrib/utils/processing/languages/pos.py +23 -0
  343. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  344. arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
  345. arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
  346. arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
  347. arekit/contrib/utils/processing/languages/ru/number.py +23 -0
  348. arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
  349. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  350. arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
  351. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  352. arekit/contrib/utils/processing/pos/base.py +12 -0
  353. arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
  354. arekit/contrib/utils/processing/pos/russian.py +10 -0
  355. arekit/contrib/utils/processing/text/__init__.py +0 -0
  356. arekit/contrib/utils/processing/text/tokens.py +127 -0
  357. arekit/contrib/utils/resources.py +25 -0
  358. arekit/contrib/utils/serializer.py +43 -0
  359. arekit/contrib/utils/sources/__init__.py +0 -0
  360. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  361. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  362. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
  363. arekit/contrib/utils/synonyms/__init__.py +0 -0
  364. arekit/contrib/utils/synonyms/simple.py +15 -0
  365. arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
  366. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  367. arekit/contrib/utils/vectorizers/bpe.py +93 -0
  368. arekit/contrib/utils/vectorizers/random_norm.py +39 -0
  369. arekit/download_data.py +11 -0
  370. arekit-0.24.0.dist-info/LICENSE +21 -0
  371. arekit-0.24.0.dist-info/METADATA +23 -0
  372. arekit-0.24.0.dist-info/RECORD +374 -0
  373. arekit-0.24.0.dist-info/WHEEL +5 -0
  374. arekit-0.24.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,11 @@
1
+ from arekit.common.pipeline.items.base import BasePipelineItem
2
+
3
+
4
class MapPipelineItem(BasePipelineItem):
    """ Pipeline item which applies the given function
        to every element of the input data.
    """

    def __init__(self, map_func=None):
        """ map_func: callable
                function of a single argument, applied to each input element.
                NOTE: despite the `None` default, a callable is required.
        """
        assert callable(map_func)
        self._map_func = map_func

    def apply_core(self, input_data, pipeline_ctx):
        # `pipeline_ctx` is not used here; elements are mapped one-by-one
        # and the resulting `map` iterator is returned as is.
        mapper = self._map_func
        return map(mapper, input_data)
@@ -0,0 +1,13 @@
1
+ from arekit.common.pipeline.items.map import MapPipelineItem
2
+
3
+
4
class MapNestedPipelineItem(MapPipelineItem):
    """ Describes nested pipelines, which might be required
        in parameters of the parent pipeline-contexts.

        The input data is treated as a sequence, in which every element
        is supposed to be mapped together with the passed pipeline context.
    """

    def apply_core(self, input_data, pipeline_ctx):

        def map_with_ctx(element):
            # Unlike the parent class, the mapping function
            # receives the pipeline context as the second argument.
            return self._map_func(element, pipeline_ctx)

        return map(map_with_ctx, input_data)
File without changes
@@ -0,0 +1,151 @@
1
+ from collections.abc import Iterable
2
+
3
+ from arekit.common import log_utils
4
+
5
+
6
class SynonymsCollection(object):
    """ Collection of synonym groups.

        Every group is a list of string values. Each value is registered under
        a synonym id (sid), which maps onto the index of the related group.
        The way sids are composed is declared by subclasses through
        `_create_internal_sid` and `_create_external_sid`.
    """

    def __init__(self, iter_group_values_lists=None, is_read_only=True, debug=False):
        """ iter_group_values_lists: iterable or None
                groups of synonym values utilized for the initial filling;
                None is treated as an absence of groups.
            is_read_only: bool
                whether the relation collection could be expanded or not
            debug: bool
                utilized for logging the salient information during usage.
        """
        assert(isinstance(iter_group_values_lists, Iterable) or iter_group_values_lists is None)
        assert(isinstance(is_read_only, bool))
        assert(isinstance(debug, bool))

        # Assumes to be filled
        self.__by_sid = {}      # sid -> index of the group
        self.__by_index = []    # index of the group -> list of values

        self.__is_read_only = is_read_only
        self.__debug = debug
        self.__fill(iter_group_values_lists=[] if iter_group_values_lists is None else iter_group_values_lists)

    # region properties

    @property
    def IsReadOnly(self):
        return self.__is_read_only

    # endregion

    # region public 'add' methods

    def add_synonym_value(self, value):
        """ Registers the value as a new group which consists of this value only.

            Raises Exception when the value is already registered,
            or when the collection is read only.
        """
        assert(isinstance(value, str))

        # Messages are passed as `str`: encoding them into bytes
        # results in a b'...' representation of the exception text.
        if self.__contains_synonym_value(value):
            raise Exception("Collection already contains synonyms '{}'".format(value))

        if self.__is_read_only:
            raise Exception("Failed to add '{}'. Synonym collection is read only!".format(value))

        sid = self._create_external_sid(value)
        # Index of the new group equals the amount of groups before appending.
        self.__by_sid[sid] = self.__get_groups_count()
        self.__by_index.append([value])

    # endregion

    # region public 'contains' methods

    def contains_synonym_value(self, value):
        """ Returns whether the sid of the value is already registered.
        """
        return self.__contains_synonym_value(value)

    # endregion

    # region public 'get' methods

    def get_synonym_group_index(self, value):
        """ NOTE: Before use this, please take a look at the grouping (see #327 issue).
            It is better to use that class API rather than pass that method for `value_to_group_id_func`

            Raises KeyError when the sid of the value is not registered.
        """
        assert(isinstance(value, str))
        return self.__get_group_index(value)

    # endregion

    # region public 'create' methods

    def create_synonym_id(self, value):
        return self._create_external_sid(value)

    # endregion

    # region protected methods

    def _contains_sid(self, v_id):
        return v_id in self.__by_sid

    def _create_internal_sid(self, value):
        """ Utilized during filling stage.
        """
        raise NotImplementedError()

    def _create_external_sid(self, value):
        """ Utilized by every public method which accepts a value.
        """
        raise NotImplementedError()

    # endregion

    # region public 'iter' methods

    def iter_synonym_values(self, value):
        """ Iterates over the values of the group the given value belongs to.
        """
        assert(isinstance(value, str))
        sid = self._create_external_sid(value)
        index = self.__by_sid[sid]
        return iter(self.__by_index[index])

    def iter_by_index(self):
        """ Iterates over the groups (lists of values) in order of their indices.
        """
        return iter(self.__by_index)

    def iter_group(self, group_index):
        """ Iterates over the values of the group with the given index.
        """
        assert(isinstance(group_index, int))
        return iter(self.__by_index[group_index])

    # endregion

    # region private methods

    def __fill(self, iter_group_values_lists):
        for group in iter_group_values_lists:
            self.__process_group(group)

    def __process_group(self, group_values_list):
        group_index = len(self.__by_index)
        synonym_list = []

        for synonym_value in group_values_list:

            value = synonym_value.strip()

            sid = self._create_internal_sid(value)

            # NOTE(review): an already registered value is skipped only when
            # `debug` is enabled; otherwise it is added into this group and
            # its sid is re-pointed here. Confirm whether this is intended.
            if self._contains_sid(sid) and self.__debug:
                log_utils.log_synonym_existed(value)
                continue

            synonym_list.append(value)
            self.__by_sid[sid] = group_index

        self.__by_index.append(synonym_list)

    def __get_groups_count(self):
        return len(self.__by_index)

    def __get_group_index(self, value):
        sid = self._create_external_sid(value)
        return self.__by_sid[sid]

    def __contains_synonym_value(self, value):
        return self._contains_sid(self._create_external_sid(value))

    # endregion

    # region overridden methods

    def __len__(self):
        """ Amount of groups.
        """
        return len(self.__by_index)

    # endregion
@@ -0,0 +1,21 @@
1
+ from arekit.common.synonyms.base import SynonymsCollection
2
+
3
+
4
class SynonymsCollectionValuesGroupingProviders:
    """ Ways of providing the group index of a value
        by means of the synonyms collection.
    """

    @staticmethod
    def provide_existed_or_register_missed_value(synonyms, value):
        """ Grouping with a potential expansion: the value
            missed in collection is registered first.
        """
        assert isinstance(synonyms, SynonymsCollection)

        is_known = synonyms.contains_synonym_value(value)
        if not is_known:
            synonyms.add_synonym_value(value)

        group_index = synonyms.get_synonym_group_index(value)
        return group_index

    @staticmethod
    def provide_existed_value(synonyms, value):
        """ Grouping which relies only on values already
            presented in collection (no expansion).
        """
        group_index = synonyms.get_synonym_group_index(value)
        return group_index
File without changes
@@ -0,0 +1,12 @@
1
+ from enum import Enum
2
+
3
+
4
class TermFormat(Enum):
    """ Supported types of terms.
    """

    # Original value of the term.
    Raw = 1
@@ -0,0 +1,42 @@
1
+ from arekit.common.text.enums import TermFormat
2
+
3
+
4
class BaseParsedText(object):
    """
    Represents a processed text with extra parameters
    that were used during parsing.
    """

    # region constructors

    def __init__(self, terms):
        """ terms: list
                terms of the processed text.
        """
        assert(isinstance(terms, list))
        self._terms = terms

    # endregion

    def get_term(self, index, term_format):
        """ Returns the term at the given index in the requested format.
        """
        assert(isinstance(term_format, TermFormat))
        terms = self._get_terms(term_format)
        return terms[index]

    def iter_terms(self, term_format, filter=None):
        """ Iterates over terms in the requested format.

            term_format: TermFormat
            filter: callable or None
                predicate of a single term; terms for which the result
                is falsy are skipped. None means no filtering.
        """
        assert(isinstance(term_format, TermFormat))
        assert(callable(filter) or filter is None)
        terms = self._get_terms(term_format)
        for term in terms:
            # `filter` here is the predicate passed by caller (not the builtin),
            # so its result is tested directly: wrapping the result into
            # `list(...)` raises TypeError for boolean results.
            if filter is not None and not filter(term):
                continue
            yield term

    # region private methods

    def _get_terms(self, term_format):
        assert(isinstance(term_format, TermFormat))
        # The raw format is the only one supported.
        assert(term_format == TermFormat.Raw)
        return self._terms

    # endregion

    def __len__(self):
        """ Amount of terms.
        """
        return len(self._terms)
@@ -0,0 +1,12 @@
1
+ from arekit.common.pipeline.base import BasePipeline
2
+ from arekit.common.text.parsed import BaseParsedText
3
+
4
+
5
class BaseTextParser(BasePipeline):
    """ Pipeline, the output of which is wrapped into BaseParsedText.
    """

    def run(self, input_data, params_dict=None, parent_ctx=None):
        """ Runs the pipeline with the given arguments and
            returns its output as terms of BaseParsedText.
        """
        terms = super().run(input_data=input_data,
                            params_dict=params_dict,
                            parent_ctx=parent_ctx)

        return BaseParsedText(terms=terms)
File without changes
@@ -0,0 +1,4 @@
1
class BasePartitioning(object):
    """ Interface of the partitioning: implementations
        are expected to override `provide`.
    """

    def provide(self, text, parts_it):
        """ Not implemented at the level of the base class.
        """
        raise NotImplementedError()
@@ -0,0 +1,36 @@
1
+ from collections.abc import Iterable
2
+
3
+ from arekit.common.bound import Bound
4
+ from arekit.common.text.partitioning.base import BasePartitioning
5
+
6
+
7
class StringPartitioning(BasePartitioning):
    """ Splits a string onto parts: fragments of the original string
        interleaved with the provided values.

        NOTE: considering that provided parts
        has no intersections between each other
    """

    def provide(self, text, parts_it):
        """ text: str
            parts_it: iterable of (value, Bound) pairs;
                every bound is expected to start not earlier
                than the end of the previous one.
        """
        assert isinstance(text, str)
        assert isinstance(parts_it, Iterable)

        result = []
        cursor = 0
        for value, bound in parts_it:
            assert isinstance(bound, Bound)
            assert bound.Position >= cursor

            # Fragment of the text which precedes the value (kept as a single part).
            result.append(text[cursor:bound.Position])

            # The value itself, instead of the bounded fragment.
            result.append(value)

            cursor = bound.Position + bound.Length

        # The rest of the text after the last value.
        result.append(text[cursor:])

        return result
@@ -0,0 +1,35 @@
1
+ from collections.abc import Iterable
2
+
3
+ from arekit.common.bound import Bound
4
+ from arekit.common.text.partitioning.base import BasePartitioning
5
+
6
+
7
class TermsPartitioning(BasePartitioning):
    """ Splits a list of terms onto parts: original terms
        interleaved with the provided values.

        NOTE: considering that provided parts
        has no intersections between each other
    """

    def provide(self, text, parts_it):
        """ text: list
            parts_it: iterable of (value, Bound) pairs;
                every bound is expected to start not earlier
                than the end of the previous one.
        """
        assert isinstance(text, list)
        assert isinstance(parts_it, Iterable)

        result = []
        cursor = 0
        for value, bound in parts_it:
            assert isinstance(bound, Bound)
            assert bound.Position >= cursor

            # Terms which precede the value (added one-by-one).
            result.extend(text[cursor:bound.Position])

            # The value itself, instead of the bounded terms.
            result.append(value)

            cursor = bound.Position + bound.Length

        # The rest of the terms after the last value.
        result.extend(text[cursor:])

        return result
@@ -0,0 +1,16 @@
1
class Stemmer:
    """ Interface of a stemmer.

        Every method raises NotImplementedError here
        and is meant to be overridden by an implementation.
    """

    def lemmatize_to_list(self, text):
        """ Lemmatize `text`; presumably the result is a list of lemmas
            (not enforced by this interface).
        """
        raise NotImplementedError()

    def lemmatize_to_str(self, text):
        """ Lemmatize `text`; presumably the result is a single string
            (not enforced by this interface).
        """
        raise NotImplementedError()

    def is_adjective(self, pos_type):
        """ Check whether `pos_type` stands for an adjective. """
        raise NotImplementedError()

    def is_noun(self, pos_type):
        """ Check whether `pos_type` stands for a noun. """
        raise NotImplementedError()
File without changes
@@ -0,0 +1,105 @@
1
+ from arekit.common.labels.base import Label
2
+
3
+
4
class TextOpinion(object):
    """ A labeled relation between two named entities of a document.

        Participants are referred to by their identifiers (source and target),
        and the label might be replaced later on by means of `set_label`.

        NOTE: it is important to keep document level IDs. (designed for that)
    """

    # region constructors

    def __init__(self, doc_id, text_opinion_id, source_id, target_id, label):
        """ doc_id: identifier of the related document
            text_opinion_id: identifier of this opinion; None when it is not assigned yet
            source_id: document level object id
            target_id: document level object id
            label: Label instance
        """
        self.__doc_id = doc_id
        self.__text_opinion_id = text_opinion_id
        self.__source_id = source_id
        self.__target_id = target_id

        # The label is assigned through the validating setter.
        self.__modifiable_label = None
        self.__set_label_core(label)

    @classmethod
    def create_copy(cls, other, keep_text_opinion_id=True):
        """ Creates a copy of `other` opinion with the same participants.
            The copy has no text opinion id when `keep_text_opinion_id` is False.
        """
        assert isinstance(other, TextOpinion)
        assert isinstance(keep_text_opinion_id, bool)
        return cls.__try_create_copy_core(other=other,
                                          keep_text_opinion_id=keep_text_opinion_id)

    @staticmethod
    def try_convert(other, convert_entity_id_func):
        """ Creates a copy of `other` opinion with different id of opinion participants;
            the text opinion id is not kept.
            Use cases: required for BaseParsedDocumentServiceProvider, when we decided
            to bring the outside opinion into one which is based on DocumentEntities.

            returns: TextOpinion or None
                None when the conversion of any participant id results in None.
        """
        assert isinstance(other, TextOpinion)
        assert callable(convert_entity_id_func)
        return TextOpinion.__try_create_copy_core(other=other,
                                                  convert_entity_id_func=convert_entity_id_func,
                                                  keep_text_opinion_id=False)

    @staticmethod
    def __try_create_copy_core(other, convert_entity_id_func=lambda part_id: part_id, keep_text_opinion_id=True):
        """ Tries to compose a copy by considering an optional id conversion,
            and identification keeping.
            convert_entity_id_func:
                func(id) -> id, applied to the source and to the target;
                the identity function by default.
        """
        assert callable(convert_entity_id_func)

        # Both participants are converted before any of the results is checked.
        converted_source = convert_entity_id_func(other.SourceId)
        converted_target = convert_entity_id_func(other.TargetId)

        if converted_source is None or converted_target is None:
            return None

        copied_id = other.__text_opinion_id if keep_text_opinion_id else None

        return TextOpinion(doc_id=other.__doc_id,
                           text_opinion_id=copied_id,
                           source_id=converted_source,
                           target_id=converted_target,
                           label=other.Label)

    def __set_label_core(self, label):
        # Only Label instances are accepted.
        assert isinstance(label, Label)
        self.__modifiable_label = label

    # endregion

    # region properties

    @property
    def Label(self):
        return self.__modifiable_label

    @property
    def DocID(self):
        return self.__doc_id

    @property
    def TextOpinionID(self):
        return self.__text_opinion_id

    @property
    def SourceId(self):
        return self.__source_id

    @property
    def TargetId(self):
        return self.__target_id

    # endregion

    # region public methods

    def set_text_opinion_id(self, text_opinion_id):
        """ Assigns the identifier; allowed only while it is not assigned (None). """
        assert self.__text_opinion_id is None
        self.__text_opinion_id = text_opinion_id

    def set_label(self, label):
        """ Replaces the label; `label` has to be a Label instance. """
        self.__set_label_core(label)

    # endregion
arekit/common/utils.py ADDED
@@ -0,0 +1,129 @@
1
+ import sys
2
+ import os
3
+ import requests
4
+ from tqdm import tqdm
5
+
6
+
7
def create_dir_if_not_exists(filepath):
    """ Make sure that the directory which should contain `filepath` exists.

        filepath: str
            path to a file; only its directory part is created
            (including all the missing intermediate directories).

        Nothing happens when `filepath` has no directory part
        or when the directory path already exists.
    """
    target_dir = os.path.dirname(filepath)

    # A bare file name has no directory part, so there is nothing to create.
    if not target_dir:
        return

    if not os.path.exists(target_dir):
        # exist_ok covers the case when the directory gets created by
        # somebody else in between the check above and this call.
        os.makedirs(target_dir, exist_ok=True)
16
+
17
+
18
def filter_whitespaces(terms):
    """ Strip every term and drop those that become empty.

        terms: iterable of str
        returns: list of the stripped, non-empty terms in the original order.
    """
    stripped_terms = (term.strip() for term in terms)
    return [term for term in stripped_terms if term]
20
+
21
+
22
def split_by_whitespaces(text):
    """ Separate `text` into words, treating any run of whitespace
        characters (spaces, tabs, line breaks, ...) as a single separator.

        text: str
        returns: list of words; empty for an empty or whitespace-only text.
    """
    assert isinstance(text, str)
    words = text.split()
    return words
29
+
30
+
31
def progress_bar(iterable, total, desc="", unit="it"):
    """ Wrap `iterable` into a progress bar.

        The bar with the known amount of elements is used when `total`
        is provided, and the plain iteration-based one when `total` is None.
    """
    if total is None:
        return progress_bar_iter(iterable=iterable, desc=desc, unit=unit)

    return progress_bar_defined(iterable=iterable, total=total, desc=desc, unit=unit)
36
+
37
+
38
def progress_bar_conditional(iterable, condition_func, total, postfix_func=None, desc="", unit="it"):
    """ Yield the items of `iterable` while showing a progress bar that advances
        only for the items which satisfy `condition_func`.

        condition_func: func(item) -> bool
            the bar moves one step forward when it is truthy for the item.
        postfix_func: func(item) -> postfix or None
            optional; its result is passed to the `set_postfix` of the bar
            once the consumer asks for the element that follows `item`.
    """
    assert callable(condition_func)
    assert postfix_func is None or callable(postfix_func)

    # The bar wraps an artificial endless source of zeros, so it may be
    # advanced manually, independently from the items of `iterable`.
    def __endless_zeros():
        while True:
            yield 0

    bar = progress_bar(iterable=__endless_zeros(),
                       desc=desc, unit=unit, total=total)
    bar_steps = iter(bar)

    # Initialize with 0.
    next(bar_steps)

    for item in iterable:

        # Advance the bar only for the items that pass the condition.
        if condition_func(item):
            next(bar_steps)

        yield item

        # Optionally provide meta-information.
        if postfix_func is not None:
            bar.set_postfix(postfix_func(item))
67
+
68
+
69
def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it"):
    """ Wrap `iterable` into a tqdm bar with the known amount of elements.

        total:
            expected amount of elements, or None.
        miniters:
            divisor of `total`: the `miniters` option passed to tqdm
            is `total / miniters`, or None when `total` is None.
    """
    refresh_step = None if total is None else total / miniters

    return tqdm(iterable=iterable,
                total=total,
                desc=desc,
                unit=unit,
                miniters=refresh_step,
                ncols=120,
                position=0,
                leave=True)
78
+
79
+
80
def progress_bar_iter(iterable, desc="", unit='it'):
    """ Wrap `iterable` into a tqdm bar without passing the total amount of elements. """
    bar = tqdm(iterable=iterable,
               desc=desc,
               unit=unit,
               ncols=120,
               position=0,
               leave=True)
    return bar
87
+
88
+
89
def get_default_download_dir():
    """ Compose the path of the default directory for the downloaded data.

        Referred to the NLTK toolkit approach:
        https://github.com/nltk/nltk/blob/8e771679cee1b4a9540633cc3ea17f4421ffd6c0/nltk/downloader.py#L1051

        returns: str
            the `.arekit` folder inside of %APPDATA% (Windows, when this
            variable is set) or inside of the home directory of the user.
        raises: ValueError
            when the home directory of the user can not be resolved.
    """
    use_appdata = sys.platform == "win32" and "APPDATA" in os.environ

    if use_appdata:
        base_dir = os.environ["APPDATA"]
    else:
        base_dir = os.path.expanduser("~/")
        # The path stays as it is when the expansion fails.
        if base_dir == "~/":
            raise ValueError("Could not find a default download directory")

    return os.path.join(base_dir, ".arekit")
105
+
106
+
107
def download(dest_file_path, source_url):
    """ Simple http file downloader with a progress bar.

        Referred to https://github.com/nicolay-r/ner-bilstm-crf-tensorflow/blob/master/ner/utils.py

        dest_file_path: str
            where to store the downloaded content; the missing
            directories of this path are created (mode 0o755).
        source_url: str
            address of the resource to download.

        NOTE(review): the HTTP status is not checked, so the body of an error
        response is stored as well; consider `raise_for_status()` if callers
        are fine with the exception.
    """
    print('Downloading from {src} to {dest}'.format(src=source_url, dest=dest_file_path))
    sys.stdout.flush()

    # The path is made absolute first, so its directory part is never empty:
    # a bare file name used to end up in the failing `os.makedirs("")` call.
    dest_file_path = os.path.abspath(dest_file_path)

    # exist_ok makes the creation safe with respect to concurrent downloads.
    os.makedirs(os.path.dirname(dest_file_path), mode=0o755, exist_ok=True)

    # The context managers release the connection, the file
    # and the progress bar even when the download fails in the middle.
    with requests.get(source_url, stream=True) as r:
        total_length = int(r.headers.get('content-length', 0))

        with open(dest_file_path, 'wb') as f, \
                tqdm(total=total_length, unit='B', unit_scale=True) as pbar:
            for chunk in r.iter_content(chunk_size=32 * 1024):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,17 @@
1
+ from arekit.common.data.input.providers.sample.cropped import CroppedSampleRowProvider
2
+ from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
3
+ from arekit.contrib.bert.input.providers.text_pair import PairTextProvider
4
+
5
+
6
class CroppedBertSampleRowProvider(CroppedSampleRowProvider):
    """ CroppedSampleRowProvider which picks its text provider
        depending on whether the `text_b` template is given.
    """

    def __init__(self, crop_window_size, label_scaler, text_terms_mapper, text_b_template):
        """ text_b_template:
                None results in BaseSingleTextProvider; otherwise the value
                is passed as the `text_b_prompt` of PairTextProvider.
            The other arguments are forwarded to the chosen text provider
            (`text_terms_mapper`) and to the base class.
        """
        if text_b_template is None:
            text_provider = BaseSingleTextProvider(text_terms_mapper=text_terms_mapper)
        else:
            text_provider = PairTextProvider(text_b_prompt=text_b_template,
                                             text_terms_mapper=text_terms_mapper)

        super().__init__(crop_window_size=crop_window_size,
                         label_scaler=label_scaler,
                         text_provider=text_provider)