arekit 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. arekit/__init__.py +0 -0
  2. arekit/common/__init__.py +0 -0
  3. arekit/common/bound.py +48 -0
  4. arekit/common/context/__init__.py +0 -0
  5. arekit/common/context/terms_mapper.py +51 -0
  6. arekit/common/context/token.py +16 -0
  7. arekit/common/data/__init__.py +0 -0
  8. arekit/common/data/const.py +21 -0
  9. arekit/common/data/doc_provider.py +6 -0
  10. arekit/common/data/input/__init__.py +0 -0
  11. arekit/common/data/input/providers/__init__.py +0 -0
  12. arekit/common/data/input/providers/columns/__init__.py +0 -0
  13. arekit/common/data/input/providers/columns/base.py +9 -0
  14. arekit/common/data/input/providers/columns/sample.py +59 -0
  15. arekit/common/data/input/providers/const.py +3 -0
  16. arekit/common/data/input/providers/contents.py +9 -0
  17. arekit/common/data/input/providers/instances/__init__.py +0 -0
  18. arekit/common/data/input/providers/instances/base.py +14 -0
  19. arekit/common/data/input/providers/instances/multiple.py +27 -0
  20. arekit/common/data/input/providers/instances/single.py +8 -0
  21. arekit/common/data/input/providers/label/__init__.py +0 -0
  22. arekit/common/data/input/providers/label/base.py +24 -0
  23. arekit/common/data/input/providers/label/binary.py +11 -0
  24. arekit/common/data/input/providers/label/multiple.py +15 -0
  25. arekit/common/data/input/providers/rows/__init__.py +0 -0
  26. arekit/common/data/input/providers/rows/base.py +64 -0
  27. arekit/common/data/input/providers/rows/samples.py +227 -0
  28. arekit/common/data/input/providers/sample/__init__.py +0 -0
  29. arekit/common/data/input/providers/sample/cropped.py +43 -0
  30. arekit/common/data/input/providers/text/__init__.py +0 -0
  31. arekit/common/data/input/providers/text/single.py +49 -0
  32. arekit/common/data/input/repositories/__init__.py +0 -0
  33. arekit/common/data/input/repositories/base.py +68 -0
  34. arekit/common/data/input/repositories/sample.py +22 -0
  35. arekit/common/data/input/sample.py +66 -0
  36. arekit/common/data/input/terms_mapper.py +88 -0
  37. arekit/common/data/rows_fmt.py +82 -0
  38. arekit/common/data/rows_parser.py +43 -0
  39. arekit/common/data/storages/__init__.py +0 -0
  40. arekit/common/data/storages/base.py +109 -0
  41. arekit/common/data/views/__init__.py +0 -0
  42. arekit/common/data/views/samples.py +26 -0
  43. arekit/common/docs/__init__.py +0 -0
  44. arekit/common/docs/base.py +30 -0
  45. arekit/common/docs/entities_grouping.py +16 -0
  46. arekit/common/docs/entity.py +18 -0
  47. arekit/common/docs/objects_parser.py +37 -0
  48. arekit/common/docs/parsed/__init__.py +0 -0
  49. arekit/common/docs/parsed/base.py +101 -0
  50. arekit/common/docs/parsed/providers/__init__.py +0 -0
  51. arekit/common/docs/parsed/providers/base.py +68 -0
  52. arekit/common/docs/parsed/providers/base_pairs.py +51 -0
  53. arekit/common/docs/parsed/providers/entity_service.py +175 -0
  54. arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
  55. arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
  56. arekit/common/docs/parsed/service.py +31 -0
  57. arekit/common/docs/parsed/term_position.py +42 -0
  58. arekit/common/docs/parser.py +34 -0
  59. arekit/common/docs/sentence.py +14 -0
  60. arekit/common/entities/__init__.py +0 -0
  61. arekit/common/entities/base.py +51 -0
  62. arekit/common/entities/collection.py +72 -0
  63. arekit/common/entities/str_fmt.py +8 -0
  64. arekit/common/entities/types.py +9 -0
  65. arekit/common/experiment/__init__.py +0 -0
  66. arekit/common/experiment/api/__init__.py +0 -0
  67. arekit/common/experiment/api/base_samples_io.py +20 -0
  68. arekit/common/experiment/data_type.py +17 -0
  69. arekit/common/frames/__init__.py +0 -0
  70. arekit/common/frames/connotations/__init__.py +0 -0
  71. arekit/common/frames/connotations/descriptor.py +17 -0
  72. arekit/common/frames/connotations/provider.py +4 -0
  73. arekit/common/frames/text_variant.py +43 -0
  74. arekit/common/frames/variants/__init__.py +0 -0
  75. arekit/common/frames/variants/base.py +21 -0
  76. arekit/common/frames/variants/collection.py +60 -0
  77. arekit/common/labels/__init__.py +0 -0
  78. arekit/common/labels/base.py +19 -0
  79. arekit/common/labels/provider/__init__.py +0 -0
  80. arekit/common/labels/provider/base.py +7 -0
  81. arekit/common/labels/provider/constant.py +14 -0
  82. arekit/common/labels/scaler/__init__.py +0 -0
  83. arekit/common/labels/scaler/base.py +85 -0
  84. arekit/common/labels/scaler/sentiment.py +7 -0
  85. arekit/common/labels/scaler/single.py +10 -0
  86. arekit/common/labels/str_fmt.py +55 -0
  87. arekit/common/linkage/__init__.py +0 -0
  88. arekit/common/linkage/base.py +44 -0
  89. arekit/common/linkage/meta.py +23 -0
  90. arekit/common/linkage/opinions.py +9 -0
  91. arekit/common/linkage/text_opinions.py +22 -0
  92. arekit/common/log_utils.py +29 -0
  93. arekit/common/model/__init__.py +0 -0
  94. arekit/common/model/labeling/__init__.py +0 -0
  95. arekit/common/model/labeling/base.py +24 -0
  96. arekit/common/model/labeling/modes.py +8 -0
  97. arekit/common/model/labeling/single.py +24 -0
  98. arekit/common/opinions/__init__.py +0 -0
  99. arekit/common/opinions/annot/__init__.py +0 -0
  100. arekit/common/opinions/annot/algo/__init__.py +0 -0
  101. arekit/common/opinions/annot/algo/base.py +4 -0
  102. arekit/common/opinions/annot/algo/pair_based.py +99 -0
  103. arekit/common/opinions/annot/algo/predefined.py +16 -0
  104. arekit/common/opinions/annot/algo_based.py +55 -0
  105. arekit/common/opinions/annot/base.py +15 -0
  106. arekit/common/opinions/base.py +74 -0
  107. arekit/common/opinions/collection.py +150 -0
  108. arekit/common/opinions/enums.py +6 -0
  109. arekit/common/opinions/provider.py +4 -0
  110. arekit/common/opinions/writer.py +4 -0
  111. arekit/common/pipeline/__init__.py +0 -0
  112. arekit/common/pipeline/base.py +25 -0
  113. arekit/common/pipeline/context.py +36 -0
  114. arekit/common/pipeline/conts.py +2 -0
  115. arekit/common/pipeline/items/__init__.py +0 -0
  116. arekit/common/pipeline/items/base.py +12 -0
  117. arekit/common/pipeline/items/flatten.py +14 -0
  118. arekit/common/pipeline/items/handle.py +17 -0
  119. arekit/common/pipeline/items/iter.py +11 -0
  120. arekit/common/pipeline/items/map.py +11 -0
  121. arekit/common/pipeline/items/map_nested.py +13 -0
  122. arekit/common/synonyms/__init__.py +0 -0
  123. arekit/common/synonyms/base.py +151 -0
  124. arekit/common/synonyms/grouping.py +21 -0
  125. arekit/common/text/__init__.py +0 -0
  126. arekit/common/text/enums.py +12 -0
  127. arekit/common/text/parsed.py +42 -0
  128. arekit/common/text/parser.py +12 -0
  129. arekit/common/text/partitioning/__init__.py +0 -0
  130. arekit/common/text/partitioning/base.py +4 -0
  131. arekit/common/text/partitioning/str.py +36 -0
  132. arekit/common/text/partitioning/terms.py +35 -0
  133. arekit/common/text/stemmer.py +16 -0
  134. arekit/common/text_opinions/__init__.py +0 -0
  135. arekit/common/text_opinions/base.py +105 -0
  136. arekit/common/utils.py +129 -0
  137. arekit/contrib/__init__.py +0 -0
  138. arekit/contrib/bert/__init__.py +0 -0
  139. arekit/contrib/bert/input/__init__.py +0 -0
  140. arekit/contrib/bert/input/providers/__init__.py +0 -0
  141. arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
  142. arekit/contrib/bert/input/providers/text_pair.py +62 -0
  143. arekit/contrib/bert/terms/__init__.py +0 -0
  144. arekit/contrib/bert/terms/mapper.py +20 -0
  145. arekit/contrib/networks/__init__.py +0 -0
  146. arekit/contrib/networks/embedding.py +149 -0
  147. arekit/contrib/networks/embedding_io.py +18 -0
  148. arekit/contrib/networks/input/__init__.py +0 -0
  149. arekit/contrib/networks/input/const.py +6 -0
  150. arekit/contrib/networks/input/ctx_serialization.py +28 -0
  151. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  152. arekit/contrib/networks/input/embedding/matrix.py +29 -0
  153. arekit/contrib/networks/input/embedding/offsets.py +55 -0
  154. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  155. arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
  156. arekit/contrib/networks/input/providers/__init__.py +0 -0
  157. arekit/contrib/networks/input/providers/sample.py +129 -0
  158. arekit/contrib/networks/input/providers/term_connotation.py +23 -0
  159. arekit/contrib/networks/input/providers/text.py +24 -0
  160. arekit/contrib/networks/input/rows_parser.py +47 -0
  161. arekit/contrib/networks/input/term_types.py +13 -0
  162. arekit/contrib/networks/input/terms_mapping.py +60 -0
  163. arekit/contrib/networks/vectorizer.py +6 -0
  164. arekit/contrib/prompt/__init__.py +0 -0
  165. arekit/contrib/prompt/sample.py +61 -0
  166. arekit/contrib/source/__init__.py +0 -0
  167. arekit/contrib/source/brat/__init__.py +0 -0
  168. arekit/contrib/source/brat/annot.py +84 -0
  169. arekit/contrib/source/brat/doc.py +28 -0
  170. arekit/contrib/source/brat/entities/__init__.py +0 -0
  171. arekit/contrib/source/brat/entities/compound.py +13 -0
  172. arekit/contrib/source/brat/entities/entity.py +42 -0
  173. arekit/contrib/source/brat/entities/parser.py +53 -0
  174. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  175. arekit/contrib/source/brat/opinions/converter.py +19 -0
  176. arekit/contrib/source/brat/relation.py +32 -0
  177. arekit/contrib/source/brat/sentence.py +69 -0
  178. arekit/contrib/source/brat/sentences_reader.py +128 -0
  179. arekit/contrib/source/download.py +41 -0
  180. arekit/contrib/source/nerel/__init__.py +0 -0
  181. arekit/contrib/source/nerel/entities.py +55 -0
  182. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  183. arekit/contrib/source/nerel/folding/fixed.py +74 -0
  184. arekit/contrib/source/nerel/io_utils.py +62 -0
  185. arekit/contrib/source/nerel/labels.py +241 -0
  186. arekit/contrib/source/nerel/reader.py +46 -0
  187. arekit/contrib/source/nerel/utils.py +24 -0
  188. arekit/contrib/source/nerel/versions.py +12 -0
  189. arekit/contrib/source/nerelbio/__init__.py +0 -0
  190. arekit/contrib/source/nerelbio/io_utils.py +62 -0
  191. arekit/contrib/source/nerelbio/labels.py +265 -0
  192. arekit/contrib/source/nerelbio/reader.py +8 -0
  193. arekit/contrib/source/nerelbio/versions.py +8 -0
  194. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  195. arekit/contrib/source/ruattitudes/collection.py +36 -0
  196. arekit/contrib/source/ruattitudes/doc.py +51 -0
  197. arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
  198. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  199. arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
  200. arekit/contrib/source/ruattitudes/io_utils.py +56 -0
  201. arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
  202. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  203. arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
  204. arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
  205. arekit/contrib/source/ruattitudes/reader.py +268 -0
  206. arekit/contrib/source/ruattitudes/sentence.py +73 -0
  207. arekit/contrib/source/ruattitudes/synonyms.py +17 -0
  208. arekit/contrib/source/ruattitudes/text_object.py +59 -0
  209. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  210. arekit/contrib/source/rusentiframes/collection.py +157 -0
  211. arekit/contrib/source/rusentiframes/effect.py +24 -0
  212. arekit/contrib/source/rusentiframes/io_utils.py +19 -0
  213. arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
  214. arekit/contrib/source/rusentiframes/polarity.py +35 -0
  215. arekit/contrib/source/rusentiframes/role.py +15 -0
  216. arekit/contrib/source/rusentiframes/state.py +24 -0
  217. arekit/contrib/source/rusentiframes/types.py +42 -0
  218. arekit/contrib/source/rusentiframes/value.py +2 -0
  219. arekit/contrib/source/rusentrel/__init__.py +0 -0
  220. arekit/contrib/source/rusentrel/const.py +3 -0
  221. arekit/contrib/source/rusentrel/docs_reader.py +51 -0
  222. arekit/contrib/source/rusentrel/entities.py +26 -0
  223. arekit/contrib/source/rusentrel/io_utils.py +125 -0
  224. arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
  225. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  226. arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
  227. arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
  228. arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
  229. arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
  230. arekit/contrib/source/rusentrel/synonyms.py +17 -0
  231. arekit/contrib/source/sentinerel/__init__.py +0 -0
  232. arekit/contrib/source/sentinerel/entities.py +52 -0
  233. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  234. arekit/contrib/source/sentinerel/folding/factory.py +31 -0
  235. arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
  236. arekit/contrib/source/sentinerel/io_utils.py +87 -0
  237. arekit/contrib/source/sentinerel/labels.py +53 -0
  238. arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
  239. arekit/contrib/source/sentinerel/reader.py +42 -0
  240. arekit/contrib/source/synonyms/__init__.py +0 -0
  241. arekit/contrib/source/synonyms/utils.py +19 -0
  242. arekit/contrib/source/zip_utils.py +47 -0
  243. arekit/contrib/utils/__init__.py +0 -0
  244. arekit/contrib/utils/bert/__init__.py +0 -0
  245. arekit/contrib/utils/bert/samplers.py +17 -0
  246. arekit/contrib/utils/connotations/__init__.py +0 -0
  247. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
  248. arekit/contrib/utils/data/__init__.py +0 -0
  249. arekit/contrib/utils/data/contents/__init__.py +0 -0
  250. arekit/contrib/utils/data/contents/opinions.py +37 -0
  251. arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
  252. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  253. arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
  254. arekit/contrib/utils/data/readers/__init__.py +0 -0
  255. arekit/contrib/utils/data/readers/base.py +7 -0
  256. arekit/contrib/utils/data/readers/csv_pd.py +38 -0
  257. arekit/contrib/utils/data/readers/jsonl.py +15 -0
  258. arekit/contrib/utils/data/service/__init__.py +0 -0
  259. arekit/contrib/utils/data/service/balance.py +50 -0
  260. arekit/contrib/utils/data/storages/__init__.py +0 -0
  261. arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
  262. arekit/contrib/utils/data/storages/pandas_based.py +123 -0
  263. arekit/contrib/utils/data/storages/row_cache.py +48 -0
  264. arekit/contrib/utils/data/writers/__init__.py +0 -0
  265. arekit/contrib/utils/data/writers/base.py +27 -0
  266. arekit/contrib/utils/data/writers/csv_native.py +63 -0
  267. arekit/contrib/utils/data/writers/csv_pd.py +40 -0
  268. arekit/contrib/utils/data/writers/json_opennre.py +132 -0
  269. arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
  270. arekit/contrib/utils/download.py +77 -0
  271. arekit/contrib/utils/embeddings/__init__.py +0 -0
  272. arekit/contrib/utils/embeddings/rusvectores.py +58 -0
  273. arekit/contrib/utils/embeddings/tokens.py +30 -0
  274. arekit/contrib/utils/entities/__init__.py +0 -0
  275. arekit/contrib/utils/entities/filter.py +7 -0
  276. arekit/contrib/utils/entities/formatters/__init__.py +0 -0
  277. arekit/contrib/utils/entities/formatters/str_display.py +11 -0
  278. arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
  279. arekit/contrib/utils/io_utils/__init__.py +0 -0
  280. arekit/contrib/utils/io_utils/embedding.py +72 -0
  281. arekit/contrib/utils/io_utils/opinions.py +37 -0
  282. arekit/contrib/utils/io_utils/samples.py +79 -0
  283. arekit/contrib/utils/io_utils/utils.py +39 -0
  284. arekit/contrib/utils/lexicons/__init__.py +0 -0
  285. arekit/contrib/utils/lexicons/lexicon.py +41 -0
  286. arekit/contrib/utils/lexicons/relation.py +42 -0
  287. arekit/contrib/utils/lexicons/rusentilex.py +37 -0
  288. arekit/contrib/utils/nn/__init__.py +0 -0
  289. arekit/contrib/utils/nn/rows.py +83 -0
  290. arekit/contrib/utils/np_utils/__init__.py +0 -0
  291. arekit/contrib/utils/np_utils/embedding.py +22 -0
  292. arekit/contrib/utils/np_utils/npz_utils.py +13 -0
  293. arekit/contrib/utils/np_utils/vocab.py +20 -0
  294. arekit/contrib/utils/pipelines/__init__.py +0 -0
  295. arekit/contrib/utils/pipelines/items/__init__.py +0 -0
  296. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  297. arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
  298. arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
  299. arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
  300. arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
  301. arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
  302. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
  303. arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
  304. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
  305. arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
  306. arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
  307. arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
  308. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  309. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  310. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
  311. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
  312. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
  313. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  314. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
  315. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
  316. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
  317. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  318. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
  319. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
  320. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
  321. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  322. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
  323. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
  324. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  325. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
  326. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
  327. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
  328. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
  329. arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
  330. arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
  331. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
  332. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
  333. arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
  334. arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
  335. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
  336. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
  337. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
  338. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
  339. arekit/contrib/utils/processing/__init__.py +0 -0
  340. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  341. arekit/contrib/utils/processing/languages/mods.py +12 -0
  342. arekit/contrib/utils/processing/languages/pos.py +23 -0
  343. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  344. arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
  345. arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
  346. arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
  347. arekit/contrib/utils/processing/languages/ru/number.py +23 -0
  348. arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
  349. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  350. arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
  351. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  352. arekit/contrib/utils/processing/pos/base.py +12 -0
  353. arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
  354. arekit/contrib/utils/processing/pos/russian.py +10 -0
  355. arekit/contrib/utils/processing/text/__init__.py +0 -0
  356. arekit/contrib/utils/processing/text/tokens.py +127 -0
  357. arekit/contrib/utils/resources.py +25 -0
  358. arekit/contrib/utils/serializer.py +43 -0
  359. arekit/contrib/utils/sources/__init__.py +0 -0
  360. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  361. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  362. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
  363. arekit/contrib/utils/synonyms/__init__.py +0 -0
  364. arekit/contrib/utils/synonyms/simple.py +15 -0
  365. arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
  366. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  367. arekit/contrib/utils/vectorizers/bpe.py +93 -0
  368. arekit/contrib/utils/vectorizers/random_norm.py +39 -0
  369. arekit/download_data.py +11 -0
  370. arekit-0.24.0.dist-info/LICENSE +21 -0
  371. arekit-0.24.0.dist-info/METADATA +23 -0
  372. arekit-0.24.0.dist-info/RECORD +374 -0
  373. arekit-0.24.0.dist-info/WHEEL +5 -0
  374. arekit-0.24.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,123 @@
1
+ import importlib
2
+
3
+ import numpy as np
4
+
5
+ from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
6
+ from arekit.common.data.storages.base import BaseRowsStorage, logger
7
+ from arekit.common.utils import progress_bar_iter
8
+
9
+
10
class PandasBasedRowsStorage(BaseRowsStorage):
    """ Storage Kernel functions implementation,
        based on the pandas DataFrames.
    """

    def __init__(self, df=None):
        """ df: pandas.DataFrame or None
                frame to wrap. Every other method reads `self._df`,
                so call `init_empty` first when no frame is given.
        """
        self._df = df

    @property
    def DataFrame(self):
        # TODO. Temporary hack, however this should be removed in future.
        return self._df

    @staticmethod
    def __create_empty(cols_with_types):
        """ Creates a zero-row DataFrame with the given typed columns.
            cols_with_types: list of pairs ("name", dtype)
        """
        assert(isinstance(cols_with_types, list))
        # A zero-length structured array carries the column names and dtypes.
        data = np.empty(0, dtype=np.dtype(cols_with_types))
        # pandas is imported lazily, only at the point a frame is created.
        pd = importlib.import_module("pandas")
        return pd.DataFrame(data)

    def __filter(self, column_name, value):
        """ Returns the sub-frame of rows whose `column_name` equals `value`.
        """
        return self._df[self._df[column_name] == value]

    @staticmethod
    def __iter_rows_core(df):
        """ Yields (row_index, row) pairs as produced by `df.iterrows()`.
        """
        for row_index, row in df.iterrows():
            yield row_index, row

    def __fill_with_blank_rows(self, row_id_column_name, rows_count):
        """ Allocates `rows_count` rows at once by writing consecutive ids
            into `row_id_column_name` and turning that column into the index.
        """
        assert(isinstance(row_id_column_name, str))
        assert(isinstance(rows_count, int))
        self._df[row_id_column_name] = list(range(rows_count))
        self._df.set_index(row_id_column_name, inplace=True)

    # region protected methods

    def iter_column_names(self):
        return iter(self._df.columns)

    def iter_column_types(self):
        return iter(self._df.dtypes)

    def _set_row_value(self, row_ind, column, value):
        # Label-based scalar assignment of a single cell.
        self._df.at[row_ind, column] = value

    def _iter_rows(self):
        """ Yields (row_index, dict) pairs, one per row of the frame.
        """
        for row_index, row in self.__iter_rows_core(self._df):
            yield row_index, row.to_dict()

    def _get_rows_count(self):
        return len(self._df)

    # endregion

    # region public methods

    def fill(self, iter_rows_func, columns_provider, row_handler=None, rows_count=None, desc=""):
        """ NOTE: We provide the rows counting which is required
            in order to know an expected amount of rows in advance
            due to the specifics of the pandas memory allocation
            for the DataFrames.
            The latter allows us to avoid rows appending, which
            may significantly affect performance once the size
            of DataFrame becomes relatively large.

            iter_rows_func: callable
                called with a single boolean argument and expected
                to return an iterable of rows.
            columns_provider: BaseColumnsProvider
            row_handler: forwarded to the base implementation as is.
            rows_count: int or None
                amount of rows known in advance; when None, it is calculated
                here with an extra pass over `iter_rows_func(True)`.
            desc: str
                reason shown in the counting progress bar.
        """
        assert(isinstance(columns_provider, BaseColumnsProvider))
        assert(rows_count is None or isinstance(rows_count, int))

        # Count only when the caller did not provide the amount of rows:
        # the counting is a complete additional pass over the data.
        if rows_count is None:
            logger.info("Rows calculation process started. [Required by Pandas-Based storage kernel]")
            # NOTE(review): `True` is presumably a counting-only (idle) mode flag -- confirm against callers.
            logged_rows_it = progress_bar_iter(
                iterable=iter_rows_func(True),
                desc="Calculating rows count ({reason})".format(reason=desc),
                unit="rows")
            rows_count = sum(1 for _ in logged_rows_it)

        logger.info("Filling with blank rows: {}".format(rows_count))
        self.__fill_with_blank_rows(row_id_column_name=columns_provider.ROW_ID,
                                    rows_count=rows_count)
        logger.info("Completed!")

        super(PandasBasedRowsStorage, self).fill(iter_rows_func=iter_rows_func,
                                                 row_handler=row_handler,
                                                 columns_provider=columns_provider,
                                                 rows_count=rows_count)

    def get_row(self, row_index):
        """ Returns the row at the integer position `row_index`.
        """
        return self._df.iloc[row_index]

    def get_cell(self, row_index, column_name):
        """ Returns the `column_name` value of the row at position `row_index`.
        """
        return self._df.iloc[row_index][column_name]

    def iter_column_values(self, column_name, dtype=None):
        """ Returns the column values, casted to `dtype` when it is given.
        """
        values = self._df[column_name]
        if dtype is None:
            return values
        return values.astype(dtype)

    def find_by_value(self, column_name, value):
        return self.__filter(column_name=column_name, value=value)

    def init_empty(self, columns_provider):
        """ Replaces the wrapped frame with an empty one that has
            the columns and types declared by `columns_provider`.
        """
        cols_with_types = columns_provider.get_columns_list_with_types()
        self._df = self.__create_empty(cols_with_types)

    def iter_shuffled(self):
        """ Iterates (row_index, row) pairs over a randomly reordered
            copy of the whole frame (`sample(frac=1)`).
        """
        shuffled_df = self._df.sample(frac=1)
        return self.__iter_rows_core(shuffled_df)

    def free(self):
        """ Drops the reference to the wrapped frame.
        """
        # Reset instead of `del`: the attribute keeps existing, so a repeated
        # `free()` call or a later `DataFrame` access does not raise AttributeError.
        self._df = None
        super(PandasBasedRowsStorage, self).free()

    # endregion
@@ -0,0 +1,48 @@
1
+ from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
2
+ from arekit.common.data.storages.base import BaseRowsStorage
3
+
4
+
5
class RowCacheStorage(BaseRowsStorage):
    """ Row Caching storage kernel, based on python dictionary.
        Only the values of a single row are kept at a time:
        the cache is cleared once filling of a row begins.
    """

    def __init__(self, force_collect_columns=None):
        """ This is a particular/related solution for the following issue:
            https://github.com/nicolay-r/AREkit/issues/464
            force_collect_columns: list
                columns that supposed to be additionally considered in output.
        """
        assert(isinstance(force_collect_columns, list) or force_collect_columns is None)
        self.__f = None
        self.__row_cache = {}
        self.__column_names = []
        self.__column_types = []
        self.__force_collect_columns = [] if force_collect_columns is None else force_collect_columns

    @property
    def RowCache(self):
        """ Dictionary (column name -> value) of the row that is currently cached.
        """
        return self.__row_cache

    def init_empty(self, columns_provider):
        """ Registers the columns declared by `columns_provider`,
            followed by the forced columns that are not declared yet.
        """
        assert (isinstance(columns_provider, BaseColumnsProvider))

        # Reset both lists: clearing only the names leaves stale types behind
        # and makes types drift out of sync with names on re-initialization.
        self.__column_names.clear()
        self.__column_types.clear()
        for col_name, col_type in columns_provider.get_columns_list_with_types():
            self.__column_names.append(col_name)
            self.__column_types.append(col_type)

        # Expand with columns that are forced to be provided.
        # NOTE: forced columns have no declared type, hence nothing
        # is appended to the list of types for them.
        existed_set = set(self.__column_names)
        self.__column_names += [c for c in self.__force_collect_columns if c not in existed_set]

    def iter_column_names(self):
        return iter(self.__column_names)

    def iter_column_types(self):
        return iter(self.__column_types)

    def _set_row_value(self, row_ind, column, value):
        # The cache holds a single row, so `row_ind` is not used as a part of the key.
        self.__row_cache[column] = value

    def _begin_filling_row(self, row_ind):
        # Drop values of the previously filled row.
        self.__row_cache.clear()
File without changes
@@ -0,0 +1,27 @@
1
class BaseWriter:
    """ Common interface of the storage writers.

        `extension` and `write_all` have to be provided by a subclass,
        while the line-by-line hooks (`open_target`, `commit_line`,
        `close_target`) do nothing unless they are overridden.
    """

    def extension(self):
        """ Expected output extension type.
        """
        raise NotImplementedError()

    def open_target(self, target):
        """ Hook called to prepare `target` for writing; no-op here.
        """
        return None

    def commit_line(self, storage):
        """ Hook called to write the current content of `storage`; no-op here.
        """
        return None

    def close_target(self):
        """ Hook called to finalize the opened target; no-op here.
        """
        return None

    def write_all(self, storage, target):
        """ Performs the writing process of the whole storage.
            The implementation and support of the related operation
            may vary and depends on the nature of the storage, which
            briefly might keep all the data in memory (available)
            or cache only temporary information (unavailable)

            storage: BaseRowsStorage
            target: str
        """
        raise NotImplementedError()
@@ -0,0 +1,63 @@
1
+ import csv
2
+ import os
3
+ from os.path import dirname
4
+
5
+ from arekit.common.data.storages.base import BaseRowsStorage
6
+ from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
7
+ from arekit.contrib.utils.data.writers.base import BaseWriter
8
+
9
+
10
class NativeCsvWriter(BaseWriter):
    """ CSV writer based on the `csv` module of the standard library.
        Supports line-by-line writing (`open_target`, `commit_line`,
        `close_target`) as well as writing of the whole storage (`write_all`).
    """

    def __init__(self, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL, header=True):
        """ delimiter, quotechar, quoting: passed to `csv.writer` as is.
            header: bool
                whether the column names are written before
                the first line in the line-by-line mode.
        """
        self.__target_f = None
        self.__writer = None
        self.__create_writer_func = lambda f: csv.writer(
            f, delimiter=delimiter, quotechar=quotechar, quoting=quoting)
        self.__header = header
        self.__header_written = None

    def extension(self):
        return ".csv"

    @staticmethod
    def __iter_storage_column_names(storage):
        """ Iter only those columns that existed in storage.
        """
        for col_name in storage.iter_column_names():
            if col_name in storage.RowCache:
                yield col_name

    def open_target(self, target):
        """ Opens `target` for writing, creating its parent directory when needed.
        """
        # `dirname` is empty for a bare filename and `os.makedirs("")` raises
        # FileNotFoundError, so directories are created only when there is one.
        target_dir = dirname(target)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        # The csv module requires files to be opened with newline="",
        # otherwise an extra "\r" is written on platforms that use "\r\n".
        self.__target_f = open(target, "w", newline="")
        self.__writer = self.__create_writer_func(self.__target_f)
        # With the header disabled, it is treated as already written.
        self.__header_written = not self.__header

    def close_target(self):
        """ Closes the target opened by `open_target`; does nothing when no target is opened.
        """
        if self.__target_f is not None:
            self.__target_f.close()
        # Forget the closed file, so `commit_line` fails on its assertion
        # rather than on writing into the closed file.
        self.__target_f = None
        self.__writer = None

    def commit_line(self, storage):
        """ Writes the row cached in `storage` as a single CSV line,
            preceded by the header line on the first call when it is enabled.
        """
        assert(isinstance(storage, RowCacheStorage))
        assert(self.__writer is not None)

        column_names = list(self.__iter_storage_column_names(storage))

        if not self.__header_written:
            self.__writer.writerow(column_names)
            self.__header_written = True

        self.__writer.writerow([storage.RowCache[col_name] for col_name in column_names])

    def write_all(self, storage, target):
        """ Writes all the `storage` rows
            into the `target` filepath, formatted as CSV.
            Neither the header nor missing directories are created here.
        """
        assert(isinstance(storage, BaseRowsStorage))

        # newline="" is required by the csv module (see `open_target`).
        with open(target, "w", newline="") as f:
            writer = self.__create_writer_func(f)
            for _, row in storage:
                # NOTE(review): iterating `row` yields keys when it is a dict,
                # i.e. column names rather than values -- confirm what iteration
                # over `storage` provides before relying on this output.
                writer.writerow(list(row))
@@ -0,0 +1,40 @@
1
+ import logging
2
+
3
+ from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
4
+ from arekit.common.utils import create_dir_if_not_exists
5
+ from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
6
+ from arekit.contrib.utils.data.writers.base import BaseWriter
7
+
8
+ # Module-level logger named after this module.
+ logger = logging.getLogger(__name__)
9
+ # NOTE(review): executed at import time; `basicConfig` configures the root logger as a module-level side effect.
+ logging.basicConfig(level=logging.INFO)
10
+
11
+
12
class PandasCsvWriter(BaseWriter):
    """ Writer of the pandas-based storages into
        gzip-compressed, tab-separated files.
    """

    def __init__(self, write_header):
        """ write_header: passed as the `header` argument of `DataFrame.to_csv`.
        """
        super().__init__()
        self.__write_header = write_header

    def extension(self):
        return ".tsv.gz"

    def write_all(self, storage, target):
        """ Saves the DataFrame of the `storage` into the `target` filepath.
            The row id column is not exported.

            storage: PandasBasedRowsStorage
            target: str
        """
        assert isinstance(storage, PandasBasedRowsStorage)
        assert isinstance(target, str)

        create_dir_if_not_exists(target)

        # Temporary hack, remove it in future.
        frame = storage.DataFrame

        logger.info("Saving... {length}: {filepath}".format(length=len(storage), filepath=target))

        # Everything except the row identifier goes into the output.
        exported_columns = [name for name in frame.columns
                            if name != BaseColumnsProvider.ROW_ID]
        frame.to_csv(target,
                     sep='\t',
                     encoding='utf-8',
                     columns=exported_columns,
                     index=False,
                     float_format="%.0f",
                     compression='gzip',
                     header=self.__write_header)

        logger.info("Saving completed!")
@@ -0,0 +1,132 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ from os.path import dirname
5
+
6
+ from arekit.common.data import const
7
+ from arekit.common.data.storages.base import BaseRowsStorage
8
+ from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
9
+ from arekit.contrib.utils.data.writers.base import BaseWriter
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class OpenNREJsonWriter(BaseWriter):
    """ This is a bag-based writer for the samples.
        Project page: https://github.com/thunlp/OpenNRE

        Every bag presented as follows:
            {
                'text' or 'token': ...,
                'h': {'pos': [start, end], 'id': ... },
                't': {'pos': [start, end], 'id': ... },
                'id': "id_of_the_text_opinion"
            }

        In terms of the linked opinions (i0, i1, etc.) we consider id of the first opinion in linkage.
        During the dataset reading stage via OpenNRE, these linkages are automatically grouped into bags.
    """

    def __init__(self, text_columns, encoding="utf-8", na_value="NA", keep_extra_columns=True,
                 skip_extra_existed=True):
        """ text_columns: list
                column names that expected to be joined into a single (token) column.
            encoding: str
                encoding of the output file.
            na_value: str
                value of the `relation` field for the rows without the label column.
            keep_extra_columns: bool
                whether the row columns that are not a part of the formatted
                record should be copied into it.
            skip_extra_existed: bool
                if False, an exception is raised for a row column that is already
                present in the formatted record or belongs to `text_columns`;
                otherwise such columns are silently skipped.
        """
        assert(isinstance(text_columns, list))
        assert(isinstance(encoding, str))
        self.__text_columns = text_columns
        self.__encoding = encoding
        self.__target_f = None
        self.__keep_extra_columns = keep_extra_columns
        self.__na_value = na_value
        self.__skip_extra_existed = skip_extra_existed

    def extension(self):
        """ Suffix of the files produced by this writer.
        """
        return ".jsonl"

    @staticmethod
    def __format_row(row, na_value, text_columns, keep_extra_columns, skip_extra_existed):
        """ Formatting that is compatible with the OpenNRE.

            row: mapping-like
                column name -> value; must provide the ID, S_IND, T_IND and
                OPINION_ID columns (see `const`).
            returns: dict
                JSON-serializable record of the sample.
            raises: Exception
                when `keep_extra_columns` is set, `skip_extra_existed` is not,
                and a row column clashes with the formatted record keys or
                with `text_columns`.
        """
        assert(isinstance(na_value, str))

        sample_id = row[const.ID]
        s_ind = int(row[const.S_IND])
        t_ind = int(row[const.T_IND])
        bag_id = str(row[const.OPINION_ID])

        # Gather tokens from every text column presented in the row.
        tokens = []
        for text_col in text_columns:
            if text_col in row:
                tokens.extend(row[text_col].split())

        # Filtering JSON row.
        formatted_data = {
            "id": bag_id,
            "id_orig": sample_id,
            "token": tokens,
            # Both entities are described as single-token spans.
            "h": {"pos": [s_ind, s_ind + 1], "id": str(bag_id + "s")},
            "t": {"pos": [t_ind, t_ind + 1], "id": str(bag_id + "t")},
            "relation": str(int(row[const.LABEL_UINT])) if const.LABEL_UINT in row else na_value
        }

        # Register extra fields (optionally).
        if keep_extra_columns:
            for key, value in row.items():
                if key not in formatted_data and key not in text_columns:
                    formatted_data[key] = value
                elif not skip_extra_existed:
                    raise Exception(f"key `{key}` is already exist in formatted data "
                                    f"or a part of the text columns list: {text_columns}")

        return formatted_data

    @staticmethod
    def __make_parent_dir(target):
        """ Create the parent directory of the `target` filepath (if any).
        """
        parent_dir = dirname(target)
        # `dirname` is empty for a bare filename, and makedirs("") fails.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    def open_target(self, target):
        """ Open the `target` filepath for the line-by-line writing
            performed by `commit_line`.
        """
        self.__make_parent_dir(target)
        # The same encoding as in `write_all`: records are dumped with
        # ensure_ascii=False, so the platform default encoding is not reliable.
        self.__target_f = open(target, "w", encoding=self.__encoding)

    def close_target(self):
        """ Close the file opened by `open_target`; does nothing when
            there is no opened file.
        """
        if self.__target_f is None:
            return
        self.__target_f.close()
        self.__target_f = None

    def commit_line(self, storage):
        """ Write the row cached in the `storage` as a single JSON line
            into the file opened by `open_target`.

            storage: RowCacheStorage
        """
        assert(isinstance(storage, RowCacheStorage))

        # Collect existed columns.
        row_data = {}
        for col_name in storage.iter_column_names():
            if col_name not in storage.RowCache:
                continue
            row_data[col_name] = storage.RowCache[col_name]

        bag = self.__format_row(row_data, text_columns=self.__text_columns,
                                keep_extra_columns=self.__keep_extra_columns,
                                na_value=self.__na_value,
                                skip_extra_existed=self.__skip_extra_existed)

        self.__write_bag(bag=bag, json_file=self.__target_f)

    @staticmethod
    def __write_bag(bag, json_file):
        """ Dump the `bag` as a compact JSON document followed by a line break.
        """
        assert(isinstance(bag, dict))
        json.dump(bag, json_file, separators=(",", ":"), ensure_ascii=False)
        json_file.write("\n")

    def write_all(self, storage, target):
        """ Write every row of the `storage` into the `target` filepath,
            one JSON document per line.

            storage: BaseRowsStorage
                iterating it yields (row_index, row) pairs.
            target: str
                output filepath.
        """
        assert(isinstance(storage, BaseRowsStorage))
        assert(isinstance(target, str))

        logger.info("Saving... {rows}: {filepath}".format(rows=(len(storage)), filepath=target))

        self.__make_parent_dir(target)
        with open(target, "w", encoding=self.__encoding) as json_file:
            for _, row in storage:
                bag = self.__format_row(row, text_columns=self.__text_columns,
                                        keep_extra_columns=self.__keep_extra_columns,
                                        na_value=self.__na_value,
                                        skip_extra_existed=self.__skip_extra_existed)
                self.__write_bag(bag=bag, json_file=json_file)

        logger.info("Saving completed!")
@@ -0,0 +1,110 @@
1
+ import os
2
+ import sqlite3
3
+ from os.path import dirname
4
+
5
+ from arekit.common.data import const
6
+ from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
7
+ from arekit.contrib.utils.data.writers.base import BaseWriter
8
+
9
+
10
class SQliteWriter(BaseWriter):
    """ Line-by-line writer of the storage rows into a table of an SQLite database.
    """

    def __init__(self, table_name="contents", index_column_names=None, skip_existed=False, clear_table=True):
        """ table_name: str
                name of the table to be filled.
            index_column_names: list or None
                column names should be considered to build an index;
                if None, the default 'const.ID' will be considered for row indexation.
            skip_existed: bool
                whether a row with the already stored id should be skipped.
            clear_table: bool
                whether the table content should be removed on initialization.
        """
        assert (isinstance(index_column_names, list) or index_column_names is None)
        self.__index_column_names = index_column_names if index_column_names is not None else [const.ID]
        self.__table_name = table_name
        self.__conn = None
        self.__cur = None
        self.__need_init_table = True
        self.__origin_column_names = None
        self.__skip_existed = skip_existed
        self.__clear_table = clear_table

    def extension(self):
        """ Suffix of the files produced by this writer.
        """
        return ".sqlite"

    @staticmethod
    def __iter_storage_column_names(storage):
        """ Iter only those columns that existed in storage.
            Yields (column name, column type) pairs.
        """
        assert (isinstance(storage, RowCacheStorage))
        for col_name, col_type in zip(storage.iter_column_names(), storage.iter_column_types()):
            if col_name in storage.RowCache:
                yield col_name, col_type

    def __init_table(self, column_data):
        """ Prepare the table and its index for the given (name, type) column pairs.

            NOTE: identifiers (table, column and index names) can not be bound as
            SQL parameters, so they are formatted into the statements as they are.
        """
        # Compose column name with the related SQLITE type.
        column_types = ",".join([" ".join([col_name, self.type_to_sqlite(col_type)])
                                 for col_name, col_type in column_data])
        # Create table if not exists.
        self.__cur.execute(f"CREATE TABLE IF NOT EXISTS {self.__table_name}({column_types})")
        # Table exists, however we may optionally remove the content from it.
        if self.__clear_table:
            self.__cur.execute(f"DELETE FROM {self.__table_name};")
        # Create index (a plain one, i.e. without the UNIQUE constraint).
        index_name = f"i_{self.__table_name}_id"
        self.__cur.execute(f"DROP INDEX IF EXISTS {index_name};")
        self.__cur.execute("CREATE INDEX IF NOT EXISTS {index} ON {table}({columns})".format(
            index=index_name,
            table=self.__table_name,
            columns=", ".join(self.__index_column_names)
        ))
        self.__origin_column_names = [col_name for col_name, _ in column_data]

    @staticmethod
    def type_to_sqlite(col_type):
        """ This is a simple function that provides conversion from the
            base numpy types to SQLITE.
            NOTE: this method represent a quick implementation for supporting
            types, however it is far away from the generalized implementation.
        """
        if isinstance(col_type, str):
            if 'int' in col_type:
                return 'INTEGER'

        return "TEXT"

    def open_target(self, target):
        """ Connect to the database stored at the `target` filepath.
        """
        target_dir = dirname(target)
        # `dirname` is empty for a bare filename, and makedirs("") fails.
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        self.__conn = sqlite3.connect(target)
        self.__cur = self.__conn.cursor()

    def commit_line(self, storage):
        """ Insert the row cached in the `storage` into the table and commit it.
            The table is initialized on the first call after `open_target`.

            storage: RowCacheStorage
        """
        assert (isinstance(storage, RowCacheStorage))

        column_data = list(self.__iter_storage_column_names(storage))

        if self.__need_init_table:
            self.__init_table(column_data)
            self.__need_init_table = False

        # Check whether the related row is already exist in SQLITE database.
        # The lookup is only needed when existing rows have to be skipped.
        if self.__skip_existed:
            row_id = storage.RowCache[const.ID]
            # The id is bound as a parameter rather than formatted into the
            # statement, so values with quotes can not break the query.
            top_row = self.__cur.execute(
                f"SELECT EXISTS(SELECT 1 FROM {self.__table_name} WHERE id=?);",
                (row_id,))
            if top_row.fetchone()[0] == 1:
                return

        line_data = [storage.RowCache[col_name] for col_name, _ in column_data]
        parameters = ",".join(["?"] * len(line_data))

        # Values are inserted positionally, so the amount of them
        # must match the columns the table was initialized with.
        assert (len(self.__origin_column_names) == len(line_data))

        self.__cur.execute(
            f"INSERT OR REPLACE INTO {self.__table_name} VALUES ({parameters})",
            tuple(line_data))

        self.__conn.commit()

    def close_target(self):
        """ Reset the writer state and close the connection (if opened).
        """
        self.__cur = None
        self.__origin_column_names = None
        self.__need_init_table = True
        if self.__conn is not None:
            self.__conn.close()
            self.__conn = None

    def write_all(self, storage, target):
        """ Does nothing: the method body is empty, rows are written
            via `open_target` / `commit_line` / `close_target`.
        """
        pass
@@ -0,0 +1,77 @@
1
+ import os
2
+ import tarfile
3
+ from os.path import join, exists
4
+
5
+ from arekit.common import utils
6
+ from arekit.contrib.utils.np_utils.embedding import NpzEmbeddingHelper
7
+ from arekit.contrib.utils.np_utils.vocab import VocabRepositoryUtils
8
+
9
+ NEWS_MYSTEM_SKIPGRAM_1000_20_2015 = "news_mystem_skipgram_1000_20_2015.tar.gz"
10
+
11
+
12
def __get_resource(local_name, check_existance=False, download_if_missed=False):
    """ Compose the filepath of the `local_name` resource inside
        the default download directory.

        local_name: str
            name of the resource file.
        check_existance: bool
            whether the presence of the file should be verified.
        download_if_missed: bool
            whether `download()` should be called for the missed file;
            considered only when `check_existance` is set.
        returns: str
            filepath of the resource.
        raises: Exception
            when the existence check is requested and the file is missed
            (after the optional downloading attempt).
    """
    assert isinstance(local_name, str)
    resource_path = join(utils.get_default_download_dir(), local_name)

    # Nothing to verify, or the resource is already in place.
    if not check_existance or exists(resource_path):
        return resource_path

    if not download_if_missed:
        raise Exception("Resource could not be found: {}".format(resource_path))

    download()
    # We try to get the resource again but won't attempt to download it again.
    __get_resource(local_name, check_existance=check_existance, download_if_missed=False)

    return resource_path
25
+
26
+
27
def __get_embedding_dir(filepath):
    """ Path of the directory related to the archive `filepath`:
        the same string with every ".tar.gz" occurrence removed.
    """
    archive_suffix = ".tar.gz"
    return filepath.replace(archive_suffix, "")
29
+
30
+
31
def load_embedding_and_vocab(local_name, check_existance=False, download_if_missed=False):
    """ Load the embedding and the vocabulary from the directory
        related to the `local_name` archive.

        local_name: str
            name of the ".tar.gz" resource.
        check_existance, download_if_missed: bool
            forwarded to the resource lookup.
        returns: tuple
            (embedding, vocab), read from the "embedding.npz" and
            "vocab.txt" files of the directory respectively.
    """
    archive_path = __get_resource(local_name,
                                  check_existance=check_existance,
                                  download_if_missed=download_if_missed)
    embedding_dir = __get_embedding_dir(archive_path)

    embedding_path = os.path.join(embedding_dir, "embedding.npz")
    vocab_path = os.path.join(embedding_dir, "vocab.txt")

    return NpzEmbeddingHelper.load_embedding(embedding_path), VocabRepositoryUtils.load(vocab_path)
38
+
39
+
40
def _is_within_directory(directory, target):
    """ Check whether the `target` path is located inside the `directory`
        (or is the directory itself).

        The comparison is performed on the path components: a character-wise
        prefix check would wrongly accept siblings such as `<directory>_evil`.
    """
    abs_directory = os.path.abspath(directory)
    abs_target = os.path.abspath(target)

    try:
        return os.path.commonpath([abs_directory, abs_target]) == abs_directory
    except ValueError:
        # Paths that have nothing in common (e.g. different drives).
        return False


def _safe_extract(tar, path=".", members=None, *, numeric_owner=False):
    """ Extract the `tar` archive into the `path`, refusing archives with
        members whose names point outside of the `path`.

        raises: Exception
            when a member of the archive escapes the target directory.
    """
    for member in tar.getmembers():
        member_path = os.path.join(path, member.name)
        if not _is_within_directory(path, member_path):
            raise Exception("Attempted Path Traversal in Tar File")

    tar.extractall(path, members, numeric_owner=numeric_owner)


def download():
    """ Download the predefined resources into the default download directory
        and unpack the ".tar.gz" ones into the related directories.
    """
    data = {
        NEWS_MYSTEM_SKIPGRAM_1000_20_2015: "https://www.dropbox.com/s/0omnlgzgnjhxlmf/{filename}?dl=1".format(
            filename=NEWS_MYSTEM_SKIPGRAM_1000_20_2015),
    }

    # Perform downloading ...
    for local_name, url_link in data.items():
        utils.download(dest_file_path=__get_resource(local_name),
                       source_url=url_link)

    # Untar files ...
    for local_name in data:

        if ".tar.gz" not in local_name:
            continue

        target_filepath = __get_resource(local_name)
        with tarfile.open(target_filepath) as archive:
            _safe_extract(archive, __get_embedding_dir(target_filepath))
File without changes
@@ -0,0 +1,58 @@
1
+ from arekit.common.text.stemmer import Stemmer
2
+ from arekit.contrib.networks.embedding import Embedding
3
+
4
+
5
class RusvectoresEmbedding(Embedding):
    """ Wrapper over models from the following resource.
        https://rusvectores.org/ru/models/

        NOTE: Usually these are embeddings for texts written in Russian.
        for the better performance it is expected that we adopt stemmer.
    """

    def __init__(self, matrix, words, stemmer):
        """ matrix, words:
                forwarded to the base `Embedding` initializer.
            stemmer: Stemmer or None
                when provided, terms are lemmatized by default before the lookup.
        """
        assert(isinstance(stemmer, Stemmer) or stemmer is None)
        super(RusvectoresEmbedding, self).__init__(matrix=matrix, words=words)
        self.__index_without_pos = self.__create_terms_without_pos()
        self.__stemmer = stemmer
        self.__lemmatize_by_default = stemmer is not None

    def try_find_index_by_plain_word(self, word):
        """ Perform the base class lookup of the `word` with the
            lemmatization temporarily switched off.
        """
        assert(isinstance(word, str))

        previous_mode = self.__lemmatize_by_default
        self.__lemmatize_by_default = False
        try:
            return super(RusvectoresEmbedding, self).try_find_index_by_plain_word(word)
        finally:
            # Restore the mode even if the lookup fails, otherwise the
            # lemmatization stays switched off for all the further calls.
            self.__lemmatize_by_default = previous_mode

    def _handler(self, word):
        """ Find the (term, index) pair for the `word`, lemmatizing it
            according to the current default mode.
        """
        return self.__try_find_word_index_pair_lemmatized(word, self.__lemmatize_by_default)

    # region private methods

    def __try_find_word_index_pair_lemmatized(self, term, lemmatize):
        """ Returns the pair of the (optionally lemmatized) term and its index,
            where the index is None for the unknown term.
        """
        assert(isinstance(term, str))
        assert(isinstance(lemmatize, bool))

        if lemmatize:
            term = self.__stemmer.lemmatize_to_str(term)

        return term, self.__index_without_pos.get(term)

    def __create_terms_without_pos(self):
        """ Compose the word -> index mapping, where the word is the part of
            the vocabulary entry located before the first `_` character.
            For the repeated words the first met index is kept.
        """
        d = {}
        for word_with_pos, index in self.iter_vocabulary():
            assert(isinstance(word_with_pos, str))
            word = word_with_pos.split(u'_')[0]
            d.setdefault(word, index)

        return d

    # endregion