arekit 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. arekit/__init__.py +0 -0
  2. arekit/common/__init__.py +0 -0
  3. arekit/common/bound.py +48 -0
  4. arekit/common/context/__init__.py +0 -0
  5. arekit/common/context/terms_mapper.py +51 -0
  6. arekit/common/context/token.py +16 -0
  7. arekit/common/data/__init__.py +0 -0
  8. arekit/common/data/const.py +21 -0
  9. arekit/common/data/doc_provider.py +6 -0
  10. arekit/common/data/input/__init__.py +0 -0
  11. arekit/common/data/input/providers/__init__.py +0 -0
  12. arekit/common/data/input/providers/columns/__init__.py +0 -0
  13. arekit/common/data/input/providers/columns/base.py +9 -0
  14. arekit/common/data/input/providers/columns/sample.py +59 -0
  15. arekit/common/data/input/providers/const.py +3 -0
  16. arekit/common/data/input/providers/contents.py +9 -0
  17. arekit/common/data/input/providers/instances/__init__.py +0 -0
  18. arekit/common/data/input/providers/instances/base.py +14 -0
  19. arekit/common/data/input/providers/instances/multiple.py +27 -0
  20. arekit/common/data/input/providers/instances/single.py +8 -0
  21. arekit/common/data/input/providers/label/__init__.py +0 -0
  22. arekit/common/data/input/providers/label/base.py +24 -0
  23. arekit/common/data/input/providers/label/binary.py +11 -0
  24. arekit/common/data/input/providers/label/multiple.py +15 -0
  25. arekit/common/data/input/providers/rows/__init__.py +0 -0
  26. arekit/common/data/input/providers/rows/base.py +64 -0
  27. arekit/common/data/input/providers/rows/samples.py +227 -0
  28. arekit/common/data/input/providers/sample/__init__.py +0 -0
  29. arekit/common/data/input/providers/sample/cropped.py +43 -0
  30. arekit/common/data/input/providers/text/__init__.py +0 -0
  31. arekit/common/data/input/providers/text/single.py +49 -0
  32. arekit/common/data/input/repositories/__init__.py +0 -0
  33. arekit/common/data/input/repositories/base.py +68 -0
  34. arekit/common/data/input/repositories/sample.py +22 -0
  35. arekit/common/data/input/sample.py +66 -0
  36. arekit/common/data/input/terms_mapper.py +88 -0
  37. arekit/common/data/rows_fmt.py +82 -0
  38. arekit/common/data/rows_parser.py +43 -0
  39. arekit/common/data/storages/__init__.py +0 -0
  40. arekit/common/data/storages/base.py +109 -0
  41. arekit/common/data/views/__init__.py +0 -0
  42. arekit/common/data/views/samples.py +26 -0
  43. arekit/common/docs/__init__.py +0 -0
  44. arekit/common/docs/base.py +30 -0
  45. arekit/common/docs/entities_grouping.py +16 -0
  46. arekit/common/docs/entity.py +18 -0
  47. arekit/common/docs/objects_parser.py +37 -0
  48. arekit/common/docs/parsed/__init__.py +0 -0
  49. arekit/common/docs/parsed/base.py +101 -0
  50. arekit/common/docs/parsed/providers/__init__.py +0 -0
  51. arekit/common/docs/parsed/providers/base.py +68 -0
  52. arekit/common/docs/parsed/providers/base_pairs.py +51 -0
  53. arekit/common/docs/parsed/providers/entity_service.py +175 -0
  54. arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
  55. arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
  56. arekit/common/docs/parsed/service.py +31 -0
  57. arekit/common/docs/parsed/term_position.py +42 -0
  58. arekit/common/docs/parser.py +34 -0
  59. arekit/common/docs/sentence.py +14 -0
  60. arekit/common/entities/__init__.py +0 -0
  61. arekit/common/entities/base.py +51 -0
  62. arekit/common/entities/collection.py +72 -0
  63. arekit/common/entities/str_fmt.py +8 -0
  64. arekit/common/entities/types.py +9 -0
  65. arekit/common/experiment/__init__.py +0 -0
  66. arekit/common/experiment/api/__init__.py +0 -0
  67. arekit/common/experiment/api/base_samples_io.py +20 -0
  68. arekit/common/experiment/data_type.py +17 -0
  69. arekit/common/frames/__init__.py +0 -0
  70. arekit/common/frames/connotations/__init__.py +0 -0
  71. arekit/common/frames/connotations/descriptor.py +17 -0
  72. arekit/common/frames/connotations/provider.py +4 -0
  73. arekit/common/frames/text_variant.py +43 -0
  74. arekit/common/frames/variants/__init__.py +0 -0
  75. arekit/common/frames/variants/base.py +21 -0
  76. arekit/common/frames/variants/collection.py +60 -0
  77. arekit/common/labels/__init__.py +0 -0
  78. arekit/common/labels/base.py +19 -0
  79. arekit/common/labels/provider/__init__.py +0 -0
  80. arekit/common/labels/provider/base.py +7 -0
  81. arekit/common/labels/provider/constant.py +14 -0
  82. arekit/common/labels/scaler/__init__.py +0 -0
  83. arekit/common/labels/scaler/base.py +85 -0
  84. arekit/common/labels/scaler/sentiment.py +7 -0
  85. arekit/common/labels/scaler/single.py +10 -0
  86. arekit/common/labels/str_fmt.py +55 -0
  87. arekit/common/linkage/__init__.py +0 -0
  88. arekit/common/linkage/base.py +44 -0
  89. arekit/common/linkage/meta.py +23 -0
  90. arekit/common/linkage/opinions.py +9 -0
  91. arekit/common/linkage/text_opinions.py +22 -0
  92. arekit/common/log_utils.py +29 -0
  93. arekit/common/model/__init__.py +0 -0
  94. arekit/common/model/labeling/__init__.py +0 -0
  95. arekit/common/model/labeling/base.py +24 -0
  96. arekit/common/model/labeling/modes.py +8 -0
  97. arekit/common/model/labeling/single.py +24 -0
  98. arekit/common/opinions/__init__.py +0 -0
  99. arekit/common/opinions/annot/__init__.py +0 -0
  100. arekit/common/opinions/annot/algo/__init__.py +0 -0
  101. arekit/common/opinions/annot/algo/base.py +4 -0
  102. arekit/common/opinions/annot/algo/pair_based.py +99 -0
  103. arekit/common/opinions/annot/algo/predefined.py +16 -0
  104. arekit/common/opinions/annot/algo_based.py +55 -0
  105. arekit/common/opinions/annot/base.py +15 -0
  106. arekit/common/opinions/base.py +74 -0
  107. arekit/common/opinions/collection.py +150 -0
  108. arekit/common/opinions/enums.py +6 -0
  109. arekit/common/opinions/provider.py +4 -0
  110. arekit/common/opinions/writer.py +4 -0
  111. arekit/common/pipeline/__init__.py +0 -0
  112. arekit/common/pipeline/base.py +25 -0
  113. arekit/common/pipeline/context.py +36 -0
  114. arekit/common/pipeline/conts.py +2 -0
  115. arekit/common/pipeline/items/__init__.py +0 -0
  116. arekit/common/pipeline/items/base.py +12 -0
  117. arekit/common/pipeline/items/flatten.py +14 -0
  118. arekit/common/pipeline/items/handle.py +17 -0
  119. arekit/common/pipeline/items/iter.py +11 -0
  120. arekit/common/pipeline/items/map.py +11 -0
  121. arekit/common/pipeline/items/map_nested.py +13 -0
  122. arekit/common/synonyms/__init__.py +0 -0
  123. arekit/common/synonyms/base.py +151 -0
  124. arekit/common/synonyms/grouping.py +21 -0
  125. arekit/common/text/__init__.py +0 -0
  126. arekit/common/text/enums.py +12 -0
  127. arekit/common/text/parsed.py +42 -0
  128. arekit/common/text/parser.py +12 -0
  129. arekit/common/text/partitioning/__init__.py +0 -0
  130. arekit/common/text/partitioning/base.py +4 -0
  131. arekit/common/text/partitioning/str.py +36 -0
  132. arekit/common/text/partitioning/terms.py +35 -0
  133. arekit/common/text/stemmer.py +16 -0
  134. arekit/common/text_opinions/__init__.py +0 -0
  135. arekit/common/text_opinions/base.py +105 -0
  136. arekit/common/utils.py +129 -0
  137. arekit/contrib/__init__.py +0 -0
  138. arekit/contrib/bert/__init__.py +0 -0
  139. arekit/contrib/bert/input/__init__.py +0 -0
  140. arekit/contrib/bert/input/providers/__init__.py +0 -0
  141. arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
  142. arekit/contrib/bert/input/providers/text_pair.py +62 -0
  143. arekit/contrib/bert/terms/__init__.py +0 -0
  144. arekit/contrib/bert/terms/mapper.py +20 -0
  145. arekit/contrib/networks/__init__.py +0 -0
  146. arekit/contrib/networks/embedding.py +149 -0
  147. arekit/contrib/networks/embedding_io.py +18 -0
  148. arekit/contrib/networks/input/__init__.py +0 -0
  149. arekit/contrib/networks/input/const.py +6 -0
  150. arekit/contrib/networks/input/ctx_serialization.py +28 -0
  151. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  152. arekit/contrib/networks/input/embedding/matrix.py +29 -0
  153. arekit/contrib/networks/input/embedding/offsets.py +55 -0
  154. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  155. arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
  156. arekit/contrib/networks/input/providers/__init__.py +0 -0
  157. arekit/contrib/networks/input/providers/sample.py +129 -0
  158. arekit/contrib/networks/input/providers/term_connotation.py +23 -0
  159. arekit/contrib/networks/input/providers/text.py +24 -0
  160. arekit/contrib/networks/input/rows_parser.py +47 -0
  161. arekit/contrib/networks/input/term_types.py +13 -0
  162. arekit/contrib/networks/input/terms_mapping.py +60 -0
  163. arekit/contrib/networks/vectorizer.py +6 -0
  164. arekit/contrib/prompt/__init__.py +0 -0
  165. arekit/contrib/prompt/sample.py +61 -0
  166. arekit/contrib/source/__init__.py +0 -0
  167. arekit/contrib/source/brat/__init__.py +0 -0
  168. arekit/contrib/source/brat/annot.py +84 -0
  169. arekit/contrib/source/brat/doc.py +28 -0
  170. arekit/contrib/source/brat/entities/__init__.py +0 -0
  171. arekit/contrib/source/brat/entities/compound.py +13 -0
  172. arekit/contrib/source/brat/entities/entity.py +42 -0
  173. arekit/contrib/source/brat/entities/parser.py +53 -0
  174. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  175. arekit/contrib/source/brat/opinions/converter.py +19 -0
  176. arekit/contrib/source/brat/relation.py +32 -0
  177. arekit/contrib/source/brat/sentence.py +69 -0
  178. arekit/contrib/source/brat/sentences_reader.py +128 -0
  179. arekit/contrib/source/download.py +41 -0
  180. arekit/contrib/source/nerel/__init__.py +0 -0
  181. arekit/contrib/source/nerel/entities.py +55 -0
  182. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  183. arekit/contrib/source/nerel/folding/fixed.py +74 -0
  184. arekit/contrib/source/nerel/io_utils.py +62 -0
  185. arekit/contrib/source/nerel/labels.py +241 -0
  186. arekit/contrib/source/nerel/reader.py +46 -0
  187. arekit/contrib/source/nerel/utils.py +24 -0
  188. arekit/contrib/source/nerel/versions.py +12 -0
  189. arekit/contrib/source/nerelbio/__init__.py +0 -0
  190. arekit/contrib/source/nerelbio/io_utils.py +62 -0
  191. arekit/contrib/source/nerelbio/labels.py +265 -0
  192. arekit/contrib/source/nerelbio/reader.py +8 -0
  193. arekit/contrib/source/nerelbio/versions.py +8 -0
  194. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  195. arekit/contrib/source/ruattitudes/collection.py +36 -0
  196. arekit/contrib/source/ruattitudes/doc.py +51 -0
  197. arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
  198. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  199. arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
  200. arekit/contrib/source/ruattitudes/io_utils.py +56 -0
  201. arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
  202. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  203. arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
  204. arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
  205. arekit/contrib/source/ruattitudes/reader.py +268 -0
  206. arekit/contrib/source/ruattitudes/sentence.py +73 -0
  207. arekit/contrib/source/ruattitudes/synonyms.py +17 -0
  208. arekit/contrib/source/ruattitudes/text_object.py +59 -0
  209. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  210. arekit/contrib/source/rusentiframes/collection.py +157 -0
  211. arekit/contrib/source/rusentiframes/effect.py +24 -0
  212. arekit/contrib/source/rusentiframes/io_utils.py +19 -0
  213. arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
  214. arekit/contrib/source/rusentiframes/polarity.py +35 -0
  215. arekit/contrib/source/rusentiframes/role.py +15 -0
  216. arekit/contrib/source/rusentiframes/state.py +24 -0
  217. arekit/contrib/source/rusentiframes/types.py +42 -0
  218. arekit/contrib/source/rusentiframes/value.py +2 -0
  219. arekit/contrib/source/rusentrel/__init__.py +0 -0
  220. arekit/contrib/source/rusentrel/const.py +3 -0
  221. arekit/contrib/source/rusentrel/docs_reader.py +51 -0
  222. arekit/contrib/source/rusentrel/entities.py +26 -0
  223. arekit/contrib/source/rusentrel/io_utils.py +125 -0
  224. arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
  225. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  226. arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
  227. arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
  228. arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
  229. arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
  230. arekit/contrib/source/rusentrel/synonyms.py +17 -0
  231. arekit/contrib/source/sentinerel/__init__.py +0 -0
  232. arekit/contrib/source/sentinerel/entities.py +52 -0
  233. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  234. arekit/contrib/source/sentinerel/folding/factory.py +31 -0
  235. arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
  236. arekit/contrib/source/sentinerel/io_utils.py +87 -0
  237. arekit/contrib/source/sentinerel/labels.py +53 -0
  238. arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
  239. arekit/contrib/source/sentinerel/reader.py +42 -0
  240. arekit/contrib/source/synonyms/__init__.py +0 -0
  241. arekit/contrib/source/synonyms/utils.py +19 -0
  242. arekit/contrib/source/zip_utils.py +47 -0
  243. arekit/contrib/utils/__init__.py +0 -0
  244. arekit/contrib/utils/bert/__init__.py +0 -0
  245. arekit/contrib/utils/bert/samplers.py +17 -0
  246. arekit/contrib/utils/connotations/__init__.py +0 -0
  247. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
  248. arekit/contrib/utils/data/__init__.py +0 -0
  249. arekit/contrib/utils/data/contents/__init__.py +0 -0
  250. arekit/contrib/utils/data/contents/opinions.py +37 -0
  251. arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
  252. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  253. arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
  254. arekit/contrib/utils/data/readers/__init__.py +0 -0
  255. arekit/contrib/utils/data/readers/base.py +7 -0
  256. arekit/contrib/utils/data/readers/csv_pd.py +38 -0
  257. arekit/contrib/utils/data/readers/jsonl.py +15 -0
  258. arekit/contrib/utils/data/service/__init__.py +0 -0
  259. arekit/contrib/utils/data/service/balance.py +50 -0
  260. arekit/contrib/utils/data/storages/__init__.py +0 -0
  261. arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
  262. arekit/contrib/utils/data/storages/pandas_based.py +123 -0
  263. arekit/contrib/utils/data/storages/row_cache.py +48 -0
  264. arekit/contrib/utils/data/writers/__init__.py +0 -0
  265. arekit/contrib/utils/data/writers/base.py +27 -0
  266. arekit/contrib/utils/data/writers/csv_native.py +63 -0
  267. arekit/contrib/utils/data/writers/csv_pd.py +40 -0
  268. arekit/contrib/utils/data/writers/json_opennre.py +132 -0
  269. arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
  270. arekit/contrib/utils/download.py +77 -0
  271. arekit/contrib/utils/embeddings/__init__.py +0 -0
  272. arekit/contrib/utils/embeddings/rusvectores.py +58 -0
  273. arekit/contrib/utils/embeddings/tokens.py +30 -0
  274. arekit/contrib/utils/entities/__init__.py +0 -0
  275. arekit/contrib/utils/entities/filter.py +7 -0
  276. arekit/contrib/utils/entities/formatters/__init__.py +0 -0
  277. arekit/contrib/utils/entities/formatters/str_display.py +11 -0
  278. arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
  279. arekit/contrib/utils/io_utils/__init__.py +0 -0
  280. arekit/contrib/utils/io_utils/embedding.py +72 -0
  281. arekit/contrib/utils/io_utils/opinions.py +37 -0
  282. arekit/contrib/utils/io_utils/samples.py +79 -0
  283. arekit/contrib/utils/io_utils/utils.py +39 -0
  284. arekit/contrib/utils/lexicons/__init__.py +0 -0
  285. arekit/contrib/utils/lexicons/lexicon.py +41 -0
  286. arekit/contrib/utils/lexicons/relation.py +42 -0
  287. arekit/contrib/utils/lexicons/rusentilex.py +37 -0
  288. arekit/contrib/utils/nn/__init__.py +0 -0
  289. arekit/contrib/utils/nn/rows.py +83 -0
  290. arekit/contrib/utils/np_utils/__init__.py +0 -0
  291. arekit/contrib/utils/np_utils/embedding.py +22 -0
  292. arekit/contrib/utils/np_utils/npz_utils.py +13 -0
  293. arekit/contrib/utils/np_utils/vocab.py +20 -0
  294. arekit/contrib/utils/pipelines/__init__.py +0 -0
  295. arekit/contrib/utils/pipelines/items/__init__.py +0 -0
  296. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  297. arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
  298. arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
  299. arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
  300. arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
  301. arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
  302. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
  303. arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
  304. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
  305. arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
  306. arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
  307. arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
  308. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  309. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  310. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
  311. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
  312. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
  313. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  314. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
  315. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
  316. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
  317. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  318. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
  319. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
  320. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
  321. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  322. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
  323. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
  324. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  325. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
  326. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
  327. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
  328. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
  329. arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
  330. arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
  331. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
  332. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
  333. arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
  334. arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
  335. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
  336. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
  337. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
  338. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
  339. arekit/contrib/utils/processing/__init__.py +0 -0
  340. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  341. arekit/contrib/utils/processing/languages/mods.py +12 -0
  342. arekit/contrib/utils/processing/languages/pos.py +23 -0
  343. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  344. arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
  345. arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
  346. arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
  347. arekit/contrib/utils/processing/languages/ru/number.py +23 -0
  348. arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
  349. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  350. arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
  351. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  352. arekit/contrib/utils/processing/pos/base.py +12 -0
  353. arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
  354. arekit/contrib/utils/processing/pos/russian.py +10 -0
  355. arekit/contrib/utils/processing/text/__init__.py +0 -0
  356. arekit/contrib/utils/processing/text/tokens.py +127 -0
  357. arekit/contrib/utils/resources.py +25 -0
  358. arekit/contrib/utils/serializer.py +43 -0
  359. arekit/contrib/utils/sources/__init__.py +0 -0
  360. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  361. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  362. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
  363. arekit/contrib/utils/synonyms/__init__.py +0 -0
  364. arekit/contrib/utils/synonyms/simple.py +15 -0
  365. arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
  366. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  367. arekit/contrib/utils/vectorizers/bpe.py +93 -0
  368. arekit/contrib/utils/vectorizers/random_norm.py +39 -0
  369. arekit/download_data.py +11 -0
  370. arekit-0.24.0.dist-info/LICENSE +21 -0
  371. arekit-0.24.0.dist-info/METADATA +23 -0
  372. arekit-0.24.0.dist-info/RECORD +374 -0
  373. arekit-0.24.0.dist-info/WHEEL +5 -0
  374. arekit-0.24.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,30 @@
1
+ import numpy as np
2
+
3
+ from arekit.contrib.networks.embedding import Embedding
4
+ from arekit.contrib.utils.processing.text.tokens import Tokens
5
+
6
+
7
class TokenEmbedding(Embedding):
    """ Embedding whose vocabulary is the list of supported tokens,
        as enumerated by `Tokens.iter_supported_tokens()`.
    """

    @classmethod
    def from_supported_tokens(cls, vector_size, random_vector_func):
        """ Create the embedding with one vector per supported token.

            vector_size: int
                forwarded as the first argument of `random_vector_func`.
            random_vector_func: func
                function with parameters (vector_size, seed);
                the position of the token in the list is passed as the seed.
        """
        assert isinstance(vector_size, int)
        assert callable(random_vector_func)

        supported = list(Tokens.iter_supported_tokens())

        # One vector per token; the token index doubles as the seed.
        vectors = [random_vector_func(vector_size, seed)
                   for seed in range(len(supported))]

        return cls(matrix=np.array(vectors), words=supported)
File without changes
@@ -0,0 +1,7 @@
1
class EntityFilter:
    """ Base interface for deciding whether an entity has to be ignored.
    """

    def __init__(self):
        """ The base class keeps no state. """

    def is_ignored(self, entity, e_type):
        """ Expected to be overridden; the base implementation always raises
            NotImplementedError.
        """
        raise NotImplementedError()
File without changes
@@ -0,0 +1,11 @@
1
+ from arekit.common.entities.base import Entity
2
+ from arekit.common.entities.str_fmt import StringEntitiesFormatter
3
+
4
+
5
class StringEntitiesDisplayValueFormatter(StringEntitiesFormatter):
    """ Formatter that represents an entity by its `DisplayValue` property.
    """

    def to_string(self, original_value, entity_type):
        """ Return `original_value.DisplayValue`; `entity_type` is not used here.
        """
        assert isinstance(original_value, Entity)
        display_value = original_value.DisplayValue
        return display_value
@@ -0,0 +1,15 @@
1
+ from arekit.common.entities.str_fmt import StringEntitiesFormatter
2
+ from arekit.common.entities.types import OpinionEntityType
3
+
4
+
5
class SharpPrefixedEntitiesSimpleFormatter(StringEntitiesFormatter):
    """ Formatter that replaces an entity with a short sharp-prefixed tag
        selected by the opinion entity type only.
    """

    def to_string(self, original_value, entity_type):
        """ Return "#O" for (synonym) objects, "#S" for (synonym) subjects,
            "#E" for other entities, and None for any remaining type.
            `original_value` is not consulted.
        """
        assert isinstance(entity_type, OpinionEntityType)

        if entity_type == OpinionEntityType.Object or entity_type == OpinionEntityType.SynonymObject:
            return "#O"

        if entity_type == OpinionEntityType.Subject or entity_type == OpinionEntityType.SynonymSubject:
            return "#S"

        if entity_type == OpinionEntityType.Other:
            return "#E"

        # No tag is defined for the remaining types.
        return None
File without changes
@@ -0,0 +1,72 @@
1
+ from os.path import join
2
+
3
+ from arekit.contrib.networks.embedding_io import BaseEmbeddingIO
4
+ from arekit.contrib.utils.io_utils.utils import check_targets_existence
5
+ from arekit.contrib.utils.np_utils.embedding import NpzEmbeddingHelper
6
+ from arekit.contrib.utils.np_utils.vocab import VocabRepositoryUtils
7
+
8
+
9
class NpEmbeddingIO(BaseEmbeddingIO):
    """ Npz-based IO utils for the embedding and text-based ones for the vocabulary.
        This format represents an archived version of the numpy math data,
        i.e. vectors, numbers, etc.

        Provides additional Input/Output paths generation functions for:
            - embedding matrix;
            - embedding vocabulary.
    """

    def __init__(self, target_dir, prefix_name="sample"):
        assert isinstance(target_dir, str)

        self.__target_dir = target_dir
        # NOTE(review): both templates end with "term_embedding", so the
        # vocabulary and the embedding resolve to the same base filepath;
        # confirm whether a separate vocabulary suffix was intended.
        self.__term_emb_fn_template = "-".join((prefix_name, "term_embedding"))
        self.__vocab_fn_template = "-".join((prefix_name, "term_embedding"))

    # region filepath helpers

    def __get_default_vocab_filepath(self):
        return join(self.__target_dir, self.__vocab_fn_template)

    def __get_default_embedding_filepath(self):
        return join(self.__target_dir, self.__term_emb_fn_template)

    def ___get_vocab_source(self):
        """ It is possible to load a predefined embedding from another experiment
            using the related filepath provided by model_io.
        """
        return self.__get_default_vocab_filepath()

    def __get_term_embedding_target(self):
        return self.__get_default_embedding_filepath()

    def __get_term_embedding_source(self):
        return self.__get_default_embedding_filepath()

    # endregion

    # region Embedding-related data

    def save_vocab(self, data):
        """ Save vocabulary `data` at the default vocabulary filepath.
        """
        vocab_target = self.__get_default_vocab_filepath()
        return VocabRepositoryUtils.save(data=data, target=vocab_target)

    def load_vocab(self):
        """ Load the vocabulary and wrap the loaded contents into a dict.
        """
        loaded = VocabRepositoryUtils.load(self.___get_vocab_source())
        return dict(loaded)

    def save_embedding(self, data):
        """ Save embedding `data` at the default embedding filepath.
        """
        emb_target = self.__get_default_embedding_filepath()
        NpzEmbeddingHelper.save_embedding(data=data, target=emb_target)

    def load_embedding(self):
        """ Load the embedding from the default embedding filepath.
        """
        return NpzEmbeddingHelper.load_embedding(self.__get_term_embedding_source())

    def check_targets_existed(self):
        """ Check the existence of both the vocabulary and the embedding targets.
        """
        vocab_fp = self.__get_default_vocab_filepath()
        emb_fp = self.__get_term_embedding_target()
        return check_targets_existence(targets=[vocab_fp, emb_fp])

    # endregion
@@ -0,0 +1,37 @@
1
+ from os.path import join
2
+
3
+ from arekit.contrib.utils.data.readers.base import BaseReader
4
+ from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
5
+ from arekit.contrib.utils.io_utils.utils import filename_template
6
+
7
+
8
class OpinionsIO(BaseSamplesIO):
    """ Composes target filepaths for opinions of a particular data type.
        The file extension is taken from the provided reader.
    """

    def __init__(self, target_dir, reader=None, prefix="opinion"):
        # NOTE(review): `reader` defaults to None, yet the check below
        # rejects None; confirm whether the reader is meant to be mandatory.
        assert isinstance(reader, BaseReader)
        self.__target_dir = target_dir
        self.__prefix = prefix
        self.__reader = reader
        self.__target_extension = reader.extension()

    @property
    def Reader(self):
        return self.__reader

    @staticmethod
    def __get_filepath(out_dir, template, prefix, extension):
        """ Join `out_dir` with the "<prefix>-<template><extension>" filename.
        """
        assert isinstance(template, str)
        assert isinstance(prefix, str)
        assert isinstance(extension, str)
        filename = "{prefix}-{template}{extension}".format(
            prefix=prefix, template=template, extension=extension)
        return join(out_dir, filename)

    def __get_input_opinions_target(self, data_type):
        return self.__get_filepath(out_dir=self.__target_dir,
                                   template=filename_template(data_type=data_type),
                                   prefix=self.__prefix,
                                   extension=self.__target_extension)

    def create_target(self, data_type):
        """ Return the opinions filepath for the given `data_type`.
        """
        return self.__get_input_opinions_target(data_type)
@@ -0,0 +1,79 @@
1
+ import logging
2
+ from os.path import join
3
+
4
+ from arekit.contrib.utils.data.readers.base import BaseReader
5
+ from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
6
+ from arekit.contrib.utils.data.writers.base import BaseWriter
7
+ from arekit.contrib.utils.io_utils.utils import filename_template, check_targets_existence
8
+
9
+ logger = logging.getLogger(__name__)
10
+ logging.basicConfig(level=logging.INFO)
11
+
12
+
13
class SamplesIO(BaseSamplesIO):
    """ Default IO utils for samples.
        A sample is a text part which includes a pair of attitude participants.
        This class composes target filepaths and keeps the writer and reader
        for such entries, dubbed as samples.
        Samples are required for machine learning training/inferring.
    """

    def __init__(self, target_dir, writer=None, reader=None, prefix="sample"):
        assert isinstance(target_dir, str)
        assert isinstance(prefix, str)
        assert writer is None or isinstance(writer, BaseWriter)
        assert reader is None or isinstance(reader, BaseReader)
        self.__target_dir = target_dir
        self.__prefix = prefix
        self.__writer = writer
        self.__reader = reader

        # The writer, when given, takes precedence over the reader
        # in defining the extension; without both it stays None.
        provider = writer if writer is not None else reader
        self.__target_extension = None if provider is None else provider.extension()

    # region private methods

    @staticmethod
    def __get_filepath(out_dir, template, prefix, extension):
        """ Join `out_dir` with the "<prefix>-<template><extension>" filename.
        """
        assert isinstance(template, str)
        assert isinstance(prefix, str)
        assert isinstance(extension, str)
        filename = "{prefix}-{template}{extension}".format(
            prefix=prefix, template=template, extension=extension)
        return join(out_dir, filename)

    def __get_input_sample_target(self, data_type):
        return self.__get_filepath(out_dir=self.__target_dir,
                                   template=filename_template(data_type=data_type),
                                   prefix=self.__prefix,
                                   extension=self.__target_extension)

    # endregion

    # region public methods

    @property
    def Prefix(self):
        return self.__prefix

    @property
    def Reader(self):
        return self.__reader

    @property
    def Writer(self):
        return self.__writer

    def create_target(self, data_type):
        """ Return the samples filepath for the given `data_type`.
        """
        return self.__get_input_sample_target(data_type)

    def check_targets_existed(self, data_types_iter):
        """ Return True when the sample target of every data type exists;
            the check stops at the first missing one.
        """
        return all(
            check_targets_existence(
                targets=[self.__get_input_sample_target(data_type=data_type)])
            for data_type in data_types_iter)

    # endregion
@@ -0,0 +1,39 @@
1
+ from collections.abc import Iterable
2
+ import logging
3
+ from os.path import join, exists
4
+
5
+ from arekit.common.experiment.data_type import DataType
6
+
7
+
8
+ logger = logging.getLogger(__name__)
9
+ logging.basicConfig(level=logging.INFO)
10
+
11
+
12
def join_dir_with_subfolder_name(subfolder_name, dir):
    """ Returns the path of the subfolder in the directory, ending with a slash.
    """
    assert isinstance(subfolder_name, str)
    assert isinstance(dir, str)

    subfolder = "{}/".format(subfolder_name)
    return join(dir, subfolder)
20
+
21
+
22
def filename_template(data_type):
    """ Returns the lowercased name of the data type followed by "-0".
    """
    assert isinstance(data_type, DataType)
    lowered_name = data_type.name.lower()
    return "{data_type}-0".format(data_type=lowered_name)
25
+
26
+
27
def check_targets_existence(targets):
    """ Returns True when every filepath in `targets` exists.
        Every filepath is checked and logged, even after a missing one is found.
    """
    assert isinstance(targets, Iterable)

    missing_count = 0
    for filepath in targets:
        assert isinstance(filepath, str)

        is_present = exists(filepath)
        logger.info("Check existence [{is_existed}]: {fp}".format(is_existed=is_present, fp=filepath))
        if not is_present:
            missing_count += 1

    return missing_count == 0
File without changes
@@ -0,0 +1,41 @@
1
+ from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
2
+
3
+
4
class Lexicon(object):
    """ Term-to-tone lexicon backed by a dataframe.

        The dataframe is expected to provide two columns, named by the
        `TermKey` and `ToneKey` properties; subclasses may override these
        properties to point at differently named columns.
    """

    @property
    def ToneKey(self):
        """ Name of the column that stores the tone (score) of a term. """
        return 'tone'

    @property
    def TermKey(self):
        """ Name of the column that stores the terms. """
        return 'term'

    def __init__(self, dataframe):
        """ dataframe: table with the `TermKey` and `ToneKey` columns. """
        self.__lexicon_df = dataframe

    @classmethod
    def load(cls, filepath, separator=','):
        """ Read the lexicon from a csv file using PandasCsvReader.

            filepath: path to the csv file.
            separator: column separator used in the file.
        """
        reader = PandasCsvReader(compression=None, sep=separator)
        return cls(reader.read(filepath))

    def __select_rows(self, term):
        """ Returns the rows of the lexicon whose term equals `term`. """
        # Compare the str itself: on Python 3 an utf-8 encoded `bytes` value
        # never equals the str values of the column, so nothing would match.
        return self.__lexicon_df[self.__lexicon_df[self.TermKey] == term]

    def get_score(self, lemma):
        """ Returns the tone of the first row matching `lemma`,
            or 0 when the lemma is not in the lexicon.
        """
        assert(isinstance(lemma, str))
        rows = self.__select_rows(lemma)
        return rows[self.ToneKey].values[0] if len(rows) > 0 else 0

    def has_term(self, term):
        """ Returns True when `term` is present in the lexicon. """
        assert(isinstance(term, str))
        return len(self.__select_rows(term)) > 0

    def __iter__(self):
        """ Iterates over the terms in the order they are stored. """
        for term in self.__lexicon_df[self.TermKey]:
            yield term

    def __contains__(self, item):
        """ Supports `term in lexicon`; same check as `has_term`. """
        assert(isinstance(item, str))
        return len(self.__select_rows(item)) > 0
40
+
41
+
@@ -0,0 +1,42 @@
1
+ from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
2
+
3
+
4
class RelationLexicon(object):
    """ Lexicon of tones assigned to pairs of values.

        Every pair is stored as a single `left<->right` string. Lookups
        read the 'relation' and 'tone' columns of the dataframe and do not
        depend on the order of the pair.
    """

    def __init__(self, dataframe):
        """ dataframe: table whose first column holds `left<->right`
                strings; lookups use its 'relation' and 'tone' columns.
        """
        self.__check(dataframe)
        self.__lexicon = dataframe

    @classmethod
    def load(cls, filepath, separator=','):
        """ Read the lexicon from a csv file using PandasCsvReader.

            filepath: path to the csv file.
            separator: column separator used in the file.
        """
        reader = PandasCsvReader(compression=None, sep=separator)
        return cls(reader.read(filepath))

    @staticmethod
    def __check(df):
        """ Assert that every value of the first column consists of exactly
            two parts separated by '<->'.
        """
        if df.empty:
            return
        # Explicit positional access to the first column: an integer key on
        # a label-indexed row (`df.loc[index][0]`) relies on a positional
        # fallback that pandas has deprecated.
        for relation in df.iloc[:, 0]:
            assert(len(relation.split('<->')) == 2)

    @staticmethod
    def __create_key(l, r):
        """ Compose the `l<->r` lookup key. """
        assert(isinstance(l, str))
        assert(isinstance(r, str))
        return '<->'.join([l, r])

    def get_score(self, left, right):
        """ Returns the tone of the (left, right) relation.

            The `left<->right` key is looked up first, then the reversed
            `right<->left` one. Returns None when neither is present.
        """
        assert(isinstance(left, str))
        assert(isinstance(right, str))

        relations = self.__lexicon['relation']
        keys = (self.__create_key(left, right), self.__create_key(right, left))

        for key in keys:
            rows = self.__lexicon[relations == key]
            if len(rows) > 0:
                return rows['tone'].values[0]

        return None
@@ -0,0 +1,37 @@
1
+ import importlib
2
+ import zipfile
3
+ from os import path
4
+
5
+
6
+ from arekit.contrib.source.zip_utils import ZipArchiveUtils
7
+ from arekit.contrib.utils.lexicons.lexicon import Lexicon
8
+
9
+
10
class RuSentiLexLexicon(Lexicon):
    """
    RuSentiLex lexicon wrapper. The data is read from the `rusentilex.csv`
    file stored inside the `rusentilex.zip` archive, which is looked up in
    the directory returned by `ZipArchiveUtils.get_data_root()`.
    """

    # Name of the csv file inside the archive.
    __INNER_PATH = 'rusentilex.csv'

    @property
    def ToneKey(self):
        """ Name of the column with the tone of a term. """
        return 'tone'

    @property
    def TermKey(self):
        """ Name of the column with the terms. """
        return 'term'

    @staticmethod
    def __get_archive_filepath():
        """ Path to the lexicon archive inside the data root. """
        return path.join(ZipArchiveUtils.get_data_root(), "rusentilex.zip")

    @classmethod
    def from_zip(cls):
        """ Read the lexicon from the archive using the Pandas API.
            Pandas is imported on demand, only when this method is called.
        """
        pandas = importlib.import_module("pandas")
        archive_path = cls.__get_archive_filepath()
        with zipfile.ZipFile(archive_path, "r") as archive, \
                archive.open(cls.__INNER_PATH, mode='r') as csv_stream:
            frame = pandas.read_csv(csv_stream, sep=',')
        return cls(frame)
File without changes
@@ -0,0 +1,83 @@
1
+ import collections
2
+
3
+ from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
4
+ from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
5
+ from arekit.common.entities.str_fmt import StringEntitiesFormatter
6
+ from arekit.contrib.networks.input.ctx_serialization import NetworkSerializationContext
7
+ from arekit.contrib.networks.input.formatters.pos_mapper import PosTermsMapper
8
+ from arekit.contrib.networks.input.providers.sample import NetworkSampleRowProvider
9
+ from arekit.contrib.networks.input.providers.text import NetworkSingleTextProvider
10
+ from arekit.contrib.networks.input.term_types import TermTypes
11
+ from arekit.contrib.networks.input.terms_mapping import VectorizedNetworkTermMapping
12
+ from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
13
+ from arekit.contrib.utils.resources import load_embedding_news_mystem_skipgram_1000_20_2015
14
+ from arekit.contrib.utils.vectorizers.bpe import BPEVectorizer
15
+ from arekit.contrib.utils.vectorizers.random_norm import RandomNormalVectorizer
16
+
17
+
18
def __add_term_embedding(dict_data, term, emb_vector):
    """ Register the embedding vector of the term, keeping the vector that
        was registered first when the term is already known.
    """
    if term not in dict_data:
        dict_data[term] = emb_vector
22
+
23
+
24
def create_rows_provider(str_entity_fmt, ctx, vectorizers="default"):
    """ Default initialization of the rows provider for the data sampling
        pipeline.

        str_entity_fmt: StringEntitiesFormatter
            formatter passed to the text terms mapper.
        ctx: NetworkSerializationContext
            source of the label provider, frames connotation provider,
            frame roles label scaler and (optional) POS tagger.
        vectorizers:
            None: no vectorization, only text is provided
                (BaseSingleTextProvider is used).
            "default": vectorizers for Russian language are created: a BPE
                vectorizer over the news embedding (Mystem stemmer) for
                words, entities and frames, and a random-normal vectorizer
                for tokens.
            dict: a vectorizer assigned to every term type:
                {
                    TermType.Word: Vectorizer,
                    TermType.Entity: Vectorizer,
                    ...
                }
    """
    assert(isinstance(str_entity_fmt, StringEntitiesFormatter))
    assert(isinstance(ctx, NetworkSerializationContext))
    assert(isinstance(vectorizers, dict) or vectorizers == "default" or vectorizers is None)

    if vectorizers is None:
        # Text only: nothing is vectorized, so no pairs are collected.
        term_embedding_pairs = None
        text_provider = BaseSingleTextProvider(
            text_terms_mapper=OpinionContainingTextTermsMapper(str_entity_fmt))
    else:
        if vectorizers == "default":
            # Default vectorizers for Russian language.
            news_embedding = load_embedding_news_mystem_skipgram_1000_20_2015(
                stemmer=MystemWrapper(), auto_download=True)
            bpe = BPEVectorizer(embedding=news_embedding, max_part_size=3)
            random_norm = RandomNormalVectorizer(vector_size=news_embedding.VectorSize,
                                                 token_offset=12345)
            vectorizers = {
                TermTypes.WORD: bpe,
                TermTypes.ENTITY: bpe,
                TermTypes.FRAME: bpe,
                TermTypes.TOKEN: random_norm
            }

        # Collection of the (term, embedding vector) pairs met during sampling.
        term_embedding_pairs = collections.OrderedDict()

        def register_pair(pair):
            # pair[0] is the term, pair[1] is its embedding vector.
            return __add_term_embedding(dict_data=term_embedding_pairs,
                                        term=pair[0],
                                        emb_vector=pair[1])

        terms_mapping = VectorizedNetworkTermMapping(
            vectorizers=vectorizers,
            string_entities_formatter=str_entity_fmt)

        text_provider = NetworkSingleTextProvider(
            text_terms_mapper=terms_mapping,
            pair_handling_func=register_pair)

    return NetworkSampleRowProvider(
        label_provider=ctx.LabelProvider,
        text_provider=text_provider,
        frames_connotation_provider=ctx.FramesConnotationProvider,
        frame_role_label_scaler=ctx.FrameRolesLabelScaler,
        pos_terms_mapper=PosTermsMapper(ctx.PosTagger) if ctx.PosTagger is not None else None,
        term_embedding_pairs=term_embedding_pairs)
File without changes
@@ -0,0 +1,22 @@
1
+ import logging
2
+
3
+ from arekit.contrib.utils.np_utils.npz_utils import NpzRepositoryUtils
4
+
5
+ logger = logging.getLogger(__name__)
6
+ logging.basicConfig(level=logging.INFO)
7
+
8
+
9
class NpzEmbeddingHelper:
    """ Saving and loading of embeddings via NpzRepositoryUtils,
        with the shape of the data reported to the module logger.
    """

    @staticmethod
    def save_embedding(data, target):
        """ Save `data` into `target` and log its shape afterwards. """
        NpzRepositoryUtils.save(data=data, target=target)
        message = "Saving embedding [size={shape}]: {filepath}".format(
            shape=data.shape, filepath=target)
        logger.info(message)

    @staticmethod
    def load_embedding(source):
        """ Load the embedding from `source`, log its shape and return it. """
        loaded = NpzRepositoryUtils.load(source)
        message = "Embedding read [size={size}]: {filepath}".format(
            size=loaded.shape, filepath=source)
        logger.info(message)
        return loaded
@@ -0,0 +1,13 @@
1
+ import numpy as np
2
+
3
+
4
class NpzRepositoryUtils(object):
    """ Storing of a single array in the numpy `.npz` format. """

    @staticmethod
    def save(data, target):
        """ Save `data` into `target` (filename or file-like object).

            The array is passed positionally, therefore it is stored in the
            archive under the 'arr_0' name.
            NOTE: per numpy documentation, `np.savez` appends the `.npz`
            extension to a filename that does not already have it.
        """
        np.savez(target, data)

    @staticmethod
    def load(source):
        """ Returns the array stored under the 'arr_0' name in `source`.

            The archive is closed before returning, so that the handle of
            the opened file does not leak.
        """
        with np.load(source) as archive:
            return archive['arr_0']
@@ -0,0 +1,20 @@
1
+ import logging
2
+
3
+ import numpy as np
4
+
5
+ logger = logging.getLogger(__name__)
6
+ logging.basicConfig(level=logging.INFO)
7
+
8
+
9
class VocabRepositoryUtils(object):
    """ Storing of a vocabulary as a plain text file. """

    @staticmethod
    def save(data, target):
        """ Write the entries of `data` into `target` with `np.savetxt`,
            every entry formatted as a string ('%s').
        """
        logger.info("Saving vocabulary [size={size}]: {filepath}".format(size=len(data), filepath=target))
        np.savetxt(target, data, fmt='%s')

    @staticmethod
    def load(source):
        """ Read the vocabulary from `source` as an array of strings.

            `comments=None` disables comment handling, so that entries
            containing the '#' character are read as is.
            `ndmin=1` keeps the result at least one-dimensional: without it
            a vocabulary of a single entry is returned as a 0-d array, for
            which `len` fails.
        """
        vocab = np.loadtxt(source, dtype=str, comments=None, ndmin=1)
        logger.info("Loading vocabulary [size={size}]: {filepath}".format(size=len(vocab), filepath=source))
        return vocab
File without changes
File without changes