arekit 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. arekit/__init__.py +0 -0
  2. arekit/common/__init__.py +0 -0
  3. arekit/common/bound.py +48 -0
  4. arekit/common/context/__init__.py +0 -0
  5. arekit/common/context/terms_mapper.py +51 -0
  6. arekit/common/context/token.py +16 -0
  7. arekit/common/data/__init__.py +0 -0
  8. arekit/common/data/const.py +21 -0
  9. arekit/common/data/doc_provider.py +6 -0
  10. arekit/common/data/input/__init__.py +0 -0
  11. arekit/common/data/input/providers/__init__.py +0 -0
  12. arekit/common/data/input/providers/columns/__init__.py +0 -0
  13. arekit/common/data/input/providers/columns/base.py +9 -0
  14. arekit/common/data/input/providers/columns/sample.py +59 -0
  15. arekit/common/data/input/providers/const.py +3 -0
  16. arekit/common/data/input/providers/contents.py +9 -0
  17. arekit/common/data/input/providers/instances/__init__.py +0 -0
  18. arekit/common/data/input/providers/instances/base.py +14 -0
  19. arekit/common/data/input/providers/instances/multiple.py +27 -0
  20. arekit/common/data/input/providers/instances/single.py +8 -0
  21. arekit/common/data/input/providers/label/__init__.py +0 -0
  22. arekit/common/data/input/providers/label/base.py +24 -0
  23. arekit/common/data/input/providers/label/binary.py +11 -0
  24. arekit/common/data/input/providers/label/multiple.py +15 -0
  25. arekit/common/data/input/providers/rows/__init__.py +0 -0
  26. arekit/common/data/input/providers/rows/base.py +64 -0
  27. arekit/common/data/input/providers/rows/samples.py +227 -0
  28. arekit/common/data/input/providers/sample/__init__.py +0 -0
  29. arekit/common/data/input/providers/sample/cropped.py +43 -0
  30. arekit/common/data/input/providers/text/__init__.py +0 -0
  31. arekit/common/data/input/providers/text/single.py +49 -0
  32. arekit/common/data/input/repositories/__init__.py +0 -0
  33. arekit/common/data/input/repositories/base.py +68 -0
  34. arekit/common/data/input/repositories/sample.py +22 -0
  35. arekit/common/data/input/sample.py +66 -0
  36. arekit/common/data/input/terms_mapper.py +88 -0
  37. arekit/common/data/rows_fmt.py +82 -0
  38. arekit/common/data/rows_parser.py +43 -0
  39. arekit/common/data/storages/__init__.py +0 -0
  40. arekit/common/data/storages/base.py +109 -0
  41. arekit/common/data/views/__init__.py +0 -0
  42. arekit/common/data/views/samples.py +26 -0
  43. arekit/common/docs/__init__.py +0 -0
  44. arekit/common/docs/base.py +30 -0
  45. arekit/common/docs/entities_grouping.py +16 -0
  46. arekit/common/docs/entity.py +18 -0
  47. arekit/common/docs/objects_parser.py +37 -0
  48. arekit/common/docs/parsed/__init__.py +0 -0
  49. arekit/common/docs/parsed/base.py +101 -0
  50. arekit/common/docs/parsed/providers/__init__.py +0 -0
  51. arekit/common/docs/parsed/providers/base.py +68 -0
  52. arekit/common/docs/parsed/providers/base_pairs.py +51 -0
  53. arekit/common/docs/parsed/providers/entity_service.py +175 -0
  54. arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
  55. arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
  56. arekit/common/docs/parsed/service.py +31 -0
  57. arekit/common/docs/parsed/term_position.py +42 -0
  58. arekit/common/docs/parser.py +34 -0
  59. arekit/common/docs/sentence.py +14 -0
  60. arekit/common/entities/__init__.py +0 -0
  61. arekit/common/entities/base.py +51 -0
  62. arekit/common/entities/collection.py +72 -0
  63. arekit/common/entities/str_fmt.py +8 -0
  64. arekit/common/entities/types.py +9 -0
  65. arekit/common/experiment/__init__.py +0 -0
  66. arekit/common/experiment/api/__init__.py +0 -0
  67. arekit/common/experiment/api/base_samples_io.py +20 -0
  68. arekit/common/experiment/data_type.py +17 -0
  69. arekit/common/frames/__init__.py +0 -0
  70. arekit/common/frames/connotations/__init__.py +0 -0
  71. arekit/common/frames/connotations/descriptor.py +17 -0
  72. arekit/common/frames/connotations/provider.py +4 -0
  73. arekit/common/frames/text_variant.py +43 -0
  74. arekit/common/frames/variants/__init__.py +0 -0
  75. arekit/common/frames/variants/base.py +21 -0
  76. arekit/common/frames/variants/collection.py +60 -0
  77. arekit/common/labels/__init__.py +0 -0
  78. arekit/common/labels/base.py +19 -0
  79. arekit/common/labels/provider/__init__.py +0 -0
  80. arekit/common/labels/provider/base.py +7 -0
  81. arekit/common/labels/provider/constant.py +14 -0
  82. arekit/common/labels/scaler/__init__.py +0 -0
  83. arekit/common/labels/scaler/base.py +85 -0
  84. arekit/common/labels/scaler/sentiment.py +7 -0
  85. arekit/common/labels/scaler/single.py +10 -0
  86. arekit/common/labels/str_fmt.py +55 -0
  87. arekit/common/linkage/__init__.py +0 -0
  88. arekit/common/linkage/base.py +44 -0
  89. arekit/common/linkage/meta.py +23 -0
  90. arekit/common/linkage/opinions.py +9 -0
  91. arekit/common/linkage/text_opinions.py +22 -0
  92. arekit/common/log_utils.py +29 -0
  93. arekit/common/model/__init__.py +0 -0
  94. arekit/common/model/labeling/__init__.py +0 -0
  95. arekit/common/model/labeling/base.py +24 -0
  96. arekit/common/model/labeling/modes.py +8 -0
  97. arekit/common/model/labeling/single.py +24 -0
  98. arekit/common/opinions/__init__.py +0 -0
  99. arekit/common/opinions/annot/__init__.py +0 -0
  100. arekit/common/opinions/annot/algo/__init__.py +0 -0
  101. arekit/common/opinions/annot/algo/base.py +4 -0
  102. arekit/common/opinions/annot/algo/pair_based.py +99 -0
  103. arekit/common/opinions/annot/algo/predefined.py +16 -0
  104. arekit/common/opinions/annot/algo_based.py +55 -0
  105. arekit/common/opinions/annot/base.py +15 -0
  106. arekit/common/opinions/base.py +74 -0
  107. arekit/common/opinions/collection.py +150 -0
  108. arekit/common/opinions/enums.py +6 -0
  109. arekit/common/opinions/provider.py +4 -0
  110. arekit/common/opinions/writer.py +4 -0
  111. arekit/common/pipeline/__init__.py +0 -0
  112. arekit/common/pipeline/base.py +25 -0
  113. arekit/common/pipeline/context.py +36 -0
  114. arekit/common/pipeline/conts.py +2 -0
  115. arekit/common/pipeline/items/__init__.py +0 -0
  116. arekit/common/pipeline/items/base.py +12 -0
  117. arekit/common/pipeline/items/flatten.py +14 -0
  118. arekit/common/pipeline/items/handle.py +17 -0
  119. arekit/common/pipeline/items/iter.py +11 -0
  120. arekit/common/pipeline/items/map.py +11 -0
  121. arekit/common/pipeline/items/map_nested.py +13 -0
  122. arekit/common/synonyms/__init__.py +0 -0
  123. arekit/common/synonyms/base.py +151 -0
  124. arekit/common/synonyms/grouping.py +21 -0
  125. arekit/common/text/__init__.py +0 -0
  126. arekit/common/text/enums.py +12 -0
  127. arekit/common/text/parsed.py +42 -0
  128. arekit/common/text/parser.py +12 -0
  129. arekit/common/text/partitioning/__init__.py +0 -0
  130. arekit/common/text/partitioning/base.py +4 -0
  131. arekit/common/text/partitioning/str.py +36 -0
  132. arekit/common/text/partitioning/terms.py +35 -0
  133. arekit/common/text/stemmer.py +16 -0
  134. arekit/common/text_opinions/__init__.py +0 -0
  135. arekit/common/text_opinions/base.py +105 -0
  136. arekit/common/utils.py +129 -0
  137. arekit/contrib/__init__.py +0 -0
  138. arekit/contrib/bert/__init__.py +0 -0
  139. arekit/contrib/bert/input/__init__.py +0 -0
  140. arekit/contrib/bert/input/providers/__init__.py +0 -0
  141. arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
  142. arekit/contrib/bert/input/providers/text_pair.py +62 -0
  143. arekit/contrib/bert/terms/__init__.py +0 -0
  144. arekit/contrib/bert/terms/mapper.py +20 -0
  145. arekit/contrib/networks/__init__.py +0 -0
  146. arekit/contrib/networks/embedding.py +149 -0
  147. arekit/contrib/networks/embedding_io.py +18 -0
  148. arekit/contrib/networks/input/__init__.py +0 -0
  149. arekit/contrib/networks/input/const.py +6 -0
  150. arekit/contrib/networks/input/ctx_serialization.py +28 -0
  151. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  152. arekit/contrib/networks/input/embedding/matrix.py +29 -0
  153. arekit/contrib/networks/input/embedding/offsets.py +55 -0
  154. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  155. arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
  156. arekit/contrib/networks/input/providers/__init__.py +0 -0
  157. arekit/contrib/networks/input/providers/sample.py +129 -0
  158. arekit/contrib/networks/input/providers/term_connotation.py +23 -0
  159. arekit/contrib/networks/input/providers/text.py +24 -0
  160. arekit/contrib/networks/input/rows_parser.py +47 -0
  161. arekit/contrib/networks/input/term_types.py +13 -0
  162. arekit/contrib/networks/input/terms_mapping.py +60 -0
  163. arekit/contrib/networks/vectorizer.py +6 -0
  164. arekit/contrib/prompt/__init__.py +0 -0
  165. arekit/contrib/prompt/sample.py +61 -0
  166. arekit/contrib/source/__init__.py +0 -0
  167. arekit/contrib/source/brat/__init__.py +0 -0
  168. arekit/contrib/source/brat/annot.py +84 -0
  169. arekit/contrib/source/brat/doc.py +28 -0
  170. arekit/contrib/source/brat/entities/__init__.py +0 -0
  171. arekit/contrib/source/brat/entities/compound.py +13 -0
  172. arekit/contrib/source/brat/entities/entity.py +42 -0
  173. arekit/contrib/source/brat/entities/parser.py +53 -0
  174. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  175. arekit/contrib/source/brat/opinions/converter.py +19 -0
  176. arekit/contrib/source/brat/relation.py +32 -0
  177. arekit/contrib/source/brat/sentence.py +69 -0
  178. arekit/contrib/source/brat/sentences_reader.py +128 -0
  179. arekit/contrib/source/download.py +41 -0
  180. arekit/contrib/source/nerel/__init__.py +0 -0
  181. arekit/contrib/source/nerel/entities.py +55 -0
  182. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  183. arekit/contrib/source/nerel/folding/fixed.py +74 -0
  184. arekit/contrib/source/nerel/io_utils.py +62 -0
  185. arekit/contrib/source/nerel/labels.py +241 -0
  186. arekit/contrib/source/nerel/reader.py +46 -0
  187. arekit/contrib/source/nerel/utils.py +24 -0
  188. arekit/contrib/source/nerel/versions.py +12 -0
  189. arekit/contrib/source/nerelbio/__init__.py +0 -0
  190. arekit/contrib/source/nerelbio/io_utils.py +62 -0
  191. arekit/contrib/source/nerelbio/labels.py +265 -0
  192. arekit/contrib/source/nerelbio/reader.py +8 -0
  193. arekit/contrib/source/nerelbio/versions.py +8 -0
  194. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  195. arekit/contrib/source/ruattitudes/collection.py +36 -0
  196. arekit/contrib/source/ruattitudes/doc.py +51 -0
  197. arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
  198. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  199. arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
  200. arekit/contrib/source/ruattitudes/io_utils.py +56 -0
  201. arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
  202. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  203. arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
  204. arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
  205. arekit/contrib/source/ruattitudes/reader.py +268 -0
  206. arekit/contrib/source/ruattitudes/sentence.py +73 -0
  207. arekit/contrib/source/ruattitudes/synonyms.py +17 -0
  208. arekit/contrib/source/ruattitudes/text_object.py +59 -0
  209. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  210. arekit/contrib/source/rusentiframes/collection.py +157 -0
  211. arekit/contrib/source/rusentiframes/effect.py +24 -0
  212. arekit/contrib/source/rusentiframes/io_utils.py +19 -0
  213. arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
  214. arekit/contrib/source/rusentiframes/polarity.py +35 -0
  215. arekit/contrib/source/rusentiframes/role.py +15 -0
  216. arekit/contrib/source/rusentiframes/state.py +24 -0
  217. arekit/contrib/source/rusentiframes/types.py +42 -0
  218. arekit/contrib/source/rusentiframes/value.py +2 -0
  219. arekit/contrib/source/rusentrel/__init__.py +0 -0
  220. arekit/contrib/source/rusentrel/const.py +3 -0
  221. arekit/contrib/source/rusentrel/docs_reader.py +51 -0
  222. arekit/contrib/source/rusentrel/entities.py +26 -0
  223. arekit/contrib/source/rusentrel/io_utils.py +125 -0
  224. arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
  225. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  226. arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
  227. arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
  228. arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
  229. arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
  230. arekit/contrib/source/rusentrel/synonyms.py +17 -0
  231. arekit/contrib/source/sentinerel/__init__.py +0 -0
  232. arekit/contrib/source/sentinerel/entities.py +52 -0
  233. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  234. arekit/contrib/source/sentinerel/folding/factory.py +31 -0
  235. arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
  236. arekit/contrib/source/sentinerel/io_utils.py +87 -0
  237. arekit/contrib/source/sentinerel/labels.py +53 -0
  238. arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
  239. arekit/contrib/source/sentinerel/reader.py +42 -0
  240. arekit/contrib/source/synonyms/__init__.py +0 -0
  241. arekit/contrib/source/synonyms/utils.py +19 -0
  242. arekit/contrib/source/zip_utils.py +47 -0
  243. arekit/contrib/utils/__init__.py +0 -0
  244. arekit/contrib/utils/bert/__init__.py +0 -0
  245. arekit/contrib/utils/bert/samplers.py +17 -0
  246. arekit/contrib/utils/connotations/__init__.py +0 -0
  247. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
  248. arekit/contrib/utils/data/__init__.py +0 -0
  249. arekit/contrib/utils/data/contents/__init__.py +0 -0
  250. arekit/contrib/utils/data/contents/opinions.py +37 -0
  251. arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
  252. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  253. arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
  254. arekit/contrib/utils/data/readers/__init__.py +0 -0
  255. arekit/contrib/utils/data/readers/base.py +7 -0
  256. arekit/contrib/utils/data/readers/csv_pd.py +38 -0
  257. arekit/contrib/utils/data/readers/jsonl.py +15 -0
  258. arekit/contrib/utils/data/service/__init__.py +0 -0
  259. arekit/contrib/utils/data/service/balance.py +50 -0
  260. arekit/contrib/utils/data/storages/__init__.py +0 -0
  261. arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
  262. arekit/contrib/utils/data/storages/pandas_based.py +123 -0
  263. arekit/contrib/utils/data/storages/row_cache.py +48 -0
  264. arekit/contrib/utils/data/writers/__init__.py +0 -0
  265. arekit/contrib/utils/data/writers/base.py +27 -0
  266. arekit/contrib/utils/data/writers/csv_native.py +63 -0
  267. arekit/contrib/utils/data/writers/csv_pd.py +40 -0
  268. arekit/contrib/utils/data/writers/json_opennre.py +132 -0
  269. arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
  270. arekit/contrib/utils/download.py +77 -0
  271. arekit/contrib/utils/embeddings/__init__.py +0 -0
  272. arekit/contrib/utils/embeddings/rusvectores.py +58 -0
  273. arekit/contrib/utils/embeddings/tokens.py +30 -0
  274. arekit/contrib/utils/entities/__init__.py +0 -0
  275. arekit/contrib/utils/entities/filter.py +7 -0
  276. arekit/contrib/utils/entities/formatters/__init__.py +0 -0
  277. arekit/contrib/utils/entities/formatters/str_display.py +11 -0
  278. arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
  279. arekit/contrib/utils/io_utils/__init__.py +0 -0
  280. arekit/contrib/utils/io_utils/embedding.py +72 -0
  281. arekit/contrib/utils/io_utils/opinions.py +37 -0
  282. arekit/contrib/utils/io_utils/samples.py +79 -0
  283. arekit/contrib/utils/io_utils/utils.py +39 -0
  284. arekit/contrib/utils/lexicons/__init__.py +0 -0
  285. arekit/contrib/utils/lexicons/lexicon.py +41 -0
  286. arekit/contrib/utils/lexicons/relation.py +42 -0
  287. arekit/contrib/utils/lexicons/rusentilex.py +37 -0
  288. arekit/contrib/utils/nn/__init__.py +0 -0
  289. arekit/contrib/utils/nn/rows.py +83 -0
  290. arekit/contrib/utils/np_utils/__init__.py +0 -0
  291. arekit/contrib/utils/np_utils/embedding.py +22 -0
  292. arekit/contrib/utils/np_utils/npz_utils.py +13 -0
  293. arekit/contrib/utils/np_utils/vocab.py +20 -0
  294. arekit/contrib/utils/pipelines/__init__.py +0 -0
  295. arekit/contrib/utils/pipelines/items/__init__.py +0 -0
  296. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  297. arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
  298. arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
  299. arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
  300. arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
  301. arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
  302. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
  303. arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
  304. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
  305. arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
  306. arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
  307. arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
  308. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  309. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  310. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
  311. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
  312. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
  313. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  314. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
  315. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
  316. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
  317. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  318. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
  319. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
  320. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
  321. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  322. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
  323. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
  324. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  325. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
  326. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
  327. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
  328. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
  329. arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
  330. arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
  331. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
  332. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
  333. arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
  334. arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
  335. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
  336. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
  337. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
  338. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
  339. arekit/contrib/utils/processing/__init__.py +0 -0
  340. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  341. arekit/contrib/utils/processing/languages/mods.py +12 -0
  342. arekit/contrib/utils/processing/languages/pos.py +23 -0
  343. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  344. arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
  345. arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
  346. arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
  347. arekit/contrib/utils/processing/languages/ru/number.py +23 -0
  348. arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
  349. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  350. arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
  351. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  352. arekit/contrib/utils/processing/pos/base.py +12 -0
  353. arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
  354. arekit/contrib/utils/processing/pos/russian.py +10 -0
  355. arekit/contrib/utils/processing/text/__init__.py +0 -0
  356. arekit/contrib/utils/processing/text/tokens.py +127 -0
  357. arekit/contrib/utils/resources.py +25 -0
  358. arekit/contrib/utils/serializer.py +43 -0
  359. arekit/contrib/utils/sources/__init__.py +0 -0
  360. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  361. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  362. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
  363. arekit/contrib/utils/synonyms/__init__.py +0 -0
  364. arekit/contrib/utils/synonyms/simple.py +15 -0
  365. arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
  366. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  367. arekit/contrib/utils/vectorizers/bpe.py +93 -0
  368. arekit/contrib/utils/vectorizers/random_norm.py +39 -0
  369. arekit/download_data.py +11 -0
  370. arekit-0.24.0.dist-info/LICENSE +21 -0
  371. arekit-0.24.0.dist-info/METADATA +23 -0
  372. arekit-0.24.0.dist-info/RECORD +374 -0
  373. arekit-0.24.0.dist-info/WHEEL +5 -0
  374. arekit-0.24.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,87 @@
1
+ from enum import Enum
2
+ from os import path
3
+ from os.path import basename, join
4
+
5
+ import enum
6
+
7
+ from arekit.contrib.source.sentinerel.folding.factory import SentiNERELFoldingFactory
8
+ from arekit.contrib.source.zip_utils import ZipArchiveUtils
9
+
10
+
11
class SentiNerelVersions(Enum):
    """ Releases of the SentiNEREL collection supported by the reader.
        The value of every member is the version tag of the release.
    """

    # The very first release of the collection.
    V1 = "v1_0"

    # Annotation of the second half of the texts was updated (September 2022).
    V2 = "v2_0"

    # Annotation of the first half of the texts was updated (October 2022).
    # This release became a source of the RuSentNE-2023 competition:
    # https://github.com/dialogue-evaluation/RuSentNE-evaluation
    V21 = "v2_1"


# The release which is used whenever a caller does not pick one explicitly.
DEFAULT_VERSION = SentiNerelVersions.V21
26
+
27
+
28
class SentiNerelIOUtils(ZipArchiveUtils):
    """ Provides access to the contents of the SentiNEREL archive:
        annotation files (`.ann`), document texts (`.txt`) and
        the file with the fixed split of the collection.
    """

    # Name of the root folder inside of the archive.
    inner_root = "sentiment_dataset"

    @staticmethod
    def get_archive_filepath(version):
        """ Path of the archive on the local file system.
            version: value that is substituted into the archive file name.
        """
        # This one is a real file system path, so the OS-specific join is correct.
        return path.join(SentiNerelIOUtils.get_data_root(), "sentinerel-{}.zip".format(version))

    @staticmethod
    def get_annotation_innerpath(filename):
        """ Path of the annotation file (`.ann`) inside of the archive.
            filename: str, document name without extension.
        """
        assert(isinstance(filename, str))
        # NOTE: names of the zip archive members are always separated with `/`,
        # regardless of the platform. Joining them with `os.path.join` results
        # in `\\` on Windows and the member cannot be found then.
        return "/".join([SentiNerelIOUtils.inner_root, "{}.ann".format(filename)])

    @staticmethod
    def get_doc_innerpath(filename):
        """ Path of the document text file (`.txt`) inside of the archive.
            filename: str, document name without extension.
        """
        assert(isinstance(filename, str))
        # See the note on separators in `get_annotation_innerpath`: `/` is required here.
        return "/".join([SentiNerelIOUtils.inner_root, "{}.txt".format(filename)])

    @staticmethod
    def __iter_filenames_from_dataset(folder_name, version):
        """ Iterates over the names (no folders, no extension) of the `.txt`
            archive members whose cropped inner path contains `folder_name`.
        """
        assert(isinstance(version, enum.Enum))
        assert(isinstance(folder_name, str))

        for inner_name in SentiNerelIOUtils.iter_filenames_from_zip(version):

            # Only document texts are considered.
            if not inner_name.endswith(".txt"):
                continue

            # Crop extension.
            cropped_name = inner_name[:-4]

            if folder_name not in cropped_name:
                continue

            yield basename(cropped_name)

    # region public methods

    @staticmethod
    def iter_collection_filenames(version=DEFAULT_VERSION):
        """ Iterates over the pairs (doc_id, filename), where doc_id is the
            zero-based index of the document in the order of the archive listing.
        """
        filenames_it = SentiNerelIOUtils.__iter_filenames_from_dataset(
            folder_name=SentiNerelIOUtils.inner_root, version=version)

        yield from enumerate(filenames_it)

    @staticmethod
    def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
        """ Provides a fixed split of the dataset onto
            `test` and `training` part:
            https://github.com/nicolay-r/SentiNEREL-attitude-extraction

            docs_limit: passed as `limit` into the folding factory
                (presumably the maximal amount of documents; verify against the factory).
        """
        return SentiNerelIOUtils.read_from_zip(
            # Archive member path: `/` separator on every platform.
            inner_path="/".join([SentiNerelIOUtils.inner_root, "split_fixed.txt"]),
            process_func=lambda f: SentiNERELFoldingFactory.create_fixed_folding(file=f, limit=docs_limit),
            version=version)

    # endregion
@@ -0,0 +1,53 @@
1
+ from arekit.common.labels.base import Label
2
+
3
+
4
# Label types of the SentiNEREL collection.
# Every class below is an empty marker: it declares no members of its own
# and differs from the base `Label` only by its type.
# NOTE(review): the meaning of each label is presumably defined by the
# annotation guidelines of the collection -- confirm before documenting it here.

class OpinionBelongsTo(Label):
    pass


class OpinionRelatesTo(Label):
    pass


class NegEffectFrom(Label):
    pass


class NegStateFrom(Label):
    pass


class PosEffectFrom(Label):
    pass


class PosAuthorFrom(Label):
    pass


class NegAuthorFrom(Label):
    pass


class PosStateFrom(Label):
    pass


class NegativeTo(Label):
    pass


class PositiveTo(Label):
    pass


class AlternativeName(Label):
    pass


class StateBelongsTo(Label):
    pass


class OriginsFrom(Label):
    pass
@@ -0,0 +1,30 @@
1
+ from collections import OrderedDict
2
+
3
+ from arekit.common.labels.scaler.base import BaseLabelScaler
4
+ from arekit.contrib.source.sentinerel import labels
5
+
6
+
7
class SentiNerelLabelScaler(BaseLabelScaler):
    """ Label scaler which covers all the label types declared in
        `arekit.contrib.source.sentinerel.labels`.
    """

    def __init__(self):

        # NOTE: the order matters -- the position of the label type in this
        # sequence is the value that is assigned to its instance.
        label_types = (
            labels.OpinionBelongsTo,
            labels.OpinionRelatesTo,
            labels.NegEffectFrom,
            labels.PosEffectFrom,
            labels.NegStateFrom,
            labels.PosStateFrom,
            labels.NegativeTo,
            labels.PositiveTo,
            labels.StateBelongsTo,
            labels.PosAuthorFrom,
            labels.NegAuthorFrom,
            labels.AlternativeName,
            labels.OriginsFrom,
        )

        # Keys are label instances, values are their indices (0..12).
        self.__uint_to_label_dict = OrderedDict(
            (label_type(), index) for index, label_type in enumerate(label_types))

        # The same mapping serves both the `int` and the `uint` representation.
        super(SentiNerelLabelScaler, self).__init__(int_dict=self.__uint_to_label_dict,
                                                    uint_dict=self.__uint_to_label_dict)
@@ -0,0 +1,42 @@
1
+ from arekit.contrib.source.brat.annot import BratAnnotationParser
2
+ from arekit.contrib.source.brat.doc import BratDocument
3
+ from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
4
+ from arekit.contrib.source.sentinerel.entities import SentiNerelEntityCollection
5
+ from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils, DEFAULT_VERSION
6
+
7
+
8
class SentiNerelDocReader(object):
    """ Reads documents of the SentiNEREL collection from its archive.
    """

    @staticmethod
    def read_text_relations(filename, version):
        """ Returns the list of relations parsed from the annotation file of the document.
            filename: str, document name without extension.
            version: version of the collection to read from.
        """
        assert(isinstance(filename, str))

        def extract_relations(input_file):
            annotations = BratAnnotationParser.parse_annotations(
                input_file=input_file, encoding='utf-8-sig')
            return list(annotations["relations"])

        return SentiNerelIOUtils.read_from_zip(
            inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
            process_func=extract_relations,
            version=version)

    @staticmethod
    def read_document(filename, doc_id, version=DEFAULT_VERSION, entities_to_ignore=None):
        """ Composes a BratDocument out of the document text, its entities and text relations.
            filename: str, document name without extension.
            doc_id: int, identifier assigned to the resulting document.
            entities_to_ignore: list of the entity types to skip, or None for the default list.
        """
        assert(isinstance(filename, str))
        assert(isinstance(doc_id, int))

        # TODO. #398 issue -- in some cases entities might be nested. Therefore we limit the set
        # TODO. of the potential named entities.
        if entities_to_ignore is None:
            eti = ["EFFECT_NEG", "EFFECT_POS", "ARGUMENT_NEG", "ARGUMENT_POS", "EVENT"]
        else:
            eti = entities_to_ignore

        entities = SentiNerelEntityCollection.read_collection(
            filename=filename, version=version, entities_to_ignore=eti)
        text_relations = SentiNerelDocReader.read_text_relations(filename=filename, version=version)

        def file_to_doc(input_file):
            # Sentences are read from the text file and are bound to the entities read above.
            sentences = BratDocumentSentencesReader.from_file(input_file=input_file, entities=entities)
            return BratDocument(doc_id=doc_id, sentences=sentences, text_relations=text_relations)

        return SentiNerelIOUtils.read_from_zip(
            inner_path=SentiNerelIOUtils.get_doc_innerpath(filename=filename),
            process_func=file_to_doc,
            version=version)
File without changes
@@ -0,0 +1,19 @@
1
+ from arekit.common.utils import progress_bar_defined
2
+
3
+
4
def iter_synonym_groups(input_file, sep=",", desc=""):
    """ Iterates over the groups of synonyms. Every line of the
        `input_file` is a single group with the values separated by `sep`.

        input_file: opened file; its lines might be either `str` or `bytes`.
        sep: str, separator of the values within a line.
        desc: str, description passed to the progress bar.

        NOTE: values are yielded as they are, i.e. nothing is stripped.
    """
    rows = input_file.readlines()

    for row in progress_bar_defined(rows, total=len(rows), desc=desc, unit="opins"):
        # Lines read in a binary mode have to be decoded first.
        text = row.decode() if isinstance(row, bytes) else row
        yield text.split(sep)
@@ -0,0 +1,47 @@
1
+ import zipfile
2
+
3
+ import enum
4
+
5
+ from arekit.common import utils
6
+
7
+
8
class ZipArchiveUtils(object):
    """ Base utilities for reading the contents of a versioned zip archive.
        Subclasses are expected to override `get_archive_filepath`.
    """

    @staticmethod
    def get_archive_filepath(version):
        """ Path of the archive for the given version value; has to be overridden.
        """
        raise NotImplementedError()

    @classmethod
    def read_from_zip(cls, inner_path, process_func, version):
        """ Opens the archive member and returns the result of `process_func` for it.

            inner_path: str, path of the member inside of the archive.
            process_func:
                func which receives a file reader
            version: enum member; its `value` is passed into `get_archive_filepath`.
        """
        assert(isinstance(inner_path, str))
        assert(callable(process_func))
        assert(isinstance(version, enum.Enum))

        archive_path = cls.get_archive_filepath(version.value)
        with zipfile.ZipFile(archive_path, "r") as archive, \
                archive.open(inner_path, mode='r') as member:
            return process_func(member)

    @classmethod
    def iter_from_zip(cls, inner_path, process_func, version):
        """ Same as `read_from_zip`, but yields the items provided by `process_func`
            one by one; the archive remains opened while the iteration goes on.
        """
        assert(isinstance(inner_path, str))
        assert(callable(process_func))
        assert(isinstance(version, enum.Enum))

        archive_path = cls.get_archive_filepath(version.value)
        with zipfile.ZipFile(archive_path, "r") as archive, \
                archive.open(inner_path, mode='r') as member:
            yield from process_func(member)

    @classmethod
    def iter_filenames_from_zip(cls, version):
        """ Returns an iterator over the names of all the archive members.
        """
        assert(isinstance(version, enum.Enum))
        archive_path = cls.get_archive_filepath(version.value)
        with zipfile.ZipFile(archive_path, "r") as archive:
            # The listing is a complete list, so it is safe to use it once the archive is closed.
            member_names = archive.namelist()
            return iter(member_names)

    @staticmethod
    def get_data_root():
        """ Folder with the downloaded resources.
        """
        return utils.get_default_download_dir()
File without changes
File without changes
@@ -0,0 +1,17 @@
1
+ from arekit.common.data.input.providers.label.multiple import MultipleLabelProvider
2
+ from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
3
+ from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
4
+ from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
5
+ from arekit.contrib.bert.input.providers.text_pair import PairTextProvider
6
+
7
+
8
def create_sample_provider(label_scaler, text_terms_mapper, text_b_prompt=None):
    """ Composes a rows provider of the samples.

        label_scaler: scaler passed into the labels provider.
        text_terms_mapper: OpinionContainingTextTermsMapper, passed into the text provider.
        text_b_prompt: optional prompt; once given, a pair-based text provider
            is used instead of the single-text one.
    """
    assert(isinstance(text_terms_mapper, OpinionContainingTextTermsMapper))

    if text_b_prompt is None:
        text_provider = BaseSingleTextProvider(text_terms_mapper=text_terms_mapper)
    else:
        text_provider = PairTextProvider(text_b_prompt=text_b_prompt,
                                         text_terms_mapper=text_terms_mapper)

    return BaseSampleRowProvider(
        text_provider=text_provider,
        label_provider=MultipleLabelProvider(label_scaler=label_scaler))
File without changes
@@ -0,0 +1,23 @@
1
+ from arekit.common.frames.connotations.provider import FrameConnotationProvider
2
+ from arekit.contrib.source.rusentiframes.collection import RuSentiFramesCollection
3
+
4
+
5
class RuSentiFramesConnotationProvider(FrameConnotationProvider):
    """ Connotation provider that relies on the A0->A1 label type
        of the RuSentiFrames collection.
        More details on the collection are available at:
        https://github.com/nicolay-r/RuSentiFrames

        Papers:
        [1] Natalia Loukachevitch, Nicolay Rusnachenko: Sentiment Frames
            for Attitude Extraction in Russian, 2020
        [2] Distant Supervision for Sentiment Attitude Extraction, 2019
    """

    def __init__(self, collection):
        """ collection: RuSentiFramesCollection
                source of the frame polarities.
        """
        assert(isinstance(collection, RuSentiFramesCollection))
        self.__collection = collection

    def try_provide(self, frame_id):
        """ Returns the result of the collection lookup of the
            a0 -> a1 polarity for the given frame.
        """
        polarity = self.__collection.try_get_frame_polarity(
            frame_id=frame_id, role_src='a0', role_dest='a1')
        return polarity
File without changes
File without changes
@@ -0,0 +1,37 @@
1
+ from arekit.common.data.input.providers.const import IDLE_MODE
2
+ from arekit.common.data.input.providers.contents import ContentsProvider
3
+ from arekit.common.linkage.base import LinkedDataWrapper
4
+ from arekit.common.linkage.text_opinions import TextOpinionsLinkage
5
+ from arekit.common.pipeline.base import BasePipeline
6
+ from arekit.common.text_opinions.base import TextOpinion
7
+
8
+
9
class InputTextOpinionProvider(ContentsProvider):
    """ Provides linkages produced by a pipeline, assigning sequential
        identifiers to the text opinions of every TextOpinionsLinkage.
    """

    def __init__(self, pipeline):
        """ NOTE: it is important that the output of the pipeline
            results in a TextOpinionLinkage instances.
            pipeline: id -> ... -> TextOpinionLinkage[]
        """
        assert(isinstance(pipeline, BasePipeline))
        self.__pipeline = pipeline
        # Identifier for the next text opinion; reset by `from_doc_ids`.
        self.__current_id = None

    def __assign_ids(self, linkage):
        """ Assigns consecutive identifiers to every text opinion of the linkage,
            continuing from the current value of the counter.
        """
        assert(isinstance(linkage, TextOpinionsLinkage))
        for opinion in linkage:
            assert(isinstance(opinion, TextOpinion))
            opinion.set_text_opinion_id(self.__current_id)
            self.__current_id = self.__current_id + 1

    def from_doc_ids(self, doc_ids, idle_mode=False):
        """ Runs the pipeline over the given document ids and yields every
            produced linkage; identifiers start from 0 on every call.

            doc_ids:
                passed as is to the pipeline.
            idle_mode: bool
                forwarded to the pipeline under the IDLE_MODE parameter.
        """
        self.__current_id = 0
        run_params = {IDLE_MODE: idle_mode}
        for item in self.__pipeline.run(doc_ids, params_dict=run_params):
            assert(isinstance(item, LinkedDataWrapper))
            # Only text-opinion linkages carry opinions that need identifiers.
            if isinstance(item, TextOpinionsLinkage):
                self.__assign_ids(item)
            yield item
File without changes
@@ -0,0 +1,13 @@
1
+ from arekit.common.data.doc_provider import DocumentProvider
2
+
3
+
4
class DictionaryBasedDocumentProvider(DocumentProvider):
    """ Document provider backed by an in-memory dictionary.
    """

    def __init__(self, d):
        """ d: dict
                documents addressed by their integer identifiers.
        """
        assert(isinstance(d, dict))
        super().__init__()
        self.__d = d

    def by_id(self, doc_id):
        """ Returns the entry stored under `doc_id`;
            a missing identifier results in KeyError.
        """
        assert(isinstance(doc_id, int))
        document = self.__d[doc_id]
        return document
@@ -0,0 +1,53 @@
1
+ from os.path import join
2
+
3
+ from arekit.common.data.doc_provider import DocumentProvider
4
+ from arekit.common.docs.base import Document
5
+ from arekit.common.docs.sentence import BaseDocumentSentence
6
+
7
+
8
class DirectoryFilesDocProvider(DocumentProvider):
    """ Document provider based on the list of provided file names
        located in a particular directory.
    """

    def __init__(self, dir_path, file_names=None, sentence_parser=None, encoding="utf-8"):
        """
            dir_path: str
                path to the root directory.
            file_names: list or None
                names of the document files, joined with `dir_path` on reading;
                the document identifier is an index in this list.
                NOTE: `by_id` and `len()` require this to be a list.
            sentence_parser: callable or None
                how data is supposed to be separated onto sentences:
                str -> list(str)
                By default the text is split by line breaks and every
                line is stripped.
            encoding: str or None
                encoding used for reading the files; None falls back onto
                the locale-dependent default of `open`.
        """
        assert(isinstance(dir_path, str))
        assert(isinstance(file_names, list) or file_names is None)
        assert(callable(sentence_parser) or sentence_parser is None)
        assert(isinstance(encoding, str) or encoding is None)

        self.__dir_path = dir_path
        self.__file_names = file_names
        # An explicit encoding keeps reading independent of the platform locale.
        self.__encoding = encoding

        # Line-split sentence parser by default.
        self.__sentence_parser = (lambda text: [t.strip() for t in text.split('\n')]) \
            if sentence_parser is None else sentence_parser

    def __read_doc(self, doc_id, contents):
        """ Parse a single document: split the contents onto sentences
            and wrap every sentence text.
        """
        sentences = [BaseDocumentSentence(text)
                     for text in self.__sentence_parser(contents)]
        return Document(doc_id=doc_id, sentences=sentences)

    def by_id(self, doc_id):
        """ Perform reading operation of the document.

            doc_id:
                index of the file name in the provided list.

            NOTE: the resulting document receives the file name
            (not the index) as its identifier.
        """
        file_name = self.__file_names[doc_id]
        with open(join(self.__dir_path, file_name), "r", encoding=self.__encoding) as f:
            contents = f.read()
        return self.__read_doc(doc_id=file_name, contents=contents)

    def __len__(self):
        """ Amount of the provided file names.
        """
        return len(self.__file_names)
File without changes
@@ -0,0 +1,7 @@
1
class BaseReader:
    """ Interface of a reader; both operations have to be
        implemented by the derived classes.
    """

    def extension(self):
        """ Not implemented here: raises NotImplementedError.
        """
        raise NotImplementedError

    def read(self, target):
        """ Not implemented here: raises NotImplementedError.
        """
        raise NotImplementedError
@@ -0,0 +1,38 @@
1
+ import importlib
2
+
3
+ from arekit.contrib.utils.data.readers.base import BaseReader
4
+ from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
5
+
6
+
7
class PandasCsvReader(BaseReader):
    """ Represents a CSV-based reader, implemented via pandas API.
    """

    def __init__(self, sep='\t', header='infer', compression='infer', encoding='utf-8', col_types=None,
                 custom_extension=None):
        """ All the parameters except `custom_extension` are forwarded to
            `pandas.read_csv`; `col_types` is passed as its `dtype`.

            custom_extension: str or None
                replaces the default ".tsv.gz" extension when provided.
        """
        self.__sep = sep
        self.__compression = compression
        self.__encoding = encoding
        self.__header = header
        self.__custom_extension = custom_extension

        # Special assignation of types for certain columns;
        # nothing is assigned when it is not provided.
        self.__col_types = dict() if col_types is None else col_types

    def extension(self):
        """ Extension of the files this reader is related to.
        """
        if self.__custom_extension is None:
            return ".tsv.gz"
        return self.__custom_extension

    def __from_csv(self, filepath):
        """ Reads the file into a dataframe with the settings of this reader.
        """
        # pandas is imported on demand, at the time of reading.
        pandas_module = importlib.import_module("pandas")
        read_kwargs = dict(sep=self.__sep,
                           encoding=self.__encoding,
                           compression=self.__compression,
                           dtype=self.__col_types,
                           header=self.__header)
        return pandas_module.read_csv(filepath, **read_kwargs)

    def read(self, target):
        """ Reads `target` and wraps the resulting dataframe into a rows storage.
        """
        return PandasBasedRowsStorage(self.__from_csv(filepath=target))
@@ -0,0 +1,15 @@
1
+ from arekit.contrib.utils.data.readers.base import BaseReader
2
+ from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage
3
+
4
+
5
class JsonlReader(BaseReader):
    """ Reader of the line-delimited JSON files: every non-blank line
        of the file becomes a single row of the resulting storage.
    """

    def extension(self):
        """ Extension of the files this reader is related to.
        """
        return ".jsonl"

    def read(self, target):
        """ Reads the rows of the file as raw (not yet decoded) strings.

            target: str
                path to the file.

            returns: JsonlBasedRowsStorage
                storage over the list of the raw lines.
        """
        # JSON text is UTF-8 encoded, hence reading must not depend
        # on the platform locale.
        with open(target, "r", encoding="utf-8") as f:
            # Blank lines carry no JSON document and can't be decoded
            # afterwards, so they are skipped; the file is streamed
            # instead of being materialised twice.
            rows = [line for line in f if line.strip()]
        return JsonlBasedRowsStorage(rows)
File without changes
@@ -0,0 +1,50 @@
1
+ import gc
2
+ import importlib
3
+ from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
4
+
5
+
6
class PandasBasedStorageBalancing(object):

    @staticmethod
    def create_balanced_from(storage, column_name, free_origin=True):
        """ Performs oversampled balancing.

            Note: it is quite important to remove previously created storage
            in order to avoid memory leaking.

            storage: PandasBasedRowsStorage
                storage contents to be balanced.

            column_name: str
                column utilized for balancing.

            free_origin: bool
                indicates whether there is a need to release the resources
                utilized for the original storage.

            returns: PandasBasedRowsStorage
                new storage with the sampled rows followed by all the original rows.
        """
        assert(isinstance(storage, PandasBasedRowsStorage))

        original_df = storage.DataFrame

        # Size of the most frequent class: the target size for every class.
        max_size = original_df[column_name].value_counts().max()

        # For every class, sample (with replacement) as many
        # additional rows as it lacks up to the target size.
        dframes = [group.sample(max_size - len(group), replace=True)
                   for _, group in original_df.groupby(column_name)]

        pd = importlib.import_module("pandas")
        balanced_df = pd.concat(dframes + [original_df])

        # Drop the local references onto the temporary and original dataframes,
        # so that nothing in this scope keeps them alive during the collection
        # below. (Deleting a loop variable, as was done before, does not remove
        # the list entries.)
        del dframes
        del original_df

        # Marking the original dataframe as released
        # in terms of the allocated memory for it.
        if free_origin:
            storage.free()

        gc.collect()

        return PandasBasedRowsStorage(df=balanced_df)
File without changes
@@ -0,0 +1,18 @@
1
+ import json
2
+
3
+ from arekit.common.data.storages.base import BaseRowsStorage
4
+
5
+
6
class JsonlBasedRowsStorage(BaseRowsStorage):
    """ Rows storage over a list of JSON-encoded strings;
        every string is decoded at the time of iteration.
    """

    def __init__(self, rows):
        """ rows: list
                JSON-encoded strings, one per row.
        """
        assert(isinstance(rows, list))
        self.__rows = rows

    def _iter_rows(self):
        """ Yields pairs of (row index, decoded row contents).
        """
        for index, raw_row in enumerate(self.__rows):
            assert(isinstance(raw_row, str))
            decoded_row = json.loads(raw_row)
            yield index, decoded_row

    def _get_rows_count(self):
        """ Amount of the stored rows.
        """
        rows_count = len(self.__rows)
        return rows_count