arekit 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. arekit/__init__.py +0 -0
  2. arekit/common/__init__.py +0 -0
  3. arekit/common/bound.py +48 -0
  4. arekit/common/context/__init__.py +0 -0
  5. arekit/common/context/terms_mapper.py +51 -0
  6. arekit/common/context/token.py +16 -0
  7. arekit/common/data/__init__.py +0 -0
  8. arekit/common/data/const.py +21 -0
  9. arekit/common/data/doc_provider.py +6 -0
  10. arekit/common/data/input/__init__.py +0 -0
  11. arekit/common/data/input/providers/__init__.py +0 -0
  12. arekit/common/data/input/providers/columns/__init__.py +0 -0
  13. arekit/common/data/input/providers/columns/base.py +9 -0
  14. arekit/common/data/input/providers/columns/sample.py +59 -0
  15. arekit/common/data/input/providers/const.py +3 -0
  16. arekit/common/data/input/providers/contents.py +9 -0
  17. arekit/common/data/input/providers/instances/__init__.py +0 -0
  18. arekit/common/data/input/providers/instances/base.py +14 -0
  19. arekit/common/data/input/providers/instances/multiple.py +27 -0
  20. arekit/common/data/input/providers/instances/single.py +8 -0
  21. arekit/common/data/input/providers/label/__init__.py +0 -0
  22. arekit/common/data/input/providers/label/base.py +24 -0
  23. arekit/common/data/input/providers/label/binary.py +11 -0
  24. arekit/common/data/input/providers/label/multiple.py +15 -0
  25. arekit/common/data/input/providers/rows/__init__.py +0 -0
  26. arekit/common/data/input/providers/rows/base.py +64 -0
  27. arekit/common/data/input/providers/rows/samples.py +227 -0
  28. arekit/common/data/input/providers/sample/__init__.py +0 -0
  29. arekit/common/data/input/providers/sample/cropped.py +43 -0
  30. arekit/common/data/input/providers/text/__init__.py +0 -0
  31. arekit/common/data/input/providers/text/single.py +49 -0
  32. arekit/common/data/input/repositories/__init__.py +0 -0
  33. arekit/common/data/input/repositories/base.py +68 -0
  34. arekit/common/data/input/repositories/sample.py +22 -0
  35. arekit/common/data/input/sample.py +66 -0
  36. arekit/common/data/input/terms_mapper.py +88 -0
  37. arekit/common/data/rows_fmt.py +82 -0
  38. arekit/common/data/rows_parser.py +43 -0
  39. arekit/common/data/storages/__init__.py +0 -0
  40. arekit/common/data/storages/base.py +109 -0
  41. arekit/common/data/views/__init__.py +0 -0
  42. arekit/common/data/views/samples.py +26 -0
  43. arekit/common/docs/__init__.py +0 -0
  44. arekit/common/docs/base.py +30 -0
  45. arekit/common/docs/entities_grouping.py +16 -0
  46. arekit/common/docs/entity.py +18 -0
  47. arekit/common/docs/objects_parser.py +37 -0
  48. arekit/common/docs/parsed/__init__.py +0 -0
  49. arekit/common/docs/parsed/base.py +101 -0
  50. arekit/common/docs/parsed/providers/__init__.py +0 -0
  51. arekit/common/docs/parsed/providers/base.py +68 -0
  52. arekit/common/docs/parsed/providers/base_pairs.py +51 -0
  53. arekit/common/docs/parsed/providers/entity_service.py +175 -0
  54. arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
  55. arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
  56. arekit/common/docs/parsed/service.py +31 -0
  57. arekit/common/docs/parsed/term_position.py +42 -0
  58. arekit/common/docs/parser.py +34 -0
  59. arekit/common/docs/sentence.py +14 -0
  60. arekit/common/entities/__init__.py +0 -0
  61. arekit/common/entities/base.py +51 -0
  62. arekit/common/entities/collection.py +72 -0
  63. arekit/common/entities/str_fmt.py +8 -0
  64. arekit/common/entities/types.py +9 -0
  65. arekit/common/experiment/__init__.py +0 -0
  66. arekit/common/experiment/api/__init__.py +0 -0
  67. arekit/common/experiment/api/base_samples_io.py +20 -0
  68. arekit/common/experiment/data_type.py +17 -0
  69. arekit/common/frames/__init__.py +0 -0
  70. arekit/common/frames/connotations/__init__.py +0 -0
  71. arekit/common/frames/connotations/descriptor.py +17 -0
  72. arekit/common/frames/connotations/provider.py +4 -0
  73. arekit/common/frames/text_variant.py +43 -0
  74. arekit/common/frames/variants/__init__.py +0 -0
  75. arekit/common/frames/variants/base.py +21 -0
  76. arekit/common/frames/variants/collection.py +60 -0
  77. arekit/common/labels/__init__.py +0 -0
  78. arekit/common/labels/base.py +19 -0
  79. arekit/common/labels/provider/__init__.py +0 -0
  80. arekit/common/labels/provider/base.py +7 -0
  81. arekit/common/labels/provider/constant.py +14 -0
  82. arekit/common/labels/scaler/__init__.py +0 -0
  83. arekit/common/labels/scaler/base.py +85 -0
  84. arekit/common/labels/scaler/sentiment.py +7 -0
  85. arekit/common/labels/scaler/single.py +10 -0
  86. arekit/common/labels/str_fmt.py +55 -0
  87. arekit/common/linkage/__init__.py +0 -0
  88. arekit/common/linkage/base.py +44 -0
  89. arekit/common/linkage/meta.py +23 -0
  90. arekit/common/linkage/opinions.py +9 -0
  91. arekit/common/linkage/text_opinions.py +22 -0
  92. arekit/common/log_utils.py +29 -0
  93. arekit/common/model/__init__.py +0 -0
  94. arekit/common/model/labeling/__init__.py +0 -0
  95. arekit/common/model/labeling/base.py +24 -0
  96. arekit/common/model/labeling/modes.py +8 -0
  97. arekit/common/model/labeling/single.py +24 -0
  98. arekit/common/opinions/__init__.py +0 -0
  99. arekit/common/opinions/annot/__init__.py +0 -0
  100. arekit/common/opinions/annot/algo/__init__.py +0 -0
  101. arekit/common/opinions/annot/algo/base.py +4 -0
  102. arekit/common/opinions/annot/algo/pair_based.py +99 -0
  103. arekit/common/opinions/annot/algo/predefined.py +16 -0
  104. arekit/common/opinions/annot/algo_based.py +55 -0
  105. arekit/common/opinions/annot/base.py +15 -0
  106. arekit/common/opinions/base.py +74 -0
  107. arekit/common/opinions/collection.py +150 -0
  108. arekit/common/opinions/enums.py +6 -0
  109. arekit/common/opinions/provider.py +4 -0
  110. arekit/common/opinions/writer.py +4 -0
  111. arekit/common/pipeline/__init__.py +0 -0
  112. arekit/common/pipeline/base.py +25 -0
  113. arekit/common/pipeline/context.py +36 -0
  114. arekit/common/pipeline/conts.py +2 -0
  115. arekit/common/pipeline/items/__init__.py +0 -0
  116. arekit/common/pipeline/items/base.py +12 -0
  117. arekit/common/pipeline/items/flatten.py +14 -0
  118. arekit/common/pipeline/items/handle.py +17 -0
  119. arekit/common/pipeline/items/iter.py +11 -0
  120. arekit/common/pipeline/items/map.py +11 -0
  121. arekit/common/pipeline/items/map_nested.py +13 -0
  122. arekit/common/synonyms/__init__.py +0 -0
  123. arekit/common/synonyms/base.py +151 -0
  124. arekit/common/synonyms/grouping.py +21 -0
  125. arekit/common/text/__init__.py +0 -0
  126. arekit/common/text/enums.py +12 -0
  127. arekit/common/text/parsed.py +42 -0
  128. arekit/common/text/parser.py +12 -0
  129. arekit/common/text/partitioning/__init__.py +0 -0
  130. arekit/common/text/partitioning/base.py +4 -0
  131. arekit/common/text/partitioning/str.py +36 -0
  132. arekit/common/text/partitioning/terms.py +35 -0
  133. arekit/common/text/stemmer.py +16 -0
  134. arekit/common/text_opinions/__init__.py +0 -0
  135. arekit/common/text_opinions/base.py +105 -0
  136. arekit/common/utils.py +129 -0
  137. arekit/contrib/__init__.py +0 -0
  138. arekit/contrib/bert/__init__.py +0 -0
  139. arekit/contrib/bert/input/__init__.py +0 -0
  140. arekit/contrib/bert/input/providers/__init__.py +0 -0
  141. arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
  142. arekit/contrib/bert/input/providers/text_pair.py +62 -0
  143. arekit/contrib/bert/terms/__init__.py +0 -0
  144. arekit/contrib/bert/terms/mapper.py +20 -0
  145. arekit/contrib/networks/__init__.py +0 -0
  146. arekit/contrib/networks/embedding.py +149 -0
  147. arekit/contrib/networks/embedding_io.py +18 -0
  148. arekit/contrib/networks/input/__init__.py +0 -0
  149. arekit/contrib/networks/input/const.py +6 -0
  150. arekit/contrib/networks/input/ctx_serialization.py +28 -0
  151. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  152. arekit/contrib/networks/input/embedding/matrix.py +29 -0
  153. arekit/contrib/networks/input/embedding/offsets.py +55 -0
  154. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  155. arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
  156. arekit/contrib/networks/input/providers/__init__.py +0 -0
  157. arekit/contrib/networks/input/providers/sample.py +129 -0
  158. arekit/contrib/networks/input/providers/term_connotation.py +23 -0
  159. arekit/contrib/networks/input/providers/text.py +24 -0
  160. arekit/contrib/networks/input/rows_parser.py +47 -0
  161. arekit/contrib/networks/input/term_types.py +13 -0
  162. arekit/contrib/networks/input/terms_mapping.py +60 -0
  163. arekit/contrib/networks/vectorizer.py +6 -0
  164. arekit/contrib/prompt/__init__.py +0 -0
  165. arekit/contrib/prompt/sample.py +61 -0
  166. arekit/contrib/source/__init__.py +0 -0
  167. arekit/contrib/source/brat/__init__.py +0 -0
  168. arekit/contrib/source/brat/annot.py +84 -0
  169. arekit/contrib/source/brat/doc.py +28 -0
  170. arekit/contrib/source/brat/entities/__init__.py +0 -0
  171. arekit/contrib/source/brat/entities/compound.py +13 -0
  172. arekit/contrib/source/brat/entities/entity.py +42 -0
  173. arekit/contrib/source/brat/entities/parser.py +53 -0
  174. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  175. arekit/contrib/source/brat/opinions/converter.py +19 -0
  176. arekit/contrib/source/brat/relation.py +32 -0
  177. arekit/contrib/source/brat/sentence.py +69 -0
  178. arekit/contrib/source/brat/sentences_reader.py +128 -0
  179. arekit/contrib/source/download.py +41 -0
  180. arekit/contrib/source/nerel/__init__.py +0 -0
  181. arekit/contrib/source/nerel/entities.py +55 -0
  182. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  183. arekit/contrib/source/nerel/folding/fixed.py +74 -0
  184. arekit/contrib/source/nerel/io_utils.py +62 -0
  185. arekit/contrib/source/nerel/labels.py +241 -0
  186. arekit/contrib/source/nerel/reader.py +46 -0
  187. arekit/contrib/source/nerel/utils.py +24 -0
  188. arekit/contrib/source/nerel/versions.py +12 -0
  189. arekit/contrib/source/nerelbio/__init__.py +0 -0
  190. arekit/contrib/source/nerelbio/io_utils.py +62 -0
  191. arekit/contrib/source/nerelbio/labels.py +265 -0
  192. arekit/contrib/source/nerelbio/reader.py +8 -0
  193. arekit/contrib/source/nerelbio/versions.py +8 -0
  194. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  195. arekit/contrib/source/ruattitudes/collection.py +36 -0
  196. arekit/contrib/source/ruattitudes/doc.py +51 -0
  197. arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
  198. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  199. arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
  200. arekit/contrib/source/ruattitudes/io_utils.py +56 -0
  201. arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
  202. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  203. arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
  204. arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
  205. arekit/contrib/source/ruattitudes/reader.py +268 -0
  206. arekit/contrib/source/ruattitudes/sentence.py +73 -0
  207. arekit/contrib/source/ruattitudes/synonyms.py +17 -0
  208. arekit/contrib/source/ruattitudes/text_object.py +59 -0
  209. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  210. arekit/contrib/source/rusentiframes/collection.py +157 -0
  211. arekit/contrib/source/rusentiframes/effect.py +24 -0
  212. arekit/contrib/source/rusentiframes/io_utils.py +19 -0
  213. arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
  214. arekit/contrib/source/rusentiframes/polarity.py +35 -0
  215. arekit/contrib/source/rusentiframes/role.py +15 -0
  216. arekit/contrib/source/rusentiframes/state.py +24 -0
  217. arekit/contrib/source/rusentiframes/types.py +42 -0
  218. arekit/contrib/source/rusentiframes/value.py +2 -0
  219. arekit/contrib/source/rusentrel/__init__.py +0 -0
  220. arekit/contrib/source/rusentrel/const.py +3 -0
  221. arekit/contrib/source/rusentrel/docs_reader.py +51 -0
  222. arekit/contrib/source/rusentrel/entities.py +26 -0
  223. arekit/contrib/source/rusentrel/io_utils.py +125 -0
  224. arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
  225. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  226. arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
  227. arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
  228. arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
  229. arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
  230. arekit/contrib/source/rusentrel/synonyms.py +17 -0
  231. arekit/contrib/source/sentinerel/__init__.py +0 -0
  232. arekit/contrib/source/sentinerel/entities.py +52 -0
  233. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  234. arekit/contrib/source/sentinerel/folding/factory.py +31 -0
  235. arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
  236. arekit/contrib/source/sentinerel/io_utils.py +87 -0
  237. arekit/contrib/source/sentinerel/labels.py +53 -0
  238. arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
  239. arekit/contrib/source/sentinerel/reader.py +42 -0
  240. arekit/contrib/source/synonyms/__init__.py +0 -0
  241. arekit/contrib/source/synonyms/utils.py +19 -0
  242. arekit/contrib/source/zip_utils.py +47 -0
  243. arekit/contrib/utils/__init__.py +0 -0
  244. arekit/contrib/utils/bert/__init__.py +0 -0
  245. arekit/contrib/utils/bert/samplers.py +17 -0
  246. arekit/contrib/utils/connotations/__init__.py +0 -0
  247. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
  248. arekit/contrib/utils/data/__init__.py +0 -0
  249. arekit/contrib/utils/data/contents/__init__.py +0 -0
  250. arekit/contrib/utils/data/contents/opinions.py +37 -0
  251. arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
  252. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  253. arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
  254. arekit/contrib/utils/data/readers/__init__.py +0 -0
  255. arekit/contrib/utils/data/readers/base.py +7 -0
  256. arekit/contrib/utils/data/readers/csv_pd.py +38 -0
  257. arekit/contrib/utils/data/readers/jsonl.py +15 -0
  258. arekit/contrib/utils/data/service/__init__.py +0 -0
  259. arekit/contrib/utils/data/service/balance.py +50 -0
  260. arekit/contrib/utils/data/storages/__init__.py +0 -0
  261. arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
  262. arekit/contrib/utils/data/storages/pandas_based.py +123 -0
  263. arekit/contrib/utils/data/storages/row_cache.py +48 -0
  264. arekit/contrib/utils/data/writers/__init__.py +0 -0
  265. arekit/contrib/utils/data/writers/base.py +27 -0
  266. arekit/contrib/utils/data/writers/csv_native.py +63 -0
  267. arekit/contrib/utils/data/writers/csv_pd.py +40 -0
  268. arekit/contrib/utils/data/writers/json_opennre.py +132 -0
  269. arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
  270. arekit/contrib/utils/download.py +77 -0
  271. arekit/contrib/utils/embeddings/__init__.py +0 -0
  272. arekit/contrib/utils/embeddings/rusvectores.py +58 -0
  273. arekit/contrib/utils/embeddings/tokens.py +30 -0
  274. arekit/contrib/utils/entities/__init__.py +0 -0
  275. arekit/contrib/utils/entities/filter.py +7 -0
  276. arekit/contrib/utils/entities/formatters/__init__.py +0 -0
  277. arekit/contrib/utils/entities/formatters/str_display.py +11 -0
  278. arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
  279. arekit/contrib/utils/io_utils/__init__.py +0 -0
  280. arekit/contrib/utils/io_utils/embedding.py +72 -0
  281. arekit/contrib/utils/io_utils/opinions.py +37 -0
  282. arekit/contrib/utils/io_utils/samples.py +79 -0
  283. arekit/contrib/utils/io_utils/utils.py +39 -0
  284. arekit/contrib/utils/lexicons/__init__.py +0 -0
  285. arekit/contrib/utils/lexicons/lexicon.py +41 -0
  286. arekit/contrib/utils/lexicons/relation.py +42 -0
  287. arekit/contrib/utils/lexicons/rusentilex.py +37 -0
  288. arekit/contrib/utils/nn/__init__.py +0 -0
  289. arekit/contrib/utils/nn/rows.py +83 -0
  290. arekit/contrib/utils/np_utils/__init__.py +0 -0
  291. arekit/contrib/utils/np_utils/embedding.py +22 -0
  292. arekit/contrib/utils/np_utils/npz_utils.py +13 -0
  293. arekit/contrib/utils/np_utils/vocab.py +20 -0
  294. arekit/contrib/utils/pipelines/__init__.py +0 -0
  295. arekit/contrib/utils/pipelines/items/__init__.py +0 -0
  296. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  297. arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
  298. arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
  299. arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
  300. arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
  301. arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
  302. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
  303. arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
  304. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
  305. arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
  306. arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
  307. arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
  308. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  309. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  310. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
  311. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
  312. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
  313. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  314. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
  315. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
  316. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
  317. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  318. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
  319. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
  320. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
  321. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  322. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
  323. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
  324. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  325. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
  326. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
  327. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
  328. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
  329. arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
  330. arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
  331. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
  332. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
  333. arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
  334. arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
  335. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
  336. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
  337. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
  338. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
  339. arekit/contrib/utils/processing/__init__.py +0 -0
  340. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  341. arekit/contrib/utils/processing/languages/mods.py +12 -0
  342. arekit/contrib/utils/processing/languages/pos.py +23 -0
  343. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  344. arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
  345. arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
  346. arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
  347. arekit/contrib/utils/processing/languages/ru/number.py +23 -0
  348. arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
  349. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  350. arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
  351. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  352. arekit/contrib/utils/processing/pos/base.py +12 -0
  353. arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
  354. arekit/contrib/utils/processing/pos/russian.py +10 -0
  355. arekit/contrib/utils/processing/text/__init__.py +0 -0
  356. arekit/contrib/utils/processing/text/tokens.py +127 -0
  357. arekit/contrib/utils/resources.py +25 -0
  358. arekit/contrib/utils/serializer.py +43 -0
  359. arekit/contrib/utils/sources/__init__.py +0 -0
  360. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  361. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  362. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
  363. arekit/contrib/utils/synonyms/__init__.py +0 -0
  364. arekit/contrib/utils/synonyms/simple.py +15 -0
  365. arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
  366. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  367. arekit/contrib/utils/vectorizers/bpe.py +93 -0
  368. arekit/contrib/utils/vectorizers/random_norm.py +39 -0
  369. arekit/download_data.py +11 -0
  370. arekit-0.24.0.dist-info/LICENSE +21 -0
  371. arekit-0.24.0.dist-info/METADATA +23 -0
  372. arekit-0.24.0.dist-info/RECORD +374 -0
  373. arekit-0.24.0.dist-info/WHEEL +5 -0
  374. arekit-0.24.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,55 @@
1
+ from arekit.common.entities.collection import EntityCollection
2
+ from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
3
+ from arekit.contrib.source.brat.annot import BratAnnotationParser
4
+ from arekit.contrib.source.brat.entities.entity import BratEntity
5
+ from arekit.contrib.source.nerel.io_utils import NerelIOUtils
6
+ from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
7
+ from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection
8
+
9
+
10
class NerelEntityCollection(EntityCollection):
    """ Entity collection built from the parsed BRAT annotation of a single
        NEREL document.
    """

    def __init__(self, contents, value_to_group_id_func, entities_to_ignore=None):
        """
            contents: dict
                parsed annotation; must provide the `BratAnnotationParser.ENTITIES` key.
                It is only read here and is left unmodified.
            value_to_group_id_func: callable
                forwarded as-is to the `EntityCollection` constructor.
            entities_to_ignore: list or None
                entity types to be discarded.
                this parameter is required because of the simplified implementation of
                the nested objects of the BRAT annotation.
        """
        assert(isinstance(contents, dict))
        assert(BratAnnotationParser.ENTITIES in contents)
        assert(isinstance(entities_to_ignore, list) or entities_to_ignore is None)

        self.__discard_entities = set(entities_to_ignore or [])

        # Filter into a separate list rather than writing the result back into
        # `contents`: the caller's dictionary must not change as a side effect
        # of constructing the collection.
        kept_entities = [entity for entity in contents[BratAnnotationParser.ENTITIES]
                         if self.__keep_entity(entity)]

        super(NerelEntityCollection, self).__init__(
            entities=kept_entities,
            value_to_group_id_func=value_to_group_id_func)

        self._sort_entities(key=lambda entity: entity.IndexBegin)

    def __keep_entity(self, entity):
        """ Whether the type of the entity is outside of the ignored types.
        """
        assert(isinstance(entity, BratEntity))
        return entity.Type not in self.__discard_entities

    @classmethod
    def read_collection(cls, filename, version, io_utils, entities_to_ignore=None):
        """ Reads the annotation of the `filename` document from the archive
            of the given `version` and composes the collection out of it.

            filename: str
                document name, used as a key of the document-to-fold mapping.
            version:
                dataset version, passed to the `io_utils` methods.
            io_utils: NerelIOUtils
            entities_to_ignore: list or None
                forwarded to the constructor.

            Raises KeyError when `filename` is absent in the document-to-fold mapping.
        """
        assert(isinstance(io_utils, NerelIOUtils))
        assert(isinstance(filename, str))

        # This dataset does not provide a synonyms collection by default, so we
        # declare an empty, writable one that is populated while reading.
        synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)

        doc_fold = io_utils.map_doc_to_fold_type(version)

        def value_to_group_id(value):
            return SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
                synonyms, value)

        def file_to_collection(input_file):
            return cls(
                contents=BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig'),
                entities_to_ignore=entities_to_ignore,
                value_to_group_id_func=value_to_group_id)

        return io_utils.read_from_zip(
            inner_path=io_utils.get_annotation_innerpath(folding_data_type=doc_fold[filename], filename=filename),
            process_func=file_to_collection,
            version=version)
File without changes
@@ -0,0 +1,74 @@
1
+ from collections import OrderedDict
2
+
3
+ from arekit.common.experiment.data_type import DataType
4
+
5
+
6
def create_fixed_folding(train_filenames, dev_filenames, test_filenames, limit=None):
    """ Composes a fixed data-folding out of the predefined lists of filenames.

        Document ids are assigned over all the given filenames (train, dev, test,
        in that order) before `limit` is applied, so `limit` affects only the
        folding and not the returned filenames-by-ids mapping.

        limit: int or None
            when provided, each split keeps only its first `limit` filenames.

        returns: tuple
            (filenames_by_ids, fixed_folding), where the latter maps
            DataType.Train / DataType.Test / DataType.Dev onto lists of document ids.
    """
    assert(isinstance(train_filenames, list))
    assert(isinstance(dev_filenames, list))
    assert(isinstance(test_filenames, list))

    every_filename = train_filenames + dev_filenames + test_filenames
    filenames_by_ids = create_filenames_by_ids(filenames=every_filename)

    # Reversed index: filename -> document id.
    ids_by_filenames = {fname: doc_id for doc_id, fname in filenames_by_ids.items()}

    def __to_doc_ids(split_filenames):
        # Slicing with `None` selects the whole list, which covers the "no limit" case.
        return [ids_by_filenames[fname] for fname in split_filenames[:limit]]

    fixed_folding = {
        DataType.Train: __to_doc_ids(train_filenames),
        DataType.Test: __to_doc_ids(test_filenames),
        DataType.Dev: __to_doc_ids(dev_filenames),
    }

    return filenames_by_ids, fixed_folding
31
+
32
+
33
def create_filenames_by_ids(filenames):
    """ Indexing filenames.

        The id of a filename is the number formed by its leading digits; a filename
        without such a prefix receives the smallest id that is still unused, probing
        upwards from the previously generated one.

        returns: OrderedDict
            id -> filename, in the order of `filenames`.

        Fails with AssertionError when two filenames end up with the same id.
    """
    filenames_by_ids = OrderedDict()

    # The most recent generated id; the search for a free one resumes from it.
    generated_id = 0

    for fname in filenames:

        doc_id = number_from_string(fname)

        if doc_id is None:
            while generated_id in filenames_by_ids:
                generated_id += 1
            doc_id = generated_id

        assert(doc_id not in filenames_by_ids)
        filenames_by_ids[doc_id] = fname

    return filenames_by_ids
58
+
59
+
60
def number_from_string(s):
    """ Extracts the integer formed by the leading decimal digits of the string.

        s: str

        returns: int or None
            None when the string is empty or does not start with a decimal digit.
    """
    assert(isinstance(s, str))

    prefix_length = 0

    for char in s:
        # `isdecimal` rather than `isdigit`: the latter also accepts characters
        # such as superscript digits, which `int` refuses to convert.
        if not char.isdecimal():
            break
        prefix_length += 1

    if prefix_length == 0:
        return None

    return int(s[:prefix_length])
@@ -0,0 +1,62 @@
1
+ from os import path
2
+
3
+ from arekit.common.experiment.data_type import DataType
4
+ from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding
5
+ from arekit.contrib.source.nerel.utils import iter_filename_and_splittype
6
+ from arekit.contrib.source.zip_utils import ZipArchiveUtils
7
+
8
+
9
+ class NerelIOUtils(ZipArchiveUtils):
10
+
11
+ splits = {
12
+ DataType.Train: "train",
13
+ DataType.Dev: "dev",
14
+ DataType.Test: "test"
15
+ }
16
+
17
+ @staticmethod
18
+ def get_archive_filepath(version):
19
+ return path.join(NerelIOUtils.get_data_root(), "nerel-{}.zip".format(version))
20
+
21
+ @staticmethod
22
+ def get_annotation_innerpath(folding_data_type, filename):
23
+ assert(isinstance(filename, str))
24
+ return path.join(NerelIOUtils.splits[folding_data_type], "{}.ann".format(filename))
25
+
26
+ @staticmethod
27
+ def get_news_innerpath(folding_data_type, filename):
28
+ assert(isinstance(filename, str))
29
+ return path.join(NerelIOUtils.splits[folding_data_type], "{}.txt".format(filename))
30
+
31
+ @staticmethod
32
+ def map_doc_to_fold_type(version):
33
+
34
+ it = iter_filename_and_splittype(
35
+ filenames_it=NerelIOUtils.iter_filenames_from_zip(version),
36
+ splits=NerelIOUtils.splits.items())
37
+
38
+ d2f = {}
39
+ for filename, split_type in it:
40
+ d2f[filename] = split_type
41
+
42
+ return d2f
43
+
44
+ @staticmethod
45
+ def read_dataset_split(version, docs_limit=None):
46
+
47
+ it = iter_filename_and_splittype(
48
+ filenames_it=NerelIOUtils.iter_filenames_from_zip(version),
49
+ splits=NerelIOUtils.splits.items())
50
+
51
+ f2d = {}
52
+ for filename, split_type in it:
53
+ if split_type not in f2d:
54
+ f2d[split_type] = []
55
+ f2d[split_type].append(filename)
56
+
57
+ filenames_by_ids, data_folding = create_fixed_folding(train_filenames=f2d[DataType.Train],
58
+ test_filenames=f2d[DataType.Test],
59
+ dev_filenames=f2d[DataType.Dev],
60
+ limit=docs_limit)
61
+
62
+ return filenames_by_ids, data_folding
@@ -0,0 +1,241 @@
1
+ from arekit.common.labels.base import Label
2
+
3
+
4
# Each class below is a bare marker: it declares nothing beyond the base `Label`.


class OpinionBelongsTo(Label):
    """ Marker label `OpinionBelongsTo`. """


class OpinionRelatesTo(Label):
    """ Marker label `OpinionRelatesTo`. """


class NegEffectFrom(Label):
    """ Marker label `NegEffectFrom`. """


class NegStateFrom(Label):
    """ Marker label `NegStateFrom`. """


class PosEffectFrom(Label):
    """ Marker label `PosEffectFrom`. """


class PosAuthorFrom(Label):
    """ Marker label `PosAuthorFrom`. """


class NegAuthorFrom(Label):
    """ Marker label `NegAuthorFrom`. """


class PosStateFrom(Label):
    """ Marker label `PosStateFrom`. """


class NegativeTo(Label):
    """ Marker label `NegativeTo`. """


class PositiveTo(Label):
    """ Marker label `PositiveTo`. """
42
+
43
+
44
# Each class below is a bare marker: it declares nothing beyond the base `Label`.
# NOTE(review): the upper-case names presumably mirror the relation types of the
# dataset annotation -- confirm against the labels formatter.


class STATE_BELONGS_TO(Label):
    """ Marker label `STATE_BELONGS_TO`. """


class ABBREVIATION(Label):
    """ Marker label `ABBREVIATION`. """


class HEADQUARTERED_IN(Label):
    """ Marker label `HEADQUARTERED_IN`. """


class AGE_DIED_AT(Label):
    """ Marker label `AGE_DIED_AT`. """


class AGE_IS(Label):
    """ Marker label `AGE_IS`. """


class AGENT(Label):
    """ Marker label `AGENT`. """


class IDEOLOGY_OF(Label):
    """ Marker label `IDEOLOGY_OF`. """


class PLACE_RESIDES_IN(Label):
    """ Marker label `PLACE_RESIDES_IN`. """


class POINT_IN_TIME(Label):
    """ Marker label `POINT_IN_TIME`. """


class INANIMATE_INVOLVED(Label):
    """ Marker label `INANIMATE_INVOLVED`. """


class PRICE_OF(Label):
    """ Marker label `PRICE_OF`. """


class INCOME(Label):
    """ Marker label `INCOME`. """


class PRODUCES(Label):
    """ Marker label `PRODUCES`. """


class ALTERNATIVE_NAME(Label):
    """ Marker label `ALTERNATIVE_NAME`. """


class AWARDED_WITH(Label):
    """ Marker label `AWARDED_WITH`. """


class CAUSE_OF_DEATH(Label):
    """ Marker label `CAUSE_OF_DEATH`. """


class CONVICTED_OF(Label):
    """ Marker label `CONVICTED_OF`. """


class DATE_DEFUNCT_IN(Label):
    """ Marker label `DATE_DEFUNCT_IN`. """


class DATE_FOUNDED_IN(Label):
    """ Marker label `DATE_FOUNDED_IN`. """


class DATE_OF_BIRTH(Label):
    """ Marker label `DATE_OF_BIRTH`. """


class DATE_OF_CREATION(Label):
    """ Marker label `DATE_OF_CREATION`. """


class DATE_OF_DEATH(Label):
    """ Marker label `DATE_OF_DEATH`. """


class END_TIME(Label):
    """ Marker label `END_TIME`. """


class EXPENDITURE(Label):
    """ Marker label `EXPENDITURE`. """


class FOUNDED_BY(Label):
    """ Marker label `FOUNDED_BY`. """


class KNOWS(Label):
    """ Marker label `KNOWS`. """


class RELATIVE(Label):
    """ Marker label `RELATIVE`. """


class LOCATED_IN(Label):
    """ Marker label `LOCATED_IN`. """


class RELIGION_OF(Label):
    """ Marker label `RELIGION_OF`. """


class MEDICAL_CONDITION(Label):
    """ Marker label `MEDICAL_CONDITION`. """


class SCHOOLS_ATTENDED(Label):
    """ Marker label `SCHOOLS_ATTENDED`. """


class MEMBER_OF(Label):
    """ Marker label `MEMBER_OF`. """


class SIBLING(Label):
    """ Marker label `SIBLING`. """


class ORGANIZES(Label):
    """ Marker label `ORGANIZES`. """


class SPOUSE(Label):
    """ Marker label `SPOUSE`. """


class ORIGINS_FROM(Label):
    """ Marker label `ORIGINS_FROM`. """


class START_TIME(Label):
    """ Marker label `START_TIME`. """


class OWNER_OF(Label):
    """ Marker label `OWNER_OF`. """


class SUBEVENT_OF(Label):
    """ Marker label `SUBEVENT_OF`. """


class PARENT_OF(Label):
    """ Marker label `PARENT_OF`. """


class SUBORDINATE_OF(Label):
    """ Marker label `SUBORDINATE_OF`. """


class PART_OF(Label):
    """ Marker label `PART_OF`. """


class TAKES_PLACE_IN(Label):
    """ Marker label `TAKES_PLACE_IN`. """


class PARTICIPANT_IN(Label):
    """ Marker label `PARTICIPANT_IN`. """


class WORKPLACE(Label):
    """ Marker label `WORKPLACE`. """


class PENALIZED_AS(Label):
    """ Marker label `PENALIZED_AS`. """


class WORKS_AS(Label):
    """ Marker label `WORKS_AS`. """


class PLACE_OF_DEATH(Label):
    """ Marker label `PLACE_OF_DEATH`. """


class PLACE_OF_BIRTH(Label):
    """ Marker label `PLACE_OF_BIRTH`. """


class HAS_CAUSE(Label):
    """ Marker label `HAS_CAUSE`. """
@@ -0,0 +1,46 @@
1
+ from arekit.contrib.source.brat.annot import BratAnnotationParser
2
+ from arekit.contrib.source.brat.doc import BratDocument
3
+ from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
4
+ from arekit.contrib.source.nerel.entities import NerelEntityCollection
5
+ from arekit.contrib.source.nerel.io_utils import NerelIOUtils
6
+
7
+
8
class NerelDocReader(object):
    """ Reads documents and their relation annotations from a versioned archive.

        Every archive access is delegated to the `io_utils` object that is
        supplied (or created) at construction time.
    """

    def __init__(self, version, io_utils=None):
        """ version: collection version; forwarded unchanged to every `io_utils` call.
            io_utils: NerelIOUtils instance (or an instance of its subclass);
                a new NerelIOUtils() is created when the argument is omitted.
        """
        # The default is created per call rather than in the signature, so it is
        # neither constructed at import time nor shared between all readers.
        if io_utils is None:
            io_utils = NerelIOUtils()

        assert(isinstance(io_utils, NerelIOUtils))
        self.__version = version
        self.__io_utils = io_utils
        # Indexed by document filename below; gives the fold type of a document.
        self.__doc_fold = io_utils.map_doc_to_fold_type(version)

    def read_text_relations(self, filename):
        """ Read the relations annotated for the given document.

            filename: str, the key of the document in the fold mapping.
            returns: whatever `read_from_zip` returns for the parsing callback;
                the callback itself produces a list with the contents of the
                "relations" entry of the parsed annotations.

            The lookup of an unknown filename in the fold mapping fails
            (KeyError in case of a dict mapping).
        """
        assert(isinstance(filename, str))

        def parse_relations(input_file):
            annotations = BratAnnotationParser.parse_annotations(
                input_file=input_file, encoding='utf-8-sig')
            return list(annotations["relations"])

        annotation_path = self.__io_utils.get_annotation_innerpath(
            folding_data_type=self.__doc_fold[filename],
            filename=filename)

        return self.__io_utils.read_from_zip(
            inner_path=annotation_path,
            process_func=parse_relations,
            version=self.__version)

    def read_document(self, filename, doc_id, entities_to_ignore=None):
        """ Read a single document together with its entities and relations.

            filename: str, the key of the document in the fold mapping.
            doc_id: int, the identifier assigned to the resulting document.
            entities_to_ignore: forwarded as is to NerelEntityCollection.read_collection.
            returns: whatever `read_from_zip` returns for the callback that
                builds a BratDocument (presumably that document -- confirm
                against NerelIOUtils.read_from_zip).
        """
        assert(isinstance(filename, str))
        assert(isinstance(doc_id, int))

        entities = NerelEntityCollection.read_collection(
            filename=filename, version=self.__version,
            entities_to_ignore=entities_to_ignore, io_utils=self.__io_utils)

        text_relations = self.read_text_relations(filename=filename)

        def file_to_doc(input_file):
            sentences = BratDocumentSentencesReader.from_file(input_file=input_file, entities=entities)
            return BratDocument(doc_id=doc_id, sentences=sentences, text_relations=text_relations)

        return self.__io_utils.read_from_zip(
            inner_path=self.__io_utils.get_news_innerpath(
                folding_data_type=self.__doc_fold[filename], filename=filename),
            process_func=file_to_doc,
            version=self.__version)
@@ -0,0 +1,24 @@
1
+ from os.path import basename
2
+
3
+
4
def __iter_filtered_filenames(filenames_iter):
    """ Keep only the names whose last four characters are ".txt".

        For every kept name yields a pair: the name with those four
        characters cropped, and the basename of the cropped name.
    """
    for full_name in filenames_iter:
        cropped, suffix = full_name[:-4], full_name[-4:]
        if suffix == ".txt":
            yield cropped, basename(cropped)
12
+
13
+
14
def iter_filename_and_splittype(filenames_it, splits):
    """ Pair every kept ".txt" document with the split(s) its path mentions.

        filenames_it: iterable of file paths; filtered by __iter_filtered_filenames.
        splits: iterable of (split_type, split_name) pairs.

        Yields (filename, split_type) for every pair whose split_name occurs
        as a substring of the extension-less path; `filename` is the basename
        of that path. A path that mentions several split names is yielded
        once per matching pair.
    """
    # Materialised once: the pairs are scanned again for every file, so
    # a one-shot iterator would be exhausted right after the first file.
    split_pairs = tuple(splits)

    for filepath, filename in __iter_filtered_filenames(filenames_it):
        for split_type, split_name in split_pairs:
            if split_name in filepath:
                yield filename, split_type
20
+
21
+
22
def iter_collection_filenames(filenames_it):
    """ Number the kept ".txt" entries in their original order, starting from zero.

        Yields (doc_id, entry) pairs, where entry is the item produced by
        __iter_filtered_filenames for a kept name.
    """
    kept_entries = __iter_filtered_filenames(filenames_it)
    for numbered_entry in enumerate(kept_entries):
        yield numbered_entry
@@ -0,0 +1,12 @@
1
+ import enum
2
+
3
+
4
@enum.unique
class NerelVersions(enum.Enum):
    """ Versions of this collection that are supported.
    """

    V1 = "v1_0"
    V11 = "v1_1"


# Module-level default: the V1 member.
DEFAULT_VERSION = NerelVersions.V1
File without changes
@@ -0,0 +1,62 @@
1
+ from os import path
2
+
3
+ from arekit.common.experiment.data_type import DataType
4
+ from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding
5
+ from arekit.contrib.source.nerel.io_utils import NerelIOUtils
6
+ from arekit.contrib.source.nerel.utils import iter_filename_and_splittype
7
+
8
+
9
class NerelBioIOUtils(NerelIOUtils):
    """ Path helpers for the "nerel-bio-<version>.zip" archive, in which
        documents are addressed through per-split folders.
    """

    # Fold type -> name of the folder of this split inside the archive.
    splits = {
        DataType.Train: "train",
        DataType.Dev: "dev",
        DataType.Test: "test"
    }

    @staticmethod
    def get_archive_filepath(version):
        """ Path of the archive of the given version under the data root. """
        archive_name = "nerel-bio-{}.zip".format(version)
        return path.join(NerelBioIOUtils.get_data_root(), archive_name)

    @staticmethod
    def get_annotation_innerpath(folding_data_type, filename):
        """ Inner archive path of the ".ann" file of a document:
            <split folder>/<filename>.ann
        """
        assert(isinstance(filename, str))
        return path.join(NerelBioIOUtils.splits[folding_data_type], "{}.ann".format(filename))

    @staticmethod
    def get_news_innerpath(folding_data_type, filename):
        """ Inner archive path of the ".txt" file of a document:
            <split folder>/<filename>.txt
        """
        assert(isinstance(filename, str))
        return path.join(NerelBioIOUtils.splits[folding_data_type], "{}.txt".format(filename))

    @staticmethod
    def _iter_filename_and_splittype(version):
        """ Iterate (filename, split_type) pairs over the archive of the given version. """
        return iter_filename_and_splittype(
            filenames_it=NerelBioIOUtils.iter_filenames_from_zip(version),
            splits=NerelBioIOUtils.splits.items())

    @staticmethod
    def map_doc_to_fold_type(version):
        """ Build a dict: document filename -> split type.

            When a filename is reported more than once, the last reported
            split type is kept.
        """
        return dict(NerelBioIOUtils._iter_filename_and_splittype(version))

    @staticmethod
    def read_dataset_split(version, docs_limit=None):
        """ Group the archive documents by split and build a fixed folding.

            version: version of the archive to read.
            docs_limit: passed to create_fixed_folding as `limit`.
            returns: pair (filenames_by_ids, data_folding) as produced by
                create_fixed_folding.
        """
        # Every known split starts with an empty list, so that a split without
        # documents in the archive is passed on as empty instead of raising
        # KeyError at the lookups below.
        # NOTE(review): assumes create_fixed_folding accepts empty lists -- confirm.
        filenames_by_split = {split_type: [] for split_type in NerelBioIOUtils.splits}
        for filename, split_type in NerelBioIOUtils._iter_filename_and_splittype(version):
            filenames_by_split[split_type].append(filename)

        filenames_by_ids, data_folding = create_fixed_folding(
            train_filenames=filenames_by_split[DataType.Train],
            test_filenames=filenames_by_split[DataType.Test],
            dev_filenames=filenames_by_split[DataType.Dev],
            limit=docs_limit)

        return filenames_by_ids, data_folding