arekit 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. arekit/__init__.py +0 -0
  2. arekit/common/__init__.py +0 -0
  3. arekit/common/bound.py +48 -0
  4. arekit/common/context/__init__.py +0 -0
  5. arekit/common/context/terms_mapper.py +51 -0
  6. arekit/common/context/token.py +16 -0
  7. arekit/common/data/__init__.py +0 -0
  8. arekit/common/data/const.py +21 -0
  9. arekit/common/data/doc_provider.py +6 -0
  10. arekit/common/data/input/__init__.py +0 -0
  11. arekit/common/data/input/providers/__init__.py +0 -0
  12. arekit/common/data/input/providers/columns/__init__.py +0 -0
  13. arekit/common/data/input/providers/columns/base.py +9 -0
  14. arekit/common/data/input/providers/columns/sample.py +59 -0
  15. arekit/common/data/input/providers/const.py +3 -0
  16. arekit/common/data/input/providers/contents.py +9 -0
  17. arekit/common/data/input/providers/instances/__init__.py +0 -0
  18. arekit/common/data/input/providers/instances/base.py +14 -0
  19. arekit/common/data/input/providers/instances/multiple.py +27 -0
  20. arekit/common/data/input/providers/instances/single.py +8 -0
  21. arekit/common/data/input/providers/label/__init__.py +0 -0
  22. arekit/common/data/input/providers/label/base.py +24 -0
  23. arekit/common/data/input/providers/label/binary.py +11 -0
  24. arekit/common/data/input/providers/label/multiple.py +15 -0
  25. arekit/common/data/input/providers/rows/__init__.py +0 -0
  26. arekit/common/data/input/providers/rows/base.py +64 -0
  27. arekit/common/data/input/providers/rows/samples.py +227 -0
  28. arekit/common/data/input/providers/sample/__init__.py +0 -0
  29. arekit/common/data/input/providers/sample/cropped.py +43 -0
  30. arekit/common/data/input/providers/text/__init__.py +0 -0
  31. arekit/common/data/input/providers/text/single.py +49 -0
  32. arekit/common/data/input/repositories/__init__.py +0 -0
  33. arekit/common/data/input/repositories/base.py +68 -0
  34. arekit/common/data/input/repositories/sample.py +22 -0
  35. arekit/common/data/input/sample.py +66 -0
  36. arekit/common/data/input/terms_mapper.py +88 -0
  37. arekit/common/data/rows_fmt.py +82 -0
  38. arekit/common/data/rows_parser.py +43 -0
  39. arekit/common/data/storages/__init__.py +0 -0
  40. arekit/common/data/storages/base.py +109 -0
  41. arekit/common/data/views/__init__.py +0 -0
  42. arekit/common/data/views/samples.py +26 -0
  43. arekit/common/docs/__init__.py +0 -0
  44. arekit/common/docs/base.py +30 -0
  45. arekit/common/docs/entities_grouping.py +16 -0
  46. arekit/common/docs/entity.py +18 -0
  47. arekit/common/docs/objects_parser.py +37 -0
  48. arekit/common/docs/parsed/__init__.py +0 -0
  49. arekit/common/docs/parsed/base.py +101 -0
  50. arekit/common/docs/parsed/providers/__init__.py +0 -0
  51. arekit/common/docs/parsed/providers/base.py +68 -0
  52. arekit/common/docs/parsed/providers/base_pairs.py +51 -0
  53. arekit/common/docs/parsed/providers/entity_service.py +175 -0
  54. arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
  55. arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
  56. arekit/common/docs/parsed/service.py +31 -0
  57. arekit/common/docs/parsed/term_position.py +42 -0
  58. arekit/common/docs/parser.py +34 -0
  59. arekit/common/docs/sentence.py +14 -0
  60. arekit/common/entities/__init__.py +0 -0
  61. arekit/common/entities/base.py +51 -0
  62. arekit/common/entities/collection.py +72 -0
  63. arekit/common/entities/str_fmt.py +8 -0
  64. arekit/common/entities/types.py +9 -0
  65. arekit/common/experiment/__init__.py +0 -0
  66. arekit/common/experiment/api/__init__.py +0 -0
  67. arekit/common/experiment/api/base_samples_io.py +20 -0
  68. arekit/common/experiment/data_type.py +17 -0
  69. arekit/common/frames/__init__.py +0 -0
  70. arekit/common/frames/connotations/__init__.py +0 -0
  71. arekit/common/frames/connotations/descriptor.py +17 -0
  72. arekit/common/frames/connotations/provider.py +4 -0
  73. arekit/common/frames/text_variant.py +43 -0
  74. arekit/common/frames/variants/__init__.py +0 -0
  75. arekit/common/frames/variants/base.py +21 -0
  76. arekit/common/frames/variants/collection.py +60 -0
  77. arekit/common/labels/__init__.py +0 -0
  78. arekit/common/labels/base.py +19 -0
  79. arekit/common/labels/provider/__init__.py +0 -0
  80. arekit/common/labels/provider/base.py +7 -0
  81. arekit/common/labels/provider/constant.py +14 -0
  82. arekit/common/labels/scaler/__init__.py +0 -0
  83. arekit/common/labels/scaler/base.py +85 -0
  84. arekit/common/labels/scaler/sentiment.py +7 -0
  85. arekit/common/labels/scaler/single.py +10 -0
  86. arekit/common/labels/str_fmt.py +55 -0
  87. arekit/common/linkage/__init__.py +0 -0
  88. arekit/common/linkage/base.py +44 -0
  89. arekit/common/linkage/meta.py +23 -0
  90. arekit/common/linkage/opinions.py +9 -0
  91. arekit/common/linkage/text_opinions.py +22 -0
  92. arekit/common/log_utils.py +29 -0
  93. arekit/common/model/__init__.py +0 -0
  94. arekit/common/model/labeling/__init__.py +0 -0
  95. arekit/common/model/labeling/base.py +24 -0
  96. arekit/common/model/labeling/modes.py +8 -0
  97. arekit/common/model/labeling/single.py +24 -0
  98. arekit/common/opinions/__init__.py +0 -0
  99. arekit/common/opinions/annot/__init__.py +0 -0
  100. arekit/common/opinions/annot/algo/__init__.py +0 -0
  101. arekit/common/opinions/annot/algo/base.py +4 -0
  102. arekit/common/opinions/annot/algo/pair_based.py +99 -0
  103. arekit/common/opinions/annot/algo/predefined.py +16 -0
  104. arekit/common/opinions/annot/algo_based.py +55 -0
  105. arekit/common/opinions/annot/base.py +15 -0
  106. arekit/common/opinions/base.py +74 -0
  107. arekit/common/opinions/collection.py +150 -0
  108. arekit/common/opinions/enums.py +6 -0
  109. arekit/common/opinions/provider.py +4 -0
  110. arekit/common/opinions/writer.py +4 -0
  111. arekit/common/pipeline/__init__.py +0 -0
  112. arekit/common/pipeline/base.py +25 -0
  113. arekit/common/pipeline/context.py +36 -0
  114. arekit/common/pipeline/conts.py +2 -0
  115. arekit/common/pipeline/items/__init__.py +0 -0
  116. arekit/common/pipeline/items/base.py +12 -0
  117. arekit/common/pipeline/items/flatten.py +14 -0
  118. arekit/common/pipeline/items/handle.py +17 -0
  119. arekit/common/pipeline/items/iter.py +11 -0
  120. arekit/common/pipeline/items/map.py +11 -0
  121. arekit/common/pipeline/items/map_nested.py +13 -0
  122. arekit/common/synonyms/__init__.py +0 -0
  123. arekit/common/synonyms/base.py +151 -0
  124. arekit/common/synonyms/grouping.py +21 -0
  125. arekit/common/text/__init__.py +0 -0
  126. arekit/common/text/enums.py +12 -0
  127. arekit/common/text/parsed.py +42 -0
  128. arekit/common/text/parser.py +12 -0
  129. arekit/common/text/partitioning/__init__.py +0 -0
  130. arekit/common/text/partitioning/base.py +4 -0
  131. arekit/common/text/partitioning/str.py +36 -0
  132. arekit/common/text/partitioning/terms.py +35 -0
  133. arekit/common/text/stemmer.py +16 -0
  134. arekit/common/text_opinions/__init__.py +0 -0
  135. arekit/common/text_opinions/base.py +105 -0
  136. arekit/common/utils.py +129 -0
  137. arekit/contrib/__init__.py +0 -0
  138. arekit/contrib/bert/__init__.py +0 -0
  139. arekit/contrib/bert/input/__init__.py +0 -0
  140. arekit/contrib/bert/input/providers/__init__.py +0 -0
  141. arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
  142. arekit/contrib/bert/input/providers/text_pair.py +62 -0
  143. arekit/contrib/bert/terms/__init__.py +0 -0
  144. arekit/contrib/bert/terms/mapper.py +20 -0
  145. arekit/contrib/networks/__init__.py +0 -0
  146. arekit/contrib/networks/embedding.py +149 -0
  147. arekit/contrib/networks/embedding_io.py +18 -0
  148. arekit/contrib/networks/input/__init__.py +0 -0
  149. arekit/contrib/networks/input/const.py +6 -0
  150. arekit/contrib/networks/input/ctx_serialization.py +28 -0
  151. arekit/contrib/networks/input/embedding/__init__.py +0 -0
  152. arekit/contrib/networks/input/embedding/matrix.py +29 -0
  153. arekit/contrib/networks/input/embedding/offsets.py +55 -0
  154. arekit/contrib/networks/input/formatters/__init__.py +0 -0
  155. arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
  156. arekit/contrib/networks/input/providers/__init__.py +0 -0
  157. arekit/contrib/networks/input/providers/sample.py +129 -0
  158. arekit/contrib/networks/input/providers/term_connotation.py +23 -0
  159. arekit/contrib/networks/input/providers/text.py +24 -0
  160. arekit/contrib/networks/input/rows_parser.py +47 -0
  161. arekit/contrib/networks/input/term_types.py +13 -0
  162. arekit/contrib/networks/input/terms_mapping.py +60 -0
  163. arekit/contrib/networks/vectorizer.py +6 -0
  164. arekit/contrib/prompt/__init__.py +0 -0
  165. arekit/contrib/prompt/sample.py +61 -0
  166. arekit/contrib/source/__init__.py +0 -0
  167. arekit/contrib/source/brat/__init__.py +0 -0
  168. arekit/contrib/source/brat/annot.py +84 -0
  169. arekit/contrib/source/brat/doc.py +28 -0
  170. arekit/contrib/source/brat/entities/__init__.py +0 -0
  171. arekit/contrib/source/brat/entities/compound.py +13 -0
  172. arekit/contrib/source/brat/entities/entity.py +42 -0
  173. arekit/contrib/source/brat/entities/parser.py +53 -0
  174. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  175. arekit/contrib/source/brat/opinions/converter.py +19 -0
  176. arekit/contrib/source/brat/relation.py +32 -0
  177. arekit/contrib/source/brat/sentence.py +69 -0
  178. arekit/contrib/source/brat/sentences_reader.py +128 -0
  179. arekit/contrib/source/download.py +41 -0
  180. arekit/contrib/source/nerel/__init__.py +0 -0
  181. arekit/contrib/source/nerel/entities.py +55 -0
  182. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  183. arekit/contrib/source/nerel/folding/fixed.py +74 -0
  184. arekit/contrib/source/nerel/io_utils.py +62 -0
  185. arekit/contrib/source/nerel/labels.py +241 -0
  186. arekit/contrib/source/nerel/reader.py +46 -0
  187. arekit/contrib/source/nerel/utils.py +24 -0
  188. arekit/contrib/source/nerel/versions.py +12 -0
  189. arekit/contrib/source/nerelbio/__init__.py +0 -0
  190. arekit/contrib/source/nerelbio/io_utils.py +62 -0
  191. arekit/contrib/source/nerelbio/labels.py +265 -0
  192. arekit/contrib/source/nerelbio/reader.py +8 -0
  193. arekit/contrib/source/nerelbio/versions.py +8 -0
  194. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  195. arekit/contrib/source/ruattitudes/collection.py +36 -0
  196. arekit/contrib/source/ruattitudes/doc.py +51 -0
  197. arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
  198. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  199. arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
  200. arekit/contrib/source/ruattitudes/io_utils.py +56 -0
  201. arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
  202. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  203. arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
  204. arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
  205. arekit/contrib/source/ruattitudes/reader.py +268 -0
  206. arekit/contrib/source/ruattitudes/sentence.py +73 -0
  207. arekit/contrib/source/ruattitudes/synonyms.py +17 -0
  208. arekit/contrib/source/ruattitudes/text_object.py +59 -0
  209. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  210. arekit/contrib/source/rusentiframes/collection.py +157 -0
  211. arekit/contrib/source/rusentiframes/effect.py +24 -0
  212. arekit/contrib/source/rusentiframes/io_utils.py +19 -0
  213. arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
  214. arekit/contrib/source/rusentiframes/polarity.py +35 -0
  215. arekit/contrib/source/rusentiframes/role.py +15 -0
  216. arekit/contrib/source/rusentiframes/state.py +24 -0
  217. arekit/contrib/source/rusentiframes/types.py +42 -0
  218. arekit/contrib/source/rusentiframes/value.py +2 -0
  219. arekit/contrib/source/rusentrel/__init__.py +0 -0
  220. arekit/contrib/source/rusentrel/const.py +3 -0
  221. arekit/contrib/source/rusentrel/docs_reader.py +51 -0
  222. arekit/contrib/source/rusentrel/entities.py +26 -0
  223. arekit/contrib/source/rusentrel/io_utils.py +125 -0
  224. arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
  225. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  226. arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
  227. arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
  228. arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
  229. arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
  230. arekit/contrib/source/rusentrel/synonyms.py +17 -0
  231. arekit/contrib/source/sentinerel/__init__.py +0 -0
  232. arekit/contrib/source/sentinerel/entities.py +52 -0
  233. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  234. arekit/contrib/source/sentinerel/folding/factory.py +31 -0
  235. arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
  236. arekit/contrib/source/sentinerel/io_utils.py +87 -0
  237. arekit/contrib/source/sentinerel/labels.py +53 -0
  238. arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
  239. arekit/contrib/source/sentinerel/reader.py +42 -0
  240. arekit/contrib/source/synonyms/__init__.py +0 -0
  241. arekit/contrib/source/synonyms/utils.py +19 -0
  242. arekit/contrib/source/zip_utils.py +47 -0
  243. arekit/contrib/utils/__init__.py +0 -0
  244. arekit/contrib/utils/bert/__init__.py +0 -0
  245. arekit/contrib/utils/bert/samplers.py +17 -0
  246. arekit/contrib/utils/connotations/__init__.py +0 -0
  247. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
  248. arekit/contrib/utils/data/__init__.py +0 -0
  249. arekit/contrib/utils/data/contents/__init__.py +0 -0
  250. arekit/contrib/utils/data/contents/opinions.py +37 -0
  251. arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
  252. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  253. arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
  254. arekit/contrib/utils/data/readers/__init__.py +0 -0
  255. arekit/contrib/utils/data/readers/base.py +7 -0
  256. arekit/contrib/utils/data/readers/csv_pd.py +38 -0
  257. arekit/contrib/utils/data/readers/jsonl.py +15 -0
  258. arekit/contrib/utils/data/service/__init__.py +0 -0
  259. arekit/contrib/utils/data/service/balance.py +50 -0
  260. arekit/contrib/utils/data/storages/__init__.py +0 -0
  261. arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
  262. arekit/contrib/utils/data/storages/pandas_based.py +123 -0
  263. arekit/contrib/utils/data/storages/row_cache.py +48 -0
  264. arekit/contrib/utils/data/writers/__init__.py +0 -0
  265. arekit/contrib/utils/data/writers/base.py +27 -0
  266. arekit/contrib/utils/data/writers/csv_native.py +63 -0
  267. arekit/contrib/utils/data/writers/csv_pd.py +40 -0
  268. arekit/contrib/utils/data/writers/json_opennre.py +132 -0
  269. arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
  270. arekit/contrib/utils/download.py +77 -0
  271. arekit/contrib/utils/embeddings/__init__.py +0 -0
  272. arekit/contrib/utils/embeddings/rusvectores.py +58 -0
  273. arekit/contrib/utils/embeddings/tokens.py +30 -0
  274. arekit/contrib/utils/entities/__init__.py +0 -0
  275. arekit/contrib/utils/entities/filter.py +7 -0
  276. arekit/contrib/utils/entities/formatters/__init__.py +0 -0
  277. arekit/contrib/utils/entities/formatters/str_display.py +11 -0
  278. arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
  279. arekit/contrib/utils/io_utils/__init__.py +0 -0
  280. arekit/contrib/utils/io_utils/embedding.py +72 -0
  281. arekit/contrib/utils/io_utils/opinions.py +37 -0
  282. arekit/contrib/utils/io_utils/samples.py +79 -0
  283. arekit/contrib/utils/io_utils/utils.py +39 -0
  284. arekit/contrib/utils/lexicons/__init__.py +0 -0
  285. arekit/contrib/utils/lexicons/lexicon.py +41 -0
  286. arekit/contrib/utils/lexicons/relation.py +42 -0
  287. arekit/contrib/utils/lexicons/rusentilex.py +37 -0
  288. arekit/contrib/utils/nn/__init__.py +0 -0
  289. arekit/contrib/utils/nn/rows.py +83 -0
  290. arekit/contrib/utils/np_utils/__init__.py +0 -0
  291. arekit/contrib/utils/np_utils/embedding.py +22 -0
  292. arekit/contrib/utils/np_utils/npz_utils.py +13 -0
  293. arekit/contrib/utils/np_utils/vocab.py +20 -0
  294. arekit/contrib/utils/pipelines/__init__.py +0 -0
  295. arekit/contrib/utils/pipelines/items/__init__.py +0 -0
  296. arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  297. arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
  298. arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
  299. arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
  300. arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
  301. arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
  302. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
  303. arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
  304. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
  305. arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
  306. arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
  307. arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
  308. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  309. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  310. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
  311. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
  312. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
  313. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  314. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
  315. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
  316. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
  317. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  318. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
  319. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
  320. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
  321. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  322. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
  323. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
  324. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  325. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
  326. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
  327. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
  328. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
  329. arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
  330. arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
  331. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
  332. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
  333. arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
  334. arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
  335. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
  336. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
  337. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
  338. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
  339. arekit/contrib/utils/processing/__init__.py +0 -0
  340. arekit/contrib/utils/processing/languages/__init__.py +0 -0
  341. arekit/contrib/utils/processing/languages/mods.py +12 -0
  342. arekit/contrib/utils/processing/languages/pos.py +23 -0
  343. arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  344. arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
  345. arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
  346. arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
  347. arekit/contrib/utils/processing/languages/ru/number.py +23 -0
  348. arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
  349. arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  350. arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
  351. arekit/contrib/utils/processing/pos/__init__.py +0 -0
  352. arekit/contrib/utils/processing/pos/base.py +12 -0
  353. arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
  354. arekit/contrib/utils/processing/pos/russian.py +10 -0
  355. arekit/contrib/utils/processing/text/__init__.py +0 -0
  356. arekit/contrib/utils/processing/text/tokens.py +127 -0
  357. arekit/contrib/utils/resources.py +25 -0
  358. arekit/contrib/utils/serializer.py +43 -0
  359. arekit/contrib/utils/sources/__init__.py +0 -0
  360. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  361. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  362. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
  363. arekit/contrib/utils/synonyms/__init__.py +0 -0
  364. arekit/contrib/utils/synonyms/simple.py +15 -0
  365. arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
  366. arekit/contrib/utils/vectorizers/__init__.py +0 -0
  367. arekit/contrib/utils/vectorizers/bpe.py +93 -0
  368. arekit/contrib/utils/vectorizers/random_norm.py +39 -0
  369. arekit/download_data.py +11 -0
  370. arekit-0.24.0.dist-info/LICENSE +21 -0
  371. arekit-0.24.0.dist-info/METADATA +23 -0
  372. arekit-0.24.0.dist-info/RECORD +374 -0
  373. arekit-0.24.0.dist-info/WHEEL +5 -0
  374. arekit-0.24.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,27 @@
1
+ from arekit.common.data.doc_provider import DocumentProvider
2
+ from arekit.contrib.source.nerel.reader import NerelDocReader
3
+ from arekit.contrib.source.nerel.versions import NerelVersions
4
+
5
+
6
class NERELDocProvider(DocumentProvider):
    """ Document provider for the NEREL collection: a Russian dataset with
        nested named entities, relations, events and linked entities.
        https://github.com/nerel-ds/NEREL
    """

    def __init__(self, filename_by_id, version):
        """ filename_by_id: dict
                Dictionary of {id: filename}, where
                - id: int
                - filename: str
            version: NerelVersions
                Specify the appropriate version of the NEREL collection.
        """
        assert isinstance(filename_by_id, dict)
        assert isinstance(version, NerelVersions)
        super().__init__()
        self.__filename_by_id = filename_by_id
        self.__version = version
        # The reader is bound to the collection version once, at construction time.
        self.__doc_reader = NerelDocReader(version)

    def by_id(self, doc_id):
        """ Read the document registered under `doc_id`, resolving its
            filename through the {id: filename} dictionary passed on construction.
        """
        doc_filename = self.__filename_by_id[doc_id]
        return self.__doc_reader.read_document(doc_id=doc_id, filename=doc_filename)
@@ -0,0 +1,65 @@
1
+ from arekit.common.data.doc_provider import DocumentProvider
2
+ from arekit.common.experiment.data_type import DataType
3
+ from arekit.contrib.source.nerel.io_utils import NerelIOUtils
4
+ from arekit.contrib.source.nerel.versions import NerelVersions
5
+ from arekit.contrib.utils.pipelines.sources.nerel.doc_provider import NERELDocProvider
6
+ from arekit.contrib.utils.pipelines.sources.nerel.labels_fmt import NerelAnyLabelFormatter
7
+ from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
8
+ from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
9
+ from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
10
+
11
+
12
def create_text_relation_extraction_pipeline(nerel_version,
                                             text_parser,
                                             label_formatter=NerelAnyLabelFormatter(),
                                             terms_per_context=50,
                                             doc_ops=None,
                                             docs_limit=None,
                                             custom_text_opinion_filters=None):
    """ Compose text-opinion extraction pipelines for the NEREL collection.

        nerel_version: NerelVersions
            Version of the collection; used for the default dataset split
            and the default document provider.
        text_parser:
            Forwarded as-is to every `text_opinion_extraction_pipeline` call.
        label_formatter:
            Formatter handed to `PredefinedTextOpinionAnnotator`.
            NOTE: the default instance is created once, when this function
            is defined, and is shared between calls that rely on the default.
        terms_per_context: int
            Argument of the default `DistanceLimitedTextOpinionFilter`.
        doc_ops: DocumentProvider or None
            When None, the dataset split is read through `NerelIOUtils`
            and a `NERELDocProvider` is created from it.
        docs_limit:
            Forwarded to `NerelIOUtils.read_dataset_split`; only consulted
            when `doc_ops` is None.
        custom_text_opinion_filters: list or None
            Extra filters appended after the default distance-based one.

        returns:
            dict {DataType.Train | DataType.Test | DataType.Dev: pipeline},
            or a tuple (that dict, data_folding) when the default split was
            read and its data folding is not None.
    """
    assert isinstance(nerel_version, NerelVersions)
    assert doc_ops is None or isinstance(doc_ops, DocumentProvider)
    assert custom_text_opinion_filters is None or isinstance(custom_text_opinion_filters, list)

    default_folding = None

    # Default initialization: no provider was given, so rely on the dataset split.
    if doc_ops is None:
        filenames_by_ids, default_folding = NerelIOUtils.read_dataset_split(
            version=nerel_version, docs_limit=docs_limit)
        doc_ops = NERELDocProvider(filename_by_id=filenames_by_ids, version=nerel_version)

    # The default filter goes first; custom filters follow it.
    filters = [DistanceLimitedTextOpinionFilter(terms_per_context)]
    if custom_text_opinion_filters is not None:
        filters.extend(custom_text_opinion_filters)

    annotator = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)

    def build_pipeline():
        # Every data type receives its own pipeline built from the same settings.
        return text_opinion_extraction_pipeline(
            text_parser=text_parser,
            get_doc_by_id_func=doc_ops.by_id,
            annotators=[annotator],
            entity_index_func=lambda brat_entity: brat_entity.ID,
            text_opinion_filters=filters)

    pipelines = {data_type: build_pipeline()
                 for data_type in (DataType.Train, DataType.Test, DataType.Dev)}

    # The folding is handed back only when it originates from the default split,
    # since the caller has no other way to obtain it.
    if default_folding is None:
        return pipelines

    return pipelines, default_folding
@@ -0,0 +1,60 @@
1
+ from arekit.common.labels.str_fmt import StringLabelsFormatter
2
+ from arekit.contrib.source.nerel import labels
3
+
4
+
5
class NerelAnyLabelFormatter(StringLabelsFormatter):
    """ String formatter that covers every label declared for the NEREL
        collection in `arekit.contrib.source.nerel.labels`.
    """

    def __init__(self):
        # Keyword names are the string representations of the labels;
        # the declaration order is kept as the mapping order.
        label_by_str = dict(
            OPINION_BELONGS_TO=labels.OpinionBelongsTo,
            OPINION_RELATES_TO=labels.OpinionRelatesTo,
            NEG_EFFECT_FROM=labels.NegEffectFrom,
            POS_EFFECT_FROM=labels.PosEffectFrom,
            NEG_STATE_FROM=labels.NegStateFrom,
            POS_STATE_FROM=labels.PosStateFrom,
            NEGATIVE_TO=labels.NegativeTo,
            POSITIVE_TO=labels.PositiveTo,
            STATE_BELONGS_TO=labels.STATE_BELONGS_TO,
            POS_AUTHOR_FROM=labels.PosAuthorFrom,
            NEG_AUTHOR_FROM=labels.NegAuthorFrom,
            ALTERNATIVE_NAME=labels.ALTERNATIVE_NAME,
            ORIGINS_FROM=labels.ORIGINS_FROM,
            START_TIME=labels.START_TIME,
            OWNER_OF=labels.OWNER_OF,
            SUBEVENT_OF=labels.SUBEVENT_OF,
            PARENT_OF=labels.PARENT_OF,
            SUBORDINATE_OF=labels.SUBORDINATE_OF,
            PART_OF=labels.PART_OF,
            TAKES_PLACE_IN=labels.TAKES_PLACE_IN,
            PARTICIPANT_IN=labels.PARTICIPANT_IN,
            WORKPLACE=labels.WORKPLACE,
            PENALIZED_AS=labels.PENALIZED_AS,
            WORKS_AS=labels.WORKS_AS,
            PLACE_OF_DEATH=labels.PLACE_OF_DEATH,
            PLACE_OF_BIRTH=labels.PLACE_OF_BIRTH,
            HAS_CAUSE=labels.HAS_CAUSE,
            AWARDED_WITH=labels.AWARDED_WITH,
            CAUSE_OF_DEATH=labels.CAUSE_OF_DEATH,
            CONVICTED_OF=labels.CONVICTED_OF,
            DATE_DEFUNCT_IN=labels.DATE_DEFUNCT_IN,
            DATE_FOUNDED_IN=labels.DATE_FOUNDED_IN,
            DATE_OF_BIRTH=labels.DATE_OF_BIRTH,
            DATE_OF_CREATION=labels.DATE_OF_CREATION,
            DATE_OF_DEATH=labels.DATE_OF_DEATH,
            END_TIME=labels.END_TIME,
            EXPENDITURE=labels.EXPENDITURE,
            FOUNDED_BY=labels.FOUNDED_BY,
            KNOWS=labels.KNOWS,
            RELATIVE=labels.RELATIVE,
            LOCATED_IN=labels.LOCATED_IN,
            RELIGION_OF=labels.RELIGION_OF,
            MEDICAL_CONDITION=labels.MEDICAL_CONDITION,
            SCHOOLS_ATTENDED=labels.SCHOOLS_ATTENDED,
            MEMBER_OF=labels.MEMBER_OF,
            SIBLING=labels.SIBLING,
            ORGANIZES=labels.ORGANIZES,
            SPOUSE=labels.SPOUSE,
        )

        super().__init__(stol=label_by_str)
@@ -0,0 +1,29 @@
1
+ from arekit.common.data.doc_provider import DocumentProvider
2
+ from arekit.contrib.source.nerelbio.reader import NerelBioDocReader
3
+ from arekit.contrib.source.nerelbio.versions import NerelBioVersions
4
+
5
+
6
class NERELBioDocProvider(DocumentProvider):
    """ Document provider for the NEREL-BIO collection.
        NEREL-BIO extends the general domain dataset NEREL; its annotation
        scheme covers both general and biomedical domains, which makes it
        suitable for domain transfer experiments.
        https://github.com/nerel-ds/NEREL-BIO
    """

    def __init__(self, filename_by_id, version):
        """ filename_by_id: dict
                Dictionary of {id: filename}, where
                - id: int
                - filename: str
            version: NerelBioVersions
                Specify the appropriate version of the NEREL-BIO collection.
        """
        assert isinstance(filename_by_id, dict)
        assert isinstance(version, NerelBioVersions)
        super().__init__()
        self.__filename_by_id = filename_by_id
        self.__version = version
        # The reader is bound to the collection version once, at construction time.
        self.__doc_reader = NerelBioDocReader(version)

    def by_id(self, doc_id):
        """ Read the document registered under `doc_id`, resolving its
            filename through the {id: filename} dictionary passed on construction.
        """
        doc_filename = self.__filename_by_id[doc_id]
        return self.__doc_reader.read_document(doc_id=doc_id, filename=doc_filename)
@@ -0,0 +1,64 @@
1
+ from arekit.common.data.doc_provider import DocumentProvider
2
+ from arekit.common.experiment.data_type import DataType
3
+ from arekit.contrib.source.nerelbio.io_utils import NerelBioIOUtils
4
+ from arekit.contrib.source.nerelbio.versions import NerelBioVersions
5
+ from arekit.contrib.utils.pipelines.sources.nerel_bio.doc_provider import NERELBioDocProvider
6
+ from arekit.contrib.utils.pipelines.sources.nerel_bio.labels_fmt import NerelBioAnyLabelFormatter
7
+ from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
8
+ from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
9
+ from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
10
+
11
+
12
def create_text_relation_extraction_pipeline(nerel_bio_version,
                                             text_parser,
                                             label_formatter=NerelBioAnyLabelFormatter(),
                                             terms_per_context=50,
                                             doc_ops=None,
                                             docs_limit=None,
                                             custom_text_opinion_filters=None):
    """ Compose text-opinion extraction pipelines for the NEREL-BIO collection.

        nerel_bio_version: NerelBioVersions
            Version of the collection; used for the default dataset split
            and the default document provider.
        text_parser:
            Forwarded as-is to every `text_opinion_extraction_pipeline` call.
        label_formatter:
            Formatter handed to `PredefinedTextOpinionAnnotator`.
            NOTE: the default instance is created once, when this function
            is defined, and is shared between calls that rely on the default.
        terms_per_context: int
            Argument of the default `DistanceLimitedTextOpinionFilter`.
        doc_ops: DocumentProvider or None
            When None, the dataset split is read through `NerelBioIOUtils`
            and a `NERELBioDocProvider` is created from it.
        docs_limit:
            Forwarded to `NerelBioIOUtils.read_dataset_split`; only consulted
            when `doc_ops` is None.
        custom_text_opinion_filters: list or None
            Extra filters appended after the default distance-based one.

        returns:
            dict {DataType.Train | DataType.Test | DataType.Dev: pipeline},
            or a tuple (that dict, data_folding) when the default split was
            read and its data folding is not None.
    """
    assert isinstance(nerel_bio_version, NerelBioVersions)
    assert doc_ops is None or isinstance(doc_ops, DocumentProvider)
    assert custom_text_opinion_filters is None or isinstance(custom_text_opinion_filters, list)

    default_folding = None

    # Default initialization: no provider was given, so rely on the dataset split.
    if doc_ops is None:
        filenames_by_ids, default_folding = NerelBioIOUtils.read_dataset_split(
            version=nerel_bio_version, docs_limit=docs_limit)
        doc_ops = NERELBioDocProvider(filename_by_id=filenames_by_ids, version=nerel_bio_version)

    # The default filter goes first; custom filters follow it.
    filters = [DistanceLimitedTextOpinionFilter(terms_per_context)]
    if custom_text_opinion_filters is not None:
        filters.extend(custom_text_opinion_filters)

    annotator = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)

    def build_pipeline():
        # Every data type receives its own pipeline built from the same settings.
        return text_opinion_extraction_pipeline(
            text_parser=text_parser,
            get_doc_by_id_func=doc_ops.by_id,
            annotators=[annotator],
            entity_index_func=lambda brat_entity: brat_entity.ID,
            text_opinion_filters=filters)

    pipelines = {data_type: build_pipeline()
                 for data_type in (DataType.Train, DataType.Test, DataType.Dev)}

    # The folding is handed back only when it originates from the default split,
    # since the caller has no other way to obtain it.
    if default_folding is None:
        return pipelines

    return pipelines, default_folding
@@ -0,0 +1,79 @@
1
+ from arekit.common.labels.str_fmt import StringLabelsFormatter
2
+ from arekit.contrib.source.nerelbio import labels
3
+
4
+
5
class NerelBioAnyLabelFormatter(StringLabelsFormatter):
    """ Labels formatter that maps the string name of a relation onto the
        attribute of the same name declared in the `labels` module.
    """

    def __init__(self):

        # Every key of the resulting table coincides with the name of the
        # attribute of the `labels` module; the registration order is kept.
        relation_names = (
            "ABBREVIATION",
            "ALTERNATIVE_NAME",
            "KNOWS",
            "AGE_IS",
            "AGE_DIED_AT",
            "AWARDED_WITH",
            "PLACE_OF_BIRTH",
            "DATE_DEFUNCT_IN",
            "DATE_FOUNDED_IN",
            "DATE_OF_BIRTH",
            "DATE_OF_CREATION",
            "DATE_OF_DEATH",
            "POINT_IN_TIME",
            "PLACE_OF_DEATH",
            "FOUNDED_BY",
            "HEADQUARTERED_IN",
            "IDEOLOGY_OF",
            "SPOUSE",
            "MEMBER_OF",
            "ORGANIZES",
            "OWNER_OF",
            "PARENT_OF",
            "PARTICIPANT_IN",
            "PLACE_RESIDES_IN",
            "PRICE_OF",
            "PRODUCES",
            "RELATIVE",
            "RELIGION_OF",
            "SCHOOLS_ATTENDED",
            "SIBLING",
            "SUBEVENT_OF",
            "SUBORDINATE_OF",
            "TAKES_PLACE_IN",
            "WORKPLACE",
            "WORKS_AS",
            "CONVICTED_OF",
            "PENALIZED_AS",
            "START_TIME",
            "END_TIME",
            "EXPENDITURE",
            "AGENT",
            "INANIMATE_INVOLVED",
            "INCOME",
            "SUBCLASS_OF",
            "PART_OF",
            "LOCATED_IN",
            "TREATED_USING",
            "ORIGINS_FROM",
            "TO_DETECT_OR_STUDY",
            "AFFECTS",
            "HAS_CAUSE",
            "APPLIED_TO",
            "USED_IN",
            "ASSOCIATED_WITH",
            "HAS_ADMINISTRATION_ROUTE",
            "HAS_STRENGTH",
            "DURATION_OF",
            "VALUE_IS",
            "PHYSIOLOGY_OF",
            "PROCEDURE_PERFORMED",
            "MENTAL_PROCESS_OF",
            "MEDICAL_CONDITION",
            "DOSE_IS",
            "FINDING_OF",
            "CAUSE_OF_DEATH",
            "CONSUME",
        )

        stol = {name: getattr(labels, name) for name in relation_names}

        super().__init__(stol=stol)
79
+
@@ -0,0 +1,56 @@
1
+ from arekit.common.utils import progress_bar_iter
2
+ from arekit.contrib.source.ruattitudes.collection import RuAttitudesCollection
3
+ from arekit.contrib.source.ruattitudes.io_utils import RuAttitudesVersions
4
+ from arekit.contrib.source.ruattitudes.doc import RuAttitudesDocument
5
+ from arekit.contrib.source.ruattitudes.doc_brat import RuAttitudesDocumentsConverter
6
+ from arekit.contrib.utils.data.doc_provider.dict_based import DictionaryBasedDocumentProvider
7
+
8
+
9
class RuAttitudesDocumentProvider(DictionaryBasedDocumentProvider):
    """ Dictionary-based document provider for the RuAttitudes collection.
        The whole content is read at construction time and kept in memory.
    """

    def __init__(self, version, keep_doc_ids_only, doc_id_func, limit=None):
        """ version: RuAttitudesVersions
                Version of the RuAttitudes collection to read.
            keep_doc_ids_only: bool
                When True, only document ids are kept and every value is None.
            doc_id_func: callable
                Passed to the collection reader as `get_doc_index_func`.
            limit: int or None
                Maximal amount of entries to read; None means no limit.
        """
        d = self.read_ruattitudes_to_brat_in_memory(version=version,
                                                    keep_doc_ids_only=keep_doc_ids_only,
                                                    doc_id_func=doc_id_func,
                                                    limit=limit)
        super(RuAttitudesDocumentProvider, self).__init__(d)

    @staticmethod
    def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, limit=None):
        """ Reads RuAttitudes documents and returns a dictionary {doc_id: document}.

            Every document is converted with `RuAttitudesDocumentsConverter.to_brat_doc`.
            When `keep_doc_ids_only` is True the reader yields only the ids,
            so every value of the resulting dictionary is None.

            limit: int or None
                Maximal amount of entries to read; None means no limit.
                A non-positive limit results in an empty dictionary.
        """
        assert (isinstance(version, RuAttitudesVersions))
        assert (isinstance(keep_doc_ids_only, bool))
        assert (callable(doc_id_func))

        # The limit is checked after each insertion below, so a non-positive
        # limit has to be handled up front; otherwise one entry is still read.
        if limit is not None and limit <= 0:
            return {}

        it = RuAttitudesCollection.iter_docs(version=version,
                                             get_doc_index_func=doc_id_func,
                                             return_inds_only=keep_doc_ids_only)

        it_formatted_and_logged = progress_bar_iter(
            iterable=RuAttitudesDocumentProvider.__iter_id_with_doc(
                docs_it=it, keep_doc_ids_only=keep_doc_ids_only),
            desc="Loading RuAttitudes Collection [{}]".format("doc ids only" if keep_doc_ids_only else "fully"),
            unit='docs')

        d = {}
        docs_read = 0
        for doc_id, doc in it_formatted_and_logged:
            assert (isinstance(doc, RuAttitudesDocument) or doc is None)
            d[doc_id] = RuAttitudesDocumentsConverter.to_brat_doc(doc) if doc is not None else None
            docs_read += 1
            # Stop right after the limit is reached, without pulling the next entry.
            if limit is not None and docs_read >= limit:
                break

        return d

    @staticmethod
    def __iter_id_with_doc(docs_it, keep_doc_ids_only):
        """ Unifies both reading modes into a stream of (doc_id, doc) pairs,
            where doc is None in the ids-only mode.
        """
        if keep_doc_ids_only:
            for doc_id in docs_it:
                yield doc_id, None
        else:
            for doc in docs_it:
                assert (isinstance(doc, RuAttitudesDocument))
                yield doc.ID, doc
@@ -0,0 +1,20 @@
1
+ from arekit.common.entities.types import OpinionEntityType
2
+ from arekit.contrib.utils.entities.filter import EntityFilter
3
+
4
+
5
class RuAttitudesEntityFilter(EntityFilter):
    """ Task-specific filter for the entity types proposed by the
        OntoNotesV5 resource: https://catalog.ldc.upenn.edu/LDC2013T19
        Only a short list of types, related to the sentiment attitude
        extraction task, is supported.
    """

    supported = ["GPE", "PERSON", "LOCAL", "GEO", "ORG"]

    def is_ignored(self, entity, e_type):
        """ Returns True when the entity has to be skipped: either it plays
            neither the Subject nor the Object role, or its type is not
            among the supported ones.
        """
        plays_opinion_role = (e_type == OpinionEntityType.Subject or
                              e_type == OpinionEntityType.Object)

        # Any other role is ignored unconditionally.
        if not plays_opinion_role:
            return True

        return entity.Type not in RuAttitudesEntityFilter.supported
@@ -0,0 +1,65 @@
1
+ from arekit.common.labels.scaler.base import BaseLabelScaler
2
+ from arekit.contrib.source.ruattitudes.io_utils import RuAttitudesVersions
3
+ from arekit.contrib.source.ruattitudes.labels_fmt import RuAttitudesLabelFormatter
4
+ from arekit.contrib.utils.pipelines.sources.ruattitudes.doc_provider import RuAttitudesDocumentProvider
5
+ from arekit.contrib.utils.pipelines.sources.ruattitudes.entity_filter import RuAttitudesEntityFilter
6
+ from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
7
+ from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
8
+ from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
9
+ from arekit.contrib.utils.pipelines.text_opinion.filters.entity_based import EntityBasedTextOpinionFilter
10
+
11
+
12
def create_text_opinion_extraction_pipeline(text_parser,
                                            label_scaler,
                                            custom_text_opinion_filters=None,
                                            version=RuAttitudesVersions.V20Large,
                                            terms_per_context=50,
                                            limit=None):
    """ Processing pipeline for RuAttitudes.
        This pipeline is based on the in-memory RuAttitudes storage.

        Original collection paper: www.aclweb.org/anthology/r19-1118/
        Github repository: https://github.com/nicolay-r/RuAttitudes

        text_parser:
            Parser which is passed as-is into the text opinion extraction pipeline.
        label_scaler: BaseLabelScaler
            Scaler that allows to perform conversion from integer labels (RuAttitudes) to
            the actual `Label` instances, required in further for text_opinions instances.
        custom_text_opinion_filters: list or None
            Extra filters, applied after the default ones
            (entity-based and distance-based filters).
        version: enum
            Version of the RuAttitudes collection.
            NOTE: we consider to support a variations of the 2.0 versions.
        terms_per_context: int
            Amount of terms that we consider in between the Object and Subject.
        limit: int or None
            Limit of documents to consider.
    """
    assert(isinstance(label_scaler, BaseLabelScaler))
    assert(isinstance(version, RuAttitudesVersions))
    assert(isinstance(custom_text_opinion_filters, list) or custom_text_opinion_filters is None)
    assert(version in [RuAttitudesVersions.V20Large, RuAttitudesVersions.V20Base,
                       RuAttitudesVersions.V20BaseNeut, RuAttitudesVersions.V20LargeNeut])

    doc_provider = RuAttitudesDocumentProvider(version=version,
                                               keep_doc_ids_only=False,
                                               doc_id_func=lambda doc_id: doc_id,
                                               limit=limit)

    text_opinion_filters = [
        EntityBasedTextOpinionFilter(entity_filter=RuAttitudesEntityFilter()),
        DistanceLimitedTextOpinionFilter(terms_per_context)
    ]

    # Append with the custom filters afterwards.
    if custom_text_opinion_filters is not None:
        text_opinion_filters += custom_text_opinion_filters

    pipeline = text_opinion_extraction_pipeline(
        annotators=[
            PredefinedTextOpinionAnnotator(doc_provider=doc_provider,
                                           label_formatter=RuAttitudesLabelFormatter(label_scaler))
        ],
        # The merged list has to be passed here: passing the custom list alone
        # drops the default filters (and results in None when no custom filters given).
        text_opinion_filters=text_opinion_filters,
        get_doc_by_id_func=doc_provider.by_id,
        entity_index_func=lambda brat_entity: brat_entity.ID,
        text_parser=text_parser)

    return pipeline
@@ -0,0 +1,21 @@
1
+ from arekit.common.data.doc_provider import DocumentProvider
2
+ from arekit.common.synonyms.base import SynonymsCollection
3
+ from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions
4
+ from arekit.contrib.source.rusentrel.docs_reader import RuSentRelDocumentsReader
5
+
6
+
7
class RuSentrelDocumentProvider(DocumentProvider):
    """ Provider of the RuSentRel documents, read on demand by document id.
        Limitations: Supported only train/test collections format
    """

    def __init__(self, version, synonyms):
        """ version: RuSentRelVersions
                Version of the RuSentRel collection.
            synonyms: SynonymsCollection
                Collection which is passed to the documents reader.
        """
        assert isinstance(version, RuSentRelVersions)
        assert isinstance(synonyms, SynonymsCollection)
        super().__init__()
        self.__synonyms = synonyms
        self.__version = version

    def by_id(self, doc_id):
        """ Reads the document with the given integer identifier.
        """
        assert isinstance(doc_id, int)
        return RuSentRelDocumentsReader.read_document(
            doc_id=doc_id,
            synonyms=self.__synonyms,
            version=self.__version)
21
+
@@ -0,0 +1,107 @@
1
+ from arekit.common.labels.base import NoLabel
2
+ from arekit.common.labels.provider.constant import ConstantLabelProvider
3
+ from arekit.common.opinions.annot.algo.pair_based import PairBasedOpinionAnnotationAlgorithm
4
+ from arekit.common.opinions.annot.algo.predefined import PredefinedOpinionAnnotationAlgorithm
5
+ from arekit.common.opinions.collection import OpinionCollection
6
+ from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
7
+ from arekit.contrib.source.rusentrel.labels_fmt import RuSentRelLabelsFormatter
8
+ from arekit.contrib.source.rusentrel.opinions.collection import RuSentRelOpinions
9
+ from arekit.contrib.source.rusentrel.synonyms import RuSentRelSynonymsCollectionHelper
10
+ from arekit.contrib.utils.pipelines.sources.rusentrel.doc_provider import RuSentrelDocumentProvider
11
+ from arekit.contrib.utils.pipelines.text_opinion.annot.algo_based import AlgorithmBasedTextOpinionAnnotator
12
+ from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
13
+ from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
14
+ from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
15
+ from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection
16
+
17
+
18
def create_text_opinion_extraction_pipeline(rusentrel_version,
                                            text_parser,
                                            labels_fmt,
                                            custom_text_opinion_filters=None,
                                            no_label=NoLabel(),
                                            terms_per_context=50,
                                            dist_in_sentences=0):
    """ Composes the text opinion extraction pipeline for RuSentRel, based on:
            - predefined document-level annotation (sentiment labels)
            - automatic annotation of opinions between mentioned named entities (no-label)

        Original collection paper: arxiv.org/abs/1808.08932

        rusentrel_version: enum
            Version of the RuSentRel collection.
        text_parser:
            Parser which is passed as-is into the text opinion extraction pipeline.
        labels_fmt: RuSentRelLabelsFormatter
            Formatter of the labels of the predefined opinions.
        custom_text_opinion_filters: list or None
            Extra filters, applied after the default distance-based one.
        no_label:
            Label assigned by the automatic (no-label) annotator.
        terms_per_context: int
            Amount of terms that we consider in between the Object and Subject.
        dist_in_sentences: int
            Amount of sentences that could be in between Object and Subject.
    """
    assert isinstance(labels_fmt, RuSentRelLabelsFormatter)
    assert custom_text_opinion_filters is None or isinstance(custom_text_opinion_filters, list)

    stemmed_synonyms = StemmerBasedSynonymCollection(
        iter_group_values_lists=RuSentRelSynonymsCollectionHelper.iter_groups(rusentrel_version),
        stemmer=MystemWrapper(),
        is_read_only=False)

    documents = RuSentrelDocumentProvider(version=rusentrel_version, synonyms=stemmed_synonyms)

    # Default filter goes first, the custom ones are placed afterwards.
    filters = [DistanceLimitedTextOpinionFilter(terms_per_context)]
    if custom_text_opinion_filters is not None:
        filters.extend(custom_text_opinion_filters)

    opinion_annotators = [
        predefined_annotator(synonyms=stemmed_synonyms, labels_fmt=labels_fmt),
        nolabel_annotator(synonyms=stemmed_synonyms,
                          terms_per_context=terms_per_context,
                          dist_in_sentences=dist_in_sentences,
                          no_label=no_label),
    ]

    return text_opinion_extraction_pipeline(
        annotators=opinion_annotators,
        text_opinion_filters=filters,
        get_doc_by_id_func=documents.by_id,
        entity_index_func=lambda brat_entity: brat_entity.ID,
        text_parser=text_parser)
68
+
69
+
70
def nolabel_annotator(synonyms, terms_per_context, dist_in_sentences=0, no_label=NoLabel()):
    """ Default annotator of the `neutral`-like attitudes.
        Every annotated pair receives the `no_label` label, which
        might be customized to the one required in your studies.
    """

    def create_empty_collection():
        return OpinionCollection(synonyms=synonyms,
                                 error_on_duplicates=True,
                                 error_on_synonym_end_missed=False)

    def value_to_group_id(value):
        return SynonymsCollectionValuesGroupingProviders.provide_existed_value(
            synonyms=synonyms, value=value)

    pair_based_algo = PairBasedOpinionAnnotationAlgorithm(
        dist_in_sents=dist_in_sentences,
        dist_in_terms_bound=terms_per_context,
        label_provider=ConstantLabelProvider(no_label),
        entity_index_func=lambda brat_entity: brat_entity.ID)

    return AlgorithmBasedTextOpinionAnnotator(
        annot_algo=pair_based_algo,
        create_empty_collection_func=create_empty_collection,
        value_to_group_id_func=value_to_group_id)
84
+
85
+
86
def predefined_annotator(synonyms, labels_fmt):
    """ Annotator-converter of the predefined Document-Level opinions
        onto the text-level ones.
    """

    def read_opinions(doc_id):
        return __get_document_opinions(doc_id=doc_id, synonyms=synonyms, labels_fmt=labels_fmt)

    def create_empty_collection():
        return OpinionCollection(synonyms=synonyms,
                                 error_on_duplicates=True,
                                 error_on_synonym_end_missed=False)

    def value_to_group_id(value):
        return SynonymsCollectionValuesGroupingProviders.provide_existed_value(
            synonyms=synonyms, value=value)

    return AlgorithmBasedTextOpinionAnnotator(
        annot_algo=PredefinedOpinionAnnotationAlgorithm(read_opinions),
        create_empty_collection_func=create_empty_collection,
        value_to_group_id_func=value_to_group_id)
96
+
97
+
98
def __get_document_opinions(doc_id, synonyms, labels_fmt):
    """ Composes the OpinionCollection of the predefined Document-Level
        opinions, which RuSentRel provides for the document of the given doc_id.
    """
    predefined_opinions = RuSentRelOpinions.iter_from_doc(doc_id=doc_id, labels_fmt=labels_fmt)

    return OpinionCollection(opinions=predefined_opinions,
                             synonyms=synonyms,
                             error_on_synonym_end_missed=True,
                             error_on_duplicates=True)
@@ -0,0 +1,29 @@
1
+ from arekit.common.data.doc_provider import DocumentProvider
2
+ from arekit.contrib.source.sentinerel.io_utils import SentiNerelVersions
3
+ from arekit.contrib.source.sentinerel.reader import SentiNerelDocReader
4
+
5
+
6
class SentiNERELDocProvider(DocumentProvider):
    """ Document reader for the collection of the RuSentNE competition 2023.
        For more details please follow the following repository:
        github: https://github.com/dialogue-evaluation/RuSentNE-evaluation
    """

    def __init__(self, filename_by_id, version):
        """ filename_by_id: dict
                Dictionary of {id: filename}, where
                - id: int
                - filename: str
            version: SentiNerelVersions
                Specify the appropriate version of the SentiNEREL collection.
        """
        assert isinstance(filename_by_id, dict)
        assert isinstance(version, SentiNerelVersions)
        super().__init__()
        self.__version = version
        self.__filename_by_id = filename_by_id

    def by_id(self, doc_id):
        """ Reads the document by its id; the related filename
            is taken from the dictionary passed at construction time.
        """
        filename = self.__filename_by_id[doc_id]
        return SentiNerelDocReader.read_document(doc_id=doc_id,
                                                 version=self.__version,
                                                 filename=filename)