arekit 0.25.0__tar.gz → 0.25.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (264)
  1. {arekit-0.25.0 → arekit-0.25.1}/PKG-INFO +4 -5
  2. {arekit-0.25.0 → arekit-0.25.1}/README.md +4 -4
  3. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/storages/base.py +4 -15
  4. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parser.py +3 -30
  5. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/base.py +1 -1
  6. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/utils.py +11 -8
  7. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  8. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/storages/pandas_based.py +2 -17
  9. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/storages/row_cache.py +2 -1
  10. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/storages/sqlite_based.py +2 -1
  11. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/extraction.py +5 -4
  12. {arekit-0.25.0 → arekit-0.25.1}/arekit.egg-info/PKG-INFO +4 -5
  13. {arekit-0.25.0 → arekit-0.25.1}/arekit.egg-info/SOURCES.txt +1 -74
  14. {arekit-0.25.0 → arekit-0.25.1}/arekit.egg-info/requires.txt +0 -1
  15. {arekit-0.25.0 → arekit-0.25.1}/setup.py +2 -3
  16. arekit-0.25.0/arekit/common/data/input/repositories/base.py +0 -68
  17. arekit-0.25.0/arekit/common/data/input/repositories/sample.py +0 -22
  18. arekit-0.25.0/arekit/common/data/views/samples.py +0 -26
  19. arekit-0.25.0/arekit/common/service/sqlite.py +0 -36
  20. arekit-0.25.0/arekit/contrib/networks/embedding.py +0 -149
  21. arekit-0.25.0/arekit/contrib/networks/embedding_io.py +0 -18
  22. arekit-0.25.0/arekit/contrib/networks/input/const.py +0 -6
  23. arekit-0.25.0/arekit/contrib/networks/input/ctx_serialization.py +0 -28
  24. arekit-0.25.0/arekit/contrib/networks/input/embedding/matrix.py +0 -29
  25. arekit-0.25.0/arekit/contrib/networks/input/embedding/offsets.py +0 -55
  26. arekit-0.25.0/arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  27. arekit-0.25.0/arekit/contrib/networks/input/providers/sample.py +0 -129
  28. arekit-0.25.0/arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  29. arekit-0.25.0/arekit/contrib/networks/input/providers/text.py +0 -24
  30. arekit-0.25.0/arekit/contrib/networks/input/rows_parser.py +0 -47
  31. arekit-0.25.0/arekit/contrib/networks/input/term_types.py +0 -13
  32. arekit-0.25.0/arekit/contrib/networks/input/terms_mapping.py +0 -60
  33. arekit-0.25.0/arekit/contrib/networks/vectorizer.py +0 -6
  34. arekit-0.25.0/arekit/contrib/utils/data/readers/base.py +0 -7
  35. arekit-0.25.0/arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  36. arekit-0.25.0/arekit/contrib/utils/data/readers/jsonl.py +0 -15
  37. arekit-0.25.0/arekit/contrib/utils/data/readers/sqlite.py +0 -14
  38. arekit-0.25.0/arekit/contrib/utils/data/service/balance.py +0 -50
  39. arekit-0.25.0/arekit/contrib/utils/data/writers/csv_native.py +0 -63
  40. arekit-0.25.0/arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  41. arekit-0.25.0/arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  42. arekit-0.25.0/arekit/contrib/utils/data/writers/sqlite_native.py +0 -114
  43. arekit-0.25.0/arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  44. arekit-0.25.0/arekit/contrib/utils/embeddings/tokens.py +0 -30
  45. arekit-0.25.0/arekit/contrib/utils/entities/__init__.py +0 -0
  46. arekit-0.25.0/arekit/contrib/utils/entities/formatters/__init__.py +0 -0
  47. arekit-0.25.0/arekit/contrib/utils/io_utils/__init__.py +0 -0
  48. arekit-0.25.0/arekit/contrib/utils/io_utils/embedding.py +0 -72
  49. arekit-0.25.0/arekit/contrib/utils/np_utils/__init__.py +0 -0
  50. arekit-0.25.0/arekit/contrib/utils/np_utils/embedding.py +0 -22
  51. arekit-0.25.0/arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  52. arekit-0.25.0/arekit/contrib/utils/np_utils/vocab.py +0 -20
  53. arekit-0.25.0/arekit/contrib/utils/pipelines/__init__.py +0 -0
  54. arekit-0.25.0/arekit/contrib/utils/pipelines/items/__init__.py +0 -0
  55. arekit-0.25.0/arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  56. arekit-0.25.0/arekit/contrib/utils/pipelines/items/sampling/base.py +0 -94
  57. arekit-0.25.0/arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -55
  58. arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
  59. arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  60. arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -33
  61. arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -105
  62. arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/translator.py +0 -136
  63. arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
  64. arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
  65. arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
  66. arekit-0.25.0/arekit/contrib/utils/processing/__init__.py +0 -0
  67. arekit-0.25.0/arekit/contrib/utils/processing/languages/__init__.py +0 -0
  68. arekit-0.25.0/arekit/contrib/utils/processing/languages/mods.py +0 -12
  69. arekit-0.25.0/arekit/contrib/utils/processing/languages/pos.py +0 -23
  70. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  71. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  72. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  73. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  74. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  75. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  76. arekit-0.25.0/arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  77. arekit-0.25.0/arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  78. arekit-0.25.0/arekit/contrib/utils/processing/pos/__init__.py +0 -0
  79. arekit-0.25.0/arekit/contrib/utils/processing/pos/base.py +0 -12
  80. arekit-0.25.0/arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  81. arekit-0.25.0/arekit/contrib/utils/processing/pos/russian.py +0 -10
  82. arekit-0.25.0/arekit/contrib/utils/processing/text/__init__.py +0 -0
  83. arekit-0.25.0/arekit/contrib/utils/processing/text/tokens.py +0 -127
  84. arekit-0.25.0/arekit/contrib/utils/serializer.py +0 -42
  85. arekit-0.25.0/arekit/contrib/utils/synonyms/__init__.py +0 -0
  86. arekit-0.25.0/arekit/contrib/utils/vectorizers/__init__.py +0 -0
  87. arekit-0.25.0/arekit/contrib/utils/vectorizers/bpe.py +0 -93
  88. arekit-0.25.0/arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  89. {arekit-0.25.0 → arekit-0.25.1}/LICENSE +0 -0
  90. {arekit-0.25.0 → arekit-0.25.1}/arekit/__init__.py +0 -0
  91. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/__init__.py +0 -0
  92. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/bound.py +0 -0
  93. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/context/__init__.py +0 -0
  94. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/context/terms_mapper.py +0 -0
  95. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/context/token.py +0 -0
  96. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/__init__.py +0 -0
  97. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/const.py +0 -0
  98. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/doc_provider.py +0 -0
  99. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/__init__.py +0 -0
  100. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/__init__.py +0 -0
  101. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/columns/__init__.py +0 -0
  102. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/columns/base.py +0 -0
  103. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/columns/sample.py +0 -0
  104. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/const.py +0 -0
  105. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/contents.py +0 -0
  106. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/__init__.py +0 -0
  107. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/base.py +0 -0
  108. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/multiple.py +0 -0
  109. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/single.py +0 -0
  110. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/__init__.py +0 -0
  111. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/base.py +0 -0
  112. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/binary.py +0 -0
  113. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/multiple.py +0 -0
  114. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/rows/__init__.py +0 -0
  115. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/rows/base.py +0 -0
  116. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/rows/samples.py +0 -0
  117. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/sample/__init__.py +0 -0
  118. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/sample/cropped.py +0 -0
  119. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/text/__init__.py +0 -0
  120. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/text/single.py +0 -0
  121. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/sample.py +0 -0
  122. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/terms_mapper.py +0 -0
  123. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/rows_fmt.py +0 -0
  124. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/rows_parser.py +0 -0
  125. {arekit-0.25.0/arekit/common/data/input/repositories → arekit-0.25.1/arekit/common/data/storages}/__init__.py +0 -0
  126. {arekit-0.25.0/arekit/common/data/storages → arekit-0.25.1/arekit/common/docs}/__init__.py +0 -0
  127. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/base.py +0 -0
  128. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/entities_grouping.py +0 -0
  129. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/entity.py +0 -0
  130. {arekit-0.25.0/arekit/common/data/views → arekit-0.25.1/arekit/common/docs/parsed}/__init__.py +0 -0
  131. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/base.py +0 -0
  132. {arekit-0.25.0/arekit/common/docs → arekit-0.25.1/arekit/common/docs/parsed/providers}/__init__.py +0 -0
  133. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/base.py +0 -0
  134. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/base_pairs.py +0 -0
  135. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/entity_service.py +0 -0
  136. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/opinion_pairs.py +0 -0
  137. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/text_opinion_pairs.py +0 -0
  138. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/service.py +0 -0
  139. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/term_position.py +0 -0
  140. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/sentence.py +0 -0
  141. {arekit-0.25.0/arekit/common/docs/parsed → arekit-0.25.1/arekit/common/entities}/__init__.py +0 -0
  142. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/entities/base.py +0 -0
  143. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/entities/collection.py +0 -0
  144. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/entities/str_fmt.py +0 -0
  145. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/entities/types.py +0 -0
  146. {arekit-0.25.0/arekit/common/docs/parsed/providers → arekit-0.25.1/arekit/common/experiment}/__init__.py +0 -0
  147. {arekit-0.25.0/arekit/common/entities → arekit-0.25.1/arekit/common/experiment/api}/__init__.py +0 -0
  148. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/experiment/api/base_samples_io.py +0 -0
  149. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/experiment/data_type.py +0 -0
  150. {arekit-0.25.0/arekit/common/experiment → arekit-0.25.1/arekit/common/frames}/__init__.py +0 -0
  151. {arekit-0.25.0/arekit/common/experiment/api → arekit-0.25.1/arekit/common/frames/connotations}/__init__.py +0 -0
  152. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/frames/connotations/descriptor.py +0 -0
  153. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/frames/connotations/provider.py +0 -0
  154. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/frames/text_variant.py +0 -0
  155. {arekit-0.25.0/arekit/common/frames → arekit-0.25.1/arekit/common/frames/variants}/__init__.py +0 -0
  156. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/frames/variants/base.py +0 -0
  157. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/frames/variants/collection.py +0 -0
  158. {arekit-0.25.0/arekit/common/frames/connotations → arekit-0.25.1/arekit/common/labels}/__init__.py +0 -0
  159. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/base.py +0 -0
  160. {arekit-0.25.0/arekit/common/frames/variants → arekit-0.25.1/arekit/common/labels/provider}/__init__.py +0 -0
  161. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/provider/base.py +0 -0
  162. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/provider/constant.py +0 -0
  163. {arekit-0.25.0/arekit/common/labels → arekit-0.25.1/arekit/common/labels/scaler}/__init__.py +0 -0
  164. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/scaler/base.py +0 -0
  165. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/scaler/sentiment.py +0 -0
  166. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/scaler/single.py +0 -0
  167. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/str_fmt.py +0 -0
  168. {arekit-0.25.0/arekit/common/labels/provider → arekit-0.25.1/arekit/common/linkage}/__init__.py +0 -0
  169. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/linkage/base.py +0 -0
  170. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/linkage/meta.py +0 -0
  171. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/linkage/opinions.py +0 -0
  172. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/linkage/text_opinions.py +0 -0
  173. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/log_utils.py +0 -0
  174. {arekit-0.25.0/arekit/common/labels/scaler → arekit-0.25.1/arekit/common/model}/__init__.py +0 -0
  175. {arekit-0.25.0/arekit/common/linkage → arekit-0.25.1/arekit/common/model/labeling}/__init__.py +0 -0
  176. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/model/labeling/base.py +0 -0
  177. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/model/labeling/modes.py +0 -0
  178. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/model/labeling/single.py +0 -0
  179. {arekit-0.25.0/arekit/common/model → arekit-0.25.1/arekit/common/opinions}/__init__.py +0 -0
  180. {arekit-0.25.0/arekit/common/model/labeling → arekit-0.25.1/arekit/common/opinions/annot}/__init__.py +0 -0
  181. {arekit-0.25.0/arekit/common/opinions → arekit-0.25.1/arekit/common/opinions/annot/algo}/__init__.py +0 -0
  182. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo/base.py +0 -0
  183. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo/pair_based.py +0 -0
  184. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo/predefined.py +0 -0
  185. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo_based.py +0 -0
  186. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/annot/base.py +0 -0
  187. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/base.py +0 -0
  188. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/collection.py +0 -0
  189. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/enums.py +0 -0
  190. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/provider.py +0 -0
  191. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/writer.py +0 -0
  192. {arekit-0.25.0/arekit/common/opinions/annot → arekit-0.25.1/arekit/common/pipeline}/__init__.py +0 -0
  193. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/base.py +0 -0
  194. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/batching.py +0 -0
  195. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/context.py +0 -0
  196. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/conts.py +0 -0
  197. {arekit-0.25.0/arekit/common/opinions/annot/algo → arekit-0.25.1/arekit/common/pipeline/items}/__init__.py +0 -0
  198. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/flatten.py +0 -0
  199. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/handle.py +0 -0
  200. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/iter.py +0 -0
  201. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/map.py +0 -0
  202. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/map_nested.py +0 -0
  203. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/utils.py +0 -0
  204. {arekit-0.25.0/arekit/common/pipeline → arekit-0.25.1/arekit/common/synonyms}/__init__.py +0 -0
  205. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/synonyms/base.py +0 -0
  206. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/synonyms/grouping.py +0 -0
  207. {arekit-0.25.0/arekit/common/pipeline/items → arekit-0.25.1/arekit/common/text}/__init__.py +0 -0
  208. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/text/enums.py +0 -0
  209. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/text/parsed.py +0 -0
  210. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/text/partitioning.py +0 -0
  211. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/text/stemmer.py +0 -0
  212. {arekit-0.25.0/arekit/common/service → arekit-0.25.1/arekit/common/text_opinions}/__init__.py +0 -0
  213. {arekit-0.25.0 → arekit-0.25.1}/arekit/common/text_opinions/base.py +0 -0
  214. {arekit-0.25.0/arekit/common/synonyms → arekit-0.25.1/arekit/contrib}/__init__.py +0 -0
  215. {arekit-0.25.0/arekit/common/text → arekit-0.25.1/arekit/contrib/bert}/__init__.py +0 -0
  216. {arekit-0.25.0/arekit/common/text_opinions → arekit-0.25.1/arekit/contrib/bert/input}/__init__.py +0 -0
  217. {arekit-0.25.0/arekit/contrib → arekit-0.25.1/arekit/contrib/bert/input/providers}/__init__.py +0 -0
  218. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/bert/input/providers/cropped_sample.py +0 -0
  219. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/bert/input/providers/text_pair.py +0 -0
  220. {arekit-0.25.0/arekit/contrib/bert → arekit-0.25.1/arekit/contrib/bert/terms}/__init__.py +0 -0
  221. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/bert/terms/mapper.py +0 -0
  222. {arekit-0.25.0/arekit/contrib/bert/input → arekit-0.25.1/arekit/contrib/prompt}/__init__.py +0 -0
  223. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/prompt/sample.py +0 -0
  224. {arekit-0.25.0/arekit/contrib/bert/input/providers → arekit-0.25.1/arekit/contrib/utils}/__init__.py +0 -0
  225. {arekit-0.25.0/arekit/contrib/bert/terms → arekit-0.25.1/arekit/contrib/utils/bert}/__init__.py +0 -0
  226. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/bert/samplers.py +0 -0
  227. {arekit-0.25.0/arekit/contrib/networks → arekit-0.25.1/arekit/contrib/utils/data}/__init__.py +0 -0
  228. {arekit-0.25.0/arekit/contrib/networks/input → arekit-0.25.1/arekit/contrib/utils/data/contents}/__init__.py +0 -0
  229. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/contents/opinions.py +0 -0
  230. {arekit-0.25.0/arekit/contrib/networks/input/embedding → arekit-0.25.1/arekit/contrib/utils/data/doc_provider}/__init__.py +0 -0
  231. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/doc_provider/dict_based.py +0 -0
  232. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/doc_provider/dir_based.py +0 -0
  233. {arekit-0.25.0/arekit/contrib/networks/input/formatters → arekit-0.25.1/arekit/contrib/utils/data/storages}/__init__.py +0 -0
  234. {arekit-0.25.0/arekit/contrib/networks/input/providers → arekit-0.25.1/arekit/contrib/utils/data/writers}/__init__.py +0 -0
  235. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/writers/base.py +0 -0
  236. {arekit-0.25.0/arekit/contrib/prompt → arekit-0.25.1/arekit/contrib/utils/entities}/__init__.py +0 -0
  237. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/entities/filter.py +0 -0
  238. {arekit-0.25.0/arekit/contrib/utils → arekit-0.25.1/arekit/contrib/utils/entities/formatters}/__init__.py +0 -0
  239. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/entities/formatters/str_display.py +0 -0
  240. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +0 -0
  241. {arekit-0.25.0/arekit/contrib/utils/bert → arekit-0.25.1/arekit/contrib/utils/io_utils}/__init__.py +0 -0
  242. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/io_utils/utils.py +0 -0
  243. {arekit-0.25.0/arekit/contrib/utils/data → arekit-0.25.1/arekit/contrib/utils/pipelines}/__init__.py +0 -0
  244. {arekit-0.25.0/arekit/contrib/utils/data/contents → arekit-0.25.1/arekit/contrib/utils/pipelines/items}/__init__.py +0 -0
  245. {arekit-0.25.0/arekit/contrib/utils/data/doc_provider → arekit-0.25.1/arekit/contrib/utils/pipelines/items/text}/__init__.py +0 -0
  246. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/items/text/entities_default.py +0 -0
  247. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/items/text/frames.py +0 -0
  248. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/opinion_collections.py +0 -0
  249. {arekit-0.25.0/arekit/contrib/utils/data/readers → arekit-0.25.1/arekit/contrib/utils/pipelines/text_opinion}/__init__.py +0 -0
  250. {arekit-0.25.0/arekit/contrib/utils/data/service → arekit-0.25.1/arekit/contrib/utils/pipelines/text_opinion/annot}/__init__.py +0 -0
  251. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +0 -0
  252. {arekit-0.25.0/arekit/contrib/utils/data/storages → arekit-0.25.1/arekit/contrib/utils/pipelines/text_opinion/filters}/__init__.py +0 -0
  253. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/base.py +0 -0
  254. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +0 -0
  255. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +0 -0
  256. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +0 -0
  257. {arekit-0.25.0/arekit/contrib/utils/data/writers → arekit-0.25.1/arekit/contrib/utils/processing}/__init__.py +0 -0
  258. {arekit-0.25.0/arekit/contrib/utils/embeddings → arekit-0.25.1/arekit/contrib/utils/synonyms}/__init__.py +0 -0
  259. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/synonyms/simple.py +0 -0
  260. {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/synonyms/stemmer_based.py +0 -0
  261. {arekit-0.25.0 → arekit-0.25.1}/arekit.egg-info/dependency_links.txt +0 -0
  262. {arekit-0.25.0 → arekit-0.25.1}/arekit.egg-info/top_level.txt +0 -0
  263. {arekit-0.25.0 → arekit-0.25.1}/logo.png +0 -0
  264. {arekit-0.25.0 → arekit-0.25.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arekit
3
- Version: 0.25.0
3
+ Version: 0.25.1
4
4
  Summary: Document level Attitude and Relation Extraction toolkit (AREkit) for sampling and prompting mass-media news into datasets for ML-model training
5
5
  Home-page: https://github.com/nicolay-r/AREkit
6
6
  Author: Nicolay Rusnachenko
@@ -18,9 +18,8 @@ License-File: LICENSE
18
18
  Requires-Dist: tqdm
19
19
  Requires-Dist: enum34==1.1.10
20
20
  Requires-Dist: numpy>=1.14.5
21
- Requires-Dist: pymystem3==0.2.0
22
21
 
23
- # AREkit 0.25.0
22
+ # AREkit 0.25.1
24
23
 
25
24
  ![](https://img.shields.io/badge/Python-3.9+-brightgreen.svg)
26
25
 
@@ -34,7 +33,7 @@ is a python toolkit, devoted to document level Attitude and Relation Extraction
34
33
  ## Description
35
34
 
36
35
 
37
- This toolkit aims at memory-effective data processing in Relation Extraction (RE) related tasks.
36
+ This toolkit aims at memory-effective data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
38
37
 
39
38
  <p align="center">
40
39
  <img src="docs/arekit-pipeline-concept.png"/>
@@ -60,7 +59,7 @@ for sentence level relations preparation (dubbed as contexts);
60
59
  ## Installation
61
60
 
62
61
  ```bash
63
- pip install git+https://github.com/nicolay-r/AREkit.git@0.25.0-rc
62
+ pip install git+https://github.com/nicolay-r/AREkit.git@0.25.1-rc
64
63
  ```
65
64
 
66
65
  ## Usage
@@ -1,4 +1,4 @@
1
- # AREkit 0.25.0
1
+ # AREkit 0.25.1
2
2
 
3
3
  ![](https://img.shields.io/badge/Python-3.9+-brightgreen.svg)
4
4
 
@@ -12,7 +12,7 @@ is a python toolkit, devoted to document level Attitude and Relation Extraction
12
12
  ## Description
13
13
 
14
14
 
15
- This toolkit aims at memory-effective data processing in Relation Extraction (RE) related tasks.
15
+ This toolkit aims at memory-effective data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
16
16
 
17
17
  <p align="center">
18
18
  <img src="docs/arekit-pipeline-concept.png"/>
@@ -38,7 +38,7 @@ for sentence level relations preparation (dubbed as contexts);
38
38
  ## Installation
39
39
 
40
40
  ```bash
41
- pip install git+https://github.com/nicolay-r/AREkit.git@0.25.0-rc
41
+ pip install git+https://github.com/nicolay-r/AREkit.git@0.25.1-rc
42
42
  ```
43
43
 
44
44
  ## Usage
@@ -57,4 +57,4 @@ if you use or extend our work, please cite as follows:
57
57
  year={2024},
58
58
  organization={Springer}
59
59
  }
60
- ```
60
+ ```
@@ -10,6 +10,9 @@ logger = logging.getLogger(__name__)
10
10
 
11
11
  class BaseRowsStorage(object):
12
12
 
13
+ def __init__(self, log_out=None):
14
+ self.__log_out = log_out
15
+
13
16
  # region protected methods
14
17
 
15
18
  def _begin_filling_row(self, row_ind):
@@ -31,27 +34,12 @@ class BaseRowsStorage(object):
31
34
  def _get_rows_count(self):
32
35
  raise NotImplemented()
33
36
 
34
- def find_by_value(self, column_name, value):
35
- raise NotImplemented()
36
-
37
- def find_first_by_value(self, column_name, value):
38
- raise NotImplemented()
39
-
40
- def iter_column_values(self, column_name, dtype=None):
41
- raise NotImplemented()
42
-
43
37
  def get_row(self, row_index):
44
38
  raise NotImplemented()
45
39
 
46
- def get_cell(self, row_index, column_name):
47
- raise NotImplemented()
48
-
49
40
  def init_empty(self, columns_provider):
50
41
  raise NotImplemented()
51
42
 
52
- def iter_shuffled(self):
53
- raise NotImplemented()
54
-
55
43
  def iter_column_names(self):
56
44
  raise NotImplemented()
57
45
 
@@ -81,6 +69,7 @@ class BaseRowsStorage(object):
81
69
  condition_func=lambda item: not isinstance(item[1], MetaEmptyLinkedDataWrapper),
82
70
  postfix_func=postfix_func,
83
71
  desc="{fmt}".format(fmt=desc),
72
+ file=self.__log_out,
84
73
  total=rows_count)
85
74
 
86
75
  for row_index, item in enumerate(pbar_it):
@@ -1,42 +1,14 @@
1
- from tqdm import tqdm
2
1
  from arekit.common.docs.base import Document
3
2
  from arekit.common.docs.parsed.base import ParsedDocument
4
- from arekit.common.pipeline.base import BasePipelineLauncher
5
3
  from arekit.common.pipeline.batching import BatchingPipelineLauncher
6
4
  from arekit.common.pipeline.context import PipelineContext
7
5
  from arekit.common.pipeline.utils import BatchIterator
8
6
  from arekit.common.text.parsed import BaseParsedText
7
+ from arekit.common.utils import progress_bar_defined
9
8
 
10
9
 
11
10
  class DocumentParsers(object):
12
11
 
13
- @staticmethod
14
- def parse(doc, pipeline_items, parent_ppl_ctx=None, src_key="input", show_progress=False):
15
- """ This document parser is based on single text parts (sentences)
16
- that passes sequentially through the pipeline of transformations.
17
- """
18
- assert(isinstance(doc, Document))
19
- assert(isinstance(pipeline_items, list))
20
- assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None)
21
-
22
- parsed_sentences = []
23
-
24
- data_it = range(doc.SentencesCount)
25
- progress_it = tqdm(data_it, disable=not show_progress)
26
-
27
- for sent_ind in progress_it:
28
-
29
- # Composing the context from a single sentence.
30
- ctx = PipelineContext({src_key: doc.get_sentence(sent_ind)}, parent_ctx=parent_ppl_ctx)
31
-
32
- # Apply all the operations.
33
- BasePipelineLauncher.run(pipeline=pipeline_items, pipeline_ctx=ctx, src_key=src_key)
34
-
35
- # Collecting the result.
36
- parsed_sentences.append(BaseParsedText(terms=ctx.provide("result")))
37
-
38
- return ParsedDocument(doc_id=doc.ID, parsed_sentences=parsed_sentences)
39
-
40
12
  @staticmethod
41
13
  def parse_batch(doc, pipeline_items, batch_size, parent_ppl_ctx=None, src_key="input", show_progress=False):
42
14
  """ This document parser is based on batch of sentences.
@@ -49,7 +21,8 @@ class DocumentParsers(object):
49
21
  parsed_sentences = []
50
22
 
51
23
  data_it = BatchIterator(data_iter=iter(range(doc.SentencesCount)), batch_size=batch_size)
52
- progress_it = tqdm(data_it, total=round(doc.SentencesCount / batch_size), disable=not show_progress)
24
+ progress_it = progress_bar_defined(data_it, total=round(doc.SentencesCount / batch_size),
25
+ disable=not show_progress)
53
26
 
54
27
  for batch in progress_it:
55
28
 
@@ -2,7 +2,7 @@ from arekit.common.pipeline.context import PipelineContext
2
2
 
3
3
 
4
4
  class BasePipelineItem(object):
5
- """ Single pipeline item that might be instatiated and embedded into pipeline.
5
+ """ Single pipeline item that might be instantiated and embedded into pipeline.
6
6
  """
7
7
 
8
8
  def __init__(self, src_key="result", result_key="result", src_func=None):
@@ -1,4 +1,3 @@
1
- import sys
2
1
  import os
3
2
  from tqdm import tqdm
4
3
 
@@ -27,14 +26,14 @@ def split_by_whitespaces(text):
27
26
  return text.split()
28
27
 
29
28
 
30
- def progress_bar(iterable, total, desc="", unit="it"):
29
+ def progress_bar(iterable, total, desc="", unit="it", file=None, disable=False):
31
30
  if total is not None:
32
- return progress_bar_defined(iterable=iterable, total=total, desc=desc, unit=unit)
31
+ return progress_bar_defined(iterable=iterable, total=total, desc=desc, unit=unit, file=file, disable=disable)
33
32
  else:
34
- return progress_bar_iter(iterable=iterable, desc=desc, unit=unit)
33
+ return progress_bar_iter(iterable=iterable, desc=desc, unit=unit, file=file, disable=disable)
35
34
 
36
35
 
37
- def progress_bar_conditional(iterable, condition_func, total, postfix_func=None, desc="", unit="it"):
36
+ def progress_bar_conditional(iterable, condition_func, total, postfix_func=None, desc="", unit="it", file=None):
38
37
  """ This progress-bar updates only on the
39
38
  specific conditions during the iteration process.
40
39
  """
@@ -47,7 +46,7 @@ def progress_bar_conditional(iterable, condition_func, total, postfix_func=None,
47
46
  yield 0
48
47
 
49
48
  pbar_it = progress_bar(iterable=__iter_infinite_placeholder(),
50
- desc=desc, unit=unit, total=total)
49
+ desc=desc, unit=unit, total=total, file=file)
51
50
  element = iter(pbar_it)
52
51
 
53
52
  # Initialize with 0.
@@ -65,7 +64,7 @@ def progress_bar_conditional(iterable, condition_func, total, postfix_func=None,
65
64
  pbar_it.set_postfix(postfix_func(item))
66
65
 
67
66
 
68
- def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it"):
67
+ def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it", file=None, disable=False):
69
68
  return tqdm(iterable=iterable,
70
69
  total=total,
71
70
  desc=desc,
@@ -73,13 +72,17 @@ def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it"):
73
72
  position=0,
74
73
  leave=True,
75
74
  unit=unit,
75
+ file=file,
76
+ disable=disable,
76
77
  miniters=total / miniters if total is not None else total)
77
78
 
78
79
 
79
- def progress_bar_iter(iterable, desc="", unit='it'):
80
+ def progress_bar_iter(iterable, desc="", unit='it', file=None, disable=False):
80
81
  return tqdm(iterable=iterable,
81
82
  desc=desc,
82
83
  position=0,
83
84
  leave=True,
84
85
  ncols=120,
86
+ file=file,
87
+ disable=disable,
85
88
  unit=unit)
@@ -5,8 +5,9 @@ from arekit.common.data.storages.base import BaseRowsStorage
5
5
 
6
6
  class JsonlBasedRowsStorage(BaseRowsStorage):
7
7
 
8
- def __init__(self, rows):
8
+ def __init__(self, rows, **kwargs):
9
9
  assert(isinstance(rows, list))
10
+ super(JsonlBasedRowsStorage, self).__init__(**kwargs)
10
11
  self.__rows = rows
11
12
 
12
13
  def _iter_rows(self):
@@ -12,7 +12,8 @@ class PandasBasedRowsStorage(BaseRowsStorage):
12
12
  based on the pandas DataFrames.
13
13
  """
14
14
 
15
- def __init__(self, df=None):
15
+ def __init__(self, df=None, **kwargs):
16
+ super(PandasBasedRowsStorage, self).__init__(**kwargs)
16
17
  self._df = df
17
18
 
18
19
  @property
@@ -96,26 +97,10 @@ class PandasBasedRowsStorage(BaseRowsStorage):
96
97
  def get_row(self, row_index):
97
98
  return self._df.iloc[row_index]
98
99
 
99
- def get_cell(self, row_index, column_name):
100
- return self._df.iloc[row_index][column_name]
101
-
102
- def iter_column_values(self, column_name, dtype=None):
103
- values = self._df[column_name]
104
- if dtype is None:
105
- return values
106
- return values.astype(dtype)
107
-
108
- def find_by_value(self, column_name, value):
109
- return self.__filter(column_name=column_name, value=value)
110
-
111
100
  def init_empty(self, columns_provider):
112
101
  cols_with_types = columns_provider.get_columns_list_with_types()
113
102
  self._df = self.__create_empty(cols_with_types)
114
103
 
115
- def iter_shuffled(self):
116
- shuffled_df = self._df.sample(frac=1)
117
- return self.__iter_rows_core(shuffled_df)
118
-
119
104
  def free(self):
120
105
  del self._df
121
106
  super(PandasBasedRowsStorage, self).free()
@@ -6,13 +6,14 @@ class RowCacheStorage(BaseRowsStorage):
6
6
  """ Row Caching storage kernel, based on python dictionary.
7
7
  """
8
8
 
9
- def __init__(self, force_collect_columns=None):
9
+ def __init__(self, force_collect_columns=None, **kwargs):
10
10
  """ This is a particular/related solution for the following issue:
11
11
  https://github.com/nicolay-r/AREkit/issues/464
12
12
  force_collect_columns: list
13
13
  columns that supposed to be additionally considered in output.
14
14
  """
15
15
  assert(isinstance(force_collect_columns, list) or force_collect_columns is None)
16
+ super(RowCacheStorage, self).__init__(**kwargs)
16
17
  self.__f = None
17
18
  self.__row_cache = {}
18
19
  self.__column_names = []
@@ -4,7 +4,8 @@ from arekit.common.data.storages.base import BaseRowsStorage
4
4
 
5
5
  class SQliteBasedRowsStorage(BaseRowsStorage):
6
6
 
7
- def __init__(self, path, table_name):
7
+ def __init__(self, path, table_name, **kwargs):
8
+ super(SQliteBasedRowsStorage, self).__init__(**kwargs)
8
9
  self.__path = path
9
10
  self.__table_name = table_name
10
11
  self.__conn = None
@@ -15,7 +15,7 @@ from arekit.contrib.utils.pipelines.text_opinion.filters.limitation import Frame
15
15
  def __iter_text_opinion_linkages(parsed_doc, annotators, entity_index_func,
16
16
  text_opinion_filters, use_meta):
17
17
  """ use_meta: bool
18
- this is mainly for tqdm and other console parameters to stay up-to-date
18
+ this is mainly for the progress-bar and other console parameters to stay up-to-date
19
19
  with the state in the case we do not have that much output results
20
20
  across multiple amount of documents.
21
21
  """
@@ -62,12 +62,13 @@ def __iter_text_opinion_linkages(parsed_doc, annotators, entity_index_func,
62
62
  yield MetaEmptyLinkedDataWrapper(doc_id=parsed_doc.RelatedDocID)
63
63
 
64
64
 
65
- def text_opinion_extraction_pipeline(pipeline_items, get_doc_by_id_func, annotators, entity_index_func,
65
+ def text_opinion_extraction_pipeline(pipeline_items, get_doc_by_id_func, annotators, entity_index_func, batch_size,
66
66
  text_opinion_filters=None, use_meta_between_docs=True):
67
67
  assert(callable(get_doc_by_id_func))
68
68
  assert(isinstance(annotators, list))
69
69
  assert(isinstance(text_opinion_filters, list) or text_opinion_filters is None)
70
70
  assert(isinstance(use_meta_between_docs, bool))
71
+ assert(isinstance(batch_size, int) and batch_size > 0)
71
72
 
72
73
  extra_filters = [] if text_opinion_filters is None else text_opinion_filters
73
74
  actual_text_opinion_filters = [FrameworkLimitationsTextOpinionFilter()] + extra_filters
@@ -77,8 +78,8 @@ def text_opinion_extraction_pipeline(pipeline_items, get_doc_by_id_func, annotat
77
78
  MapPipelineItem(map_func=lambda doc_id: get_doc_by_id_func(doc_id)),
78
79
 
79
80
  # (doc, ppl_ctx) -> (parsed_doc)
80
- MapNestedPipelineItem(map_func=lambda doc, ppl_ctx: DocumentParsers.parse(
81
- doc=doc, pipeline_items=pipeline_items, parent_ppl_ctx=ppl_ctx)),
81
+ MapNestedPipelineItem(map_func=lambda doc, ppl_ctx: DocumentParsers.parse_batch(
82
+ doc=doc, pipeline_items=pipeline_items, parent_ppl_ctx=ppl_ctx, batch_size=batch_size)),
82
83
 
83
84
  # (parsed_doc) -> (text_opinions)
84
85
  MapPipelineItem(map_func=lambda parsed_doc: __iter_text_opinion_linkages(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arekit
3
- Version: 0.25.0
3
+ Version: 0.25.1
4
4
  Summary: Document level Attitude and Relation Extraction toolkit (AREkit) for sampling and prompting mass-media news into datasets for ML-model training
5
5
  Home-page: https://github.com/nicolay-r/AREkit
6
6
  Author: Nicolay Rusnachenko
@@ -18,9 +18,8 @@ License-File: LICENSE
18
18
  Requires-Dist: tqdm
19
19
  Requires-Dist: enum34==1.1.10
20
20
  Requires-Dist: numpy>=1.14.5
21
- Requires-Dist: pymystem3==0.2.0
22
21
 
23
- # AREkit 0.25.0
22
+ # AREkit 0.25.1
24
23
 
25
24
  ![](https://img.shields.io/badge/Python-3.9+-brightgreen.svg)
26
25
 
@@ -34,7 +33,7 @@ is a python toolkit, devoted to document level Attitude and Relation Extraction
34
33
  ## Description
35
34
 
36
35
 
37
- This toolkit aims at memory-effective data processing in Relation Extraction (RE) related tasks.
36
+ This toolkit aims at memory-effective data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
38
37
 
39
38
  <p align="center">
40
39
  <img src="docs/arekit-pipeline-concept.png"/>
@@ -60,7 +59,7 @@ for sentence level relations preparation (dubbed as contexts);
60
59
  ## Installation
61
60
 
62
61
  ```bash
63
- pip install git+https://github.com/nicolay-r/AREkit.git@0.25.0-rc
62
+ pip install git+https://github.com/nicolay-r/AREkit.git@0.25.1-rc
64
63
  ```
65
64
 
66
65
  ## Usage
@@ -44,13 +44,8 @@ arekit/common/data/input/providers/sample/__init__.py
44
44
  arekit/common/data/input/providers/sample/cropped.py
45
45
  arekit/common/data/input/providers/text/__init__.py
46
46
  arekit/common/data/input/providers/text/single.py
47
- arekit/common/data/input/repositories/__init__.py
48
- arekit/common/data/input/repositories/base.py
49
- arekit/common/data/input/repositories/sample.py
50
47
  arekit/common/data/storages/__init__.py
51
48
  arekit/common/data/storages/base.py
52
- arekit/common/data/views/__init__.py
53
- arekit/common/data/views/samples.py
54
49
  arekit/common/docs/__init__.py
55
50
  arekit/common/docs/base.py
56
51
  arekit/common/docs/entities_grouping.py
@@ -130,8 +125,6 @@ arekit/common/pipeline/items/handle.py
130
125
  arekit/common/pipeline/items/iter.py
131
126
  arekit/common/pipeline/items/map.py
132
127
  arekit/common/pipeline/items/map_nested.py
133
- arekit/common/service/__init__.py
134
- arekit/common/service/sqlite.py
135
128
  arekit/common/synonyms/__init__.py
136
129
  arekit/common/synonyms/base.py
137
130
  arekit/common/synonyms/grouping.py
@@ -150,29 +143,9 @@ arekit/contrib/bert/input/providers/cropped_sample.py
150
143
  arekit/contrib/bert/input/providers/text_pair.py
151
144
  arekit/contrib/bert/terms/__init__.py
152
145
  arekit/contrib/bert/terms/mapper.py
153
- arekit/contrib/networks/__init__.py
154
- arekit/contrib/networks/embedding.py
155
- arekit/contrib/networks/embedding_io.py
156
- arekit/contrib/networks/vectorizer.py
157
- arekit/contrib/networks/input/__init__.py
158
- arekit/contrib/networks/input/const.py
159
- arekit/contrib/networks/input/ctx_serialization.py
160
- arekit/contrib/networks/input/rows_parser.py
161
- arekit/contrib/networks/input/term_types.py
162
- arekit/contrib/networks/input/terms_mapping.py
163
- arekit/contrib/networks/input/embedding/__init__.py
164
- arekit/contrib/networks/input/embedding/matrix.py
165
- arekit/contrib/networks/input/embedding/offsets.py
166
- arekit/contrib/networks/input/formatters/__init__.py
167
- arekit/contrib/networks/input/formatters/pos_mapper.py
168
- arekit/contrib/networks/input/providers/__init__.py
169
- arekit/contrib/networks/input/providers/sample.py
170
- arekit/contrib/networks/input/providers/term_connotation.py
171
- arekit/contrib/networks/input/providers/text.py
172
146
  arekit/contrib/prompt/__init__.py
173
147
  arekit/contrib/prompt/sample.py
174
148
  arekit/contrib/utils/__init__.py
175
- arekit/contrib/utils/serializer.py
176
149
  arekit/contrib/utils/bert/__init__.py
177
150
  arekit/contrib/utils/bert/samplers.py
178
151
  arekit/contrib/utils/data/__init__.py
@@ -181,13 +154,6 @@ arekit/contrib/utils/data/contents/opinions.py
181
154
  arekit/contrib/utils/data/doc_provider/__init__.py
182
155
  arekit/contrib/utils/data/doc_provider/dict_based.py
183
156
  arekit/contrib/utils/data/doc_provider/dir_based.py
184
- arekit/contrib/utils/data/readers/__init__.py
185
- arekit/contrib/utils/data/readers/base.py
186
- arekit/contrib/utils/data/readers/csv_pd.py
187
- arekit/contrib/utils/data/readers/jsonl.py
188
- arekit/contrib/utils/data/readers/sqlite.py
189
- arekit/contrib/utils/data/service/__init__.py
190
- arekit/contrib/utils/data/service/balance.py
191
157
  arekit/contrib/utils/data/storages/__init__.py
192
158
  arekit/contrib/utils/data/storages/jsonl_based.py
193
159
  arekit/contrib/utils/data/storages/pandas_based.py
@@ -195,38 +161,19 @@ arekit/contrib/utils/data/storages/row_cache.py
195
161
  arekit/contrib/utils/data/storages/sqlite_based.py
196
162
  arekit/contrib/utils/data/writers/__init__.py
197
163
  arekit/contrib/utils/data/writers/base.py
198
- arekit/contrib/utils/data/writers/csv_native.py
199
- arekit/contrib/utils/data/writers/csv_pd.py
200
- arekit/contrib/utils/data/writers/json_opennre.py
201
- arekit/contrib/utils/data/writers/sqlite_native.py
202
- arekit/contrib/utils/embeddings/__init__.py
203
- arekit/contrib/utils/embeddings/rusvectores.py
204
- arekit/contrib/utils/embeddings/tokens.py
205
164
  arekit/contrib/utils/entities/__init__.py
206
165
  arekit/contrib/utils/entities/filter.py
207
166
  arekit/contrib/utils/entities/formatters/__init__.py
208
167
  arekit/contrib/utils/entities/formatters/str_display.py
209
168
  arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py
210
169
  arekit/contrib/utils/io_utils/__init__.py
211
- arekit/contrib/utils/io_utils/embedding.py
212
170
  arekit/contrib/utils/io_utils/utils.py
213
- arekit/contrib/utils/np_utils/__init__.py
214
- arekit/contrib/utils/np_utils/embedding.py
215
- arekit/contrib/utils/np_utils/npz_utils.py
216
- arekit/contrib/utils/np_utils/vocab.py
217
171
  arekit/contrib/utils/pipelines/__init__.py
218
172
  arekit/contrib/utils/pipelines/opinion_collections.py
219
173
  arekit/contrib/utils/pipelines/items/__init__.py
220
- arekit/contrib/utils/pipelines/items/sampling/__init__.py
221
- arekit/contrib/utils/pipelines/items/sampling/base.py
222
- arekit/contrib/utils/pipelines/items/sampling/networks.py
223
174
  arekit/contrib/utils/pipelines/items/text/__init__.py
224
175
  arekit/contrib/utils/pipelines/items/text/entities_default.py
225
176
  arekit/contrib/utils/pipelines/items/text/frames.py
226
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py
227
- arekit/contrib/utils/pipelines/items/text/frames_negation.py
228
- arekit/contrib/utils/pipelines/items/text/tokenizer.py
229
- arekit/contrib/utils/pipelines/items/text/translator.py
230
177
  arekit/contrib/utils/pipelines/text_opinion/__init__.py
231
178
  arekit/contrib/utils/pipelines/text_opinion/extraction.py
232
179
  arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py
@@ -237,26 +184,6 @@ arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py
237
184
  arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py
238
185
  arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py
239
186
  arekit/contrib/utils/processing/__init__.py
240
- arekit/contrib/utils/processing/languages/__init__.py
241
- arekit/contrib/utils/processing/languages/mods.py
242
- arekit/contrib/utils/processing/languages/pos.py
243
- arekit/contrib/utils/processing/languages/ru/__init__.py
244
- arekit/contrib/utils/processing/languages/ru/cases.py
245
- arekit/contrib/utils/processing/languages/ru/constants.py
246
- arekit/contrib/utils/processing/languages/ru/mods.py
247
- arekit/contrib/utils/processing/languages/ru/number.py
248
- arekit/contrib/utils/processing/languages/ru/pos_service.py
249
- arekit/contrib/utils/processing/lemmatization/__init__.py
250
- arekit/contrib/utils/processing/lemmatization/mystem.py
251
- arekit/contrib/utils/processing/pos/__init__.py
252
- arekit/contrib/utils/processing/pos/base.py
253
- arekit/contrib/utils/processing/pos/mystem_wrap.py
254
- arekit/contrib/utils/processing/pos/russian.py
255
- arekit/contrib/utils/processing/text/__init__.py
256
- arekit/contrib/utils/processing/text/tokens.py
257
187
  arekit/contrib/utils/synonyms/__init__.py
258
188
  arekit/contrib/utils/synonyms/simple.py
259
- arekit/contrib/utils/synonyms/stemmer_based.py
260
- arekit/contrib/utils/vectorizers/__init__.py
261
- arekit/contrib/utils/vectorizers/bpe.py
262
- arekit/contrib/utils/vectorizers/random_norm.py
189
+ arekit/contrib/utils/synonyms/stemmer_based.py
@@ -1,4 +1,3 @@
1
1
  tqdm
2
2
  enum34==1.1.10
3
3
  numpy>=1.14.5
4
- pymystem3==0.2.0
@@ -15,7 +15,7 @@ def get_requirements(filenames):
15
15
 
16
16
  setup(
17
17
  name='arekit',
18
- version='0.25.0',
18
+ version='0.25.1',
19
19
  python_requires=">=3.6",
20
20
  description='Document level Attitude and Relation Extraction toolkit (AREkit)'
21
21
  ' for sampling and prompting mass-media news into datasets for ML-model training',
@@ -35,7 +35,6 @@ setup(
35
35
  keywords='natural language processing, relation extraction, sentiment analysis',
36
36
  packages=find_packages(),
37
37
  install_requires=get_requirements([
38
- 'dependencies.txt',
39
- 'arekit/contrib/utils/dependencies.txt']),
38
+ 'dependencies.txt']),
40
39
  data_files=["logo.png"],
41
40
  )
@@ -1,68 +0,0 @@
1
- from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
2
- from arekit.common.data.input.providers.contents import ContentsProvider
3
- from arekit.common.data.input.providers.rows.base import BaseRowProvider
4
- from arekit.common.data.storages.base import BaseRowsStorage
5
- from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
6
- from arekit.contrib.utils.data.writers.base import BaseWriter
7
-
8
-
9
- class BaseInputRepository(object):
10
-
11
- def __init__(self, columns_provider, rows_provider, storage):
12
- assert(isinstance(columns_provider, BaseColumnsProvider))
13
- assert(isinstance(rows_provider, BaseRowProvider))
14
- assert(isinstance(storage, BaseRowsStorage))
15
-
16
- self._columns_provider = columns_provider
17
- self._rows_provider = rows_provider
18
- self._storage = storage
19
-
20
- # Do setup operations.
21
- self._setup_columns_provider()
22
- self._setup_rows_provider()
23
-
24
- # region protected methods
25
-
26
- def _setup_columns_provider(self):
27
- pass
28
-
29
- def _setup_rows_provider(self):
30
- pass
31
-
32
- # endregion
33
-
34
- def populate(self, contents_provider, doc_ids, desc="", writer=None, target=None):
35
- assert(isinstance(contents_provider, ContentsProvider))
36
- assert(isinstance(self._storage, BaseRowsStorage))
37
- assert(isinstance(doc_ids, list))
38
- assert(isinstance(writer, BaseWriter) or writer is None)
39
- assert(isinstance(target, str) or target is None)
40
-
41
- def iter_rows(idle_mode):
42
- return self._rows_provider.iter_by_rows(
43
- contents_provider=contents_provider,
44
- doc_ids_iter=doc_ids,
45
- idle_mode=idle_mode)
46
-
47
- self._storage.init_empty(columns_provider=self._columns_provider)
48
-
49
- is_async_write_mode_on = writer is not None and target is not None
50
-
51
- if is_async_write_mode_on:
52
- writer.open_target(target)
53
-
54
- self._storage.fill(lambda idle_mode: iter_rows(idle_mode),
55
- columns_provider=self._columns_provider,
56
- row_handler=lambda: writer.commit_line(self._storage) if is_async_write_mode_on else None,
57
- desc=desc)
58
-
59
- if is_async_write_mode_on:
60
- writer.close_target()
61
-
62
- def push(self, writer, target, free_storage=True):
63
- if not isinstance(self._storage, RowCacheStorage):
64
- writer.write_all(self._storage, target)
65
-
66
- # After writing we free the contents of the storage.
67
- if free_storage:
68
- self._storage.free()
@@ -1,22 +0,0 @@
1
- import logging
2
-
3
- from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
4
- from arekit.common.data.input.repositories.base import BaseInputRepository
5
-
6
- logger = logging.getLogger(__name__)
7
- logging.basicConfig(level=logging.INFO)
8
-
9
-
10
- class BaseInputSamplesRepository(BaseInputRepository):
11
-
12
- def _setup_rows_provider(self):
13
- """ Setup store labels.
14
- """
15
- assert(isinstance(self._rows_provider, BaseSampleRowProvider))
16
- self._rows_provider.set_store_labels(self._columns_provider.StoreLabels)
17
-
18
- def _setup_columns_provider(self):
19
- """ Setup text column names.
20
- """
21
- text_column_names = list(self._rows_provider.TextProvider.iter_columns())
22
- self._columns_provider.set_text_column_names(text_column_names)
@@ -1,26 +0,0 @@
1
- from arekit.common.data import const
2
- from arekit.common.data.storages.base import BaseRowsStorage
3
-
4
-
5
- # TODO. This is a particular type of view, and expected to be off the core.
6
- class LinkedSamplesStorageView(object):
7
-
8
- def iter_from_storage(self, storage):
9
- assert(isinstance(storage, BaseRowsStorage))
10
- undefined = -1
11
-
12
- linked = []
13
- current_opinion_id = undefined
14
- for row_index, opinion_id in enumerate(storage.iter_column_values(const.OPINION_ID)):
15
- if current_opinion_id != undefined:
16
- if opinion_id != current_opinion_id:
17
- yield linked
18
- linked = []
19
- current_opinion_id = opinion_id
20
- else:
21
- current_opinion_id = opinion_id
22
-
23
- linked.append(storage.get_row(row_index))
24
-
25
- if len(linked) > 0:
26
- yield linked