arekit 0.25.0__tar.gz → 0.25.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267) hide show
  1. arekit-0.25.2/PKG-INFO +80 -0
  2. {arekit-0.25.0 → arekit-0.25.2}/README.md +6 -4
  3. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/context/terms_mapper.py +5 -2
  4. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/rows/samples.py +8 -12
  5. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/sample/cropped.py +4 -3
  6. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/terms_mapper.py +4 -8
  7. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/storages/base.py +4 -18
  8. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/entities_grouping.py +5 -3
  9. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/parsed/base.py +3 -3
  10. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/parsed/providers/base.py +3 -5
  11. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/parsed/providers/entity_service.py +7 -28
  12. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/parsed/providers/opinion_pairs.py +6 -6
  13. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/parsed/providers/text_opinion_pairs.py +4 -4
  14. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/parsed/service.py +2 -2
  15. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/parser.py +3 -30
  16. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/model/labeling/single.py +7 -3
  17. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/opinions/annot/algo/pair_based.py +9 -5
  18. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/pipeline/base.py +0 -2
  19. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/pipeline/batching.py +0 -3
  20. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/pipeline/items/base.py +1 -1
  21. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/utils.py +11 -8
  22. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/bert/input/providers/cropped_sample.py +2 -5
  23. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/bert/terms/mapper.py +2 -2
  24. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/prompt/sample.py +2 -6
  25. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/bert/samplers.py +4 -2
  26. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
  27. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/data/storages/row_cache.py +2 -1
  28. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/data/storages/sqlite_based.py +2 -1
  29. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +8 -5
  30. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/pipelines/text_opinion/extraction.py +16 -8
  31. arekit-0.25.2/arekit.egg-info/PKG-INFO +80 -0
  32. {arekit-0.25.0 → arekit-0.25.2}/arekit.egg-info/SOURCES.txt +1 -83
  33. arekit-0.25.2/arekit.egg-info/requires.txt +2 -0
  34. {arekit-0.25.0 → arekit-0.25.2}/setup.py +2 -3
  35. arekit-0.25.0/LICENSE +0 -21
  36. arekit-0.25.0/PKG-INFO +0 -82
  37. arekit-0.25.0/arekit/common/data/input/repositories/base.py +0 -68
  38. arekit-0.25.0/arekit/common/data/input/repositories/sample.py +0 -22
  39. arekit-0.25.0/arekit/common/data/views/samples.py +0 -26
  40. arekit-0.25.0/arekit/common/experiment/api/base_samples_io.py +0 -20
  41. arekit-0.25.0/arekit/common/experiment/data_type.py +0 -17
  42. arekit-0.25.0/arekit/common/service/sqlite.py +0 -36
  43. arekit-0.25.0/arekit/contrib/networks/embedding.py +0 -149
  44. arekit-0.25.0/arekit/contrib/networks/embedding_io.py +0 -18
  45. arekit-0.25.0/arekit/contrib/networks/input/const.py +0 -6
  46. arekit-0.25.0/arekit/contrib/networks/input/ctx_serialization.py +0 -28
  47. arekit-0.25.0/arekit/contrib/networks/input/embedding/matrix.py +0 -29
  48. arekit-0.25.0/arekit/contrib/networks/input/embedding/offsets.py +0 -55
  49. arekit-0.25.0/arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
  50. arekit-0.25.0/arekit/contrib/networks/input/providers/sample.py +0 -129
  51. arekit-0.25.0/arekit/contrib/networks/input/providers/term_connotation.py +0 -23
  52. arekit-0.25.0/arekit/contrib/networks/input/providers/text.py +0 -24
  53. arekit-0.25.0/arekit/contrib/networks/input/rows_parser.py +0 -47
  54. arekit-0.25.0/arekit/contrib/networks/input/term_types.py +0 -13
  55. arekit-0.25.0/arekit/contrib/networks/input/terms_mapping.py +0 -60
  56. arekit-0.25.0/arekit/contrib/networks/vectorizer.py +0 -6
  57. arekit-0.25.0/arekit/contrib/utils/data/readers/base.py +0 -7
  58. arekit-0.25.0/arekit/contrib/utils/data/readers/csv_pd.py +0 -38
  59. arekit-0.25.0/arekit/contrib/utils/data/readers/jsonl.py +0 -15
  60. arekit-0.25.0/arekit/contrib/utils/data/readers/sqlite.py +0 -14
  61. arekit-0.25.0/arekit/contrib/utils/data/service/balance.py +0 -50
  62. arekit-0.25.0/arekit/contrib/utils/data/storages/__init__.py +0 -0
  63. arekit-0.25.0/arekit/contrib/utils/data/storages/pandas_based.py +0 -123
  64. arekit-0.25.0/arekit/contrib/utils/data/writers/__init__.py +0 -0
  65. arekit-0.25.0/arekit/contrib/utils/data/writers/csv_native.py +0 -63
  66. arekit-0.25.0/arekit/contrib/utils/data/writers/csv_pd.py +0 -40
  67. arekit-0.25.0/arekit/contrib/utils/data/writers/json_opennre.py +0 -132
  68. arekit-0.25.0/arekit/contrib/utils/data/writers/sqlite_native.py +0 -114
  69. arekit-0.25.0/arekit/contrib/utils/embeddings/__init__.py +0 -0
  70. arekit-0.25.0/arekit/contrib/utils/embeddings/rusvectores.py +0 -58
  71. arekit-0.25.0/arekit/contrib/utils/embeddings/tokens.py +0 -30
  72. arekit-0.25.0/arekit/contrib/utils/entities/__init__.py +0 -0
  73. arekit-0.25.0/arekit/contrib/utils/entities/formatters/__init__.py +0 -0
  74. arekit-0.25.0/arekit/contrib/utils/entities/formatters/str_display.py +0 -11
  75. arekit-0.25.0/arekit/contrib/utils/io_utils/__init__.py +0 -0
  76. arekit-0.25.0/arekit/contrib/utils/io_utils/embedding.py +0 -72
  77. arekit-0.25.0/arekit/contrib/utils/np_utils/__init__.py +0 -0
  78. arekit-0.25.0/arekit/contrib/utils/np_utils/embedding.py +0 -22
  79. arekit-0.25.0/arekit/contrib/utils/np_utils/npz_utils.py +0 -13
  80. arekit-0.25.0/arekit/contrib/utils/np_utils/vocab.py +0 -20
  81. arekit-0.25.0/arekit/contrib/utils/pipelines/__init__.py +0 -0
  82. arekit-0.25.0/arekit/contrib/utils/pipelines/items/__init__.py +0 -0
  83. arekit-0.25.0/arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
  84. arekit-0.25.0/arekit/contrib/utils/pipelines/items/sampling/base.py +0 -94
  85. arekit-0.25.0/arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -55
  86. arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
  87. arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/entities_default.py +0 -23
  88. arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
  89. arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -33
  90. arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -105
  91. arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/translator.py +0 -136
  92. arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
  93. arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
  94. arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
  95. arekit-0.25.0/arekit/contrib/utils/processing/__init__.py +0 -0
  96. arekit-0.25.0/arekit/contrib/utils/processing/languages/__init__.py +0 -0
  97. arekit-0.25.0/arekit/contrib/utils/processing/languages/mods.py +0 -12
  98. arekit-0.25.0/arekit/contrib/utils/processing/languages/pos.py +0 -23
  99. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
  100. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
  101. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
  102. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
  103. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/number.py +0 -23
  104. arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
  105. arekit-0.25.0/arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
  106. arekit-0.25.0/arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
  107. arekit-0.25.0/arekit/contrib/utils/processing/pos/__init__.py +0 -0
  108. arekit-0.25.0/arekit/contrib/utils/processing/pos/base.py +0 -12
  109. arekit-0.25.0/arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
  110. arekit-0.25.0/arekit/contrib/utils/processing/pos/russian.py +0 -10
  111. arekit-0.25.0/arekit/contrib/utils/processing/text/__init__.py +0 -0
  112. arekit-0.25.0/arekit/contrib/utils/processing/text/tokens.py +0 -127
  113. arekit-0.25.0/arekit/contrib/utils/serializer.py +0 -42
  114. arekit-0.25.0/arekit/contrib/utils/synonyms/__init__.py +0 -0
  115. arekit-0.25.0/arekit/contrib/utils/vectorizers/__init__.py +0 -0
  116. arekit-0.25.0/arekit/contrib/utils/vectorizers/bpe.py +0 -93
  117. arekit-0.25.0/arekit/contrib/utils/vectorizers/random_norm.py +0 -39
  118. arekit-0.25.0/arekit.egg-info/PKG-INFO +0 -82
  119. arekit-0.25.0/arekit.egg-info/requires.txt +0 -4
  120. {arekit-0.25.0 → arekit-0.25.2}/arekit/__init__.py +0 -0
  121. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/__init__.py +0 -0
  122. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/bound.py +0 -0
  123. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/context/__init__.py +0 -0
  124. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/context/token.py +0 -0
  125. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/__init__.py +0 -0
  126. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/const.py +0 -0
  127. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/doc_provider.py +0 -0
  128. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/__init__.py +0 -0
  129. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/__init__.py +0 -0
  130. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/columns/__init__.py +0 -0
  131. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/columns/base.py +0 -0
  132. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/columns/sample.py +0 -0
  133. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/const.py +0 -0
  134. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/contents.py +0 -0
  135. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/instances/__init__.py +0 -0
  136. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/instances/base.py +0 -0
  137. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/instances/multiple.py +0 -0
  138. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/instances/single.py +0 -0
  139. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/label/__init__.py +0 -0
  140. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/label/base.py +0 -0
  141. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/label/binary.py +0 -0
  142. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/label/multiple.py +0 -0
  143. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/rows/__init__.py +0 -0
  144. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/rows/base.py +0 -0
  145. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/sample/__init__.py +0 -0
  146. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/text/__init__.py +0 -0
  147. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/providers/text/single.py +0 -0
  148. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/input/sample.py +0 -0
  149. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/rows_fmt.py +0 -0
  150. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/data/rows_parser.py +0 -0
  151. {arekit-0.25.0/arekit/common/data/input/repositories → arekit-0.25.2/arekit/common/data/storages}/__init__.py +0 -0
  152. {arekit-0.25.0/arekit/common/data/storages → arekit-0.25.2/arekit/common/docs}/__init__.py +0 -0
  153. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/base.py +0 -0
  154. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/entity.py +0 -0
  155. {arekit-0.25.0/arekit/common/data/views → arekit-0.25.2/arekit/common/docs/parsed}/__init__.py +0 -0
  156. {arekit-0.25.0/arekit/common/docs → arekit-0.25.2/arekit/common/docs/parsed/providers}/__init__.py +0 -0
  157. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/parsed/providers/base_pairs.py +0 -0
  158. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/parsed/term_position.py +0 -0
  159. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/docs/sentence.py +0 -0
  160. {arekit-0.25.0/arekit/common/docs/parsed → arekit-0.25.2/arekit/common/entities}/__init__.py +0 -0
  161. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/entities/base.py +0 -0
  162. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/entities/collection.py +0 -0
  163. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/entities/str_fmt.py +0 -0
  164. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/entities/types.py +0 -0
  165. {arekit-0.25.0/arekit/common/docs/parsed/providers → arekit-0.25.2/arekit/common/frames}/__init__.py +0 -0
  166. {arekit-0.25.0/arekit/common/entities → arekit-0.25.2/arekit/common/frames/connotations}/__init__.py +0 -0
  167. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/frames/connotations/descriptor.py +0 -0
  168. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/frames/connotations/provider.py +0 -0
  169. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/frames/text_variant.py +0 -0
  170. {arekit-0.25.0/arekit/common/experiment → arekit-0.25.2/arekit/common/frames/variants}/__init__.py +0 -0
  171. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/frames/variants/base.py +0 -0
  172. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/frames/variants/collection.py +0 -0
  173. {arekit-0.25.0/arekit/common/experiment/api → arekit-0.25.2/arekit/common/labels}/__init__.py +0 -0
  174. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/labels/base.py +0 -0
  175. {arekit-0.25.0/arekit/common/frames → arekit-0.25.2/arekit/common/labels/provider}/__init__.py +0 -0
  176. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/labels/provider/base.py +0 -0
  177. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/labels/provider/constant.py +0 -0
  178. {arekit-0.25.0/arekit/common/frames/connotations → arekit-0.25.2/arekit/common/labels/scaler}/__init__.py +0 -0
  179. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/labels/scaler/base.py +0 -0
  180. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/labels/scaler/sentiment.py +0 -0
  181. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/labels/scaler/single.py +0 -0
  182. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/labels/str_fmt.py +0 -0
  183. {arekit-0.25.0/arekit/common/frames/variants → arekit-0.25.2/arekit/common/linkage}/__init__.py +0 -0
  184. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/linkage/base.py +0 -0
  185. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/linkage/meta.py +0 -0
  186. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/linkage/opinions.py +0 -0
  187. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/linkage/text_opinions.py +0 -0
  188. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/log_utils.py +0 -0
  189. {arekit-0.25.0/arekit/common/labels → arekit-0.25.2/arekit/common/model}/__init__.py +0 -0
  190. {arekit-0.25.0/arekit/common/labels/provider → arekit-0.25.2/arekit/common/model/labeling}/__init__.py +0 -0
  191. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/model/labeling/base.py +0 -0
  192. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/model/labeling/modes.py +0 -0
  193. {arekit-0.25.0/arekit/common/labels/scaler → arekit-0.25.2/arekit/common/opinions}/__init__.py +0 -0
  194. {arekit-0.25.0/arekit/common/linkage → arekit-0.25.2/arekit/common/opinions/annot}/__init__.py +0 -0
  195. {arekit-0.25.0/arekit/common/model → arekit-0.25.2/arekit/common/opinions/annot/algo}/__init__.py +0 -0
  196. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/opinions/annot/algo/base.py +0 -0
  197. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/opinions/annot/algo/predefined.py +0 -0
  198. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/opinions/annot/algo_based.py +0 -0
  199. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/opinions/annot/base.py +0 -0
  200. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/opinions/base.py +0 -0
  201. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/opinions/collection.py +0 -0
  202. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/opinions/enums.py +0 -0
  203. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/opinions/provider.py +0 -0
  204. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/opinions/writer.py +0 -0
  205. {arekit-0.25.0/arekit/common/model/labeling → arekit-0.25.2/arekit/common/pipeline}/__init__.py +0 -0
  206. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/pipeline/context.py +0 -0
  207. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/pipeline/conts.py +0 -0
  208. {arekit-0.25.0/arekit/common/opinions → arekit-0.25.2/arekit/common/pipeline/items}/__init__.py +0 -0
  209. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/pipeline/items/flatten.py +0 -0
  210. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/pipeline/items/handle.py +0 -0
  211. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/pipeline/items/iter.py +0 -0
  212. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/pipeline/items/map.py +0 -0
  213. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/pipeline/items/map_nested.py +0 -0
  214. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/pipeline/utils.py +0 -0
  215. {arekit-0.25.0/arekit/common/opinions/annot → arekit-0.25.2/arekit/common/synonyms}/__init__.py +0 -0
  216. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/synonyms/base.py +0 -0
  217. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/synonyms/grouping.py +0 -0
  218. {arekit-0.25.0/arekit/common/opinions/annot/algo → arekit-0.25.2/arekit/common/text}/__init__.py +0 -0
  219. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/text/enums.py +0 -0
  220. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/text/parsed.py +0 -0
  221. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/text/partitioning.py +0 -0
  222. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/text/stemmer.py +0 -0
  223. {arekit-0.25.0/arekit/common/pipeline → arekit-0.25.2/arekit/common/text_opinions}/__init__.py +0 -0
  224. {arekit-0.25.0 → arekit-0.25.2}/arekit/common/text_opinions/base.py +0 -0
  225. {arekit-0.25.0/arekit/common/pipeline/items → arekit-0.25.2/arekit/contrib}/__init__.py +0 -0
  226. {arekit-0.25.0/arekit/common/service → arekit-0.25.2/arekit/contrib/bert}/__init__.py +0 -0
  227. {arekit-0.25.0/arekit/common/synonyms → arekit-0.25.2/arekit/contrib/bert/input}/__init__.py +0 -0
  228. {arekit-0.25.0/arekit/common/text → arekit-0.25.2/arekit/contrib/bert/input/providers}/__init__.py +0 -0
  229. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/bert/input/providers/text_pair.py +0 -0
  230. {arekit-0.25.0/arekit/common/text_opinions → arekit-0.25.2/arekit/contrib/bert/terms}/__init__.py +0 -0
  231. {arekit-0.25.0/arekit/contrib → arekit-0.25.2/arekit/contrib/prompt}/__init__.py +0 -0
  232. {arekit-0.25.0/arekit/contrib/bert → arekit-0.25.2/arekit/contrib/utils}/__init__.py +0 -0
  233. {arekit-0.25.0/arekit/contrib/bert/input → arekit-0.25.2/arekit/contrib/utils/bert}/__init__.py +0 -0
  234. {arekit-0.25.0/arekit/contrib/bert/input/providers → arekit-0.25.2/arekit/contrib/utils/data}/__init__.py +0 -0
  235. {arekit-0.25.0/arekit/contrib/bert/terms → arekit-0.25.2/arekit/contrib/utils/data/contents}/__init__.py +0 -0
  236. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/data/contents/opinions.py +0 -0
  237. {arekit-0.25.0/arekit/contrib/networks → arekit-0.25.2/arekit/contrib/utils/data/doc_provider}/__init__.py +0 -0
  238. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/data/doc_provider/dict_based.py +0 -0
  239. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/data/doc_provider/dir_based.py +0 -0
  240. {arekit-0.25.0/arekit/contrib/networks/input → arekit-0.25.2/arekit/contrib/utils/data/storages}/__init__.py +0 -0
  241. {arekit-0.25.0/arekit/contrib/networks/input/embedding → arekit-0.25.2/arekit/contrib/utils/data/writers}/__init__.py +0 -0
  242. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/data/writers/base.py +0 -0
  243. {arekit-0.25.0/arekit/contrib/networks/input/formatters → arekit-0.25.2/arekit/contrib/utils/entities}/__init__.py +0 -0
  244. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/entities/filter.py +0 -0
  245. {arekit-0.25.0/arekit/contrib/networks/input/providers → arekit-0.25.2/arekit/contrib/utils/entities/formatters}/__init__.py +0 -0
  246. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +0 -0
  247. {arekit-0.25.0/arekit/contrib/prompt → arekit-0.25.2/arekit/contrib/utils/io_utils}/__init__.py +0 -0
  248. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/io_utils/utils.py +0 -0
  249. {arekit-0.25.0/arekit/contrib/utils → arekit-0.25.2/arekit/contrib/utils/pipelines}/__init__.py +0 -0
  250. {arekit-0.25.0/arekit/contrib/utils/bert → arekit-0.25.2/arekit/contrib/utils/pipelines/items}/__init__.py +0 -0
  251. {arekit-0.25.0/arekit/contrib/utils/data → arekit-0.25.2/arekit/contrib/utils/pipelines/items/text}/__init__.py +0 -0
  252. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/pipelines/items/text/frames.py +0 -0
  253. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/pipelines/opinion_collections.py +0 -0
  254. {arekit-0.25.0/arekit/contrib/utils/data/contents → arekit-0.25.2/arekit/contrib/utils/pipelines/text_opinion}/__init__.py +0 -0
  255. {arekit-0.25.0/arekit/contrib/utils/data/doc_provider → arekit-0.25.2/arekit/contrib/utils/pipelines/text_opinion/annot}/__init__.py +0 -0
  256. {arekit-0.25.0/arekit/contrib/utils/data/readers → arekit-0.25.2/arekit/contrib/utils/pipelines/text_opinion/filters}/__init__.py +0 -0
  257. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/pipelines/text_opinion/filters/base.py +0 -0
  258. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +0 -0
  259. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +0 -0
  260. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +0 -0
  261. {arekit-0.25.0/arekit/contrib/utils/data/service → arekit-0.25.2/arekit/contrib/utils/synonyms}/__init__.py +0 -0
  262. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/synonyms/simple.py +0 -0
  263. {arekit-0.25.0 → arekit-0.25.2}/arekit/contrib/utils/synonyms/stemmer_based.py +0 -0
  264. {arekit-0.25.0 → arekit-0.25.2}/arekit.egg-info/dependency_links.txt +0 -0
  265. {arekit-0.25.0 → arekit-0.25.2}/arekit.egg-info/top_level.txt +0 -0
  266. {arekit-0.25.0 → arekit-0.25.2}/logo.png +0 -0
  267. {arekit-0.25.0 → arekit-0.25.2}/setup.cfg +0 -0
arekit-0.25.2/PKG-INFO ADDED
@@ -0,0 +1,80 @@
1
+ Metadata-Version: 2.1
2
+ Name: arekit
3
+ Version: 0.25.2
4
+ Summary: Document level Attitude and Relation Extraction toolkit (AREkit) for sampling and prompting mass-media news into datasets for ML-model training
5
+ Home-page: https://github.com/nicolay-r/AREkit
6
+ Author: Nicolay Rusnachenko
7
+ Author-email: rusnicolay@gmail.com
8
+ License: MIT License
9
+ Description: # AREkit 0.25.2
10
+
11
+ ![](https://img.shields.io/badge/Python-3.9+-brightgreen.svg)
12
+ [![PyPI downloads](https://img.shields.io/pypi/dm/arekit.svg)](https://pypistats.org/packages/arekit)
13
+
14
+
15
+ <p align="center">
16
+ <img src="logo.png"/>
17
+ </p>
18
+
19
+ **AREkit** (Attitude and Relation Extraction Toolkit) --
20
+ is a python toolkit, devoted to document level Attitude and Relation Extraction between text objects from mass-media news.
21
+
22
+ ## Description
23
+
24
+
25
+ This toolkit aims at memory-effective data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
26
+
27
+ <p align="center">
28
+ <img src="docs/arekit-pipeline-concept.png"/>
29
+ </p>
30
+
31
+ > Figure: AREkit pipelines design. More on
32
+ > **[ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction](https://link.springer.com/chapter/10.1007/978-3-031-56069-9_23)** paper
33
+
34
+ In particular, this framework serves the following features:
35
+ * ➿ [pipelines](https://github.com/nicolay-r/AREkit/wiki/Pipelines:-Text-Opinion-Annotation) and iterators for handling large-scale collections serialization without out-of-memory issues.
36
+ * 🔗 EL (entity-linking) API support for objects,
37
+ * ➰ avoidance of cyclic connections,
38
+ * 📏 distance consideration between relation participants (in `terms` or `sentences`),
39
+ * 📑 relations annotations and filtering rules,
40
+ * *️⃣ entities formatting or masking, and more.
41
+
42
+ The core functionality includes:
43
+ * API for document presentation with EL (Entity Linking, i.e. Object Synonymy) support
44
+ for sentence level relations preparation (dubbed as contexts);
45
+ * API for contexts extraction;
46
+ * Relations transferring from sentence-level onto document-level, and more.
47
+
48
+ ## Installation
49
+
50
+ ```bash
51
+ pip install git+https://github.com/nicolay-r/AREkit.git@0.25.2-rc
52
+ ```
53
+
54
+ ## Usage
55
+
56
+ Please follow the **[tutorial section on project Wiki](https://github.com/nicolay-r/AREkit/wiki/Tutorials)** for more details.
57
+
58
+ ## How to cite
59
+ Great research is also accompanied by faithful references.
60
+ If you use or extend our work, please cite as follows:
61
+
62
+ ```bibtex
63
+ @inproceedings{rusnachenko2024arelight,
64
+ title={ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction},
65
+ author={Rusnachenko, Nicolay and Liang, Huizhi and Kolomeets, Maxim and Shi, Lei},
66
+ booktitle={European Conference on Information Retrieval},
67
+ year={2024},
68
+ organization={Springer}
69
+ }
70
+ ```
71
+
72
+ Keywords: natural language processing,relation extraction,sentiment analysis
73
+ Platform: UNKNOWN
74
+ Classifier: Programming Language :: Python
75
+ Classifier: Programming Language :: Python :: 3.6
76
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
77
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
78
+ Classifier: Topic :: Text Processing :: Linguistic
79
+ Requires-Python: >=3.6
80
+ Description-Content-Type: text/markdown
@@ -1,6 +1,8 @@
1
- # AREkit 0.25.0
1
+ # AREkit 0.25.2
2
2
 
3
3
  ![](https://img.shields.io/badge/Python-3.9+-brightgreen.svg)
4
+ [![PyPI downloads](https://img.shields.io/pypi/dm/arekit.svg)](https://pypistats.org/packages/arekit)
5
+
4
6
 
5
7
  <p align="center">
6
8
  <img src="logo.png"/>
@@ -12,7 +14,7 @@ is a python toolkit, devoted to document level Attitude and Relation Extraction
12
14
  ## Description
13
15
 
14
16
 
15
- This toolkit aims at memory-effective data processing in Relation Extraction (RE) related tasks.
17
+ This toolkit aims at memory-effective data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
16
18
 
17
19
  <p align="center">
18
20
  <img src="docs/arekit-pipeline-concept.png"/>
@@ -38,7 +40,7 @@ for sentence level relations preparation (dubbed as contexts);
38
40
  ## Installation
39
41
 
40
42
  ```bash
41
- pip install git+https://github.com/nicolay-r/AREkit.git@0.25.0-rc
43
+ pip install git+https://github.com/nicolay-r/AREkit.git@0.25.2-rc
42
44
  ```
43
45
 
44
46
  ## Usage
@@ -57,4 +59,4 @@ if you use or extend our work, please cite as follows:
57
59
  year={2024},
58
60
  organization={Springer}
59
61
  }
60
- ```
62
+ ```
@@ -1,12 +1,15 @@
1
1
  from collections.abc import Iterable
2
2
 
3
3
  from arekit.common.context.token import Token
4
- from arekit.common.entities.base import Entity
5
4
  from arekit.common.frames.text_variant import TextFrameVariant
6
5
 
7
6
 
8
7
  class TextTermsMapper(object):
9
8
 
9
+ def __init__(self, is_entity_func):
10
+ assert(callable(is_entity_func))
11
+ self.__is_entity_func = is_entity_func
12
+
10
13
  def iter_mapped(self, terms):
11
14
  """ Performs mapping operation of each terms in a sequence
12
15
  """
@@ -22,7 +25,7 @@ class TextTermsMapper(object):
22
25
  m_term = self.map_token(i, term)
23
26
  elif isinstance(term, TextFrameVariant):
24
27
  m_term = self.map_text_frame_variant(i, term)
25
- elif isinstance(term, Entity):
28
+ elif self.__is_entity_func(term):
26
29
  m_term = self.map_entity(i, term)
27
30
  else:
28
31
  raise Exception("Unsupported type {}".format(term))
@@ -9,13 +9,11 @@ from arekit.common.data.input.providers.label.multiple import MultipleLabelProvi
9
9
  from arekit.common.data.input.providers.rows.base import BaseRowProvider
10
10
  from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
11
11
  from arekit.common.data.rows_fmt import create_base_column_fmt
12
- from arekit.common.entities.base import Entity
13
- from arekit.common.labels.base import Label
14
-
15
- from arekit.common.linkage.text_opinions import TextOpinionsLinkage
16
12
  from arekit.common.docs.parsed.base import ParsedDocument
17
13
  from arekit.common.docs.parsed.providers.entity_service import EntityEndType, EntityServiceProvider
18
14
  from arekit.common.docs.parsed.term_position import TermPositionTypes
15
+ from arekit.common.labels.base import Label
16
+ from arekit.common.linkage.text_opinions import TextOpinionsLinkage
19
17
  from arekit.common.text_opinions.base import TextOpinion
20
18
 
21
19
 
@@ -26,13 +24,15 @@ class BaseSampleRowProvider(BaseRowProvider):
26
24
  """ Rows provider for samples storage.
27
25
  """
28
26
 
29
- def __init__(self, label_provider, text_provider):
27
+ def __init__(self, is_entity_func, label_provider, text_provider):
28
+ assert(callable(is_entity_func))
30
29
  assert(isinstance(label_provider, LabelProvider))
31
30
  assert(isinstance(text_provider, BaseSingleTextProvider))
32
31
  super(BaseSampleRowProvider, self).__init__()
33
32
 
34
33
  self._label_provider = label_provider
35
34
  self.__text_provider = text_provider
35
+ self.__is_entity_func = is_entity_func
36
36
  self.__instances_provider = self.__create_instances_provider(label_provider)
37
37
  self.__store_labels = None
38
38
  self._val_fmt = create_base_column_fmt(fmt_type="writer")
@@ -65,7 +65,7 @@ class BaseSampleRowProvider(BaseRowProvider):
65
65
  parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
66
66
 
67
67
  # Entity indices from the related context.
68
- entities = list(filter(lambda term: isinstance(term, Entity), sentence_terms))
68
+ entities = list(filter(self.__is_entity_func, sentence_terms))
69
69
 
70
70
  # Values mapping.
71
71
  vm = {
@@ -76,7 +76,7 @@ class BaseSampleRowProvider(BaseRowProvider):
76
76
  const.SENT_IND: sentence_ind,
77
77
  const.ENTITY_VALUES: entities,
78
78
  const.ENTITY_TYPES: entities,
79
- const.ENTITIES: [str(i) for i, t in enumerate(sentence_terms) if isinstance(t, Entity)],
79
+ const.ENTITIES: [str(i) for i, t in enumerate(sentence_terms) if self.__is_entity_func(t)],
80
80
  const.S_IND: actual_s_ind,
81
81
  const.T_IND: actual_t_ind,
82
82
  const.LABEL_UINT: None,
@@ -143,9 +143,6 @@ class BaseSampleRowProvider(BaseRowProvider):
143
143
 
144
144
  def __provide_rows(self, row_dict, parsed_doc, entity_service,
145
145
  text_opinion_linkage, index_in_linked, idle_mode):
146
- """
147
- Providing Rows depending on row_id_formatter type
148
- """
149
146
  assert(isinstance(parsed_doc, ParsedDocument))
150
147
  assert(isinstance(row_dict, OrderedDict))
151
148
  assert(isinstance(text_opinion_linkage, TextOpinionsLinkage))
@@ -153,7 +150,6 @@ class BaseSampleRowProvider(BaseRowProvider):
153
150
  etalon_label = self.__instances_provider.provide_label(text_opinion_linkage)
154
151
  for instance in self.__instances_provider.iter_instances(text_opinion_linkage):
155
152
  yield self.__create_row(row=row_dict,
156
- row_id=0,
157
153
  parsed_doc=parsed_doc,
158
154
  entity_service=entity_service,
159
155
  text_opinions_linkage=instance,
@@ -162,7 +158,7 @@ class BaseSampleRowProvider(BaseRowProvider):
162
158
  etalon_label=etalon_label,
163
159
  idle_mode=idle_mode)
164
160
 
165
- def __create_row(self, row, row_id, parsed_doc, entity_service, text_opinions_linkage,
161
+ def __create_row(self, row, parsed_doc, entity_service, text_opinions_linkage,
166
162
  index_in_linked, etalon_label, idle_mode):
167
163
  """
168
164
  Composing row in following format:
@@ -8,10 +8,11 @@ class CroppedSampleRowProvider(BaseSampleRowProvider):
8
8
  attitude inside.
9
9
  """
10
10
 
11
- def __init__(self, crop_window_size, label_scaler, text_provider):
11
+ def __init__(self, crop_window_size, label_scaler, **kwargs):
12
12
  assert(isinstance(crop_window_size, int) and crop_window_size > 0)
13
- super(CroppedSampleRowProvider, self).__init__(label_provider=MultipleLabelProvider(label_scaler),
14
- text_provider=text_provider)
13
+ super(CroppedSampleRowProvider, self).__init__(
14
+ label_provider=MultipleLabelProvider(label_scaler),
15
+ **kwargs)
15
16
  self.__crop_window_size = crop_window_size
16
17
 
17
18
  @staticmethod
@@ -1,6 +1,5 @@
1
1
  from arekit.common.context.terms_mapper import TextTermsMapper
2
2
  from arekit.common.context.token import Token
3
- from arekit.common.entities.base import Entity
4
3
  from arekit.common.entities.str_fmt import StringEntitiesFormatter
5
4
  from arekit.common.entities.types import OpinionEntityType
6
5
  from arekit.common.frames.text_variant import TextFrameVariant
@@ -12,9 +11,12 @@ class OpinionContainingTextTermsMapper(TextTermsMapper):
12
11
  The latter might be utilized with synonyms collection
13
12
  """
14
13
 
15
- def __init__(self, entity_formatter):
14
+ def __init__(self, entity_formatter, entity_group_ind_func, **kwargs):
16
15
  assert(isinstance(entity_formatter, StringEntitiesFormatter))
16
+ assert(callable(entity_group_ind_func))
17
+ super(OpinionContainingTextTermsMapper, self).__init__(**kwargs)
17
18
  self.__entities_formatter = entity_formatter
19
+ self.__syn_group = entity_group_ind_func
18
20
  self.__s_ind = None
19
21
  self.__t_ind = None
20
22
  self.__s_group = None
@@ -24,12 +26,6 @@ class OpinionContainingTextTermsMapper(TextTermsMapper):
24
26
  def StringEntitiesFormatter(self):
25
27
  return self.__entities_formatter
26
28
 
27
- def __syn_group(self, entity):
28
- """ Note: here we guarantee that entity has GroupIndex.
29
- """
30
- assert(isinstance(entity, Entity))
31
- return entity.GroupIndex if entity is not None else None
32
-
33
29
  def set_s_ind(self, s_ind):
34
30
  assert(isinstance(s_ind, int))
35
31
  self.__s_ind = s_ind
@@ -10,6 +10,9 @@ logger = logging.getLogger(__name__)
10
10
 
11
11
  class BaseRowsStorage(object):
12
12
 
13
+ def __init__(self, log_out=None):
14
+ self.__log_out = log_out
15
+
13
16
  # region protected methods
14
17
 
15
18
  def _begin_filling_row(self, row_ind):
@@ -31,27 +34,9 @@ class BaseRowsStorage(object):
31
34
  def _get_rows_count(self):
32
35
  raise NotImplemented()
33
36
 
34
- def find_by_value(self, column_name, value):
35
- raise NotImplemented()
36
-
37
- def find_first_by_value(self, column_name, value):
38
- raise NotImplemented()
39
-
40
- def iter_column_values(self, column_name, dtype=None):
41
- raise NotImplemented()
42
-
43
- def get_row(self, row_index):
44
- raise NotImplemented()
45
-
46
- def get_cell(self, row_index, column_name):
47
- raise NotImplemented()
48
-
49
37
  def init_empty(self, columns_provider):
50
38
  raise NotImplemented()
51
39
 
52
- def iter_shuffled(self):
53
- raise NotImplemented()
54
-
55
40
  def iter_column_names(self):
56
41
  raise NotImplemented()
57
42
 
@@ -81,6 +66,7 @@ class BaseRowsStorage(object):
81
66
  condition_func=lambda item: not isinstance(item[1], MetaEmptyLinkedDataWrapper),
82
67
  postfix_func=postfix_func,
83
68
  desc="{fmt}".format(fmt=desc),
69
+ file=self.__log_out,
84
70
  total=rows_count)
85
71
 
86
72
  for row_index, item in enumerate(pbar_it):
@@ -1,17 +1,19 @@
1
- from arekit.common.entities.base import Entity
2
1
  from arekit.common.pipeline.items.base import BasePipelineItem
3
2
 
4
3
 
5
4
  class EntitiesGroupingPipelineItem(BasePipelineItem):
6
5
 
7
- def __init__(self, value_to_group_id_func, **kwargs):
6
+ def __init__(self, value_to_group_id_func, is_entity_func, **kwargs):
8
7
  assert(callable(value_to_group_id_func))
8
+ assert(callable(is_entity_func))
9
9
  super(EntitiesGroupingPipelineItem, self).__init__(**kwargs)
10
+
10
11
  self.__value_to_group_id_func = value_to_group_id_func
12
+ self.__is_entity_func = is_entity_func
11
13
 
12
14
  def apply_core(self, input_data, pipeline_ctx):
13
15
  assert(isinstance(input_data, list))
14
16
 
15
- for entity in filter(lambda term: isinstance(term, Entity), input_data):
17
+ for entity in filter(lambda term: self.__is_entity_func(term), input_data):
16
18
  group_index = self.__value_to_group_id_func(entity.Value)
17
19
  entity.set_group_index(group_index)
@@ -1,6 +1,5 @@
1
1
  from collections.abc import Iterable
2
2
 
3
- from arekit.common.entities.base import Entity
4
3
  from arekit.common.text.enums import TermFormat
5
4
  from arekit.common.text.parsed import BaseParsedText
6
5
 
@@ -73,8 +72,9 @@ class ParsedDocument(object):
73
72
  assert(isinstance(s_ind, int))
74
73
  return self.__parsed_sentences[s_ind]
75
74
 
76
- def iter_entities(self):
77
- for entity in self.__iter_all_raw_terms(term_only=True, filter_func=lambda t: isinstance(t, Entity)):
75
+ def iter_entities(self, is_entity_func):
76
+ assert(callable(is_entity_func))
77
+ for entity in self.__iter_all_raw_terms(term_only=True, filter_func=is_entity_func):
78
78
  yield entity
79
79
 
80
80
  def iter_terms(self, filter_func=None, term_only=True):
@@ -1,4 +1,3 @@
1
- from arekit.common.entities.base import Entity
2
1
  from arekit.common.docs.entity import DocumentEntity
3
2
  from arekit.common.docs.parsed.base import ParsedDocument
4
3
 
@@ -6,7 +5,7 @@ from arekit.common.docs.parsed.base import ParsedDocument
6
5
  class BaseParsedDocumentServiceProvider(object):
7
6
 
8
7
  def __init__(self, entity_index_func=None):
9
- """ Outside enity indexing function
8
+ """ Outside entity indexing function
10
9
  entity_index_func: provides id for a given entity, i.e.
11
10
  func(entity) -> int (id)
12
11
  """
@@ -19,7 +18,7 @@ class BaseParsedDocumentServiceProvider(object):
19
18
  def Name(self):
20
19
  raise NotImplementedError()
21
20
 
22
- def init_parsed_doc(self, parsed_doc):
21
+ def init_parsed_doc(self, parsed_doc, is_entity_func):
23
22
  assert(isinstance(parsed_doc, ParsedDocument))
24
23
 
25
24
  def __iter_childs_and_root_node(entity):
@@ -37,7 +36,7 @@ class BaseParsedDocumentServiceProvider(object):
37
36
  self.__entity_map.clear()
38
37
 
39
38
  current_id = 0
40
- for _, entity in enumerate(parsed_doc.iter_entities()):
39
+ for _, entity in enumerate(parsed_doc.iter_entities(is_entity_func=is_entity_func)):
41
40
 
42
41
  child_doc_entities = []
43
42
  for tree_entity, is_child in __iter_childs_and_root_node(entity):
@@ -61,7 +60,6 @@ class BaseParsedDocumentServiceProvider(object):
61
60
  def get_document_entity(self, entity):
62
61
  """ Maps entity to the related one with DocumentEntity type
63
62
  """
64
- assert(isinstance(entity, Entity))
65
63
  return self.__entity_map[self.__entity_index_func(entity)]
66
64
 
67
65
  def contains_entity(self, entity):
@@ -1,8 +1,6 @@
1
1
  from enum import Enum
2
2
 
3
- from arekit.common.entities.base import Entity
4
3
  from arekit.common.docs.entity import DocumentEntity
5
- from arekit.common.docs.parsed.base import ParsedDocument
6
4
  from arekit.common.docs.parsed.providers.base import BaseParsedDocumentServiceProvider
7
5
  from arekit.common.docs.parsed.term_position import TermPositionTypes, TermPosition
8
6
  from arekit.common.text_opinions.base import TextOpinion
@@ -41,9 +39,8 @@ class EntityServiceProvider(BaseParsedDocumentServiceProvider):
41
39
 
42
40
  NAME = "entity-service-provider"
43
41
 
44
- def __init__(self, entity_index_func):
45
- assert(callable(entity_index_func))
46
- super(EntityServiceProvider, self).__init__(entity_index_func=entity_index_func)
42
+ def __init__(self, **kwargs):
43
+ super(EntityServiceProvider, self).__init__(**kwargs)
47
44
  # Initialize API.
48
45
  self.__iter_raw_terms_func = None
49
46
  # Initialize entity positions.
@@ -53,24 +50,16 @@ class EntityServiceProvider(BaseParsedDocumentServiceProvider):
53
50
  def Name(self):
54
51
  return self.NAME
55
52
 
56
- def init_parsed_doc(self, parsed_doc):
57
- super(EntityServiceProvider, self).init_parsed_doc(parsed_doc)
58
- assert(isinstance(parsed_doc, ParsedDocument))
53
+ def init_parsed_doc(self, parsed_doc, is_entity_func):
54
+ super(EntityServiceProvider, self).init_parsed_doc(parsed_doc=parsed_doc, is_entity_func=is_entity_func)
59
55
  self.__iter_raw_terms_func = lambda: parsed_doc.iter_terms(filter_func=None, term_only=False)
60
- self.__entity_positions = self.__calculate_entity_positions()
61
-
62
- # region public 'extract' methods
63
-
64
- def extract_entity_value(self, text_opinion, end_type):
65
- return self.__extract_entity_value(text_opinion=text_opinion, end_type=end_type)
56
+ self.__entity_positions = self.__calculate_entity_positions(is_entity_func=is_entity_func)
66
57
 
67
58
  def extract_entity_position(self, text_opinion, end_type, position_type=None):
68
59
  return self.__get_entity_position(text_opinion=text_opinion,
69
60
  end_type=end_type,
70
61
  position_type=position_type)
71
62
 
72
- # endregion
73
-
74
63
  # region public 'calculate' methods
75
64
 
76
65
  @staticmethod
@@ -112,20 +101,10 @@ class EntityServiceProvider(BaseParsedDocumentServiceProvider):
112
101
 
113
102
  return e_pos.get_index(position_type)
114
103
 
115
- def get_entity_value(self, id_in_document):
116
- entity = self._doc_entities[id_in_document]
117
- assert(isinstance(entity, Entity))
118
- return entity.Value
119
-
120
104
  # endregion
121
105
 
122
106
  # region private methods
123
107
 
124
- def __extract_entity_value(self, text_opinion, end_type):
125
- assert(isinstance(text_opinion, TextOpinion))
126
- end_id = self.__get_end_id(text_opinion=text_opinion, end_type=end_type)
127
- return self.get_entity_value(end_id)
128
-
129
108
  def __get_entity_position(self, text_opinion, end_type, position_type=None):
130
109
  assert(isinstance(text_opinion, TextOpinion))
131
110
  end_id = self.__get_end_id(text_opinion=text_opinion, end_type=end_type)
@@ -147,7 +126,7 @@ class EntityServiceProvider(BaseParsedDocumentServiceProvider):
147
126
  assert(end_type == EntityEndType.Source or end_type == EntityEndType.Target)
148
127
  return text_opinion.SourceId if end_type == EntityEndType.Source else text_opinion.TargetId
149
128
 
150
- def __calculate_entity_positions(self):
129
+ def __calculate_entity_positions(self, is_entity_func):
151
130
  """ Note: here we consider the same order as in self._entities.
152
131
  """
153
132
  t_ind_in_doc = -1
@@ -157,7 +136,7 @@ class EntityServiceProvider(BaseParsedDocumentServiceProvider):
157
136
 
158
137
  t_ind_in_doc += 1
159
138
 
160
- if not isinstance(term, Entity):
139
+ if not is_entity_func(term):
161
140
  continue
162
141
 
163
142
  # We consider that entities within a single tree has the same positions.
@@ -1,4 +1,3 @@
1
- from arekit.common.entities.base import Entity
2
1
  from arekit.common.docs.parsed.providers.base_pairs import BasePairProvider
3
2
  from arekit.common.opinions.base import Opinion
4
3
 
@@ -7,14 +6,15 @@ class OpinionPairsProvider(BasePairProvider):
7
6
 
8
7
  NAME = "opinion-pairs-provider"
9
8
 
9
+ def __init__(self, entity_value_func, **kwargs):
10
+ super(OpinionPairsProvider, self).__init__(**kwargs)
11
+ self.__entity_value_func = entity_value_func
12
+
10
13
  @property
11
14
  def Name(self):
12
15
  return self.NAME
13
16
 
14
17
  def _create_pair(self, source_entity, target_entity, label):
15
- assert(isinstance(source_entity, Entity))
16
- assert(isinstance(target_entity, Entity))
17
-
18
- return Opinion(source_value=source_entity.Value,
19
- target_value=target_entity.Value,
18
+ return Opinion(source_value=self.__entity_value_func(source_entity),
19
+ target_value=self.__entity_value_func(target_entity),
20
20
  label=label)
@@ -16,8 +16,8 @@ class TextOpinionPairsProvider(BasePairProvider):
16
16
 
17
17
  NAME = "text-opinion-pairs-provider"
18
18
 
19
- def __init__(self, value_to_group_id_func):
20
- super(TextOpinionPairsProvider, self).__init__()
19
+ def __init__(self, value_to_group_id_func, **kwargs):
20
+ super(TextOpinionPairsProvider, self).__init__(**kwargs)
21
21
  self.__value_to_group_id_func = value_to_group_id_func
22
22
  self.__doc_id = None
23
23
  self.__entities_collection = None
@@ -36,8 +36,8 @@ class TextOpinionPairsProvider(BasePairProvider):
36
36
  label=label,
37
37
  text_opinion_id=None)
38
38
 
39
- def init_parsed_doc(self, parsed_doc):
40
- super(TextOpinionPairsProvider, self).init_parsed_doc(parsed_doc)
39
+ def init_parsed_doc(self, parsed_doc, is_entity_func):
40
+ super(TextOpinionPairsProvider, self).init_parsed_doc(parsed_doc=parsed_doc, is_entity_func=is_entity_func)
41
41
  self.__doc_id = parsed_doc.RelatedDocID
42
42
  self.__entities_collection = EntityCollection(
43
43
  entities=list(self._doc_entities),
@@ -6,7 +6,7 @@ class ParsedDocumentService(object):
6
6
  """ Represents a collection of providers, combined with the parsed doc.
7
7
  """
8
8
 
9
- def __init__(self, parsed_doc, providers):
9
+ def __init__(self, parsed_doc, providers, is_entity_func):
10
10
  assert(isinstance(parsed_doc, ParsedDocument))
11
11
  assert(isinstance(providers, list))
12
12
  self.__parsed_doc = parsed_doc
@@ -20,7 +20,7 @@ class ParsedDocumentService(object):
20
20
  self.__providers[provider.Name] = provider
21
21
 
22
22
  # Post initialize with the related parsed doc.
23
- provider.init_parsed_doc(self.__parsed_doc)
23
+ provider.init_parsed_doc(self.__parsed_doc, is_entity_func=is_entity_func)
24
24
 
25
25
 
26
26
  @property
@@ -1,42 +1,14 @@
1
- from tqdm import tqdm
2
1
  from arekit.common.docs.base import Document
3
2
  from arekit.common.docs.parsed.base import ParsedDocument
4
- from arekit.common.pipeline.base import BasePipelineLauncher
5
3
  from arekit.common.pipeline.batching import BatchingPipelineLauncher
6
4
  from arekit.common.pipeline.context import PipelineContext
7
5
  from arekit.common.pipeline.utils import BatchIterator
8
6
  from arekit.common.text.parsed import BaseParsedText
7
+ from arekit.common.utils import progress_bar_defined
9
8
 
10
9
 
11
10
  class DocumentParsers(object):
12
11
 
13
- @staticmethod
14
- def parse(doc, pipeline_items, parent_ppl_ctx=None, src_key="input", show_progress=False):
15
- """ This document parser is based on single text parts (sentences)
16
- that passes sequentially through the pipeline of transformations.
17
- """
18
- assert(isinstance(doc, Document))
19
- assert(isinstance(pipeline_items, list))
20
- assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None)
21
-
22
- parsed_sentences = []
23
-
24
- data_it = range(doc.SentencesCount)
25
- progress_it = tqdm(data_it, disable=not show_progress)
26
-
27
- for sent_ind in progress_it:
28
-
29
- # Composing the context from a single sentence.
30
- ctx = PipelineContext({src_key: doc.get_sentence(sent_ind)}, parent_ctx=parent_ppl_ctx)
31
-
32
- # Apply all the operations.
33
- BasePipelineLauncher.run(pipeline=pipeline_items, pipeline_ctx=ctx, src_key=src_key)
34
-
35
- # Collecting the result.
36
- parsed_sentences.append(BaseParsedText(terms=ctx.provide("result")))
37
-
38
- return ParsedDocument(doc_id=doc.ID, parsed_sentences=parsed_sentences)
39
-
40
12
  @staticmethod
41
13
  def parse_batch(doc, pipeline_items, batch_size, parent_ppl_ctx=None, src_key="input", show_progress=False):
42
14
  """ This document parser is based on batch of sentences.
@@ -49,7 +21,8 @@ class DocumentParsers(object):
49
21
  parsed_sentences = []
50
22
 
51
23
  data_it = BatchIterator(data_iter=iter(range(doc.SentencesCount)), batch_size=batch_size)
52
- progress_it = tqdm(data_it, total=round(doc.SentencesCount / batch_size), disable=not show_progress)
24
+ progress_it = progress_bar_defined(data_it, total=round(doc.SentencesCount / batch_size),
25
+ disable=not show_progress)
53
26
 
54
27
  for batch in progress_it:
55
28
 
@@ -1,11 +1,15 @@
1
- import numpy as np
2
-
3
1
  from arekit.common.model.labeling.base import LabelsHelper
4
2
  from arekit.common.model.labeling.modes import LabelCalculationMode
5
3
 
6
4
 
7
5
  class SingleLabelsHelper(LabelsHelper):
8
6
 
7
+ @staticmethod
8
+ def __sign(x):
9
+ if x == 0:
10
+ return 0
11
+ return -1 if x < 0 else 1
12
+
9
13
  def aggregate_labels(self, labels_list, label_calc_mode):
10
14
  assert(isinstance(labels_list, list))
11
15
  assert(isinstance(label_calc_mode, LabelCalculationMode))
@@ -18,7 +22,7 @@ class SingleLabelsHelper(LabelsHelper):
18
22
  if label_calc_mode == LabelCalculationMode.AVERAGE:
19
23
  int_labels = [self._label_scaler.label_to_int(label)
20
24
  for label in labels_list]
21
- label = self._label_scaler.int_to_label(int(np.sign(sum(int_labels))))
25
+ label = self._label_scaler.int_to_label(SingleLabelsHelper.__sign(sum(int_labels)))
22
26
 
23
27
  return label
24
28