arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267) hide show
  1. arekit/common/context/terms_mapper.py +2 -2
  2. arekit/common/data/const.py +5 -4
  3. arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
  4. arekit/common/data/input/providers/columns/sample.py +6 -1
  5. arekit/common/data/input/providers/instances/base.py +1 -1
  6. arekit/common/data/input/providers/rows/base.py +36 -13
  7. arekit/common/data/input/providers/rows/samples.py +57 -55
  8. arekit/common/data/input/providers/sample/cropped.py +2 -2
  9. arekit/common/data/input/sample.py +1 -1
  10. arekit/common/data/rows_fmt.py +82 -0
  11. arekit/common/data/rows_parser.py +43 -0
  12. arekit/common/data/storages/base.py +23 -18
  13. arekit/common/data/views/samples.py +2 -8
  14. arekit/common/{news → docs}/base.py +2 -2
  15. arekit/common/{news → docs}/entities_grouping.py +2 -1
  16. arekit/common/{news → docs}/entity.py +2 -1
  17. arekit/common/{news → docs}/parsed/base.py +5 -5
  18. arekit/common/docs/parsed/providers/base.py +68 -0
  19. arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
  20. arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
  21. arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
  22. arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
  23. arekit/common/docs/parsed/service.py +31 -0
  24. arekit/common/docs/parser.py +66 -0
  25. arekit/common/{news → docs}/sentence.py +1 -1
  26. arekit/common/entities/base.py +11 -2
  27. arekit/common/experiment/api/base_samples_io.py +1 -1
  28. arekit/common/frames/variants/collection.py +2 -2
  29. arekit/common/linkage/base.py +2 -2
  30. arekit/common/linkage/meta.py +23 -0
  31. arekit/common/linkage/opinions.py +1 -1
  32. arekit/common/linkage/text_opinions.py +2 -2
  33. arekit/common/opinions/annot/algo/base.py +1 -1
  34. arekit/common/opinions/annot/algo/pair_based.py +15 -13
  35. arekit/common/opinions/annot/algo/predefined.py +4 -4
  36. arekit/common/opinions/annot/algo_based.py +5 -5
  37. arekit/common/opinions/annot/base.py +3 -3
  38. arekit/common/opinions/base.py +7 -7
  39. arekit/common/opinions/collection.py +3 -3
  40. arekit/common/pipeline/base.py +12 -16
  41. arekit/common/pipeline/batching.py +28 -0
  42. arekit/common/pipeline/context.py +5 -1
  43. arekit/common/pipeline/items/base.py +38 -1
  44. arekit/common/pipeline/items/flatten.py +5 -1
  45. arekit/common/pipeline/items/handle.py +2 -1
  46. arekit/common/pipeline/items/iter.py +2 -1
  47. arekit/common/pipeline/items/map.py +2 -1
  48. arekit/common/pipeline/items/map_nested.py +4 -0
  49. arekit/common/pipeline/utils.py +32 -0
  50. arekit/common/service/sqlite.py +36 -0
  51. arekit/common/synonyms/base.py +2 -2
  52. arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
  53. arekit/common/text_opinions/base.py +11 -11
  54. arekit/common/utils.py +33 -46
  55. arekit/contrib/networks/embedding.py +3 -3
  56. arekit/contrib/networks/embedding_io.py +5 -5
  57. arekit/contrib/networks/input/const.py +0 -2
  58. arekit/contrib/networks/input/providers/sample.py +15 -29
  59. arekit/contrib/networks/input/rows_parser.py +47 -134
  60. arekit/contrib/prompt/sample.py +18 -16
  61. arekit/contrib/utils/data/contents/opinions.py +17 -5
  62. arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
  63. arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
  64. arekit/contrib/utils/data/readers/base.py +3 -0
  65. arekit/contrib/utils/data/readers/csv_pd.py +10 -4
  66. arekit/contrib/utils/data/readers/jsonl.py +3 -0
  67. arekit/contrib/utils/data/readers/sqlite.py +14 -0
  68. arekit/contrib/utils/data/service/balance.py +0 -1
  69. arekit/contrib/utils/data/storages/pandas_based.py +3 -5
  70. arekit/contrib/utils/data/storages/row_cache.py +18 -6
  71. arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
  72. arekit/contrib/utils/data/writers/base.py +5 -0
  73. arekit/contrib/utils/data/writers/csv_native.py +3 -0
  74. arekit/contrib/utils/data/writers/csv_pd.py +3 -0
  75. arekit/contrib/utils/data/writers/json_opennre.py +31 -13
  76. arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
  77. arekit/contrib/utils/io_utils/embedding.py +25 -33
  78. arekit/contrib/utils/io_utils/utils.py +3 -24
  79. arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
  80. arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
  81. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  82. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  83. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
  84. arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
  85. arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
  86. arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
  87. arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
  88. arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
  89. arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
  90. arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
  91. arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
  92. arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
  93. arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
  94. arekit/contrib/utils/serializer.py +4 -23
  95. arekit-0.25.0.data/data/logo.png +0 -0
  96. arekit-0.25.0.dist-info/METADATA +82 -0
  97. arekit-0.25.0.dist-info/RECORD +259 -0
  98. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
  99. arekit/common/data/row_ids/base.py +0 -79
  100. arekit/common/data/row_ids/binary.py +0 -38
  101. arekit/common/data/row_ids/multiple.py +0 -14
  102. arekit/common/folding/base.py +0 -36
  103. arekit/common/folding/fixed.py +0 -42
  104. arekit/common/folding/nofold.py +0 -15
  105. arekit/common/folding/united.py +0 -46
  106. arekit/common/news/objects_parser.py +0 -37
  107. arekit/common/news/parsed/providers/base.py +0 -48
  108. arekit/common/news/parsed/service.py +0 -31
  109. arekit/common/news/parser.py +0 -34
  110. arekit/common/text/parser.py +0 -12
  111. arekit/common/text/partitioning/__init__.py +0 -0
  112. arekit/common/text/partitioning/base.py +0 -4
  113. arekit/common/text/partitioning/terms.py +0 -35
  114. arekit/contrib/source/__init__.py +0 -0
  115. arekit/contrib/source/brat/__init__.py +0 -0
  116. arekit/contrib/source/brat/annot.py +0 -83
  117. arekit/contrib/source/brat/entities/__init__.py +0 -0
  118. arekit/contrib/source/brat/entities/compound.py +0 -33
  119. arekit/contrib/source/brat/entities/entity.py +0 -42
  120. arekit/contrib/source/brat/entities/parser.py +0 -53
  121. arekit/contrib/source/brat/news.py +0 -28
  122. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  123. arekit/contrib/source/brat/opinions/converter.py +0 -19
  124. arekit/contrib/source/brat/relation.py +0 -32
  125. arekit/contrib/source/brat/sentence.py +0 -69
  126. arekit/contrib/source/brat/sentences_reader.py +0 -128
  127. arekit/contrib/source/download.py +0 -41
  128. arekit/contrib/source/nerel/__init__.py +0 -0
  129. arekit/contrib/source/nerel/entities.py +0 -55
  130. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  131. arekit/contrib/source/nerel/folding/fixed.py +0 -75
  132. arekit/contrib/source/nerel/io_utils.py +0 -62
  133. arekit/contrib/source/nerel/labels.py +0 -241
  134. arekit/contrib/source/nerel/reader.py +0 -46
  135. arekit/contrib/source/nerel/utils.py +0 -24
  136. arekit/contrib/source/nerel/versions.py +0 -12
  137. arekit/contrib/source/nerelbio/__init__.py +0 -0
  138. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  139. arekit/contrib/source/nerelbio/labels.py +0 -265
  140. arekit/contrib/source/nerelbio/reader.py +0 -8
  141. arekit/contrib/source/nerelbio/versions.py +0 -8
  142. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  143. arekit/contrib/source/ruattitudes/collection.py +0 -36
  144. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  145. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  146. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  147. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  148. arekit/contrib/source/ruattitudes/news.py +0 -51
  149. arekit/contrib/source/ruattitudes/news_brat.py +0 -44
  150. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  151. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  152. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  153. arekit/contrib/source/ruattitudes/reader.py +0 -268
  154. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  155. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  156. arekit/contrib/source/ruattitudes/text_object.py +0 -57
  157. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  158. arekit/contrib/source/rusentiframes/collection.py +0 -157
  159. arekit/contrib/source/rusentiframes/effect.py +0 -24
  160. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  161. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  162. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  163. arekit/contrib/source/rusentiframes/role.py +0 -15
  164. arekit/contrib/source/rusentiframes/state.py +0 -24
  165. arekit/contrib/source/rusentiframes/types.py +0 -42
  166. arekit/contrib/source/rusentiframes/value.py +0 -2
  167. arekit/contrib/source/rusentrel/__init__.py +0 -0
  168. arekit/contrib/source/rusentrel/const.py +0 -3
  169. arekit/contrib/source/rusentrel/entities.py +0 -26
  170. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  171. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  172. arekit/contrib/source/rusentrel/news_reader.py +0 -51
  173. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  174. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  175. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  176. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  177. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  178. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  179. arekit/contrib/source/sentinerel/__init__.py +0 -0
  180. arekit/contrib/source/sentinerel/entities.py +0 -52
  181. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  182. arekit/contrib/source/sentinerel/folding/factory.py +0 -32
  183. arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
  184. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  185. arekit/contrib/source/sentinerel/labels.py +0 -53
  186. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  187. arekit/contrib/source/sentinerel/reader.py +0 -42
  188. arekit/contrib/source/synonyms/__init__.py +0 -0
  189. arekit/contrib/source/synonyms/utils.py +0 -19
  190. arekit/contrib/source/zip_utils.py +0 -47
  191. arekit/contrib/utils/bert/rows.py +0 -0
  192. arekit/contrib/utils/bert/text_b_rus.py +0 -18
  193. arekit/contrib/utils/connotations/__init__.py +0 -0
  194. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  195. arekit/contrib/utils/cv/__init__.py +0 -0
  196. arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
  197. arekit/contrib/utils/cv/doc_stat/base.py +0 -37
  198. arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
  199. arekit/contrib/utils/cv/splitters/__init__.py +0 -0
  200. arekit/contrib/utils/cv/splitters/base.py +0 -4
  201. arekit/contrib/utils/cv/splitters/default.py +0 -53
  202. arekit/contrib/utils/cv/splitters/statistical.py +0 -57
  203. arekit/contrib/utils/cv/two_class.py +0 -77
  204. arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
  205. arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
  206. arekit/contrib/utils/data/ext.py +0 -31
  207. arekit/contrib/utils/data/views/__init__.py +0 -0
  208. arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
  209. arekit/contrib/utils/data/views/linkages/base.py +0 -58
  210. arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
  211. arekit/contrib/utils/data/views/linkages/utils.py +0 -24
  212. arekit/contrib/utils/data/views/opinions.py +0 -14
  213. arekit/contrib/utils/download.py +0 -78
  214. arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
  215. arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
  216. arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
  217. arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
  218. arekit/contrib/utils/io_utils/opinions.py +0 -39
  219. arekit/contrib/utils/io_utils/samples.py +0 -78
  220. arekit/contrib/utils/lexicons/__init__.py +0 -0
  221. arekit/contrib/utils/lexicons/lexicon.py +0 -43
  222. arekit/contrib/utils/lexicons/relation.py +0 -45
  223. arekit/contrib/utils/lexicons/rusentilex.py +0 -34
  224. arekit/contrib/utils/nn/__init__.py +0 -0
  225. arekit/contrib/utils/nn/rows.py +0 -83
  226. arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
  227. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  228. arekit/contrib/utils/pipelines/items/to_output.py +0 -101
  229. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  230. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  231. arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
  232. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
  233. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  234. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  235. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
  236. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
  237. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  238. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  239. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
  240. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
  241. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
  242. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  243. arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
  244. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
  245. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  246. arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
  247. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  248. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
  249. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  250. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  251. arekit/contrib/utils/resources.py +0 -26
  252. arekit/contrib/utils/sources/__init__.py +0 -0
  253. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  254. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  255. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  256. arekit/contrib/utils/utils_folding.py +0 -19
  257. arekit/download_data.py +0 -11
  258. arekit-0.23.1.dist-info/METADATA +0 -23
  259. arekit-0.23.1.dist-info/RECORD +0 -403
  260. /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
  261. /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
  262. /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
  263. /arekit/common/{news → docs}/parsed/term_position.py +0 -0
  264. /arekit/common/{news/parsed → service}/__init__.py +0 -0
  265. /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
  266. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
  267. {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
@@ -1,88 +0,0 @@
1
- from arekit.common.experiment.api.ops_doc import DocumentOperations
2
- from arekit.common.labels.str_fmt import StringLabelsFormatter
3
- from arekit.common.news.parsed.base import ParsedNews
4
- from arekit.common.news.parsed.providers.base import BaseParsedNewsServiceProvider
5
- from arekit.common.news.parsed.providers.entity_service import EntityServiceProvider
6
- from arekit.common.news.parsed.service import ParsedNewsService
7
- from arekit.common.opinions.annot.base import BaseOpinionAnnotator
8
- from arekit.contrib.source.brat.news import BratNews
9
- from arekit.contrib.source.brat.opinions.converter import BratRelationConverter
10
-
11
-
12
- class PredefinedTextOpinionAnnotator(BaseOpinionAnnotator):
13
- """ Brat-based text-opinion annotator (converter).
14
- It converts the pre-annotated Relations from BRAT-documents to TextOpinions
15
- """
16
-
17
- def __init__(self, doc_ops, label_formatter, keep_any_type=False, entity_index_func=None):
18
- """
19
- get_doc_func:
20
- func(doc_id)
21
-
22
- label_formatter: String Labels Formatter
23
- required for conversion.
24
-
25
- keep_any_type: bool
26
- flag that defines whether there is a need to consider all the text opinions
27
- or only one that supported by label formatter.
28
-
29
- entity_index_func: is a way of how we provide an external entity ID
30
- fund(entity) -> ID
31
- """
32
- assert(isinstance(doc_ops, DocumentOperations))
33
- assert(isinstance(label_formatter, StringLabelsFormatter))
34
- assert(callable(entity_index_func) or entity_index_func is None)
35
- super(PredefinedTextOpinionAnnotator, self).__init__()
36
-
37
- self.__doc_ops = doc_ops
38
- self.__label_formatter = label_formatter
39
- self.__keep_any_type = keep_any_type
40
- self.__entity_index_func = (lambda brat_entity: brat_entity.ID) if \
41
- entity_index_func is None else entity_index_func
42
-
43
- @staticmethod
44
- def __convert_opinion_id(news, origin_id, esp):
45
- assert(isinstance(news, BratNews))
46
- assert(isinstance(origin_id, int))
47
- assert(isinstance(esp, BaseParsedNewsServiceProvider))
48
-
49
- if not news.contains_entity(origin_id):
50
- # Due to the complexity of entities, some entities might be nested.
51
- # Therefore the latter, some entities might be discarded.
52
- return None
53
-
54
- origin_entity = news.get_entity_by_id(origin_id)
55
-
56
- if not esp.contains_entity(origin_entity):
57
- return None
58
-
59
- document_entity = esp.get_document_entity(origin_entity)
60
- return document_entity.IdInDocument
61
-
62
- def _annot_collection_core(self, parsed_news):
63
- assert(isinstance(parsed_news, ParsedNews))
64
-
65
- pns = ParsedNewsService(parsed_news=parsed_news, providers=[
66
- EntityServiceProvider(self.__entity_index_func)
67
- ])
68
- esp = pns.get_provider(EntityServiceProvider.NAME)
69
- news = self.__doc_ops.by_id(parsed_news.RelatedDocID)
70
-
71
- for brat_relation in news.Relations:
72
-
73
- if self.__label_formatter.supports_value(brat_relation.Type) or self.__keep_any_type:
74
-
75
- text_opinion = BratRelationConverter.to_text_opinion(
76
- brat_relation=brat_relation,
77
- doc_id=parsed_news.RelatedDocID,
78
- label_formatter=self.__label_formatter)
79
-
80
- internal_opinion = text_opinion.try_convert(
81
- other=text_opinion,
82
- convert_func=lambda origin_id: PredefinedTextOpinionAnnotator.__convert_opinion_id(
83
- news=news, origin_id=origin_id, esp=esp))
84
-
85
- if internal_opinion is None:
86
- continue
87
-
88
- yield internal_opinion
@@ -1,26 +0,0 @@
1
- from arekit.common.text.stemmer import Stemmer
2
- from arekit.contrib.utils.download import NEWS_MYSTEM_SKIPGRAM_1000_20_2015, load_embedding_and_vocab
3
- from arekit.contrib.utils.embeddings.rusvectores import RusvectoresEmbedding
4
-
5
-
6
- def load_embedding_news_mystem_skipgram_1000_20_2015(stemmer, auto_download=False):
7
- """ Embedding from https://rusvectores.org/ru/models/
8
- Description: Russian news, from 2013 till the october 2015
9
- Corpora size: 2.5 milliard words
10
- Vocabulary volume: 147 358
11
- Frequency bound: 200
12
- Tagset: Mystem
13
- Algorithm: Continuous Skip-Gram
14
- Vector size: 1000
15
-
16
- stemmer: Stemmer
17
- It is expected to adopt MystemWrapper.
18
- auto_download: bool
19
- Whether try to download if the resource was missed.
20
- """
21
- assert(isinstance(stemmer, Stemmer) or stemmer is None)
22
- embedding, vocab = load_embedding_and_vocab(local_name=NEWS_MYSTEM_SKIPGRAM_1000_20_2015,
23
- check_existance=True,
24
- download_if_missed=auto_download)
25
- embedding = RusvectoresEmbedding(matrix=embedding, words=vocab, stemmer=stemmer)
26
- return embedding
File without changes
File without changes
@@ -1,63 +0,0 @@
1
- from arekit.common.entities.base import Entity
2
- from arekit.common.news.parsed.base import ParsedNews
3
- from arekit.common.news.parsed.providers.entity_service import EntityServiceProvider
4
- from arekit.common.news.parsed.term_position import TermPositionTypes, TermPosition
5
- from arekit.common.text.enums import TermFormat
6
- from arekit.common.text.parsed import BaseParsedText
7
- from arekit.common.text_opinions.base import TextOpinion
8
- from arekit.contrib.utils.pipelines.text_opinion.filters.base import TextOpinionFilter
9
-
10
-
11
- class ProfessionAsCharacteristicSentimentTextOpinionFilter(TextOpinionFilter):
12
- """ This is a filter, based on the PROFESSION type prefixed entity for
13
- the SentiNEREL collection.
14
-
15
- In this case, profession acts as a characteristics of the Person, and
16
- therefore there is no need to consider these attitudes in annotation.
17
-
18
- For a greater details, see:
19
- https://github.com/nicolay-r/AREkit/issues/404
20
- """
21
-
22
- def __init__(self, char_type="PROFESSION"):
23
- self.__char_type = char_type
24
- self.__next_entity_types = ["PERSON"]
25
-
26
- def filter(self, text_opinion, parsed_news, entity_service_provider):
27
- assert(isinstance(text_opinion, TextOpinion))
28
- assert(isinstance(parsed_news, ParsedNews))
29
- assert(isinstance(entity_service_provider, EntityServiceProvider))
30
-
31
- # Picking up entity.
32
- target_entity = entity_service_provider._doc_entities[text_opinion.TargetId]
33
- assert(isinstance(target_entity, Entity))
34
-
35
- if target_entity.Type != self.__char_type:
36
- # This is not our case.
37
- return True
38
-
39
- # Picking up the related target entity position.
40
- target_pos = entity_service_provider.get_entity_position(text_opinion.TargetId)
41
- assert(isinstance(target_pos, TermPosition))
42
-
43
- # Picking up the related sentence of target.
44
- t_sent = target_pos.get_index(TermPositionTypes.SentenceIndex)
45
- sentence = parsed_news.get_sentence(t_sent)
46
- assert(isinstance(sentence, BaseParsedText))
47
-
48
- # Picking up the entity position in sentence.
49
- target_term_ind = target_pos.get_index(TermPositionTypes.IndexInSentence)
50
-
51
- # We pick up the next term within the parsed sentece.
52
- next_term = sentence.get_term(target_term_ind + 1, term_format=TermFormat.Raw) \
53
- if len(sentence) > target_term_ind + 1 else None
54
-
55
- if next_term is None:
56
- # This is not our case.
57
- return True
58
-
59
- if isinstance(next_term, Entity) and next_term.Type in self.__next_entity_types:
60
- # We reject this opinion from the annotation, since this is not expected to be a sentiment one.
61
- return False
62
-
63
- return True
@@ -1,19 +0,0 @@
1
- from arekit.common.folding.base import BaseDataFolding
2
- from arekit.contrib.utils.cv.two_class import TwoClassCVFolding
3
-
4
-
5
- def folding_iter_states(folding):
6
- if isinstance(folding, TwoClassCVFolding):
7
- for state in folding.iter_states():
8
- yield state
9
- yield 0
10
-
11
-
12
- def experiment_iter_index(folding):
13
- assert(isinstance(folding, BaseDataFolding))
14
-
15
- if isinstance(folding, TwoClassCVFolding):
16
- return folding.StateIndex
17
-
18
- # In other cases we consider that there is only a single state.
19
- return 0
arekit/download_data.py DELETED
@@ -1,11 +0,0 @@
1
- from arekit.contrib.source.download import download
2
-
3
-
4
- def do_download():
5
- """ Download all the sources utilized by contrib/sources.
6
- """
7
- download()
8
-
9
-
10
- # Downloading data.
11
- do_download()
@@ -1,23 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: arekit
3
- Version: 0.23.1
4
- Summary: Library devoted to Document level Attitude and Relation Extraction for text objects with entity-linking (EL) API support
5
- Home-page: https://github.com/nicolay-r/AREkit
6
- Author: Nicolay Rusnachenko
7
- Author-email: rusnicolay@gmail.com
8
- License: MIT License
9
- Keywords: natural language processing,relation extraction,sentiment analysis
10
- Platform: UNKNOWN
11
- Classifier: Programming Language :: Python
12
- Classifier: Programming Language :: Python :: 3.6
13
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
14
- Classifier: Topic :: Scientific/Engineering :: Information Analysis
15
- Classifier: Topic :: Text Processing :: Linguistic
16
- License-File: LICENSE
17
- Requires-Dist: tqdm
18
- Requires-Dist: enum34 (==1.1.10)
19
- Requires-Dist: numpy (>=1.14.5)
20
- Requires-Dist: pymystem3 (==0.2.0)
21
-
22
- UNKNOWN
23
-