arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/context/terms_mapper.py +2 -2
- arekit/common/data/const.py +5 -4
- arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
- arekit/common/data/input/providers/columns/sample.py +6 -1
- arekit/common/data/input/providers/instances/base.py +1 -1
- arekit/common/data/input/providers/rows/base.py +36 -13
- arekit/common/data/input/providers/rows/samples.py +57 -55
- arekit/common/data/input/providers/sample/cropped.py +2 -2
- arekit/common/data/input/sample.py +1 -1
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/base.py +23 -18
- arekit/common/data/views/samples.py +2 -8
- arekit/common/{news → docs}/base.py +2 -2
- arekit/common/{news → docs}/entities_grouping.py +2 -1
- arekit/common/{news → docs}/entity.py +2 -1
- arekit/common/{news → docs}/parsed/base.py +5 -5
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
- arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
- arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
- arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parser.py +66 -0
- arekit/common/{news → docs}/sentence.py +1 -1
- arekit/common/entities/base.py +11 -2
- arekit/common/experiment/api/base_samples_io.py +1 -1
- arekit/common/frames/variants/collection.py +2 -2
- arekit/common/linkage/base.py +2 -2
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +1 -1
- arekit/common/linkage/text_opinions.py +2 -2
- arekit/common/opinions/annot/algo/base.py +1 -1
- arekit/common/opinions/annot/algo/pair_based.py +15 -13
- arekit/common/opinions/annot/algo/predefined.py +4 -4
- arekit/common/opinions/annot/algo_based.py +5 -5
- arekit/common/opinions/annot/base.py +3 -3
- arekit/common/opinions/base.py +7 -7
- arekit/common/opinions/collection.py +3 -3
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +38 -1
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/service/sqlite.py +36 -0
- arekit/common/synonyms/base.py +2 -2
- arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
- arekit/common/text_opinions/base.py +11 -11
- arekit/common/utils.py +33 -46
- arekit/contrib/networks/embedding.py +3 -3
- arekit/contrib/networks/embedding_io.py +5 -5
- arekit/contrib/networks/input/const.py +0 -2
- arekit/contrib/networks/input/providers/sample.py +15 -29
- arekit/contrib/networks/input/rows_parser.py +47 -134
- arekit/contrib/prompt/sample.py +18 -16
- arekit/contrib/utils/data/contents/opinions.py +17 -5
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
- arekit/contrib/utils/data/readers/base.py +3 -0
- arekit/contrib/utils/data/readers/csv_pd.py +10 -4
- arekit/contrib/utils/data/readers/jsonl.py +3 -0
- arekit/contrib/utils/data/readers/sqlite.py +14 -0
- arekit/contrib/utils/data/service/balance.py +0 -1
- arekit/contrib/utils/data/storages/pandas_based.py +3 -5
- arekit/contrib/utils/data/storages/row_cache.py +18 -6
- arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
- arekit/contrib/utils/data/writers/base.py +5 -0
- arekit/contrib/utils/data/writers/csv_native.py +3 -0
- arekit/contrib/utils/data/writers/csv_pd.py +3 -0
- arekit/contrib/utils/data/writers/json_opennre.py +31 -13
- arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
- arekit/contrib/utils/io_utils/embedding.py +25 -33
- arekit/contrib/utils/io_utils/utils.py +3 -24
- arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
- arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
- arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
- arekit/contrib/utils/serializer.py +4 -23
- arekit-0.25.0.data/data/logo.png +0 -0
- arekit-0.25.0.dist-info/METADATA +82 -0
- arekit-0.25.0.dist-info/RECORD +259 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
- arekit/common/data/row_ids/base.py +0 -79
- arekit/common/data/row_ids/binary.py +0 -38
- arekit/common/data/row_ids/multiple.py +0 -14
- arekit/common/folding/base.py +0 -36
- arekit/common/folding/fixed.py +0 -42
- arekit/common/folding/nofold.py +0 -15
- arekit/common/folding/united.py +0 -46
- arekit/common/news/objects_parser.py +0 -37
- arekit/common/news/parsed/providers/base.py +0 -48
- arekit/common/news/parsed/service.py +0 -31
- arekit/common/news/parser.py +0 -34
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -83
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -33
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/news.py +0 -28
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -75
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/news.py +0 -51
- arekit/contrib/source/ruattitudes/news_brat.py +0 -44
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -57
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/news_reader.py +0 -51
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -32
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/bert/rows.py +0 -0
- arekit/contrib/utils/bert/text_b_rus.py +0 -18
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/cv/__init__.py +0 -0
- arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
- arekit/contrib/utils/cv/doc_stat/base.py +0 -37
- arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
- arekit/contrib/utils/cv/splitters/__init__.py +0 -0
- arekit/contrib/utils/cv/splitters/base.py +0 -4
- arekit/contrib/utils/cv/splitters/default.py +0 -53
- arekit/contrib/utils/cv/splitters/statistical.py +0 -57
- arekit/contrib/utils/cv/two_class.py +0 -77
- arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
- arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
- arekit/contrib/utils/data/ext.py +0 -31
- arekit/contrib/utils/data/views/__init__.py +0 -0
- arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
- arekit/contrib/utils/data/views/linkages/base.py +0 -58
- arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
- arekit/contrib/utils/data/views/linkages/utils.py +0 -24
- arekit/contrib/utils/data/views/opinions.py +0 -14
- arekit/contrib/utils/download.py +0 -78
- arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
- arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
- arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
- arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
- arekit/contrib/utils/io_utils/opinions.py +0 -39
- arekit/contrib/utils/io_utils/samples.py +0 -78
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -43
- arekit/contrib/utils/lexicons/relation.py +0 -45
- arekit/contrib/utils/lexicons/rusentilex.py +0 -34
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/to_output.py +0 -101
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/resources.py +0 -26
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/utils_folding.py +0 -19
- arekit/download_data.py +0 -11
- arekit-0.23.1.dist-info/METADATA +0 -23
- arekit-0.23.1.dist-info/RECORD +0 -403
- /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
- /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
- /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
- /arekit/common/{news → docs}/parsed/term_position.py +0 -0
- /arekit/common/{news/parsed → service}/__init__.py +0 -0
- /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
|
@@ -1,88 +0,0 @@
|
|
|
1
|
-
from arekit.common.experiment.api.ops_doc import DocumentOperations
|
|
2
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
3
|
-
from arekit.common.news.parsed.base import ParsedNews
|
|
4
|
-
from arekit.common.news.parsed.providers.base import BaseParsedNewsServiceProvider
|
|
5
|
-
from arekit.common.news.parsed.providers.entity_service import EntityServiceProvider
|
|
6
|
-
from arekit.common.news.parsed.service import ParsedNewsService
|
|
7
|
-
from arekit.common.opinions.annot.base import BaseOpinionAnnotator
|
|
8
|
-
from arekit.contrib.source.brat.news import BratNews
|
|
9
|
-
from arekit.contrib.source.brat.opinions.converter import BratRelationConverter
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class PredefinedTextOpinionAnnotator(BaseOpinionAnnotator):
|
|
13
|
-
""" Brat-based text-opinion annotator (converter).
|
|
14
|
-
It converts the pre-annotated Relations from BRAT-documents to TextOpinions
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
def __init__(self, doc_ops, label_formatter, keep_any_type=False, entity_index_func=None):
|
|
18
|
-
"""
|
|
19
|
-
get_doc_func:
|
|
20
|
-
func(doc_id)
|
|
21
|
-
|
|
22
|
-
label_formatter: String Labels Formatter
|
|
23
|
-
required for conversion.
|
|
24
|
-
|
|
25
|
-
keep_any_type: bool
|
|
26
|
-
flag that defines whether there is a need to consider all the text opinions
|
|
27
|
-
or only one that supported by label formatter.
|
|
28
|
-
|
|
29
|
-
entity_index_func: is a way of how we provide an external entity ID
|
|
30
|
-
fund(entity) -> ID
|
|
31
|
-
"""
|
|
32
|
-
assert(isinstance(doc_ops, DocumentOperations))
|
|
33
|
-
assert(isinstance(label_formatter, StringLabelsFormatter))
|
|
34
|
-
assert(callable(entity_index_func) or entity_index_func is None)
|
|
35
|
-
super(PredefinedTextOpinionAnnotator, self).__init__()
|
|
36
|
-
|
|
37
|
-
self.__doc_ops = doc_ops
|
|
38
|
-
self.__label_formatter = label_formatter
|
|
39
|
-
self.__keep_any_type = keep_any_type
|
|
40
|
-
self.__entity_index_func = (lambda brat_entity: brat_entity.ID) if \
|
|
41
|
-
entity_index_func is None else entity_index_func
|
|
42
|
-
|
|
43
|
-
@staticmethod
|
|
44
|
-
def __convert_opinion_id(news, origin_id, esp):
|
|
45
|
-
assert(isinstance(news, BratNews))
|
|
46
|
-
assert(isinstance(origin_id, int))
|
|
47
|
-
assert(isinstance(esp, BaseParsedNewsServiceProvider))
|
|
48
|
-
|
|
49
|
-
if not news.contains_entity(origin_id):
|
|
50
|
-
# Due to the complexity of entities, some entities might be nested.
|
|
51
|
-
# Therefore the latter, some entities might be discarded.
|
|
52
|
-
return None
|
|
53
|
-
|
|
54
|
-
origin_entity = news.get_entity_by_id(origin_id)
|
|
55
|
-
|
|
56
|
-
if not esp.contains_entity(origin_entity):
|
|
57
|
-
return None
|
|
58
|
-
|
|
59
|
-
document_entity = esp.get_document_entity(origin_entity)
|
|
60
|
-
return document_entity.IdInDocument
|
|
61
|
-
|
|
62
|
-
def _annot_collection_core(self, parsed_news):
|
|
63
|
-
assert(isinstance(parsed_news, ParsedNews))
|
|
64
|
-
|
|
65
|
-
pns = ParsedNewsService(parsed_news=parsed_news, providers=[
|
|
66
|
-
EntityServiceProvider(self.__entity_index_func)
|
|
67
|
-
])
|
|
68
|
-
esp = pns.get_provider(EntityServiceProvider.NAME)
|
|
69
|
-
news = self.__doc_ops.by_id(parsed_news.RelatedDocID)
|
|
70
|
-
|
|
71
|
-
for brat_relation in news.Relations:
|
|
72
|
-
|
|
73
|
-
if self.__label_formatter.supports_value(brat_relation.Type) or self.__keep_any_type:
|
|
74
|
-
|
|
75
|
-
text_opinion = BratRelationConverter.to_text_opinion(
|
|
76
|
-
brat_relation=brat_relation,
|
|
77
|
-
doc_id=parsed_news.RelatedDocID,
|
|
78
|
-
label_formatter=self.__label_formatter)
|
|
79
|
-
|
|
80
|
-
internal_opinion = text_opinion.try_convert(
|
|
81
|
-
other=text_opinion,
|
|
82
|
-
convert_func=lambda origin_id: PredefinedTextOpinionAnnotator.__convert_opinion_id(
|
|
83
|
-
news=news, origin_id=origin_id, esp=esp))
|
|
84
|
-
|
|
85
|
-
if internal_opinion is None:
|
|
86
|
-
continue
|
|
87
|
-
|
|
88
|
-
yield internal_opinion
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
from arekit.common.text.stemmer import Stemmer
|
|
2
|
-
from arekit.contrib.utils.download import NEWS_MYSTEM_SKIPGRAM_1000_20_2015, load_embedding_and_vocab
|
|
3
|
-
from arekit.contrib.utils.embeddings.rusvectores import RusvectoresEmbedding
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def load_embedding_news_mystem_skipgram_1000_20_2015(stemmer, auto_download=False):
|
|
7
|
-
""" Embedding from https://rusvectores.org/ru/models/
|
|
8
|
-
Description: Russian news, from 2013 till the october 2015
|
|
9
|
-
Corpora size: 2.5 milliard words
|
|
10
|
-
Vocabulary volume: 147 358
|
|
11
|
-
Frequency bound: 200
|
|
12
|
-
Tagset: Mystem
|
|
13
|
-
Algorithm: Continuous Skip-Gram
|
|
14
|
-
Vector size: 1000
|
|
15
|
-
|
|
16
|
-
stemmer: Stemmer
|
|
17
|
-
It is expected to adopt MystemWrapper.
|
|
18
|
-
auto_download: bool
|
|
19
|
-
Whether try to download if the resource was missed.
|
|
20
|
-
"""
|
|
21
|
-
assert(isinstance(stemmer, Stemmer) or stemmer is None)
|
|
22
|
-
embedding, vocab = load_embedding_and_vocab(local_name=NEWS_MYSTEM_SKIPGRAM_1000_20_2015,
|
|
23
|
-
check_existance=True,
|
|
24
|
-
download_if_missed=auto_download)
|
|
25
|
-
embedding = RusvectoresEmbedding(matrix=embedding, words=vocab, stemmer=stemmer)
|
|
26
|
-
return embedding
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.base import Entity
|
|
2
|
-
from arekit.common.news.parsed.base import ParsedNews
|
|
3
|
-
from arekit.common.news.parsed.providers.entity_service import EntityServiceProvider
|
|
4
|
-
from arekit.common.news.parsed.term_position import TermPositionTypes, TermPosition
|
|
5
|
-
from arekit.common.text.enums import TermFormat
|
|
6
|
-
from arekit.common.text.parsed import BaseParsedText
|
|
7
|
-
from arekit.common.text_opinions.base import TextOpinion
|
|
8
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.base import TextOpinionFilter
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class ProfessionAsCharacteristicSentimentTextOpinionFilter(TextOpinionFilter):
|
|
12
|
-
""" This is a filter, based on the PROFESSION type prefixed entity for
|
|
13
|
-
the SentiNEREL collection.
|
|
14
|
-
|
|
15
|
-
In this case, profession acts as a characteristics of the Person, and
|
|
16
|
-
therefore there is no need to consider these attitudes in annotation.
|
|
17
|
-
|
|
18
|
-
For a greater details, see:
|
|
19
|
-
https://github.com/nicolay-r/AREkit/issues/404
|
|
20
|
-
"""
|
|
21
|
-
|
|
22
|
-
def __init__(self, char_type="PROFESSION"):
|
|
23
|
-
self.__char_type = char_type
|
|
24
|
-
self.__next_entity_types = ["PERSON"]
|
|
25
|
-
|
|
26
|
-
def filter(self, text_opinion, parsed_news, entity_service_provider):
|
|
27
|
-
assert(isinstance(text_opinion, TextOpinion))
|
|
28
|
-
assert(isinstance(parsed_news, ParsedNews))
|
|
29
|
-
assert(isinstance(entity_service_provider, EntityServiceProvider))
|
|
30
|
-
|
|
31
|
-
# Picking up entity.
|
|
32
|
-
target_entity = entity_service_provider._doc_entities[text_opinion.TargetId]
|
|
33
|
-
assert(isinstance(target_entity, Entity))
|
|
34
|
-
|
|
35
|
-
if target_entity.Type != self.__char_type:
|
|
36
|
-
# This is not our case.
|
|
37
|
-
return True
|
|
38
|
-
|
|
39
|
-
# Picking up the related target entity position.
|
|
40
|
-
target_pos = entity_service_provider.get_entity_position(text_opinion.TargetId)
|
|
41
|
-
assert(isinstance(target_pos, TermPosition))
|
|
42
|
-
|
|
43
|
-
# Picking up the related sentence of target.
|
|
44
|
-
t_sent = target_pos.get_index(TermPositionTypes.SentenceIndex)
|
|
45
|
-
sentence = parsed_news.get_sentence(t_sent)
|
|
46
|
-
assert(isinstance(sentence, BaseParsedText))
|
|
47
|
-
|
|
48
|
-
# Picking up the entity position in sentence.
|
|
49
|
-
target_term_ind = target_pos.get_index(TermPositionTypes.IndexInSentence)
|
|
50
|
-
|
|
51
|
-
# We pick up the next term within the parsed sentece.
|
|
52
|
-
next_term = sentence.get_term(target_term_ind + 1, term_format=TermFormat.Raw) \
|
|
53
|
-
if len(sentence) > target_term_ind + 1 else None
|
|
54
|
-
|
|
55
|
-
if next_term is None:
|
|
56
|
-
# This is not our case.
|
|
57
|
-
return True
|
|
58
|
-
|
|
59
|
-
if isinstance(next_term, Entity) and next_term.Type in self.__next_entity_types:
|
|
60
|
-
# We reject this opinion from the annotation, since this is not expected to be a sentiment one.
|
|
61
|
-
return False
|
|
62
|
-
|
|
63
|
-
return True
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
from arekit.common.folding.base import BaseDataFolding
|
|
2
|
-
from arekit.contrib.utils.cv.two_class import TwoClassCVFolding
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
def folding_iter_states(folding):
|
|
6
|
-
if isinstance(folding, TwoClassCVFolding):
|
|
7
|
-
for state in folding.iter_states():
|
|
8
|
-
yield state
|
|
9
|
-
yield 0
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def experiment_iter_index(folding):
|
|
13
|
-
assert(isinstance(folding, BaseDataFolding))
|
|
14
|
-
|
|
15
|
-
if isinstance(folding, TwoClassCVFolding):
|
|
16
|
-
return folding.StateIndex
|
|
17
|
-
|
|
18
|
-
# In other cases we consider that there is only a single state.
|
|
19
|
-
return 0
|
arekit/download_data.py
DELETED
arekit-0.23.1.dist-info/METADATA
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: arekit
|
|
3
|
-
Version: 0.23.1
|
|
4
|
-
Summary: Library devoted to Document level Attitude and Relation Extraction for text objects with entity-linking (EL) API support
|
|
5
|
-
Home-page: https://github.com/nicolay-r/AREkit
|
|
6
|
-
Author: Nicolay Rusnachenko
|
|
7
|
-
Author-email: rusnicolay@gmail.com
|
|
8
|
-
License: MIT License
|
|
9
|
-
Keywords: natural language processing,relation extraction,sentiment analysis
|
|
10
|
-
Platform: UNKNOWN
|
|
11
|
-
Classifier: Programming Language :: Python
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.6
|
|
13
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
14
|
-
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
15
|
-
Classifier: Topic :: Text Processing :: Linguistic
|
|
16
|
-
License-File: LICENSE
|
|
17
|
-
Requires-Dist: tqdm
|
|
18
|
-
Requires-Dist: enum34 (==1.1.10)
|
|
19
|
-
Requires-Dist: numpy (>=1.14.5)
|
|
20
|
-
Requires-Dist: pymystem3 (==0.2.0)
|
|
21
|
-
|
|
22
|
-
UNKNOWN
|
|
23
|
-
|