arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/data/storages/base.py +4 -15
- arekit/common/docs/entities_grouping.py +2 -1
- arekit/common/docs/parser.py +27 -22
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +39 -2
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
- arekit/common/utils.py +11 -52
- arekit/contrib/utils/data/contents/opinions.py +13 -3
- arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- arekit/contrib/utils/data/storages/pandas_based.py +2 -17
- arekit/contrib/utils/data/storages/row_cache.py +8 -2
- arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
- arekit/contrib/utils/io_utils/utils.py +1 -18
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
- arekit-0.25.1.data/data/logo.png +0 -0
- arekit-0.25.1.dist-info/METADATA +81 -0
- arekit-0.25.1.dist-info/RECORD +186 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +0 -68
- arekit/common/data/input/repositories/sample.py +0 -22
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +0 -26
- arekit/common/docs/objects_parser.py +0 -37
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +0 -149
- arekit/contrib/networks/embedding_io.py +0 -18
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +0 -6
- arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit/contrib/networks/input/providers/text.py +0 -24
- arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit/contrib/networks/input/term_types.py +0 -13
- arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit/contrib/networks/vectorizer.py +0 -6
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -84
- arekit/contrib/source/brat/doc.py +0 -28
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -13
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -74
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/doc.py +0 -51
- arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -59
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/docs_reader.py +0 -51
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -31
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +0 -7
- arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +0 -50
- arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
- arekit/contrib/utils/download.py +0 -77
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit/contrib/utils/io_utils/opinions.py +0 -37
- arekit/contrib/utils/io_utils/samples.py +0 -79
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -41
- arekit/contrib/utils/lexicons/relation.py +0 -42
- arekit/contrib/utils/lexicons/rusentilex.py +0 -37
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
- arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
- arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit/contrib/utils/resources.py +0 -25
- arekit/contrib/utils/serializer.py +0 -43
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- arekit/download_data.py +0 -11
- arekit-0.24.0.dist-info/METADATA +0 -23
- arekit-0.24.0.dist-info/RECORD +0 -374
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
from enum import Enum
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class RussianCases(Enum):
|
|
5
|
-
""" Падежи русского языка
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
""" не определено
|
|
9
|
-
"""
|
|
10
|
-
UNKN = 10
|
|
11
|
-
|
|
12
|
-
""" именительный
|
|
13
|
-
"""
|
|
14
|
-
NOM = 1
|
|
15
|
-
|
|
16
|
-
""" родительный
|
|
17
|
-
"""
|
|
18
|
-
GEN = 2
|
|
19
|
-
|
|
20
|
-
""" дательный
|
|
21
|
-
"""
|
|
22
|
-
DAT = 3
|
|
23
|
-
|
|
24
|
-
""" винительный
|
|
25
|
-
"""
|
|
26
|
-
ACC = 4
|
|
27
|
-
|
|
28
|
-
""" творительный
|
|
29
|
-
"""
|
|
30
|
-
INS = 5
|
|
31
|
-
|
|
32
|
-
""" предложный
|
|
33
|
-
"""
|
|
34
|
-
ABL = 6
|
|
35
|
-
|
|
36
|
-
""" партитив
|
|
37
|
-
"""
|
|
38
|
-
PART = 7
|
|
39
|
-
|
|
40
|
-
""" местный
|
|
41
|
-
"""
|
|
42
|
-
LOC = 8
|
|
43
|
-
|
|
44
|
-
""" звательный
|
|
45
|
-
"""
|
|
46
|
-
VOC = 9
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
class RussianCasesService(object):
|
|
50
|
-
|
|
51
|
-
__english = {
|
|
52
|
-
'nom': RussianCases.NOM,
|
|
53
|
-
'gen': RussianCases.GEN,
|
|
54
|
-
'dat': RussianCases.DAT,
|
|
55
|
-
'acc': RussianCases.ACC,
|
|
56
|
-
'ins': RussianCases.INS,
|
|
57
|
-
'abl': RussianCases.ABL,
|
|
58
|
-
'part': RussianCases.PART,
|
|
59
|
-
'loc': RussianCases.LOC,
|
|
60
|
-
'voc': RussianCases.VOC,
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
__mystem_russian = {
|
|
64
|
-
'им': RussianCases.NOM,
|
|
65
|
-
'род': RussianCases.GEN,
|
|
66
|
-
'дат': RussianCases.DAT,
|
|
67
|
-
'вин': RussianCases.ACC,
|
|
68
|
-
'твор': RussianCases.INS,
|
|
69
|
-
'пр': RussianCases.ABL,
|
|
70
|
-
'парт': RussianCases.PART,
|
|
71
|
-
'местн': RussianCases.LOC,
|
|
72
|
-
'зват': RussianCases.VOC,
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
@staticmethod
|
|
76
|
-
def iter_rus_mystem_tags():
|
|
77
|
-
for key, value in RussianCasesService.__mystem_russian.items():
|
|
78
|
-
yield key, value
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.utils.processing.languages.mods import BaseLanguageMods
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class RussianLanguageMods(BaseLanguageMods):
|
|
5
|
-
|
|
6
|
-
@staticmethod
|
|
7
|
-
def replace_specific_word_chars(word):
|
|
8
|
-
assert(isinstance(word, str))
|
|
9
|
-
return word.replace('ё', 'e')
|
|
10
|
-
|
|
11
|
-
@staticmethod
|
|
12
|
-
def is_negation_word(word):
|
|
13
|
-
return word == 'не'
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
from enum import Enum
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class RussianNumberType(Enum):
|
|
5
|
-
|
|
6
|
-
UNKN = 3
|
|
7
|
-
|
|
8
|
-
Plural = 1
|
|
9
|
-
|
|
10
|
-
Single = 2
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class RussianNumberTypeService(object):
|
|
14
|
-
|
|
15
|
-
__russian = {
|
|
16
|
-
'ед': RussianNumberType.Single,
|
|
17
|
-
'мн': RussianNumberType.Plural
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
@staticmethod
|
|
21
|
-
def iter_rus_mystem_tags():
|
|
22
|
-
for key, value in RussianNumberTypeService.__russian.items():
|
|
23
|
-
yield key, value
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class PartOfSpeechTypesService(object):
|
|
5
|
-
|
|
6
|
-
__pos_names = {
|
|
7
|
-
"S": PartOfSpeechType.NOUN,
|
|
8
|
-
"ADV": PartOfSpeechType.ADV,
|
|
9
|
-
"ADVPRO": PartOfSpeechType.ADVPRO,
|
|
10
|
-
"ANUM": PartOfSpeechType.ANUM,
|
|
11
|
-
"APRO": PartOfSpeechType.APRO,
|
|
12
|
-
"COM": PartOfSpeechType.COM,
|
|
13
|
-
"CONJ": PartOfSpeechType.CONJ,
|
|
14
|
-
"INTJ": PartOfSpeechType.INTJ,
|
|
15
|
-
"NUM": PartOfSpeechType.NUM,
|
|
16
|
-
"PART": PartOfSpeechType.PART,
|
|
17
|
-
"PR": PartOfSpeechType.PR,
|
|
18
|
-
"A": PartOfSpeechType.ADJ,
|
|
19
|
-
"SPRO": PartOfSpeechType.SPRO,
|
|
20
|
-
"V": PartOfSpeechType.VERB,
|
|
21
|
-
"UNKN": PartOfSpeechType.Unknown,
|
|
22
|
-
"EMPTY": PartOfSpeechType.Empty}
|
|
23
|
-
|
|
24
|
-
@staticmethod
|
|
25
|
-
def iter_mystem_tags():
|
|
26
|
-
for key, value in PartOfSpeechTypesService.__pos_names.items():
|
|
27
|
-
yield key, value
|
|
28
|
-
|
|
29
|
-
@staticmethod
|
|
30
|
-
def get_mystem_from_string(value):
|
|
31
|
-
return PartOfSpeechTypesService.__pos_names[value]
|
|
32
|
-
|
|
33
|
-
@staticmethod
|
|
34
|
-
def get_mystem_pos_count():
|
|
35
|
-
return len(PartOfSpeechTypesService.__pos_names)
|
|
36
|
-
|
|
File without changes
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
from arekit.common.text.stemmer import Stemmer
|
|
2
|
-
from arekit.common.utils import filter_whitespaces
|
|
3
|
-
from pymystem3 import Mystem
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class MystemWrapper(Stemmer):
|
|
7
|
-
""" Yandex MyStem wrapper
|
|
8
|
-
|
|
9
|
-
part of speech description:
|
|
10
|
-
https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
def __init__(self, entire_input=False):
|
|
14
|
-
"""
|
|
15
|
-
entire_input: bool
|
|
16
|
-
Mystem parameter that allows to keep all information from input (true) or
|
|
17
|
-
remove garbage characters
|
|
18
|
-
"""
|
|
19
|
-
self.__mystem = Mystem(entire_input=entire_input)
|
|
20
|
-
|
|
21
|
-
# region properties
|
|
22
|
-
|
|
23
|
-
@property
|
|
24
|
-
def MystemInstance(self):
|
|
25
|
-
return self.__mystem
|
|
26
|
-
|
|
27
|
-
# endregion
|
|
28
|
-
|
|
29
|
-
# region public methods
|
|
30
|
-
|
|
31
|
-
def lemmatize_to_list(self, text):
|
|
32
|
-
return self.__lemmatize_core(text)
|
|
33
|
-
|
|
34
|
-
def lemmatize_to_str(self, text):
|
|
35
|
-
result = " ".join(self.__lemmatize_core(text))
|
|
36
|
-
return result if len(result) != 0 else self.__process_original_text(text)
|
|
37
|
-
|
|
38
|
-
# endregion
|
|
39
|
-
|
|
40
|
-
# region private methods
|
|
41
|
-
|
|
42
|
-
def __lemmatize_core(self, text):
|
|
43
|
-
assert(isinstance(text, str))
|
|
44
|
-
result_list = self.__mystem.lemmatize(self.__process_original_text(text))
|
|
45
|
-
return filter_whitespaces(result_list)
|
|
46
|
-
|
|
47
|
-
@staticmethod
|
|
48
|
-
def __process_original_text(text):
|
|
49
|
-
return text.lower()
|
|
50
|
-
|
|
51
|
-
# endregion
|
|
File without changes
|
|
@@ -1,134 +0,0 @@
|
|
|
1
|
-
from pymystem3 import Mystem
|
|
2
|
-
|
|
3
|
-
from arekit.contrib.utils.processing.languages.pos import PartOfSpeechType
|
|
4
|
-
from arekit.contrib.utils.processing.languages.ru.cases import RussianCases, RussianCasesService
|
|
5
|
-
from arekit.contrib.utils.processing.languages.ru.number import RussianNumberType, RussianNumberTypeService
|
|
6
|
-
from arekit.contrib.utils.processing.languages.ru.pos_service import PartOfSpeechTypesService
|
|
7
|
-
from arekit.contrib.utils.processing.pos.russian import RussianPOSTagger
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class POSMystemWrapper(RussianPOSTagger):
|
|
11
|
-
|
|
12
|
-
_ArgsSeparator = ','
|
|
13
|
-
_GrammarKey = 'gr'
|
|
14
|
-
|
|
15
|
-
def __init__(self, mystem):
|
|
16
|
-
assert(isinstance(mystem, Mystem))
|
|
17
|
-
self.__mystem = mystem
|
|
18
|
-
|
|
19
|
-
# region private methods
|
|
20
|
-
|
|
21
|
-
@staticmethod
|
|
22
|
-
def __extract_from_analysis(analysis, func):
|
|
23
|
-
"""
|
|
24
|
-
part of speech description:
|
|
25
|
-
https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/
|
|
26
|
-
func: f(args) -> out
|
|
27
|
-
returns: str or None
|
|
28
|
-
"""
|
|
29
|
-
assert(callable(func))
|
|
30
|
-
|
|
31
|
-
if 'analysis' not in analysis:
|
|
32
|
-
return func(None)
|
|
33
|
-
|
|
34
|
-
info = analysis['analysis']
|
|
35
|
-
if len(info) == 0:
|
|
36
|
-
return func(None)
|
|
37
|
-
|
|
38
|
-
return func(info[0])
|
|
39
|
-
|
|
40
|
-
@staticmethod
|
|
41
|
-
def __get_pos(arguments):
|
|
42
|
-
if arguments is None:
|
|
43
|
-
return PartOfSpeechType.Unknown
|
|
44
|
-
|
|
45
|
-
pos = arguments[POSMystemWrapper._GrammarKey].split(POSMystemWrapper._ArgsSeparator)[0]
|
|
46
|
-
if '=' in pos:
|
|
47
|
-
pos = pos.split('=')[0]
|
|
48
|
-
|
|
49
|
-
return PartOfSpeechTypesService.get_mystem_from_string(pos)
|
|
50
|
-
|
|
51
|
-
@staticmethod
|
|
52
|
-
def __get_russian_case(arguments):
|
|
53
|
-
if arguments is None:
|
|
54
|
-
return RussianCases.UNKN
|
|
55
|
-
|
|
56
|
-
all_params = set(POSMystemWrapper.__iter_params(arguments))
|
|
57
|
-
|
|
58
|
-
for key, case in RussianCasesService.iter_rus_mystem_tags():
|
|
59
|
-
if key in all_params:
|
|
60
|
-
return case
|
|
61
|
-
|
|
62
|
-
return RussianCases.UNKN
|
|
63
|
-
|
|
64
|
-
@staticmethod
|
|
65
|
-
def __get_number(arguments):
|
|
66
|
-
if arguments is None:
|
|
67
|
-
return RussianNumberType.UNKN
|
|
68
|
-
|
|
69
|
-
all_params = set(POSMystemWrapper.__iter_params(arguments))
|
|
70
|
-
|
|
71
|
-
for key, case in RussianNumberTypeService.iter_rus_mystem_tags():
|
|
72
|
-
if key in all_params:
|
|
73
|
-
return case
|
|
74
|
-
|
|
75
|
-
return RussianNumberType.UNKN
|
|
76
|
-
|
|
77
|
-
@staticmethod
|
|
78
|
-
def __iter_params(arguments):
|
|
79
|
-
params = arguments[POSMystemWrapper._GrammarKey].split(POSMystemWrapper._ArgsSeparator)
|
|
80
|
-
for optionally_combined in params:
|
|
81
|
-
for param in optionally_combined.split('='):
|
|
82
|
-
yield param
|
|
83
|
-
|
|
84
|
-
# endregion
|
|
85
|
-
|
|
86
|
-
def get_term_pos(self, term):
|
|
87
|
-
assert(isinstance(term, str))
|
|
88
|
-
analyzed = self.__mystem.analyze(term)
|
|
89
|
-
return self.__extract_from_analysis(analyzed[0], self.__get_pos) \
|
|
90
|
-
if len(analyzed) > 0 else PartOfSpeechType.Unknown
|
|
91
|
-
|
|
92
|
-
def get_term_case(self, term):
|
|
93
|
-
assert(isinstance(term, str))
|
|
94
|
-
analyzed = self.__mystem.analyze(term)
|
|
95
|
-
return self.__extract_from_analysis(analyzed[0], self.__get_russian_case) \
|
|
96
|
-
if len(analyzed) > 0 else RussianCases.UNKN
|
|
97
|
-
|
|
98
|
-
def get_term_number(self, term):
|
|
99
|
-
assert(isinstance(term, str))
|
|
100
|
-
analyzed = self.__mystem.analyze(term)
|
|
101
|
-
return self.__extract_from_analysis(analyzed[0], self.__get_number) \
|
|
102
|
-
if len(analyzed) > 0 else RussianNumberType.UNKN
|
|
103
|
-
|
|
104
|
-
def get_terms_russian_cases(self, text):
|
|
105
|
-
""" list of part of speech according to the certain word in text
|
|
106
|
-
"""
|
|
107
|
-
assert(isinstance(text, str))
|
|
108
|
-
cases = []
|
|
109
|
-
|
|
110
|
-
analyzed = self.__mystem.analyze(text)
|
|
111
|
-
for a in analyzed:
|
|
112
|
-
pos = self.__extract_from_analysis(a, self.__get_russian_case) if len(analyzed) > 0 else RussianCases.UNKN
|
|
113
|
-
cases.append(pos)
|
|
114
|
-
|
|
115
|
-
return cases
|
|
116
|
-
|
|
117
|
-
def pos_to_int(self, pos):
|
|
118
|
-
assert(isinstance(pos, PartOfSpeechType))
|
|
119
|
-
return int(pos)
|
|
120
|
-
|
|
121
|
-
@staticmethod
|
|
122
|
-
def is_adjective(pos_type):
|
|
123
|
-
assert(isinstance(pos_type, PartOfSpeechType))
|
|
124
|
-
return pos_type == PartOfSpeechType.ADJ
|
|
125
|
-
|
|
126
|
-
@staticmethod
|
|
127
|
-
def is_noun(pos_type):
|
|
128
|
-
assert(isinstance(pos_type, PartOfSpeechType))
|
|
129
|
-
return pos_type == PartOfSpeechType.NOUN
|
|
130
|
-
|
|
131
|
-
@staticmethod
|
|
132
|
-
def is_verb(pos_type):
|
|
133
|
-
assert(isinstance(pos_type, PartOfSpeechType))
|
|
134
|
-
return pos_type == PartOfSpeechType.VERB
|
|
File without changes
|
|
@@ -1,127 +0,0 @@
|
|
|
1
|
-
from urllib.parse import urlparse
|
|
2
|
-
from arekit.common.context.token import Token
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
# TODO. Provide the base (BaseTokens) type.
|
|
6
|
-
# TODO. With the related API at BaseTokens.
|
|
7
|
-
class Tokens:
|
|
8
|
-
"""
|
|
9
|
-
Tokens used to describe a non-word text units, such as punctuation,
|
|
10
|
-
uknown words/chars, smiles, etc.
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
_wrapper = "<[{}]>"
|
|
14
|
-
COMMA = _wrapper.format(',')
|
|
15
|
-
SEMICOLON = _wrapper.format(';')
|
|
16
|
-
COLON = _wrapper.format(':')
|
|
17
|
-
QUOTE = _wrapper.format('QUOTE')
|
|
18
|
-
DASH = _wrapper.format('-')
|
|
19
|
-
LONG_DASH = _wrapper.format('long_dash')
|
|
20
|
-
DOT = _wrapper.format('.')
|
|
21
|
-
TRIPLE_DOTS = _wrapper.format('…')
|
|
22
|
-
EXC_SIGN = _wrapper.format('!')
|
|
23
|
-
QUESTION_SIGN = _wrapper.format('?')
|
|
24
|
-
OPEN_BRACKET = _wrapper.format('OPEN_BRACKET')
|
|
25
|
-
CLOSED_BRACKET = _wrapper.format('CLOSED_BRACKET')
|
|
26
|
-
NUMBER = _wrapper.format('NUMBER')
|
|
27
|
-
NEW_LINE = _wrapper.format("NEW_LINE")
|
|
28
|
-
UNKNOWN_CHAR = _wrapper.format('UNKNOWN_CHAR')
|
|
29
|
-
UNKNOWN_WORD = _wrapper.format('UNKNOWN_WORD')
|
|
30
|
-
URL = _wrapper.format("URL")
|
|
31
|
-
|
|
32
|
-
__token_mapping = {
|
|
33
|
-
',': COMMA,
|
|
34
|
-
'.': DOT,
|
|
35
|
-
'…': TRIPLE_DOTS,
|
|
36
|
-
':': COLON,
|
|
37
|
-
';': SEMICOLON,
|
|
38
|
-
'-': DASH,
|
|
39
|
-
'—': LONG_DASH,
|
|
40
|
-
'?': QUESTION_SIGN,
|
|
41
|
-
'!': EXC_SIGN,
|
|
42
|
-
'(': OPEN_BRACKET,
|
|
43
|
-
')': CLOSED_BRACKET,
|
|
44
|
-
'{': OPEN_BRACKET,
|
|
45
|
-
'}': CLOSED_BRACKET,
|
|
46
|
-
'[': OPEN_BRACKET,
|
|
47
|
-
']': CLOSED_BRACKET,
|
|
48
|
-
'\n': NEW_LINE,
|
|
49
|
-
'«': QUOTE,
|
|
50
|
-
'»': QUOTE,
|
|
51
|
-
'"': QUOTE,
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
__supported_tokens = {
|
|
55
|
-
COMMA,
|
|
56
|
-
SEMICOLON,
|
|
57
|
-
COLON,
|
|
58
|
-
QUOTE,
|
|
59
|
-
DASH,
|
|
60
|
-
DOT,
|
|
61
|
-
LONG_DASH,
|
|
62
|
-
TRIPLE_DOTS,
|
|
63
|
-
EXC_SIGN,
|
|
64
|
-
QUESTION_SIGN,
|
|
65
|
-
OPEN_BRACKET,
|
|
66
|
-
CLOSED_BRACKET,
|
|
67
|
-
NUMBER,
|
|
68
|
-
URL,
|
|
69
|
-
NEW_LINE,
|
|
70
|
-
UNKNOWN_CHAR,
|
|
71
|
-
UNKNOWN_WORD}
|
|
72
|
-
|
|
73
|
-
@staticmethod
|
|
74
|
-
def try_create(subterm):
|
|
75
|
-
"""
|
|
76
|
-
Trying create a token by given 'term' parameter
|
|
77
|
-
subterm: unicode
|
|
78
|
-
I.e. term ending, so means a part of original term
|
|
79
|
-
"""
|
|
80
|
-
assert(isinstance(subterm, str))
|
|
81
|
-
if subterm not in Tokens.__token_mapping:
|
|
82
|
-
return None
|
|
83
|
-
return Token(term=subterm, token_value=Tokens.__token_mapping[subterm])
|
|
84
|
-
|
|
85
|
-
@staticmethod
|
|
86
|
-
def try_parse(term):
|
|
87
|
-
assert(isinstance(term, str))
|
|
88
|
-
for origin, token_value in Tokens.__token_mapping.items():
|
|
89
|
-
if term == token_value:
|
|
90
|
-
return Token(term=origin, token_value=token_value)
|
|
91
|
-
|
|
92
|
-
@staticmethod
|
|
93
|
-
def try_create_number(term):
|
|
94
|
-
assert(isinstance(term, str))
|
|
95
|
-
if not term.isdigit():
|
|
96
|
-
return None
|
|
97
|
-
return Token(term=term, token_value=Tokens.NUMBER)
|
|
98
|
-
|
|
99
|
-
@staticmethod
|
|
100
|
-
def try_create_url(term):
|
|
101
|
-
assert(isinstance(term, str))
|
|
102
|
-
result = urlparse(term)
|
|
103
|
-
is_correct = result.scheme and result.netloc and result.path
|
|
104
|
-
if not is_correct:
|
|
105
|
-
return None
|
|
106
|
-
return Token(term=term, token_value=Tokens.URL)
|
|
107
|
-
|
|
108
|
-
@staticmethod
|
|
109
|
-
def is_token(term):
|
|
110
|
-
assert(isinstance(term, str))
|
|
111
|
-
return term in Tokens.__supported_tokens
|
|
112
|
-
|
|
113
|
-
@staticmethod
|
|
114
|
-
def iter_chars_by_token(term):
|
|
115
|
-
"""
|
|
116
|
-
Iterate through charts that is related to term
|
|
117
|
-
token: char
|
|
118
|
-
"""
|
|
119
|
-
assert(isinstance(term, str))
|
|
120
|
-
for char, token in Tokens.__token_mapping.items():
|
|
121
|
-
if term == token:
|
|
122
|
-
yield char
|
|
123
|
-
|
|
124
|
-
@staticmethod
|
|
125
|
-
def iter_supported_tokens():
|
|
126
|
-
for token in Tokens.__supported_tokens:
|
|
127
|
-
yield token
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
from arekit.common.text.stemmer import Stemmer
|
|
2
|
-
from arekit.contrib.utils.download import NEWS_MYSTEM_SKIPGRAM_1000_20_2015, load_embedding_and_vocab
|
|
3
|
-
from arekit.contrib.utils.embeddings.rusvectores import RusvectoresEmbedding
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def load_embedding_news_mystem_skipgram_1000_20_2015(stemmer, auto_download=False):
|
|
7
|
-
""" Embedding from https://rusvectores.org/ru/models/
|
|
8
|
-
Description: Russian news, from 2013 till the october 2015
|
|
9
|
-
Corpora size: 2.5 milliard words
|
|
10
|
-
Vocabulary volume: 147 358
|
|
11
|
-
Frequency bound: 200
|
|
12
|
-
Tagset: Mystem
|
|
13
|
-
Algorithm: Continuous Skip-Gram
|
|
14
|
-
Vector size: 1000
|
|
15
|
-
|
|
16
|
-
stemmer: Stemmer
|
|
17
|
-
It is expected to adopt MystemWrapper.
|
|
18
|
-
auto_download: bool
|
|
19
|
-
Whether try to download if the resource was missed.
|
|
20
|
-
"""
|
|
21
|
-
assert(isinstance(stemmer, Stemmer) or stemmer is None)
|
|
22
|
-
embedding, vocab = load_embedding_and_vocab(local_name=NEWS_MYSTEM_SKIPGRAM_1000_20_2015, check_existance=True,
|
|
23
|
-
download_if_missed=auto_download)
|
|
24
|
-
embedding = RusvectoresEmbedding(matrix=embedding, words=vocab, stemmer=stemmer)
|
|
25
|
-
return embedding
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
|
|
3
|
-
from collections.abc import Iterable
|
|
4
|
-
|
|
5
|
-
from arekit.common.data.input.providers.columns.sample import SampleColumnsProvider
|
|
6
|
-
from arekit.common.data.input.providers.rows.base import BaseRowProvider
|
|
7
|
-
from arekit.common.data.input.repositories.base import BaseInputRepository
|
|
8
|
-
from arekit.common.data.input.repositories.sample import BaseInputSamplesRepository
|
|
9
|
-
from arekit.common.data.storages.base import BaseRowsStorage
|
|
10
|
-
from arekit.common.pipeline.base import BasePipeline
|
|
11
|
-
from arekit.contrib.utils.data.contents.opinions import InputTextOpinionProvider
|
|
12
|
-
|
|
13
|
-
logger = logging.getLogger(__name__)
|
|
14
|
-
logging.basicConfig(level=logging.INFO)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class InputDataSerializationHelper(object):
|
|
18
|
-
|
|
19
|
-
@staticmethod
|
|
20
|
-
def create_samples_repo(keep_labels, rows_provider, storage):
|
|
21
|
-
assert(isinstance(rows_provider, BaseRowProvider))
|
|
22
|
-
assert(isinstance(keep_labels, bool))
|
|
23
|
-
assert(isinstance(storage, BaseRowsStorage))
|
|
24
|
-
return BaseInputSamplesRepository(
|
|
25
|
-
columns_provider=SampleColumnsProvider(store_labels=keep_labels),
|
|
26
|
-
rows_provider=rows_provider,
|
|
27
|
-
storage=storage)
|
|
28
|
-
|
|
29
|
-
@staticmethod
|
|
30
|
-
def fill_and_write(pipeline, repo, target, writer, doc_ids_iter, desc=""):
|
|
31
|
-
assert(isinstance(pipeline, BasePipeline))
|
|
32
|
-
assert(isinstance(doc_ids_iter, Iterable))
|
|
33
|
-
assert(isinstance(repo, BaseInputRepository))
|
|
34
|
-
|
|
35
|
-
doc_ids = list(doc_ids_iter)
|
|
36
|
-
|
|
37
|
-
repo.populate(contents_provider=InputTextOpinionProvider(pipeline),
|
|
38
|
-
doc_ids=doc_ids,
|
|
39
|
-
desc=desc,
|
|
40
|
-
writer=writer,
|
|
41
|
-
target=target)
|
|
42
|
-
|
|
43
|
-
repo.push(writer=writer, target=target)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.base import Entity
|
|
2
|
-
from arekit.common.docs.parsed.base import ParsedDocument
|
|
3
|
-
from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider
|
|
4
|
-
from arekit.common.docs.parsed.term_position import TermPositionTypes, TermPosition
|
|
5
|
-
from arekit.common.text.enums import TermFormat
|
|
6
|
-
from arekit.common.text.parsed import BaseParsedText
|
|
7
|
-
from arekit.common.text_opinions.base import TextOpinion
|
|
8
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.base import TextOpinionFilter
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class ProfessionAsCharacteristicSentimentTextOpinionFilter(TextOpinionFilter):
|
|
12
|
-
""" This is a filter, based on the PROFESSION type prefixed entity for
|
|
13
|
-
the SentiNEREL collection.
|
|
14
|
-
|
|
15
|
-
In this case, profession acts as a characteristics of the Person, and
|
|
16
|
-
therefore there is no need to consider these attitudes in annotation.
|
|
17
|
-
|
|
18
|
-
For a greater details, see:
|
|
19
|
-
https://github.com/nicolay-r/AREkit/issues/404
|
|
20
|
-
"""
|
|
21
|
-
|
|
22
|
-
def __init__(self, char_type="PROFESSION"):
|
|
23
|
-
self.__char_type = char_type
|
|
24
|
-
self.__next_entity_types = ["PERSON"]
|
|
25
|
-
|
|
26
|
-
def filter(self, text_opinion, parsed_doc, entity_service_provider):
|
|
27
|
-
assert(isinstance(text_opinion, TextOpinion))
|
|
28
|
-
assert(isinstance(parsed_doc, ParsedDocument))
|
|
29
|
-
assert(isinstance(entity_service_provider, EntityServiceProvider))
|
|
30
|
-
|
|
31
|
-
# Picking up entity.
|
|
32
|
-
target_entity = entity_service_provider._doc_entities[text_opinion.TargetId]
|
|
33
|
-
assert(isinstance(target_entity, Entity))
|
|
34
|
-
|
|
35
|
-
if target_entity.Type != self.__char_type:
|
|
36
|
-
# This is not our case.
|
|
37
|
-
return True
|
|
38
|
-
|
|
39
|
-
# Picking up the related target entity position.
|
|
40
|
-
target_pos = entity_service_provider.get_entity_position(text_opinion.TargetId)
|
|
41
|
-
assert(isinstance(target_pos, TermPosition))
|
|
42
|
-
|
|
43
|
-
# Picking up the related sentence of target.
|
|
44
|
-
t_sent = target_pos.get_index(TermPositionTypes.SentenceIndex)
|
|
45
|
-
sentence = parsed_doc.get_sentence(t_sent)
|
|
46
|
-
assert(isinstance(sentence, BaseParsedText))
|
|
47
|
-
|
|
48
|
-
# Picking up the entity position in sentence.
|
|
49
|
-
target_term_ind = target_pos.get_index(TermPositionTypes.IndexInSentence)
|
|
50
|
-
|
|
51
|
-
# We pick up the next term within the parsed sentece.
|
|
52
|
-
next_term = sentence.get_term(target_term_ind + 1, term_format=TermFormat.Raw) \
|
|
53
|
-
if len(sentence) > target_term_ind + 1 else None
|
|
54
|
-
|
|
55
|
-
if next_term is None:
|
|
56
|
-
# This is not our case.
|
|
57
|
-
return True
|
|
58
|
-
|
|
59
|
-
if isinstance(next_term, Entity) and next_term.Type in self.__next_entity_types:
|
|
60
|
-
# We reject this opinion from the annotation, since this is not expected to be a sentiment one.
|
|
61
|
-
return False
|
|
62
|
-
|
|
63
|
-
return True
|
|
File without changes
|