arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/context/terms_mapper.py +2 -2
- arekit/common/data/const.py +5 -4
- arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
- arekit/common/data/input/providers/columns/sample.py +6 -1
- arekit/common/data/input/providers/instances/base.py +1 -1
- arekit/common/data/input/providers/rows/base.py +36 -13
- arekit/common/data/input/providers/rows/samples.py +57 -55
- arekit/common/data/input/providers/sample/cropped.py +2 -2
- arekit/common/data/input/sample.py +1 -1
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/base.py +23 -18
- arekit/common/data/views/samples.py +2 -8
- arekit/common/{news → docs}/base.py +2 -2
- arekit/common/{news → docs}/entities_grouping.py +2 -1
- arekit/common/{news → docs}/entity.py +2 -1
- arekit/common/{news → docs}/parsed/base.py +5 -5
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
- arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
- arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
- arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parser.py +66 -0
- arekit/common/{news → docs}/sentence.py +1 -1
- arekit/common/entities/base.py +11 -2
- arekit/common/experiment/api/base_samples_io.py +1 -1
- arekit/common/frames/variants/collection.py +2 -2
- arekit/common/linkage/base.py +2 -2
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +1 -1
- arekit/common/linkage/text_opinions.py +2 -2
- arekit/common/opinions/annot/algo/base.py +1 -1
- arekit/common/opinions/annot/algo/pair_based.py +15 -13
- arekit/common/opinions/annot/algo/predefined.py +4 -4
- arekit/common/opinions/annot/algo_based.py +5 -5
- arekit/common/opinions/annot/base.py +3 -3
- arekit/common/opinions/base.py +7 -7
- arekit/common/opinions/collection.py +3 -3
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +38 -1
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/service/sqlite.py +36 -0
- arekit/common/synonyms/base.py +2 -2
- arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
- arekit/common/text_opinions/base.py +11 -11
- arekit/common/utils.py +33 -46
- arekit/contrib/networks/embedding.py +3 -3
- arekit/contrib/networks/embedding_io.py +5 -5
- arekit/contrib/networks/input/const.py +0 -2
- arekit/contrib/networks/input/providers/sample.py +15 -29
- arekit/contrib/networks/input/rows_parser.py +47 -134
- arekit/contrib/prompt/sample.py +18 -16
- arekit/contrib/utils/data/contents/opinions.py +17 -5
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
- arekit/contrib/utils/data/readers/base.py +3 -0
- arekit/contrib/utils/data/readers/csv_pd.py +10 -4
- arekit/contrib/utils/data/readers/jsonl.py +3 -0
- arekit/contrib/utils/data/readers/sqlite.py +14 -0
- arekit/contrib/utils/data/service/balance.py +0 -1
- arekit/contrib/utils/data/storages/pandas_based.py +3 -5
- arekit/contrib/utils/data/storages/row_cache.py +18 -6
- arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
- arekit/contrib/utils/data/writers/base.py +5 -0
- arekit/contrib/utils/data/writers/csv_native.py +3 -0
- arekit/contrib/utils/data/writers/csv_pd.py +3 -0
- arekit/contrib/utils/data/writers/json_opennre.py +31 -13
- arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
- arekit/contrib/utils/io_utils/embedding.py +25 -33
- arekit/contrib/utils/io_utils/utils.py +3 -24
- arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
- arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
- arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
- arekit/contrib/utils/serializer.py +4 -23
- arekit-0.25.0.data/data/logo.png +0 -0
- arekit-0.25.0.dist-info/METADATA +82 -0
- arekit-0.25.0.dist-info/RECORD +259 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
- arekit/common/data/row_ids/base.py +0 -79
- arekit/common/data/row_ids/binary.py +0 -38
- arekit/common/data/row_ids/multiple.py +0 -14
- arekit/common/folding/base.py +0 -36
- arekit/common/folding/fixed.py +0 -42
- arekit/common/folding/nofold.py +0 -15
- arekit/common/folding/united.py +0 -46
- arekit/common/news/objects_parser.py +0 -37
- arekit/common/news/parsed/providers/base.py +0 -48
- arekit/common/news/parsed/service.py +0 -31
- arekit/common/news/parser.py +0 -34
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -83
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -33
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/news.py +0 -28
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -75
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/news.py +0 -51
- arekit/contrib/source/ruattitudes/news_brat.py +0 -44
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -57
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/news_reader.py +0 -51
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -32
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/bert/rows.py +0 -0
- arekit/contrib/utils/bert/text_b_rus.py +0 -18
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/cv/__init__.py +0 -0
- arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
- arekit/contrib/utils/cv/doc_stat/base.py +0 -37
- arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
- arekit/contrib/utils/cv/splitters/__init__.py +0 -0
- arekit/contrib/utils/cv/splitters/base.py +0 -4
- arekit/contrib/utils/cv/splitters/default.py +0 -53
- arekit/contrib/utils/cv/splitters/statistical.py +0 -57
- arekit/contrib/utils/cv/two_class.py +0 -77
- arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
- arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
- arekit/contrib/utils/data/ext.py +0 -31
- arekit/contrib/utils/data/views/__init__.py +0 -0
- arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
- arekit/contrib/utils/data/views/linkages/base.py +0 -58
- arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
- arekit/contrib/utils/data/views/linkages/utils.py +0 -24
- arekit/contrib/utils/data/views/opinions.py +0 -14
- arekit/contrib/utils/download.py +0 -78
- arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
- arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
- arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
- arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
- arekit/contrib/utils/io_utils/opinions.py +0 -39
- arekit/contrib/utils/io_utils/samples.py +0 -78
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -43
- arekit/contrib/utils/lexicons/relation.py +0 -45
- arekit/contrib/utils/lexicons/rusentilex.py +0 -34
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/to_output.py +0 -101
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/resources.py +0 -26
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/utils_folding.py +0 -19
- arekit/download_data.py +0 -11
- arekit-0.23.1.dist-info/METADATA +0 -23
- arekit-0.23.1.dist-info/RECORD +0 -403
- /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
- /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
- /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
- /arekit/common/{news → docs}/parsed/term_position.py +0 -0
- /arekit/common/{news/parsed → service}/__init__.py +0 -0
- /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
import random
|
|
2
|
-
|
|
3
|
-
from arekit.contrib.utils.cv.splitters.base import CrossValidationSplitter
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class SimpleCrossValidationSplitter(CrossValidationSplitter):
|
|
7
|
-
""" This splitter assumes to performs folding
|
|
8
|
-
without extra additional statistics of the related documents.
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
def __init__(self, shuffle=True, seed=1):
|
|
12
|
-
self.__shuffle = shuffle
|
|
13
|
-
self.__seed = seed
|
|
14
|
-
|
|
15
|
-
# region private methods
|
|
16
|
-
|
|
17
|
-
@staticmethod
|
|
18
|
-
def __chunk_it(sequence, num):
|
|
19
|
-
avg = len(sequence) / float(num)
|
|
20
|
-
out = []
|
|
21
|
-
last = 0.0
|
|
22
|
-
|
|
23
|
-
while last < len(sequence):
|
|
24
|
-
out.append(sequence[int(last):int(last + avg)])
|
|
25
|
-
last += avg
|
|
26
|
-
|
|
27
|
-
return out
|
|
28
|
-
|
|
29
|
-
# endregion
|
|
30
|
-
|
|
31
|
-
def items_to_cv_pairs(self, doc_ids, cv_count):
|
|
32
|
-
"""
|
|
33
|
-
Splits array of indices into list of pairs (train_indices_list,
|
|
34
|
-
test_indices_list)
|
|
35
|
-
"""
|
|
36
|
-
assert(isinstance(doc_ids, set))
|
|
37
|
-
assert(isinstance(cv_count, int))
|
|
38
|
-
|
|
39
|
-
doc_ids_list = list(doc_ids)
|
|
40
|
-
|
|
41
|
-
if self.__shuffle:
|
|
42
|
-
random.Random(self.__seed).shuffle(doc_ids_list)
|
|
43
|
-
|
|
44
|
-
chunks = self.__chunk_it(doc_ids_list, cv_count)
|
|
45
|
-
|
|
46
|
-
for test_index, chunk in enumerate(chunks):
|
|
47
|
-
train_indices = list(range(len(chunks)))
|
|
48
|
-
train_indices.remove(test_index)
|
|
49
|
-
|
|
50
|
-
large = [v for train_index in train_indices for v in chunks[train_index]]
|
|
51
|
-
small = chunk
|
|
52
|
-
|
|
53
|
-
yield large, small
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
from arekit.contrib.utils.cv.doc_stat.base import BaseDocumentStatGenerator
|
|
3
|
-
from arekit.contrib.utils.cv.splitters.base import CrossValidationSplitter
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class StatBasedCrossValidationSplitter(CrossValidationSplitter):
|
|
7
|
-
""" Sentence-based splitter.
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
def __init__(self, docs_stat, doc_ids):
|
|
11
|
-
assert(isinstance(docs_stat, BaseDocumentStatGenerator))
|
|
12
|
-
super(StatBasedCrossValidationSplitter, self).__init__()
|
|
13
|
-
self.__docs_info = docs_stat.calculate(doc_ids_iter=doc_ids)
|
|
14
|
-
|
|
15
|
-
# region private methods
|
|
16
|
-
|
|
17
|
-
@staticmethod
|
|
18
|
-
def __select_group(cv_group_size, item):
|
|
19
|
-
deltas = []
|
|
20
|
-
for group_index in range(len(cv_group_size)):
|
|
21
|
-
delta = StatBasedCrossValidationSplitter.__calc_cv_group_delta(
|
|
22
|
-
cv_group_size=cv_group_size, item=item, g_index_to_add=group_index)
|
|
23
|
-
deltas.append(delta)
|
|
24
|
-
|
|
25
|
-
return int(np.argmin(deltas))
|
|
26
|
-
|
|
27
|
-
@staticmethod
|
|
28
|
-
def __calc_cv_group_delta(cv_group_size, item, g_index_to_add):
|
|
29
|
-
sums = []
|
|
30
|
-
for i in range(len(cv_group_size)):
|
|
31
|
-
sums.append(sum(cv_group_size[i]))
|
|
32
|
-
|
|
33
|
-
sums[g_index_to_add] += item
|
|
34
|
-
return max(sums) - np.mean(sums)
|
|
35
|
-
|
|
36
|
-
# endregion
|
|
37
|
-
|
|
38
|
-
def items_to_cv_pairs(self, doc_ids, cv_count):
|
|
39
|
-
""" Separation with the specific separation, in terms of cv-classes size difference.
|
|
40
|
-
"""
|
|
41
|
-
assert(isinstance(doc_ids, set))
|
|
42
|
-
assert(isinstance(cv_count, int))
|
|
43
|
-
|
|
44
|
-
sorted_stat = reversed(sorted(self.__docs_info, key=lambda pair: pair[1]))
|
|
45
|
-
cv_group_docs = [[] for _ in range(cv_count)]
|
|
46
|
-
cv_group_sizes = [[] for _ in range(cv_count)]
|
|
47
|
-
|
|
48
|
-
for doc_id, s_count in sorted_stat:
|
|
49
|
-
group_index = self.__select_group(cv_group_size=cv_group_sizes, item=s_count)
|
|
50
|
-
cv_group_docs[group_index].append(doc_id)
|
|
51
|
-
cv_group_sizes[group_index].append(s_count)
|
|
52
|
-
|
|
53
|
-
for g_index in range(len(cv_group_docs)):
|
|
54
|
-
small = cv_group_docs[g_index]
|
|
55
|
-
large = [doc_id for doc_id, _ in self.__docs_info if doc_id not in small]
|
|
56
|
-
|
|
57
|
-
yield large, small
|
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
from arekit.common.folding.base import BaseDataFolding
|
|
2
|
-
from arekit.contrib.utils.cv.splitters.base import CrossValidationSplitter
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class TwoClassCVFolding(BaseDataFolding):
|
|
6
|
-
""" Performs folding onto a pair of data_types,
|
|
7
|
-
i.e. two-class cv-folding algorithm
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
def __init__(self, supported_data_types, doc_ids_to_fold, cv_count, splitter):
|
|
11
|
-
assert(isinstance(splitter, CrossValidationSplitter))
|
|
12
|
-
assert(isinstance(cv_count, int) and cv_count > 0)
|
|
13
|
-
|
|
14
|
-
if len(supported_data_types) > 2:
|
|
15
|
-
raise NotImplementedError("Experiments with such amount of data-types are not supported!")
|
|
16
|
-
|
|
17
|
-
super(TwoClassCVFolding, self).__init__(doc_ids_to_fold=doc_ids_to_fold,
|
|
18
|
-
supported_data_types=supported_data_types)
|
|
19
|
-
|
|
20
|
-
self.__cv_count = cv_count
|
|
21
|
-
self.__splitter = splitter
|
|
22
|
-
self.__state_index = 0
|
|
23
|
-
|
|
24
|
-
# region Properties
|
|
25
|
-
|
|
26
|
-
@property
|
|
27
|
-
def StateIndex(self):
|
|
28
|
-
return self.__state_index
|
|
29
|
-
|
|
30
|
-
@property
|
|
31
|
-
def CVCount(self):
|
|
32
|
-
return self.__cv_count
|
|
33
|
-
|
|
34
|
-
# endregion
|
|
35
|
-
|
|
36
|
-
def __assign_index(self, i):
|
|
37
|
-
self.__state_index = i
|
|
38
|
-
|
|
39
|
-
# region BaseFolding
|
|
40
|
-
|
|
41
|
-
def iter_states(self):
|
|
42
|
-
""" Performs iteration over states supported by folding algorithm
|
|
43
|
-
Default:
|
|
44
|
-
considering a single state.
|
|
45
|
-
"""
|
|
46
|
-
for state_index in range(self.__cv_count):
|
|
47
|
-
self.__assign_index(state_index)
|
|
48
|
-
yield None
|
|
49
|
-
|
|
50
|
-
def fold_doc_ids_set(self):
|
|
51
|
-
|
|
52
|
-
# Access to protected fields
|
|
53
|
-
data_types = self._supported_data_types
|
|
54
|
-
doc_ids = self._doc_ids_to_fold_set
|
|
55
|
-
|
|
56
|
-
if len(data_types) == 1:
|
|
57
|
-
# By default we provide the same output since
|
|
58
|
-
# there is no need to perform splitting onto single part
|
|
59
|
-
return {
|
|
60
|
-
data_types[0]: list(doc_ids)
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
if self.__splitter is None:
|
|
64
|
-
raise NotImplementedError("Splitter has not been intialized!")
|
|
65
|
-
|
|
66
|
-
it = self.__splitter.items_to_cv_pairs(doc_ids=set(doc_ids),
|
|
67
|
-
cv_count=self.__cv_count)
|
|
68
|
-
|
|
69
|
-
for index, pair in enumerate(it):
|
|
70
|
-
large, small = pair
|
|
71
|
-
if index == self.__state_index:
|
|
72
|
-
return {
|
|
73
|
-
data_types[0]: large,
|
|
74
|
-
data_types[1]: small
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
# endregion
|
|
File without changes
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
from arekit.common.experiment.api.ops_doc import DocumentOperations
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class DictionaryBasedDocumentOperations(DocumentOperations):
|
|
5
|
-
|
|
6
|
-
def __init__(self, d):
|
|
7
|
-
assert(isinstance(d, dict))
|
|
8
|
-
super(DictionaryBasedDocumentOperations, self).__init__()
|
|
9
|
-
self.__d = d
|
|
10
|
-
|
|
11
|
-
def by_id(self, doc_id):
|
|
12
|
-
assert(isinstance(doc_id, int))
|
|
13
|
-
return self.__d[doc_id]
|
arekit/contrib/utils/data/ext.py
DELETED
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.utils.data.readers.base import BaseReader
|
|
2
|
-
from arekit.contrib.utils.data.writers.base import BaseWriter
|
|
3
|
-
from arekit.contrib.utils.data.writers.csv_native import NativeCsvWriter
|
|
4
|
-
from arekit.contrib.utils.data.writers.json_opennre import OpenNREJsonWriter
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
PANDAS_CSV_EXTENSION = ".tsv.gz"
|
|
8
|
-
OPENNRE_EXTENSION = ".jsonl"
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def create_writer_extension(writer):
|
|
12
|
-
assert(isinstance(writer, BaseWriter))
|
|
13
|
-
|
|
14
|
-
if isinstance(writer, OpenNREJsonWriter):
|
|
15
|
-
return OPENNRE_EXTENSION
|
|
16
|
-
if isinstance(writer, NativeCsvWriter):
|
|
17
|
-
return ".csv"
|
|
18
|
-
else:
|
|
19
|
-
# consider ".tsv.gz" and assuming it is a Pandas.
|
|
20
|
-
return PANDAS_CSV_EXTENSION
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def create_reader_extension(writer):
|
|
24
|
-
assert(isinstance(writer, BaseReader))
|
|
25
|
-
|
|
26
|
-
if isinstance(writer, OpenNREJsonWriter):
|
|
27
|
-
return OPENNRE_EXTENSION
|
|
28
|
-
else:
|
|
29
|
-
# consider ".tsv.gz" and assuming it is a Pandas.
|
|
30
|
-
# other options are not available in 0.23.1
|
|
31
|
-
return PANDAS_CSV_EXTENSION
|
|
File without changes
|
|
File without changes
|
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
from arekit.common.data import const
|
|
2
|
-
from arekit.common.data.row_ids.base import BaseIDProvider
|
|
3
|
-
from arekit.common.data.storages.base import BaseRowsStorage
|
|
4
|
-
from arekit.common.linkage.opinions import OpinionsLinkage
|
|
5
|
-
from arekit.contrib.utils.data.views.linkages import utils
|
|
6
|
-
from arekit.contrib.utils.data.views.opinions import BaseOpinionStorageView
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class BaseOpinionLinkagesView(object):
|
|
10
|
-
""" Base view onto source in terms of opinion linkages.
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
def __init__(self, ids_provider, storage):
|
|
14
|
-
assert(isinstance(ids_provider, BaseIDProvider))
|
|
15
|
-
assert(isinstance(storage, BaseRowsStorage))
|
|
16
|
-
self._ids_provider = ids_provider
|
|
17
|
-
self._storage = storage
|
|
18
|
-
|
|
19
|
-
# region private methods
|
|
20
|
-
|
|
21
|
-
def __iter_doc_opinion_ids(self, row_ids):
|
|
22
|
-
for row_id in row_ids:
|
|
23
|
-
yield self._ids_provider.parse_opinion_in_opinion_id(row_id)
|
|
24
|
-
|
|
25
|
-
def __iter_opinions_by_linkages(self, linkages_df, opinions_view):
|
|
26
|
-
for df_linkage in linkages_df:
|
|
27
|
-
yield self._iter_by_opinions(linked_df=df_linkage, opinions_view=opinions_view)
|
|
28
|
-
|
|
29
|
-
# endregion
|
|
30
|
-
|
|
31
|
-
# region protected methods
|
|
32
|
-
|
|
33
|
-
def _iter_by_opinions(self, linked_df, opinions_view):
|
|
34
|
-
raise NotImplementedError()
|
|
35
|
-
|
|
36
|
-
# endregion
|
|
37
|
-
|
|
38
|
-
# region public methods
|
|
39
|
-
|
|
40
|
-
def iter_opinion_linkages(self, doc_id, opinions_view):
|
|
41
|
-
assert(isinstance(opinions_view, BaseOpinionStorageView))
|
|
42
|
-
doc_df = self._storage.find_by_value(column_name=const.DOC_ID, value=doc_id)
|
|
43
|
-
row_ids = [row_id for row_id in doc_df[const.ID]] # TODO. Adopt storage.
|
|
44
|
-
doc_opin_ids = self.__iter_doc_opinion_ids(row_ids=row_ids)
|
|
45
|
-
|
|
46
|
-
doc_opin_id_patterns = map(
|
|
47
|
-
lambda opinion_id: self._ids_provider.create_pattern(id_value=opinion_id, p_type=BaseIDProvider.OPINION),
|
|
48
|
-
doc_opin_ids)
|
|
49
|
-
|
|
50
|
-
linkages_df = map(
|
|
51
|
-
lambda opin_id: utils.filter_by_id(doc_df=doc_df, column=const.ID, value=opin_id),
|
|
52
|
-
doc_opin_id_patterns)
|
|
53
|
-
|
|
54
|
-
opinions_iter = self.__iter_opinions_by_linkages(linkages_df, opinions_view=opinions_view)
|
|
55
|
-
|
|
56
|
-
return map(lambda opinions: OpinionsLinkage(opinions), opinions_iter)
|
|
57
|
-
|
|
58
|
-
# endregion
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
|
|
3
|
-
from arekit.common.data import const
|
|
4
|
-
from arekit.common.data.row_ids.multiple import MultipleIDProvider
|
|
5
|
-
from arekit.common.labels.scaler.base import BaseLabelScaler
|
|
6
|
-
from arekit.contrib.utils.data.views.linkages import utils
|
|
7
|
-
from arekit.contrib.utils.data.views.linkages.base import BaseOpinionLinkagesView
|
|
8
|
-
from arekit.contrib.utils.data.views.opinions import BaseOpinionStorageView
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class MultilableOpinionLinkagesView(BaseOpinionLinkagesView):
|
|
12
|
-
""" View onto source, where each row, related to opinion, has multiple labels.
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
def __init__(self, labels_scaler, storage):
|
|
16
|
-
assert(isinstance(labels_scaler, BaseLabelScaler))
|
|
17
|
-
super(MultilableOpinionLinkagesView, self).__init__(ids_provider=MultipleIDProvider(),
|
|
18
|
-
storage=storage)
|
|
19
|
-
self.__labels_scaler = labels_scaler
|
|
20
|
-
|
|
21
|
-
# region private methods
|
|
22
|
-
|
|
23
|
-
def __get_column_header(self):
|
|
24
|
-
return [str(self.__labels_scaler.label_to_uint(label))
|
|
25
|
-
for label in self.__labels_scaler.ordered_suppoted_labels()]
|
|
26
|
-
|
|
27
|
-
def __calculate_label(self, row):
|
|
28
|
-
"""
|
|
29
|
-
Using a single row (probabilities by each class)
|
|
30
|
-
"""
|
|
31
|
-
labels_prob = [row[label] for label in self.__get_column_header()]
|
|
32
|
-
return self.__labels_scaler.uint_to_label(value=int(np.argmax(labels_prob)))
|
|
33
|
-
|
|
34
|
-
# endregion
|
|
35
|
-
|
|
36
|
-
# region protected methods
|
|
37
|
-
|
|
38
|
-
def _iter_by_opinions(self, linked_df, opinions_view):
|
|
39
|
-
assert(isinstance(opinions_view, BaseOpinionStorageView))
|
|
40
|
-
|
|
41
|
-
for _, series in linked_df.iterrows():
|
|
42
|
-
yield utils.compose_opinion_by_opinion_id(
|
|
43
|
-
ids_provider=self._ids_provider,
|
|
44
|
-
sample_id=series[const.ID],
|
|
45
|
-
opinions_view=opinions_view,
|
|
46
|
-
calc_label_func=lambda: self.__calculate_label(series))
|
|
47
|
-
|
|
48
|
-
# endregion
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from arekit.common.data import const
|
|
2
|
-
from arekit.common.data.row_ids.base import BaseIDProvider
|
|
3
|
-
from arekit.common.opinions.base import Opinion
|
|
4
|
-
from arekit.contrib.utils.data.views.opinions import BaseOpinionStorageView
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def compose_opinion_by_opinion_id(ids_provider, sample_id, opinions_view, calc_label_func):
|
|
8
|
-
assert(isinstance(ids_provider, BaseIDProvider))
|
|
9
|
-
assert(isinstance(sample_id, str))
|
|
10
|
-
assert(isinstance(opinions_view, BaseOpinionStorageView))
|
|
11
|
-
assert(callable(calc_label_func))
|
|
12
|
-
|
|
13
|
-
opinion_id = ids_provider.convert_sample_id_to_opinion_id(sample_id=sample_id)
|
|
14
|
-
row = opinions_view.row_by_id(opinion_id=opinion_id)
|
|
15
|
-
|
|
16
|
-
return Opinion(source_value=row[const.SOURCE],
|
|
17
|
-
target_value=row[const.TARGET],
|
|
18
|
-
sentiment=calc_label_func())
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
# TODO. Adopt storage.
|
|
22
|
-
def filter_by_id(doc_df, column, value):
|
|
23
|
-
assert(isinstance(column, str))
|
|
24
|
-
return doc_df[doc_df[column].str.contains(value)]
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from arekit.common.data import const
|
|
2
|
-
from arekit.common.data.storages.base import BaseRowsStorage
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class BaseOpinionStorageView(object):
|
|
6
|
-
|
|
7
|
-
def __init__(self, storage):
|
|
8
|
-
assert(isinstance(storage, BaseRowsStorage))
|
|
9
|
-
self._storage = storage
|
|
10
|
-
|
|
11
|
-
def row_by_id(self, opinion_id):
|
|
12
|
-
assert(isinstance(opinion_id, str))
|
|
13
|
-
return self._storage.find_first_by_value(column_name=const.ID,
|
|
14
|
-
value=opinion_id)
|
arekit/contrib/utils/download.py
DELETED
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import tarfile
|
|
3
|
-
from os.path import join, exists
|
|
4
|
-
|
|
5
|
-
from arekit.common import utils
|
|
6
|
-
from arekit.contrib.utils.np_utils.embedding import NpzEmbeddingHelper
|
|
7
|
-
from arekit.contrib.utils.np_utils.vocab import VocabRepositoryUtils
|
|
8
|
-
|
|
9
|
-
NEWS_MYSTEM_SKIPGRAM_1000_20_2015 = "news_mystem_skipgram_1000_20_2015.tar.gz"
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def __get_resource(local_name, check_existance=False, download_if_missed=False):
|
|
13
|
-
assert(isinstance(local_name, str))
|
|
14
|
-
filepath = join(utils.get_default_download_dir(), local_name)
|
|
15
|
-
|
|
16
|
-
if check_existance and not exists(filepath):
|
|
17
|
-
if download_if_missed:
|
|
18
|
-
download()
|
|
19
|
-
# We try to ger the resource again but won't attempt to download it again.
|
|
20
|
-
__get_resource(local_name, check_existance=check_existance, download_if_missed=False)
|
|
21
|
-
else:
|
|
22
|
-
raise Exception("Resource could not be found: {}".format(filepath))
|
|
23
|
-
|
|
24
|
-
return filepath
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def __get_embedding_dir(filepath):
|
|
28
|
-
return filepath.replace(".tar.gz", "")
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def load_embedding_and_vocab(local_name, check_existance=False, download_if_missed=False):
|
|
32
|
-
tar_gz_archive = __get_resource(local_name, check_existance=check_existance,
|
|
33
|
-
download_if_missed=download_if_missed)
|
|
34
|
-
target_dir = __get_embedding_dir(tar_gz_archive)
|
|
35
|
-
embedding = NpzEmbeddingHelper.load_embedding(os.path.join(target_dir, "embedding.npz"))
|
|
36
|
-
vocab = VocabRepositoryUtils.load(os.path.join(target_dir, "vocab.txt"))
|
|
37
|
-
return embedding, vocab
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def download():
    """Download every registered resource and unpack the ``.tar.gz`` ones.

    Each archive is fetched into the path given by ``__get_resource`` and
    extracted into the directory given by ``__get_embedding_dir``.

    :raises Exception: an archive member would be extracted outside of the
        target directory (path traversal attempt).
    """

    def is_within_directory(directory, target):
        """Tell whether ``target`` is ``directory`` itself or lies below it."""
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)
        try:
            # commonpath compares whole path components; commonprefix works
            # per character and would accept a sibling such as "/x/emb_evil"
            # as being inside "/x/emb".
            return os.path.commonpath([abs_directory, abs_target]) == abs_directory
        except ValueError:
            # Raised e.g. for paths on different drives: not contained.
            return False

    def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
        """Extract ``tar`` into ``path`` after validating all member paths."""
        for member in tar.getmembers():
            member_path = os.path.join(path, member.name)
            if not is_within_directory(path, member_path):
                raise Exception("Attempted Path Traversal in Tar File")

        tar.extractall(path, members, numeric_owner=numeric_owner)

    data = {
        NEWS_MYSTEM_SKIPGRAM_1000_20_2015: "https://www.dropbox.com/s/0omnlgzgnjhxlmf/{filename}?dl=1".format(
            filename=NEWS_MYSTEM_SKIPGRAM_1000_20_2015),
    }

    # Perform downloading ...
    for local_name, url_link in data.items():
        utils.download(dest_file_path=__get_resource(local_name),
                       source_url=url_link)

    # Untar files ...
    for local_name in data:

        if ".tar.gz" not in local_name:
            continue

        target_filepath = __get_resource(local_name)
        with tarfile.open(target_filepath) as archive:
            safe_extract(archive, __get_embedding_dir(target_filepath))
|
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.base import Entity
|
|
2
|
-
from arekit.common.entities.str_fmt import StringEntitiesFormatter
|
|
3
|
-
from arekit.common.entities.types import OpinionEntityType
|
|
4
|
-
from arekit.contrib.utils.processing.languages.ru.cases import RussianCases
|
|
5
|
-
from arekit.contrib.utils.processing.languages.ru.number import RussianNumberType
|
|
6
|
-
from arekit.contrib.utils.processing.pos.russian import RussianPOSTagger
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class RussianEntitiesCasedFormatter(StringEntitiesFormatter):
    """Replace an entity by a Russian placeholder word in a matching form.

    The placeholder stem depends on the entity type; its ending is chosen
    from the grammatical case and number that the POS tagger reports for
    the original entity value.

    Every ending below is written with Cyrillic letters only. Latin
    look-alike letters ("y", "a", "e") would render the same but yield
    mixed-script words that differ from the proper Russian forms.
    """

    # Endings for the object/subject stems: [singular, plural].
    obj_subj_cases_map = {
        RussianCases.UNKN: ['', ''],        # unknown case
        RussianCases.NOM: ['', "ы"],        # nominative
        RussianCases.GEN: ['а', 'ов'],      # genitive
        RussianCases.DAT: ['у', 'ам'],      # dative
        RussianCases.ACC: ['', 'ы'],        # accusative
        RussianCases.INS: ['ом', 'ами'],    # instrumental
        RussianCases.ABL: ['е', 'ах']       # prepositional
    }

    # Endings for the entity stem ("сущност"): [singular, plural].
    entity_cases_map = {
        RussianCases.UNKN: ['ь', 'и'],      # unknown case
        RussianCases.NOM: ['ь', "и"],       # nominative
        RussianCases.GEN: ['и', 'ей'],      # genitive
        RussianCases.DAT: ['и', 'ям'],      # dative
        RussianCases.ACC: ['ь', 'и'],       # accusative
        RussianCases.INS: ['ью', 'ьями'],   # instrumental
        RussianCases.ABL: ['и', 'ях']       # prepositional
    }

    def __init__(self, pos_tagger):
        """
        :param pos_tagger: RussianPOSTagger, queried for the number and the
            case of entity values.
        """
        assert(isinstance(pos_tagger, RussianPOSTagger))
        self.__pos_tagger = pos_tagger

    def to_string(self, original_value, entity_type):
        """Return the placeholder word for the given entity.

        :param original_value: Entity, its ``Value`` is analysed by the
            POS tagger.
        :param entity_type: OpinionEntityType, selects the stem and the
            table of endings.
        :return: str, stem with the ending for the detected case/number.
        """
        assert(isinstance(original_value, Entity))
        assert(isinstance(entity_type, OpinionEntityType))

        template = None
        cases_map = None

        if (entity_type == OpinionEntityType.Object) or (entity_type == OpinionEntityType.SynonymObject):
            template = "объект"
            cases_map = self.obj_subj_cases_map
        elif (entity_type == OpinionEntityType.Subject) or (entity_type == OpinionEntityType.SynonymSubject):
            template = "субъект"
            cases_map = self.obj_subj_cases_map
        elif entity_type == OpinionEntityType.Other:
            template = "сущност"
            cases_map = self.entity_cases_map

        # For any other entity type both values stay None and the
        # assertions of the helper below reject the call.
        return self.__get_correct_declention(value=original_value.Value,
                                             template=template,
                                             cases_map=cases_map)

    def __get_correct_declention(self, value, template, cases_map):
        """Append to ``template`` the ending that matches ``value``.

        :param value: str, text passed to the POS tagger.
        :param template: str, stem of the placeholder word.
        :param cases_map: dict, case -> [singular ending, plural ending].
        :return: str, ``template`` followed by the selected ending.
        """
        assert(isinstance(value, str))
        assert(isinstance(template, str))
        assert(isinstance(cases_map, dict))

        num = self.__pos_tagger.get_term_number(value)
        case = self.__pos_tagger.get_term_case(value)

        assert(isinstance(num, RussianNumberType))
        assert(isinstance(case, RussianCases))

        # Unknown number is treated like singular (index 0); any other
        # number selects the plural ending (index 1).
        if num == RussianNumberType.UNKN or num == RussianNumberType.Single:
            num_int = 0
        else:
            num_int = 1

        # Cases without an entry fall back to the UNKN row.
        if case not in cases_map:
            case = RussianCases.UNKN

        return template + (cases_map[case])[num_int]
|
|
78
|
-
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.str_fmt import StringEntitiesFormatter
|
|
2
|
-
from arekit.common.entities.types import OpinionEntityType
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class RussianEntitiesFormatter(StringEntitiesFormatter):
    """Map an entity type onto a fixed Russian placeholder word."""

    def to_string(self, original_value, entity_type):
        """Return the placeholder word for ``entity_type``.

        :param original_value: not used by this formatter.
        :param entity_type: OpinionEntityType.
        :return: str for object, subject and other entity types;
            None for any remaining type.
        """
        assert isinstance(entity_type, OpinionEntityType)

        is_object = (entity_type == OpinionEntityType.Object
                     or entity_type == OpinionEntityType.SynonymObject)
        if is_object:
            return "объект"

        is_subject = (entity_type == OpinionEntityType.Subject
                      or entity_type == OpinionEntityType.SynonymSubject)
        if is_subject:
            return "субъект"

        if entity_type == OpinionEntityType.Other:
            return "сущность"

        return None
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.str_fmt import StringEntitiesFormatter
|
|
2
|
-
from arekit.common.entities.types import OpinionEntityType
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class StringEntitiesSimpleFormatter(StringEntitiesFormatter):
    """
    Provides the word that stands in for an entity when it is looked up
    in a word embedding.
    """

    def to_string(self, original_value, entity_type):
        """
        Returns: str (unicode)
            Value intended for the Word2Vec model embedding search;
            None when the entity type is not one of the handled ones.
        """
        assert isinstance(entity_type, OpinionEntityType)

        if entity_type == OpinionEntityType.Other:
            return "e"

        if entity_type == OpinionEntityType.Object or entity_type == OpinionEntityType.SynonymObject:
            return "object"

        if entity_type == OpinionEntityType.Subject or entity_type == OpinionEntityType.SynonymSubject:
            return "subject"

        return None
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.base import Entity
|
|
2
|
-
from arekit.common.entities.str_fmt import StringEntitiesFormatter
|
|
3
|
-
from arekit.common.entities.types import OpinionEntityType
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class SimpleUppercasedEntityFormatter(StringEntitiesFormatter):
    """Map an entity type onto an upper-cased mask token."""

    def to_string(self, original_value, entity_type):
        """Return the mask token for ``entity_type``.

        :param original_value: Entity or None; only its type is checked.
        :param entity_type: OpinionEntityType.
        :return: str, one of "ENTITY", "E_SUBJ", "E_OBJ".
        :raises NotImplementedError: the entity type is not handled.
        """
        assert original_value is None or isinstance(original_value, Entity)
        assert isinstance(entity_type, OpinionEntityType)

        if entity_type == OpinionEntityType.Other:
            return "ENTITY"

        if entity_type == OpinionEntityType.Subject or entity_type == OpinionEntityType.SynonymSubject:
            return "E_SUBJ"

        if entity_type == OpinionEntityType.Object or entity_type == OpinionEntityType.SynonymObject:
            return "E_OBJ"

        raise NotImplementedError()
|
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
from os.path import join
|
|
2
|
-
|
|
3
|
-
from arekit.contrib.utils.data.ext import create_reader_extension
|
|
4
|
-
from arekit.contrib.utils.data.readers.base import BaseReader
|
|
5
|
-
from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
|
|
6
|
-
from arekit.contrib.utils.io_utils.utils import filename_template
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class OpinionsIO(BaseSamplesIO):
    """Compose target file paths for opinion data inside a directory."""

    def __init__(self, target_dir, reader=None, prefix="opinion", target_extension=".tsv.gz"):
        """
        :param target_dir: directory that the composed paths point into.
        :param reader: BaseReader; despite the None default, the assertion
            below rejects None, so a reader has to be provided.
        :param prefix: str, leading part of the file name.
        :param target_extension: str, file extension; when None it is
            obtained from ``create_reader_extension(reader)``.
        """
        assert isinstance(reader, BaseReader)

        if target_extension is None:
            target_extension = create_reader_extension(reader)

        self.__target_dir = target_dir
        self.__prefix = prefix
        self.__reader = reader
        self.__target_extension = target_extension

    @property
    def Reader(self):
        """Reader instance passed to the constructor."""
        return self.__reader

    def create_target(self, data_type, data_folding):
        """Return the file path for the given data type and folding."""
        return self.__get_input_opinions_target(data_type, data_folding=data_folding)

    def __get_input_opinions_target(self, data_type, data_folding):
        """Build the path from the file-name template of the type/folding."""
        name_template = filename_template(data_type=data_type, data_folding=data_folding)
        return self.__get_filepath(out_dir=self.__target_dir,
                                   template=name_template,
                                   prefix=self.__prefix,
                                   extension=self.__target_extension)

    @staticmethod
    def __get_filepath(out_dir, template, prefix, extension):
        """Join ``out_dir`` with "<prefix>-<template><extension>"."""
        assert isinstance(template, str)
        assert isinstance(prefix, str)
        assert isinstance(extension, str)

        filename = "{prefix}-{template}{extension}".format(
            prefix=prefix, template=template, extension=extension)
        return join(out_dir, filename)
|