arekit 0.24.0__py3-none-any.whl → 0.25.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/docs/entities_grouping.py +2 -1
- arekit/common/docs/parser.py +52 -20
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +38 -1
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/service/sqlite.py +36 -0
- arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
- arekit/common/utils.py +0 -44
- arekit/contrib/utils/data/contents/opinions.py +13 -3
- arekit/contrib/utils/data/readers/sqlite.py +14 -0
- arekit/contrib/utils/data/storages/row_cache.py +6 -1
- arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
- arekit/contrib/utils/data/writers/sqlite_native.py +4 -0
- arekit/contrib/utils/io_utils/utils.py +1 -18
- arekit/contrib/utils/pipelines/items/sampling/base.py +7 -12
- arekit/contrib/utils/pipelines/items/sampling/networks.py +3 -2
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +2 -4
- arekit/contrib/utils/pipelines/items/text/translator.py +2 -1
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +6 -9
- arekit/contrib/utils/serializer.py +1 -2
- arekit-0.25.0.data/data/logo.png +0 -0
- arekit-0.25.0.dist-info/METADATA +82 -0
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/RECORD +38 -153
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
- arekit/common/docs/objects_parser.py +0 -37
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -84
- arekit/contrib/source/brat/doc.py +0 -28
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -13
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -74
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/doc.py +0 -51
- arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -59
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/docs_reader.py +0 -51
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -31
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/download.py +0 -77
- arekit/contrib/utils/io_utils/opinions.py +0 -37
- arekit/contrib/utils/io_utils/samples.py +0 -79
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -41
- arekit/contrib/utils/lexicons/relation.py +0 -42
- arekit/contrib/utils/lexicons/rusentilex.py +0 -37
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/resources.py +0 -25
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/download_data.py +0 -11
- arekit-0.24.0.dist-info/METADATA +0 -23
- /arekit/common/{text/partitioning → service}/__init__.py +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
from enum import Enum
|
|
2
|
-
from os import path
|
|
3
|
-
from os.path import basename, join
|
|
4
|
-
|
|
5
|
-
import enum
|
|
6
|
-
|
|
7
|
-
from arekit.contrib.source.sentinerel.folding.factory import SentiNERELFoldingFactory
|
|
8
|
-
from arekit.contrib.source.zip_utils import ZipArchiveUtils
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class SentiNerelVersions(Enum):
|
|
12
|
-
""" List of the supported versions of this collection
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
# Initial version.
|
|
16
|
-
V1 = "v1_0"
|
|
17
|
-
# Updated annotation within the second half of the texts. (September 2022)
|
|
18
|
-
V2 = "v2_0"
|
|
19
|
-
# Updated annotation within the first half of the texts. (October 2022)
|
|
20
|
-
# Became a source for the RuSentNE-2023 competition.
|
|
21
|
-
# https://github.com/dialogue-evaluation/RuSentNE-evaluation
|
|
22
|
-
V21 = "v2_1"
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
DEFAULT_VERSION = SentiNerelVersions.V21
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class SentiNerelIOUtils(ZipArchiveUtils):
|
|
29
|
-
|
|
30
|
-
inner_root = "sentiment_dataset"
|
|
31
|
-
|
|
32
|
-
@staticmethod
|
|
33
|
-
def get_archive_filepath(version):
|
|
34
|
-
return path.join(SentiNerelIOUtils.get_data_root(), "sentinerel-{}.zip".format(version))
|
|
35
|
-
|
|
36
|
-
@staticmethod
|
|
37
|
-
def get_annotation_innerpath(filename):
|
|
38
|
-
assert(isinstance(filename, str))
|
|
39
|
-
return path.join(SentiNerelIOUtils.inner_root, "{}.ann".format(filename))
|
|
40
|
-
|
|
41
|
-
@staticmethod
|
|
42
|
-
def get_doc_innerpath(filename):
|
|
43
|
-
assert(isinstance(filename, str))
|
|
44
|
-
return path.join(SentiNerelIOUtils.inner_root, "{}.txt".format(filename))
|
|
45
|
-
|
|
46
|
-
@staticmethod
|
|
47
|
-
def __iter_filenames_from_dataset(folder_name, version):
|
|
48
|
-
assert(isinstance(version, enum.Enum))
|
|
49
|
-
assert(isinstance(folder_name, str))
|
|
50
|
-
|
|
51
|
-
for filename in SentiNerelIOUtils.iter_filenames_from_zip(version):
|
|
52
|
-
|
|
53
|
-
extension = filename[-4:]
|
|
54
|
-
|
|
55
|
-
# Crop extension.
|
|
56
|
-
filename = filename[:-4]
|
|
57
|
-
|
|
58
|
-
if extension != ".txt":
|
|
59
|
-
continue
|
|
60
|
-
|
|
61
|
-
if not folder_name in filename:
|
|
62
|
-
continue
|
|
63
|
-
|
|
64
|
-
yield basename(filename)
|
|
65
|
-
|
|
66
|
-
# region public methods
|
|
67
|
-
|
|
68
|
-
@staticmethod
|
|
69
|
-
def iter_collection_filenames(version=DEFAULT_VERSION):
|
|
70
|
-
filenames_it = SentiNerelIOUtils.__iter_filenames_from_dataset(
|
|
71
|
-
folder_name=SentiNerelIOUtils.inner_root, version=version)
|
|
72
|
-
|
|
73
|
-
for doc_id, filename in enumerate(filenames_it):
|
|
74
|
-
yield doc_id, filename
|
|
75
|
-
|
|
76
|
-
@staticmethod
|
|
77
|
-
def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
|
|
78
|
-
""" Provides a fixed split of the dataset into
|
|
79
|
-
`test` and `training` parts:
|
|
80
|
-
https://github.com/nicolay-r/SentiNEREL-attitude-extraction
|
|
81
|
-
"""
|
|
82
|
-
return SentiNerelIOUtils.read_from_zip(
|
|
83
|
-
inner_path=join(SentiNerelIOUtils.inner_root, "split_fixed.txt"),
|
|
84
|
-
process_func=lambda f: SentiNERELFoldingFactory.create_fixed_folding(file=f, limit=docs_limit),
|
|
85
|
-
version=version)
|
|
86
|
-
|
|
87
|
-
# endregion
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.base import Label
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class OpinionBelongsTo(Label):
|
|
5
|
-
pass
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class OpinionRelatesTo(Label):
|
|
9
|
-
pass
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class NegEffectFrom(Label):
|
|
13
|
-
pass
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class NegStateFrom(Label):
|
|
17
|
-
pass
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class PosEffectFrom(Label):
|
|
21
|
-
pass
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class PosAuthorFrom(Label):
|
|
25
|
-
pass
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class NegAuthorFrom(Label):
|
|
29
|
-
pass
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
class PosStateFrom(Label):
|
|
33
|
-
pass
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class NegativeTo(Label):
|
|
37
|
-
pass
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class PositiveTo(Label):
|
|
41
|
-
pass
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class AlternativeName(Label):
|
|
45
|
-
pass
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class StateBelongsTo(Label):
|
|
49
|
-
pass
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class OriginsFrom(Label):
|
|
53
|
-
pass
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
from collections import OrderedDict
|
|
2
|
-
|
|
3
|
-
from arekit.common.labels.scaler.base import BaseLabelScaler
|
|
4
|
-
from arekit.contrib.source.sentinerel import labels
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class SentiNerelLabelScaler(BaseLabelScaler):
|
|
8
|
-
""" This is a complete label scaler of all the labels supported by NEREL dataset.
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
def __init__(self):
|
|
12
|
-
|
|
13
|
-
self.__uint_to_label_dict = OrderedDict([
|
|
14
|
-
(labels.OpinionBelongsTo(), 0),
|
|
15
|
-
(labels.OpinionRelatesTo(), 1),
|
|
16
|
-
(labels.NegEffectFrom(), 2),
|
|
17
|
-
(labels.PosEffectFrom(), 3),
|
|
18
|
-
(labels.NegStateFrom(), 4),
|
|
19
|
-
(labels.PosStateFrom(), 5),
|
|
20
|
-
(labels.NegativeTo(), 6),
|
|
21
|
-
(labels.PositiveTo(), 7),
|
|
22
|
-
(labels.StateBelongsTo(), 8),
|
|
23
|
-
(labels.PosAuthorFrom(), 9),
|
|
24
|
-
(labels.NegAuthorFrom(), 10),
|
|
25
|
-
(labels.AlternativeName(), 11),
|
|
26
|
-
(labels.OriginsFrom(), 12)
|
|
27
|
-
])
|
|
28
|
-
|
|
29
|
-
super(SentiNerelLabelScaler, self).__init__(int_dict=self.__uint_to_label_dict,
|
|
30
|
-
uint_dict=self.__uint_to_label_dict)
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.source.brat.annot import BratAnnotationParser
|
|
2
|
-
from arekit.contrib.source.brat.doc import BratDocument
|
|
3
|
-
from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
|
|
4
|
-
from arekit.contrib.source.sentinerel.entities import SentiNerelEntityCollection
|
|
5
|
-
from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils, DEFAULT_VERSION
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class SentiNerelDocReader(object):
|
|
9
|
-
|
|
10
|
-
@staticmethod
|
|
11
|
-
def read_text_relations(filename, version):
|
|
12
|
-
assert(isinstance(filename, str))
|
|
13
|
-
|
|
14
|
-
return SentiNerelIOUtils.read_from_zip(
|
|
15
|
-
inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
|
|
16
|
-
process_func=lambda input_file: [
|
|
17
|
-
relation for relation in BratAnnotationParser.parse_annotations(
|
|
18
|
-
input_file=input_file, encoding='utf-8-sig')["relations"]],
|
|
19
|
-
version=version)
|
|
20
|
-
|
|
21
|
-
@staticmethod
|
|
22
|
-
def read_document(filename, doc_id, version=DEFAULT_VERSION, entities_to_ignore=None):
|
|
23
|
-
assert(isinstance(filename, str))
|
|
24
|
-
assert(isinstance(doc_id, int))
|
|
25
|
-
|
|
26
|
-
def file_to_doc(input_file):
|
|
27
|
-
sentences = BratDocumentSentencesReader.from_file(input_file=input_file, entities=entities)
|
|
28
|
-
return BratDocument(doc_id=doc_id, sentences=sentences, text_relations=text_relations)
|
|
29
|
-
|
|
30
|
-
# TODO. #398 issue -- in some cases entities might be nested. Therefore we limit the set
|
|
31
|
-
# TODO. of the potential named entities.
|
|
32
|
-
eti = ["EFFECT_NEG", "EFFECT_POS", "ARGUMENT_NEG", "ARGUMENT_POS", "EVENT"] \
|
|
33
|
-
if entities_to_ignore is None else entities_to_ignore
|
|
34
|
-
|
|
35
|
-
entities = SentiNerelEntityCollection.read_collection(
|
|
36
|
-
filename=filename, version=version, entities_to_ignore=eti)
|
|
37
|
-
text_relations = SentiNerelDocReader.read_text_relations(filename=filename, version=version)
|
|
38
|
-
|
|
39
|
-
return SentiNerelIOUtils.read_from_zip(
|
|
40
|
-
inner_path=SentiNerelIOUtils.get_doc_innerpath(filename=filename),
|
|
41
|
-
process_func=file_to_doc,
|
|
42
|
-
version=version)
|
|
File without changes
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
from arekit.common.utils import progress_bar_defined
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def iter_synonym_groups(input_file, sep=",", desc=""):
|
|
5
|
-
""" All the synonym groups are organized in lines, separated by `sep`
|
|
6
|
-
"""
|
|
7
|
-
lines = input_file.readlines()
|
|
8
|
-
|
|
9
|
-
lines_it = progress_bar_defined(lines,
|
|
10
|
-
total=len(lines),
|
|
11
|
-
desc=desc,
|
|
12
|
-
unit="opins")
|
|
13
|
-
|
|
14
|
-
for line in lines_it:
|
|
15
|
-
|
|
16
|
-
if isinstance(line, bytes):
|
|
17
|
-
line = line.decode()
|
|
18
|
-
|
|
19
|
-
yield line.split(sep)
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
import zipfile
|
|
2
|
-
|
|
3
|
-
import enum
|
|
4
|
-
|
|
5
|
-
from arekit.common import utils
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class ZipArchiveUtils(object):
|
|
9
|
-
|
|
10
|
-
@staticmethod
|
|
11
|
-
def get_archive_filepath(version):
|
|
12
|
-
raise NotImplementedError()
|
|
13
|
-
|
|
14
|
-
@classmethod
|
|
15
|
-
def read_from_zip(cls, inner_path, process_func, version):
|
|
16
|
-
"""
|
|
17
|
-
process_func:
|
|
18
|
-
func which receives a file reader
|
|
19
|
-
"""
|
|
20
|
-
assert(isinstance(inner_path, str))
|
|
21
|
-
assert(callable(process_func))
|
|
22
|
-
assert(isinstance(version, enum.Enum))
|
|
23
|
-
|
|
24
|
-
with zipfile.ZipFile(cls.get_archive_filepath(version.value), "r") as zip_ref:
|
|
25
|
-
with zip_ref.open(inner_path, mode='r') as c_file:
|
|
26
|
-
return process_func(c_file)
|
|
27
|
-
|
|
28
|
-
@classmethod
|
|
29
|
-
def iter_from_zip(cls, inner_path, process_func, version):
|
|
30
|
-
assert(isinstance(inner_path, str))
|
|
31
|
-
assert(callable(process_func))
|
|
32
|
-
assert(isinstance(version, enum.Enum))
|
|
33
|
-
|
|
34
|
-
with zipfile.ZipFile(cls.get_archive_filepath(version.value), "r") as zip_ref:
|
|
35
|
-
with zip_ref.open(inner_path, mode='r') as c_file:
|
|
36
|
-
for result in process_func(c_file):
|
|
37
|
-
yield result
|
|
38
|
-
|
|
39
|
-
@classmethod
|
|
40
|
-
def iter_filenames_from_zip(cls, version):
|
|
41
|
-
assert(isinstance(version, enum.Enum))
|
|
42
|
-
with zipfile.ZipFile(cls.get_archive_filepath(version.value), "r") as zip_ref:
|
|
43
|
-
return iter(zip_ref.namelist())
|
|
44
|
-
|
|
45
|
-
@staticmethod
|
|
46
|
-
def get_data_root():
|
|
47
|
-
return utils.get_default_download_dir()
|
|
File without changes
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
from arekit.common.frames.connotations.provider import FrameConnotationProvider
|
|
2
|
-
from arekit.contrib.source.rusentiframes.collection import RuSentiFramesCollection
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class RuSentiFramesConnotationProvider(FrameConnotationProvider):
|
|
6
|
-
""" This is a provider based on A0->A1 label type of RuSentiFrames collection.
|
|
7
|
-
For more details, check out the related collection at:
|
|
8
|
-
https://github.com/nicolay-r/RuSentiFrames
|
|
9
|
-
|
|
10
|
-
Papers:
|
|
11
|
-
[1] Natalia Loukachevitch, Nicolay Rusnachenko: Sentiment Frames
|
|
12
|
-
for Attitude Extraction in Russian, 2020
|
|
13
|
-
[2] Distant Supervision for Sentiment Attitude Extraction, 2019
|
|
14
|
-
"""
|
|
15
|
-
|
|
16
|
-
def __init__(self, collection):
|
|
17
|
-
assert(isinstance(collection, RuSentiFramesCollection))
|
|
18
|
-
self.__collection = collection
|
|
19
|
-
|
|
20
|
-
def try_provide(self, frame_id):
|
|
21
|
-
return self.__collection.try_get_frame_polarity(frame_id=frame_id,
|
|
22
|
-
role_src='a0',
|
|
23
|
-
role_dest='a1')
|
arekit/contrib/utils/download.py
DELETED
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import tarfile
|
|
3
|
-
from os.path import join, exists
|
|
4
|
-
|
|
5
|
-
from arekit.common import utils
|
|
6
|
-
from arekit.contrib.utils.np_utils.embedding import NpzEmbeddingHelper
|
|
7
|
-
from arekit.contrib.utils.np_utils.vocab import VocabRepositoryUtils
|
|
8
|
-
|
|
9
|
-
NEWS_MYSTEM_SKIPGRAM_1000_20_2015 = "news_mystem_skipgram_1000_20_2015.tar.gz"
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def __get_resource(local_name, check_existance=False, download_if_missed=False):
|
|
13
|
-
assert (isinstance(local_name, str))
|
|
14
|
-
filepath = join(utils.get_default_download_dir(), local_name)
|
|
15
|
-
|
|
16
|
-
if check_existance and not exists(filepath):
|
|
17
|
-
if download_if_missed:
|
|
18
|
-
download()
|
|
19
|
-
# We try to get the resource again but won't attempt to download it again.
|
|
20
|
-
__get_resource(local_name, check_existance=check_existance, download_if_missed=False)
|
|
21
|
-
else:
|
|
22
|
-
raise Exception("Resource could not be found: {}".format(filepath))
|
|
23
|
-
|
|
24
|
-
return filepath
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def __get_embedding_dir(filepath):
|
|
28
|
-
return filepath.replace(".tar.gz", "")
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def load_embedding_and_vocab(local_name, check_existance=False, download_if_missed=False):
|
|
32
|
-
tar_gz_archive = __get_resource(local_name, check_existance=check_existance,
|
|
33
|
-
download_if_missed=download_if_missed)
|
|
34
|
-
target_dir = __get_embedding_dir(tar_gz_archive)
|
|
35
|
-
embedding = NpzEmbeddingHelper.load_embedding(os.path.join(target_dir, "embedding.npz"))
|
|
36
|
-
vocab = VocabRepositoryUtils.load(os.path.join(target_dir, "vocab.txt"))
|
|
37
|
-
return embedding, vocab
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def download():
|
|
41
|
-
data = {
|
|
42
|
-
NEWS_MYSTEM_SKIPGRAM_1000_20_2015: "https://www.dropbox.com/s/0omnlgzgnjhxlmf/{filename}?dl=1".format(
|
|
43
|
-
filename=NEWS_MYSTEM_SKIPGRAM_1000_20_2015),
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
# Perform downloading ...
|
|
47
|
-
for local_name, url_link in data.items():
|
|
48
|
-
utils.download(dest_file_path=__get_resource(local_name),
|
|
49
|
-
source_url=url_link)
|
|
50
|
-
|
|
51
|
-
# Untar files ...
|
|
52
|
-
for local_name in data.keys():
|
|
53
|
-
|
|
54
|
-
if ".tar.gz" not in local_name:
|
|
55
|
-
continue
|
|
56
|
-
|
|
57
|
-
target_filepath = __get_resource(local_name)
|
|
58
|
-
with tarfile.open(target_filepath) as file:
|
|
59
|
-
def is_within_directory(directory, target):
|
|
60
|
-
|
|
61
|
-
abs_directory = os.path.abspath(directory)
|
|
62
|
-
abs_target = os.path.abspath(target)
|
|
63
|
-
|
|
64
|
-
prefix = os.path.commonprefix([abs_directory, abs_target])
|
|
65
|
-
|
|
66
|
-
return prefix == abs_directory
|
|
67
|
-
|
|
68
|
-
def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
|
|
69
|
-
|
|
70
|
-
for member in tar.getmembers():
|
|
71
|
-
member_path = os.path.join(path, member.name)
|
|
72
|
-
if not is_within_directory(path, member_path):
|
|
73
|
-
raise Exception("Attempted Path Traversal in Tar File")
|
|
74
|
-
|
|
75
|
-
tar.extractall(path, members, numeric_owner=numeric_owner)
|
|
76
|
-
|
|
77
|
-
safe_extract(file, __get_embedding_dir(target_filepath))
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
from os.path import join
|
|
2
|
-
|
|
3
|
-
from arekit.contrib.utils.data.readers.base import BaseReader
|
|
4
|
-
from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
|
|
5
|
-
from arekit.contrib.utils.io_utils.utils import filename_template
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class OpinionsIO(BaseSamplesIO):
|
|
9
|
-
|
|
10
|
-
def __init__(self, target_dir, reader=None, prefix="opinion"):
|
|
11
|
-
assert(isinstance(reader, BaseReader))
|
|
12
|
-
self.__target_dir = target_dir
|
|
13
|
-
self.__prefix = prefix
|
|
14
|
-
self.__reader = reader
|
|
15
|
-
self.__target_extension = reader.extension()
|
|
16
|
-
|
|
17
|
-
@property
|
|
18
|
-
def Reader(self):
|
|
19
|
-
return self.__reader
|
|
20
|
-
|
|
21
|
-
def create_target(self, data_type):
|
|
22
|
-
return self.__get_input_opinions_target(data_type)
|
|
23
|
-
|
|
24
|
-
def __get_input_opinions_target(self, data_type):
|
|
25
|
-
template = filename_template(data_type=data_type)
|
|
26
|
-
return self.__get_filepath(out_dir=self.__target_dir,
|
|
27
|
-
template=template,
|
|
28
|
-
prefix=self.__prefix,
|
|
29
|
-
extension=self.__target_extension)
|
|
30
|
-
|
|
31
|
-
@staticmethod
|
|
32
|
-
def __get_filepath(out_dir, template, prefix, extension):
|
|
33
|
-
assert(isinstance(template, str))
|
|
34
|
-
assert(isinstance(prefix, str))
|
|
35
|
-
assert(isinstance(extension, str))
|
|
36
|
-
return join(out_dir, "{prefix}-{template}{extension}".format(
|
|
37
|
-
prefix=prefix, template=template, extension=extension))
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
from os.path import join
|
|
3
|
-
|
|
4
|
-
from arekit.contrib.utils.data.readers.base import BaseReader
|
|
5
|
-
from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
|
|
6
|
-
from arekit.contrib.utils.data.writers.base import BaseWriter
|
|
7
|
-
from arekit.contrib.utils.io_utils.utils import filename_template, check_targets_existence
|
|
8
|
-
|
|
9
|
-
logger = logging.getLogger(__name__)
|
|
10
|
-
logging.basicConfig(level=logging.INFO)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class SamplesIO(BaseSamplesIO):
|
|
14
|
-
""" Samples default IO utils for samples.
|
|
15
|
-
A sample is a part of the text which includes a pair of attitude participants.
|
|
16
|
-
This class provides a saver and a loader for such entries, dubbed samples.
|
|
17
|
-
Samples are required for machine learning training/inference.
|
|
18
|
-
"""
|
|
19
|
-
|
|
20
|
-
def __init__(self, target_dir, writer=None, reader=None, prefix="sample"):
|
|
21
|
-
assert(isinstance(target_dir, str))
|
|
22
|
-
assert(isinstance(prefix, str))
|
|
23
|
-
assert(isinstance(writer, BaseWriter) or writer is None)
|
|
24
|
-
assert(isinstance(reader, BaseReader) or reader is None)
|
|
25
|
-
self.__target_dir = target_dir
|
|
26
|
-
self.__prefix = prefix
|
|
27
|
-
self.__writer = writer
|
|
28
|
-
self.__reader = reader
|
|
29
|
-
|
|
30
|
-
self.__target_extension = None
|
|
31
|
-
if writer is not None:
|
|
32
|
-
self.__target_extension = writer.extension()
|
|
33
|
-
elif reader is not None:
|
|
34
|
-
self.__target_extension = reader.extension()
|
|
35
|
-
|
|
36
|
-
# region public methods
|
|
37
|
-
|
|
38
|
-
@property
|
|
39
|
-
def Prefix(self):
|
|
40
|
-
return self.__prefix
|
|
41
|
-
|
|
42
|
-
@property
|
|
43
|
-
def Reader(self):
|
|
44
|
-
return self.__reader
|
|
45
|
-
|
|
46
|
-
@property
|
|
47
|
-
def Writer(self):
|
|
48
|
-
return self.__writer
|
|
49
|
-
|
|
50
|
-
def create_target(self, data_type):
|
|
51
|
-
return self.__get_input_sample_target(data_type)
|
|
52
|
-
|
|
53
|
-
def check_targets_existed(self, data_types_iter):
|
|
54
|
-
for data_type in data_types_iter:
|
|
55
|
-
|
|
56
|
-
targets = [
|
|
57
|
-
self.__get_input_sample_target(data_type=data_type),
|
|
58
|
-
]
|
|
59
|
-
|
|
60
|
-
if not check_targets_existence(targets=targets):
|
|
61
|
-
return False
|
|
62
|
-
return True
|
|
63
|
-
|
|
64
|
-
# endregion
|
|
65
|
-
|
|
66
|
-
def __get_input_sample_target(self, data_type):
|
|
67
|
-
template = filename_template(data_type=data_type)
|
|
68
|
-
return self.__get_filepath(out_dir=self.__target_dir,
|
|
69
|
-
template=template,
|
|
70
|
-
prefix=self.__prefix,
|
|
71
|
-
extension=self.__target_extension)
|
|
72
|
-
|
|
73
|
-
@staticmethod
|
|
74
|
-
def __get_filepath(out_dir, template, prefix, extension):
|
|
75
|
-
assert(isinstance(template, str))
|
|
76
|
-
assert(isinstance(prefix, str))
|
|
77
|
-
assert(isinstance(extension, str))
|
|
78
|
-
return join(out_dir, "{prefix}-{template}{extension}".format(
|
|
79
|
-
prefix=prefix, template=template, extension=extension))
|
|
File without changes
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class Lexicon(object):
|
|
5
|
-
|
|
6
|
-
@property
|
|
7
|
-
def ToneKey(self):
|
|
8
|
-
return 'tone'
|
|
9
|
-
|
|
10
|
-
@property
|
|
11
|
-
def TermKey(self):
|
|
12
|
-
return 'term'
|
|
13
|
-
|
|
14
|
-
def __init__(self, dataframe):
|
|
15
|
-
self.__lexicon_df = dataframe
|
|
16
|
-
|
|
17
|
-
@classmethod
|
|
18
|
-
def load(cls, filepath, separator=','):
|
|
19
|
-
reader = PandasCsvReader(compression=None, sep=separator)
|
|
20
|
-
return cls(reader.read(filepath))
|
|
21
|
-
|
|
22
|
-
def get_score(self, lemma):
|
|
23
|
-
assert(type(lemma) == str)
|
|
24
|
-
s = self.__lexicon_df[lemma.encode('utf-8') == self.__lexicon_df[self.TermKey]]
|
|
25
|
-
return s[self.ToneKey].values[0] if len(s) > 0 else 0
|
|
26
|
-
|
|
27
|
-
def has_term(self, term):
|
|
28
|
-
assert(type(term) == str)
|
|
29
|
-
s = self.__lexicon_df[term.encode('utf-8') == self.__lexicon_df[self.TermKey]]
|
|
30
|
-
return len(s) > 0
|
|
31
|
-
|
|
32
|
-
def __iter__(self):
|
|
33
|
-
for term in self.__lexicon_df[self.TermKey]:
|
|
34
|
-
yield term
|
|
35
|
-
|
|
36
|
-
def __contains__(self, item):
|
|
37
|
-
assert(isinstance(item, str))
|
|
38
|
-
result = self.__lexicon_df[self.__lexicon_df[self.TermKey] == item.encode('utf-8')]
|
|
39
|
-
return len(result) > 0
|
|
40
|
-
|
|
41
|
-
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class RelationLexicon(object):
|
|
5
|
-
|
|
6
|
-
def __init__(self, dataframe):
|
|
7
|
-
self.__check(dataframe)
|
|
8
|
-
self.__lexicon = dataframe
|
|
9
|
-
|
|
10
|
-
@classmethod
|
|
11
|
-
def load(cls, filepath, separator=','):
|
|
12
|
-
reader = PandasCsvReader(compression=None, sep=separator)
|
|
13
|
-
return cls(reader.read(filepath))
|
|
14
|
-
|
|
15
|
-
@staticmethod
|
|
16
|
-
def __check(df):
|
|
17
|
-
for index in df.index:
|
|
18
|
-
relation = df.loc[index][0]
|
|
19
|
-
assert(len(relation.split('<->')) == 2)
|
|
20
|
-
|
|
21
|
-
@staticmethod
|
|
22
|
-
def __create_key(l, r):
|
|
23
|
-
assert(type(l) == str)
|
|
24
|
-
assert(type(r) == str)
|
|
25
|
-
return '<->'.join([l, r])
|
|
26
|
-
|
|
27
|
-
def get_score(self, left, right):
|
|
28
|
-
assert(type(left) == str)
|
|
29
|
-
assert(type(right) == str)
|
|
30
|
-
|
|
31
|
-
lr_key = self.__create_key(left, right)
|
|
32
|
-
rl_key = self.__create_key(right, left)
|
|
33
|
-
|
|
34
|
-
lr_score = self.__lexicon[lr_key == self.__lexicon['relation']]
|
|
35
|
-
rl_score = self.__lexicon[rl_key == self.__lexicon['relation']]
|
|
36
|
-
|
|
37
|
-
if len(lr_score) > 0:
|
|
38
|
-
return lr_score['tone'].values[0]
|
|
39
|
-
if len(rl_score) > 0:
|
|
40
|
-
return rl_score['tone'].values[0]
|
|
41
|
-
|
|
42
|
-
return None
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
import importlib
|
|
2
|
-
import zipfile
|
|
3
|
-
from os import path
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from arekit.contrib.source.zip_utils import ZipArchiveUtils
|
|
7
|
-
from arekit.contrib.utils.lexicons.lexicon import Lexicon
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class RuSentiLexLexicon(Lexicon):
|
|
11
|
-
"""
|
|
12
|
-
RuSentiLex Lexicon wrapper for csv file stored in /data folder.
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
__INNER_PATH = 'rusentilex.csv'
|
|
16
|
-
|
|
17
|
-
@property
|
|
18
|
-
def ToneKey(self):
|
|
19
|
-
return 'tone'
|
|
20
|
-
|
|
21
|
-
@property
|
|
22
|
-
def TermKey(self):
|
|
23
|
-
return 'term'
|
|
24
|
-
|
|
25
|
-
@staticmethod
|
|
26
|
-
def __get_archive_filepath():
|
|
27
|
-
return path.join(ZipArchiveUtils.get_data_root(), "rusentilex.zip")
|
|
28
|
-
|
|
29
|
-
@classmethod
|
|
30
|
-
def from_zip(cls):
|
|
31
|
-
""" Using Pandas API to read lexicon.
|
|
32
|
-
"""
|
|
33
|
-
pd = importlib.import_module("pandas")
|
|
34
|
-
with zipfile.ZipFile(cls.__get_archive_filepath(), "r") as zip_ref:
|
|
35
|
-
with zip_ref.open(cls.__INNER_PATH, mode='r') as csv_file:
|
|
36
|
-
df = pd.read_csv(csv_file, sep=',')
|
|
37
|
-
return cls(df)
|
|
File without changes
|