arekit 0.24.0__py3-none-any.whl → 0.25.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/docs/entities_grouping.py +2 -1
- arekit/common/docs/parser.py +52 -20
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +38 -1
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/service/sqlite.py +36 -0
- arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
- arekit/common/utils.py +0 -44
- arekit/contrib/utils/data/contents/opinions.py +13 -3
- arekit/contrib/utils/data/readers/sqlite.py +14 -0
- arekit/contrib/utils/data/storages/row_cache.py +6 -1
- arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
- arekit/contrib/utils/data/writers/sqlite_native.py +4 -0
- arekit/contrib/utils/io_utils/utils.py +1 -18
- arekit/contrib/utils/pipelines/items/sampling/base.py +7 -12
- arekit/contrib/utils/pipelines/items/sampling/networks.py +3 -2
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +2 -4
- arekit/contrib/utils/pipelines/items/text/translator.py +2 -1
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +6 -9
- arekit/contrib/utils/serializer.py +1 -2
- arekit-0.25.0.data/data/logo.png +0 -0
- arekit-0.25.0.dist-info/METADATA +82 -0
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/RECORD +38 -153
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
- arekit/common/docs/objects_parser.py +0 -37
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -84
- arekit/contrib/source/brat/doc.py +0 -28
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -13
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -74
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/doc.py +0 -51
- arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -59
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/docs_reader.py +0 -51
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -31
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/download.py +0 -77
- arekit/contrib/utils/io_utils/opinions.py +0 -37
- arekit/contrib/utils/io_utils/samples.py +0 -79
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -41
- arekit/contrib/utils/lexicons/relation.py +0 -42
- arekit/contrib/utils/lexicons/rusentilex.py +0 -37
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/resources.py +0 -25
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/download_data.py +0 -11
- arekit-0.24.0.dist-info/METADATA +0 -23
- /arekit/common/{text/partitioning → service}/__init__.py +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
class FrameRole(object):
    """ Read-only pair of a role source string and its description.

        source: str
        description: str
    """

    def __init__(self, source, description):
        # Both arguments are validated in declaration order before anything is stored.
        for value in (source, description):
            assert(isinstance(value, str))

        self.__source, self.__description = source, description

    @property
    def Source(self):
        """ The `source` string passed on construction. """
        return self.__source

    @property
    def Description(self):
        """ The `description` string passed on construction. """
        return self.__description
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.base import Label
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class FrameState(object):
    """ Read-only triple: role name, the label assigned to it and a probability.

        role: str
        label: Label
        prob: float
    """

    def __init__(self, role, label, prob):
        # `Label` here is the imported class: the `Label` property declared below
        # lives in the class namespace, which is not visible from a method body.
        assert(isinstance(role, str))
        assert(isinstance(label, Label))
        assert(isinstance(prob, float))

        self.__role, self.__label, self.__prob = role, label, prob

    @property
    def Role(self):
        """ The `role` string passed on construction. """
        return self.__role

    @property
    def Label(self):
        """ The `label` instance passed on construction. """
        return self.__label

    @property
    def Prob(self):
        """ The `prob` value passed on construction. """
        return self.__prob
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
from enum import Enum
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class RuSentiFramesVersions(Enum):
    """ Versions of the RuSentiFrames resource; every member value is the
        string name of that version.
    """

    # Version 1.0.
    # Paper: Distant Supervision for Sentiment Attitude Extraction (RANLP-2019)
    # Authors: Nicolay Rusnachenko, Natalia Loukachevitch, Elena Tutubalina
    # https://www.aclweb.org/anthology/R19-1118/
    # https://github.com/nicolay-r/RuSentiFrames/tree/v1.0
    V10 = "v1_0"

    # Version 2.0.
    # Paper: Sentiment Frames for Attitude Extraction in Russian (DIALOG-2020)
    # Authors: Natalia Loukachevitch, Nicolay Rusnachenko
    # https://github.com/nicolay-r/RuSentiFrames/tree/v2.0
    V20 = "v2_0"
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class RuSentiFramesVersionsService:
    """ Conversions between RuSentiFramesVersions members and their string names. """

    @staticmethod
    def __iter_supported_types():
        """ Iterate over every member of RuSentiFramesVersions in declaration order. """
        return iter(RuSentiFramesVersions)

    @staticmethod
    def get_name_by_type(version_type):
        """ version_type: RuSentiFramesVersions
            returns: str
                the value of the given enum member.
        """
        assert(isinstance(version_type, RuSentiFramesVersions))
        return version_type.value

    @staticmethod
    def get_type_by_name(name):
        """ name: str
                expected to be equal to the value of one of the enum members.
            returns: RuSentiFramesVersions
            raises: Exception
                when no member has the given value.
        """
        for version_type in RuSentiFramesVersionsService.__iter_supported_types():
            if version_type.value == name:
                return version_type

        # The message used to read "was hot found".
        raise Exception("RuSentiFrames version by name `{}` was not found!".format(name))

    @staticmethod
    def iter_supported_names():
        """ Yield the string value of every supported version. """
        for version_type in RuSentiFramesVersionsService.__iter_supported_types():
            yield version_type.value
|
|
File without changes
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
from arekit.common.synonyms.base import SynonymsCollection
|
|
2
|
-
from arekit.contrib.source.brat.doc import BratDocument
|
|
3
|
-
from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
|
|
4
|
-
from arekit.contrib.source.rusentrel.entities import RuSentRelDocumentEntityCollection
|
|
5
|
-
from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions, RuSentRelIOUtils
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class RuSentRelDocumentsReader(object):
    """ Builds BratDocument instances for documents of the RuSentRel collection. """

    # region class methods

    @staticmethod
    def hide_first_entry(line, entry, hide_with=" "):
        """ Overwrite the first occurrence of `entry` in `line` with `hide_with`
            repeated once per character of `entry`.
            The line is returned unchanged when it does not contain `entry`.
        """
        return line.replace(entry, hide_with * len(entry), 1)

    @staticmethod
    def read_document(doc_id, synonyms, version=RuSentRelVersions.V11, target_doc_id=None):
        """ doc_id: int
                index of the document, used to locate both its text and its entities.
            synonyms: SynonymsCollection
            version: RuSentRelVersions
            target_doc_id: int or None
                when given, it is the id assigned to the resulting document instead of `doc_id`.
        """
        assert(isinstance(synonyms, SynonymsCollection))
        assert(isinstance(version, RuSentRelVersions))
        assert(target_doc_id is None or isinstance(target_doc_id, int))

        # Entities are read first: the sentence reader below needs them.
        entities = RuSentRelDocumentEntityCollection.read_collection(
            doc_id=doc_id,
            synonyms=synonyms,
            version=version)

        result_doc_id = doc_id if target_doc_id is None else target_doc_id

        def hide_author_entry(line):
            return RuSentRelDocumentsReader.hide_first_entry(line, entry="{Author, Unknown}")

        def is_skipped_entity(entity):
            return entity.Value in ['author', 'unknown']

        def file_to_doc(input_file):
            sentences = BratDocumentSentencesReader.from_file(
                input_file=input_file,
                entities=entities,
                line_handler=hide_author_entry,
                skip_entity_func=is_skipped_entity)

            return BratDocument(doc_id=result_doc_id,
                                sentences=sentences,
                                text_relations=[])

        inner_path = RuSentRelIOUtils.get_doc_innerpath(index=doc_id, version=version)

        return RuSentRelIOUtils.read_from_zip(
            inner_path=inner_path,
            process_func=file_to_doc,
            version=version)
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.collection import EntityCollection
|
|
2
|
-
from arekit.common.synonyms.base import SynonymsCollection
|
|
3
|
-
from arekit.contrib.source.brat.annot import BratAnnotationParser
|
|
4
|
-
from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions, RuSentRelIOUtils
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class RuSentRelDocumentEntityCollection(EntityCollection):
    """ Entities of a single RuSentRel document, sorted by `IndexBegin` on construction. """

    def __init__(self, entities, value_to_group_id_func):
        super(RuSentRelDocumentEntityCollection, self).__init__(
            entities=entities,
            value_to_group_id_func=value_to_group_id_func)

        def index_begin(entity):
            return entity.IndexBegin

        self._sort_entities(key=index_begin)

    @classmethod
    def read_collection(cls, doc_id, synonyms, version=RuSentRelVersions.V11):
        """ doc_id: int
                index of the document whose annotation file is read.
            synonyms: SynonymsCollection
                its `get_synonym_group_index` becomes the value-to-group function.
            version: RuSentRelVersions
        """
        assert (isinstance(synonyms, SynonymsCollection))
        assert (isinstance(doc_id, int))

        def from_annotation_file(input_file):
            annotations = BratAnnotationParser.parse_annotations(input_file)
            return cls(entities=annotations["entities"],
                       value_to_group_id_func=synonyms.get_synonym_group_index)

        inner_path = RuSentRelIOUtils.get_entity_innerpath(index=doc_id, version=version)

        return RuSentRelIOUtils.read_from_zip(
            inner_path=inner_path,
            process_func=from_annotation_file,
            version=version)
|
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
from os import path
|
|
2
|
-
|
|
3
|
-
from enum import Enum
|
|
4
|
-
|
|
5
|
-
from arekit.contrib.source.zip_utils import ZipArchiveUtils
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class RuSentRelVersions(Enum):
    """ Versions of the RuSentRel collection.

        Original collection repository: https://github.com/nicolay-r/RuSentRel
        Paper: https://arxiv.org/abs/1808.08932
    """

    V11 = "v1_1"
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class RuSentRelIOUtils(ZipArchiveUtils):
    """ Archive location, inner paths and document indices of the RuSentRel collection.

        Documents are addressed by an integer index. Whether a document belongs
        to the train or to the test part is derived from the filenames listed
        in the archive.
    """

    TEST_FOLDER = "test"
    TRAIN_FOLDER = "train"
    ETALON_FOLDER = "etalon"

    @staticmethod
    def get_archive_filepath(version):
        """ version: str
                version name that is inserted into the archive filename.
            returns: str
                path of `rusentrel-<version>.zip` inside the data root.
        """
        # `assert(version, str)` checked a two-element tuple, which is always
        # truthy, so the assertion could never fail.
        # NOTE(review): assumes callers pass the string value of the version
        # (as the filename format suggests) -- confirm against ZipArchiveUtils.
        assert(isinstance(version, str))
        return path.join(RuSentRelIOUtils.get_data_root(), "rusentrel-{}.zip".format(version))

    # region internal methods

    @staticmethod
    def get_sentiment_opin_filepath(index, version, prefix='art'):
        """ Inner path of the `<prefix><index>.opin.txt` file of the document.
            For documents of the test part the etalon folder is used.
        """
        root = RuSentRelIOUtils.__get_root_by_index(index, version=version, keep_etalon=True)
        return path.join(root, "{prefix}{index}.opin.txt".format(prefix=prefix, index=index))

    @staticmethod
    def get_entity_innerpath(index, version):
        """ Inner path of the `art<index>.ann` annotation file of the document. """
        assert(isinstance(index, int))
        assert(isinstance(version, RuSentRelVersions))
        inner_root = RuSentRelIOUtils.__get_root_by_index(doc_id=index, version=version)
        return path.join(inner_root, "art{}.ann".format(index))

    @staticmethod
    def get_doc_innerpath(index, version):
        """ Inner path of the `art<index>.txt` text file of the document. """
        assert(isinstance(index, int))
        assert(isinstance(version, RuSentRelVersions))
        inner_root = RuSentRelIOUtils.__get_root_by_index(doc_id=index, version=version)
        return path.join(inner_root, "art{}.txt".format(index))

    @staticmethod
    def get_synonyms_innerpath():
        """ Inner path of the synonyms file. """
        return "synonyms.txt"

    # endregion

    @staticmethod
    def __get_root_by_index(doc_id, version, keep_etalon=False):
        """ Folder name the document belongs to.

            returns: str
                TRAIN_FOLDER when `doc_id` is not among the test indices;
                otherwise ETALON_FOLDER if `keep_etalon` else TEST_FOLDER.
        """
        assert(RuSentRelIOUtils.__is_supported(version))
        assert(isinstance(version, RuSentRelVersions))
        assert(isinstance(doc_id, int))

        test_indices = set(RuSentRelIOUtils.__iter_indicies_from_dataset(version, RuSentRelIOUtils.TEST_FOLDER))

        if doc_id not in test_indices:
            return RuSentRelIOUtils.TRAIN_FOLDER

        return RuSentRelIOUtils.ETALON_FOLDER if keep_etalon else RuSentRelIOUtils.TEST_FOLDER

    @staticmethod
    def __is_supported(version):
        """ True only for RuSentRelVersions.V11. """
        assert(isinstance(version, RuSentRelVersions))
        return version == RuSentRelVersions.V11

    @staticmethod
    def __number_from_string(s):
        """ Integer composed of ALL digit characters of `s` taken in order,
            or None when `s` has no digit characters.
        """
        digit_chars = [char for char in s if char.isdigit()]

        if not digit_chars:
            return None

        return int("".join(digit_chars))

    @staticmethod
    def __iter_indicies_from_dataset(version, folder_name):
        """ Yield every distinct document index found in archive filenames
            that contain `folder_name` as a substring.
        """
        assert(isinstance(folder_name, str))
        assert(RuSentRelIOUtils.__is_supported(version))

        used = set()

        for filename in RuSentRelIOUtils.iter_filenames_from_zip(version):
            if folder_name not in filename:
                continue

            index = RuSentRelIOUtils.__number_from_string(filename)

            # Skip names without digits and indices that were already reported.
            if index is None or index in used:
                continue

            used.add(index)

            yield index

    # region public methods

    @staticmethod
    def iter_test_indices(version):
        """ Yield indices of the documents located in the `test/` folder. """
        assert(RuSentRelIOUtils.__is_supported(version))
        indices_iter = RuSentRelIOUtils.__iter_indicies_from_dataset(
            version=version, folder_name="{}/".format(RuSentRelIOUtils.TEST_FOLDER))
        for index in indices_iter:
            yield index

    @staticmethod
    def iter_train_indices(version):
        """ Yield indices of the documents located in the `train/` folder. """
        assert(RuSentRelIOUtils.__is_supported(version))
        indices_iter = RuSentRelIOUtils.__iter_indicies_from_dataset(
            version=version, folder_name="{}/".format(RuSentRelIOUtils.TRAIN_FOLDER))
        for index in indices_iter:
            yield index

    @staticmethod
    def iter_collection_indices(version):
        """ Yield train indices first, then test indices. """
        assert(RuSentRelIOUtils.__is_supported(version))
        for index in RuSentRelIOUtils.iter_train_indices(version):
            yield index
        for index in RuSentRelIOUtils.iter_test_indices(version):
            yield index

    # endregion
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.base import Label
|
|
2
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
3
|
-
from arekit.contrib.source.rusentrel.const import NEG_LABEL_STR, POS_LABEL_STR
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class RuSentRelLabelsFormatter(StringLabelsFormatter):
    """ String labels formatter with two entries:
        NEG_LABEL_STR -> neg_label_type and POS_LABEL_STR -> pos_label_type.
    """

    def __init__(self, pos_label_type, neg_label_type):
        """ pos_label_type: subclass of Label
            neg_label_type: subclass of Label
        """
        for label_type in (pos_label_type, neg_label_type):
            assert(issubclass(label_type, Label))

        super(RuSentRelLabelsFormatter, self).__init__(stol={
            NEG_LABEL_STR: neg_label_type,
            POS_LABEL_STR: pos_label_type,
        })
|
|
File without changes
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.source.rusentrel.const import POS_LABEL_STR, NEG_LABEL_STR
|
|
2
|
-
from arekit.contrib.source.rusentrel.io_utils import RuSentRelIOUtils, RuSentRelVersions
|
|
3
|
-
from arekit.contrib.source.rusentrel.labels_fmt import RuSentRelLabelsFormatter
|
|
4
|
-
from arekit.contrib.source.rusentrel.opinions.provider import RuSentRelOpinionCollectionProvider
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class RuSentRelOpinions:
    """
    Collection of sentiment opinions between entities
    """

    @staticmethod
    def iter_from_doc(doc_id, labels_fmt, version=RuSentRelVersions.V11):
        """ doc_id:
                index of the document whose opinions file is read.
            labels_fmt: RuSentRelLabelsFormatter
                has to support both POS_LABEL_STR and NEG_LABEL_STR.
            version: RuSentRelVersions enum
        """
        assert(isinstance(version, RuSentRelVersions))
        assert(isinstance(labels_fmt, RuSentRelLabelsFormatter))
        for label_str in (POS_LABEL_STR, NEG_LABEL_STR):
            assert(labels_fmt.supports_value(label_str))

        def read_opinions(input_file):
            # Lines with a label the formatter does not support raise an error.
            return RuSentRelOpinionCollectionProvider._iter_opinions_from_file(
                input_file=input_file,
                labels_formatter=labels_fmt,
                error_on_non_supported=True)

        opin_path = RuSentRelIOUtils.get_sentiment_opin_filepath(index=doc_id, version=version)

        return RuSentRelIOUtils.iter_from_zip(
            inner_path=opin_path,
            process_func=read_opinions,
            version=version)
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
2
|
-
from arekit.common.opinions.base import Opinion
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class OpinionConverter(object):
    """ Opinion type <-> string Converter.
    """

    @staticmethod
    def try_from_string(line, labels_formatter):
        """ Parse a comma-separated `source, target, label[, ...]` line.

            returns: Opinion or None
                None when the formatter does not support the label string.
        """
        assert(isinstance(line, str))

        fields = [field.strip() for field in line.strip().split(',')]
        assert (len(fields) >= 3)

        # Only the first three fields are used; anything after them is ignored.
        source_value, target_value, str_label = fields[:3]

        if labels_formatter.supports_value(str_label):
            return Opinion(source_value=source_value,
                           target_value=target_value,
                           label=labels_formatter.str_to_label(str_label))

        return None

    @staticmethod
    def try_to_string(opinion, labels_formatter):
        """ Format the opinion as `source, target, label, current`.

            returns: str or None
                None when the formatter does not support the opinion label.
        """
        assert(isinstance(opinion, Opinion))
        assert(isinstance(labels_formatter, StringLabelsFormatter))

        if labels_formatter.supports_label(opinion.Label):
            str_label = labels_formatter.label_to_str(opinion.Label)
            return "{}, {}, {}, current".format(opinion.SourceValue, opinion.TargetValue, str_label)

        return None
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
from arekit.common.opinions.provider import OpinionCollectionsProvider
|
|
2
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
3
|
-
from arekit.contrib.source.rusentrel.opinions.converter import OpinionConverter
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class RuSentRelOpinionCollectionProvider(OpinionCollectionsProvider):
    """ Reads opinions written one per line in the
        `source, target, label, ...` format handled by OpinionConverter.
    """

    @staticmethod
    def _iter_opinions_from_file(input_file, labels_formatter, error_on_non_supported):
        """ input_file:
                opened file object; its lines may be either str or bytes.
            labels_formatter: StringLabelsFormatter
            error_on_non_supported: bool
                True: raise an Exception on a line whose label is not supported;
                False: silently skip such a line.
            yields: the opinions created by OpinionConverter.try_from_string
        """
        assert(isinstance(labels_formatter, StringLabelsFormatter))
        assert(isinstance(error_on_non_supported, bool))

        # readlines() is kept on purpose: the whole content is fetched on the
        # first iteration step, so later steps no longer touch the file object.
        for line in input_file.readlines():

            # Force perform decoding if needed.
            if isinstance(line, bytes):
                line = line.decode()

            if line == '\n':
                continue

            opinion = OpinionConverter.try_from_string(
                line=line,
                labels_formatter=labels_formatter)

            if opinion is not None:
                yield opinion
                continue

            if error_on_non_supported:
                # The placeholder used to be emitted literally because the
                # message was never formatted.
                raise Exception("Line '{line}' has non supported label".format(line=line.strip()))

    # region public methods

    def iter_opinions(self, source, encoding, labels_formatter, error_on_non_supported=True):
        """
        Important: For externally saved collections (using save_to_file method) and related usage

        source: str
            path of the file to read.
        encoding:
            passed to `open` as is.
        labels_formatter: StringLabelsFormatter
        error_on_non_supported: bool
            see _iter_opinions_from_file.
        """
        assert(isinstance(source, str))
        assert(isinstance(labels_formatter, StringLabelsFormatter))
        assert(isinstance(error_on_non_supported, bool))

        with open(source, 'r', encoding=encoding) as input_file:

            it = RuSentRelOpinionCollectionProvider._iter_opinions_from_file(
                input_file=input_file,
                labels_formatter=labels_formatter,
                error_on_non_supported=error_on_non_supported)

            for opinion in it:
                yield opinion

    # endregion
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
import io
|
|
2
|
-
|
|
3
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
4
|
-
from arekit.common.opinions.base import Opinion
|
|
5
|
-
from arekit.common.opinions.collection import OpinionCollection
|
|
6
|
-
from arekit.common.opinions.writer import OpinionCollectionWriter
|
|
7
|
-
from arekit.common.utils import create_dir_if_not_exists
|
|
8
|
-
from arekit.contrib.source.rusentrel.opinions.converter import OpinionConverter
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class RuSentRelOpinionCollectionWriter(OpinionCollectionWriter):
    """ Writes an opinion collection to a text file, one opinion per line. """

    def serialize(self, collection, target, encoding, labels_formatter, error_on_non_supported=True):
        """ collection: OpinionCollection
            target: str
                path of the output file.
            encoding:
                passed to `io.open` as is.
            labels_formatter: StringLabelsFormatter
            error_on_non_supported: bool
                True: raise an Exception on an opinion whose label is not supported;
                False: skip such an opinion.
        """
        assert(isinstance(collection, OpinionCollection))
        assert(isinstance(target, str))
        assert(isinstance(labels_formatter, StringLabelsFormatter))
        assert(isinstance(error_on_non_supported, bool))

        def source_target_key(opinion):
            assert (isinstance(opinion, Opinion))
            return opinion.SourceValue + opinion.TargetValue

        # Opinions are written ordered by the concatenated source and target values.
        ordered_opinions = sorted(collection, key=source_target_key)

        create_dir_if_not_exists(target)

        with io.open(target, 'w', encoding=encoding) as out:
            for opinion in ordered_opinions:

                str_value = OpinionConverter.try_to_string(
                    opinion=opinion,
                    labels_formatter=labels_formatter)

                if str_value is not None:
                    out.write(str_value)
                    out.write('\n')
                    continue

                if error_on_non_supported:
                    raise Exception("Opinion label `{label}` is not supported by formatter".format(
                        label=opinion.Label))
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.source.synonyms.utils import iter_synonym_groups
|
|
2
|
-
from arekit.contrib.source.rusentrel.io_utils import RuSentRelIOUtils
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class RuSentRelSynonymsCollectionHelper(object):
    """ Access to the synonym groups shipped with the RuSentRel collection. """

    @staticmethod
    def iter_groups(version):
        """ Yield the groups produced by `iter_synonym_groups` for the
            synonyms file of the given collection version.
        """
        def read_groups(input_file):
            return iter_synonym_groups(input_file, desc="Loading RuSentRel Collection")

        groups = RuSentRelIOUtils.iter_from_zip(
            inner_path=RuSentRelIOUtils.get_synonyms_innerpath(),
            process_func=read_groups,
            version=version)

        yield from groups
|
|
File without changes
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.collection import EntityCollection
|
|
2
|
-
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
|
|
3
|
-
from arekit.contrib.source.brat.annot import BratAnnotationParser
|
|
4
|
-
from arekit.contrib.source.brat.entities.entity import BratEntity
|
|
5
|
-
from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils
|
|
6
|
-
from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
|
|
7
|
-
from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class SentiNerelEntityCollection(EntityCollection):
    """ Entities of a single SentiNEREL document, sorted by `IndexBegin`,
        with entities of the ignored types removed.
    """

    def __init__(self, contents, value_to_group_id_func, entities_to_ignore=None):
        """
            contents: dict
                parsed annotation; has to provide the BratAnnotationParser.ENTITIES key.
            entities_to_ignore: list or None
                this parameter is required because of the simplified implementation of
                the nested objects of the BRAT annotation.
        """
        assert(isinstance(contents, dict))
        assert(BratAnnotationParser.ENTITIES in contents)
        assert(entities_to_ignore is None or isinstance(entities_to_ignore, list))

        self.__dicard_entities = set() if entities_to_ignore is None else set(entities_to_ignore)

        kept_entities = list(filter(self.__keep_entity, contents[BratAnnotationParser.ENTITIES]))

        # The filtered list is written back, so the dict passed by the caller is modified.
        contents[BratAnnotationParser.ENTITIES] = kept_entities

        super(SentiNerelEntityCollection, self).__init__(
            entities=kept_entities,
            value_to_group_id_func=value_to_group_id_func)

        self._sort_entities(key=lambda entity: entity.IndexBegin)

    def __keep_entity(self, entity):
        """ True when the type of the entity is not among the ignored ones. """
        assert(isinstance(entity, BratEntity))
        return entity.Type not in self.__dicard_entities

    @classmethod
    def read_collection(cls, filename, version, entities_to_ignore=None):
        """ filename: str
                name used to locate the annotation file inside the archive.
            version:
                passed to SentiNerelIOUtils.read_from_zip as is.
            entities_to_ignore: list or None
                see __init__.
        """
        assert(isinstance(filename, str))

        # Since this dataset does not provide the synonyms collection by default,
        # an empty collection is declared here and populated later on demand.
        synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)

        def group_id_of(value):
            return SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
                synonyms, value)

        def from_annotation_file(input_file):
            return cls(
                contents=BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig'),
                entities_to_ignore=entities_to_ignore,
                value_to_group_id_func=group_id_of)

        return SentiNerelIOUtils.read_from_zip(
            inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
            process_func=from_annotation_file,
            version=version)
|
|
File without changes
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.source.sentinerel.folding.fixed import create_fixed_folding_doc_ids
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class SentiNERELFoldingFactory:
    """ Factory of the variety types of the splits that
        are considered within the present experiments.
    """

    @staticmethod
    def create_fixed_folding(file, limit=None):
        """ file:
                opened file object; its first line lists the train filenames and
                its second line lists the test filenames, both comma-separated.
            limit: int
                Allows to limit amount of documents (utilized for testing reasons)
        """
        train_filenames, test_filenames = SentiNERELFoldingFactory._read_train_test(f=file)

        if limit is not None:
            train_filenames, test_filenames = train_filenames[:limit], test_filenames[:limit]

        # NOTE(review): create_fixed_folding_doc_ids, as declared in
        # sentinerel/folding/fixed.py, returns a single dict with four keys;
        # unpacking it into two names raises ValueError -- confirm which side
        # of this contract is the intended one.
        filenames_by_ids, data_folding = create_fixed_folding_doc_ids(
            train_filenames=train_filenames,
            test_filenames=test_filenames)

        return filenames_by_ids, data_folding

    @staticmethod
    def _read_train_test(f):
        """ Split every line of `f` by commas (bytes lines are decoded as utf-8 first)
            and return the parts of the first and of the second line.
        """
        parts = [
            (line.decode('utf-8') if isinstance(line, bytes) else line).strip().split(',')
            for line in f.readlines()
        ]
        return parts[0], parts[1]
|
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
from collections import OrderedDict
|
|
2
|
-
|
|
3
|
-
from arekit.common.experiment.data_type import DataType
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def create_fixed_folding_doc_ids(train_filenames, test_filenames):
    """ Create fixed data-folding based on the predefined list of filenames,
        written in file.

        train_filenames: list
        test_filenames: list
        returns: dict
            document ids per DataType; the Test, Etalon and Dev entries
            refer to the very same list of test document ids.
    """
    assert(isinstance(train_filenames, list))
    assert(isinstance(test_filenames, list))

    filenames_by_ids = __create_filenames_by_ids(filenames=train_filenames + test_filenames)

    # Reverse index: filename -> document id.
    ids_by_filenames = {filename: doc_id for doc_id, filename in filenames_by_ids.items()}

    def doc_ids_of(filenames):
        return [ids_by_filenames[filename] for filename in filenames]

    train_doc_ids = doc_ids_of(train_filenames)
    test_doc_ids = doc_ids_of(test_filenames)

    return {
        DataType.Train: train_doc_ids,
        DataType.Test: test_doc_ids,
        DataType.Etalon: test_doc_ids,
        DataType.Dev: test_doc_ids
    }
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def __create_filenames_by_ids(filenames):
    """ Indexing filenames

        The id of a filename is the number it starts with. A filename without
        such a number receives the smallest id that is not occupied yet,
        searching upwards from the previously generated one (initially 0).
        An id parsed from a filename replaces an entry that already has that id.

        returns: OrderedDict
            filename per document id, in the order the ids were first assigned.
    """
    filenames_by_ids = OrderedDict()

    # Last generated id; the search for a free id resumes from it.
    generated_id = 0

    for fname in filenames:

        doc_id = __number_from_string(fname)

        if doc_id is None:
            while generated_id in filenames_by_ids:
                generated_id += 1
            doc_id = generated_id

        filenames_by_ids[doc_id] = fname

    return filenames_by_ids
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def __number_from_string(s):
    """ Integer made of the leading digit characters of `s`,
        or None when `s` does not start with a digit.
    """
    assert(isinstance(s, str))

    # Length of the leading run of digit characters.
    prefix_len = 0
    while prefix_len < len(s) and s[prefix_len].isdigit():
        prefix_len += 1

    if prefix_len == 0:
        return None

    return int(s[:prefix_len])
|