arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/context/terms_mapper.py +2 -2
- arekit/common/data/const.py +5 -4
- arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
- arekit/common/data/input/providers/columns/sample.py +6 -1
- arekit/common/data/input/providers/instances/base.py +1 -1
- arekit/common/data/input/providers/rows/base.py +36 -13
- arekit/common/data/input/providers/rows/samples.py +57 -55
- arekit/common/data/input/providers/sample/cropped.py +2 -2
- arekit/common/data/input/sample.py +1 -1
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/base.py +23 -18
- arekit/common/data/views/samples.py +2 -8
- arekit/common/{news → docs}/base.py +2 -2
- arekit/common/{news → docs}/entities_grouping.py +2 -1
- arekit/common/{news → docs}/entity.py +2 -1
- arekit/common/{news → docs}/parsed/base.py +5 -5
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
- arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
- arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
- arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parser.py +66 -0
- arekit/common/{news → docs}/sentence.py +1 -1
- arekit/common/entities/base.py +11 -2
- arekit/common/experiment/api/base_samples_io.py +1 -1
- arekit/common/frames/variants/collection.py +2 -2
- arekit/common/linkage/base.py +2 -2
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +1 -1
- arekit/common/linkage/text_opinions.py +2 -2
- arekit/common/opinions/annot/algo/base.py +1 -1
- arekit/common/opinions/annot/algo/pair_based.py +15 -13
- arekit/common/opinions/annot/algo/predefined.py +4 -4
- arekit/common/opinions/annot/algo_based.py +5 -5
- arekit/common/opinions/annot/base.py +3 -3
- arekit/common/opinions/base.py +7 -7
- arekit/common/opinions/collection.py +3 -3
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +38 -1
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/service/sqlite.py +36 -0
- arekit/common/synonyms/base.py +2 -2
- arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
- arekit/common/text_opinions/base.py +11 -11
- arekit/common/utils.py +33 -46
- arekit/contrib/networks/embedding.py +3 -3
- arekit/contrib/networks/embedding_io.py +5 -5
- arekit/contrib/networks/input/const.py +0 -2
- arekit/contrib/networks/input/providers/sample.py +15 -29
- arekit/contrib/networks/input/rows_parser.py +47 -134
- arekit/contrib/prompt/sample.py +18 -16
- arekit/contrib/utils/data/contents/opinions.py +17 -5
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
- arekit/contrib/utils/data/readers/base.py +3 -0
- arekit/contrib/utils/data/readers/csv_pd.py +10 -4
- arekit/contrib/utils/data/readers/jsonl.py +3 -0
- arekit/contrib/utils/data/readers/sqlite.py +14 -0
- arekit/contrib/utils/data/service/balance.py +0 -1
- arekit/contrib/utils/data/storages/pandas_based.py +3 -5
- arekit/contrib/utils/data/storages/row_cache.py +18 -6
- arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
- arekit/contrib/utils/data/writers/base.py +5 -0
- arekit/contrib/utils/data/writers/csv_native.py +3 -0
- arekit/contrib/utils/data/writers/csv_pd.py +3 -0
- arekit/contrib/utils/data/writers/json_opennre.py +31 -13
- arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
- arekit/contrib/utils/io_utils/embedding.py +25 -33
- arekit/contrib/utils/io_utils/utils.py +3 -24
- arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
- arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
- arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
- arekit/contrib/utils/serializer.py +4 -23
- arekit-0.25.0.data/data/logo.png +0 -0
- arekit-0.25.0.dist-info/METADATA +82 -0
- arekit-0.25.0.dist-info/RECORD +259 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
- arekit/common/data/row_ids/base.py +0 -79
- arekit/common/data/row_ids/binary.py +0 -38
- arekit/common/data/row_ids/multiple.py +0 -14
- arekit/common/folding/base.py +0 -36
- arekit/common/folding/fixed.py +0 -42
- arekit/common/folding/nofold.py +0 -15
- arekit/common/folding/united.py +0 -46
- arekit/common/news/objects_parser.py +0 -37
- arekit/common/news/parsed/providers/base.py +0 -48
- arekit/common/news/parsed/service.py +0 -31
- arekit/common/news/parser.py +0 -34
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -83
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -33
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/news.py +0 -28
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -75
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/news.py +0 -51
- arekit/contrib/source/ruattitudes/news_brat.py +0 -44
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -57
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/news_reader.py +0 -51
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -32
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/bert/rows.py +0 -0
- arekit/contrib/utils/bert/text_b_rus.py +0 -18
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/cv/__init__.py +0 -0
- arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
- arekit/contrib/utils/cv/doc_stat/base.py +0 -37
- arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
- arekit/contrib/utils/cv/splitters/__init__.py +0 -0
- arekit/contrib/utils/cv/splitters/base.py +0 -4
- arekit/contrib/utils/cv/splitters/default.py +0 -53
- arekit/contrib/utils/cv/splitters/statistical.py +0 -57
- arekit/contrib/utils/cv/two_class.py +0 -77
- arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
- arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
- arekit/contrib/utils/data/ext.py +0 -31
- arekit/contrib/utils/data/views/__init__.py +0 -0
- arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
- arekit/contrib/utils/data/views/linkages/base.py +0 -58
- arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
- arekit/contrib/utils/data/views/linkages/utils.py +0 -24
- arekit/contrib/utils/data/views/opinions.py +0 -14
- arekit/contrib/utils/download.py +0 -78
- arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
- arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
- arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
- arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
- arekit/contrib/utils/io_utils/opinions.py +0 -39
- arekit/contrib/utils/io_utils/samples.py +0 -78
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -43
- arekit/contrib/utils/lexicons/relation.py +0 -45
- arekit/contrib/utils/lexicons/rusentilex.py +0 -34
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/to_output.py +0 -101
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/resources.py +0 -26
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/utils_folding.py +0 -19
- arekit/download_data.py +0 -11
- arekit-0.23.1.dist-info/METADATA +0 -23
- arekit-0.23.1.dist-info/RECORD +0 -403
- /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
- /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
- /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
- /arekit/common/{news → docs}/parsed/term_position.py +0 -0
- /arekit/common/{news/parsed → service}/__init__.py +0 -0
- /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
from os.path import join
|
|
3
|
-
|
|
4
|
-
from arekit.contrib.utils.data.ext import create_writer_extension, create_reader_extension
|
|
5
|
-
from arekit.contrib.utils.data.readers.base import BaseReader
|
|
6
|
-
from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
|
|
7
|
-
from arekit.contrib.utils.data.writers.base import BaseWriter
|
|
8
|
-
from arekit.contrib.utils.io_utils.utils import filename_template, check_targets_existence
|
|
9
|
-
|
|
10
|
-
logger = logging.getLogger(__name__)
|
|
11
|
-
logging.basicConfig(level=logging.INFO)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class SamplesIO(BaseSamplesIO):
|
|
15
|
-
""" Samples default IO utils for samples.
|
|
16
|
-
Sample is a text part which include pair of attitude participants.
|
|
17
|
-
This class allows to provide saver and loader for such entries, bubbed as samples.
|
|
18
|
-
Samples required for machine learning training/inferring.
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
def __init__(self, target_dir, writer=None, reader=None, prefix="sample", target_extension=None):
|
|
22
|
-
assert(isinstance(target_dir, str))
|
|
23
|
-
assert(isinstance(prefix, str))
|
|
24
|
-
assert(isinstance(writer, BaseWriter) or writer is None)
|
|
25
|
-
assert(isinstance(reader, BaseReader) or reader is None)
|
|
26
|
-
assert(isinstance(target_extension, str) or target_extension is None)
|
|
27
|
-
self.__target_dir = target_dir
|
|
28
|
-
self.__prefix = prefix
|
|
29
|
-
self.__writer = writer
|
|
30
|
-
self.__reader = reader
|
|
31
|
-
self.__target_extension = target_extension
|
|
32
|
-
|
|
33
|
-
if target_extension is None:
|
|
34
|
-
if writer is not None:
|
|
35
|
-
self.__target_extension = create_writer_extension(writer)
|
|
36
|
-
elif reader is not None:
|
|
37
|
-
self.__target_extension = create_reader_extension(reader)
|
|
38
|
-
|
|
39
|
-
# region public methods
|
|
40
|
-
|
|
41
|
-
@property
|
|
42
|
-
def Reader(self):
|
|
43
|
-
return self.__reader
|
|
44
|
-
|
|
45
|
-
@property
|
|
46
|
-
def Writer(self):
|
|
47
|
-
return self.__writer
|
|
48
|
-
|
|
49
|
-
def create_target(self, data_type, data_folding):
|
|
50
|
-
return self.__get_input_sample_target(data_type, data_folding=data_folding)
|
|
51
|
-
|
|
52
|
-
def check_targets_existed(self, data_types_iter, data_folding):
|
|
53
|
-
for data_type in data_types_iter:
|
|
54
|
-
|
|
55
|
-
targets = [
|
|
56
|
-
self.__get_input_sample_target(data_type=data_type, data_folding=data_folding),
|
|
57
|
-
]
|
|
58
|
-
|
|
59
|
-
if not check_targets_existence(targets=targets):
|
|
60
|
-
return False
|
|
61
|
-
return True
|
|
62
|
-
|
|
63
|
-
# endregion
|
|
64
|
-
|
|
65
|
-
def __get_input_sample_target(self, data_type, data_folding):
|
|
66
|
-
template = filename_template(data_type=data_type, data_folding=data_folding)
|
|
67
|
-
return self.__get_filepath(out_dir=self.__target_dir,
|
|
68
|
-
template=template,
|
|
69
|
-
prefix=self.__prefix,
|
|
70
|
-
extension=self.__target_extension)
|
|
71
|
-
|
|
72
|
-
@staticmethod
|
|
73
|
-
def __get_filepath(out_dir, template, prefix, extension):
|
|
74
|
-
assert(isinstance(template, str))
|
|
75
|
-
assert(isinstance(prefix, str))
|
|
76
|
-
assert(isinstance(extension, str))
|
|
77
|
-
return join(out_dir, "{prefix}-{template}{extension}".format(
|
|
78
|
-
prefix=prefix, template=template, extension=extension))
|
|
File without changes
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class Lexicon(object):
|
|
6
|
-
|
|
7
|
-
@property
|
|
8
|
-
def ToneKey(self):
|
|
9
|
-
return 'tone'
|
|
10
|
-
|
|
11
|
-
@property
|
|
12
|
-
def TermKey(self):
|
|
13
|
-
return 'term'
|
|
14
|
-
|
|
15
|
-
def __init__(self, dataframe):
|
|
16
|
-
assert(isinstance(dataframe, pd.DataFrame))
|
|
17
|
-
self.__lexicon_df = dataframe
|
|
18
|
-
|
|
19
|
-
@classmethod
|
|
20
|
-
def load(cls, filepath, separator=','):
|
|
21
|
-
reader = PandasCsvReader(compression=None, sep=separator)
|
|
22
|
-
return cls(reader.read(filepath))
|
|
23
|
-
|
|
24
|
-
def get_score(self, lemma):
|
|
25
|
-
assert(type(lemma) == str)
|
|
26
|
-
s = self.__lexicon_df[lemma.encode('utf-8') == self.__lexicon_df[self.TermKey]]
|
|
27
|
-
return s[self.ToneKey].values[0] if len(s) > 0 else 0
|
|
28
|
-
|
|
29
|
-
def has_term(self, term):
|
|
30
|
-
assert(type(term) == str)
|
|
31
|
-
s = self.__lexicon_df[term.encode('utf-8') == self.__lexicon_df[self.TermKey]]
|
|
32
|
-
return len(s) > 0
|
|
33
|
-
|
|
34
|
-
def __iter__(self):
|
|
35
|
-
for term in self.__lexicon_df[self.TermKey]:
|
|
36
|
-
yield term
|
|
37
|
-
|
|
38
|
-
def __contains__(self, item):
|
|
39
|
-
assert(isinstance(item, str))
|
|
40
|
-
result = self.__lexicon_df[self.__lexicon_df[self.TermKey] == item.encode('utf-8')]
|
|
41
|
-
return len(result) > 0
|
|
42
|
-
|
|
43
|
-
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
|
|
3
|
-
from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class RelationLexicon(object):
|
|
7
|
-
|
|
8
|
-
def __init__(self, dataframe):
|
|
9
|
-
assert(isinstance(dataframe, pd.DataFrame))
|
|
10
|
-
self.__check(dataframe)
|
|
11
|
-
self.__lexicon = dataframe
|
|
12
|
-
|
|
13
|
-
@classmethod
|
|
14
|
-
def load(cls, filepath, separator=','):
|
|
15
|
-
reader = PandasCsvReader(compression=None, sep=separator)
|
|
16
|
-
return cls(reader.read(filepath))
|
|
17
|
-
|
|
18
|
-
@staticmethod
|
|
19
|
-
def __check(df):
|
|
20
|
-
for index in df.index:
|
|
21
|
-
relation = df.loc[index][0]
|
|
22
|
-
assert(len(relation.split('<->')) == 2)
|
|
23
|
-
|
|
24
|
-
@staticmethod
|
|
25
|
-
def __create_key(l, r):
|
|
26
|
-
assert(type(l) == str)
|
|
27
|
-
assert(type(r) == str)
|
|
28
|
-
return '<->'.join([l, r])
|
|
29
|
-
|
|
30
|
-
def get_score(self, left, right):
|
|
31
|
-
assert(type(left) == str)
|
|
32
|
-
assert(type(right) == str)
|
|
33
|
-
|
|
34
|
-
lr_key = self.__create_key(left, right)
|
|
35
|
-
rl_key = self.__create_key(right, left)
|
|
36
|
-
|
|
37
|
-
lr_score = self.__lexicon[lr_key == self.__lexicon['relation']]
|
|
38
|
-
rl_score = self.__lexicon[rl_key == self.__lexicon['relation']]
|
|
39
|
-
|
|
40
|
-
if len(lr_score) > 0:
|
|
41
|
-
return lr_score['tone'].values[0]
|
|
42
|
-
if len(rl_score) > 0:
|
|
43
|
-
return rl_score['tone'].values[0]
|
|
44
|
-
|
|
45
|
-
return None
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
import zipfile
|
|
2
|
-
from os import path
|
|
3
|
-
|
|
4
|
-
import pandas as pd
|
|
5
|
-
|
|
6
|
-
from arekit.contrib.source.zip_utils import ZipArchiveUtils
|
|
7
|
-
from arekit.contrib.utils.lexicons.lexicon import Lexicon
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class RuSentiLexLexicon(Lexicon):
|
|
11
|
-
"""
|
|
12
|
-
RuSentiLex Lexicon wrapper for csv file stored in /data folder.
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
__INNER_PATH = 'rusentilex.csv'
|
|
16
|
-
|
|
17
|
-
@property
|
|
18
|
-
def ToneKey(self):
|
|
19
|
-
return 'tone'
|
|
20
|
-
|
|
21
|
-
@property
|
|
22
|
-
def TermKey(self):
|
|
23
|
-
return 'term'
|
|
24
|
-
|
|
25
|
-
@staticmethod
|
|
26
|
-
def __get_archive_filepath():
|
|
27
|
-
return path.join(ZipArchiveUtils.get_data_root(), "rusentilex.zip")
|
|
28
|
-
|
|
29
|
-
@classmethod
|
|
30
|
-
def from_zip(cls):
|
|
31
|
-
with zipfile.ZipFile(cls.__get_archive_filepath(), "r") as zip_ref:
|
|
32
|
-
with zip_ref.open(cls.__INNER_PATH, mode='r') as csv_file:
|
|
33
|
-
df = pd.read_csv(csv_file, sep=',')
|
|
34
|
-
return cls(df)
|
|
File without changes
|
arekit/contrib/utils/nn/rows.py
DELETED
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
import collections
|
|
2
|
-
|
|
3
|
-
from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
|
|
4
|
-
from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
|
|
5
|
-
from arekit.common.entities.str_fmt import StringEntitiesFormatter
|
|
6
|
-
from arekit.contrib.networks.input.ctx_serialization import NetworkSerializationContext
|
|
7
|
-
from arekit.contrib.networks.input.formatters.pos_mapper import PosTermsMapper
|
|
8
|
-
from arekit.contrib.networks.input.providers.sample import NetworkSampleRowProvider
|
|
9
|
-
from arekit.contrib.networks.input.providers.text import NetworkSingleTextProvider
|
|
10
|
-
from arekit.contrib.networks.input.term_types import TermTypes
|
|
11
|
-
from arekit.contrib.networks.input.terms_mapping import VectorizedNetworkTermMapping
|
|
12
|
-
from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
|
|
13
|
-
from arekit.contrib.utils.resources import load_embedding_news_mystem_skipgram_1000_20_2015
|
|
14
|
-
from arekit.contrib.utils.vectorizers.bpe import BPEVectorizer
|
|
15
|
-
from arekit.contrib.utils.vectorizers.random_norm import RandomNormalVectorizer
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def __add_term_embedding(dict_data, term, emb_vector):
|
|
19
|
-
if term in dict_data:
|
|
20
|
-
return
|
|
21
|
-
dict_data[term] = emb_vector
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def create_rows_provider(str_entity_fmt, ctx, vectorizers="default"):
|
|
25
|
-
""" This method is corresponds to the default initialization of
|
|
26
|
-
the rows provider for data sampling pipeline.
|
|
27
|
-
|
|
28
|
-
vectorizers:
|
|
29
|
-
NONE: no need to vectorize, just provide text (using SingleTextProvider).
|
|
30
|
-
DEFAULT: we consider an application of stemmer for Russian Language.
|
|
31
|
-
DICT: in which for every type there is an assigned Vectorizer
|
|
32
|
-
vectorization of term types.
|
|
33
|
-
{
|
|
34
|
-
TermType.Word: Vectorizer,
|
|
35
|
-
TermType.Entity: Vectorizer,
|
|
36
|
-
...
|
|
37
|
-
}
|
|
38
|
-
"""
|
|
39
|
-
assert(isinstance(str_entity_fmt, StringEntitiesFormatter))
|
|
40
|
-
assert(isinstance(ctx, NetworkSerializationContext))
|
|
41
|
-
assert(isinstance(vectorizers, dict) or vectorizers == "default" or vectorizers is None)
|
|
42
|
-
|
|
43
|
-
term_embedding_pairs = None
|
|
44
|
-
|
|
45
|
-
if vectorizers is not None:
|
|
46
|
-
|
|
47
|
-
if vectorizers == "default":
|
|
48
|
-
# initialize default vectorizer for Russian language.
|
|
49
|
-
embedding = load_embedding_news_mystem_skipgram_1000_20_2015(stemmer=MystemWrapper(), auto_download=True)
|
|
50
|
-
bpe_vectorizer = BPEVectorizer(embedding=embedding, max_part_size=3)
|
|
51
|
-
norm_vectorizer = RandomNormalVectorizer(vector_size=embedding.VectorSize,
|
|
52
|
-
token_offset=12345)
|
|
53
|
-
vectorizers = {
|
|
54
|
-
TermTypes.WORD: bpe_vectorizer,
|
|
55
|
-
TermTypes.ENTITY: bpe_vectorizer,
|
|
56
|
-
TermTypes.FRAME: bpe_vectorizer,
|
|
57
|
-
TermTypes.TOKEN: norm_vectorizer
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
# Setup term-embedding pairs collection instance.
|
|
61
|
-
term_embedding_pairs = collections.OrderedDict()
|
|
62
|
-
|
|
63
|
-
# Use text provider with vectorizers.
|
|
64
|
-
text_provider = NetworkSingleTextProvider(
|
|
65
|
-
text_terms_mapper=VectorizedNetworkTermMapping(
|
|
66
|
-
vectorizers=vectorizers,
|
|
67
|
-
string_entities_formatter=str_entity_fmt),
|
|
68
|
-
pair_handling_func=lambda pair: __add_term_embedding(
|
|
69
|
-
dict_data=term_embedding_pairs,
|
|
70
|
-
term=pair[0],
|
|
71
|
-
emb_vector=pair[1]))
|
|
72
|
-
else:
|
|
73
|
-
# Create text provider which without vectorizers.
|
|
74
|
-
text_provider = BaseSingleTextProvider(
|
|
75
|
-
text_terms_mapper=OpinionContainingTextTermsMapper(str_entity_fmt))
|
|
76
|
-
|
|
77
|
-
return NetworkSampleRowProvider(
|
|
78
|
-
label_provider=ctx.LabelProvider,
|
|
79
|
-
text_provider=text_provider,
|
|
80
|
-
frames_connotation_provider=ctx.FramesConnotationProvider,
|
|
81
|
-
frame_role_label_scaler=ctx.FrameRolesLabelScaler,
|
|
82
|
-
pos_terms_mapper=PosTermsMapper(ctx.PosTagger) if ctx.PosTagger is not None else None,
|
|
83
|
-
term_embedding_pairs=term_embedding_pairs)
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
from arekit.common.pipeline.context import PipelineContext
|
|
2
|
-
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
3
|
-
from arekit.common.utils import split_by_whitespaces
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class TermsSplitterParser(BasePipelineItem):
|
|
7
|
-
|
|
8
|
-
def apply_core(self, input_data, pipeline_ctx):
|
|
9
|
-
assert(isinstance(pipeline_ctx, PipelineContext))
|
|
10
|
-
return split_by_whitespaces(input_data)
|
|
@@ -1,101 +0,0 @@
|
|
|
1
|
-
from arekit.common.data import const
|
|
2
|
-
from arekit.common.data.storages.base import BaseRowsStorage
|
|
3
|
-
from arekit.common.experiment.data_type import DataType
|
|
4
|
-
from arekit.common.folding.base import BaseDataFolding
|
|
5
|
-
from arekit.common.labels.scaler.base import BaseLabelScaler
|
|
6
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
7
|
-
from arekit.common.model.labeling.modes import LabelCalculationMode
|
|
8
|
-
from arekit.common.opinions.writer import OpinionCollectionWriter
|
|
9
|
-
from arekit.common.pipeline.base import BasePipeline
|
|
10
|
-
from arekit.common.pipeline.context import PipelineContext
|
|
11
|
-
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
12
|
-
from arekit.common.pipeline.items.handle import HandleIterPipelineItem
|
|
13
|
-
from arekit.contrib.utils.data.views.linkages.multilabel import MultilableOpinionLinkagesView
|
|
14
|
-
from arekit.contrib.utils.data.views.opinions import BaseOpinionStorageView
|
|
15
|
-
from arekit.contrib.utils.io_utils.opinions import OpinionsIO
|
|
16
|
-
from arekit.contrib.utils.utils_folding import folding_iter_states, experiment_iter_index
|
|
17
|
-
from arekit.contrib.utils.pipelines.opinion_collections import \
|
|
18
|
-
text_opinion_linkages_to_opinion_collections_pipeline_part
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class TextOpinionLinkagesToOpinionConverterPipelineItem(BasePipelineItem):
|
|
22
|
-
|
|
23
|
-
def __init__(self, opinions_io, create_opinion_collection_func,
|
|
24
|
-
opinion_collection_writer, label_scaler, labels_formatter):
|
|
25
|
-
""" create_opinion_collection_func: func
|
|
26
|
-
func () -> OpinionCollection (empty)
|
|
27
|
-
"""
|
|
28
|
-
assert(isinstance(opinions_io, OpinionsIO))
|
|
29
|
-
assert(callable(create_opinion_collection_func))
|
|
30
|
-
assert(isinstance(label_scaler, BaseLabelScaler))
|
|
31
|
-
assert(isinstance(labels_formatter, StringLabelsFormatter))
|
|
32
|
-
assert(isinstance(opinion_collection_writer, OpinionCollectionWriter))
|
|
33
|
-
super(TextOpinionLinkagesToOpinionConverterPipelineItem, self).__init__()
|
|
34
|
-
|
|
35
|
-
self.__opinions_io = opinions_io
|
|
36
|
-
self.__labels_formatter = labels_formatter
|
|
37
|
-
self.__label_scaler = label_scaler
|
|
38
|
-
self.__create_opinion_collection_func = create_opinion_collection_func
|
|
39
|
-
self.__opinion_collection_writer = opinion_collection_writer
|
|
40
|
-
|
|
41
|
-
def __convert(self, data_folding, output_storage, target_func, data_type, pipeline_ctx):
|
|
42
|
-
""" From `output_storage` to `target` conversion.
|
|
43
|
-
output_storage: BaseRowsStorage
|
|
44
|
-
target_func: func(doc_id) -- considered to provide a target for the particular document.
|
|
45
|
-
"""
|
|
46
|
-
assert(isinstance(data_folding, BaseDataFolding))
|
|
47
|
-
assert(isinstance(output_storage, BaseRowsStorage))
|
|
48
|
-
assert(isinstance(data_type, DataType))
|
|
49
|
-
assert(callable(target_func))
|
|
50
|
-
|
|
51
|
-
# We utilize google bert format, where every row
|
|
52
|
-
# consist of label probabilities per every class
|
|
53
|
-
linkages_view = MultilableOpinionLinkagesView(labels_scaler=self.__label_scaler,
|
|
54
|
-
storage=output_storage)
|
|
55
|
-
target = self.__opinions_io.create_target(data_type=data_type, data_folding=data_folding)
|
|
56
|
-
storage = self.__opinions_io.Reader.read(target)
|
|
57
|
-
|
|
58
|
-
converter_part = text_opinion_linkages_to_opinion_collections_pipeline_part(
|
|
59
|
-
iter_opinion_linkages_func=lambda doc_id: linkages_view.iter_opinion_linkages(
|
|
60
|
-
doc_id=doc_id, opinions_view=BaseOpinionStorageView(storage)),
|
|
61
|
-
doc_ids_set=set(data_folding.fold_doc_ids_set()[data_type]),
|
|
62
|
-
create_opinion_collection_func=self.__create_opinion_collection_func,
|
|
63
|
-
labels_scaler=self.__label_scaler,
|
|
64
|
-
label_calc_mode=LabelCalculationMode.AVERAGE)
|
|
65
|
-
|
|
66
|
-
pipeline = BasePipeline(
|
|
67
|
-
converter_part +
|
|
68
|
-
[HandleIterPipelineItem(lambda data: self.__opinion_collection_writer.serialize(
|
|
69
|
-
collection=data[1],
|
|
70
|
-
encoding='utf-8',
|
|
71
|
-
labels_formatter=self.__labels_formatter,
|
|
72
|
-
error_on_non_supported=True,
|
|
73
|
-
target=target_func(data[0])))
|
|
74
|
-
])
|
|
75
|
-
|
|
76
|
-
input_data = set(output_storage.iter_column_values(column_name=const.DOC_ID))
|
|
77
|
-
|
|
78
|
-
# iterate over the result.
|
|
79
|
-
for _ in pipeline.run(input_data, parent_ctx=pipeline_ctx):
|
|
80
|
-
pass
|
|
81
|
-
|
|
82
|
-
def _iter_output_and_target_pairs(self, iter_index, data_type):
|
|
83
|
-
raise NotImplementedError()
|
|
84
|
-
|
|
85
|
-
def apply_core(self, input_data, pipeline_ctx):
|
|
86
|
-
assert(isinstance(pipeline_ctx, PipelineContext))
|
|
87
|
-
assert("data_folding" in pipeline_ctx)
|
|
88
|
-
assert("data_type" in pipeline_ctx)
|
|
89
|
-
|
|
90
|
-
data_folding = pipeline_ctx.provide("data_folding")
|
|
91
|
-
data_type = pipeline_ctx.provide("data_type")
|
|
92
|
-
|
|
93
|
-
for _ in folding_iter_states(data_folding):
|
|
94
|
-
iter_index = experiment_iter_index(data_folding)
|
|
95
|
-
pairs_it = self._iter_output_and_target_pairs(iter_index=iter_index, data_type=data_type)
|
|
96
|
-
for output_storage, target in pairs_it:
|
|
97
|
-
self.__convert(output_storage=output_storage,
|
|
98
|
-
target_func=target,
|
|
99
|
-
data_type=data_type,
|
|
100
|
-
data_folding=data_folding,
|
|
101
|
-
pipeline_ctx=pipeline_ctx)
|
|
File without changes
|
|
File without changes
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
from arekit.common.experiment.api.ops_doc import DocumentOperations
|
|
2
|
-
from arekit.contrib.source.nerel.reader import NerelDocReader
|
|
3
|
-
from arekit.contrib.source.nerel.versions import NerelVersions
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class NERELDocOperation(DocumentOperations):
|
|
7
|
-
""" A Russian dataset with nested named entities, relations, events and linked entities.
|
|
8
|
-
https://github.com/nerel-ds/NEREL
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
def __init__(self, filename_by_id, version):
|
|
12
|
-
""" filename_ids: dict
|
|
13
|
-
Dictionary of {id: filename}, where
|
|
14
|
-
- id: int
|
|
15
|
-
- filename: str
|
|
16
|
-
version: NerelVersions
|
|
17
|
-
Specify the appropriate version of teh NEREL collection.
|
|
18
|
-
"""
|
|
19
|
-
assert(isinstance(filename_by_id, dict))
|
|
20
|
-
assert(isinstance(version, NerelVersions))
|
|
21
|
-
super(NERELDocOperation, self).__init__()
|
|
22
|
-
self.__filename_by_id = filename_by_id
|
|
23
|
-
self.__version = version
|
|
24
|
-
self.__doc_reader = NerelDocReader(version)
|
|
25
|
-
|
|
26
|
-
def by_id(self, doc_id):
|
|
27
|
-
return self.__doc_reader.read_document(doc_id=doc_id, filename=self.__filename_by_id[doc_id])
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
from arekit.common.experiment.api.ops_doc import DocumentOperations
|
|
2
|
-
from arekit.common.experiment.data_type import DataType
|
|
3
|
-
from arekit.contrib.source.nerel.io_utils import NerelIOUtils
|
|
4
|
-
from arekit.contrib.source.nerel.versions import NerelVersions
|
|
5
|
-
from arekit.contrib.utils.pipelines.sources.nerel.doc_ops import NERELDocOperation
|
|
6
|
-
from arekit.contrib.utils.pipelines.sources.nerel.labels_fmt import NerelAnyLabelFormatter
|
|
7
|
-
from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
|
|
8
|
-
from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
|
|
9
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
|
|
10
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.entity_based import EntityBasedTextOpinionFilter
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def create_text_relation_extraction_pipeline(nerel_version,
|
|
14
|
-
text_parser,
|
|
15
|
-
label_formatter=NerelAnyLabelFormatter(),
|
|
16
|
-
terms_per_context=50,
|
|
17
|
-
doc_ops=None,
|
|
18
|
-
docs_limit=None,
|
|
19
|
-
entity_filter=None):
|
|
20
|
-
assert(isinstance(nerel_version, NerelVersions))
|
|
21
|
-
assert(isinstance(doc_ops, DocumentOperations) or doc_ops is None)
|
|
22
|
-
|
|
23
|
-
data_folding = None
|
|
24
|
-
|
|
25
|
-
if doc_ops is None:
|
|
26
|
-
# Default Initialization.
|
|
27
|
-
filenames_by_ids, data_folding = NerelIOUtils.read_dataset_split(version=nerel_version,
|
|
28
|
-
docs_limit=docs_limit)
|
|
29
|
-
doc_ops = NERELDocOperation(filename_by_id=filenames_by_ids,
|
|
30
|
-
version=nerel_version)
|
|
31
|
-
|
|
32
|
-
text_opinion_filters = [
|
|
33
|
-
EntityBasedTextOpinionFilter(entity_filter=entity_filter),
|
|
34
|
-
DistanceLimitedTextOpinionFilter(terms_per_context)
|
|
35
|
-
]
|
|
36
|
-
|
|
37
|
-
predefined_annot = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)
|
|
38
|
-
|
|
39
|
-
pipelines = {
|
|
40
|
-
DataType.Train: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
41
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
42
|
-
annotators=[predefined_annot],
|
|
43
|
-
text_opinion_filters=text_opinion_filters),
|
|
44
|
-
DataType.Test: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
45
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
46
|
-
annotators=[predefined_annot],
|
|
47
|
-
text_opinion_filters=text_opinion_filters),
|
|
48
|
-
DataType.Dev: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
49
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
50
|
-
annotators=[predefined_annot],
|
|
51
|
-
text_opinion_filters=text_opinion_filters),
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
# In the case when we setup a default data-folding.
|
|
55
|
-
# There is a need to provide it, due to the needs in further.
|
|
56
|
-
if data_folding is not None:
|
|
57
|
-
return pipelines, data_folding
|
|
58
|
-
|
|
59
|
-
return pipelines
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
2
|
-
from arekit.contrib.source.nerel import labels
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class NerelAnyLabelFormatter(StringLabelsFormatter):
|
|
6
|
-
|
|
7
|
-
def __init__(self):
|
|
8
|
-
|
|
9
|
-
stol = {
|
|
10
|
-
"OPINION_BELONGS_TO": labels.OpinionBelongsTo,
|
|
11
|
-
"OPINION_RELATES_TO": labels.OpinionRelatesTo,
|
|
12
|
-
"NEG_EFFECT_FROM": labels.NegEffectFrom,
|
|
13
|
-
"POS_EFFECT_FROM": labels.PosEffectFrom,
|
|
14
|
-
"NEG_STATE_FROM": labels.NegStateFrom,
|
|
15
|
-
"POS_STATE_FROM": labels.PosStateFrom,
|
|
16
|
-
"NEGATIVE_TO": labels.NegativeTo,
|
|
17
|
-
"POSITIVE_TO": labels.PositiveTo,
|
|
18
|
-
"STATE_BELONGS_TO": labels.STATE_BELONGS_TO,
|
|
19
|
-
"POS_AUTHOR_FROM": labels.PosAuthorFrom,
|
|
20
|
-
"NEG_AUTHOR_FROM": labels.NegAuthorFrom,
|
|
21
|
-
"ALTERNATIVE_NAME": labels.ALTERNATIVE_NAME,
|
|
22
|
-
"ORIGINS_FROM": labels.ORIGINS_FROM,
|
|
23
|
-
"START_TIME": labels.START_TIME,
|
|
24
|
-
"OWNER_OF": labels.OWNER_OF,
|
|
25
|
-
"SUBEVENT_OF": labels.SUBEVENT_OF,
|
|
26
|
-
"PARENT_OF": labels.PARENT_OF,
|
|
27
|
-
"SUBORDINATE_OF": labels.SUBORDINATE_OF,
|
|
28
|
-
"PART_OF": labels.PART_OF,
|
|
29
|
-
"TAKES_PLACE_IN": labels.TAKES_PLACE_IN,
|
|
30
|
-
"PARTICIPANT_IN": labels.PARTICIPANT_IN,
|
|
31
|
-
"WORKPLACE": labels.WORKPLACE,
|
|
32
|
-
"PENALIZED_AS": labels.PENALIZED_AS,
|
|
33
|
-
"WORKS_AS": labels.WORKS_AS,
|
|
34
|
-
"PLACE_OF_DEATH": labels.PLACE_OF_DEATH,
|
|
35
|
-
"PLACE_OF_BIRTH": labels.PLACE_OF_BIRTH,
|
|
36
|
-
"HAS_CAUSE": labels.HAS_CAUSE,
|
|
37
|
-
"AWARDED_WITH": labels.AWARDED_WITH,
|
|
38
|
-
"CAUSE_OF_DEATH": labels.CAUSE_OF_DEATH,
|
|
39
|
-
"CONVICTED_OF": labels.CONVICTED_OF,
|
|
40
|
-
"DATE_DEFUNCT_IN": labels.DATE_DEFUNCT_IN,
|
|
41
|
-
"DATE_FOUNDED_IN": labels.DATE_FOUNDED_IN,
|
|
42
|
-
"DATE_OF_BIRTH": labels.DATE_OF_BIRTH,
|
|
43
|
-
"DATE_OF_CREATION": labels.DATE_OF_CREATION,
|
|
44
|
-
"DATE_OF_DEATH": labels.DATE_OF_DEATH,
|
|
45
|
-
"END_TIME": labels.END_TIME,
|
|
46
|
-
"EXPENDITURE": labels.EXPENDITURE,
|
|
47
|
-
"FOUNDED_BY": labels.FOUNDED_BY,
|
|
48
|
-
"KNOWS": labels.KNOWS,
|
|
49
|
-
"RELATIVE": labels.RELATIVE,
|
|
50
|
-
"LOCATED_IN": labels.LOCATED_IN,
|
|
51
|
-
"RELIGION_OF": labels.RELIGION_OF,
|
|
52
|
-
"MEDICAL_CONDITION": labels.MEDICAL_CONDITION,
|
|
53
|
-
"SCHOOLS_ATTENDED": labels.SCHOOLS_ATTENDED,
|
|
54
|
-
"MEMBER_OF": labels.MEMBER_OF,
|
|
55
|
-
"SIBLING": labels.SIBLING,
|
|
56
|
-
"ORGANIZES": labels.ORGANIZES,
|
|
57
|
-
"SPOUSE": labels.SPOUSE
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
super(NerelAnyLabelFormatter, self).__init__(stol=stol)
|
|
File without changes
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
from arekit.common.experiment.api.ops_doc import DocumentOperations
|
|
2
|
-
from arekit.contrib.source.nerelbio.reader import NerelBioDocReader
|
|
3
|
-
from arekit.contrib.source.nerelbio.versions import NerelBioVersions
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class NERELBioDocOperation(DocumentOperations):
|
|
7
|
-
""" NEREL-BIO extends the general domain dataset NEREL.
|
|
8
|
-
NEREL-BIO annotation scheme covers both general and biomedical
|
|
9
|
-
domains making it suitable for domain transfer experiments.
|
|
10
|
-
https://github.com/nerel-ds/NEREL-BIO
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
def __init__(self, filename_by_id, version):
|
|
14
|
-
""" filename_ids: dict
|
|
15
|
-
Dictionary of {id: filename}, where
|
|
16
|
-
- id: int
|
|
17
|
-
- filename: str
|
|
18
|
-
version: NerelBioVersions
|
|
19
|
-
Specify the appropriate version of the NEREL-BIO collection.
|
|
20
|
-
"""
|
|
21
|
-
assert(isinstance(filename_by_id, dict))
|
|
22
|
-
assert(isinstance(version, NerelBioVersions))
|
|
23
|
-
super(NERELBioDocOperation, self).__init__()
|
|
24
|
-
self.__filename_by_id = filename_by_id
|
|
25
|
-
self.__version = version
|
|
26
|
-
self.__doc_reader = NerelBioDocReader(version)
|
|
27
|
-
|
|
28
|
-
def by_id(self, doc_id):
|
|
29
|
-
return self.__doc_reader.read_document(doc_id=doc_id, filename=self.__filename_by_id[doc_id])
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
from arekit.common.experiment.api.ops_doc import DocumentOperations
|
|
2
|
-
from arekit.common.experiment.data_type import DataType
|
|
3
|
-
from arekit.contrib.source.nerelbio.io_utils import NerelBioIOUtils
|
|
4
|
-
from arekit.contrib.source.nerelbio.versions import NerelBioVersions
|
|
5
|
-
from arekit.contrib.utils.pipelines.sources.nerel_bio.doc_ops import NERELBioDocOperation
|
|
6
|
-
from arekit.contrib.utils.pipelines.sources.nerel_bio.labels_fmt import NerelBioAnyLabelFormatter
|
|
7
|
-
from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
|
|
8
|
-
from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
|
|
9
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
|
|
10
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.entity_based import EntityBasedTextOpinionFilter
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def create_text_relation_extraction_pipeline(nerel_bio_version,
|
|
14
|
-
text_parser,
|
|
15
|
-
label_formatter=NerelBioAnyLabelFormatter(),
|
|
16
|
-
terms_per_context=50,
|
|
17
|
-
doc_ops=None,
|
|
18
|
-
docs_limit=None,
|
|
19
|
-
entity_filter=None):
|
|
20
|
-
assert(isinstance(nerel_bio_version, NerelBioVersions))
|
|
21
|
-
assert(isinstance(doc_ops, DocumentOperations) or doc_ops is None)
|
|
22
|
-
|
|
23
|
-
data_folding = None
|
|
24
|
-
|
|
25
|
-
if doc_ops is None:
|
|
26
|
-
# Default Initialization.
|
|
27
|
-
filenames_by_ids, data_folding = NerelBioIOUtils.read_dataset_split(version=nerel_bio_version,
|
|
28
|
-
docs_limit=docs_limit)
|
|
29
|
-
doc_ops = NERELBioDocOperation(filename_by_id=filenames_by_ids,
|
|
30
|
-
version=nerel_bio_version)
|
|
31
|
-
|
|
32
|
-
text_opinion_filters = [
|
|
33
|
-
EntityBasedTextOpinionFilter(entity_filter=entity_filter),
|
|
34
|
-
DistanceLimitedTextOpinionFilter(terms_per_context)
|
|
35
|
-
]
|
|
36
|
-
|
|
37
|
-
predefined_annot = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)
|
|
38
|
-
|
|
39
|
-
pipelines = {
|
|
40
|
-
DataType.Train: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
41
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
42
|
-
annotators=[predefined_annot],
|
|
43
|
-
text_opinion_filters=text_opinion_filters),
|
|
44
|
-
DataType.Test: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
45
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
46
|
-
annotators=[predefined_annot],
|
|
47
|
-
text_opinion_filters=text_opinion_filters),
|
|
48
|
-
DataType.Dev: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
49
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
50
|
-
annotators=[predefined_annot],
|
|
51
|
-
text_opinion_filters=text_opinion_filters),
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
# In the case when we setup a default data-folding.
|
|
55
|
-
# There is a need to provide it, due to the needs in further.
|
|
56
|
-
if data_folding is not None:
|
|
57
|
-
return pipelines, data_folding
|
|
58
|
-
|
|
59
|
-
return pipelines
|