arekit 0.24.0__py3-none-any.whl → 0.25.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/docs/entities_grouping.py +2 -1
- arekit/common/docs/parser.py +52 -20
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +38 -1
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/service/sqlite.py +36 -0
- arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
- arekit/common/utils.py +0 -44
- arekit/contrib/utils/data/contents/opinions.py +13 -3
- arekit/contrib/utils/data/readers/sqlite.py +14 -0
- arekit/contrib/utils/data/storages/row_cache.py +6 -1
- arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
- arekit/contrib/utils/data/writers/sqlite_native.py +4 -0
- arekit/contrib/utils/io_utils/utils.py +1 -18
- arekit/contrib/utils/pipelines/items/sampling/base.py +7 -12
- arekit/contrib/utils/pipelines/items/sampling/networks.py +3 -2
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +2 -4
- arekit/contrib/utils/pipelines/items/text/translator.py +2 -1
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +6 -9
- arekit/contrib/utils/serializer.py +1 -2
- arekit-0.25.0.data/data/logo.png +0 -0
- arekit-0.25.0.dist-info/METADATA +82 -0
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/RECORD +38 -153
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
- arekit/common/docs/objects_parser.py +0 -37
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -84
- arekit/contrib/source/brat/doc.py +0 -28
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -13
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -74
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/doc.py +0 -51
- arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -59
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/docs_reader.py +0 -51
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -31
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/download.py +0 -77
- arekit/contrib/utils/io_utils/opinions.py +0 -37
- arekit/contrib/utils/io_utils/samples.py +0 -79
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -41
- arekit/contrib/utils/lexicons/relation.py +0 -42
- arekit/contrib/utils/lexicons/rusentilex.py +0 -37
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/resources.py +0 -25
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/download_data.py +0 -11
- arekit-0.24.0.dist-info/METADATA +0 -23
- /arekit/common/{text/partitioning → service}/__init__.py +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
arekit/contrib/utils/nn/rows.py
DELETED
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
import collections
|
|
2
|
-
|
|
3
|
-
from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
|
|
4
|
-
from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
|
|
5
|
-
from arekit.common.entities.str_fmt import StringEntitiesFormatter
|
|
6
|
-
from arekit.contrib.networks.input.ctx_serialization import NetworkSerializationContext
|
|
7
|
-
from arekit.contrib.networks.input.formatters.pos_mapper import PosTermsMapper
|
|
8
|
-
from arekit.contrib.networks.input.providers.sample import NetworkSampleRowProvider
|
|
9
|
-
from arekit.contrib.networks.input.providers.text import NetworkSingleTextProvider
|
|
10
|
-
from arekit.contrib.networks.input.term_types import TermTypes
|
|
11
|
-
from arekit.contrib.networks.input.terms_mapping import VectorizedNetworkTermMapping
|
|
12
|
-
from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
|
|
13
|
-
from arekit.contrib.utils.resources import load_embedding_news_mystem_skipgram_1000_20_2015
|
|
14
|
-
from arekit.contrib.utils.vectorizers.bpe import BPEVectorizer
|
|
15
|
-
from arekit.contrib.utils.vectorizers.random_norm import RandomNormalVectorizer
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def __add_term_embedding(dict_data, term, emb_vector):
|
|
19
|
-
if term in dict_data:
|
|
20
|
-
return
|
|
21
|
-
dict_data[term] = emb_vector
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def create_rows_provider(str_entity_fmt, ctx, vectorizers="default"):
|
|
25
|
-
""" This method is corresponds to the default initialization of
|
|
26
|
-
the rows provider for data sampling pipeline.
|
|
27
|
-
|
|
28
|
-
vectorizers:
|
|
29
|
-
NONE: no need to vectorize, just provide text (using SingleTextProvider).
|
|
30
|
-
DEFAULT: we consider an application of stemmer for Russian Language.
|
|
31
|
-
DICT: in which for every type there is an assigned Vectorizer
|
|
32
|
-
vectorization of term types.
|
|
33
|
-
{
|
|
34
|
-
TermType.Word: Vectorizer,
|
|
35
|
-
TermType.Entity: Vectorizer,
|
|
36
|
-
...
|
|
37
|
-
}
|
|
38
|
-
"""
|
|
39
|
-
assert(isinstance(str_entity_fmt, StringEntitiesFormatter))
|
|
40
|
-
assert(isinstance(ctx, NetworkSerializationContext))
|
|
41
|
-
assert(isinstance(vectorizers, dict) or vectorizers == "default" or vectorizers is None)
|
|
42
|
-
|
|
43
|
-
term_embedding_pairs = None
|
|
44
|
-
|
|
45
|
-
if vectorizers is not None:
|
|
46
|
-
|
|
47
|
-
if vectorizers == "default":
|
|
48
|
-
# initialize default vectorizer for Russian language.
|
|
49
|
-
embedding = load_embedding_news_mystem_skipgram_1000_20_2015(stemmer=MystemWrapper(), auto_download=True)
|
|
50
|
-
bpe_vectorizer = BPEVectorizer(embedding=embedding, max_part_size=3)
|
|
51
|
-
norm_vectorizer = RandomNormalVectorizer(vector_size=embedding.VectorSize,
|
|
52
|
-
token_offset=12345)
|
|
53
|
-
vectorizers = {
|
|
54
|
-
TermTypes.WORD: bpe_vectorizer,
|
|
55
|
-
TermTypes.ENTITY: bpe_vectorizer,
|
|
56
|
-
TermTypes.FRAME: bpe_vectorizer,
|
|
57
|
-
TermTypes.TOKEN: norm_vectorizer
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
# Setup term-embedding pairs collection instance.
|
|
61
|
-
term_embedding_pairs = collections.OrderedDict()
|
|
62
|
-
|
|
63
|
-
# Use text provider with vectorizers.
|
|
64
|
-
text_provider = NetworkSingleTextProvider(
|
|
65
|
-
text_terms_mapper=VectorizedNetworkTermMapping(
|
|
66
|
-
vectorizers=vectorizers,
|
|
67
|
-
string_entities_formatter=str_entity_fmt),
|
|
68
|
-
pair_handling_func=lambda pair: __add_term_embedding(
|
|
69
|
-
dict_data=term_embedding_pairs,
|
|
70
|
-
term=pair[0],
|
|
71
|
-
emb_vector=pair[1]))
|
|
72
|
-
else:
|
|
73
|
-
# Create text provider which without vectorizers.
|
|
74
|
-
text_provider = BaseSingleTextProvider(
|
|
75
|
-
text_terms_mapper=OpinionContainingTextTermsMapper(str_entity_fmt))
|
|
76
|
-
|
|
77
|
-
return NetworkSampleRowProvider(
|
|
78
|
-
label_provider=ctx.LabelProvider,
|
|
79
|
-
text_provider=text_provider,
|
|
80
|
-
frames_connotation_provider=ctx.FramesConnotationProvider,
|
|
81
|
-
frame_role_label_scaler=ctx.FrameRolesLabelScaler,
|
|
82
|
-
pos_terms_mapper=PosTermsMapper(ctx.PosTagger) if ctx.PosTagger is not None else None,
|
|
83
|
-
term_embedding_pairs=term_embedding_pairs)
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
from arekit.common.pipeline.context import PipelineContext
|
|
2
|
-
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
3
|
-
from arekit.common.utils import split_by_whitespaces
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class TermsSplitterParser(BasePipelineItem):
|
|
7
|
-
|
|
8
|
-
def apply_core(self, input_data, pipeline_ctx):
|
|
9
|
-
assert(isinstance(pipeline_ctx, PipelineContext))
|
|
10
|
-
return split_by_whitespaces(input_data)
|
|
File without changes
|
|
File without changes
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
-
from arekit.contrib.source.nerel.reader import NerelDocReader
|
|
3
|
-
from arekit.contrib.source.nerel.versions import NerelVersions
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class NERELDocProvider(DocumentProvider):
|
|
7
|
-
""" A Russian dataset with nested named entities, relations, events and linked entities.
|
|
8
|
-
https://github.com/nerel-ds/NEREL
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
def __init__(self, filename_by_id, version):
|
|
12
|
-
""" filename_ids: dict
|
|
13
|
-
Dictionary of {id: filename}, where
|
|
14
|
-
- id: int
|
|
15
|
-
- filename: str
|
|
16
|
-
version: NerelVersions
|
|
17
|
-
Specify the appropriate version of teh NEREL collection.
|
|
18
|
-
"""
|
|
19
|
-
assert(isinstance(filename_by_id, dict))
|
|
20
|
-
assert(isinstance(version, NerelVersions))
|
|
21
|
-
super(NERELDocProvider, self).__init__()
|
|
22
|
-
self.__filename_by_id = filename_by_id
|
|
23
|
-
self.__version = version
|
|
24
|
-
self.__doc_reader = NerelDocReader(version)
|
|
25
|
-
|
|
26
|
-
def by_id(self, doc_id):
|
|
27
|
-
return self.__doc_reader.read_document(doc_id=doc_id, filename=self.__filename_by_id[doc_id])
|
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
-
from arekit.common.experiment.data_type import DataType
|
|
3
|
-
from arekit.contrib.source.nerel.io_utils import NerelIOUtils
|
|
4
|
-
from arekit.contrib.source.nerel.versions import NerelVersions
|
|
5
|
-
from arekit.contrib.utils.pipelines.sources.nerel.doc_provider import NERELDocProvider
|
|
6
|
-
from arekit.contrib.utils.pipelines.sources.nerel.labels_fmt import NerelAnyLabelFormatter
|
|
7
|
-
from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
|
|
8
|
-
from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
|
|
9
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def create_text_relation_extraction_pipeline(nerel_version,
|
|
13
|
-
text_parser,
|
|
14
|
-
label_formatter=NerelAnyLabelFormatter(),
|
|
15
|
-
terms_per_context=50,
|
|
16
|
-
doc_ops=None,
|
|
17
|
-
docs_limit=None,
|
|
18
|
-
custom_text_opinion_filters=None):
|
|
19
|
-
assert(isinstance(nerel_version, NerelVersions))
|
|
20
|
-
assert(isinstance(doc_ops, DocumentProvider) or doc_ops is None)
|
|
21
|
-
assert(isinstance(custom_text_opinion_filters, list) or custom_text_opinion_filters is None)
|
|
22
|
-
|
|
23
|
-
data_folding = None
|
|
24
|
-
|
|
25
|
-
if doc_ops is None:
|
|
26
|
-
# Default Initialization.
|
|
27
|
-
filenames_by_ids, data_folding = NerelIOUtils.read_dataset_split(version=nerel_version,
|
|
28
|
-
docs_limit=docs_limit)
|
|
29
|
-
doc_ops = NERELDocProvider(filename_by_id=filenames_by_ids, version=nerel_version)
|
|
30
|
-
|
|
31
|
-
# Default text opinion filters.
|
|
32
|
-
text_opinion_filters = [
|
|
33
|
-
DistanceLimitedTextOpinionFilter(terms_per_context)
|
|
34
|
-
]
|
|
35
|
-
|
|
36
|
-
# Append with the custom filters afterwards.
|
|
37
|
-
if custom_text_opinion_filters is not None:
|
|
38
|
-
text_opinion_filters += custom_text_opinion_filters
|
|
39
|
-
|
|
40
|
-
predefined_annot = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)
|
|
41
|
-
|
|
42
|
-
pipelines = {
|
|
43
|
-
DataType.Train: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
44
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
45
|
-
annotators=[predefined_annot],
|
|
46
|
-
entity_index_func=lambda brat_entity: brat_entity.ID,
|
|
47
|
-
text_opinion_filters=text_opinion_filters),
|
|
48
|
-
DataType.Test: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
49
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
50
|
-
annotators=[predefined_annot],
|
|
51
|
-
entity_index_func=lambda brat_entity: brat_entity.ID,
|
|
52
|
-
text_opinion_filters=text_opinion_filters),
|
|
53
|
-
DataType.Dev: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
54
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
55
|
-
annotators=[predefined_annot],
|
|
56
|
-
entity_index_func=lambda brat_entity: brat_entity.ID,
|
|
57
|
-
text_opinion_filters=text_opinion_filters),
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
# In the case when we setup a default data-folding.
|
|
61
|
-
# There is a need to provide it, due to the needs in further.
|
|
62
|
-
if data_folding is not None:
|
|
63
|
-
return pipelines, data_folding
|
|
64
|
-
|
|
65
|
-
return pipelines
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
2
|
-
from arekit.contrib.source.nerel import labels
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class NerelAnyLabelFormatter(StringLabelsFormatter):
|
|
6
|
-
|
|
7
|
-
def __init__(self):
|
|
8
|
-
|
|
9
|
-
stol = {
|
|
10
|
-
"OPINION_BELONGS_TO": labels.OpinionBelongsTo,
|
|
11
|
-
"OPINION_RELATES_TO": labels.OpinionRelatesTo,
|
|
12
|
-
"NEG_EFFECT_FROM": labels.NegEffectFrom,
|
|
13
|
-
"POS_EFFECT_FROM": labels.PosEffectFrom,
|
|
14
|
-
"NEG_STATE_FROM": labels.NegStateFrom,
|
|
15
|
-
"POS_STATE_FROM": labels.PosStateFrom,
|
|
16
|
-
"NEGATIVE_TO": labels.NegativeTo,
|
|
17
|
-
"POSITIVE_TO": labels.PositiveTo,
|
|
18
|
-
"STATE_BELONGS_TO": labels.STATE_BELONGS_TO,
|
|
19
|
-
"POS_AUTHOR_FROM": labels.PosAuthorFrom,
|
|
20
|
-
"NEG_AUTHOR_FROM": labels.NegAuthorFrom,
|
|
21
|
-
"ALTERNATIVE_NAME": labels.ALTERNATIVE_NAME,
|
|
22
|
-
"ORIGINS_FROM": labels.ORIGINS_FROM,
|
|
23
|
-
"START_TIME": labels.START_TIME,
|
|
24
|
-
"OWNER_OF": labels.OWNER_OF,
|
|
25
|
-
"SUBEVENT_OF": labels.SUBEVENT_OF,
|
|
26
|
-
"PARENT_OF": labels.PARENT_OF,
|
|
27
|
-
"SUBORDINATE_OF": labels.SUBORDINATE_OF,
|
|
28
|
-
"PART_OF": labels.PART_OF,
|
|
29
|
-
"TAKES_PLACE_IN": labels.TAKES_PLACE_IN,
|
|
30
|
-
"PARTICIPANT_IN": labels.PARTICIPANT_IN,
|
|
31
|
-
"WORKPLACE": labels.WORKPLACE,
|
|
32
|
-
"PENALIZED_AS": labels.PENALIZED_AS,
|
|
33
|
-
"WORKS_AS": labels.WORKS_AS,
|
|
34
|
-
"PLACE_OF_DEATH": labels.PLACE_OF_DEATH,
|
|
35
|
-
"PLACE_OF_BIRTH": labels.PLACE_OF_BIRTH,
|
|
36
|
-
"HAS_CAUSE": labels.HAS_CAUSE,
|
|
37
|
-
"AWARDED_WITH": labels.AWARDED_WITH,
|
|
38
|
-
"CAUSE_OF_DEATH": labels.CAUSE_OF_DEATH,
|
|
39
|
-
"CONVICTED_OF": labels.CONVICTED_OF,
|
|
40
|
-
"DATE_DEFUNCT_IN": labels.DATE_DEFUNCT_IN,
|
|
41
|
-
"DATE_FOUNDED_IN": labels.DATE_FOUNDED_IN,
|
|
42
|
-
"DATE_OF_BIRTH": labels.DATE_OF_BIRTH,
|
|
43
|
-
"DATE_OF_CREATION": labels.DATE_OF_CREATION,
|
|
44
|
-
"DATE_OF_DEATH": labels.DATE_OF_DEATH,
|
|
45
|
-
"END_TIME": labels.END_TIME,
|
|
46
|
-
"EXPENDITURE": labels.EXPENDITURE,
|
|
47
|
-
"FOUNDED_BY": labels.FOUNDED_BY,
|
|
48
|
-
"KNOWS": labels.KNOWS,
|
|
49
|
-
"RELATIVE": labels.RELATIVE,
|
|
50
|
-
"LOCATED_IN": labels.LOCATED_IN,
|
|
51
|
-
"RELIGION_OF": labels.RELIGION_OF,
|
|
52
|
-
"MEDICAL_CONDITION": labels.MEDICAL_CONDITION,
|
|
53
|
-
"SCHOOLS_ATTENDED": labels.SCHOOLS_ATTENDED,
|
|
54
|
-
"MEMBER_OF": labels.MEMBER_OF,
|
|
55
|
-
"SIBLING": labels.SIBLING,
|
|
56
|
-
"ORGANIZES": labels.ORGANIZES,
|
|
57
|
-
"SPOUSE": labels.SPOUSE
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
super(NerelAnyLabelFormatter, self).__init__(stol=stol)
|
|
File without changes
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
-
from arekit.contrib.source.nerelbio.reader import NerelBioDocReader
|
|
3
|
-
from arekit.contrib.source.nerelbio.versions import NerelBioVersions
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class NERELBioDocProvider(DocumentProvider):
|
|
7
|
-
""" NEREL-BIO extends the general domain dataset NEREL.
|
|
8
|
-
NEREL-BIO annotation scheme covers both general and biomedical
|
|
9
|
-
domains making it suitable for domain transfer experiments.
|
|
10
|
-
https://github.com/nerel-ds/NEREL-BIO
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
def __init__(self, filename_by_id, version):
|
|
14
|
-
""" filename_ids: dict
|
|
15
|
-
Dictionary of {id: filename}, where
|
|
16
|
-
- id: int
|
|
17
|
-
- filename: str
|
|
18
|
-
version: NerelBioVersions
|
|
19
|
-
Specify the appropriate version of the NEREL-BIO collection.
|
|
20
|
-
"""
|
|
21
|
-
assert(isinstance(filename_by_id, dict))
|
|
22
|
-
assert(isinstance(version, NerelBioVersions))
|
|
23
|
-
super(NERELBioDocProvider, self).__init__()
|
|
24
|
-
self.__filename_by_id = filename_by_id
|
|
25
|
-
self.__version = version
|
|
26
|
-
self.__doc_reader = NerelBioDocReader(version)
|
|
27
|
-
|
|
28
|
-
def by_id(self, doc_id):
|
|
29
|
-
return self.__doc_reader.read_document(doc_id=doc_id, filename=self.__filename_by_id[doc_id])
|
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
-
from arekit.common.experiment.data_type import DataType
|
|
3
|
-
from arekit.contrib.source.nerelbio.io_utils import NerelBioIOUtils
|
|
4
|
-
from arekit.contrib.source.nerelbio.versions import NerelBioVersions
|
|
5
|
-
from arekit.contrib.utils.pipelines.sources.nerel_bio.doc_provider import NERELBioDocProvider
|
|
6
|
-
from arekit.contrib.utils.pipelines.sources.nerel_bio.labels_fmt import NerelBioAnyLabelFormatter
|
|
7
|
-
from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
|
|
8
|
-
from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
|
|
9
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def create_text_relation_extraction_pipeline(nerel_bio_version,
|
|
13
|
-
text_parser,
|
|
14
|
-
label_formatter=NerelBioAnyLabelFormatter(),
|
|
15
|
-
terms_per_context=50,
|
|
16
|
-
doc_ops=None,
|
|
17
|
-
docs_limit=None,
|
|
18
|
-
custom_text_opinion_filters=None):
|
|
19
|
-
assert(isinstance(nerel_bio_version, NerelBioVersions))
|
|
20
|
-
assert(isinstance(doc_ops, DocumentProvider) or doc_ops is None)
|
|
21
|
-
assert(isinstance(custom_text_opinion_filters, list) or custom_text_opinion_filters is None)
|
|
22
|
-
|
|
23
|
-
data_folding = None
|
|
24
|
-
|
|
25
|
-
if doc_ops is None:
|
|
26
|
-
# Default Initialization.
|
|
27
|
-
filenames_by_ids, data_folding = NerelBioIOUtils.read_dataset_split(version=nerel_bio_version,
|
|
28
|
-
docs_limit=docs_limit)
|
|
29
|
-
doc_ops = NERELBioDocProvider(filename_by_id=filenames_by_ids, version=nerel_bio_version)
|
|
30
|
-
|
|
31
|
-
text_opinion_filters = [
|
|
32
|
-
DistanceLimitedTextOpinionFilter(terms_per_context)
|
|
33
|
-
]
|
|
34
|
-
|
|
35
|
-
# Append with the custom filters afterwards.
|
|
36
|
-
if custom_text_opinion_filters is not None:
|
|
37
|
-
text_opinion_filters += custom_text_opinion_filters
|
|
38
|
-
|
|
39
|
-
predefined_annot = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)
|
|
40
|
-
|
|
41
|
-
pipelines = {
|
|
42
|
-
DataType.Train: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
43
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
44
|
-
annotators=[predefined_annot],
|
|
45
|
-
entity_index_func=lambda brat_entity: brat_entity.ID,
|
|
46
|
-
text_opinion_filters=text_opinion_filters),
|
|
47
|
-
DataType.Test: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
48
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
49
|
-
annotators=[predefined_annot],
|
|
50
|
-
entity_index_func=lambda brat_entity: brat_entity.ID,
|
|
51
|
-
text_opinion_filters=text_opinion_filters),
|
|
52
|
-
DataType.Dev: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
53
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
54
|
-
annotators=[predefined_annot],
|
|
55
|
-
entity_index_func=lambda brat_entity: brat_entity.ID,
|
|
56
|
-
text_opinion_filters=text_opinion_filters),
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
# In the case when we setup a default data-folding.
|
|
60
|
-
# There is a need to provide it, due to the needs in further.
|
|
61
|
-
if data_folding is not None:
|
|
62
|
-
return pipelines, data_folding
|
|
63
|
-
|
|
64
|
-
return pipelines
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
2
|
-
from arekit.contrib.source.nerelbio import labels
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class NerelBioAnyLabelFormatter(StringLabelsFormatter):
|
|
6
|
-
|
|
7
|
-
def __init__(self):
|
|
8
|
-
|
|
9
|
-
stol = {
|
|
10
|
-
"ABBREVIATION": labels.ABBREVIATION,
|
|
11
|
-
"ALTERNATIVE_NAME": labels.ALTERNATIVE_NAME,
|
|
12
|
-
"KNOWS": labels.KNOWS,
|
|
13
|
-
"AGE_IS": labels.AGE_IS,
|
|
14
|
-
"AGE_DIED_AT": labels.AGE_DIED_AT,
|
|
15
|
-
"AWARDED_WITH": labels.AWARDED_WITH,
|
|
16
|
-
"PLACE_OF_BIRTH": labels.PLACE_OF_BIRTH,
|
|
17
|
-
"DATE_DEFUNCT_IN": labels.DATE_DEFUNCT_IN,
|
|
18
|
-
"DATE_FOUNDED_IN": labels.DATE_FOUNDED_IN,
|
|
19
|
-
"DATE_OF_BIRTH": labels.DATE_OF_BIRTH,
|
|
20
|
-
"DATE_OF_CREATION": labels.DATE_OF_CREATION,
|
|
21
|
-
"DATE_OF_DEATH": labels.DATE_OF_DEATH,
|
|
22
|
-
"POINT_IN_TIME": labels.POINT_IN_TIME,
|
|
23
|
-
"PLACE_OF_DEATH": labels.PLACE_OF_DEATH,
|
|
24
|
-
"FOUNDED_BY": labels.FOUNDED_BY,
|
|
25
|
-
"HEADQUARTERED_IN": labels.HEADQUARTERED_IN,
|
|
26
|
-
"IDEOLOGY_OF": labels.IDEOLOGY_OF,
|
|
27
|
-
"SPOUSE": labels.SPOUSE,
|
|
28
|
-
"MEMBER_OF": labels.MEMBER_OF,
|
|
29
|
-
"ORGANIZES": labels.ORGANIZES,
|
|
30
|
-
"OWNER_OF": labels.OWNER_OF,
|
|
31
|
-
"PARENT_OF": labels.PARENT_OF,
|
|
32
|
-
"PARTICIPANT_IN": labels.PARTICIPANT_IN,
|
|
33
|
-
"PLACE_RESIDES_IN": labels.PLACE_RESIDES_IN,
|
|
34
|
-
"PRICE_OF": labels.PRICE_OF,
|
|
35
|
-
"PRODUCES": labels.PRODUCES,
|
|
36
|
-
"RELATIVE": labels.RELATIVE,
|
|
37
|
-
"RELIGION_OF": labels.RELIGION_OF,
|
|
38
|
-
"SCHOOLS_ATTENDED": labels.SCHOOLS_ATTENDED,
|
|
39
|
-
"SIBLING": labels.SIBLING,
|
|
40
|
-
"SUBEVENT_OF": labels.SUBEVENT_OF,
|
|
41
|
-
"SUBORDINATE_OF": labels.SUBORDINATE_OF,
|
|
42
|
-
"TAKES_PLACE_IN": labels.TAKES_PLACE_IN,
|
|
43
|
-
"WORKPLACE": labels.WORKPLACE,
|
|
44
|
-
"WORKS_AS": labels.WORKS_AS,
|
|
45
|
-
"CONVICTED_OF": labels.CONVICTED_OF,
|
|
46
|
-
"PENALIZED_AS": labels.PENALIZED_AS,
|
|
47
|
-
"START_TIME": labels.START_TIME,
|
|
48
|
-
"END_TIME": labels.END_TIME,
|
|
49
|
-
"EXPENDITURE": labels.EXPENDITURE,
|
|
50
|
-
"AGENT": labels.AGENT,
|
|
51
|
-
"INANIMATE_INVOLVED": labels.INANIMATE_INVOLVED,
|
|
52
|
-
"INCOME": labels.INCOME,
|
|
53
|
-
"SUBCLASS_OF": labels.SUBCLASS_OF,
|
|
54
|
-
"PART_OF": labels.PART_OF,
|
|
55
|
-
"LOCATED_IN": labels.LOCATED_IN,
|
|
56
|
-
"TREATED_USING": labels.TREATED_USING,
|
|
57
|
-
"ORIGINS_FROM": labels.ORIGINS_FROM,
|
|
58
|
-
"TO_DETECT_OR_STUDY": labels.TO_DETECT_OR_STUDY,
|
|
59
|
-
"AFFECTS": labels.AFFECTS,
|
|
60
|
-
"HAS_CAUSE": labels.HAS_CAUSE,
|
|
61
|
-
"APPLIED_TO": labels.APPLIED_TO,
|
|
62
|
-
"USED_IN": labels.USED_IN,
|
|
63
|
-
"ASSOCIATED_WITH": labels.ASSOCIATED_WITH,
|
|
64
|
-
"HAS_ADMINISTRATION_ROUTE": labels.HAS_ADMINISTRATION_ROUTE,
|
|
65
|
-
"HAS_STRENGTH": labels.HAS_STRENGTH,
|
|
66
|
-
"DURATION_OF": labels.DURATION_OF,
|
|
67
|
-
"VALUE_IS": labels.VALUE_IS,
|
|
68
|
-
"PHYSIOLOGY_OF": labels.PHYSIOLOGY_OF,
|
|
69
|
-
"PROCEDURE_PERFORMED": labels.PROCEDURE_PERFORMED,
|
|
70
|
-
"MENTAL_PROCESS_OF": labels.MENTAL_PROCESS_OF,
|
|
71
|
-
"MEDICAL_CONDITION": labels.MEDICAL_CONDITION,
|
|
72
|
-
"DOSE_IS": labels.DOSE_IS,
|
|
73
|
-
"FINDING_OF": labels.FINDING_OF,
|
|
74
|
-
"CAUSE_OF_DEATH": labels.CAUSE_OF_DEATH,
|
|
75
|
-
"CONSUME": labels.CONSUME,
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
super(NerelBioAnyLabelFormatter, self).__init__(stol=stol)
|
|
79
|
-
|
|
File without changes
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
from arekit.common.utils import progress_bar_iter
|
|
2
|
-
from arekit.contrib.source.ruattitudes.collection import RuAttitudesCollection
|
|
3
|
-
from arekit.contrib.source.ruattitudes.io_utils import RuAttitudesVersions
|
|
4
|
-
from arekit.contrib.source.ruattitudes.doc import RuAttitudesDocument
|
|
5
|
-
from arekit.contrib.source.ruattitudes.doc_brat import RuAttitudesDocumentsConverter
|
|
6
|
-
from arekit.contrib.utils.data.doc_provider.dict_based import DictionaryBasedDocumentProvider
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class RuAttitudesDocumentProvider(DictionaryBasedDocumentProvider):
    """ Dictionary-based document provider filled from the RuAttitudes collection.

        The whole content is read eagerly in the constructor; every document is
        passed through `RuAttitudesDocumentsConverter.to_brat_doc` before being stored.
    """

    def __init__(self, version, keep_doc_ids_only, doc_id_func, limit):
        # Read the collection first, then hand the ready dictionary to the base class.
        docs_by_id = self.read_ruattitudes_to_brat_in_memory(
            version=version,
            keep_doc_ids_only=keep_doc_ids_only,
            doc_id_func=doc_id_func,
            limit=limit)
        super(RuAttitudesDocumentProvider, self).__init__(docs_by_id)

    @staticmethod
    def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, limit=None):
        """ Reads RuAttitudes documents into a dictionary: doc_id -> converted document.

            version: RuAttitudesVersions
                Version of the collection to iterate over.
            keep_doc_ids_only: bool
                When True, only document ids are read and every value
                of the resulting dictionary is None.
            doc_id_func: callable
                Forwarded to `RuAttitudesCollection.iter_docs` as `get_doc_index_func`.
            limit: int or None
                Reading stops once this amount of entries has been stored;
                None stands for reading everything.
                NOTE(review): the limit is checked after an entry is stored,
                so limit=0 still results in a single entry.

            returns: dict
        """
        assert isinstance(version, RuAttitudesVersions)
        assert isinstance(keep_doc_ids_only, bool)
        assert callable(doc_id_func)

        source_it = RuAttitudesCollection.iter_docs(version=version,
                                                    get_doc_index_func=doc_id_func,
                                                    return_inds_only=keep_doc_ids_only)

        # Unify both reading modes into a stream of (doc_id, doc or None) pairs.
        pairs_it = RuAttitudesDocumentProvider.__iter_id_with_doc(
            docs_it=source_it, keep_doc_ids_only=keep_doc_ids_only)

        mode = "doc ids only" if keep_doc_ids_only else "fully"
        logged_it = progress_bar_iter(
            iterable=pairs_it,
            desc="Loading RuAttitudes Collection [{}]".format(mode),
            unit='docs')

        result = {}
        for entries_stored, (doc_id, doc) in enumerate(logged_it, start=1):
            assert doc is None or isinstance(doc, RuAttitudesDocument)
            result[doc_id] = None if doc is None else RuAttitudesDocumentsConverter.to_brat_doc(doc)
            if limit is not None and entries_stored >= limit:
                break

        return result

    @staticmethod
    def __iter_id_with_doc(docs_it, keep_doc_ids_only):
        """ Yields (doc_id, doc) pairs; doc is None in the ids-only mode,
            in which `docs_it` is treated as an iterator of ids.
        """
        for item in docs_it:
            if keep_doc_ids_only:
                yield item, None
            else:
                assert isinstance(item, RuAttitudesDocument)
                yield item.ID, item
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.types import OpinionEntityType
|
|
2
|
-
from arekit.contrib.utils.entities.filter import EntityFilter
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class RuAttitudesEntityFilter(EntityFilter):
    """ Task-specific filter that works with the entity types proposed
        by the OntoNotesV5 resource: https://catalog.ldc.upenn.edu/LDC2013T19
        Only a short list of types, related to the sentiment attitude
        extraction task, is kept.
    """

    # Entity types that are allowed to act as a Subject or an Object of an opinion.
    supported = ["GPE", "PERSON", "LOCAL", "GEO", "ORG"]

    def is_ignored(self, entity, e_type):
        """ Returns True when the entity has to be dropped: either the role `e_type`
            is neither Subject nor Object, or the entity type is not in `supported`.
        """
        is_opinion_role = (e_type == OpinionEntityType.Subject or
                           e_type == OpinionEntityType.Object)

        # Any role other than Subject / Object is ignored unconditionally.
        if not is_opinion_role:
            return True

        return entity.Type not in RuAttitudesEntityFilter.supported
|
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.scaler.base import BaseLabelScaler
|
|
2
|
-
from arekit.contrib.source.ruattitudes.io_utils import RuAttitudesVersions
|
|
3
|
-
from arekit.contrib.source.ruattitudes.labels_fmt import RuAttitudesLabelFormatter
|
|
4
|
-
from arekit.contrib.utils.pipelines.sources.ruattitudes.doc_provider import RuAttitudesDocumentProvider
|
|
5
|
-
from arekit.contrib.utils.pipelines.sources.ruattitudes.entity_filter import RuAttitudesEntityFilter
|
|
6
|
-
from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
|
|
7
|
-
from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
|
|
8
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
|
|
9
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.entity_based import EntityBasedTextOpinionFilter
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def create_text_opinion_extraction_pipeline(text_parser,
                                            label_scaler,
                                            custom_text_opinion_filters=None,
                                            version=RuAttitudesVersions.V20Large,
                                            terms_per_context=50,
                                            limit=None):
    """ Processing pipeline for RuAttitudes.
        This pipeline is based on the in-memory RuAttitudes storage.

        Original collection paper: www.aclweb.org/anthology/r19-1118/
        Github repository: https://github.com/nicolay-r/RuAttitudes

        text_parser:
            Forwarded as is to `text_opinion_extraction_pipeline`.
        label_scaler: BaseLabelScaler
            Scaler that performs the conversion from integer labels (RuAttitudes) to
            the actual `Label` instances, required further for text_opinions instances.
        custom_text_opinion_filters: list or None
            Extra text opinion filters; they are appended after the default
            entity-based and distance-based filters. The passed list is not modified.
        version: enum
            Version of the RuAttitudes collection.
            NOTE: only the variations of the 2.0 version are supported.
        terms_per_context: int
            Amount of terms that we consider in between the Object and Subject.
        limit: int or None
            Limit of documents to consider.

        returns: the pipeline created by `text_opinion_extraction_pipeline`.
    """
    assert(isinstance(label_scaler, BaseLabelScaler))
    assert(isinstance(version, RuAttitudesVersions))
    assert(isinstance(custom_text_opinion_filters, list) or custom_text_opinion_filters is None)
    assert(version in [RuAttitudesVersions.V20Large, RuAttitudesVersions.V20Base,
                       RuAttitudesVersions.V20BaseNeut, RuAttitudesVersions.V20LargeNeut])

    doc_provider = RuAttitudesDocumentProvider(version=version,
                                               keep_doc_ids_only=False,
                                               doc_id_func=lambda doc_id: doc_id,
                                               limit=limit)

    # Default filters come first.
    text_opinion_filters = [
        EntityBasedTextOpinionFilter(entity_filter=RuAttitudesEntityFilter()),
        DistanceLimitedTextOpinionFilter(terms_per_context)
    ]

    # Append with the custom filters afterwards.
    if custom_text_opinion_filters is not None:
        text_opinion_filters.extend(custom_text_opinion_filters)

    # The merged list is passed here. Passing only `custom_text_opinion_filters`
    # dropped the default filters above and made `terms_per_context` have no effect.
    pipeline = text_opinion_extraction_pipeline(
        annotators=[
            PredefinedTextOpinionAnnotator(doc_provider=doc_provider,
                                           label_formatter=RuAttitudesLabelFormatter(label_scaler))
        ],
        text_opinion_filters=text_opinion_filters,
        get_doc_by_id_func=doc_provider.by_id,
        entity_index_func=lambda brat_entity: brat_entity.ID,
        text_parser=text_parser)

    return pipeline
|
|
File without changes
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
-
from arekit.common.synonyms.base import SynonymsCollection
|
|
3
|
-
from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions
|
|
4
|
-
from arekit.contrib.source.rusentrel.docs_reader import RuSentRelDocumentsReader
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class RuSentrelDocumentProvider(DocumentProvider):
    """ Provides RuSentRel documents by reading them on request.
        Limitations: Supported only train/test collections format
    """

    def __init__(self, version, synonyms):
        assert isinstance(version, RuSentRelVersions)
        assert isinstance(synonyms, SynonymsCollection)
        super().__init__()
        # Both values are kept only to be forwarded to the reader in `by_id`.
        self.__version = version
        self.__synonyms = synonyms

    def by_id(self, doc_id):
        """ Reads the document with the given integer `doc_id` via
            `RuSentRelDocumentsReader.read_document`; nothing is cached here.
        """
        assert isinstance(doc_id, int)
        return RuSentRelDocumentsReader.read_document(
            doc_id=doc_id,
            synonyms=self.__synonyms,
            version=self.__version)
|
|
21
|
-
|