arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/data/storages/base.py +4 -15
- arekit/common/docs/entities_grouping.py +2 -1
- arekit/common/docs/parser.py +27 -22
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +39 -2
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
- arekit/common/utils.py +11 -52
- arekit/contrib/utils/data/contents/opinions.py +13 -3
- arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- arekit/contrib/utils/data/storages/pandas_based.py +2 -17
- arekit/contrib/utils/data/storages/row_cache.py +8 -2
- arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
- arekit/contrib/utils/io_utils/utils.py +1 -18
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
- arekit-0.25.1.data/data/logo.png +0 -0
- arekit-0.25.1.dist-info/METADATA +81 -0
- arekit-0.25.1.dist-info/RECORD +186 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +0 -68
- arekit/common/data/input/repositories/sample.py +0 -22
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +0 -26
- arekit/common/docs/objects_parser.py +0 -37
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +0 -149
- arekit/contrib/networks/embedding_io.py +0 -18
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +0 -6
- arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit/contrib/networks/input/providers/text.py +0 -24
- arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit/contrib/networks/input/term_types.py +0 -13
- arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit/contrib/networks/vectorizer.py +0 -6
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -84
- arekit/contrib/source/brat/doc.py +0 -28
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -13
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -74
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/doc.py +0 -51
- arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -59
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/docs_reader.py +0 -51
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -31
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +0 -7
- arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +0 -50
- arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
- arekit/contrib/utils/download.py +0 -77
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit/contrib/utils/io_utils/opinions.py +0 -37
- arekit/contrib/utils/io_utils/samples.py +0 -79
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -41
- arekit/contrib/utils/lexicons/relation.py +0 -42
- arekit/contrib/utils/lexicons/rusentilex.py +0 -37
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
- arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
- arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit/contrib/utils/resources.py +0 -25
- arekit/contrib/utils/serializer.py +0 -43
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- arekit/download_data.py +0 -11
- arekit-0.24.0.dist-info/METADATA +0 -23
- arekit-0.24.0.dist-info/RECORD +0 -374
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
|
@@ -1,107 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
|
|
3
|
-
from arekit.common.context.token import Token
|
|
4
|
-
from arekit.common.pipeline.context import PipelineContext
|
|
5
|
-
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
6
|
-
from arekit.common.utils import split_by_whitespaces
|
|
7
|
-
from arekit.contrib.utils.processing.text.tokens import Tokens
|
|
8
|
-
|
|
9
|
-
logger = logging.getLogger(__name__)
|
|
10
|
-
logger.setLevel(logging.INFO)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class DefaultTextTokenizer(BasePipelineItem):
|
|
14
|
-
""" Default parser implementation.
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
def __init__(self, keep_tokens=True):
|
|
18
|
-
super(DefaultTextTokenizer, self).__init__()
|
|
19
|
-
self.__keep_tokens = keep_tokens
|
|
20
|
-
|
|
21
|
-
# region protected methods
|
|
22
|
-
|
|
23
|
-
def apply_core(self, input_data, pipeline_ctx):
|
|
24
|
-
assert(isinstance(pipeline_ctx, PipelineContext))
|
|
25
|
-
output_data = self.__process_parts(input_data)
|
|
26
|
-
if not self.__keep_tokens:
|
|
27
|
-
output_data = [word for word in output_data if not isinstance(word, Token)]
|
|
28
|
-
return output_data
|
|
29
|
-
|
|
30
|
-
# endregion
|
|
31
|
-
|
|
32
|
-
# region private static methods
|
|
33
|
-
|
|
34
|
-
def __process_parts(self, parts):
|
|
35
|
-
assert(isinstance(parts, list))
|
|
36
|
-
|
|
37
|
-
parsed = []
|
|
38
|
-
for part in parts:
|
|
39
|
-
|
|
40
|
-
if part is None:
|
|
41
|
-
continue
|
|
42
|
-
|
|
43
|
-
# Keep non str words as it is and try to parse str-based words.
|
|
44
|
-
processed = [part] if not isinstance(part, str) else \
|
|
45
|
-
self.__iter_processed_part(part=part)
|
|
46
|
-
|
|
47
|
-
parsed.extend(processed)
|
|
48
|
-
|
|
49
|
-
return parsed
|
|
50
|
-
|
|
51
|
-
def __iter_processed_part(self, part):
|
|
52
|
-
for word in split_by_whitespaces(part):
|
|
53
|
-
for term in self.__process_word(word):
|
|
54
|
-
yield term
|
|
55
|
-
|
|
56
|
-
def __process_word(self, word):
|
|
57
|
-
assert(isinstance(word, str))
|
|
58
|
-
return self.__split_tokens(word)
|
|
59
|
-
|
|
60
|
-
@staticmethod
|
|
61
|
-
def __split_tokens(term):
|
|
62
|
-
"""
|
|
63
|
-
Splitting off tokens from parsed_doc ending, i.e. for example:
|
|
64
|
-
term: "сказать,-" -> "(term: "сказать", ["COMMA_TOKEN", "DASH_TOKEN"])
|
|
65
|
-
return: (unicode or None, list)
|
|
66
|
-
modified term and list of extracted tokens.
|
|
67
|
-
"""
|
|
68
|
-
|
|
69
|
-
url = Tokens.try_create_url(term)
|
|
70
|
-
if url is not None:
|
|
71
|
-
return [url]
|
|
72
|
-
|
|
73
|
-
l = 0
|
|
74
|
-
words_and_tokens = []
|
|
75
|
-
while l < len(term):
|
|
76
|
-
|
|
77
|
-
# Token.
|
|
78
|
-
token = Tokens.try_create(term[l])
|
|
79
|
-
if token is not None:
|
|
80
|
-
if token.get_token_value() != Tokens.NEW_LINE:
|
|
81
|
-
words_and_tokens.append(token)
|
|
82
|
-
l += 1
|
|
83
|
-
|
|
84
|
-
# Number.
|
|
85
|
-
elif str.isdigit(term[l]):
|
|
86
|
-
k = l + 1
|
|
87
|
-
while k < len(term) and str.isdigit(term[k]):
|
|
88
|
-
k += 1
|
|
89
|
-
token = Tokens.try_create_number(term[l:k])
|
|
90
|
-
assert(token is not None)
|
|
91
|
-
words_and_tokens.append(token)
|
|
92
|
-
l = k
|
|
93
|
-
|
|
94
|
-
# Term.
|
|
95
|
-
else:
|
|
96
|
-
k = l + 1
|
|
97
|
-
while k < len(term):
|
|
98
|
-
token = Tokens.try_create(term[k])
|
|
99
|
-
if token is not None and token.get_token_value() != Tokens.DASH:
|
|
100
|
-
break
|
|
101
|
-
k += 1
|
|
102
|
-
words_and_tokens.append(term[l:k])
|
|
103
|
-
l = k
|
|
104
|
-
|
|
105
|
-
return words_and_tokens
|
|
106
|
-
|
|
107
|
-
# endregion
|
|
@@ -1,135 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.input.providers.const import IDLE_MODE
|
|
2
|
-
from arekit.common.pipeline.conts import PARENT_CTX
|
|
3
|
-
from arekit.common.entities.base import Entity
|
|
4
|
-
from arekit.common.pipeline.context import PipelineContext
|
|
5
|
-
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class MLTextTranslatorPipelineItem(BasePipelineItem):
|
|
9
|
-
""" Machine learning based translator pipeline item.
|
|
10
|
-
"""
|
|
11
|
-
|
|
12
|
-
def __init__(self, batch_translate_model, do_translate_entity=True):
|
|
13
|
-
""" Model, which is based on translation of the text,
|
|
14
|
-
represented as a list of words.
|
|
15
|
-
"""
|
|
16
|
-
self.__do_translate_entity = do_translate_entity
|
|
17
|
-
self.__translate = batch_translate_model
|
|
18
|
-
|
|
19
|
-
def fast_most_accurate_approach(self, input_data, entity_placeholder_template="<entityTag={}/>"):
|
|
20
|
-
""" This approach assumes that the translation won't corrupt the original
|
|
21
|
-
meta-annotation for entities and objects mentioned in text.
|
|
22
|
-
"""
|
|
23
|
-
|
|
24
|
-
def __optionally_register(prts):
|
|
25
|
-
if len(prts) > 0:
|
|
26
|
-
content.append(" ".join(prts))
|
|
27
|
-
parts_to_join.clear()
|
|
28
|
-
|
|
29
|
-
content = []
|
|
30
|
-
origin_entities = []
|
|
31
|
-
parts_to_join = []
|
|
32
|
-
|
|
33
|
-
for part in input_data:
|
|
34
|
-
if isinstance(part, str) and part.strip():
|
|
35
|
-
parts_to_join.append(part)
|
|
36
|
-
elif isinstance(part, Entity):
|
|
37
|
-
entity_index = len(origin_entities)
|
|
38
|
-
parts_to_join.append(entity_placeholder_template.format(entity_index))
|
|
39
|
-
# Register entities information for further restoration.
|
|
40
|
-
origin_entities.append(part)
|
|
41
|
-
|
|
42
|
-
# Register original text with masked named entities.
|
|
43
|
-
__optionally_register(parts_to_join)
|
|
44
|
-
# Register all named entities in order of their appearance in text.
|
|
45
|
-
content.extend([e.Value for e in origin_entities])
|
|
46
|
-
|
|
47
|
-
# Compose text parts.
|
|
48
|
-
translated_parts = self.__translate(content)
|
|
49
|
-
|
|
50
|
-
if len(translated_parts) == 0:
|
|
51
|
-
return None
|
|
52
|
-
|
|
53
|
-
# Take the original text.
|
|
54
|
-
text = translated_parts[0]
|
|
55
|
-
for entity_index in range(len(origin_entities)):
|
|
56
|
-
if entity_placeholder_template.format(entity_index) not in text:
|
|
57
|
-
return None
|
|
58
|
-
|
|
59
|
-
# Enumerate entities.
|
|
60
|
-
from_ind = 0
|
|
61
|
-
text_parts = []
|
|
62
|
-
for entity_index, translated_value in enumerate(translated_parts[1:]):
|
|
63
|
-
entity_placeholder_instance = entity_placeholder_template.format(entity_index)
|
|
64
|
-
# Cropping text part.
|
|
65
|
-
to_ind = text.index(entity_placeholder_instance)
|
|
66
|
-
|
|
67
|
-
if self.__do_translate_entity:
|
|
68
|
-
origin_entities[entity_index].set_display_value(translated_value.strip())
|
|
69
|
-
|
|
70
|
-
# Register entities.
|
|
71
|
-
text_parts.append(text[from_ind:to_ind])
|
|
72
|
-
text_parts.append(origin_entities[entity_index])
|
|
73
|
-
# Update from index.
|
|
74
|
-
from_ind = to_ind + len(entity_placeholder_instance)
|
|
75
|
-
|
|
76
|
-
# Consider the remaining part.
|
|
77
|
-
text_parts.append(text[from_ind:])
|
|
78
|
-
return text_parts
|
|
79
|
-
|
|
80
|
-
def default_pre_part_splitting_approach(self, input_data):
|
|
81
|
-
""" This is the original strategy, based on the manually cropped named entities
|
|
82
|
-
before the actual translation call.
|
|
83
|
-
"""
|
|
84
|
-
|
|
85
|
-
def __optionally_register(prts):
|
|
86
|
-
if len(prts) > 0:
|
|
87
|
-
content.append(" ".join(prts))
|
|
88
|
-
parts_to_join.clear()
|
|
89
|
-
|
|
90
|
-
content = []
|
|
91
|
-
origin_entities = []
|
|
92
|
-
origin_entity_ind = []
|
|
93
|
-
parts_to_join = []
|
|
94
|
-
|
|
95
|
-
for _, part in enumerate(input_data):
|
|
96
|
-
if isinstance(part, str) and part.strip():
|
|
97
|
-
parts_to_join.append(part)
|
|
98
|
-
elif isinstance(part, Entity):
|
|
99
|
-
# Register first the prior parts were merged.
|
|
100
|
-
__optionally_register(parts_to_join)
|
|
101
|
-
# Register entities information for further restoration.
|
|
102
|
-
origin_entity_ind.append(len(content))
|
|
103
|
-
origin_entities.append(part)
|
|
104
|
-
content.append(part.Value)
|
|
105
|
-
|
|
106
|
-
__optionally_register(parts_to_join)
|
|
107
|
-
|
|
108
|
-
# Compose text parts.
|
|
109
|
-
translated_parts = self.__translate(content)
|
|
110
|
-
|
|
111
|
-
for entity_ind, entity_part_ind in enumerate(origin_entity_ind):
|
|
112
|
-
entity = origin_entities[entity_ind]
|
|
113
|
-
if self.__do_translate_entity:
|
|
114
|
-
entity.set_display_value(translated_parts[entity_part_ind].strip())
|
|
115
|
-
translated_parts[entity_part_ind] = entity
|
|
116
|
-
|
|
117
|
-
return translated_parts
|
|
118
|
-
|
|
119
|
-
def apply_core(self, input_data, pipeline_ctx):
|
|
120
|
-
assert(isinstance(pipeline_ctx, PipelineContext))
|
|
121
|
-
assert(isinstance(input_data, list))
|
|
122
|
-
|
|
123
|
-
# Check the pipeline state whether is an idle mode or not.
|
|
124
|
-
parent_ctx = pipeline_ctx.provide(PARENT_CTX)
|
|
125
|
-
idle_mode = parent_ctx.provide(IDLE_MODE)
|
|
126
|
-
|
|
127
|
-
# When pipeline utilized only for the assessing the expected amount
|
|
128
|
-
# of rows (common case of idle_mode), there is no need to perform
|
|
129
|
-
# translation.
|
|
130
|
-
if idle_mode:
|
|
131
|
-
return
|
|
132
|
-
|
|
133
|
-
fast_accurate = self.fast_most_accurate_approach(input_data)
|
|
134
|
-
return self.default_pre_part_splitting_approach(input_data) \
|
|
135
|
-
if fast_accurate is None else fast_accurate
|
|
File without changes
|
|
File without changes
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
-
from arekit.contrib.source.nerel.reader import NerelDocReader
|
|
3
|
-
from arekit.contrib.source.nerel.versions import NerelVersions
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class NERELDocProvider(DocumentProvider):
|
|
7
|
-
""" A Russian dataset with nested named entities, relations, events and linked entities.
|
|
8
|
-
https://github.com/nerel-ds/NEREL
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
def __init__(self, filename_by_id, version):
|
|
12
|
-
""" filename_ids: dict
|
|
13
|
-
Dictionary of {id: filename}, where
|
|
14
|
-
- id: int
|
|
15
|
-
- filename: str
|
|
16
|
-
version: NerelVersions
|
|
17
|
-
Specify the appropriate version of teh NEREL collection.
|
|
18
|
-
"""
|
|
19
|
-
assert(isinstance(filename_by_id, dict))
|
|
20
|
-
assert(isinstance(version, NerelVersions))
|
|
21
|
-
super(NERELDocProvider, self).__init__()
|
|
22
|
-
self.__filename_by_id = filename_by_id
|
|
23
|
-
self.__version = version
|
|
24
|
-
self.__doc_reader = NerelDocReader(version)
|
|
25
|
-
|
|
26
|
-
def by_id(self, doc_id):
|
|
27
|
-
return self.__doc_reader.read_document(doc_id=doc_id, filename=self.__filename_by_id[doc_id])
|
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
-
from arekit.common.experiment.data_type import DataType
|
|
3
|
-
from arekit.contrib.source.nerel.io_utils import NerelIOUtils
|
|
4
|
-
from arekit.contrib.source.nerel.versions import NerelVersions
|
|
5
|
-
from arekit.contrib.utils.pipelines.sources.nerel.doc_provider import NERELDocProvider
|
|
6
|
-
from arekit.contrib.utils.pipelines.sources.nerel.labels_fmt import NerelAnyLabelFormatter
|
|
7
|
-
from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
|
|
8
|
-
from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
|
|
9
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def create_text_relation_extraction_pipeline(nerel_version,
|
|
13
|
-
text_parser,
|
|
14
|
-
label_formatter=NerelAnyLabelFormatter(),
|
|
15
|
-
terms_per_context=50,
|
|
16
|
-
doc_ops=None,
|
|
17
|
-
docs_limit=None,
|
|
18
|
-
custom_text_opinion_filters=None):
|
|
19
|
-
assert(isinstance(nerel_version, NerelVersions))
|
|
20
|
-
assert(isinstance(doc_ops, DocumentProvider) or doc_ops is None)
|
|
21
|
-
assert(isinstance(custom_text_opinion_filters, list) or custom_text_opinion_filters is None)
|
|
22
|
-
|
|
23
|
-
data_folding = None
|
|
24
|
-
|
|
25
|
-
if doc_ops is None:
|
|
26
|
-
# Default Initialization.
|
|
27
|
-
filenames_by_ids, data_folding = NerelIOUtils.read_dataset_split(version=nerel_version,
|
|
28
|
-
docs_limit=docs_limit)
|
|
29
|
-
doc_ops = NERELDocProvider(filename_by_id=filenames_by_ids, version=nerel_version)
|
|
30
|
-
|
|
31
|
-
# Default text opinion filters.
|
|
32
|
-
text_opinion_filters = [
|
|
33
|
-
DistanceLimitedTextOpinionFilter(terms_per_context)
|
|
34
|
-
]
|
|
35
|
-
|
|
36
|
-
# Append with the custom filters afterwards.
|
|
37
|
-
if custom_text_opinion_filters is not None:
|
|
38
|
-
text_opinion_filters += custom_text_opinion_filters
|
|
39
|
-
|
|
40
|
-
predefined_annot = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)
|
|
41
|
-
|
|
42
|
-
pipelines = {
|
|
43
|
-
DataType.Train: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
44
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
45
|
-
annotators=[predefined_annot],
|
|
46
|
-
entity_index_func=lambda brat_entity: brat_entity.ID,
|
|
47
|
-
text_opinion_filters=text_opinion_filters),
|
|
48
|
-
DataType.Test: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
49
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
50
|
-
annotators=[predefined_annot],
|
|
51
|
-
entity_index_func=lambda brat_entity: brat_entity.ID,
|
|
52
|
-
text_opinion_filters=text_opinion_filters),
|
|
53
|
-
DataType.Dev: text_opinion_extraction_pipeline(text_parser=text_parser,
|
|
54
|
-
get_doc_by_id_func=doc_ops.by_id,
|
|
55
|
-
annotators=[predefined_annot],
|
|
56
|
-
entity_index_func=lambda brat_entity: brat_entity.ID,
|
|
57
|
-
text_opinion_filters=text_opinion_filters),
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
# In the case when we setup a default data-folding.
|
|
61
|
-
# There is a need to provide it, due to the needs in further.
|
|
62
|
-
if data_folding is not None:
|
|
63
|
-
return pipelines, data_folding
|
|
64
|
-
|
|
65
|
-
return pipelines
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
2
|
-
from arekit.contrib.source.nerel import labels
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class NerelAnyLabelFormatter(StringLabelsFormatter):
|
|
6
|
-
|
|
7
|
-
def __init__(self):
|
|
8
|
-
|
|
9
|
-
stol = {
|
|
10
|
-
"OPINION_BELONGS_TO": labels.OpinionBelongsTo,
|
|
11
|
-
"OPINION_RELATES_TO": labels.OpinionRelatesTo,
|
|
12
|
-
"NEG_EFFECT_FROM": labels.NegEffectFrom,
|
|
13
|
-
"POS_EFFECT_FROM": labels.PosEffectFrom,
|
|
14
|
-
"NEG_STATE_FROM": labels.NegStateFrom,
|
|
15
|
-
"POS_STATE_FROM": labels.PosStateFrom,
|
|
16
|
-
"NEGATIVE_TO": labels.NegativeTo,
|
|
17
|
-
"POSITIVE_TO": labels.PositiveTo,
|
|
18
|
-
"STATE_BELONGS_TO": labels.STATE_BELONGS_TO,
|
|
19
|
-
"POS_AUTHOR_FROM": labels.PosAuthorFrom,
|
|
20
|
-
"NEG_AUTHOR_FROM": labels.NegAuthorFrom,
|
|
21
|
-
"ALTERNATIVE_NAME": labels.ALTERNATIVE_NAME,
|
|
22
|
-
"ORIGINS_FROM": labels.ORIGINS_FROM,
|
|
23
|
-
"START_TIME": labels.START_TIME,
|
|
24
|
-
"OWNER_OF": labels.OWNER_OF,
|
|
25
|
-
"SUBEVENT_OF": labels.SUBEVENT_OF,
|
|
26
|
-
"PARENT_OF": labels.PARENT_OF,
|
|
27
|
-
"SUBORDINATE_OF": labels.SUBORDINATE_OF,
|
|
28
|
-
"PART_OF": labels.PART_OF,
|
|
29
|
-
"TAKES_PLACE_IN": labels.TAKES_PLACE_IN,
|
|
30
|
-
"PARTICIPANT_IN": labels.PARTICIPANT_IN,
|
|
31
|
-
"WORKPLACE": labels.WORKPLACE,
|
|
32
|
-
"PENALIZED_AS": labels.PENALIZED_AS,
|
|
33
|
-
"WORKS_AS": labels.WORKS_AS,
|
|
34
|
-
"PLACE_OF_DEATH": labels.PLACE_OF_DEATH,
|
|
35
|
-
"PLACE_OF_BIRTH": labels.PLACE_OF_BIRTH,
|
|
36
|
-
"HAS_CAUSE": labels.HAS_CAUSE,
|
|
37
|
-
"AWARDED_WITH": labels.AWARDED_WITH,
|
|
38
|
-
"CAUSE_OF_DEATH": labels.CAUSE_OF_DEATH,
|
|
39
|
-
"CONVICTED_OF": labels.CONVICTED_OF,
|
|
40
|
-
"DATE_DEFUNCT_IN": labels.DATE_DEFUNCT_IN,
|
|
41
|
-
"DATE_FOUNDED_IN": labels.DATE_FOUNDED_IN,
|
|
42
|
-
"DATE_OF_BIRTH": labels.DATE_OF_BIRTH,
|
|
43
|
-
"DATE_OF_CREATION": labels.DATE_OF_CREATION,
|
|
44
|
-
"DATE_OF_DEATH": labels.DATE_OF_DEATH,
|
|
45
|
-
"END_TIME": labels.END_TIME,
|
|
46
|
-
"EXPENDITURE": labels.EXPENDITURE,
|
|
47
|
-
"FOUNDED_BY": labels.FOUNDED_BY,
|
|
48
|
-
"KNOWS": labels.KNOWS,
|
|
49
|
-
"RELATIVE": labels.RELATIVE,
|
|
50
|
-
"LOCATED_IN": labels.LOCATED_IN,
|
|
51
|
-
"RELIGION_OF": labels.RELIGION_OF,
|
|
52
|
-
"MEDICAL_CONDITION": labels.MEDICAL_CONDITION,
|
|
53
|
-
"SCHOOLS_ATTENDED": labels.SCHOOLS_ATTENDED,
|
|
54
|
-
"MEMBER_OF": labels.MEMBER_OF,
|
|
55
|
-
"SIBLING": labels.SIBLING,
|
|
56
|
-
"ORGANIZES": labels.ORGANIZES,
|
|
57
|
-
"SPOUSE": labels.SPOUSE
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
super(NerelAnyLabelFormatter, self).__init__(stol=stol)
|
|
File without changes
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
-
from arekit.contrib.source.nerelbio.reader import NerelBioDocReader
|
|
3
|
-
from arekit.contrib.source.nerelbio.versions import NerelBioVersions
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class NERELBioDocProvider(DocumentProvider):
|
|
7
|
-
""" NEREL-BIO extends the general domain dataset NEREL.
|
|
8
|
-
NEREL-BIO annotation scheme covers both general and biomedical
|
|
9
|
-
domains making it suitable for domain transfer experiments.
|
|
10
|
-
https://github.com/nerel-ds/NEREL-BIO
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
def __init__(self, filename_by_id, version):
|
|
14
|
-
""" filename_ids: dict
|
|
15
|
-
Dictionary of {id: filename}, where
|
|
16
|
-
- id: int
|
|
17
|
-
- filename: str
|
|
18
|
-
version: NerelBioVersions
|
|
19
|
-
Specify the appropriate version of the NEREL-BIO collection.
|
|
20
|
-
"""
|
|
21
|
-
assert(isinstance(filename_by_id, dict))
|
|
22
|
-
assert(isinstance(version, NerelBioVersions))
|
|
23
|
-
super(NERELBioDocProvider, self).__init__()
|
|
24
|
-
self.__filename_by_id = filename_by_id
|
|
25
|
-
self.__version = version
|
|
26
|
-
self.__doc_reader = NerelBioDocReader(version)
|
|
27
|
-
|
|
28
|
-
def by_id(self, doc_id):
|
|
29
|
-
return self.__doc_reader.read_document(doc_id=doc_id, filename=self.__filename_by_id[doc_id])
|
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
-
from arekit.common.experiment.data_type import DataType
|
|
3
|
-
from arekit.contrib.source.nerelbio.io_utils import NerelBioIOUtils
|
|
4
|
-
from arekit.contrib.source.nerelbio.versions import NerelBioVersions
|
|
5
|
-
from arekit.contrib.utils.pipelines.sources.nerel_bio.doc_provider import NERELBioDocProvider
|
|
6
|
-
from arekit.contrib.utils.pipelines.sources.nerel_bio.labels_fmt import NerelBioAnyLabelFormatter
|
|
7
|
-
from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
|
|
8
|
-
from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
|
|
9
|
-
from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def create_text_relation_extraction_pipeline(nerel_bio_version,
                                             text_parser,
                                             label_formatter=None,
                                             terms_per_context=50,
                                             doc_ops=None,
                                             docs_limit=None,
                                             custom_text_opinion_filters=None):
    """ Creates text-opinion extraction pipelines for the NEREL-BIO collection.

        nerel_bio_version: NerelBioVersions
            Version of the NEREL-BIO collection to read.
        text_parser:
            Parser applied onto the document contents.
        label_formatter: StringLabelsFormatter or None
            Formatter passed to the predefined annotator; when None, a fresh
            NerelBioAnyLabelFormatter is created per call.  (Previously this
            was a def-time default, i.e. a single instance shared by every
            call; lazily creating it avoids the shared mutable default.)
        terms_per_context: int
            Distance limit (in terms) applied by the default opinion filter.
        doc_ops: DocumentProvider or None
            Custom document provider; when None, the default NEREL-BIO
            provider is created from the official dataset split.
        docs_limit: int or None
            Optional cap on the number of documents of the default split.
        custom_text_opinion_filters: list or None
            Extra filters appended after the default distance-based filter.

        Returns:
            dict of {DataType: pipeline}; when the default document provider
            was created here, a (pipelines, data_folding) pair is returned
            instead, since callers then also need the data split.
    """
    assert(isinstance(nerel_bio_version, NerelBioVersions))
    assert(isinstance(doc_ops, DocumentProvider) or doc_ops is None)
    assert(isinstance(custom_text_opinion_filters, list) or custom_text_opinion_filters is None)

    # Lazily instantiate the default formatter so each call owns its instance.
    if label_formatter is None:
        label_formatter = NerelBioAnyLabelFormatter()

    data_folding = None

    if doc_ops is None:
        # Default Initialization.
        filenames_by_ids, data_folding = NerelBioIOUtils.read_dataset_split(version=nerel_bio_version,
                                                                            docs_limit=docs_limit)
        doc_ops = NERELBioDocProvider(filename_by_id=filenames_by_ids, version=nerel_bio_version)

    text_opinion_filters = [
        DistanceLimitedTextOpinionFilter(terms_per_context)
    ]

    # Append with the custom filters afterwards.
    if custom_text_opinion_filters is not None:
        text_opinion_filters += custom_text_opinion_filters

    predefined_annot = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)

    # The very same pipeline setup is shared across all the data types,
    # so it is built once per data type instead of being spelled out thrice.
    pipelines = {
        data_type: text_opinion_extraction_pipeline(
            text_parser=text_parser,
            get_doc_by_id_func=doc_ops.by_id,
            annotators=[predefined_annot],
            entity_index_func=lambda brat_entity: brat_entity.ID,
            text_opinion_filters=text_opinion_filters)
        for data_type in (DataType.Train, DataType.Test, DataType.Dev)
    }

    # In the case when we setup a default data-folding.
    # There is a need to provide it, due to the needs in further.
    if data_folding is not None:
        return pipelines, data_folding

    return pipelines
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
2
|
-
from arekit.contrib.source.nerelbio import labels
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class NerelBioAnyLabelFormatter(StringLabelsFormatter):
    """ String formatter that covers the complete set of NEREL-BIO
        relation labels.  Every string below names the identically
        named label class declared in the `labels` module.
    """

    _RELATION_NAMES = [
        "ABBREVIATION", "ALTERNATIVE_NAME", "KNOWS", "AGE_IS",
        "AGE_DIED_AT", "AWARDED_WITH", "PLACE_OF_BIRTH", "DATE_DEFUNCT_IN",
        "DATE_FOUNDED_IN", "DATE_OF_BIRTH", "DATE_OF_CREATION", "DATE_OF_DEATH",
        "POINT_IN_TIME", "PLACE_OF_DEATH", "FOUNDED_BY", "HEADQUARTERED_IN",
        "IDEOLOGY_OF", "SPOUSE", "MEMBER_OF", "ORGANIZES",
        "OWNER_OF", "PARENT_OF", "PARTICIPANT_IN", "PLACE_RESIDES_IN",
        "PRICE_OF", "PRODUCES", "RELATIVE", "RELIGION_OF",
        "SCHOOLS_ATTENDED", "SIBLING", "SUBEVENT_OF", "SUBORDINATE_OF",
        "TAKES_PLACE_IN", "WORKPLACE", "WORKS_AS", "CONVICTED_OF",
        "PENALIZED_AS", "START_TIME", "END_TIME", "EXPENDITURE",
        "AGENT", "INANIMATE_INVOLVED", "INCOME", "SUBCLASS_OF",
        "PART_OF", "LOCATED_IN", "TREATED_USING", "ORIGINS_FROM",
        "TO_DETECT_OR_STUDY", "AFFECTS", "HAS_CAUSE", "APPLIED_TO",
        "USED_IN", "ASSOCIATED_WITH", "HAS_ADMINISTRATION_ROUTE", "HAS_STRENGTH",
        "DURATION_OF", "VALUE_IS", "PHYSIOLOGY_OF", "PROCEDURE_PERFORMED",
        "MENTAL_PROCESS_OF", "MEDICAL_CONDITION", "DOSE_IS", "FINDING_OF",
        "CAUSE_OF_DEATH", "CONSUME",
    ]

    def __init__(self):
        # Each mapping key equals the attribute name in `labels`, so the
        # dictionary is derived instead of being written out entry by entry.
        stol = {name: getattr(labels, name) for name in self._RELATION_NAMES}
        super(NerelBioAnyLabelFormatter, self).__init__(stol=stol)
|
|
79
|
-
|
|
File without changes
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
from arekit.common.utils import progress_bar_iter
|
|
2
|
-
from arekit.contrib.source.ruattitudes.collection import RuAttitudesCollection
|
|
3
|
-
from arekit.contrib.source.ruattitudes.io_utils import RuAttitudesVersions
|
|
4
|
-
from arekit.contrib.source.ruattitudes.doc import RuAttitudesDocument
|
|
5
|
-
from arekit.contrib.source.ruattitudes.doc_brat import RuAttitudesDocumentsConverter
|
|
6
|
-
from arekit.contrib.utils.data.doc_provider.dict_based import DictionaryBasedDocumentProvider
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class RuAttitudesDocumentProvider(DictionaryBasedDocumentProvider):
    """ Provider which eagerly loads the RuAttitudes collection into a
        dictionary, optionally keeping only document identifiers.
    """

    def __init__(self, version, keep_doc_ids_only, doc_id_func, limit):
        docs = self.read_ruattitudes_to_brat_in_memory(version=version,
                                                       keep_doc_ids_only=keep_doc_ids_only,
                                                       doc_id_func=doc_id_func,
                                                       limit=limit)
        super(RuAttitudesDocumentProvider, self).__init__(docs)

    @staticmethod
    def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, limit=None):
        """ Performs reading of RuAttitude formatted documents and
            selection according to 'doc_ids_set' parameter.
            Returns {doc_id: brat_doc_or_None}, with values None whenever
            keep_doc_ids_only is requested.
        """
        assert (isinstance(version, RuAttitudesVersions))
        assert (isinstance(keep_doc_ids_only, bool))
        assert (callable(doc_id_func))

        raw_docs_it = RuAttitudesCollection.iter_docs(version=version,
                                                      get_doc_index_func=doc_id_func,
                                                      return_inds_only=keep_doc_ids_only)

        # Wrap the iterator with a progress bar for logging purposes.
        logged_it = progress_bar_iter(
            iterable=RuAttitudesDocumentProvider.__iter_id_with_doc(
                docs_it=raw_docs_it, keep_doc_ids_only=keep_doc_ids_only),
            desc="Loading RuAttitudes Collection [{}]".format("doc ids only" if keep_doc_ids_only else "fully"),
            unit='docs')

        collected = {}
        for read_count, (doc_id, doc) in enumerate(logged_it, start=1):
            assert(isinstance(doc, RuAttitudesDocument) or doc is None)
            collected[doc_id] = RuAttitudesDocumentsConverter.to_brat_doc(doc) if doc is not None else None
            # Stop early once the requested amount of documents is reached.
            if limit is not None and read_count >= limit:
                break

        return collected

    @staticmethod
    def __iter_id_with_doc(docs_it, keep_doc_ids_only):
        # Normalize both reading modes into (doc_id, doc) pairs;
        # doc is None when only identifiers were requested.
        if keep_doc_ids_only:
            for doc_id in docs_it:
                yield doc_id, None
        else:
            for doc in docs_it:
                assert (isinstance(doc, RuAttitudesDocument))
                yield doc.ID, doc
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.types import OpinionEntityType
|
|
2
|
-
from arekit.contrib.utils.entities.filter import EntityFilter
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class RuAttitudesEntityFilter(EntityFilter):
    """ This is a task-specific filter, which is applicable of entity types proposed
        by the OntoNotesV5 resource: https://catalog.ldc.upenn.edu/LDC2013T19
        We consider only a short list related to the sentiment attitude extraction task.
    """

    supported = ["GPE", "PERSON", "LOCAL", "GEO", "ORG"]

    def is_ignored(self, entity, e_type):
        # Opinion subjects and objects are kept only when their entity
        # type belongs to the supported list; any other participant
        # role is always ignored.
        if e_type in (OpinionEntityType.Subject, OpinionEntityType.Object):
            return entity.Type not in RuAttitudesEntityFilter.supported
        return True
|