arekit 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/context/terms_mapper.py +2 -2
- arekit/common/data/const.py +5 -4
- arekit/common/{experiment/api/ops_doc.py → data/doc_provider.py} +1 -1
- arekit/common/data/input/providers/columns/sample.py +6 -1
- arekit/common/data/input/providers/instances/base.py +1 -1
- arekit/common/data/input/providers/rows/base.py +36 -13
- arekit/common/data/input/providers/rows/samples.py +57 -55
- arekit/common/data/input/providers/sample/cropped.py +2 -2
- arekit/common/data/input/sample.py +1 -1
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/base.py +23 -18
- arekit/common/data/views/samples.py +2 -8
- arekit/common/{news → docs}/base.py +2 -2
- arekit/common/{news → docs}/entities_grouping.py +2 -1
- arekit/common/{news → docs}/entity.py +2 -1
- arekit/common/{news → docs}/parsed/base.py +5 -5
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/{news → docs}/parsed/providers/base_pairs.py +2 -2
- arekit/common/{news → docs}/parsed/providers/entity_service.py +27 -22
- arekit/common/{news → docs}/parsed/providers/opinion_pairs.py +2 -2
- arekit/common/{news → docs}/parsed/providers/text_opinion_pairs.py +6 -6
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parser.py +66 -0
- arekit/common/{news → docs}/sentence.py +1 -1
- arekit/common/entities/base.py +11 -2
- arekit/common/experiment/api/base_samples_io.py +1 -1
- arekit/common/frames/variants/collection.py +2 -2
- arekit/common/linkage/base.py +2 -2
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +1 -1
- arekit/common/linkage/text_opinions.py +2 -2
- arekit/common/opinions/annot/algo/base.py +1 -1
- arekit/common/opinions/annot/algo/pair_based.py +15 -13
- arekit/common/opinions/annot/algo/predefined.py +4 -4
- arekit/common/opinions/annot/algo_based.py +5 -5
- arekit/common/opinions/annot/base.py +3 -3
- arekit/common/opinions/base.py +7 -7
- arekit/common/opinions/collection.py +3 -3
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +38 -1
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/service/sqlite.py +36 -0
- arekit/common/synonyms/base.py +2 -2
- arekit/common/text/{partitioning/str.py → partitioning.py} +16 -11
- arekit/common/text_opinions/base.py +11 -11
- arekit/common/utils.py +33 -46
- arekit/contrib/networks/embedding.py +3 -3
- arekit/contrib/networks/embedding_io.py +5 -5
- arekit/contrib/networks/input/const.py +0 -2
- arekit/contrib/networks/input/providers/sample.py +15 -29
- arekit/contrib/networks/input/rows_parser.py +47 -134
- arekit/contrib/prompt/sample.py +18 -16
- arekit/contrib/utils/data/contents/opinions.py +17 -5
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/{doc_ops → doc_provider}/dir_based.py +7 -7
- arekit/contrib/utils/data/readers/base.py +3 -0
- arekit/contrib/utils/data/readers/csv_pd.py +10 -4
- arekit/contrib/utils/data/readers/jsonl.py +3 -0
- arekit/contrib/utils/data/readers/sqlite.py +14 -0
- arekit/contrib/utils/data/service/balance.py +0 -1
- arekit/contrib/utils/data/storages/pandas_based.py +3 -5
- arekit/contrib/utils/data/storages/row_cache.py +18 -6
- arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
- arekit/contrib/utils/data/writers/base.py +5 -0
- arekit/contrib/utils/data/writers/csv_native.py +3 -0
- arekit/contrib/utils/data/writers/csv_pd.py +3 -0
- arekit/contrib/utils/data/writers/json_opennre.py +31 -13
- arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
- arekit/contrib/utils/io_utils/embedding.py +25 -33
- arekit/contrib/utils/io_utils/utils.py +3 -24
- arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
- arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
- arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +1 -1
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
- arekit/contrib/utils/serializer.py +4 -23
- arekit-0.25.0.data/data/logo.png +0 -0
- arekit-0.25.0.dist-info/METADATA +82 -0
- arekit-0.25.0.dist-info/RECORD +259 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
- arekit/common/data/row_ids/base.py +0 -79
- arekit/common/data/row_ids/binary.py +0 -38
- arekit/common/data/row_ids/multiple.py +0 -14
- arekit/common/folding/base.py +0 -36
- arekit/common/folding/fixed.py +0 -42
- arekit/common/folding/nofold.py +0 -15
- arekit/common/folding/united.py +0 -46
- arekit/common/news/objects_parser.py +0 -37
- arekit/common/news/parsed/providers/base.py +0 -48
- arekit/common/news/parsed/service.py +0 -31
- arekit/common/news/parser.py +0 -34
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -83
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -33
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/news.py +0 -28
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -75
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/news.py +0 -51
- arekit/contrib/source/ruattitudes/news_brat.py +0 -44
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -57
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/news_reader.py +0 -51
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -32
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/bert/rows.py +0 -0
- arekit/contrib/utils/bert/text_b_rus.py +0 -18
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/cv/__init__.py +0 -0
- arekit/contrib/utils/cv/doc_stat/__init__.py +0 -0
- arekit/contrib/utils/cv/doc_stat/base.py +0 -37
- arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
- arekit/contrib/utils/cv/splitters/__init__.py +0 -0
- arekit/contrib/utils/cv/splitters/base.py +0 -4
- arekit/contrib/utils/cv/splitters/default.py +0 -53
- arekit/contrib/utils/cv/splitters/statistical.py +0 -57
- arekit/contrib/utils/cv/two_class.py +0 -77
- arekit/contrib/utils/data/doc_ops/__init__.py +0 -0
- arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
- arekit/contrib/utils/data/ext.py +0 -31
- arekit/contrib/utils/data/views/__init__.py +0 -0
- arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
- arekit/contrib/utils/data/views/linkages/base.py +0 -58
- arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
- arekit/contrib/utils/data/views/linkages/utils.py +0 -24
- arekit/contrib/utils/data/views/opinions.py +0 -14
- arekit/contrib/utils/download.py +0 -78
- arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
- arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
- arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
- arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
- arekit/contrib/utils/io_utils/opinions.py +0 -39
- arekit/contrib/utils/io_utils/samples.py +0 -78
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -43
- arekit/contrib/utils/lexicons/relation.py +0 -45
- arekit/contrib/utils/lexicons/rusentilex.py +0 -34
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/to_output.py +0 -101
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/resources.py +0 -26
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/utils_folding.py +0 -19
- arekit/download_data.py +0 -11
- arekit-0.23.1.dist-info/METADATA +0 -23
- arekit-0.23.1.dist-info/RECORD +0 -403
- /arekit/common/{data/row_ids → docs}/__init__.py +0 -0
- /arekit/common/{folding → docs/parsed}/__init__.py +0 -0
- /arekit/common/{news → docs/parsed/providers}/__init__.py +0 -0
- /arekit/common/{news → docs}/parsed/term_position.py +0 -0
- /arekit/common/{news/parsed → service}/__init__.py +0 -0
- /arekit/{common/news/parsed/providers → contrib/utils/data/doc_provider}/__init__.py +0 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
- {arekit-0.23.1.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class BaseIDProvider(object):
|
|
5
|
-
"""
|
|
6
|
-
Opinion in text is a sequence of opinions in context
|
|
7
|
-
o1, o2, o3, ..., on
|
|
8
|
-
|
|
9
|
-
o1 -- first_text_opinion
|
|
10
|
-
i -- index in lined (for example: i=3 => 03)
|
|
11
|
-
|
|
12
|
-
# TODO. #376. This should be definitely refactored. This implementation
|
|
13
|
-
TODO. combines opinion-based and sample-based data sources, which allows
|
|
14
|
-
TODO. us to bypass such connection via external foreign keys.
|
|
15
|
-
|
|
16
|
-
Since we are head to remove opinions, there is a need to refactor so in a
|
|
17
|
-
way of an additional column that provides such information for further connection
|
|
18
|
-
between rows of different storages.
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
SEPARATOR = '_'
|
|
22
|
-
OPINION = "o{}" + SEPARATOR
|
|
23
|
-
INDEX = "i{}" + SEPARATOR
|
|
24
|
-
|
|
25
|
-
# region 'create' methods
|
|
26
|
-
|
|
27
|
-
@staticmethod
|
|
28
|
-
def create_opinion_id(text_opinions_linkage, index_in_linked):
|
|
29
|
-
assert(isinstance(text_opinions_linkage, TextOpinionsLinkage))
|
|
30
|
-
assert(isinstance(index_in_linked, int))
|
|
31
|
-
|
|
32
|
-
template = ''.join([BaseIDProvider.OPINION,
|
|
33
|
-
BaseIDProvider.INDEX])
|
|
34
|
-
|
|
35
|
-
text_opinion_id = text_opinions_linkage.First.TextOpinionID
|
|
36
|
-
assert(isinstance(text_opinion_id, int))
|
|
37
|
-
|
|
38
|
-
return template.format(text_opinion_id,
|
|
39
|
-
index_in_linked)
|
|
40
|
-
|
|
41
|
-
@staticmethod
|
|
42
|
-
def create_sample_id(linked_opinions, index_in_linked, label_scaler):
|
|
43
|
-
raise NotImplementedError()
|
|
44
|
-
|
|
45
|
-
@staticmethod
|
|
46
|
-
def create_pattern(id_value, p_type):
|
|
47
|
-
assert(isinstance(id_value, int))
|
|
48
|
-
assert(isinstance(p_type, str))
|
|
49
|
-
return p_type.format(id_value)
|
|
50
|
-
|
|
51
|
-
# endregion
|
|
52
|
-
|
|
53
|
-
@staticmethod
|
|
54
|
-
def convert_sample_id_to_opinion_id(sample_id):
|
|
55
|
-
assert(isinstance(sample_id, str))
|
|
56
|
-
return sample_id[:sample_id.index(BaseIDProvider.INDEX[0])] + BaseIDProvider.INDEX.format(0)
|
|
57
|
-
|
|
58
|
-
# region 'parse' methods
|
|
59
|
-
|
|
60
|
-
@staticmethod
|
|
61
|
-
def _parse(row_id, pattern):
|
|
62
|
-
assert(isinstance(pattern, str))
|
|
63
|
-
|
|
64
|
-
_from = row_id.index(pattern[0]) + 1
|
|
65
|
-
_to = row_id.index(BaseIDProvider.SEPARATOR, _from, len(row_id))
|
|
66
|
-
|
|
67
|
-
return int(row_id[_from:_to])
|
|
68
|
-
|
|
69
|
-
@staticmethod
|
|
70
|
-
def parse_opinion_in_opinion_id(opinion_id):
|
|
71
|
-
assert(isinstance(opinion_id, str))
|
|
72
|
-
return BaseIDProvider._parse(opinion_id, BaseIDProvider.OPINION)
|
|
73
|
-
|
|
74
|
-
@staticmethod
|
|
75
|
-
def parse_opinion_in_sample_id(sample_id):
|
|
76
|
-
assert(isinstance(sample_id, str))
|
|
77
|
-
return BaseIDProvider._parse(sample_id, BaseIDProvider.OPINION)
|
|
78
|
-
|
|
79
|
-
# endregion
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.row_ids.base import BaseIDProvider
|
|
2
|
-
from arekit.common.labels.scaler.base import BaseLabelScaler
|
|
3
|
-
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class BinaryIDProvider(BaseIDProvider):
|
|
7
|
-
"""
|
|
8
|
-
Considered that label of opinion IS A PART OF id.
|
|
9
|
-
# TODO. #376 related. This should be removed after refactoring, because
|
|
10
|
-
# TODO. we consider an ordinary IDs, that not based on the other data.
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
LABEL = 'l{}' + BaseIDProvider.SEPARATOR
|
|
14
|
-
|
|
15
|
-
@staticmethod
|
|
16
|
-
def create_sample_id(linked_opinions, index_in_linked, label_scaler):
|
|
17
|
-
assert(isinstance(linked_opinions, TextOpinionsLinkage))
|
|
18
|
-
assert(isinstance(index_in_linked, int))
|
|
19
|
-
assert(isinstance(label_scaler, BaseLabelScaler))
|
|
20
|
-
|
|
21
|
-
o_id = BaseIDProvider.create_opinion_id(text_opinions_linkage=linked_opinions,
|
|
22
|
-
index_in_linked=index_in_linked)
|
|
23
|
-
|
|
24
|
-
template = ''.join(["{}", BinaryIDProvider.LABEL])
|
|
25
|
-
|
|
26
|
-
return template.format(o_id,
|
|
27
|
-
label_scaler.label_to_uint(linked_opinions.get_linked_label()))
|
|
28
|
-
|
|
29
|
-
@staticmethod
|
|
30
|
-
def parse_label_in_sample_id(sample_id):
|
|
31
|
-
assert(isinstance(sample_id, str))
|
|
32
|
-
return BinaryIDProvider._parse(row_id=sample_id, pattern=BinaryIDProvider.LABEL)
|
|
33
|
-
|
|
34
|
-
@staticmethod
|
|
35
|
-
def parse_index_in_sample_id(sample_id):
|
|
36
|
-
assert(isinstance(sample_id, str))
|
|
37
|
-
return BinaryIDProvider._parse(row_id=sample_id, pattern=BinaryIDProvider.INDEX)
|
|
38
|
-
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.row_ids.base import BaseIDProvider
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class MultipleIDProvider(BaseIDProvider):
|
|
5
|
-
"""
|
|
6
|
-
Considered that label of opinion is not a part of id.
|
|
7
|
-
# TODO. #376 related. This should be removed after refactoring, because
|
|
8
|
-
# TODO. we consider an ordinary IDs, that not based on the other data.
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
@staticmethod
|
|
12
|
-
def create_sample_id(linked_opinions, index_in_linked, label_scaler):
|
|
13
|
-
return BaseIDProvider.create_opinion_id(text_opinions_linkage=linked_opinions,
|
|
14
|
-
index_in_linked=index_in_linked)
|
arekit/common/folding/base.py
DELETED
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
import collections
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class BaseDataFolding(object):
|
|
5
|
-
""" Describes and provides API on how to handle doc_ids during experiment,
|
|
6
|
-
i.e. how many states does nested folding algorithm supports,
|
|
7
|
-
how to perform folding for a particular state (current),
|
|
8
|
-
and how to such state into string.
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
def __init__(self, doc_ids_to_fold, supported_data_types):
|
|
12
|
-
assert(isinstance(doc_ids_to_fold, collections.Iterable))
|
|
13
|
-
assert(isinstance(supported_data_types, list))
|
|
14
|
-
self._doc_ids_to_fold_set = set(doc_ids_to_fold)
|
|
15
|
-
self._supported_data_types = supported_data_types
|
|
16
|
-
|
|
17
|
-
def contains_doc_id(self, doc_id):
|
|
18
|
-
assert(isinstance(doc_id, int))
|
|
19
|
-
return doc_id in self._doc_ids_to_fold_set
|
|
20
|
-
|
|
21
|
-
def iter_doc_ids(self):
|
|
22
|
-
return iter(self._doc_ids_to_fold_set)
|
|
23
|
-
|
|
24
|
-
def iter_supported_data_types(self):
|
|
25
|
-
""" Iterates through data_types, supported in a related experiment
|
|
26
|
-
Note:
|
|
27
|
-
In CV-split algorithm, the first part corresponds to a LARGE split,
|
|
28
|
-
Jand second to small; therefore, the correct sequence is as follows:
|
|
29
|
-
DataType.Train, DataType.Test.
|
|
30
|
-
"""
|
|
31
|
-
return iter(self._supported_data_types)
|
|
32
|
-
|
|
33
|
-
def fold_doc_ids_set(self):
|
|
34
|
-
""" Perform the doc_ids folding process onto provided data_types
|
|
35
|
-
"""
|
|
36
|
-
raise NotImplementedError()
|
arekit/common/folding/fixed.py
DELETED
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
from arekit.common.folding.base import BaseDataFolding
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class FixedFolding(BaseDataFolding):
|
|
5
|
-
|
|
6
|
-
def __init__(self, doc_to_datatypes_func, doc_ids_to_fold, supported_data_types):
|
|
7
|
-
assert(callable(doc_to_datatypes_func))
|
|
8
|
-
|
|
9
|
-
super(FixedFolding, self).__init__(doc_ids_to_fold=doc_ids_to_fold,
|
|
10
|
-
supported_data_types=supported_data_types)
|
|
11
|
-
|
|
12
|
-
self.__doc_to_datatypes_func = doc_to_datatypes_func
|
|
13
|
-
|
|
14
|
-
@classmethod
|
|
15
|
-
def from_parts(cls, parts):
|
|
16
|
-
""" parts: dict
|
|
17
|
-
dictionary of {data_type: [doc_ids]}
|
|
18
|
-
"""
|
|
19
|
-
assert(isinstance(parts, dict))
|
|
20
|
-
|
|
21
|
-
doc_to_datatypes = {}
|
|
22
|
-
for data_type, doc_ids in parts.items():
|
|
23
|
-
for doc_id in doc_ids:
|
|
24
|
-
if doc_id not in doc_to_datatypes:
|
|
25
|
-
doc_to_datatypes[doc_id] = []
|
|
26
|
-
doc_to_datatypes[doc_id].append(data_type)
|
|
27
|
-
|
|
28
|
-
return cls(doc_to_datatypes_func=lambda doc_id: doc_to_datatypes[doc_id],
|
|
29
|
-
doc_ids_to_fold=doc_to_datatypes.keys(),
|
|
30
|
-
supported_data_types=list(parts.keys()))
|
|
31
|
-
|
|
32
|
-
def fold_doc_ids_set(self):
|
|
33
|
-
|
|
34
|
-
folded = {}
|
|
35
|
-
for data_type in self._supported_data_types:
|
|
36
|
-
folded[data_type] = []
|
|
37
|
-
|
|
38
|
-
for doc_id in self._doc_ids_to_fold_set:
|
|
39
|
-
for data_type in self.__doc_to_datatypes_func(doc_id):
|
|
40
|
-
folded[data_type].append(doc_id)
|
|
41
|
-
|
|
42
|
-
return folded
|
arekit/common/folding/nofold.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
from arekit.common.folding.base import BaseDataFolding
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class NoFolding(BaseDataFolding):
|
|
5
|
-
""" The case of absent folding in experiment.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
def __init__(self, doc_ids, supported_data_type):
|
|
9
|
-
super(NoFolding, self).__init__(doc_ids_to_fold=doc_ids,
|
|
10
|
-
supported_data_types=[supported_data_type])
|
|
11
|
-
|
|
12
|
-
def fold_doc_ids_set(self):
|
|
13
|
-
return {
|
|
14
|
-
self._supported_data_types[0]: list(self._doc_ids_to_fold_set)
|
|
15
|
-
}
|
arekit/common/folding/united.py
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
from arekit.common.folding.base import BaseDataFolding
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class UnitedFolding(BaseDataFolding):
|
|
5
|
-
|
|
6
|
-
def __init__(self, foldings):
|
|
7
|
-
assert(isinstance(foldings, list))
|
|
8
|
-
self.__foldings = foldings
|
|
9
|
-
super(UnitedFolding, self).__init__(
|
|
10
|
-
doc_ids_to_fold=UnitedFolding.__iter_all_doc_ids(foldings),
|
|
11
|
-
supported_data_types=list(set(UnitedFolding.__iter_all_data_types(foldings))))
|
|
12
|
-
|
|
13
|
-
@staticmethod
|
|
14
|
-
def __iter_all_doc_ids(foldings):
|
|
15
|
-
for folding in foldings:
|
|
16
|
-
assert(isinstance(folding, BaseDataFolding))
|
|
17
|
-
for doc_id in folding.iter_doc_ids():
|
|
18
|
-
yield doc_id
|
|
19
|
-
|
|
20
|
-
@staticmethod
|
|
21
|
-
def __iter_all_data_types(foldings):
|
|
22
|
-
for folding in foldings:
|
|
23
|
-
assert(isinstance(folding, BaseDataFolding))
|
|
24
|
-
for d_type in folding.iter_supported_data_types():
|
|
25
|
-
yield d_type
|
|
26
|
-
|
|
27
|
-
@staticmethod
|
|
28
|
-
def __merge(origin, new_data):
|
|
29
|
-
assert(isinstance(origin, dict))
|
|
30
|
-
assert(isinstance(new_data, dict))
|
|
31
|
-
for key, value in new_data.items():
|
|
32
|
-
if key not in origin:
|
|
33
|
-
# Assign list
|
|
34
|
-
origin[key] = value
|
|
35
|
-
else:
|
|
36
|
-
# Combine lists
|
|
37
|
-
origin[key] += value
|
|
38
|
-
|
|
39
|
-
def fold_doc_ids_set(self):
|
|
40
|
-
origin = {}
|
|
41
|
-
for folding in self.__foldings:
|
|
42
|
-
assert(isinstance(folding, BaseDataFolding))
|
|
43
|
-
new_data = folding.fold_doc_ids_set()
|
|
44
|
-
self.__merge(origin=origin, new_data=new_data)
|
|
45
|
-
|
|
46
|
-
return origin
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
2
|
-
from arekit.common.text.partitioning.base import BasePartitioning
|
|
3
|
-
from arekit.common.pipeline.context import PipelineContext
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class SentenceObjectsParserPipelineItem(BasePipelineItem):
|
|
7
|
-
|
|
8
|
-
def __init__(self, partitioning):
|
|
9
|
-
assert(isinstance(partitioning, BasePartitioning))
|
|
10
|
-
self.__partitioning = partitioning
|
|
11
|
-
|
|
12
|
-
# region protected
|
|
13
|
-
|
|
14
|
-
def _get_text(self, pipeline_ctx):
|
|
15
|
-
return None
|
|
16
|
-
|
|
17
|
-
def _get_parts_provider_func(self, input_data, pipeline_ctx):
|
|
18
|
-
raise NotImplementedError()
|
|
19
|
-
|
|
20
|
-
# endregion
|
|
21
|
-
|
|
22
|
-
def apply_core(self, input_data, pipeline_ctx):
|
|
23
|
-
assert(isinstance(pipeline_ctx, PipelineContext))
|
|
24
|
-
external_input = self._get_text(pipeline_ctx)
|
|
25
|
-
actual_input = input_data if external_input is None else external_input
|
|
26
|
-
parts_it = self._get_parts_provider_func(input_data=actual_input, pipeline_ctx=pipeline_ctx)
|
|
27
|
-
return self.__partitioning.provide(text=actual_input, parts_it=parts_it)
|
|
28
|
-
|
|
29
|
-
# region base
|
|
30
|
-
|
|
31
|
-
def __enter__(self):
|
|
32
|
-
return self
|
|
33
|
-
|
|
34
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
35
|
-
pass
|
|
36
|
-
|
|
37
|
-
# endregion
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.base import Entity
|
|
2
|
-
from arekit.common.news.entity import DocumentEntity
|
|
3
|
-
from arekit.common.news.parsed.base import ParsedNews
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class BaseParsedNewsServiceProvider(object):
|
|
7
|
-
|
|
8
|
-
def __init__(self, entity_index_func=None):
|
|
9
|
-
""" Outside enity indexing function
|
|
10
|
-
entity_index_func: provides id for a given entity, i.e.
|
|
11
|
-
func(entity) -> int (id)
|
|
12
|
-
"""
|
|
13
|
-
assert(callable(entity_index_func) or entity_index_func is None)
|
|
14
|
-
self._doc_entities = None
|
|
15
|
-
self.__entity_map = {}
|
|
16
|
-
self.__entity_index_func = entity_index_func
|
|
17
|
-
|
|
18
|
-
@property
|
|
19
|
-
def Name(self):
|
|
20
|
-
raise NotImplementedError()
|
|
21
|
-
|
|
22
|
-
def init_parsed_news(self, parsed_news):
|
|
23
|
-
assert(isinstance(parsed_news, ParsedNews))
|
|
24
|
-
|
|
25
|
-
self._doc_entities = []
|
|
26
|
-
self.__entity_map.clear()
|
|
27
|
-
|
|
28
|
-
for index, entity in enumerate(parsed_news.iter_entities()):
|
|
29
|
-
|
|
30
|
-
doc_entity = DocumentEntity(id_in_doc=index,
|
|
31
|
-
value=entity.Value,
|
|
32
|
-
e_type=entity.Type,
|
|
33
|
-
display_value=entity.DisplayValue,
|
|
34
|
-
group_index=entity.GroupIndex)
|
|
35
|
-
|
|
36
|
-
self._doc_entities.append(doc_entity)
|
|
37
|
-
|
|
38
|
-
if self.__entity_index_func is not None:
|
|
39
|
-
self.__entity_map[self.__entity_index_func(entity)] = doc_entity
|
|
40
|
-
|
|
41
|
-
def get_document_entity(self, entity):
|
|
42
|
-
""" Maps entity to the related one with DocumentEntity type
|
|
43
|
-
"""
|
|
44
|
-
assert(isinstance(entity, Entity))
|
|
45
|
-
return self.__entity_map[self.__entity_index_func(entity)]
|
|
46
|
-
|
|
47
|
-
def contains_entity(self, entity):
|
|
48
|
-
return self.__entity_index_func(entity) in self.__entity_map
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
from arekit.common.news.parsed.base import ParsedNews
|
|
2
|
-
from arekit.common.news.parsed.providers.base import BaseParsedNewsServiceProvider
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class ParsedNewsService(object):
|
|
6
|
-
""" Represents a collection of providers, combined with the parsed news.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
def __init__(self, parsed_news, providers):
|
|
10
|
-
assert(isinstance(parsed_news, ParsedNews))
|
|
11
|
-
assert(isinstance(providers, list))
|
|
12
|
-
self.__parsed_news = parsed_news
|
|
13
|
-
self.__providers = {}
|
|
14
|
-
|
|
15
|
-
for provider in providers:
|
|
16
|
-
assert(isinstance(provider, BaseParsedNewsServiceProvider))
|
|
17
|
-
assert(provider.Name not in self.__providers)
|
|
18
|
-
|
|
19
|
-
# Link provider with the related name.
|
|
20
|
-
self.__providers[provider.Name] = provider
|
|
21
|
-
|
|
22
|
-
# Post initialize with the related parsed news.
|
|
23
|
-
provider.init_parsed_news(self.__parsed_news)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
@property
|
|
27
|
-
def ParsedNews(self):
|
|
28
|
-
return self.__parsed_news
|
|
29
|
-
|
|
30
|
-
def get_provider(self, name):
|
|
31
|
-
return self.__providers[name]
|
arekit/common/news/parser.py
DELETED
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
from arekit.common.news.base import News
|
|
2
|
-
from arekit.common.news.parsed.base import ParsedNews
|
|
3
|
-
from arekit.common.pipeline.context import PipelineContext
|
|
4
|
-
from arekit.common.text.parser import BaseTextParser
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class NewsParser(object):
|
|
8
|
-
|
|
9
|
-
@staticmethod
|
|
10
|
-
def __get_sent(news, sent_ind):
|
|
11
|
-
return news.get_sentence(sent_ind)
|
|
12
|
-
|
|
13
|
-
@staticmethod
|
|
14
|
-
def parse(news, text_parser, parent_ppl_ctx=None):
|
|
15
|
-
assert(isinstance(news, News))
|
|
16
|
-
assert(isinstance(text_parser, BaseTextParser))
|
|
17
|
-
assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None)
|
|
18
|
-
|
|
19
|
-
parsed_sentences = [text_parser.run(input_data=NewsParser.__get_sent(news, sent_ind).Text,
|
|
20
|
-
params_dict=NewsParser.__create_ppl_params(news=news, sent_ind=sent_ind),
|
|
21
|
-
parent_ctx=parent_ppl_ctx)
|
|
22
|
-
for sent_ind in range(news.SentencesCount)]
|
|
23
|
-
|
|
24
|
-
return ParsedNews(doc_id=news.ID,
|
|
25
|
-
parsed_sentences=parsed_sentences)
|
|
26
|
-
|
|
27
|
-
@staticmethod
|
|
28
|
-
def __create_ppl_params(news, sent_ind):
|
|
29
|
-
assert(isinstance(news, News))
|
|
30
|
-
return {
|
|
31
|
-
"s_ind": sent_ind, # sentence index. (as Metadata)
|
|
32
|
-
"doc_id": news.ID, # document index. (as Metadata)
|
|
33
|
-
"sentence": NewsParser.__get_sent(news, sent_ind), # Required for special sources.
|
|
34
|
-
}
|
arekit/common/text/parser.py
DELETED
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
from arekit.common.pipeline.base import BasePipeline
|
|
2
|
-
from arekit.common.text.parsed import BaseParsedText
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class BaseTextParser(BasePipeline):
|
|
6
|
-
|
|
7
|
-
def run(self, input_data, params_dict=None, parent_ctx=None):
|
|
8
|
-
output_data = super(BaseTextParser, self).run(input_data=input_data,
|
|
9
|
-
params_dict=params_dict,
|
|
10
|
-
parent_ctx=parent_ctx)
|
|
11
|
-
|
|
12
|
-
return BaseParsedText(terms=output_data)
|
|
File without changes
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
import collections
|
|
2
|
-
|
|
3
|
-
from arekit.common.bound import Bound
|
|
4
|
-
from arekit.common.text.partitioning.base import BasePartitioning
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class TermsPartitioning(BasePartitioning):
|
|
8
|
-
""" NOTE: considering that provided parts
|
|
9
|
-
has no intersections between each other
|
|
10
|
-
"""
|
|
11
|
-
|
|
12
|
-
def provide(self, text, parts_it):
|
|
13
|
-
assert(isinstance(text, list))
|
|
14
|
-
assert(isinstance(parts_it, collections.Iterable))
|
|
15
|
-
|
|
16
|
-
start = 0
|
|
17
|
-
parts = []
|
|
18
|
-
for value, bound in parts_it:
|
|
19
|
-
assert(isinstance(bound, Bound))
|
|
20
|
-
assert(bound.Position >= start)
|
|
21
|
-
|
|
22
|
-
# Release everythig till the current value position.
|
|
23
|
-
part = text[start:bound.Position]
|
|
24
|
-
|
|
25
|
-
parts.extend(part)
|
|
26
|
-
|
|
27
|
-
# Release the entity value.
|
|
28
|
-
parts.extend([value])
|
|
29
|
-
|
|
30
|
-
start = bound.Position + bound.Length
|
|
31
|
-
|
|
32
|
-
# Release everything after the last entity.
|
|
33
|
-
parts.extend(text[start:len(text)])
|
|
34
|
-
|
|
35
|
-
return parts
|
|
File without changes
|
|
File without changes
|
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.source.brat.entities.entity import BratEntity
|
|
2
|
-
from arekit.contrib.source.brat.relation import BratRelation
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class BratAnnotationParser:
|
|
6
|
-
|
|
7
|
-
ENTITIES = "entities"
|
|
8
|
-
RELATIONS = "relations"
|
|
9
|
-
|
|
10
|
-
@staticmethod
|
|
11
|
-
def __non_prefixed_id(value):
|
|
12
|
-
assert (isinstance(value, str))
|
|
13
|
-
return value[1:]
|
|
14
|
-
|
|
15
|
-
@staticmethod
|
|
16
|
-
def handle_entity(args):
|
|
17
|
-
""" T2 Location 10 23 South America
|
|
18
|
-
T1 Location 0 5;16 23 North America
|
|
19
|
-
"""
|
|
20
|
-
assert(len(args) == 3)
|
|
21
|
-
|
|
22
|
-
e_id = int(BratAnnotationParser.__non_prefixed_id(args[0]))
|
|
23
|
-
entity_params = args[1].split()
|
|
24
|
-
|
|
25
|
-
if len(entity_params) != 3:
|
|
26
|
-
# We do not support the case of a non-continuous entity mentions.
|
|
27
|
-
return None
|
|
28
|
-
|
|
29
|
-
e_str_type, e_begin, e_end = entity_params
|
|
30
|
-
|
|
31
|
-
return BratEntity(id_in_doc=e_id,
|
|
32
|
-
e_type=e_str_type,
|
|
33
|
-
index_begin=int(e_begin),
|
|
34
|
-
index_end=int(e_end),
|
|
35
|
-
value=args[2].strip())
|
|
36
|
-
|
|
37
|
-
@staticmethod
|
|
38
|
-
def handle_relation(args):
|
|
39
|
-
""" Example:
|
|
40
|
-
R1 Origin Arg1:T3 Arg2:T4
|
|
41
|
-
"""
|
|
42
|
-
|
|
43
|
-
# Parse identifier index.
|
|
44
|
-
e_id = args[0][1:]
|
|
45
|
-
|
|
46
|
-
# Parse relation arguments.
|
|
47
|
-
rel_type, source, target = args[1].split()
|
|
48
|
-
|
|
49
|
-
source_id = source.split(':')[1]
|
|
50
|
-
target_id = target.split(':')[1]
|
|
51
|
-
|
|
52
|
-
return BratRelation(id_in_doc=e_id,
|
|
53
|
-
source_id=int(BratAnnotationParser.__non_prefixed_id(source_id)),
|
|
54
|
-
target_id=int(BratAnnotationParser.__non_prefixed_id(target_id)),
|
|
55
|
-
rel_type=rel_type)
|
|
56
|
-
|
|
57
|
-
@staticmethod
|
|
58
|
-
def parse_annotations(input_file, encoding='utf-8'):
|
|
59
|
-
""" Read annotation collection from file
|
|
60
|
-
"""
|
|
61
|
-
entities = []
|
|
62
|
-
relations = []
|
|
63
|
-
|
|
64
|
-
for line in input_file.readlines():
|
|
65
|
-
line = line.decode(encoding)
|
|
66
|
-
|
|
67
|
-
args = line.split('\t')
|
|
68
|
-
|
|
69
|
-
record_type = args[0][0]
|
|
70
|
-
|
|
71
|
-
# Entities (objects) are prefixed with `T`
|
|
72
|
-
if record_type == "T":
|
|
73
|
-
entity = BratAnnotationParser.handle_entity(args)
|
|
74
|
-
if entity is not None:
|
|
75
|
-
entities.append(entity)
|
|
76
|
-
|
|
77
|
-
elif record_type == "R":
|
|
78
|
-
relations.append(BratAnnotationParser.handle_relation(args))
|
|
79
|
-
|
|
80
|
-
return {
|
|
81
|
-
BratAnnotationParser.ENTITIES: entities,
|
|
82
|
-
BratAnnotationParser.RELATIONS: relations
|
|
83
|
-
}
|
|
File without changes
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.source.brat.entities.entity import BratEntity
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class BratCompoundEntity(BratEntity):
|
|
5
|
-
""" Entity which contains the hierarchy of the other entities.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
def __init__(self, id_in_doc, value, e_type, root, entities, index_begin, index_end,
|
|
9
|
-
display_value=None, group_index=None):
|
|
10
|
-
assert(isinstance(entities, list))
|
|
11
|
-
assert(isinstance(root, BratCompoundEntity) or root is None)
|
|
12
|
-
super(BratCompoundEntity, self).__init__(value=value, e_type=e_type,
|
|
13
|
-
id_in_doc=id_in_doc,
|
|
14
|
-
index_begin=index_begin,
|
|
15
|
-
index_end=index_end,
|
|
16
|
-
display_value=display_value,
|
|
17
|
-
group_index=group_index)
|
|
18
|
-
self.__entities = entities
|
|
19
|
-
self.__root = root
|
|
20
|
-
|
|
21
|
-
@classmethod
|
|
22
|
-
def from_list(cls, root, childs):
|
|
23
|
-
assert(isinstance(root, BratEntity))
|
|
24
|
-
assert(isinstance(childs, list) and len(childs) > 0)
|
|
25
|
-
return cls(id_in_doc=root.ID, value=root.Value, e_type=root.Type, root=None,
|
|
26
|
-
entities=childs, index_begin=root.IndexBegin, index_end=root.IndexEnd)
|
|
27
|
-
|
|
28
|
-
@property
|
|
29
|
-
def Root(self):
|
|
30
|
-
return self.__root
|
|
31
|
-
|
|
32
|
-
def iter_childs(self):
|
|
33
|
-
return iter(self.__entities)
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
from arekit.common.entities.base import Entity
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class BratEntity(Entity):
|
|
5
|
-
""" Annotated entity in Brat-based collection corpus.
|
|
6
|
-
Provides bounds, i.e. char indices in related sentence.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
def __init__(self, id_in_doc, e_type, index_begin, index_end, value, display_value=None, group_index=None):
|
|
10
|
-
""" index_begin: int
|
|
11
|
-
- char index (in case of string type of `text`)
|
|
12
|
-
- term index (in case of list type of `text`)
|
|
13
|
-
index_end: int
|
|
14
|
-
- char index (in case of string type of `text`)
|
|
15
|
-
- term index (in case of list type of `text`)
|
|
16
|
-
"""
|
|
17
|
-
assert(isinstance(e_type, str))
|
|
18
|
-
assert(isinstance(index_begin, int))
|
|
19
|
-
assert(isinstance(index_end, int))
|
|
20
|
-
super(BratEntity, self).__init__(value=value, e_type=e_type,
|
|
21
|
-
display_value=display_value, group_index=group_index)
|
|
22
|
-
|
|
23
|
-
self.__e_type = e_type
|
|
24
|
-
self.__begin = index_begin
|
|
25
|
-
self.__end = index_end
|
|
26
|
-
self.__id = id_in_doc
|
|
27
|
-
|
|
28
|
-
@property
|
|
29
|
-
def IndexBegin(self):
|
|
30
|
-
return self.__begin
|
|
31
|
-
|
|
32
|
-
@property
|
|
33
|
-
def IndexEnd(self):
|
|
34
|
-
return self.__end
|
|
35
|
-
|
|
36
|
-
@property
|
|
37
|
-
def Type(self):
|
|
38
|
-
return self.__e_type
|
|
39
|
-
|
|
40
|
-
@property
|
|
41
|
-
def ID(self):
|
|
42
|
-
return self.__id
|