arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/data/storages/base.py +4 -15
- arekit/common/docs/entities_grouping.py +2 -1
- arekit/common/docs/parser.py +27 -22
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +39 -2
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
- arekit/common/utils.py +11 -52
- arekit/contrib/utils/data/contents/opinions.py +13 -3
- arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- arekit/contrib/utils/data/storages/pandas_based.py +2 -17
- arekit/contrib/utils/data/storages/row_cache.py +8 -2
- arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
- arekit/contrib/utils/io_utils/utils.py +1 -18
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
- arekit-0.25.1.data/data/logo.png +0 -0
- arekit-0.25.1.dist-info/METADATA +81 -0
- arekit-0.25.1.dist-info/RECORD +186 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +0 -68
- arekit/common/data/input/repositories/sample.py +0 -22
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +0 -26
- arekit/common/docs/objects_parser.py +0 -37
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +0 -149
- arekit/contrib/networks/embedding_io.py +0 -18
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +0 -6
- arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit/contrib/networks/input/providers/text.py +0 -24
- arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit/contrib/networks/input/term_types.py +0 -13
- arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit/contrib/networks/vectorizer.py +0 -6
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -84
- arekit/contrib/source/brat/doc.py +0 -28
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -13
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -74
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/doc.py +0 -51
- arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -59
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/docs_reader.py +0 -51
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -31
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +0 -7
- arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +0 -50
- arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
- arekit/contrib/utils/download.py +0 -77
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit/contrib/utils/io_utils/opinions.py +0 -37
- arekit/contrib/utils/io_utils/samples.py +0 -79
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -41
- arekit/contrib/utils/lexicons/relation.py +0 -42
- arekit/contrib/utils/lexicons/rusentilex.py +0 -37
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
- arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
- arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit/contrib/utils/resources.py +0 -25
- arekit/contrib/utils/serializer.py +0 -43
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- arekit/download_data.py +0 -11
- arekit-0.24.0.dist-info/METADATA +0 -23
- arekit-0.24.0.dist-info/RECORD +0 -374
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
from os import path
|
|
2
|
-
|
|
3
|
-
from arekit.common.experiment.data_type import DataType
|
|
4
|
-
from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding
|
|
5
|
-
from arekit.contrib.source.nerel.utils import iter_filename_and_splittype
|
|
6
|
-
from arekit.contrib.source.zip_utils import ZipArchiveUtils
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class NerelIOUtils(ZipArchiveUtils):
|
|
10
|
-
|
|
11
|
-
splits = {
|
|
12
|
-
DataType.Train: "train",
|
|
13
|
-
DataType.Dev: "dev",
|
|
14
|
-
DataType.Test: "test"
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
@staticmethod
|
|
18
|
-
def get_archive_filepath(version):
|
|
19
|
-
return path.join(NerelIOUtils.get_data_root(), "nerel-{}.zip".format(version))
|
|
20
|
-
|
|
21
|
-
@staticmethod
|
|
22
|
-
def get_annotation_innerpath(folding_data_type, filename):
|
|
23
|
-
assert(isinstance(filename, str))
|
|
24
|
-
return path.join(NerelIOUtils.splits[folding_data_type], "{}.ann".format(filename))
|
|
25
|
-
|
|
26
|
-
@staticmethod
|
|
27
|
-
def get_news_innerpath(folding_data_type, filename):
|
|
28
|
-
assert(isinstance(filename, str))
|
|
29
|
-
return path.join(NerelIOUtils.splits[folding_data_type], "{}.txt".format(filename))
|
|
30
|
-
|
|
31
|
-
@staticmethod
|
|
32
|
-
def map_doc_to_fold_type(version):
|
|
33
|
-
|
|
34
|
-
it = iter_filename_and_splittype(
|
|
35
|
-
filenames_it=NerelIOUtils.iter_filenames_from_zip(version),
|
|
36
|
-
splits=NerelIOUtils.splits.items())
|
|
37
|
-
|
|
38
|
-
d2f = {}
|
|
39
|
-
for filename, split_type in it:
|
|
40
|
-
d2f[filename] = split_type
|
|
41
|
-
|
|
42
|
-
return d2f
|
|
43
|
-
|
|
44
|
-
@staticmethod
|
|
45
|
-
def read_dataset_split(version, docs_limit=None):
|
|
46
|
-
|
|
47
|
-
it = iter_filename_and_splittype(
|
|
48
|
-
filenames_it=NerelIOUtils.iter_filenames_from_zip(version),
|
|
49
|
-
splits=NerelIOUtils.splits.items())
|
|
50
|
-
|
|
51
|
-
f2d = {}
|
|
52
|
-
for filename, split_type in it:
|
|
53
|
-
if split_type not in f2d:
|
|
54
|
-
f2d[split_type] = []
|
|
55
|
-
f2d[split_type].append(filename)
|
|
56
|
-
|
|
57
|
-
filenames_by_ids, data_folding = create_fixed_folding(train_filenames=f2d[DataType.Train],
|
|
58
|
-
test_filenames=f2d[DataType.Test],
|
|
59
|
-
dev_filenames=f2d[DataType.Dev],
|
|
60
|
-
limit=docs_limit)
|
|
61
|
-
|
|
62
|
-
return filenames_by_ids, data_folding
|
|
@@ -1,241 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.base import Label
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class OpinionBelongsTo(Label):
|
|
5
|
-
pass
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class OpinionRelatesTo(Label):
|
|
9
|
-
pass
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class NegEffectFrom(Label):
|
|
13
|
-
pass
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class NegStateFrom(Label):
|
|
17
|
-
pass
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class PosEffectFrom(Label):
|
|
21
|
-
pass
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class PosAuthorFrom(Label):
|
|
25
|
-
pass
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class NegAuthorFrom(Label):
|
|
29
|
-
pass
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
class PosStateFrom(Label):
|
|
33
|
-
pass
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class NegativeTo(Label):
|
|
37
|
-
pass
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class PositiveTo(Label):
|
|
41
|
-
pass
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class STATE_BELONGS_TO(Label):
|
|
45
|
-
pass
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class ABBREVIATION(Label):
|
|
49
|
-
pass
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class HEADQUARTERED_IN(Label):
|
|
53
|
-
pass
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
class AGE_DIED_AT(Label):
|
|
57
|
-
pass
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
class AGE_IS(Label):
|
|
61
|
-
pass
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
class AGENT(Label):
|
|
65
|
-
pass
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class IDEOLOGY_OF(Label):
|
|
69
|
-
pass
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
class PLACE_RESIDES_IN(Label):
|
|
73
|
-
pass
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
class POINT_IN_TIME(Label):
|
|
77
|
-
pass
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
class INANIMATE_INVOLVED(Label):
|
|
81
|
-
pass
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
class PRICE_OF(Label):
|
|
85
|
-
pass
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
class INCOME(Label):
|
|
89
|
-
pass
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
class PRODUCES(Label):
|
|
93
|
-
pass
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
class ALTERNATIVE_NAME(Label):
|
|
97
|
-
pass
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
class AWARDED_WITH(Label):
|
|
101
|
-
pass
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
class CAUSE_OF_DEATH(Label):
|
|
105
|
-
pass
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
class CONVICTED_OF(Label):
|
|
109
|
-
pass
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
class DATE_DEFUNCT_IN(Label):
|
|
113
|
-
pass
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
class DATE_FOUNDED_IN(Label):
|
|
117
|
-
pass
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
class DATE_OF_BIRTH(Label):
|
|
121
|
-
pass
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
class DATE_OF_CREATION(Label):
|
|
125
|
-
pass
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
class DATE_OF_DEATH(Label):
|
|
129
|
-
pass
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
class END_TIME(Label):
|
|
133
|
-
pass
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
class EXPENDITURE(Label):
|
|
137
|
-
pass
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
class FOUNDED_BY(Label):
|
|
141
|
-
pass
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
class KNOWS(Label):
|
|
145
|
-
pass
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
class RELATIVE(Label):
|
|
149
|
-
pass
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
class LOCATED_IN(Label):
|
|
153
|
-
pass
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
class RELIGION_OF(Label):
|
|
157
|
-
pass
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
class MEDICAL_CONDITION(Label):
|
|
161
|
-
pass
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
class SCHOOLS_ATTENDED(Label):
|
|
165
|
-
pass
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
class MEMBER_OF(Label):
|
|
169
|
-
pass
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
class SIBLING(Label):
|
|
173
|
-
pass
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
class ORGANIZES(Label):
|
|
177
|
-
pass
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
class SPOUSE(Label):
|
|
181
|
-
pass
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
class ORIGINS_FROM(Label):
|
|
185
|
-
pass
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
class START_TIME(Label):
|
|
189
|
-
pass
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
class OWNER_OF(Label):
|
|
193
|
-
pass
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
class SUBEVENT_OF(Label):
|
|
197
|
-
pass
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
class PARENT_OF(Label):
|
|
201
|
-
pass
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
class SUBORDINATE_OF(Label):
|
|
205
|
-
pass
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
class PART_OF(Label):
|
|
209
|
-
pass
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
class TAKES_PLACE_IN(Label):
|
|
213
|
-
pass
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
class PARTICIPANT_IN(Label):
|
|
217
|
-
pass
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
class WORKPLACE(Label):
|
|
221
|
-
pass
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
class PENALIZED_AS(Label):
|
|
225
|
-
pass
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
class WORKS_AS(Label):
|
|
229
|
-
pass
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
class PLACE_OF_DEATH(Label):
|
|
233
|
-
pass
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
class PLACE_OF_BIRTH(Label):
|
|
237
|
-
pass
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
class HAS_CAUSE (Label):
|
|
241
|
-
pass
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.source.brat.annot import BratAnnotationParser
|
|
2
|
-
from arekit.contrib.source.brat.doc import BratDocument
|
|
3
|
-
from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
|
|
4
|
-
from arekit.contrib.source.nerel.entities import NerelEntityCollection
|
|
5
|
-
from arekit.contrib.source.nerel.io_utils import NerelIOUtils
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class NerelDocReader(object):
|
|
9
|
-
|
|
10
|
-
def __init__(self, version, io_utils=NerelIOUtils()):
|
|
11
|
-
assert(isinstance(io_utils, NerelIOUtils))
|
|
12
|
-
self.__version = version
|
|
13
|
-
self.__io_utils = io_utils
|
|
14
|
-
self.__doc_fold = io_utils.map_doc_to_fold_type(version)
|
|
15
|
-
|
|
16
|
-
def read_text_relations(self, filename):
|
|
17
|
-
assert(isinstance(filename, str))
|
|
18
|
-
|
|
19
|
-
return self.__io_utils.read_from_zip(
|
|
20
|
-
inner_path=self.__io_utils.get_annotation_innerpath(
|
|
21
|
-
folding_data_type=self.__doc_fold[filename],
|
|
22
|
-
filename=filename),
|
|
23
|
-
process_func=lambda input_file: [
|
|
24
|
-
relation for relation in BratAnnotationParser.parse_annotations(
|
|
25
|
-
input_file=input_file, encoding='utf-8-sig')["relations"]],
|
|
26
|
-
version=self.__version)
|
|
27
|
-
|
|
28
|
-
def read_document(self, filename, doc_id, entities_to_ignore=None):
|
|
29
|
-
assert(isinstance(filename, str))
|
|
30
|
-
assert(isinstance(doc_id, int))
|
|
31
|
-
|
|
32
|
-
def file_to_doc(input_file):
|
|
33
|
-
sentences = BratDocumentSentencesReader.from_file(input_file=input_file, entities=entities)
|
|
34
|
-
return BratDocument(doc_id=doc_id, sentences=sentences, text_relations=text_relations)
|
|
35
|
-
|
|
36
|
-
entities = NerelEntityCollection.read_collection(
|
|
37
|
-
filename=filename, version=self.__version,
|
|
38
|
-
entities_to_ignore=entities_to_ignore, io_utils=self.__io_utils)
|
|
39
|
-
|
|
40
|
-
text_relations = self.read_text_relations(filename=filename)
|
|
41
|
-
|
|
42
|
-
return self.__io_utils.read_from_zip(
|
|
43
|
-
inner_path=self.__io_utils.get_news_innerpath(
|
|
44
|
-
folding_data_type=self.__doc_fold[filename], filename=filename),
|
|
45
|
-
process_func=file_to_doc,
|
|
46
|
-
version=self.__version)
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from os.path import basename
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def __iter_filtered_filenames(filenames_iter):
|
|
5
|
-
for filename in filenames_iter:
|
|
6
|
-
extension = filename[-4:]
|
|
7
|
-
# Crop extension.
|
|
8
|
-
filename = filename[:-4]
|
|
9
|
-
if extension != ".txt":
|
|
10
|
-
continue
|
|
11
|
-
yield filename, basename(filename)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def iter_filename_and_splittype(filenames_it, splits):
|
|
15
|
-
for doc_id, data in enumerate(__iter_filtered_filenames(filenames_it)):
|
|
16
|
-
filepath, filename = data
|
|
17
|
-
for split_type, split_name in splits:
|
|
18
|
-
if split_name in filepath:
|
|
19
|
-
yield filename, split_type
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def iter_collection_filenames(filenames_it):
|
|
23
|
-
for doc_id, filename in enumerate(__iter_filtered_filenames(filenames_it)):
|
|
24
|
-
yield doc_id, filename
|
|
File without changes
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
from os import path
|
|
2
|
-
|
|
3
|
-
from arekit.common.experiment.data_type import DataType
|
|
4
|
-
from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding
|
|
5
|
-
from arekit.contrib.source.nerel.io_utils import NerelIOUtils
|
|
6
|
-
from arekit.contrib.source.nerel.utils import iter_filename_and_splittype
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class NerelBioIOUtils(NerelIOUtils):
|
|
10
|
-
|
|
11
|
-
splits = {
|
|
12
|
-
DataType.Train: "train",
|
|
13
|
-
DataType.Dev: "dev",
|
|
14
|
-
DataType.Test: "test"
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
@staticmethod
|
|
18
|
-
def get_archive_filepath(version):
|
|
19
|
-
return path.join(NerelBioIOUtils.get_data_root(), "nerel-bio-{}.zip".format(version))
|
|
20
|
-
|
|
21
|
-
@staticmethod
|
|
22
|
-
def get_annotation_innerpath(folding_data_type, filename):
|
|
23
|
-
assert(isinstance(filename, str))
|
|
24
|
-
return path.join(NerelBioIOUtils.splits[folding_data_type], "{}.ann".format(filename))
|
|
25
|
-
|
|
26
|
-
@staticmethod
|
|
27
|
-
def get_news_innerpath(folding_data_type, filename):
|
|
28
|
-
assert(isinstance(filename, str))
|
|
29
|
-
return path.join(NerelBioIOUtils.splits[folding_data_type], "{}.txt".format(filename))
|
|
30
|
-
|
|
31
|
-
@staticmethod
|
|
32
|
-
def map_doc_to_fold_type(version):
|
|
33
|
-
|
|
34
|
-
it = iter_filename_and_splittype(
|
|
35
|
-
filenames_it=NerelBioIOUtils.iter_filenames_from_zip(version),
|
|
36
|
-
splits=NerelBioIOUtils.splits.items())
|
|
37
|
-
|
|
38
|
-
d2f = {}
|
|
39
|
-
for filename, split_type in it:
|
|
40
|
-
d2f[filename] = split_type
|
|
41
|
-
|
|
42
|
-
return d2f
|
|
43
|
-
|
|
44
|
-
@staticmethod
|
|
45
|
-
def read_dataset_split(version, docs_limit=None):
|
|
46
|
-
|
|
47
|
-
it = iter_filename_and_splittype(
|
|
48
|
-
filenames_it=NerelBioIOUtils.iter_filenames_from_zip(version),
|
|
49
|
-
splits=NerelBioIOUtils.splits.items())
|
|
50
|
-
|
|
51
|
-
f2d = {}
|
|
52
|
-
for filename, split_type in it:
|
|
53
|
-
if split_type not in f2d:
|
|
54
|
-
f2d[split_type] = []
|
|
55
|
-
f2d[split_type].append(filename)
|
|
56
|
-
|
|
57
|
-
filenames_by_ids, data_folding = create_fixed_folding(train_filenames=f2d[DataType.Train],
|
|
58
|
-
test_filenames=f2d[DataType.Test],
|
|
59
|
-
dev_filenames=f2d[DataType.Dev],
|
|
60
|
-
limit=docs_limit)
|
|
61
|
-
|
|
62
|
-
return filenames_by_ids, data_folding
|
|
@@ -1,265 +0,0 @@
|
|
|
1
|
-
from arekit.common.labels.base import Label
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class ABBREVIATION(Label):
|
|
5
|
-
pass
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class ALTERNATIVE_NAME(Label):
|
|
9
|
-
pass
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class KNOWS(Label):
|
|
13
|
-
pass
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class AGE_IS(Label):
|
|
17
|
-
pass
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class AGE_DIED_AT(Label):
|
|
21
|
-
pass
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class AWARDED_WITH(Label):
|
|
25
|
-
pass
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class PLACE_OF_BIRTH(Label):
|
|
29
|
-
pass
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
class DATE_DEFUNCT_IN(Label):
|
|
33
|
-
pass
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class DATE_FOUNDED_IN(Label):
|
|
37
|
-
pass
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class DATE_OF_BIRTH(Label):
|
|
41
|
-
pass
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class DATE_OF_CREATION(Label):
|
|
45
|
-
pass
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class DATE_OF_DEATH(Label):
|
|
49
|
-
pass
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class POINT_IN_TIME(Label):
|
|
53
|
-
pass
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
class PLACE_OF_DEATH(Label):
|
|
57
|
-
pass
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
class FOUNDED_BY(Label):
|
|
61
|
-
pass
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
class HEADQUARTERED_IN(Label):
|
|
65
|
-
pass
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class IDEOLOGY_OF(Label):
|
|
69
|
-
pass
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
class SPOUSE(Label):
|
|
73
|
-
pass
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
class MEMBER_OF(Label):
|
|
77
|
-
pass
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
class ORGANIZES(Label):
|
|
81
|
-
pass
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
class OWNER_OF(Label):
|
|
85
|
-
pass
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
class PARENT_OF(Label):
|
|
89
|
-
pass
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
class PARTICIPANT_IN(Label):
|
|
93
|
-
pass
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
class PLACE_RESIDES_IN(Label):
|
|
97
|
-
pass
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
class PRICE_OF(Label):
|
|
101
|
-
pass
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
class PRODUCES(Label):
|
|
105
|
-
pass
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
class RELATIVE(Label):
|
|
109
|
-
pass
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
class RELIGION_OF(Label):
|
|
113
|
-
pass
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
class SCHOOLS_ATTENDED(Label):
|
|
117
|
-
pass
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
class SIBLING(Label):
|
|
121
|
-
pass
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
class SUBEVENT_OF(Label):
|
|
125
|
-
pass
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
class SUBORDINATE_OF(Label):
|
|
129
|
-
pass
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
class TAKES_PLACE_IN(Label):
|
|
133
|
-
pass
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
class WORKPLACE(Label):
|
|
137
|
-
pass
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
class WORKS_AS(Label):
|
|
141
|
-
pass
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
class CONVICTED_OF(Label):
|
|
145
|
-
pass
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
class PENALIZED_AS(Label):
|
|
149
|
-
pass
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
class START_TIME(Label):
|
|
153
|
-
pass
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
class END_TIME(Label):
|
|
157
|
-
pass
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
class EXPENDITURE(Label):
|
|
161
|
-
pass
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
class AGENT(Label):
|
|
165
|
-
pass
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
class INANIMATE_INVOLVED(Label):
|
|
169
|
-
pass
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
class INCOME(Label):
|
|
173
|
-
pass
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
class SUBCLASS_OF(Label):
|
|
177
|
-
pass
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
class PART_OF(Label):
|
|
181
|
-
pass
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
class LOCATED_IN(Label):
|
|
185
|
-
pass
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
class TREATED_USING(Label):
|
|
189
|
-
pass
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
class ORIGINS_FROM(Label):
|
|
193
|
-
pass
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
class TO_DETECT_OR_STUDY(Label):
|
|
197
|
-
pass
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
class AFFECTS(Label):
|
|
201
|
-
pass
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
class HAS_CAUSE(Label):
|
|
205
|
-
pass
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
class APPLIED_TO(Label):
|
|
209
|
-
pass
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
class USED_IN(Label):
|
|
213
|
-
pass
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
class ASSOCIATED_WITH(Label):
|
|
217
|
-
pass
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
class HAS_ADMINISTRATION_ROUTE(Label):
|
|
221
|
-
pass
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
class HAS_STRENGTH(Label):
|
|
225
|
-
pass
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
class DURATION_OF(Label):
|
|
229
|
-
pass
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
class VALUE_IS(Label):
|
|
233
|
-
pass
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
class PHYSIOLOGY_OF(Label):
|
|
237
|
-
pass
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
class PROCEDURE_PERFORMED(Label):
|
|
241
|
-
pass
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
class MENTAL_PROCESS_OF(Label):
|
|
245
|
-
pass
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
class MEDICAL_CONDITION(Label):
|
|
249
|
-
pass
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
class DOSE_IS(Label):
|
|
253
|
-
pass
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
class FINDING_OF(Label):
|
|
257
|
-
pass
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
class CAUSE_OF_DEATH(Label):
|
|
261
|
-
pass
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
class CONSUME(Label):
|
|
265
|
-
pass
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
from arekit.contrib.source.nerel.reader import NerelDocReader
|
|
2
|
-
from arekit.contrib.source.nerelbio.io_utils import NerelBioIOUtils
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class NerelBioDocReader(NerelDocReader):
|
|
6
|
-
|
|
7
|
-
def __init__(self, version):
|
|
8
|
-
super(NerelBioDocReader, self).__init__(version=version, io_utils=NerelBioIOUtils())
|