arekit 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/__init__.py +0 -0
- arekit/common/__init__.py +0 -0
- arekit/common/bound.py +48 -0
- arekit/common/context/__init__.py +0 -0
- arekit/common/context/terms_mapper.py +51 -0
- arekit/common/context/token.py +16 -0
- arekit/common/data/__init__.py +0 -0
- arekit/common/data/const.py +21 -0
- arekit/common/data/doc_provider.py +6 -0
- arekit/common/data/input/__init__.py +0 -0
- arekit/common/data/input/providers/__init__.py +0 -0
- arekit/common/data/input/providers/columns/__init__.py +0 -0
- arekit/common/data/input/providers/columns/base.py +9 -0
- arekit/common/data/input/providers/columns/sample.py +59 -0
- arekit/common/data/input/providers/const.py +3 -0
- arekit/common/data/input/providers/contents.py +9 -0
- arekit/common/data/input/providers/instances/__init__.py +0 -0
- arekit/common/data/input/providers/instances/base.py +14 -0
- arekit/common/data/input/providers/instances/multiple.py +27 -0
- arekit/common/data/input/providers/instances/single.py +8 -0
- arekit/common/data/input/providers/label/__init__.py +0 -0
- arekit/common/data/input/providers/label/base.py +24 -0
- arekit/common/data/input/providers/label/binary.py +11 -0
- arekit/common/data/input/providers/label/multiple.py +15 -0
- arekit/common/data/input/providers/rows/__init__.py +0 -0
- arekit/common/data/input/providers/rows/base.py +64 -0
- arekit/common/data/input/providers/rows/samples.py +227 -0
- arekit/common/data/input/providers/sample/__init__.py +0 -0
- arekit/common/data/input/providers/sample/cropped.py +43 -0
- arekit/common/data/input/providers/text/__init__.py +0 -0
- arekit/common/data/input/providers/text/single.py +49 -0
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +68 -0
- arekit/common/data/input/repositories/sample.py +22 -0
- arekit/common/data/input/sample.py +66 -0
- arekit/common/data/input/terms_mapper.py +88 -0
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/__init__.py +0 -0
- arekit/common/data/storages/base.py +109 -0
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +26 -0
- arekit/common/docs/__init__.py +0 -0
- arekit/common/docs/base.py +30 -0
- arekit/common/docs/entities_grouping.py +16 -0
- arekit/common/docs/entity.py +18 -0
- arekit/common/docs/objects_parser.py +37 -0
- arekit/common/docs/parsed/__init__.py +0 -0
- arekit/common/docs/parsed/base.py +101 -0
- arekit/common/docs/parsed/providers/__init__.py +0 -0
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/docs/parsed/providers/base_pairs.py +51 -0
- arekit/common/docs/parsed/providers/entity_service.py +175 -0
- arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
- arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parsed/term_position.py +42 -0
- arekit/common/docs/parser.py +34 -0
- arekit/common/docs/sentence.py +14 -0
- arekit/common/entities/__init__.py +0 -0
- arekit/common/entities/base.py +51 -0
- arekit/common/entities/collection.py +72 -0
- arekit/common/entities/str_fmt.py +8 -0
- arekit/common/entities/types.py +9 -0
- arekit/common/experiment/__init__.py +0 -0
- arekit/common/experiment/api/__init__.py +0 -0
- arekit/common/experiment/api/base_samples_io.py +20 -0
- arekit/common/experiment/data_type.py +17 -0
- arekit/common/frames/__init__.py +0 -0
- arekit/common/frames/connotations/__init__.py +0 -0
- arekit/common/frames/connotations/descriptor.py +17 -0
- arekit/common/frames/connotations/provider.py +4 -0
- arekit/common/frames/text_variant.py +43 -0
- arekit/common/frames/variants/__init__.py +0 -0
- arekit/common/frames/variants/base.py +21 -0
- arekit/common/frames/variants/collection.py +60 -0
- arekit/common/labels/__init__.py +0 -0
- arekit/common/labels/base.py +19 -0
- arekit/common/labels/provider/__init__.py +0 -0
- arekit/common/labels/provider/base.py +7 -0
- arekit/common/labels/provider/constant.py +14 -0
- arekit/common/labels/scaler/__init__.py +0 -0
- arekit/common/labels/scaler/base.py +85 -0
- arekit/common/labels/scaler/sentiment.py +7 -0
- arekit/common/labels/scaler/single.py +10 -0
- arekit/common/labels/str_fmt.py +55 -0
- arekit/common/linkage/__init__.py +0 -0
- arekit/common/linkage/base.py +44 -0
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +9 -0
- arekit/common/linkage/text_opinions.py +22 -0
- arekit/common/log_utils.py +29 -0
- arekit/common/model/__init__.py +0 -0
- arekit/common/model/labeling/__init__.py +0 -0
- arekit/common/model/labeling/base.py +24 -0
- arekit/common/model/labeling/modes.py +8 -0
- arekit/common/model/labeling/single.py +24 -0
- arekit/common/opinions/__init__.py +0 -0
- arekit/common/opinions/annot/__init__.py +0 -0
- arekit/common/opinions/annot/algo/__init__.py +0 -0
- arekit/common/opinions/annot/algo/base.py +4 -0
- arekit/common/opinions/annot/algo/pair_based.py +99 -0
- arekit/common/opinions/annot/algo/predefined.py +16 -0
- arekit/common/opinions/annot/algo_based.py +55 -0
- arekit/common/opinions/annot/base.py +15 -0
- arekit/common/opinions/base.py +74 -0
- arekit/common/opinions/collection.py +150 -0
- arekit/common/opinions/enums.py +6 -0
- arekit/common/opinions/provider.py +4 -0
- arekit/common/opinions/writer.py +4 -0
- arekit/common/pipeline/__init__.py +0 -0
- arekit/common/pipeline/base.py +25 -0
- arekit/common/pipeline/context.py +36 -0
- arekit/common/pipeline/conts.py +2 -0
- arekit/common/pipeline/items/__init__.py +0 -0
- arekit/common/pipeline/items/base.py +12 -0
- arekit/common/pipeline/items/flatten.py +14 -0
- arekit/common/pipeline/items/handle.py +17 -0
- arekit/common/pipeline/items/iter.py +11 -0
- arekit/common/pipeline/items/map.py +11 -0
- arekit/common/pipeline/items/map_nested.py +13 -0
- arekit/common/synonyms/__init__.py +0 -0
- arekit/common/synonyms/base.py +151 -0
- arekit/common/synonyms/grouping.py +21 -0
- arekit/common/text/__init__.py +0 -0
- arekit/common/text/enums.py +12 -0
- arekit/common/text/parsed.py +42 -0
- arekit/common/text/parser.py +12 -0
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +4 -0
- arekit/common/text/partitioning/str.py +36 -0
- arekit/common/text/partitioning/terms.py +35 -0
- arekit/common/text/stemmer.py +16 -0
- arekit/common/text_opinions/__init__.py +0 -0
- arekit/common/text_opinions/base.py +105 -0
- arekit/common/utils.py +129 -0
- arekit/contrib/__init__.py +0 -0
- arekit/contrib/bert/__init__.py +0 -0
- arekit/contrib/bert/input/__init__.py +0 -0
- arekit/contrib/bert/input/providers/__init__.py +0 -0
- arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
- arekit/contrib/bert/input/providers/text_pair.py +62 -0
- arekit/contrib/bert/terms/__init__.py +0 -0
- arekit/contrib/bert/terms/mapper.py +20 -0
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +149 -0
- arekit/contrib/networks/embedding_io.py +18 -0
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +6 -0
- arekit/contrib/networks/input/ctx_serialization.py +28 -0
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +29 -0
- arekit/contrib/networks/input/embedding/offsets.py +55 -0
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +129 -0
- arekit/contrib/networks/input/providers/term_connotation.py +23 -0
- arekit/contrib/networks/input/providers/text.py +24 -0
- arekit/contrib/networks/input/rows_parser.py +47 -0
- arekit/contrib/networks/input/term_types.py +13 -0
- arekit/contrib/networks/input/terms_mapping.py +60 -0
- arekit/contrib/networks/vectorizer.py +6 -0
- arekit/contrib/prompt/__init__.py +0 -0
- arekit/contrib/prompt/sample.py +61 -0
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +84 -0
- arekit/contrib/source/brat/doc.py +28 -0
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +13 -0
- arekit/contrib/source/brat/entities/entity.py +42 -0
- arekit/contrib/source/brat/entities/parser.py +53 -0
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +19 -0
- arekit/contrib/source/brat/relation.py +32 -0
- arekit/contrib/source/brat/sentence.py +69 -0
- arekit/contrib/source/brat/sentences_reader.py +128 -0
- arekit/contrib/source/download.py +41 -0
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +55 -0
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +74 -0
- arekit/contrib/source/nerel/io_utils.py +62 -0
- arekit/contrib/source/nerel/labels.py +241 -0
- arekit/contrib/source/nerel/reader.py +46 -0
- arekit/contrib/source/nerel/utils.py +24 -0
- arekit/contrib/source/nerel/versions.py +12 -0
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +62 -0
- arekit/contrib/source/nerelbio/labels.py +265 -0
- arekit/contrib/source/nerelbio/reader.py +8 -0
- arekit/contrib/source/nerelbio/versions.py +8 -0
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +36 -0
- arekit/contrib/source/ruattitudes/doc.py +51 -0
- arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
- arekit/contrib/source/ruattitudes/io_utils.py +56 -0
- arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
- arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
- arekit/contrib/source/ruattitudes/reader.py +268 -0
- arekit/contrib/source/ruattitudes/sentence.py +73 -0
- arekit/contrib/source/ruattitudes/synonyms.py +17 -0
- arekit/contrib/source/ruattitudes/text_object.py +59 -0
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +157 -0
- arekit/contrib/source/rusentiframes/effect.py +24 -0
- arekit/contrib/source/rusentiframes/io_utils.py +19 -0
- arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
- arekit/contrib/source/rusentiframes/polarity.py +35 -0
- arekit/contrib/source/rusentiframes/role.py +15 -0
- arekit/contrib/source/rusentiframes/state.py +24 -0
- arekit/contrib/source/rusentiframes/types.py +42 -0
- arekit/contrib/source/rusentiframes/value.py +2 -0
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +3 -0
- arekit/contrib/source/rusentrel/docs_reader.py +51 -0
- arekit/contrib/source/rusentrel/entities.py +26 -0
- arekit/contrib/source/rusentrel/io_utils.py +125 -0
- arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
- arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
- arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
- arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
- arekit/contrib/source/rusentrel/synonyms.py +17 -0
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +52 -0
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +31 -0
- arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
- arekit/contrib/source/sentinerel/io_utils.py +87 -0
- arekit/contrib/source/sentinerel/labels.py +53 -0
- arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
- arekit/contrib/source/sentinerel/reader.py +42 -0
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +19 -0
- arekit/contrib/source/zip_utils.py +47 -0
- arekit/contrib/utils/__init__.py +0 -0
- arekit/contrib/utils/bert/__init__.py +0 -0
- arekit/contrib/utils/bert/samplers.py +17 -0
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
- arekit/contrib/utils/data/__init__.py +0 -0
- arekit/contrib/utils/data/contents/__init__.py +0 -0
- arekit/contrib/utils/data/contents/opinions.py +37 -0
- arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +7 -0
- arekit/contrib/utils/data/readers/csv_pd.py +38 -0
- arekit/contrib/utils/data/readers/jsonl.py +15 -0
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +50 -0
- arekit/contrib/utils/data/storages/__init__.py +0 -0
- arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
- arekit/contrib/utils/data/storages/pandas_based.py +123 -0
- arekit/contrib/utils/data/storages/row_cache.py +48 -0
- arekit/contrib/utils/data/writers/__init__.py +0 -0
- arekit/contrib/utils/data/writers/base.py +27 -0
- arekit/contrib/utils/data/writers/csv_native.py +63 -0
- arekit/contrib/utils/data/writers/csv_pd.py +40 -0
- arekit/contrib/utils/data/writers/json_opennre.py +132 -0
- arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
- arekit/contrib/utils/download.py +77 -0
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +58 -0
- arekit/contrib/utils/embeddings/tokens.py +30 -0
- arekit/contrib/utils/entities/__init__.py +0 -0
- arekit/contrib/utils/entities/filter.py +7 -0
- arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit/contrib/utils/entities/formatters/str_display.py +11 -0
- arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
- arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit/contrib/utils/io_utils/embedding.py +72 -0
- arekit/contrib/utils/io_utils/opinions.py +37 -0
- arekit/contrib/utils/io_utils/samples.py +79 -0
- arekit/contrib/utils/io_utils/utils.py +39 -0
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +41 -0
- arekit/contrib/utils/lexicons/relation.py +42 -0
- arekit/contrib/utils/lexicons/rusentilex.py +37 -0
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +83 -0
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +22 -0
- arekit/contrib/utils/np_utils/npz_utils.py +13 -0
- arekit/contrib/utils/np_utils/vocab.py +20 -0
- arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
- arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
- arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
- arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
- arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
- arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
- arekit/contrib/utils/processing/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +12 -0
- arekit/contrib/utils/processing/languages/pos.py +23 -0
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
- arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
- arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
- arekit/contrib/utils/processing/languages/ru/number.py +23 -0
- arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +12 -0
- arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
- arekit/contrib/utils/processing/pos/russian.py +10 -0
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +127 -0
- arekit/contrib/utils/resources.py +25 -0
- arekit/contrib/utils/serializer.py +43 -0
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
- arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit/contrib/utils/synonyms/simple.py +15 -0
- arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +93 -0
- arekit/contrib/utils/vectorizers/random_norm.py +39 -0
- arekit/download_data.py +11 -0
- arekit-0.24.0.dist-info/LICENSE +21 -0
- arekit-0.24.0.dist-info/METADATA +23 -0
- arekit-0.24.0.dist-info/RECORD +374 -0
- arekit-0.24.0.dist-info/WHEEL +5 -0
- arekit-0.24.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from arekit.common.entities.collection import EntityCollection
|
|
2
|
+
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
|
|
3
|
+
from arekit.contrib.source.brat.annot import BratAnnotationParser
|
|
4
|
+
from arekit.contrib.source.brat.entities.entity import BratEntity
|
|
5
|
+
from arekit.contrib.source.nerel.io_utils import NerelIOUtils
|
|
6
|
+
from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
|
|
7
|
+
from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class NerelEntityCollection(EntityCollection):
    """ Collection of entities taken from parsed BRAT annotations.

        Entities whose `Type` is listed in `entities_to_ignore` are dropped;
        the remaining ones are ordered by their `IndexBegin`.
    """

    def __init__(self, contents, value_to_group_id_func, entities_to_ignore=None):
        """
        contents: dict
            parsed annotation, must provide the `BratAnnotationParser.ENTITIES` key.
            NOTE: the list stored under this key is replaced in place
            with the filtered one.
        value_to_group_id_func:
            forwarded as is to the `EntityCollection` initializer.
        entities_to_ignore: list or None
            types of the entities to discard; this parameter is required because
            of the simplified implementation of the nested objects of the
            BRAT annotation.
        """
        assert(isinstance(contents, dict))
        assert(BratAnnotationParser.ENTITIES in contents)
        assert(isinstance(entities_to_ignore, list) or entities_to_ignore is None)

        # Has to be assigned before filtering, since `__keep_entity` reads it.
        self.__discard_entities = set() if entities_to_ignore is None else set(entities_to_ignore)

        kept_entities = list(filter(self.__keep_entity, contents[BratAnnotationParser.ENTITIES]))
        contents[BratAnnotationParser.ENTITIES] = kept_entities

        super(NerelEntityCollection, self).__init__(
            entities=kept_entities,
            value_to_group_id_func=value_to_group_id_func)

        self._sort_entities(key=lambda entity: entity.IndexBegin)

    def __keep_entity(self, entity):
        """ True when the type of the entity is not among the discarded ones.
        """
        assert(isinstance(entity, BratEntity))
        is_discarded = entity.Type in self.__discard_entities
        return not is_discarded

    @classmethod
    def read_collection(cls, filename, version, io_utils, entities_to_ignore=None):
        """ Read the annotation of the `filename` document from the archive of
            the given `version` and compose the collection out of it.
        """
        assert(isinstance(io_utils, NerelIOUtils))
        assert(isinstance(filename, str))

        # The dataset has no synonyms collection of its own, so we start from
        # an empty, writable one and fill it while the entities are registered.
        synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)

        def group_id_of(value):
            return SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
                synonyms, value)

        def build_collection(input_file):
            annotations = BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig')
            return cls(contents=annotations,
                       entities_to_ignore=entities_to_ignore,
                       value_to_group_id_func=group_id_of)

        doc_fold = io_utils.map_doc_to_fold_type(version)
        annotation_path = io_utils.get_annotation_innerpath(
            folding_data_type=doc_fold[filename],
            filename=filename)

        return io_utils.read_from_zip(inner_path=annotation_path,
                                      process_func=build_collection,
                                      version=version)
|
|
File without changes
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from collections import OrderedDict
|
|
2
|
+
|
|
3
|
+
from arekit.common.experiment.data_type import DataType
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def create_fixed_folding(train_filenames, dev_filenames, test_filenames, limit=None):
    """ Compose the fixed data-folding out of the predefined lists of filenames.

        All the filenames (train, then dev, then test) are indexed first;
        `limit`, when given, only restricts how many leading filenames of
        every split end up in the folding.

        Returns the pair: (filenames by document ids, document ids by data type).
    """
    assert(isinstance(train_filenames, list))
    assert(isinstance(dev_filenames, list))
    assert(isinstance(test_filenames, list))

    filenames_by_ids = create_filenames_by_ids(filenames=train_filenames + dev_filenames + test_filenames)

    # Reverse lookup: filename -> document id.
    ids_by_filenames = {filename: doc_id for doc_id, filename in filenames_by_ids.items()}

    def doc_ids_of(split_filenames):
        selected = split_filenames if limit is None else split_filenames[:limit]
        return [ids_by_filenames[filename] for filename in selected]

    fixed_folding = {
        DataType.Train: doc_ids_of(train_filenames),
        DataType.Test: doc_ids_of(test_filenames),
        DataType.Dev: doc_ids_of(dev_filenames),
    }

    return filenames_by_ids, fixed_folding
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def create_filenames_by_ids(filenames):
    """ Index the filenames: returns an OrderedDict of document id -> filename,
        in the order of `filenames`.

        The id is the number that prefixes the filename; a filename without
        such a prefix receives the first id which is still free, searching
        from the id generated last time (initially 0).
    """
    filenames_by_ids = OrderedDict()
    last_generated_id = 0

    for fname in filenames:

        doc_id = number_from_string(fname)

        if doc_id is None:
            # No numeric prefix: pick the first vacant id.
            doc_id = last_generated_id
            while doc_id in filenames_by_ids:
                doc_id += 1
            last_generated_id = doc_id

        # Ids are expected to be unique across all the filenames.
        assert(doc_id not in filenames_by_ids)
        filenames_by_ids[doc_id] = fname

    return filenames_by_ids
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def number_from_string(s):
    """ Return the integer written by the leading decimal digits of `s`,
        or None when `s` does not start with a decimal digit.

        Only the characters accepted by `int()` are consumed: `str.isdecimal`
        is used instead of `str.isdigit`, since the latter is also true for
        characters such as superscripts ("²"), which make `int()` raise
        ValueError.
    """
    assert(isinstance(s, str))

    prefix_length = 0
    for char in s:
        if not char.isdecimal():
            break
        prefix_length += 1

    if prefix_length == 0:
        return None

    return int(s[:prefix_length])
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from os import path
|
|
2
|
+
|
|
3
|
+
from arekit.common.experiment.data_type import DataType
|
|
4
|
+
from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding
|
|
5
|
+
from arekit.contrib.source.nerel.utils import iter_filename_and_splittype
|
|
6
|
+
from arekit.contrib.source.zip_utils import ZipArchiveUtils
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NerelIOUtils(ZipArchiveUtils):
    """ Paths and train/dev/test split helpers for the NEREL collection archive.
    """

    # Name of the inner folder for every data type.
    splits = {
        DataType.Train: "train",
        DataType.Dev: "dev",
        DataType.Test: "test"
    }

    @staticmethod
    def get_archive_filepath(version):
        """ Path to the archive of the given collection version.
        """
        return path.join(NerelIOUtils.get_data_root(), "nerel-{}.zip".format(version))

    @staticmethod
    def get_annotation_innerpath(folding_data_type, filename):
        """ Path of the annotation (*.ann) of the document inside the split folder.
        """
        assert(isinstance(filename, str))
        # NOTE(review): `path.join` relies on the OS separator, while zip members
        # are named with "/" -- confirm the reader handles this on Windows.
        return path.join(NerelIOUtils.splits[folding_data_type], "{}.ann".format(filename))

    @staticmethod
    def get_news_innerpath(folding_data_type, filename):
        """ Path of the text (*.txt) of the document inside the split folder.
        """
        assert(isinstance(filename, str))
        return path.join(NerelIOUtils.splits[folding_data_type], "{}.txt".format(filename))

    @staticmethod
    def _iter_filename_and_split(version):
        """ Iterate (filename, split type) pairs for the files of the archive.
        """
        return iter_filename_and_splittype(
            filenames_it=NerelIOUtils.iter_filenames_from_zip(version),
            splits=NerelIOUtils.splits.items())

    @staticmethod
    def map_doc_to_fold_type(version):
        """ Returns dict: filename -> split type of the document.
        """
        return {filename: split_type
                for filename, split_type in NerelIOUtils._iter_filename_and_split(version)}

    @staticmethod
    def read_dataset_split(version, docs_limit=None):
        """ Compose the fixed folding out of the files of the archive.

            docs_limit: int or None
                maximal amount of documents per every split in the folding.

            Returns the pair: (filenames by document ids, document ids by data type).
        """
        # Every known split starts with an empty bucket, so that a split
        # without files results in an empty fold rather than a KeyError.
        f2d = {data_type: [] for data_type in NerelIOUtils.splits}
        for filename, split_type in NerelIOUtils._iter_filename_and_split(version):
            f2d.setdefault(split_type, []).append(filename)

        filenames_by_ids, data_folding = create_fixed_folding(train_filenames=f2d[DataType.Train],
                                                              test_filenames=f2d[DataType.Test],
                                                              dev_filenames=f2d[DataType.Dev],
                                                              limit=docs_limit)

        return filenames_by_ids, data_folding
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
from arekit.common.labels.base import Label
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class OpinionBelongsTo(Label):
|
|
5
|
+
pass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class OpinionRelatesTo(Label):
|
|
9
|
+
pass
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class NegEffectFrom(Label):
|
|
13
|
+
pass
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class NegStateFrom(Label):
|
|
17
|
+
pass
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class PosEffectFrom(Label):
|
|
21
|
+
pass
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PosAuthorFrom(Label):
|
|
25
|
+
pass
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class NegAuthorFrom(Label):
|
|
29
|
+
pass
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class PosStateFrom(Label):
|
|
33
|
+
pass
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class NegativeTo(Label):
|
|
37
|
+
pass
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class PositiveTo(Label):
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class STATE_BELONGS_TO(Label):
|
|
45
|
+
pass
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class ABBREVIATION(Label):
|
|
49
|
+
pass
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class HEADQUARTERED_IN(Label):
|
|
53
|
+
pass
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class AGE_DIED_AT(Label):
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class AGE_IS(Label):
|
|
61
|
+
pass
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class AGENT(Label):
|
|
65
|
+
pass
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class IDEOLOGY_OF(Label):
|
|
69
|
+
pass
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class PLACE_RESIDES_IN(Label):
|
|
73
|
+
pass
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class POINT_IN_TIME(Label):
|
|
77
|
+
pass
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class INANIMATE_INVOLVED(Label):
|
|
81
|
+
pass
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class PRICE_OF(Label):
|
|
85
|
+
pass
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class INCOME(Label):
|
|
89
|
+
pass
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class PRODUCES(Label):
|
|
93
|
+
pass
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class ALTERNATIVE_NAME(Label):
|
|
97
|
+
pass
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class AWARDED_WITH(Label):
|
|
101
|
+
pass
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class CAUSE_OF_DEATH(Label):
|
|
105
|
+
pass
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class CONVICTED_OF(Label):
|
|
109
|
+
pass
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class DATE_DEFUNCT_IN(Label):
|
|
113
|
+
pass
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class DATE_FOUNDED_IN(Label):
|
|
117
|
+
pass
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class DATE_OF_BIRTH(Label):
|
|
121
|
+
pass
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class DATE_OF_CREATION(Label):
|
|
125
|
+
pass
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class DATE_OF_DEATH(Label):
|
|
129
|
+
pass
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class END_TIME(Label):
|
|
133
|
+
pass
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class EXPENDITURE(Label):
|
|
137
|
+
pass
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class FOUNDED_BY(Label):
|
|
141
|
+
pass
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class KNOWS(Label):
|
|
145
|
+
pass
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class RELATIVE(Label):
|
|
149
|
+
pass
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class LOCATED_IN(Label):
|
|
153
|
+
pass
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class RELIGION_OF(Label):
|
|
157
|
+
pass
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class MEDICAL_CONDITION(Label):
|
|
161
|
+
pass
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class SCHOOLS_ATTENDED(Label):
|
|
165
|
+
pass
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class MEMBER_OF(Label):
|
|
169
|
+
pass
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class SIBLING(Label):
|
|
173
|
+
pass
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class ORGANIZES(Label):
    """Label type ``ORGANIZES``; it adds nothing to ``Label``."""
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
class SPOUSE(Label):
    """Label type ``SPOUSE``; it adds nothing to ``Label``."""
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class ORIGINS_FROM(Label):
    """Label type ``ORIGINS_FROM``; it adds nothing to ``Label``."""
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class START_TIME(Label):
    """Label type ``START_TIME``; it adds nothing to ``Label``."""
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class OWNER_OF(Label):
    """Label type ``OWNER_OF``; it adds nothing to ``Label``."""
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class SUBEVENT_OF(Label):
    """Label type ``SUBEVENT_OF``; it adds nothing to ``Label``."""
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class PARENT_OF(Label):
    """Label type ``PARENT_OF``; it adds nothing to ``Label``."""
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class SUBORDINATE_OF(Label):
    """Label type ``SUBORDINATE_OF``; it adds nothing to ``Label``."""
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
class PART_OF(Label):
    """Label type ``PART_OF``; it adds nothing to ``Label``."""
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
class TAKES_PLACE_IN(Label):
    """Label type ``TAKES_PLACE_IN``; it adds nothing to ``Label``."""
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class PARTICIPANT_IN(Label):
    """Label type ``PARTICIPANT_IN``; it adds nothing to ``Label``."""
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
class WORKPLACE(Label):
    """Label type ``WORKPLACE``; it adds nothing to ``Label``."""
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
class PENALIZED_AS(Label):
    """Label type ``PENALIZED_AS``; it adds nothing to ``Label``."""
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
class WORKS_AS(Label):
    """Label type ``WORKS_AS``; it adds nothing to ``Label``."""
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
class PLACE_OF_DEATH(Label):
    """Label type ``PLACE_OF_DEATH``; it adds nothing to ``Label``."""
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
class PLACE_OF_BIRTH(Label):
    """Label type ``PLACE_OF_BIRTH``; it adds nothing to ``Label``."""
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
class HAS_CAUSE(Label):
    """Label type ``HAS_CAUSE``; it adds nothing to ``Label``."""
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from arekit.contrib.source.brat.annot import BratAnnotationParser
|
|
2
|
+
from arekit.contrib.source.brat.doc import BratDocument
|
|
3
|
+
from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
|
|
4
|
+
from arekit.contrib.source.nerel.entities import NerelEntityCollection
|
|
5
|
+
from arekit.contrib.source.nerel.io_utils import NerelIOUtils
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NerelDocReader(object):
    """Reads documents and their relation annotations for one collection version.

    Every archive access is delegated to the ``io_utils`` object given at
    construction time.
    """

    def __init__(self, version, io_utils=None):
        """Bind the reader to a collection version.

        :param version: collection version; forwarded to every ``io_utils`` call.
        :param io_utils: ``NerelIOUtils`` instance (or an instance of a
            subclass) used for archive access. When omitted, a new
            ``NerelIOUtils()`` is created for this reader.
        """
        # A default written directly in the signature is evaluated once, when
        # the ``def`` statement runs, and is then shared by every reader; the
        # ``None`` sentinel builds the default per call instead.
        if io_utils is None:
            io_utils = NerelIOUtils()
        assert(isinstance(io_utils, NerelIOUtils))
        self.__version = version
        self.__io_utils = io_utils
        # Indexed by document filename in the ``read_*`` methods below.
        self.__doc_fold = io_utils.map_doc_to_fold_type(version)

    def read_text_relations(self, filename):
        """Read the relations annotated for the document ``filename``.

        :param filename: document name (``str``); it must be a key of the
            mapping returned by ``io_utils.map_doc_to_fold_type``.
        :return: the result of ``io_utils.read_from_zip`` -- presumably the
            list built by the callback below; confirm against
            ``NerelIOUtils.read_from_zip``.
        """
        assert(isinstance(filename, str))

        def parse_relations(input_file):
            # Only the "relations" part of the parsed annotation is kept.
            annotations = BratAnnotationParser.parse_annotations(
                input_file=input_file, encoding='utf-8-sig')
            return list(annotations["relations"])

        return self.__io_utils.read_from_zip(
            inner_path=self.__io_utils.get_annotation_innerpath(
                folding_data_type=self.__doc_fold[filename],
                filename=filename),
            process_func=parse_relations,
            version=self.__version)

    def read_document(self, filename, doc_id, entities_to_ignore=None):
        """Build a ``BratDocument`` for the document ``filename``.

        :param filename: document name (``str``); it must be a key of the
            mapping returned by ``io_utils.map_doc_to_fold_type``.
        :param doc_id: integer identifier passed to ``BratDocument``.
        :param entities_to_ignore: forwarded unchanged to
            ``NerelEntityCollection.read_collection``.
        :return: the result of ``io_utils.read_from_zip`` -- presumably the
            ``BratDocument`` built by ``file_to_doc``; confirm against
            ``NerelIOUtils.read_from_zip``.
        """
        assert(isinstance(filename, str))
        assert(isinstance(doc_id, int))

        def file_to_doc(input_file):
            # ``entities`` and ``text_relations`` are assigned further down,
            # before this callback is handed to ``read_from_zip``.
            sentences = BratDocumentSentencesReader.from_file(input_file=input_file, entities=entities)
            return BratDocument(doc_id=doc_id, sentences=sentences, text_relations=text_relations)

        entities = NerelEntityCollection.read_collection(
            filename=filename, version=self.__version,
            entities_to_ignore=entities_to_ignore, io_utils=self.__io_utils)

        text_relations = self.read_text_relations(filename=filename)

        return self.__io_utils.read_from_zip(
            inner_path=self.__io_utils.get_news_innerpath(
                folding_data_type=self.__doc_fold[filename], filename=filename),
            process_func=file_to_doc,
            version=self.__version)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from os.path import basename
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def __iter_filtered_filenames(filenames_iter):
    """Yield ``(stem, base)`` for every entry whose last four characters are ``.txt``.

    ``stem`` is the entry with those four characters removed and ``base`` is
    ``os.path.basename(stem)``. All other entries are skipped.
    """
    suffix = ".txt"
    for entry in filenames_iter:
        if entry[-len(suffix):] != suffix:
            continue
        # Crop extension.
        stem = entry[:-len(suffix)]
        yield stem, basename(stem)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def iter_filename_and_splittype(filenames_it, splits):
    """Yield ``(basename, split_type)`` for each ``.txt`` entry matching a split.

    :param filenames_it: iterable of entry names; filtered and cropped by
        ``__iter_filtered_filenames``.
    :param splits: iterable of ``(split_type, split_name)`` pairs. It is
        iterated once per entry, so it has to be re-iterable.

    An entry matches a split when ``split_name`` occurs anywhere in its
    cropped path (``in`` test on the path). There is no ``break``, so an entry
    whose path contains several split names is yielded once per matching
    split.
    """
    for cropped_path, doc_name in __iter_filtered_filenames(filenames_it):
        for split_type, split_name in splits:
            if split_name not in cropped_path:
                continue
            yield doc_name, split_type
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def iter_collection_filenames(filenames_it):
    """Yield ``(index, item)`` for every ``.txt`` entry of ``filenames_it``.

    ``index`` counts the accepted entries from zero. ``item`` is the pair
    produced by ``__iter_filtered_filenames`` (cropped path, basename), not a
    single name.
    """
    for indexed_item in enumerate(__iter_filtered_filenames(filenames_it)):
        yield indexed_item
|
|
File without changes
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from os import path
|
|
2
|
+
|
|
3
|
+
from arekit.common.experiment.data_type import DataType
|
|
4
|
+
from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding
|
|
5
|
+
from arekit.contrib.source.nerel.io_utils import NerelIOUtils
|
|
6
|
+
from arekit.contrib.source.nerel.utils import iter_filename_and_splittype
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NerelBioIOUtils(NerelIOUtils):
    """``NerelIOUtils`` variant that works with the ``nerel-bio-<version>.zip`` archive."""

    # Name used for each data type: it is the leading component of the inner
    # paths built below and is passed as ``splits`` to
    # ``iter_filename_and_splittype``.
    splits = {
        DataType.Train: "train",
        DataType.Dev: "dev",
        DataType.Test: "test"
    }

    @staticmethod
    def get_archive_filepath(version):
        """Return the archive path for ``version`` under ``get_data_root()``."""
        data_root = NerelBioIOUtils.get_data_root()
        return path.join(data_root, "nerel-bio-{}.zip".format(version))

    @staticmethod
    def get_annotation_innerpath(folding_data_type, filename):
        """Return ``<split name>/<filename>.ann`` for the given data type."""
        assert(isinstance(filename, str))
        split_dir = NerelBioIOUtils.splits[folding_data_type]
        return path.join(split_dir, "{}.ann".format(filename))

    @staticmethod
    def get_news_innerpath(folding_data_type, filename):
        """Return ``<split name>/<filename>.txt`` for the given data type."""
        assert(isinstance(filename, str))
        split_dir = NerelBioIOUtils.splits[folding_data_type]
        return path.join(split_dir, "{}.txt".format(filename))

    @staticmethod
    def map_doc_to_fold_type(version):
        """Return a dict mapping each document name to its split type.

        When the same name is produced more than once, the last pair wins.
        """
        name_split_pairs = iter_filename_and_splittype(
            filenames_it=NerelBioIOUtils.iter_filenames_from_zip(version),
            splits=NerelBioIOUtils.splits.items())

        return dict(name_split_pairs)

    @staticmethod
    def read_dataset_split(version, docs_limit=None):
        """Group document names by split type and build a fixed folding.

        :param version: archive version whose entries are listed.
        :param docs_limit: forwarded to ``create_fixed_folding`` as ``limit``.
        :return: the ``(filenames_by_ids, data_folding)`` pair produced by
            ``create_fixed_folding``.
        :raises KeyError: when no document was found for the train, test or
            dev split.
        """
        name_split_pairs = iter_filename_and_splittype(
            filenames_it=NerelBioIOUtils.iter_filenames_from_zip(version),
            splits=NerelBioIOUtils.splits.items())

        names_by_split = {}
        for doc_name, split_type in name_split_pairs:
            names_by_split.setdefault(split_type, []).append(doc_name)

        filenames_by_ids, data_folding = create_fixed_folding(
            train_filenames=names_by_split[DataType.Train],
            test_filenames=names_by_split[DataType.Test],
            dev_filenames=names_by_split[DataType.Dev],
            limit=docs_limit)

        return filenames_by_ids, data_folding
|