arekit 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/__init__.py +0 -0
- arekit/common/__init__.py +0 -0
- arekit/common/bound.py +48 -0
- arekit/common/context/__init__.py +0 -0
- arekit/common/context/terms_mapper.py +51 -0
- arekit/common/context/token.py +16 -0
- arekit/common/data/__init__.py +0 -0
- arekit/common/data/const.py +21 -0
- arekit/common/data/doc_provider.py +6 -0
- arekit/common/data/input/__init__.py +0 -0
- arekit/common/data/input/providers/__init__.py +0 -0
- arekit/common/data/input/providers/columns/__init__.py +0 -0
- arekit/common/data/input/providers/columns/base.py +9 -0
- arekit/common/data/input/providers/columns/sample.py +59 -0
- arekit/common/data/input/providers/const.py +3 -0
- arekit/common/data/input/providers/contents.py +9 -0
- arekit/common/data/input/providers/instances/__init__.py +0 -0
- arekit/common/data/input/providers/instances/base.py +14 -0
- arekit/common/data/input/providers/instances/multiple.py +27 -0
- arekit/common/data/input/providers/instances/single.py +8 -0
- arekit/common/data/input/providers/label/__init__.py +0 -0
- arekit/common/data/input/providers/label/base.py +24 -0
- arekit/common/data/input/providers/label/binary.py +11 -0
- arekit/common/data/input/providers/label/multiple.py +15 -0
- arekit/common/data/input/providers/rows/__init__.py +0 -0
- arekit/common/data/input/providers/rows/base.py +64 -0
- arekit/common/data/input/providers/rows/samples.py +227 -0
- arekit/common/data/input/providers/sample/__init__.py +0 -0
- arekit/common/data/input/providers/sample/cropped.py +43 -0
- arekit/common/data/input/providers/text/__init__.py +0 -0
- arekit/common/data/input/providers/text/single.py +49 -0
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +68 -0
- arekit/common/data/input/repositories/sample.py +22 -0
- arekit/common/data/input/sample.py +66 -0
- arekit/common/data/input/terms_mapper.py +88 -0
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/__init__.py +0 -0
- arekit/common/data/storages/base.py +109 -0
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +26 -0
- arekit/common/docs/__init__.py +0 -0
- arekit/common/docs/base.py +30 -0
- arekit/common/docs/entities_grouping.py +16 -0
- arekit/common/docs/entity.py +18 -0
- arekit/common/docs/objects_parser.py +37 -0
- arekit/common/docs/parsed/__init__.py +0 -0
- arekit/common/docs/parsed/base.py +101 -0
- arekit/common/docs/parsed/providers/__init__.py +0 -0
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/docs/parsed/providers/base_pairs.py +51 -0
- arekit/common/docs/parsed/providers/entity_service.py +175 -0
- arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
- arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parsed/term_position.py +42 -0
- arekit/common/docs/parser.py +34 -0
- arekit/common/docs/sentence.py +14 -0
- arekit/common/entities/__init__.py +0 -0
- arekit/common/entities/base.py +51 -0
- arekit/common/entities/collection.py +72 -0
- arekit/common/entities/str_fmt.py +8 -0
- arekit/common/entities/types.py +9 -0
- arekit/common/experiment/__init__.py +0 -0
- arekit/common/experiment/api/__init__.py +0 -0
- arekit/common/experiment/api/base_samples_io.py +20 -0
- arekit/common/experiment/data_type.py +17 -0
- arekit/common/frames/__init__.py +0 -0
- arekit/common/frames/connotations/__init__.py +0 -0
- arekit/common/frames/connotations/descriptor.py +17 -0
- arekit/common/frames/connotations/provider.py +4 -0
- arekit/common/frames/text_variant.py +43 -0
- arekit/common/frames/variants/__init__.py +0 -0
- arekit/common/frames/variants/base.py +21 -0
- arekit/common/frames/variants/collection.py +60 -0
- arekit/common/labels/__init__.py +0 -0
- arekit/common/labels/base.py +19 -0
- arekit/common/labels/provider/__init__.py +0 -0
- arekit/common/labels/provider/base.py +7 -0
- arekit/common/labels/provider/constant.py +14 -0
- arekit/common/labels/scaler/__init__.py +0 -0
- arekit/common/labels/scaler/base.py +85 -0
- arekit/common/labels/scaler/sentiment.py +7 -0
- arekit/common/labels/scaler/single.py +10 -0
- arekit/common/labels/str_fmt.py +55 -0
- arekit/common/linkage/__init__.py +0 -0
- arekit/common/linkage/base.py +44 -0
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +9 -0
- arekit/common/linkage/text_opinions.py +22 -0
- arekit/common/log_utils.py +29 -0
- arekit/common/model/__init__.py +0 -0
- arekit/common/model/labeling/__init__.py +0 -0
- arekit/common/model/labeling/base.py +24 -0
- arekit/common/model/labeling/modes.py +8 -0
- arekit/common/model/labeling/single.py +24 -0
- arekit/common/opinions/__init__.py +0 -0
- arekit/common/opinions/annot/__init__.py +0 -0
- arekit/common/opinions/annot/algo/__init__.py +0 -0
- arekit/common/opinions/annot/algo/base.py +4 -0
- arekit/common/opinions/annot/algo/pair_based.py +99 -0
- arekit/common/opinions/annot/algo/predefined.py +16 -0
- arekit/common/opinions/annot/algo_based.py +55 -0
- arekit/common/opinions/annot/base.py +15 -0
- arekit/common/opinions/base.py +74 -0
- arekit/common/opinions/collection.py +150 -0
- arekit/common/opinions/enums.py +6 -0
- arekit/common/opinions/provider.py +4 -0
- arekit/common/opinions/writer.py +4 -0
- arekit/common/pipeline/__init__.py +0 -0
- arekit/common/pipeline/base.py +25 -0
- arekit/common/pipeline/context.py +36 -0
- arekit/common/pipeline/conts.py +2 -0
- arekit/common/pipeline/items/__init__.py +0 -0
- arekit/common/pipeline/items/base.py +12 -0
- arekit/common/pipeline/items/flatten.py +14 -0
- arekit/common/pipeline/items/handle.py +17 -0
- arekit/common/pipeline/items/iter.py +11 -0
- arekit/common/pipeline/items/map.py +11 -0
- arekit/common/pipeline/items/map_nested.py +13 -0
- arekit/common/synonyms/__init__.py +0 -0
- arekit/common/synonyms/base.py +151 -0
- arekit/common/synonyms/grouping.py +21 -0
- arekit/common/text/__init__.py +0 -0
- arekit/common/text/enums.py +12 -0
- arekit/common/text/parsed.py +42 -0
- arekit/common/text/parser.py +12 -0
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +4 -0
- arekit/common/text/partitioning/str.py +36 -0
- arekit/common/text/partitioning/terms.py +35 -0
- arekit/common/text/stemmer.py +16 -0
- arekit/common/text_opinions/__init__.py +0 -0
- arekit/common/text_opinions/base.py +105 -0
- arekit/common/utils.py +129 -0
- arekit/contrib/__init__.py +0 -0
- arekit/contrib/bert/__init__.py +0 -0
- arekit/contrib/bert/input/__init__.py +0 -0
- arekit/contrib/bert/input/providers/__init__.py +0 -0
- arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
- arekit/contrib/bert/input/providers/text_pair.py +62 -0
- arekit/contrib/bert/terms/__init__.py +0 -0
- arekit/contrib/bert/terms/mapper.py +20 -0
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +149 -0
- arekit/contrib/networks/embedding_io.py +18 -0
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +6 -0
- arekit/contrib/networks/input/ctx_serialization.py +28 -0
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +29 -0
- arekit/contrib/networks/input/embedding/offsets.py +55 -0
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +129 -0
- arekit/contrib/networks/input/providers/term_connotation.py +23 -0
- arekit/contrib/networks/input/providers/text.py +24 -0
- arekit/contrib/networks/input/rows_parser.py +47 -0
- arekit/contrib/networks/input/term_types.py +13 -0
- arekit/contrib/networks/input/terms_mapping.py +60 -0
- arekit/contrib/networks/vectorizer.py +6 -0
- arekit/contrib/prompt/__init__.py +0 -0
- arekit/contrib/prompt/sample.py +61 -0
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +84 -0
- arekit/contrib/source/brat/doc.py +28 -0
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +13 -0
- arekit/contrib/source/brat/entities/entity.py +42 -0
- arekit/contrib/source/brat/entities/parser.py +53 -0
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +19 -0
- arekit/contrib/source/brat/relation.py +32 -0
- arekit/contrib/source/brat/sentence.py +69 -0
- arekit/contrib/source/brat/sentences_reader.py +128 -0
- arekit/contrib/source/download.py +41 -0
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +55 -0
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +74 -0
- arekit/contrib/source/nerel/io_utils.py +62 -0
- arekit/contrib/source/nerel/labels.py +241 -0
- arekit/contrib/source/nerel/reader.py +46 -0
- arekit/contrib/source/nerel/utils.py +24 -0
- arekit/contrib/source/nerel/versions.py +12 -0
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +62 -0
- arekit/contrib/source/nerelbio/labels.py +265 -0
- arekit/contrib/source/nerelbio/reader.py +8 -0
- arekit/contrib/source/nerelbio/versions.py +8 -0
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +36 -0
- arekit/contrib/source/ruattitudes/doc.py +51 -0
- arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
- arekit/contrib/source/ruattitudes/io_utils.py +56 -0
- arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
- arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
- arekit/contrib/source/ruattitudes/reader.py +268 -0
- arekit/contrib/source/ruattitudes/sentence.py +73 -0
- arekit/contrib/source/ruattitudes/synonyms.py +17 -0
- arekit/contrib/source/ruattitudes/text_object.py +59 -0
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +157 -0
- arekit/contrib/source/rusentiframes/effect.py +24 -0
- arekit/contrib/source/rusentiframes/io_utils.py +19 -0
- arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
- arekit/contrib/source/rusentiframes/polarity.py +35 -0
- arekit/contrib/source/rusentiframes/role.py +15 -0
- arekit/contrib/source/rusentiframes/state.py +24 -0
- arekit/contrib/source/rusentiframes/types.py +42 -0
- arekit/contrib/source/rusentiframes/value.py +2 -0
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +3 -0
- arekit/contrib/source/rusentrel/docs_reader.py +51 -0
- arekit/contrib/source/rusentrel/entities.py +26 -0
- arekit/contrib/source/rusentrel/io_utils.py +125 -0
- arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
- arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
- arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
- arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
- arekit/contrib/source/rusentrel/synonyms.py +17 -0
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +52 -0
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +31 -0
- arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
- arekit/contrib/source/sentinerel/io_utils.py +87 -0
- arekit/contrib/source/sentinerel/labels.py +53 -0
- arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
- arekit/contrib/source/sentinerel/reader.py +42 -0
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +19 -0
- arekit/contrib/source/zip_utils.py +47 -0
- arekit/contrib/utils/__init__.py +0 -0
- arekit/contrib/utils/bert/__init__.py +0 -0
- arekit/contrib/utils/bert/samplers.py +17 -0
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
- arekit/contrib/utils/data/__init__.py +0 -0
- arekit/contrib/utils/data/contents/__init__.py +0 -0
- arekit/contrib/utils/data/contents/opinions.py +37 -0
- arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +7 -0
- arekit/contrib/utils/data/readers/csv_pd.py +38 -0
- arekit/contrib/utils/data/readers/jsonl.py +15 -0
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +50 -0
- arekit/contrib/utils/data/storages/__init__.py +0 -0
- arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
- arekit/contrib/utils/data/storages/pandas_based.py +123 -0
- arekit/contrib/utils/data/storages/row_cache.py +48 -0
- arekit/contrib/utils/data/writers/__init__.py +0 -0
- arekit/contrib/utils/data/writers/base.py +27 -0
- arekit/contrib/utils/data/writers/csv_native.py +63 -0
- arekit/contrib/utils/data/writers/csv_pd.py +40 -0
- arekit/contrib/utils/data/writers/json_opennre.py +132 -0
- arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
- arekit/contrib/utils/download.py +77 -0
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +58 -0
- arekit/contrib/utils/embeddings/tokens.py +30 -0
- arekit/contrib/utils/entities/__init__.py +0 -0
- arekit/contrib/utils/entities/filter.py +7 -0
- arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit/contrib/utils/entities/formatters/str_display.py +11 -0
- arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
- arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit/contrib/utils/io_utils/embedding.py +72 -0
- arekit/contrib/utils/io_utils/opinions.py +37 -0
- arekit/contrib/utils/io_utils/samples.py +79 -0
- arekit/contrib/utils/io_utils/utils.py +39 -0
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +41 -0
- arekit/contrib/utils/lexicons/relation.py +42 -0
- arekit/contrib/utils/lexicons/rusentilex.py +37 -0
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +83 -0
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +22 -0
- arekit/contrib/utils/np_utils/npz_utils.py +13 -0
- arekit/contrib/utils/np_utils/vocab.py +20 -0
- arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
- arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
- arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
- arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
- arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
- arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
- arekit/contrib/utils/processing/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +12 -0
- arekit/contrib/utils/processing/languages/pos.py +23 -0
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
- arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
- arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
- arekit/contrib/utils/processing/languages/ru/number.py +23 -0
- arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +12 -0
- arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
- arekit/contrib/utils/processing/pos/russian.py +10 -0
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +127 -0
- arekit/contrib/utils/resources.py +25 -0
- arekit/contrib/utils/serializer.py +43 -0
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
- arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit/contrib/utils/synonyms/simple.py +15 -0
- arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +93 -0
- arekit/contrib/utils/vectorizers/random_norm.py +39 -0
- arekit/download_data.py +11 -0
- arekit-0.24.0.dist-info/LICENSE +21 -0
- arekit-0.24.0.dist-info/METADATA +23 -0
- arekit-0.24.0.dist-info/RECORD +374 -0
- arekit-0.24.0.dist-info/WHEEL +5 -0
- arekit-0.24.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
|
|
2
|
+
from arekit.common.entities.base import Entity
|
|
3
|
+
from arekit.common.frames.text_variant import TextFrameVariant
|
|
4
|
+
from arekit.contrib.networks.input.term_types import TermTypes
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class VectorizedNetworkTermMapping(OpinionContainingTextTermsMapper):
    """ Terms mapper which replaces every term with the output of the
        vectorizer registered for the type of that term.
    """

    def __init__(self, string_entities_formatter, vectorizers):
        """ string_entities_formatter:
                passed to the base mapper as `entity_formatter`; the string it
                produces for an entity is the value that gets vectorized.
            vectorizers: dict
                term type -> vectorizer; an entry is required for every type
                returned by `TermTypes.iter_types()`.
        """
        assert isinstance(vectorizers, dict)

        # Every known term type has to be covered by a vectorizer.
        missing_types = [t for t in TermTypes.iter_types() if t not in vectorizers]
        assert not missing_types

        super().__init__(entity_formatter=string_entities_formatter)

        self.__vectorizers = vectorizers

    def map_term(self, term_type, term):
        """Universal term mapping method.

        Args:
            term_type (TermTypes): The type of term to map; selects the vectorizer.
            term (str): The term to map.

        Returns:
            Whatever `create_term_embedding` of the selected vectorizer returns.
        """
        vectorizer = self.__vectorizers[term_type]
        return vectorizer.create_term_embedding(term=term)

    def map_word(self, w_ind, word):
        """ Vectorizes a plain word; `w_ind` is not used here.
        """
        return self.map_term(TermTypes.WORD, word)

    def map_text_frame_variant(self, fv_ind, text_frame_variant):
        """ Vectorizes the value of the variant behind the text frame variant;
            `fv_ind` is not used here.
        """
        assert isinstance(text_frame_variant, TextFrameVariant)
        variant_value = text_frame_variant.Variant.get_value()
        return self.map_term(TermTypes.FRAME, variant_value)

    def map_token(self, t_ind, token):
        """ Vectorizes the token value; `t_ind` is not used here.
            It assumes to be composed for all the supported types.
        """
        token_value = token.get_token_value()
        return self.map_term(TermTypes.TOKEN, token_value)

    def map_entity(self, e_ind, entity):
        """ Formats the entity into a string by means of the base mapper
            and then vectorizes that string.
        """
        assert isinstance(entity, Entity)

        # Value extraction
        entity_str = super().map_entity(e_ind=e_ind, entity=entity)

        # Vector extraction
        return self.map_term(TermTypes.ENTITY, entity_str)
|
|
File without changes
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from arekit.common.data import const
|
|
2
|
+
from arekit.common.data.input.providers.sample.cropped import CroppedSampleRowProvider
|
|
3
|
+
from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
|
|
4
|
+
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PromptedSampleRowProvider(CroppedSampleRowProvider):
    """ Sample row provider which wraps the sample text into a prompt template.
    """

    def __init__(self, crop_window_size, label_scaler, text_provider, prompt, label_fmt=None):
        """ crop_window_size: int
                crop window size for the original text.
            prompt: str
                `str.format` template wrapping the original (optionally cropped) text.
                The following keyword parameters are passed to it (all optional
                from the template side):
                    text, s_ind, t_ind, s_val, t_val, label_uint, label_val
            label_fmt: StringLabelsFormatter or None
                when given, it is used to compose `label_val` for the prompt.
        """
        assert isinstance(prompt, str)
        assert isinstance(text_provider, BaseSingleTextProvider)
        assert label_fmt is None or isinstance(label_fmt, StringLabelsFormatter)

        super().__init__(crop_window_size=crop_window_size,
                         label_scaler=label_scaler,
                         text_provider=text_provider)

        self.__prompt = prompt
        self.__labels_fmt = label_fmt

    def _fill_row_core(self, row, text_opinion_linkage, index_in_linked, etalon_label,
                       parsed_doc, sentence_ind, s_ind, t_ind):
        """ Fills the row by means of the base provider first, and then
            applies the text composed from the prompt template onto the row.
        """
        super()._fill_row_core(row=row,
                               text_opinion_linkage=text_opinion_linkage,
                               index_in_linked=index_in_linked,
                               etalon_label=etalon_label,
                               parsed_doc=parsed_doc,
                               sentence_ind=sentence_ind,
                               s_ind=s_ind,
                               t_ind=t_ind)

        # Text that has been written into the row by the base provider.
        original_text = row[BaseSingleTextProvider.TEXT_A]

        sentence_terms, actual_s_ind, actual_t_ind = self._provide_sentence_terms(
            parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)

        # The label may be absent in the row.
        if const.LABEL_UINT in row:
            label_uint = row[const.LABEL_UINT]
        else:
            label_uint = None

        # Without the label or the formatter we fall back to the plain string of the value.
        if label_uint is None or self.__labels_fmt is None:
            label_val = str(label_uint)
        else:
            label = self._label_provider.LabelScaler.uint_to_label(label_uint)
            label_val = self.__labels_fmt.label_to_str(label)

        prompted_text = self.__prompt.format(
            text=original_text,
            s_ind=row[const.S_IND],
            t_ind=row[const.T_IND],
            s_val=sentence_terms[actual_s_ind].DisplayValue,
            t_val=sentence_terms[actual_t_ind].DisplayValue,
            label_uint=label_uint,
            label_val=label_val)

        self._apply_row_data(row=row, vm={const.TEXT: prompted_text}, val_fmt=self._val_fmt)
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from arekit.contrib.source.brat.entities.entity import BratEntity
|
|
2
|
+
from arekit.contrib.source.brat.relation import BratRelation
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BratAnnotationParser:
    """ Parser of the tab-separated BRAT annotation records:
        entities (ids prefixed with `T`) and relations (ids prefixed with `R`).
    """

    # Keys of the dictionary returned by `parse_annotations`.
    ENTITIES = "entities"
    RELATIONS = "relations"

    @staticmethod
    def __non_prefixed_id(value):
        """ Drops the leading one-character prefix of the identifier, i.e. "T12" -> "12".
        """
        assert (isinstance(value, str))
        return value[1:]

    @staticmethod
    def handle_entity(args):
        """ Composes an entity from the tab-separated fields of the record.

            Examples of the records:
                T2 Location 10 23 South America
                T1 Location 0 5;16 23 North America

            args: list
                exactly three fields: identifier, "<type> <begin> <end>", value.
            returns: BratEntity or None
                None in the case of the non-continuous mention (second example).
        """
        assert(len(args) == 3)

        e_id = int(BratAnnotationParser.__non_prefixed_id(args[0]))
        entity_params = args[1].split()

        if len(entity_params) != 3:
            # We do not support the case of a non-continuous entity mentions.
            return None

        e_str_type, e_begin, e_end = entity_params

        return BratEntity(id_in_doc=e_id,
                          e_type=e_str_type,
                          index_begin=int(e_begin),
                          index_end=int(e_end),
                          childs=None,
                          value=args[2].strip())

    @staticmethod
    def handle_relation(args):
        """ Composes a relation from the tab-separated fields of the record.

            Example:
                R1 Origin Arg1:T3 Arg2:T4

            NOTE: relation identifier is kept as string (without prefix),
            while source and target identifiers are converted to int.
        """

        # Parse identifier index.
        e_id = args[0][1:]

        # Parse relation arguments.
        rel_type, source, target = args[1].split()

        source_id = source.split(':')[1]
        target_id = target.split(':')[1]

        return BratRelation(id_in_doc=e_id,
                            source_id=int(BratAnnotationParser.__non_prefixed_id(source_id)),
                            target_id=int(BratAnnotationParser.__non_prefixed_id(target_id)),
                            rel_type=rel_type)

    @staticmethod
    def parse_annotations(input_file, encoding='utf-8'):
        """ Read annotation collection from file

            input_file:
                object with `readlines()`; lines could be either bytes
                (decoded with `encoding`) or already decoded strings.
            encoding: str
                encoding applied to the lines provided as bytes.
            returns: dict
                {ENTITIES: list of entities, RELATIONS: list of relations};
                records of the other types are skipped.
        """
        entities = []
        relations = []

        for line in input_file.readlines():

            # Streams opened in text mode already provide decoded lines.
            if isinstance(line, (bytes, bytearray)):
                line = line.decode(encoding)

            args = line.split('\t')

            # No identifier (empty line or line that starts with tab):
            # there is no record type to dispatch on.
            if not args[0]:
                continue

            record_type = args[0][0]

            # Entities (objects) are prefixed with `T`
            if record_type == "T":
                entity = BratAnnotationParser.handle_entity(args)
                if entity is not None:
                    entities.append(entity)

            elif record_type == "R":
                relations.append(BratAnnotationParser.handle_relation(args))

        return {
            BratAnnotationParser.ENTITIES: entities,
            BratAnnotationParser.RELATIONS: relations
        }
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from arekit.common.docs.base import Document
|
|
2
|
+
from arekit.contrib.source.brat.entities.entity import BratEntity
|
|
3
|
+
from arekit.contrib.source.brat.sentence import BratSentence
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BratDocument(Document):
    """ Document composed of BRAT sentences, which additionally keeps
        text relations and provides access to the entities by their identifiers.
    """

    def __init__(self, doc_id, sentences, text_relations):
        """ sentences:
                iterable of BratSentence instances.
            text_relations: list or None
                relations of the document; None is treated as "no relations".
        """
        assert(isinstance(text_relations, list) or text_relations is None)
        super(BratDocument, self).__init__(doc_id=doc_id, sentences=sentences)
        self.__text_relations = text_relations

        # Index of the entities mentioned in sentences: entity.ID -> entity.
        self.__entity_by_id = {}
        for sentence in sentences:
            assert(isinstance(sentence, BratSentence))
            for brat_entity, _ in sentence.iter_entity_with_local_bounds():
                assert(isinstance(brat_entity, BratEntity))
                self.__entity_by_id[brat_entity.ID] = brat_entity

    @property
    def Relations(self):
        """ Iterates over the relations of the document
            (nothing is yielded when relations were not provided).
        """
        # Constructor accepts None, which is not iterable.
        if self.__text_relations is None:
            return

        for brat_relation in self.__text_relations:
            yield brat_relation

    def contains_entity(self, entity_id):
        """ Whether the entity with such identifier is mentioned in sentences.
        """
        return entity_id in self.__entity_by_id

    def get_entity_by_id(self, entity_id):
        """ Returns entity by its identifier; raises KeyError for the unknown one.
        """
        return self.__entity_by_id[entity_id]
|
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from arekit.contrib.source.brat.entities.entity import BratEntity
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BratCompoundEntity(BratEntity):
    """ Entity that holds other (nested) entities as its children.
    """

    @classmethod
    def from_list(cls, root, childs):
        """ Alternate constructor: copies identifier, value, type and bounds
            from `root` and attaches the non-empty list of `childs`.
        """
        assert isinstance(root, BratEntity)
        assert isinstance(childs, list)
        assert len(childs) > 0

        init_params = dict(id_in_doc=root.ID,
                           value=root.Value,
                           e_type=root.Type,
                           childs=childs,
                           index_begin=root.IndexBegin,
                           index_end=root.IndexEnd)

        return cls(**init_params)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from arekit.common.entities.base import Entity
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BratEntity(Entity):
    """ Entity annotated in a Brat-based collection.
        In addition to the base entity it keeps the identifier in document
        and the bounds of the mention in the related sentence.
    """

    def __init__(self, id_in_doc, e_type, index_begin, index_end, value, childs, display_value=None, group_index=None):
        """ id_in_doc:
                identifier of the entity in document.
            index_begin: int
                - char index (in case of string type of `text`)
                - term index (in case of list type of `text`)
            index_end: int
                - char index (in case of string type of `text`)
                - term index (in case of list type of `text`)
        """
        assert isinstance(e_type, str)
        assert isinstance(index_begin, int)
        assert isinstance(index_end, int)

        super().__init__(value=value,
                         e_type=e_type,
                         childs=childs,
                         display_value=display_value,
                         group_index=group_index)

        self.__id = id_in_doc
        self.__e_type = e_type
        self.__begin = index_begin
        self.__end = index_end

    @property
    def IndexBegin(self):
        """ Begin bound passed on initialization. """
        return self.__begin

    @property
    def IndexEnd(self):
        """ End bound passed on initialization. """
        return self.__end

    @property
    def Type(self):
        """ String type of the entity. """
        return self.__e_type

    @property
    def ID(self):
        """ Identifier of the entity in document. """
        return self.__id
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from arekit.common.docs.objects_parser import SentenceObjectsParserPipelineItem
|
|
2
|
+
from arekit.common.pipeline.context import PipelineContext
|
|
3
|
+
from arekit.common.text.partitioning.str import StringPartitioning
|
|
4
|
+
from arekit.common.text.partitioning.terms import TermsPartitioning
|
|
5
|
+
from arekit.contrib.source.brat.sentence import BratSentence
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BratTextEntitiesParser(SentenceObjectsParserPipelineItem):
    """ Sentence objects parser for BRAT sentences: provides the text of
        the sentence kept in the pipeline context, and the entities of
        that sentence along with their sentence-local bounds.
    """

    # Name under which the BratSentence is expected in the pipeline context.
    KEY = "sentence"

    # NOTE: Supported partitionings.
    # BRAT annotation describes the entity ends over the string input,
    # i.e. as `char-ind-begin` and `char-ind-end`. The same scheme could
    # be applied to the list of terms, in which case the ends are
    # the `ind-begin` and `ind-end` indices of the list.
    __supported_partitionings = {
        "string": StringPartitioning(),
        "terms": TermsPartitioning()
    }

    def __init__(self, partitioning="string"):
        """ partitioning: str
                name of the partitioning: "string" or "terms".
        """
        assert isinstance(partitioning, str)
        selected = self.__supported_partitionings[partitioning]
        super().__init__(selected)

    # region protected methods

    def _get_text(self, pipeline_ctx):
        """ Returns the text of the sentence from the context. """
        return self.__get_sentence(pipeline_ctx).Text

    def _get_parts_provider_func(self, input_data, pipeline_ctx):
        """ Returns the iterator of the sentence entities with bounds.
            The `input_data` parameter is not used here.
        """
        return self.__iter_subs_values_with_bounds(
            self.__get_sentence(pipeline_ctx))

    # endregion

    # region private methods

    def __get_sentence(self, pipeline_ctx):
        """ Provides the value kept in the context under the `KEY`. """
        assert isinstance(pipeline_ctx, PipelineContext)
        assert self.KEY in pipeline_ctx
        return pipeline_ctx.provide(self.KEY)

    @staticmethod
    def __iter_subs_values_with_bounds(sentence):
        """ sentence: BratSentence """
        assert isinstance(sentence, BratSentence)
        return sentence.iter_entity_with_local_bounds()

    # endregion
|
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
2
|
+
from arekit.common.text_opinions.base import TextOpinion
|
|
3
|
+
from arekit.contrib.source.brat.relation import BratRelation
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BratRelationConverter(object):
    """ Conversion of the BRAT relations into text opinions.
    """

    @staticmethod
    def to_text_opinion(brat_relation, doc_id, label_formatter):
        """ Composes the TextOpinion of the document `doc_id`
            out of the given relation.

            brat_relation: BratRelation
                its ID is converted to int and used as the opinion id;
                its type is converted to label by `label_formatter`.
            doc_id:
                identifier of the document, passed as is.
            label_formatter: StringLabelsFormatter
        """
        assert isinstance(brat_relation, BratRelation)
        assert isinstance(label_formatter, StringLabelsFormatter)

        opinion_id = int(brat_relation.ID)
        source_id = brat_relation.SourceID
        target_id = brat_relation.TargetID
        label = label_formatter.str_to_label(brat_relation.Type)

        return TextOpinion(doc_id=doc_id,
                           text_opinion_id=opinion_id,
                           source_id=source_id,
                           target_id=target_id,
                           label=label)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
class BratRelation(object):
    """ Relation of the BRAT annotation: identifier within the document,
        identifiers of the source and target, and the relation type.
    """

    def __init__(self, id_in_doc, source_id, target_id, rel_type):
        """ id_in_doc: str
            source_id: int
            target_id: int
            rel_type: str
        """
        expected_types = (
            (id_in_doc, str),
            (source_id, int),
            (target_id, int),
            (rel_type, str),
        )
        for value, expected_type in expected_types:
            assert isinstance(value, expected_type)

        self.__id = id_in_doc
        self.__source_id = source_id
        self.__target_id = target_id
        self.__rel_type = rel_type

    @property
    def ID(self):
        """ Identifier of the relation within the document (str). """
        return self.__id

    @property
    def Type(self):
        """ Type of the relation (str). """
        return self.__rel_type

    @property
    def SourceID(self):
        """ Arg0.
        """
        return self.__source_id

    @property
    def TargetID(self):
        """ Arg1.
        """
        return self.__target_id
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from functools import cmp_to_key
|
|
2
|
+
|
|
3
|
+
from arekit.common.bound import Bound
|
|
4
|
+
from arekit.common.docs.sentence import BaseDocumentSentence
|
|
5
|
+
from arekit.contrib.source.brat.entities.compound import BratCompoundEntity
|
|
6
|
+
from arekit.contrib.source.brat.entities.entity import BratEntity
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BratSentence(BaseDocumentSentence):
    """ Raw sentence of the BRAT annotation.
        Keeps the text (passed to the base class), the index the sentence
        begins at, and the list of entities related to the sentence.
    """

    def __init__(self, text, index_begin, entities):
        """ text: str or list
            entities: list of BratEntities
            index_begin: int
                - char index (in case of string type of `text`)
                - term index (in case of list type of `text`)
        """
        assert isinstance(text, (str, list))
        assert isinstance(index_begin, int)
        assert isinstance(entities, list)
        super().__init__(text=text)
        self.__index_begin = index_begin
        self.__entities = entities

    @staticmethod
    def cmp_entities(a, b):
        """ Comparator of entities: the one which begins earlier goes
            first; in case of the same beginning, the longer one goes first.
        """
        assert isinstance(a, BratEntity)
        assert isinstance(b, BratEntity)

        begin_diff = a.IndexBegin - b.IndexBegin
        if begin_diff != 0:
            # Ordered by appearance.
            return begin_diff

        # Same beginning: ordered by length, longer first.
        return (b.IndexEnd - b.IndexBegin) - (a.IndexEnd - a.IndexBegin)

    def iter_entity_with_local_bounds(self):
        """ Iterates over pairs (entity, bound), where bound is counted
            from the beginning of the sentence.

            The list of the sentence entities is sorted in place with
            `cmp_entities`. An entity whose bound intersects with the bound
            of the latest group joins that group, and the bound of the
            group becomes the result of `Bound.intersect` for these two
            bounds. A group of several entities is provided as
            BratCompoundEntity, with the first entity as a root.
        """
        self.__entities.sort(key=cmp_to_key(self.cmp_entities))

        groups = []

        # Merging nested entities.
        for entity in self.__entities:
            local_begin = entity.IndexBegin - self.__index_begin
            local_end = entity.IndexEnd - self.__index_begin
            bound = Bound(pos=local_begin, length=local_end - local_begin)

            if groups and bound.itersects_with(groups[-1][0]):
                # Extend the latest group.
                prev_bound, members = groups[-1]
                members.append(entity)
                groups[-1] = (bound.intersect(prev_bound), members)
            else:
                # Start a new group.
                groups.append((bound, [entity]))

        # Returning result.
        for bound, members in groups:
            if len(members) == 1:
                entity = members[0]
            else:
                entity = BratCompoundEntity.from_list(root=members[0], childs=members[1:])
            yield entity, bound
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
from arekit.common.entities.collection import EntityCollection
|
|
2
|
+
from arekit.contrib.source.brat.entities.entity import BratEntity
|
|
3
|
+
from arekit.contrib.source.brat.sentence import BratSentence
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BratDocumentSentencesReader(object):
    """ Composes the list of BratSentence instances out of the sentences
        data and the collection of entities.

        Sentence data is a dictionary with the following keys:
        "text", "ind_begin", "ind_end" (index of the last element).
    """

    @staticmethod
    def from_file(input_file, entities, line_handler=None, skip_entity_func=None):
        """ input_file:
                object which provides `readlines()` with lines in bytes
                (every line is decoded as utf-8).
            entities: EntityCollection
            line_handler: callable or None
                maps the line to another one of the same length.
            skip_entity_func: callable or None
                entities for which it returns True are not assigned
                to sentences.
            returns: list of BratSentence
        """
        assert isinstance(entities, EntityCollection)
        assert skip_entity_func is None or callable(skip_entity_func)

        sentences_data = BratDocumentSentencesReader._parse_sentences(
            input_file=input_file, line_handler=line_handler)

        return BratDocumentSentencesReader.from_sentences_data(
            entities=entities,
            sentences_data=sentences_data,
            skip_entity_func=skip_entity_func)

    @staticmethod
    def from_sentences_data(entities, sentences_data, skip_entity_func=None):
        """ Same as `from_file`, but for the already parsed sentences data.
            returns: list of BratSentence
        """
        assert isinstance(entities, EntityCollection)

        sentence_entities = BratDocumentSentencesReader._parse_entities(
            sentences_data=sentences_data,
            entities=entities,
            skip_entity_func=skip_entity_func)

        # Convert all the content to brat sentences.
        return [BratSentence(text=s_dict["text"],
                             index_begin=s_dict["ind_begin"],
                             entities=s_entities)
                for s_dict, s_entities in zip(sentences_data, sentence_entities)]

    @staticmethod
    def __is_sentence_contains(sentence_data, entity):
        """ Whether the entity is completely inside of the sentence bounds. """
        assert isinstance(sentence_data, dict)
        assert isinstance(entity, BratEntity)
        starts_inside = entity.IndexBegin >= sentence_data["ind_begin"]
        return starts_inside and entity.IndexEnd <= sentence_data["ind_end"]

    @staticmethod
    def _parse_entities(sentences_data, entities, skip_entity_func):
        """ Distributes entities between sentences.
            Sentences is a list of json-like data (dictionaries).
            returns: list of lists, i.e. entities for every sentence.
        """
        assert isinstance(sentences_data, list)
        assert isinstance(entities, EntityCollection)

        entities_in_sentences = [[] for _ in sentences_data]

        s_ind, e_ind = 0, 0

        # Two-pointers walk: every step moves to the next sentence
        # or to the next entity.
        while s_ind < len(sentences_data) and e_ind < len(entities):
            e = entities.get_entity_by_index(e_ind)
            assert isinstance(e, BratEntity)

            s = sentences_data[s_ind]

            if e.IndexBegin > s["ind_end"]:
                # Entity goes after the current sentence.
                s_ind += 1
            elif skip_entity_func is not None and skip_entity_func(e):
                # Entity is asked to be skipped.
                e_ind += 1
            elif BratDocumentSentencesReader.__is_sentence_contains(sentence_data=s, entity=e):
                entities_in_sentences[s_ind].append(e)
                e_ind += 1
            elif e.IndexEnd > s["ind_end"]:
                # Intersects with the right border of sentence.
                s_ind += 1
            elif e.IndexBegin < s["ind_begin"]:
                # Intersects with the left border of sentence.
                e_ind += 1
            else:
                raise Exception("e_i:{} e:('{}',{},{}), s_i:{}, s_b: [{} {}]".format(
                    e_ind,
                    e.Value, e.IndexBegin, e.IndexEnd,
                    s_ind,
                    s["ind_begin"], s["ind_end"]))

        return entities_in_sentences

    @staticmethod
    def _parse_sentences(input_file, line_handler):
        """ Reads lines of the file and composes sentences data.
            Indices are counted in characters of the decoded lines;
            lines equal to '\\r\\n' are not treated as sentences, however
            they still shift the indices of the lines that follow.
        """
        assert line_handler is None or callable(line_handler)

        sentences = []
        line_start = 0

        for raw_line in input_file.readlines():
            line = raw_line.decode('utf-8')
            handled_line = line if line_handler is None else line_handler(line)

            # Handler must keep the length, otherwise indices become shifted.
            assert len(line) == len(handled_line)

            next_start = line_start + len(handled_line)

            if handled_line != '\r\n':
                sentences.append({"text": handled_line,
                                  "ind_begin": line_start,
                                  "ind_end": next_start - 1})

            line_start = next_start

        return sentences
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from os.path import join
|
|
2
|
+
|
|
3
|
+
from arekit.common import utils
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def download():
    """ Downloads every archive listed below into the default
        download directory, one by one.
    """
    target_dir = utils.get_default_download_dir()

    # Name of the local file -> link to download it from.
    resources = {
        # RuSentiLex
        "rusentilex.zip": "https://www.dropbox.com/s/bdsl3kney30y45z/rusentilex.zip?dl=1",
        # RuSentRel-v1.1
        "rusentrel-v1_1.zip": "https://www.dropbox.com/s/6aw5jv84jf5hrl2/rusentrel-v1_1.zip?dl=1",
        # RuSentiFrames
        "rusentiframes-v1_0.zip": "https://www.dropbox.com/s/zvkis77li3f40bm/rusentiframes-v1_0.zip?dl=1",
        "rusentiframes-v2_0.zip": "https://www.dropbox.com/s/slbyma7eudmmugp/rusentiframes-v2_0.zip?dl=1",
        # RuAttitudes-v1.0 (Many variations)
        "ruattitudes-dbg.zip": "https://www.dropbox.com/s/5lmqw9kyb4tfm94/ruattitudes-dbg.zip?dl=1",
        "ruattitudes-v1_0.zip": "https://www.dropbox.com/s/wg6oa447msdytj3/ruattitudes-v1_0.zip?dl=1",
        "ruattitudes-v1_1.zip": "https://www.dropbox.com/s/e3menx5iqyush19/ruattitudes-v1_1.zip?dl=1",
        # RuAttitudes-v2.0 Base
        "ruattitudes-v2_0_base.zip": "https://www.dropbox.com/s/y39vqzzjumqhce1/ruattitudes_20_base.zip?dl=1",
        "ruattitudes-v2_0_base_neut.zip": "https://www.dropbox.com/s/3xh7gd004oyuwx5/ruattitudes_20_base_neut.zip?dl=1",
        # RuAttitudes-v2.0 Large
        "ruattitudes-v2_0_large.zip": "https://www.dropbox.com/s/43iqoxlyh38qk8u/ruattitudes_20_large.zip?dl=1",
        "ruattitudes-v2_0_large_neut.zip": "https://www.dropbox.com/s/6edqsxehtus4c61/ruattitudes_20_large_neut.zip?dl=1",
        # SentiNEREL
        "sentinerel-v1_0.zip": "https://www.dropbox.com/s/<HIDDEN>/sentinerel-v1_0.zip?dl=1",
        "sentinerel-v2_0.zip": "https://www.dropbox.com/s/<HIDDEN>/sentinerel-v2_0.zip?dl=1",
        "sentinerel-v2_1.zip": "https://www.dropbox.com/s/<HIDDEN>/sentinerel-v2_1.zip?dl=1",
        # NEREL
        "nerel-v1_0.zip": "https://www.dropbox.com/scl/fi/vegk0aczjdm9km410loqv/nerel-v1_0.zip?rlkey=wv0ut86n3x5ao6xabsaxd7lh7&dl=1",
        "nerel-v1_1.zip": "https://www.dropbox.com/scl/fi/oaytj0rvx7vhdxjk98x7g/nerel-v1_1.zip?rlkey=klrq0l5rpn10cf7e2swkay6r4&dl=1",
        # NEREL-BIO
        "nerel-bio-v1_0.zip": "https://www.dropbox.com/scl/fi/nltuulfixbkhg3raczash/nerel-bio-v1_0.zip?rlkey=86uizq1hbkgkx302c5p5znpp6&dl=1"
    }

    # Perform downloading ...
    for filename, link in resources.items():
        destination = join(target_dir, filename)
        utils.download(dest_file_path=destination, source_url=link)
|
|
File without changes
|