arekit-0.24.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/__init__.py +0 -0
- arekit/common/__init__.py +0 -0
- arekit/common/bound.py +48 -0
- arekit/common/context/__init__.py +0 -0
- arekit/common/context/terms_mapper.py +51 -0
- arekit/common/context/token.py +16 -0
- arekit/common/data/__init__.py +0 -0
- arekit/common/data/const.py +21 -0
- arekit/common/data/doc_provider.py +6 -0
- arekit/common/data/input/__init__.py +0 -0
- arekit/common/data/input/providers/__init__.py +0 -0
- arekit/common/data/input/providers/columns/__init__.py +0 -0
- arekit/common/data/input/providers/columns/base.py +9 -0
- arekit/common/data/input/providers/columns/sample.py +59 -0
- arekit/common/data/input/providers/const.py +3 -0
- arekit/common/data/input/providers/contents.py +9 -0
- arekit/common/data/input/providers/instances/__init__.py +0 -0
- arekit/common/data/input/providers/instances/base.py +14 -0
- arekit/common/data/input/providers/instances/multiple.py +27 -0
- arekit/common/data/input/providers/instances/single.py +8 -0
- arekit/common/data/input/providers/label/__init__.py +0 -0
- arekit/common/data/input/providers/label/base.py +24 -0
- arekit/common/data/input/providers/label/binary.py +11 -0
- arekit/common/data/input/providers/label/multiple.py +15 -0
- arekit/common/data/input/providers/rows/__init__.py +0 -0
- arekit/common/data/input/providers/rows/base.py +64 -0
- arekit/common/data/input/providers/rows/samples.py +227 -0
- arekit/common/data/input/providers/sample/__init__.py +0 -0
- arekit/common/data/input/providers/sample/cropped.py +43 -0
- arekit/common/data/input/providers/text/__init__.py +0 -0
- arekit/common/data/input/providers/text/single.py +49 -0
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +68 -0
- arekit/common/data/input/repositories/sample.py +22 -0
- arekit/common/data/input/sample.py +66 -0
- arekit/common/data/input/terms_mapper.py +88 -0
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/__init__.py +0 -0
- arekit/common/data/storages/base.py +109 -0
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +26 -0
- arekit/common/docs/__init__.py +0 -0
- arekit/common/docs/base.py +30 -0
- arekit/common/docs/entities_grouping.py +16 -0
- arekit/common/docs/entity.py +18 -0
- arekit/common/docs/objects_parser.py +37 -0
- arekit/common/docs/parsed/__init__.py +0 -0
- arekit/common/docs/parsed/base.py +101 -0
- arekit/common/docs/parsed/providers/__init__.py +0 -0
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/docs/parsed/providers/base_pairs.py +51 -0
- arekit/common/docs/parsed/providers/entity_service.py +175 -0
- arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
- arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parsed/term_position.py +42 -0
- arekit/common/docs/parser.py +34 -0
- arekit/common/docs/sentence.py +14 -0
- arekit/common/entities/__init__.py +0 -0
- arekit/common/entities/base.py +51 -0
- arekit/common/entities/collection.py +72 -0
- arekit/common/entities/str_fmt.py +8 -0
- arekit/common/entities/types.py +9 -0
- arekit/common/experiment/__init__.py +0 -0
- arekit/common/experiment/api/__init__.py +0 -0
- arekit/common/experiment/api/base_samples_io.py +20 -0
- arekit/common/experiment/data_type.py +17 -0
- arekit/common/frames/__init__.py +0 -0
- arekit/common/frames/connotations/__init__.py +0 -0
- arekit/common/frames/connotations/descriptor.py +17 -0
- arekit/common/frames/connotations/provider.py +4 -0
- arekit/common/frames/text_variant.py +43 -0
- arekit/common/frames/variants/__init__.py +0 -0
- arekit/common/frames/variants/base.py +21 -0
- arekit/common/frames/variants/collection.py +60 -0
- arekit/common/labels/__init__.py +0 -0
- arekit/common/labels/base.py +19 -0
- arekit/common/labels/provider/__init__.py +0 -0
- arekit/common/labels/provider/base.py +7 -0
- arekit/common/labels/provider/constant.py +14 -0
- arekit/common/labels/scaler/__init__.py +0 -0
- arekit/common/labels/scaler/base.py +85 -0
- arekit/common/labels/scaler/sentiment.py +7 -0
- arekit/common/labels/scaler/single.py +10 -0
- arekit/common/labels/str_fmt.py +55 -0
- arekit/common/linkage/__init__.py +0 -0
- arekit/common/linkage/base.py +44 -0
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +9 -0
- arekit/common/linkage/text_opinions.py +22 -0
- arekit/common/log_utils.py +29 -0
- arekit/common/model/__init__.py +0 -0
- arekit/common/model/labeling/__init__.py +0 -0
- arekit/common/model/labeling/base.py +24 -0
- arekit/common/model/labeling/modes.py +8 -0
- arekit/common/model/labeling/single.py +24 -0
- arekit/common/opinions/__init__.py +0 -0
- arekit/common/opinions/annot/__init__.py +0 -0
- arekit/common/opinions/annot/algo/__init__.py +0 -0
- arekit/common/opinions/annot/algo/base.py +4 -0
- arekit/common/opinions/annot/algo/pair_based.py +99 -0
- arekit/common/opinions/annot/algo/predefined.py +16 -0
- arekit/common/opinions/annot/algo_based.py +55 -0
- arekit/common/opinions/annot/base.py +15 -0
- arekit/common/opinions/base.py +74 -0
- arekit/common/opinions/collection.py +150 -0
- arekit/common/opinions/enums.py +6 -0
- arekit/common/opinions/provider.py +4 -0
- arekit/common/opinions/writer.py +4 -0
- arekit/common/pipeline/__init__.py +0 -0
- arekit/common/pipeline/base.py +25 -0
- arekit/common/pipeline/context.py +36 -0
- arekit/common/pipeline/conts.py +2 -0
- arekit/common/pipeline/items/__init__.py +0 -0
- arekit/common/pipeline/items/base.py +12 -0
- arekit/common/pipeline/items/flatten.py +14 -0
- arekit/common/pipeline/items/handle.py +17 -0
- arekit/common/pipeline/items/iter.py +11 -0
- arekit/common/pipeline/items/map.py +11 -0
- arekit/common/pipeline/items/map_nested.py +13 -0
- arekit/common/synonyms/__init__.py +0 -0
- arekit/common/synonyms/base.py +151 -0
- arekit/common/synonyms/grouping.py +21 -0
- arekit/common/text/__init__.py +0 -0
- arekit/common/text/enums.py +12 -0
- arekit/common/text/parsed.py +42 -0
- arekit/common/text/parser.py +12 -0
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +4 -0
- arekit/common/text/partitioning/str.py +36 -0
- arekit/common/text/partitioning/terms.py +35 -0
- arekit/common/text/stemmer.py +16 -0
- arekit/common/text_opinions/__init__.py +0 -0
- arekit/common/text_opinions/base.py +105 -0
- arekit/common/utils.py +129 -0
- arekit/contrib/__init__.py +0 -0
- arekit/contrib/bert/__init__.py +0 -0
- arekit/contrib/bert/input/__init__.py +0 -0
- arekit/contrib/bert/input/providers/__init__.py +0 -0
- arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
- arekit/contrib/bert/input/providers/text_pair.py +62 -0
- arekit/contrib/bert/terms/__init__.py +0 -0
- arekit/contrib/bert/terms/mapper.py +20 -0
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +149 -0
- arekit/contrib/networks/embedding_io.py +18 -0
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +6 -0
- arekit/contrib/networks/input/ctx_serialization.py +28 -0
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +29 -0
- arekit/contrib/networks/input/embedding/offsets.py +55 -0
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +129 -0
- arekit/contrib/networks/input/providers/term_connotation.py +23 -0
- arekit/contrib/networks/input/providers/text.py +24 -0
- arekit/contrib/networks/input/rows_parser.py +47 -0
- arekit/contrib/networks/input/term_types.py +13 -0
- arekit/contrib/networks/input/terms_mapping.py +60 -0
- arekit/contrib/networks/vectorizer.py +6 -0
- arekit/contrib/prompt/__init__.py +0 -0
- arekit/contrib/prompt/sample.py +61 -0
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +84 -0
- arekit/contrib/source/brat/doc.py +28 -0
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +13 -0
- arekit/contrib/source/brat/entities/entity.py +42 -0
- arekit/contrib/source/brat/entities/parser.py +53 -0
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +19 -0
- arekit/contrib/source/brat/relation.py +32 -0
- arekit/contrib/source/brat/sentence.py +69 -0
- arekit/contrib/source/brat/sentences_reader.py +128 -0
- arekit/contrib/source/download.py +41 -0
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +55 -0
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +74 -0
- arekit/contrib/source/nerel/io_utils.py +62 -0
- arekit/contrib/source/nerel/labels.py +241 -0
- arekit/contrib/source/nerel/reader.py +46 -0
- arekit/contrib/source/nerel/utils.py +24 -0
- arekit/contrib/source/nerel/versions.py +12 -0
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +62 -0
- arekit/contrib/source/nerelbio/labels.py +265 -0
- arekit/contrib/source/nerelbio/reader.py +8 -0
- arekit/contrib/source/nerelbio/versions.py +8 -0
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +36 -0
- arekit/contrib/source/ruattitudes/doc.py +51 -0
- arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
- arekit/contrib/source/ruattitudes/io_utils.py +56 -0
- arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
- arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
- arekit/contrib/source/ruattitudes/reader.py +268 -0
- arekit/contrib/source/ruattitudes/sentence.py +73 -0
- arekit/contrib/source/ruattitudes/synonyms.py +17 -0
- arekit/contrib/source/ruattitudes/text_object.py +59 -0
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +157 -0
- arekit/contrib/source/rusentiframes/effect.py +24 -0
- arekit/contrib/source/rusentiframes/io_utils.py +19 -0
- arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
- arekit/contrib/source/rusentiframes/polarity.py +35 -0
- arekit/contrib/source/rusentiframes/role.py +15 -0
- arekit/contrib/source/rusentiframes/state.py +24 -0
- arekit/contrib/source/rusentiframes/types.py +42 -0
- arekit/contrib/source/rusentiframes/value.py +2 -0
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +3 -0
- arekit/contrib/source/rusentrel/docs_reader.py +51 -0
- arekit/contrib/source/rusentrel/entities.py +26 -0
- arekit/contrib/source/rusentrel/io_utils.py +125 -0
- arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
- arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
- arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
- arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
- arekit/contrib/source/rusentrel/synonyms.py +17 -0
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +52 -0
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +31 -0
- arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
- arekit/contrib/source/sentinerel/io_utils.py +87 -0
- arekit/contrib/source/sentinerel/labels.py +53 -0
- arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
- arekit/contrib/source/sentinerel/reader.py +42 -0
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +19 -0
- arekit/contrib/source/zip_utils.py +47 -0
- arekit/contrib/utils/__init__.py +0 -0
- arekit/contrib/utils/bert/__init__.py +0 -0
- arekit/contrib/utils/bert/samplers.py +17 -0
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
- arekit/contrib/utils/data/__init__.py +0 -0
- arekit/contrib/utils/data/contents/__init__.py +0 -0
- arekit/contrib/utils/data/contents/opinions.py +37 -0
- arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +7 -0
- arekit/contrib/utils/data/readers/csv_pd.py +38 -0
- arekit/contrib/utils/data/readers/jsonl.py +15 -0
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +50 -0
- arekit/contrib/utils/data/storages/__init__.py +0 -0
- arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
- arekit/contrib/utils/data/storages/pandas_based.py +123 -0
- arekit/contrib/utils/data/storages/row_cache.py +48 -0
- arekit/contrib/utils/data/writers/__init__.py +0 -0
- arekit/contrib/utils/data/writers/base.py +27 -0
- arekit/contrib/utils/data/writers/csv_native.py +63 -0
- arekit/contrib/utils/data/writers/csv_pd.py +40 -0
- arekit/contrib/utils/data/writers/json_opennre.py +132 -0
- arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
- arekit/contrib/utils/download.py +77 -0
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +58 -0
- arekit/contrib/utils/embeddings/tokens.py +30 -0
- arekit/contrib/utils/entities/__init__.py +0 -0
- arekit/contrib/utils/entities/filter.py +7 -0
- arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit/contrib/utils/entities/formatters/str_display.py +11 -0
- arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
- arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit/contrib/utils/io_utils/embedding.py +72 -0
- arekit/contrib/utils/io_utils/opinions.py +37 -0
- arekit/contrib/utils/io_utils/samples.py +79 -0
- arekit/contrib/utils/io_utils/utils.py +39 -0
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +41 -0
- arekit/contrib/utils/lexicons/relation.py +42 -0
- arekit/contrib/utils/lexicons/rusentilex.py +37 -0
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +83 -0
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +22 -0
- arekit/contrib/utils/np_utils/npz_utils.py +13 -0
- arekit/contrib/utils/np_utils/vocab.py +20 -0
- arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
- arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
- arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
- arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
- arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
- arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
- arekit/contrib/utils/processing/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +12 -0
- arekit/contrib/utils/processing/languages/pos.py +23 -0
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
- arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
- arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
- arekit/contrib/utils/processing/languages/ru/number.py +23 -0
- arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +12 -0
- arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
- arekit/contrib/utils/processing/pos/russian.py +10 -0
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +127 -0
- arekit/contrib/utils/resources.py +25 -0
- arekit/contrib/utils/serializer.py +43 -0
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
- arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit/contrib/utils/synonyms/simple.py +15 -0
- arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +93 -0
- arekit/contrib/utils/vectorizers/random_norm.py +39 -0
- arekit/download_data.py +11 -0
- arekit-0.24.0.dist-info/LICENSE +21 -0
- arekit-0.24.0.dist-info/METADATA +23 -0
- arekit-0.24.0.dist-info/RECORD +374 -0
- arekit-0.24.0.dist-info/WHEEL +5 -0
- arekit-0.24.0.dist-info/top_level.txt +1 -0
arekit/contrib/utils/pipelines/items/sampling/base.py
@@ -0,0 +1,99 @@
+from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
+from arekit.common.data.storages.base import BaseRowsStorage
+from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
+from arekit.common.experiment.data_type import DataType
+from arekit.common.pipeline.base import BasePipeline
+from arekit.common.pipeline.context import PipelineContext
+from arekit.common.pipeline.items.base import BasePipelineItem
+from arekit.contrib.utils.serializer import InputDataSerializationHelper
+
+
+class BaseSerializerPipelineItem(BasePipelineItem):
+
+    def __init__(self, rows_provider, samples_io, save_labels_func, storage):
+        """ sample_rows_formatter:
+                how we format input texts for a BERT model, for example:
+                - single text
+                - two sequences, separated by [SEP] token
+
+            save_labels_func: function
+                data_type -> bool
+        """
+        assert(isinstance(rows_provider, BaseSampleRowProvider))
+        assert(isinstance(samples_io, BaseSamplesIO))
+        assert(callable(save_labels_func))
+        assert(isinstance(storage, BaseRowsStorage))
+
+        self._rows_provider = rows_provider
+        self._samples_io = samples_io
+        self._save_labels_func = save_labels_func
+        self._storage = storage
+
+    def _serialize_iteration(self, data_type, pipeline, data_folding, doc_ids):
+        assert(isinstance(data_type, DataType))
+        assert(isinstance(pipeline, BasePipeline))
+        assert(isinstance(data_folding, dict) or data_folding is None)
+        assert(isinstance(doc_ids, list) or doc_ids is None)
+        assert(doc_ids is not None or data_folding is not None)
+
+        repos = {
+            "sample": InputDataSerializationHelper.create_samples_repo(
+                keep_labels=self._save_labels_func(data_type),
+                rows_provider=self._rows_provider,
+                storage=self._storage),
+        }
+
+        writer_and_targets = {
+            "sample": (self._samples_io.Writer,
+                       self._samples_io.create_target(data_type=data_type)),
+        }
+
+        for description, repo in repos.items():
+
+            if data_folding is None:
+                # Consider only the predefined doc_ids.
+                doc_ids_iter = doc_ids
+            else:
+                # Take particular data_type.
+                doc_ids_iter = data_folding[data_type]
+                # Consider only predefined doc_ids.
+                if doc_ids is not None:
+                    doc_ids_iter = set(doc_ids_iter).intersection(doc_ids)
+
+            InputDataSerializationHelper.fill_and_write(
+                repo=repo,
+                pipeline=pipeline,
+                doc_ids_iter=doc_ids_iter,
+                desc="{desc} [{data_type}]".format(desc=description, data_type=data_type),
+                writer=writer_and_targets[description][0],
+                target=writer_and_targets[description][1])
+
+    def _handle_iteration(self, data_type_pipelines, data_folding, doc_ids):
+        """ Performing data serialization for a particular iteration
+        """
+        assert(isinstance(data_type_pipelines, dict))
+        for data_type, pipeline in data_type_pipelines.items():
+            self._serialize_iteration(data_type=data_type, pipeline=pipeline, data_folding=data_folding,
+                                      doc_ids=doc_ids)
+
+    def apply_core(self, input_data, pipeline_ctx):
+        """
+        data_type_pipelines: dict of, for example:
+            {
+                DataType.Train: BasePipeline,
+                DataType.Test: BasePipeline
+            }
+
+            data_type_pipelines: doc_id -> parsed_doc -> annot -> opinion linkages
+                for example, function: sentiment_attitude_extraction_default_pipeline
+            doc_ids: optional
+                this parameter allows to limit amount of documents considered for sampling
+        """
+        assert(isinstance(input_data, PipelineContext))
+        assert("data_type_pipelines" in input_data)
+
+        data_folding = input_data.provide_or_none("data_folding")
+
+        self._handle_iteration(data_type_pipelines=input_data.provide("data_type_pipelines"),
+                               doc_ids=input_data.provide_or_none("doc_ids"),
+                               data_folding=data_folding)
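
Note: the item above selects which documents to serialize from an optional data_folding dict and an optional doc_ids list. A standalone sketch of that selection rule (plain Python; the function name is illustrative, not part of AREkit):

def select_doc_ids(data_type, data_folding, doc_ids):
    if data_folding is None:
        return doc_ids                                  # only the predefined ids
    selected = data_folding[data_type]                  # ids of the requested split
    if doc_ids is not None:
        selected = set(selected).intersection(doc_ids)  # restrict the split further
    return selected

assert select_doc_ids("train", {"train": [0, 1, 2]}, [1, 2, 5]) == {1, 2}
assert select_doc_ids("train", None, [1, 2, 5]) == [1, 2, 5]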
arekit/contrib/utils/pipelines/items/sampling/networks.py
@@ -0,0 +1,54 @@
+from arekit.contrib.networks.input.embedding.matrix import create_term_embedding_matrix
+from arekit.contrib.networks.input.embedding.offsets import TermsEmbeddingOffsets
+from arekit.contrib.networks.embedding import Embedding
+from arekit.contrib.networks.input.providers.sample import NetworkSampleRowProvider
+from arekit.contrib.utils.io_utils.embedding import NpEmbeddingIO
+from arekit.contrib.utils.pipelines.items.sampling.base import BaseSerializerPipelineItem
+
+
+class NetworksInputSerializerPipelineItem(BaseSerializerPipelineItem):
+
+    def __init__(self, save_labels_func, rows_provider, samples_io, emb_io, storage, save_embedding=True):
+        """ This pipeline item allows to perform a data preparation for neural network models.
+
+            considering a list of the whole data_types with the related pipelines,
+            which are supported and required in a handler. It is necessary to know
+            data_types in advance as it allows to create a complete vocabulary of input terms,
+            with the related embeddings.
+        """
+        assert(isinstance(emb_io, NpEmbeddingIO))
+        assert(isinstance(rows_provider, NetworkSampleRowProvider))
+        assert(isinstance(save_embedding, bool))
+        super(NetworksInputSerializerPipelineItem, self).__init__(
+            rows_provider=rows_provider,
+            samples_io=samples_io,
+            save_labels_func=save_labels_func,
+            storage=storage)
+
+        self.__emb_io = emb_io
+        self.__save_embedding = save_embedding
+
+    def _handle_iteration(self, data_type_pipelines, data_folding, doc_ids):
+        """ Performing data serialization for a particular iteration
+        """
+        assert(isinstance(data_type_pipelines, dict))
+
+        # Prepare for the present iteration.
+        self._rows_provider.clear_embedding_pairs()
+
+        super(NetworksInputSerializerPipelineItem, self)._handle_iteration(
+            data_type_pipelines=data_type_pipelines, data_folding=data_folding, doc_ids=doc_ids)
+
+        if not (self.__save_embedding and self._rows_provider.HasEmbeddingPairs):
+            return
+
+        # Save embedding information additionally.
+        term_embedding = Embedding.from_word_embedding_pairs_iter(self._rows_provider.iter_term_embedding_pairs())
+        embedding_matrix = create_term_embedding_matrix(term_embedding=term_embedding)
+        vocab = list(TermsEmbeddingOffsets.extract_vocab(words_embedding=term_embedding))
+
+        # Save embedding matrix
+        self.__emb_io.save_embedding(data=embedding_matrix)
+        self.__emb_io.save_vocab(data=vocab)
+
+        del embedding_matrix
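
Note: after the base serialization pass, this subclass additionally dumps a term-embedding matrix plus its vocabulary, both collected from the rows provider during sampling. A conceptual standalone sketch of that final step (numpy only; the pair list is a toy stand-in for iter_term_embedding_pairs, and the real matrix layout is produced by create_term_embedding_matrix):

import numpy as np

pairs = [("good", np.array([0.1, 0.2])), ("bad", np.array([0.3, 0.4]))]
vocab = [term for term, _ in pairs]           # written via save_vocab(...)
matrix = np.stack([vec for _, vec in pairs])  # written via save_embedding(...)
assert matrix.shape == (len(vocab), 2)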
arekit/contrib/utils/pipelines/items/text/entities_default.py
@@ -0,0 +1,23 @@
+from arekit.common.entities.base import Entity
+from arekit.common.pipeline.items.base import BasePipelineItem
+
+
+class TextEntitiesParser(BasePipelineItem):
+
+    def __init__(self):
+        super(TextEntitiesParser, self).__init__()
+
+    @staticmethod
+    def __process_word(word):
+        assert(isinstance(word, str))
+
+        # If this is a special word which is related to the [entity] mention.
+        if word[0] == "[" and word[-1] == "]":
+            entity = Entity(value=word[1:-1], e_type=None)
+            return entity
+
+        return word
+
+    def apply_core(self, input_data, pipeline_ctx):
+        assert(isinstance(input_data, list))
+        return [self.__process_word(w) for w in input_data]
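
Note: the parser simply promotes bracketed words to Entity objects and leaves everything else untouched. The same convention shown standalone (a tuple stands in for the Entity class):

words = ["today", "[USA]", "announced"]
parsed = [("ENTITY", w[1:-1]) if w.startswith("[") and w.endswith("]") else w
          for w in words]
assert parsed == ["today", ("ENTITY", "USA"), "announced"]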
arekit/contrib/utils/pipelines/items/text/frames.py
@@ -0,0 +1,86 @@
+from arekit.common.frames.text_variant import TextFrameVariant
+from arekit.common.frames.variants.collection import FrameVariantsCollection
+from arekit.common.pipeline.context import PipelineContext
+from arekit.common.pipeline.items.base import BasePipelineItem
+
+
+class FrameVariantsParser(BasePipelineItem):
+
+    def __init__(self, frame_variants):
+        assert(isinstance(frame_variants, FrameVariantsCollection))
+        assert(len(frame_variants) > 0)
+
+        super(FrameVariantsParser, self).__init__()
+
+        self.__frame_variants = frame_variants
+        self.__max_variant_len = max([len(variant) for _, variant in frame_variants.iter_variants()])
+
+    # region private methods
+
+    @staticmethod
+    def __check_all_terms_within(terms, start_index, last_index):
+        for term_ind in range(start_index, last_index + 1):
+            if not isinstance(terms[term_ind], str):
+                return False
+        return True
+
+    def __try_compose_frame_variant(self, lemmas, start_ind, last_ind):
+
+        if last_ind >= len(lemmas):
+            return None
+
+        is_all_words_within = self.__check_all_terms_within(
+            terms=lemmas,
+            start_index=start_ind,
+            last_index=last_ind)
+
+        if not is_all_words_within:
+            return None
+
+        ctx_value = " ".join(lemmas[start_ind:last_ind + 1])
+
+        if not self.__frame_variants.has_variant(ctx_value):
+            return None
+
+        return ctx_value
+
+    def _iter_processed(self, terms, origin):
+        assert(len(terms) == len(origin))
+
+        start_ind = 0
+        last_ind = 0
+        while start_ind < len(terms):
+
+            found = False
+
+            for ctx_size in reversed(list(range(1, self.__max_variant_len))):
+
+                last_ind = start_ind + ctx_size - 1
+
+                ctx_value = self.__try_compose_frame_variant(
+                    start_ind=start_ind,
+                    last_ind=last_ind,
+                    lemmas=terms)
+
+                if ctx_value is None:
+                    continue
+
+                variant = self.__frame_variants.get_variant_by_value(ctx_value)
+
+                yield TextFrameVariant(variant)
+
+                found = True
+
+                break
+
+            if not found:
+                yield origin[start_ind]
+
+            start_ind = last_ind + 1
+
+    # endregion
+
+    def apply_core(self, input_data, pipeline_ctx):
+        assert(isinstance(pipeline_ctx, PipelineContext))
+        processed_it = self._iter_processed(terms=input_data, origin=input_data)
+        return list(processed_it)
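
Note: _iter_processed is a greedy longest-match scan: at every position it tries window sizes from large to small, yields a TextFrameVariant on the first hit, and otherwise falls through to the original term. The core loop re-implemented standalone (a plain set stands in for FrameVariantsCollection):

variants = {"очень хорошо", "хорошо"}        # toy frame-variant set
terms = ["было", "очень", "хорошо", "вчера"]
out, i, max_len = [], 0, 2
while i < len(terms):
    for size in range(max_len, 0, -1):       # longest window first
        chunk = terms[i:i + size]
        if len(chunk) == size and " ".join(chunk) in variants:
            out.append(("FRAME", " ".join(chunk)))
            i += size
            break
    else:                                    # no window matched at this position
        out.append(terms[i])
        i += 1
assert out == ["было", ("FRAME", "очень хорошо"), "вчера"]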
arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py
@@ -0,0 +1,36 @@
+from arekit.common.text.stemmer import Stemmer
+from arekit.contrib.utils.pipelines.items.text.frames import FrameVariantsParser
+from arekit.contrib.utils.processing.languages.ru.mods import RussianLanguageMods
+
+
+class LemmasBasedFrameVariantsParser(FrameVariantsParser):
+
+    def __init__(self, frame_variants, stemmer, locale_mods=RussianLanguageMods, save_lemmas=False):
+        assert(isinstance(stemmer, Stemmer))
+        assert(isinstance(save_lemmas, bool))
+        super(LemmasBasedFrameVariantsParser, self).__init__(frame_variants=frame_variants)
+
+        self.__frame_variants = frame_variants
+        self.__stemmer = stemmer
+        self.__save_lemmas = save_lemmas
+        self.__max_variant_len = max([len(variant) for _, variant in frame_variants.iter_variants()])
+        self.__locale_mods = locale_mods
+
+    def __lemmatize_term(self, term):
+        # we first split onto words for lemmatization and then join all of them.
+        lemma = "".join(self.__stemmer.lemmatize_to_list(term))
+        # then we replace certain chars according to the locale restrictions.
+        return self.__locale_mods.replace_specific_word_chars(lemma)
+
+    def __provide_lemmatized_terms(self, terms):
+        """
+        Compose a list of lemmatized versions of parsed_doc
+        PS: Might be significantly slow, depending on stemmer were used.
+        """
+        assert(isinstance(terms, list))
+        return [self.__lemmatize_term(term) if isinstance(term, str) else term for term in terms]
+
+    def apply_core(self, input_data, pipeline_ctx):
+        lemmas = self.__provide_lemmatized_terms(input_data)
+        processed_it = self._iter_processed(terms=lemmas, origin=lemmas if self.__save_lemmas else input_data)
+        return list(processed_it)
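
Note: matching runs over lemmas while the emitted stream keeps the original surface forms unless save_lemmas=True. A minimal standalone illustration (a dict stands in for the stemmer):

lemma_of = {"хорошего": "хороший"}
terms = ["хорошего", "дня"]
lemmas = [lemma_of.get(t, t) for t in terms]   # what the matcher sees
origin = terms                                 # what gets yielded (save_lemmas=False)
assert lemmas == ["хороший", "дня"] and origin == ["хорошего", "дня"]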
arekit/contrib/utils/pipelines/items/text/frames_negation.py
@@ -0,0 +1,32 @@
+from arekit.common.frames.text_variant import TextFrameVariant
+from arekit.common.pipeline.context import PipelineContext
+from arekit.common.pipeline.items.base import BasePipelineItem
+from arekit.contrib.utils.processing.languages.mods import BaseLanguageMods
+from arekit.contrib.utils.processing.languages.ru.mods import RussianLanguageMods
+
+
+class FrameVariantsSentimentNegation(BasePipelineItem):
+
+    def __init__(self, locale_mods=RussianLanguageMods):
+        assert(issubclass(locale_mods, BaseLanguageMods))
+        self._locale_mods = locale_mods
+
+    @staticmethod
+    def __get_preposition(terms, index):
+        return terms[index-1] if index > 0 else None
+
+    def apply_core(self, input_data, pipeline_ctx):
+        assert(isinstance(input_data, list))
+        assert(isinstance(pipeline_ctx, PipelineContext))
+
+        for curr_ind, term in enumerate(input_data):
+
+            if not isinstance(term, TextFrameVariant):
+                continue
+
+            prep_term = self.__get_preposition(terms=input_data, index=curr_ind)
+            is_negated = self._locale_mods.is_negation_word(prep_term) if prep_term is not None else False
+
+            term.set_is_negated(is_negated)
+
+        return input_data
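
Note: the negation check only inspects the single term immediately preceding each frame variant. The rule standalone (a literal comparison against "не" replaces is_negation_word):

terms = ["не", "FRAME"]                 # "FRAME" marks a detected frame variant
is_negated = False
for i, t in enumerate(terms):
    if t == "FRAME":
        prev = terms[i - 1] if i > 0 else None
        is_negated = (prev == "не")
assert is_negated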
arekit/contrib/utils/pipelines/items/text/terms_splitter.py
@@ -0,0 +1,10 @@
+from arekit.common.pipeline.context import PipelineContext
+from arekit.common.pipeline.items.base import BasePipelineItem
+from arekit.common.utils import split_by_whitespaces
+
+
+class TermsSplitterParser(BasePipelineItem):
+
+    def apply_core(self, input_data, pipeline_ctx):
+        assert(isinstance(pipeline_ctx, PipelineContext))
+        return split_by_whitespaces(input_data)
arekit/contrib/utils/pipelines/items/text/tokenizer.py
@@ -0,0 +1,107 @@
+import logging
+
+from arekit.common.context.token import Token
+from arekit.common.pipeline.context import PipelineContext
+from arekit.common.pipeline.items.base import BasePipelineItem
+from arekit.common.utils import split_by_whitespaces
+from arekit.contrib.utils.processing.text.tokens import Tokens
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class DefaultTextTokenizer(BasePipelineItem):
+    """ Default parser implementation.
+    """
+
+    def __init__(self, keep_tokens=True):
+        super(DefaultTextTokenizer, self).__init__()
+        self.__keep_tokens = keep_tokens
+
+    # region protected methods
+
+    def apply_core(self, input_data, pipeline_ctx):
+        assert(isinstance(pipeline_ctx, PipelineContext))
+        output_data = self.__process_parts(input_data)
+        if not self.__keep_tokens:
+            output_data = [word for word in output_data if not isinstance(word, Token)]
+        return output_data
+
+    # endregion
+
+    # region private static methods
+
+    def __process_parts(self, parts):
+        assert(isinstance(parts, list))
+
+        parsed = []
+        for part in parts:
+
+            if part is None:
+                continue
+
+            # Keep non str words as it is and try to parse str-based words.
+            processed = [part] if not isinstance(part, str) else \
+                self.__iter_processed_part(part=part)
+
+            parsed.extend(processed)
+
+        return parsed
+
+    def __iter_processed_part(self, part):
+        for word in split_by_whitespaces(part):
+            for term in self.__process_word(word):
+                yield term
+
+    def __process_word(self, word):
+        assert(isinstance(word, str))
+        return self.__split_tokens(word)
+
+    @staticmethod
+    def __split_tokens(term):
+        """
+        Splitting off tokens from parsed_doc ending, i.e. for example:
+            term: "сказать,-" -> "(term: "сказать", ["COMMA_TOKEN", "DASH_TOKEN"])
+        return: (unicode or None, list)
+            modified term and list of extracted tokens.
+        """
+
+        url = Tokens.try_create_url(term)
+        if url is not None:
+            return [url]
+
+        l = 0
+        words_and_tokens = []
+        while l < len(term):
+
+            # Token.
+            token = Tokens.try_create(term[l])
+            if token is not None:
+                if token.get_token_value() != Tokens.NEW_LINE:
+                    words_and_tokens.append(token)
+                l += 1
+
+            # Number.
+            elif str.isdigit(term[l]):
+                k = l + 1
+                while k < len(term) and str.isdigit(term[k]):
+                    k += 1
+                token = Tokens.try_create_number(term[l:k])
+                assert(token is not None)
+                words_and_tokens.append(token)
+                l = k
+
+            # Term.
+            else:
+                k = l + 1
+                while k < len(term):
+                    token = Tokens.try_create(term[k])
+                    if token is not None and token.get_token_value() != Tokens.DASH:
+                        break
+                    k += 1
+                words_and_tokens.append(term[l:k])
+                l = k
+
+        return words_and_tokens
+
+    # endregion
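
Note: __split_tokens walks each word one character at a time, splitting off punctuation tokens and digit runs while keeping letter spans intact. A simplified standalone version of that scan (a two-entry dict replaces the Tokens helper; URL and in-word dash handling are omitted):

PUNCT = {",": "COMMA", "-": "DASH"}

def split_tokens(term):
    out, l = [], 0
    while l < len(term):
        if term[l] in PUNCT:                 # punctuation token
            out.append(PUNCT[term[l]])
            l += 1
        elif term[l].isdigit():              # digit run becomes a number token
            k = l + 1
            while k < len(term) and term[k].isdigit():
                k += 1
            out.append(("NUM", term[l:k]))
            l = k
        else:                                # plain term span
            k = l + 1
            while k < len(term) and term[k] not in PUNCT and not term[k].isdigit():
                k += 1
            out.append(term[l:k])
            l = k
    return out

assert split_tokens("сказать,-") == ["сказать", "COMMA", "DASH"]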
arekit/contrib/utils/pipelines/items/text/translator.py
@@ -0,0 +1,135 @@
+from arekit.common.data.input.providers.const import IDLE_MODE
+from arekit.common.pipeline.conts import PARENT_CTX
+from arekit.common.entities.base import Entity
+from arekit.common.pipeline.context import PipelineContext
+from arekit.common.pipeline.items.base import BasePipelineItem
+
+
+class MLTextTranslatorPipelineItem(BasePipelineItem):
+    """ Machine learning based translator pipeline item.
+    """
+
+    def __init__(self, batch_translate_model, do_translate_entity=True):
+        """ Model, which is based on translation of the text,
+            represented as a list of words.
+        """
+        self.__do_translate_entity = do_translate_entity
+        self.__translate = batch_translate_model
+
+    def fast_most_accurate_approach(self, input_data, entity_placeholder_template="<entityTag={}/>"):
+        """ This approach assumes that the translation won't corrupt the original
+            meta-annotation for entities and objects mentioned in text.
+        """
+
+        def __optionally_register(prts):
+            if len(prts) > 0:
+                content.append(" ".join(prts))
+                parts_to_join.clear()
+
+        content = []
+        origin_entities = []
+        parts_to_join = []
+
+        for part in input_data:
+            if isinstance(part, str) and part.strip():
+                parts_to_join.append(part)
+            elif isinstance(part, Entity):
+                entity_index = len(origin_entities)
+                parts_to_join.append(entity_placeholder_template.format(entity_index))
+                # Register entities information for further restoration.
+                origin_entities.append(part)
+
+        # Register original text with masked named entities.
+        __optionally_register(parts_to_join)
+        # Register all named entities in order of their appearance in text.
+        content.extend([e.Value for e in origin_entities])
+
+        # Compose text parts.
+        translated_parts = self.__translate(content)
+
+        if len(translated_parts) == 0:
+            return None
+
+        # Take the original text.
+        text = translated_parts[0]
+        for entity_index in range(len(origin_entities)):
+            if entity_placeholder_template.format(entity_index) not in text:
+                return None
+
+        # Enumerate entities.
+        from_ind = 0
+        text_parts = []
+        for entity_index, translated_value in enumerate(translated_parts[1:]):
+            entity_placeholder_instance = entity_placeholder_template.format(entity_index)
+            # Cropping text part.
+            to_ind = text.index(entity_placeholder_instance)
+
+            if self.__do_translate_entity:
+                origin_entities[entity_index].set_display_value(translated_value.strip())
+
+            # Register entities.
+            text_parts.append(text[from_ind:to_ind])
+            text_parts.append(origin_entities[entity_index])
+            # Update from index.
+            from_ind = to_ind + len(entity_placeholder_instance)
+
+        # Consider the remaining part.
+        text_parts.append(text[from_ind:])
+        return text_parts
+
+    def default_pre_part_splitting_approach(self, input_data):
+        """ This is the original strategy, based on the manually cropped named entities
+            before the actual translation call.
+        """
+
+        def __optionally_register(prts):
+            if len(prts) > 0:
+                content.append(" ".join(prts))
+                parts_to_join.clear()
+
+        content = []
+        origin_entities = []
+        origin_entity_ind = []
+        parts_to_join = []
+
+        for _, part in enumerate(input_data):
+            if isinstance(part, str) and part.strip():
+                parts_to_join.append(part)
+            elif isinstance(part, Entity):
+                # Register first the prior parts were merged.
+                __optionally_register(parts_to_join)
+                # Register entities information for further restoration.
+                origin_entity_ind.append(len(content))
+                origin_entities.append(part)
+                content.append(part.Value)
+
+        __optionally_register(parts_to_join)
+
+        # Compose text parts.
+        translated_parts = self.__translate(content)
+
+        for entity_ind, entity_part_ind in enumerate(origin_entity_ind):
+            entity = origin_entities[entity_ind]
+            if self.__do_translate_entity:
+                entity.set_display_value(translated_parts[entity_part_ind].strip())
+            translated_parts[entity_part_ind] = entity
+
+        return translated_parts
+
+    def apply_core(self, input_data, pipeline_ctx):
+        assert(isinstance(pipeline_ctx, PipelineContext))
+        assert(isinstance(input_data, list))
+
+        # Check the pipeline state whether is an idle mode or not.
+        parent_ctx = pipeline_ctx.provide(PARENT_CTX)
+        idle_mode = parent_ctx.provide(IDLE_MODE)
+
+        # When pipeline utilized only for the assessing the expected amount
+        # of rows (common case of idle_mode), there is no need to perform
+        # translation.
+        if idle_mode:
+            return
+
+        fast_accurate = self.fast_most_accurate_approach(input_data)
+        return self.default_pre_part_splitting_approach(input_data) \
+            if fast_accurate is None else fast_accurate
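
Note: fast_most_accurate_approach masks each entity with an indexed tag, translates the masked text plus the entity values in one batch, and splits the translated text back around the tags; if any tag was corrupted by translation it returns None, and apply_core falls back to the pre-splitting strategy. The masking/restoring round trip standalone (an identity stub replaces the translate model; a tuple stands in for Entity):

def translate_batch(texts):            # stub: a real model would translate here
    return list(texts)

parts = ["president of", ("ENTITY", "США"), "said"]
masked, entities = [], []
for p in parts:
    if isinstance(p, tuple):
        masked.append("<entityTag={}/>".format(len(entities)))
        entities.append(p)
    else:
        masked.append(p)

translated = translate_batch([" ".join(masked)] + [e[1] for e in entities])
text, restored = translated[0], []
for i, e in enumerate(entities):
    head, text = text.split("<entityTag={}/>".format(i), 1)
    restored.extend([head, e])
restored.append(text)
assert restored == ["president of ", ("ENTITY", "США"), " said"]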
arekit/contrib/utils/pipelines/opinion_collections.py
@@ -0,0 +1,85 @@
+from collections.abc import Iterable
+
+from arekit.common.labels.scaler.base import BaseLabelScaler
+from arekit.common.linkage.base import LinkedDataWrapper
+from arekit.common.model.labeling.modes import LabelCalculationMode
+from arekit.common.model.labeling.single import SingleLabelsHelper
+from arekit.common.opinions.base import Opinion
+from arekit.common.opinions.collection import OpinionCollection
+from arekit.common.pipeline.items.iter import FilterPipelineItem
+from arekit.common.pipeline.items.map import MapPipelineItem
+
+
+def __create_labeled_opinion(item, label):
+    assert(isinstance(item, Opinion))
+    return Opinion(source_value=item.SourceValue,
+                   target_value=item.TargetValue,
+                   label=label)
+
+
+def __linkages_to_opinions(linkages_iter, labels_helper, label_calc_mode):
+    assert(isinstance(linkages_iter, Iterable))
+
+    for linkage in linkages_iter:
+        assert(isinstance(linkage, LinkedDataWrapper))
+
+        agg_label = labels_helper.aggregate_labels(
+            labels_list=list(linkage.iter_labels()),
+            label_calc_mode=label_calc_mode)
+
+        yield __create_labeled_opinion(linkage.First, agg_label)
+
+
+def __fill_opinion_collection(opinions_iter, collection, supported_labels):
+    assert(isinstance(opinions_iter, Iterable))
+    assert(isinstance(collection, OpinionCollection))
+    assert(isinstance(supported_labels, set) or supported_labels is None)
+
+    for opinion in opinions_iter:
+        assert(isinstance(opinion, Opinion))
+
+        if supported_labels is not None:
+            if opinion.Label not in supported_labels:
+                continue
+
+        if collection.has_synonymous_opinion(opinion):
+            continue
+
+        collection.add_opinion(opinion)
+
+    return collection
+
+# endregion
+
+
+def text_opinion_linkages_to_opinion_collections_pipeline_part(
+        doc_ids_set, labels_scaler, iter_opinion_linkages_func,
+        create_opinion_collection_func, label_calc_mode):
+    """ Opinion collection generation pipeline.
+    """
+    assert(isinstance(labels_scaler, BaseLabelScaler))
+    assert(isinstance(label_calc_mode, LabelCalculationMode))
+    assert(callable(iter_opinion_linkages_func))
+    assert(callable(create_opinion_collection_func))
+
+    return [
+        # Filter doc-ids.
+        FilterPipelineItem(filter_func=lambda doc_id: doc_id in doc_ids_set),
+
+        # Iterate opinion linkages.
+        MapPipelineItem(lambda doc_id: (doc_id, iter_opinion_linkages_func(doc_id))),
+
+        # Convert linkages to opinions.
+        MapPipelineItem(lambda data:
+                        (data[0], __linkages_to_opinions(linkages_iter=data[1],
+                                                         labels_helper=SingleLabelsHelper(labels_scaler),
+                                                         label_calc_mode=label_calc_mode))),
+
+        # Filling opinion collection.
+        MapPipelineItem(lambda data:
+                        (data[0],
+                         __fill_opinion_collection(
+                             opinions_iter=data[1],
+                             collection=create_opinion_collection_func(),
+                             supported_labels=None))),
+    ]
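
Note: the pipeline part wires four stages: keep only the requested doc ids, attach an opinion-linkage iterator per doc, aggregate each linkage into one labeled opinion, and fill a per-doc collection. The same data flow with plain functions (a majority vote stands in for SingleLabelsHelper.aggregate_labels, a dict for the collections):

doc_ids_set = {1, 2}
linkages = {1: [["pos", "pos", "neg"]], 2: [["neg"]]}   # toy linked labels per doc

def majority(labels):
    return max(set(labels), key=labels.count)

collections = {}
for doc_id in filter(lambda d: d in doc_ids_set, [1, 2, 3]):
    opinions = [majority(lk) for lk in linkages[doc_id]]  # one opinion per linkage
    collections[doc_id] = opinions
assert collections == {1: ["pos"], 2: ["neg"]}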