arekit 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/__init__.py +0 -0
- arekit/common/__init__.py +0 -0
- arekit/common/bound.py +48 -0
- arekit/common/context/__init__.py +0 -0
- arekit/common/context/terms_mapper.py +51 -0
- arekit/common/context/token.py +16 -0
- arekit/common/data/__init__.py +0 -0
- arekit/common/data/const.py +21 -0
- arekit/common/data/doc_provider.py +6 -0
- arekit/common/data/input/__init__.py +0 -0
- arekit/common/data/input/providers/__init__.py +0 -0
- arekit/common/data/input/providers/columns/__init__.py +0 -0
- arekit/common/data/input/providers/columns/base.py +9 -0
- arekit/common/data/input/providers/columns/sample.py +59 -0
- arekit/common/data/input/providers/const.py +3 -0
- arekit/common/data/input/providers/contents.py +9 -0
- arekit/common/data/input/providers/instances/__init__.py +0 -0
- arekit/common/data/input/providers/instances/base.py +14 -0
- arekit/common/data/input/providers/instances/multiple.py +27 -0
- arekit/common/data/input/providers/instances/single.py +8 -0
- arekit/common/data/input/providers/label/__init__.py +0 -0
- arekit/common/data/input/providers/label/base.py +24 -0
- arekit/common/data/input/providers/label/binary.py +11 -0
- arekit/common/data/input/providers/label/multiple.py +15 -0
- arekit/common/data/input/providers/rows/__init__.py +0 -0
- arekit/common/data/input/providers/rows/base.py +64 -0
- arekit/common/data/input/providers/rows/samples.py +227 -0
- arekit/common/data/input/providers/sample/__init__.py +0 -0
- arekit/common/data/input/providers/sample/cropped.py +43 -0
- arekit/common/data/input/providers/text/__init__.py +0 -0
- arekit/common/data/input/providers/text/single.py +49 -0
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +68 -0
- arekit/common/data/input/repositories/sample.py +22 -0
- arekit/common/data/input/sample.py +66 -0
- arekit/common/data/input/terms_mapper.py +88 -0
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/__init__.py +0 -0
- arekit/common/data/storages/base.py +109 -0
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +26 -0
- arekit/common/docs/__init__.py +0 -0
- arekit/common/docs/base.py +30 -0
- arekit/common/docs/entities_grouping.py +16 -0
- arekit/common/docs/entity.py +18 -0
- arekit/common/docs/objects_parser.py +37 -0
- arekit/common/docs/parsed/__init__.py +0 -0
- arekit/common/docs/parsed/base.py +101 -0
- arekit/common/docs/parsed/providers/__init__.py +0 -0
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/docs/parsed/providers/base_pairs.py +51 -0
- arekit/common/docs/parsed/providers/entity_service.py +175 -0
- arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
- arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parsed/term_position.py +42 -0
- arekit/common/docs/parser.py +34 -0
- arekit/common/docs/sentence.py +14 -0
- arekit/common/entities/__init__.py +0 -0
- arekit/common/entities/base.py +51 -0
- arekit/common/entities/collection.py +72 -0
- arekit/common/entities/str_fmt.py +8 -0
- arekit/common/entities/types.py +9 -0
- arekit/common/experiment/__init__.py +0 -0
- arekit/common/experiment/api/__init__.py +0 -0
- arekit/common/experiment/api/base_samples_io.py +20 -0
- arekit/common/experiment/data_type.py +17 -0
- arekit/common/frames/__init__.py +0 -0
- arekit/common/frames/connotations/__init__.py +0 -0
- arekit/common/frames/connotations/descriptor.py +17 -0
- arekit/common/frames/connotations/provider.py +4 -0
- arekit/common/frames/text_variant.py +43 -0
- arekit/common/frames/variants/__init__.py +0 -0
- arekit/common/frames/variants/base.py +21 -0
- arekit/common/frames/variants/collection.py +60 -0
- arekit/common/labels/__init__.py +0 -0
- arekit/common/labels/base.py +19 -0
- arekit/common/labels/provider/__init__.py +0 -0
- arekit/common/labels/provider/base.py +7 -0
- arekit/common/labels/provider/constant.py +14 -0
- arekit/common/labels/scaler/__init__.py +0 -0
- arekit/common/labels/scaler/base.py +85 -0
- arekit/common/labels/scaler/sentiment.py +7 -0
- arekit/common/labels/scaler/single.py +10 -0
- arekit/common/labels/str_fmt.py +55 -0
- arekit/common/linkage/__init__.py +0 -0
- arekit/common/linkage/base.py +44 -0
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +9 -0
- arekit/common/linkage/text_opinions.py +22 -0
- arekit/common/log_utils.py +29 -0
- arekit/common/model/__init__.py +0 -0
- arekit/common/model/labeling/__init__.py +0 -0
- arekit/common/model/labeling/base.py +24 -0
- arekit/common/model/labeling/modes.py +8 -0
- arekit/common/model/labeling/single.py +24 -0
- arekit/common/opinions/__init__.py +0 -0
- arekit/common/opinions/annot/__init__.py +0 -0
- arekit/common/opinions/annot/algo/__init__.py +0 -0
- arekit/common/opinions/annot/algo/base.py +4 -0
- arekit/common/opinions/annot/algo/pair_based.py +99 -0
- arekit/common/opinions/annot/algo/predefined.py +16 -0
- arekit/common/opinions/annot/algo_based.py +55 -0
- arekit/common/opinions/annot/base.py +15 -0
- arekit/common/opinions/base.py +74 -0
- arekit/common/opinions/collection.py +150 -0
- arekit/common/opinions/enums.py +6 -0
- arekit/common/opinions/provider.py +4 -0
- arekit/common/opinions/writer.py +4 -0
- arekit/common/pipeline/__init__.py +0 -0
- arekit/common/pipeline/base.py +25 -0
- arekit/common/pipeline/context.py +36 -0
- arekit/common/pipeline/conts.py +2 -0
- arekit/common/pipeline/items/__init__.py +0 -0
- arekit/common/pipeline/items/base.py +12 -0
- arekit/common/pipeline/items/flatten.py +14 -0
- arekit/common/pipeline/items/handle.py +17 -0
- arekit/common/pipeline/items/iter.py +11 -0
- arekit/common/pipeline/items/map.py +11 -0
- arekit/common/pipeline/items/map_nested.py +13 -0
- arekit/common/synonyms/__init__.py +0 -0
- arekit/common/synonyms/base.py +151 -0
- arekit/common/synonyms/grouping.py +21 -0
- arekit/common/text/__init__.py +0 -0
- arekit/common/text/enums.py +12 -0
- arekit/common/text/parsed.py +42 -0
- arekit/common/text/parser.py +12 -0
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +4 -0
- arekit/common/text/partitioning/str.py +36 -0
- arekit/common/text/partitioning/terms.py +35 -0
- arekit/common/text/stemmer.py +16 -0
- arekit/common/text_opinions/__init__.py +0 -0
- arekit/common/text_opinions/base.py +105 -0
- arekit/common/utils.py +129 -0
- arekit/contrib/__init__.py +0 -0
- arekit/contrib/bert/__init__.py +0 -0
- arekit/contrib/bert/input/__init__.py +0 -0
- arekit/contrib/bert/input/providers/__init__.py +0 -0
- arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
- arekit/contrib/bert/input/providers/text_pair.py +62 -0
- arekit/contrib/bert/terms/__init__.py +0 -0
- arekit/contrib/bert/terms/mapper.py +20 -0
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +149 -0
- arekit/contrib/networks/embedding_io.py +18 -0
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +6 -0
- arekit/contrib/networks/input/ctx_serialization.py +28 -0
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +29 -0
- arekit/contrib/networks/input/embedding/offsets.py +55 -0
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +129 -0
- arekit/contrib/networks/input/providers/term_connotation.py +23 -0
- arekit/contrib/networks/input/providers/text.py +24 -0
- arekit/contrib/networks/input/rows_parser.py +47 -0
- arekit/contrib/networks/input/term_types.py +13 -0
- arekit/contrib/networks/input/terms_mapping.py +60 -0
- arekit/contrib/networks/vectorizer.py +6 -0
- arekit/contrib/prompt/__init__.py +0 -0
- arekit/contrib/prompt/sample.py +61 -0
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +84 -0
- arekit/contrib/source/brat/doc.py +28 -0
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +13 -0
- arekit/contrib/source/brat/entities/entity.py +42 -0
- arekit/contrib/source/brat/entities/parser.py +53 -0
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +19 -0
- arekit/contrib/source/brat/relation.py +32 -0
- arekit/contrib/source/brat/sentence.py +69 -0
- arekit/contrib/source/brat/sentences_reader.py +128 -0
- arekit/contrib/source/download.py +41 -0
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +55 -0
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +74 -0
- arekit/contrib/source/nerel/io_utils.py +62 -0
- arekit/contrib/source/nerel/labels.py +241 -0
- arekit/contrib/source/nerel/reader.py +46 -0
- arekit/contrib/source/nerel/utils.py +24 -0
- arekit/contrib/source/nerel/versions.py +12 -0
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +62 -0
- arekit/contrib/source/nerelbio/labels.py +265 -0
- arekit/contrib/source/nerelbio/reader.py +8 -0
- arekit/contrib/source/nerelbio/versions.py +8 -0
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +36 -0
- arekit/contrib/source/ruattitudes/doc.py +51 -0
- arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
- arekit/contrib/source/ruattitudes/io_utils.py +56 -0
- arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
- arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
- arekit/contrib/source/ruattitudes/reader.py +268 -0
- arekit/contrib/source/ruattitudes/sentence.py +73 -0
- arekit/contrib/source/ruattitudes/synonyms.py +17 -0
- arekit/contrib/source/ruattitudes/text_object.py +59 -0
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +157 -0
- arekit/contrib/source/rusentiframes/effect.py +24 -0
- arekit/contrib/source/rusentiframes/io_utils.py +19 -0
- arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
- arekit/contrib/source/rusentiframes/polarity.py +35 -0
- arekit/contrib/source/rusentiframes/role.py +15 -0
- arekit/contrib/source/rusentiframes/state.py +24 -0
- arekit/contrib/source/rusentiframes/types.py +42 -0
- arekit/contrib/source/rusentiframes/value.py +2 -0
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +3 -0
- arekit/contrib/source/rusentrel/docs_reader.py +51 -0
- arekit/contrib/source/rusentrel/entities.py +26 -0
- arekit/contrib/source/rusentrel/io_utils.py +125 -0
- arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
- arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
- arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
- arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
- arekit/contrib/source/rusentrel/synonyms.py +17 -0
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +52 -0
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +31 -0
- arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
- arekit/contrib/source/sentinerel/io_utils.py +87 -0
- arekit/contrib/source/sentinerel/labels.py +53 -0
- arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
- arekit/contrib/source/sentinerel/reader.py +42 -0
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +19 -0
- arekit/contrib/source/zip_utils.py +47 -0
- arekit/contrib/utils/__init__.py +0 -0
- arekit/contrib/utils/bert/__init__.py +0 -0
- arekit/contrib/utils/bert/samplers.py +17 -0
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
- arekit/contrib/utils/data/__init__.py +0 -0
- arekit/contrib/utils/data/contents/__init__.py +0 -0
- arekit/contrib/utils/data/contents/opinions.py +37 -0
- arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +7 -0
- arekit/contrib/utils/data/readers/csv_pd.py +38 -0
- arekit/contrib/utils/data/readers/jsonl.py +15 -0
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +50 -0
- arekit/contrib/utils/data/storages/__init__.py +0 -0
- arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
- arekit/contrib/utils/data/storages/pandas_based.py +123 -0
- arekit/contrib/utils/data/storages/row_cache.py +48 -0
- arekit/contrib/utils/data/writers/__init__.py +0 -0
- arekit/contrib/utils/data/writers/base.py +27 -0
- arekit/contrib/utils/data/writers/csv_native.py +63 -0
- arekit/contrib/utils/data/writers/csv_pd.py +40 -0
- arekit/contrib/utils/data/writers/json_opennre.py +132 -0
- arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
- arekit/contrib/utils/download.py +77 -0
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +58 -0
- arekit/contrib/utils/embeddings/tokens.py +30 -0
- arekit/contrib/utils/entities/__init__.py +0 -0
- arekit/contrib/utils/entities/filter.py +7 -0
- arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit/contrib/utils/entities/formatters/str_display.py +11 -0
- arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
- arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit/contrib/utils/io_utils/embedding.py +72 -0
- arekit/contrib/utils/io_utils/opinions.py +37 -0
- arekit/contrib/utils/io_utils/samples.py +79 -0
- arekit/contrib/utils/io_utils/utils.py +39 -0
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +41 -0
- arekit/contrib/utils/lexicons/relation.py +42 -0
- arekit/contrib/utils/lexicons/rusentilex.py +37 -0
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +83 -0
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +22 -0
- arekit/contrib/utils/np_utils/npz_utils.py +13 -0
- arekit/contrib/utils/np_utils/vocab.py +20 -0
- arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
- arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
- arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
- arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
- arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
- arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
- arekit/contrib/utils/processing/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +12 -0
- arekit/contrib/utils/processing/languages/pos.py +23 -0
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
- arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
- arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
- arekit/contrib/utils/processing/languages/ru/number.py +23 -0
- arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +12 -0
- arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
- arekit/contrib/utils/processing/pos/russian.py +10 -0
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +127 -0
- arekit/contrib/utils/resources.py +25 -0
- arekit/contrib/utils/serializer.py +43 -0
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
- arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit/contrib/utils/synonyms/simple.py +15 -0
- arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +93 -0
- arekit/contrib/utils/vectorizers/random_norm.py +39 -0
- arekit/download_data.py +11 -0
- arekit-0.24.0.dist-info/LICENSE +21 -0
- arekit-0.24.0.dist-info/METADATA +23 -0
- arekit-0.24.0.dist-info/RECORD +374 -0
- arekit-0.24.0.dist-info/WHEEL +5 -0
- arekit-0.24.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
2
|
+
from arekit.common.text.partitioning.base import BasePartitioning
|
|
3
|
+
from arekit.common.pipeline.context import PipelineContext
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SentenceObjectsParserPipelineItem(BasePipelineItem):
    """ Pipeline item that hands a text, together with its parts, over to
        the partitioning given at construction.

        Subclasses implement `_get_parts_provider_func` and may override
        `_get_text` in order to take the text from the pipeline context
        rather than from the item input.
    """

    def __init__(self, partitioning):
        """ partitioning: BasePartitioning
                its `provide` method is called from `apply_core`.
        """
        assert(isinstance(partitioning, BasePartitioning))
        self.__partitioning = partitioning

    # region protected

    def _get_text(self, pipeline_ctx):
        """ Text to be used instead of the item input.
            None (default implementation) keeps the item input.
        """
        return None

    def _get_parts_provider_func(self, input_data, pipeline_ctx):
        """ Provides the value passed to the partitioning as `parts_it`.
            Has to be implemented by subclasses.
        """
        raise NotImplementedError()

    # endregion

    def apply_core(self, input_data, pipeline_ctx):
        """ Returns the result of `partitioning.provide` for the chosen text.
        """
        assert(isinstance(pipeline_ctx, PipelineContext))

        # The text from the context, when provided, takes priority over the input.
        text = self._get_text(pipeline_ctx)
        if text is None:
            text = input_data

        parts = self._get_parts_provider_func(input_data=text, pipeline_ctx=pipeline_ctx)
        return self.__partitioning.provide(text=text, parts_it=parts)

    # region base

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; exceptions are not suppressed.
        return None

    # endregion
|
|
File without changes
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
|
|
3
|
+
from arekit.common.entities.base import Entity
|
|
4
|
+
from arekit.common.text.enums import TermFormat
|
|
5
|
+
from arekit.common.text.parsed import BaseParsedText
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ParsedDocument(object):
    """
    In-memory container of the parsed sentences of a single document.

    It allows iterating over:
        - the sentences (`__iter__`, `get_sentence`);
        - the raw terms of the whole document (`iter_terms`);
        - the raw terms of a single sentence (`iter_sentence_terms`);
        - the terms which are Entity instances (`iter_entities`).

    According to the original notes, the terms cover doc words, tokens,
    entities (positions) and frames (FrameVariants).

    Limitations:
        IN MEMORY implementation (all the sentences are stored in a list)
    """

    def __init__(self, doc_id, parsed_sentences):
        """
        doc_id: identifier of the related document
        parsed_sentences: iterable of parsed sentences
            (the terms iteration asserts them to be BaseParsedText instances)
            NOTE: Considered sentences with labeled Entities in it!
        """
        assert(isinstance(parsed_sentences, Iterable))

        self.__doc_id = doc_id
        # Materialized, so the sentences could be iterated many times and indexed.
        self.__parsed_sentences = list(parsed_sentences)

    # region properties

    @property
    def RelatedDocID(self):
        return self.__doc_id

    # endregion

    # region private methods

    def __iter_all_raw_terms(self, filter_func=None, term_only=False):
        """ Iterates raw terms of all the sentences.
            Yields terms when `term_only`, otherwise
            (sentence index, index in sentence, term) triplets.
        """
        assert(callable(filter_func) or filter_func is None)
        assert(isinstance(term_only, bool))

        for s_ind, sentence in enumerate(self.__parsed_sentences):
            sentence_terms_it = self.__iter_sentence_raw_terms(sentence, filter_func=filter_func)
            for ind_in_sent, term in sentence_terms_it:
                yield term if term_only else (s_ind, ind_in_sent, term)

    @staticmethod
    def __iter_sentence_raw_terms(sentence, filter_func):
        """ Yields (index in sentence, term) pairs for the raw terms accepted by
            `filter_func`; indices are counted before the filtering.
        """
        assert(isinstance(sentence, BaseParsedText))
        assert(callable(filter_func) or filter_func is None)

        for ind_in_sent, term in enumerate(sentence.iter_terms(TermFormat.Raw)):
            if filter_func is None or filter_func(term):
                yield ind_in_sent, term

    # endregion

    # region public 'iter' methods

    def get_sentence(self, s_ind):
        assert(isinstance(s_ind, int))
        return self.__parsed_sentences[s_ind]

    def iter_entities(self):
        """ Iterates over the document terms which are Entity instances.
        """
        def is_entity(term):
            return isinstance(term, Entity)

        yield from self.__iter_all_raw_terms(term_only=True, filter_func=is_entity)

    def iter_terms(self, filter_func=None, term_only=True):
        """ Iterates over the document terms; see `term_only` for the
            format of the yielded items (terms or triplets with indices).
        """
        yield from self.__iter_all_raw_terms(term_only=term_only, filter_func=filter_func)

    def iter_sentence_terms(self, sentence_index, return_id, filter_func=None):
        """ Iterates over the terms of a single sentence.
            return_id: bool
                when True, (index in sentence, term) pairs are yielded.
        """
        assert(isinstance(sentence_index, int))
        assert(isinstance(return_id, bool))
        assert(callable(filter_func) or filter_func is None)

        sentence = self.__parsed_sentences[sentence_index]
        indexed_terms_it = self.__iter_sentence_raw_terms(sentence=sentence, filter_func=filter_func)

        for ind_in_sent, term in indexed_terms_it:
            yield (ind_in_sent, term) if return_id else term

    # endregion

    def __iter__(self):
        yield from self.__parsed_sentences
|
|
File without changes
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from arekit.common.entities.base import Entity
|
|
2
|
+
from arekit.common.docs.entity import DocumentEntity
|
|
3
|
+
from arekit.common.docs.parsed.base import ParsedDocument
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BaseParsedDocumentServiceProvider(object):
    """ Base class of the providers that are initialized from a ParsedDocument.

        After `init_parsed_doc` the provider keeps:
            - `_doc_entities`: DocumentEntity instances created for every entity
              of the document and for the childs of these entities;
            - a mapping from the outside entity index onto the related
              DocumentEntity (filled only when `entity_index_func` is given).
    """

    def __init__(self, entity_index_func=None):
        """ Outside entity indexing function
            entity_index_func: provides id for a given entity, i.e.
                func(entity) -> int (id)
                None (default) disables the mapping from entities
                onto document entities.
        """
        assert(callable(entity_index_func) or entity_index_func is None)
        self._doc_entities = None
        self.__entity_map = {}
        self.__entity_index_func = entity_index_func

    @property
    def Name(self):
        raise NotImplementedError()

    def init_parsed_doc(self, parsed_doc):
        """ (Re)creates document entities and the entity mapping
            for the given parsed document.
        """
        assert(isinstance(parsed_doc, ParsedDocument))

        def iter_childs_and_root_node(entity):
            """ Note: Entity has childs and we would like to iterate over childs
                to consider them as well as keep the root Node.
                Yields (entity, is_child) pairs.
            """
            # We first add childs.
            for child_entity in entity.iter_childs():
                yield child_entity, True

            # Return Root node.
            yield entity, False

        self._doc_entities = []
        self.__entity_map.clear()

        # Ids are assigned sequentially, one per appended document entity,
        # so an id is also the position of the entity in `_doc_entities`.
        current_id = 0
        for entity in parsed_doc.iter_entities():

            # Filled while iterating childs; the root (yielded last) refers to this list.
            child_doc_entities = []
            for tree_entity, is_child in iter_childs_and_root_node(entity):

                doc_entity = DocumentEntity(id_in_doc=current_id,
                                            value=tree_entity.Value,
                                            e_type=tree_entity.Type,
                                            display_value=tree_entity.DisplayValue,
                                            childs=None if is_child else child_doc_entities,
                                            group_index=tree_entity.GroupIndex)
                current_id += 1

                if is_child:
                    child_doc_entities.append(doc_entity)

                self._doc_entities.append(doc_entity)

                if self.__entity_index_func is not None:
                    self.__entity_map[self.__entity_index_func(tree_entity)] = doc_entity

    def get_document_entity(self, entity):
        """ Maps entity to the related one with DocumentEntity type

            Raises TypeError when the provider has no entity indexing function
            and KeyError when the entity is not registered.
        """
        assert(isinstance(entity, Entity))

        if self.__entity_index_func is None:
            # Same exception type as calling None had produced, but with a clear reason.
            raise TypeError("entity_index_func is not set: entities cannot be mapped onto document entities")

        return self.__entity_map[self.__entity_index_func(entity)]

    def contains_entity(self, entity):
        """ Checks whether the entity is mapped onto a document entity.
        """
        if self.__entity_index_func is None:
            # The mapping is filled only when the indexing function is given,
            # therefore nothing could be registered.
            return False

        return self.__entity_index_func(entity) in self.__entity_map
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from arekit.common.labels.provider.base import BasePairLabelProvider
|
|
2
|
+
from arekit.common.docs.parsed.providers.base import BaseParsedDocumentServiceProvider
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BasePairProvider(BaseParsedDocumentServiceProvider):
    """ Base class of the providers that compose labeled pairs of document entities.
        Subclasses define `Name` and the pair construction in `_create_pair`.
    """

    @property
    def Name(self):
        raise NotImplementedError()

    def _create_pair(self, source_entity, target_entity, label):
        """ Creates the pair object yielded by the iteration methods.
        """
        raise NotImplementedError()

    # region private methods

    def _iter_from_entities(self, src_entity_doc_ids, tgt_entity_doc_ids, label_provider, filter_func=None):
        """ Yields pairs for every (source, target) combination of the given ids.

            src_entity_doc_ids, tgt_entity_doc_ids: lists of int
                positions of the entities in `_doc_entities`.
            label_provider: BasePairLabelProvider
            filter_func: func(source_entity, target_entity) -> bool, or None
                combinations for which it returns a false value are skipped.
        """
        assert(isinstance(src_entity_doc_ids, list))
        assert(isinstance(tgt_entity_doc_ids, list))
        assert(isinstance(label_provider, BasePairLabelProvider))
        assert(callable(filter_func) or filter_func is None)

        for src_e_doc_id in src_entity_doc_ids:
            for tgt_e_doc_id in tgt_entity_doc_ids:
                assert(isinstance(src_e_doc_id, int))
                assert(isinstance(tgt_e_doc_id, int))

                # Extract entities by doc_id.
                source_entity = self._doc_entities[src_e_doc_id]
                target_entity = self._doc_entities[tgt_e_doc_id]

                # Skip filtered combinations first, then pairs of an entity with itself.
                rejected = filter_func is not None and not filter_func(source_entity, target_entity)
                if rejected or source_entity == target_entity:
                    continue

                pair_label = label_provider.provide(source=source_entity, target=target_entity)

                yield self._create_pair(source_entity=source_entity,
                                        target_entity=target_entity,
                                        label=pair_label)

    # endregion

    def iter_from_all(self, label_provider, filter_func):
        """ Iterates pairs composed from all the document entities.
        """
        assert(isinstance(label_provider, BasePairLabelProvider))

        source_ids = [doc_entity.IdInDocument for doc_entity in self._doc_entities]
        target_ids = [doc_entity.IdInDocument for doc_entity in self._doc_entities]

        return self._iter_from_entities(src_entity_doc_ids=source_ids,
                                        tgt_entity_doc_ids=target_ids,
                                        label_provider=label_provider,
                                        filter_func=filter_func)
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
from arekit.common.entities.base import Entity
|
|
4
|
+
from arekit.common.docs.entity import DocumentEntity
|
|
5
|
+
from arekit.common.docs.parsed.base import ParsedDocument
|
|
6
|
+
from arekit.common.docs.parsed.providers.base import BaseParsedDocumentServiceProvider
|
|
7
|
+
from arekit.common.docs.parsed.term_position import TermPositionTypes, TermPosition
|
|
8
|
+
from arekit.common.text_opinions.base import TextOpinion
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EntityEndType(Enum):
    """ Identifies one of the two ends of a pair: the source or the target.
    """

    # Start of the pair.
    Source = 1

    # End of the pair.
    Target = 2
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DistanceType(Enum):
    """ Kind of a distance: counted in terms or in sentences.
    """

    InTerms = 1
    InSentences = 2

    @staticmethod
    def to_position_type(dist_type):
        """ Maps the distance type onto the related TermPositionTypes value:
            InTerms -> IndexInDocument, InSentences -> SentenceIndex.
        """
        assert(isinstance(dist_type, DistanceType))

        if dist_type == DistanceType.InSentences:
            return TermPositionTypes.SentenceIndex
        elif dist_type == DistanceType.InTerms:
            return TermPositionTypes.IndexInDocument

        # Not reachable for the declared members.
        return None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class EntityServiceProvider(BaseParsedDocumentServiceProvider):
    """ Helper functions for TextOpinions: entity values, entity positions
        and distances between the ends of a text opinion.

        Public methods working with a TextOpinion carry the 'extract' prefix
        to emphasize that they are helper methods.

        Wraps:
            the parsed document and the entity positions computed from it.
    """

    NAME = "entity-service-provider"

    def __init__(self, entity_index_func):
        assert(callable(entity_index_func))
        super(EntityServiceProvider, self).__init__(entity_index_func=entity_index_func)
        # Both are assigned in init_parsed_doc.
        self.__iter_raw_terms_func = None
        self.__entity_positions = None

    @property
    def Name(self):
        return self.NAME

    def init_parsed_doc(self, parsed_doc):
        """ Bind the provider to a parsed document and compute the entity positions.
        """
        super(EntityServiceProvider, self).init_parsed_doc(parsed_doc)
        assert(isinstance(parsed_doc, ParsedDocument))

        def iter_raw_terms():
            return parsed_doc.iter_terms(filter_func=None, term_only=False)

        self.__iter_raw_terms_func = iter_raw_terms
        self.__entity_positions = self.__calculate_entity_positions()

    # region public 'extract' methods

    def extract_entity_value(self, text_opinion, end_type):
        """ Value of the entity at the given end of the text opinion.
        """
        return self.__extract_entity_value(text_opinion=text_opinion, end_type=end_type)

    def extract_entity_position(self, text_opinion, end_type, position_type=None):
        """ Position of the entity at the given end of the text opinion.

            returns: TermPosition when position_type is None, otherwise
                     the index selected by position_type.
        """
        return self.__get_entity_position(text_opinion=text_opinion,
                                          end_type=end_type,
                                          position_type=position_type)

    # endregion

    # region public 'calculate' methods

    @staticmethod
    def calc_dist_between_text_opinion_end_indices(pos1_ind, pos2_ind):
        """ Absolute difference between two indices.
        """
        return EntityServiceProvider.__calc_distance_by_inds(pos1_ind=pos1_ind, pos2_ind=pos2_ind)

    def calc_dist_between_text_opinion_ends(self, text_opinion, distance_type):
        """ Distance between the source and the target entity of the text opinion,
            measured in the units given by distance_type.
        """
        assert(isinstance(text_opinion, TextOpinion))
        assert(isinstance(distance_type, DistanceType))

        source_id = self.__get_end_id(text_opinion=text_opinion, end_type=EntityEndType.Source)
        target_id = self.__get_end_id(text_opinion=text_opinion, end_type=EntityEndType.Target)

        source_pos = self.get_entity_position(id_in_document=source_id)
        target_pos = self.get_entity_position(id_in_document=target_id)

        return self.__calc_distance(pos1=source_pos,
                                    pos2=target_pos,
                                    position_type=DistanceType.to_position_type(distance_type))

    def calc_dist_between_entities(self, e1, e2, distance_type):
        """ Distance between two document entities, measured in the units
            given by distance_type.
        """
        assert(isinstance(e1, DocumentEntity))
        assert(isinstance(e2, DocumentEntity))
        assert(isinstance(distance_type, DistanceType))

        first_pos = self.get_entity_position(e1.IdInDocument)
        second_pos = self.get_entity_position(e2.IdInDocument)

        return self.__calc_distance(pos1=first_pos,
                                    pos2=second_pos,
                                    position_type=DistanceType.to_position_type(distance_type))

    def get_entity_position(self, id_in_document, position_type=None):
        """ returns: TermPosition or int
        """
        assert(position_type is None or isinstance(position_type, TermPositionTypes))

        position = self.__entity_positions[id_in_document]
        assert(isinstance(position, TermPosition))

        # Without a position type the whole TermPosition is handed back.
        return position if position_type is None else position.get_index(position_type)

    def get_entity_value(self, id_in_document):
        """ Value of the document entity stored under id_in_document.
        """
        found = self._doc_entities[id_in_document]
        assert(isinstance(found, Entity))
        return found.Value

    # endregion

    # region private methods

    def __extract_entity_value(self, text_opinion, end_type):
        assert(isinstance(text_opinion, TextOpinion))
        entity_id = self.__get_end_id(text_opinion=text_opinion, end_type=end_type)
        return self.get_entity_value(entity_id)

    def __get_entity_position(self, text_opinion, end_type, position_type=None):
        assert(isinstance(text_opinion, TextOpinion))
        entity_id = self.__get_end_id(text_opinion=text_opinion, end_type=end_type)
        return self.get_entity_position(entity_id, position_type)

    def __calc_distance(self, pos1, pos2, position_type=TermPositionTypes.IndexInDocument):
        assert(isinstance(pos1, TermPosition))
        assert(isinstance(pos2, TermPosition))
        first_ind = pos1.get_index(position_type)
        second_ind = pos2.get_index(position_type)
        return self.__calc_distance_by_inds(pos1_ind=first_ind, pos2_ind=second_ind)

    @staticmethod
    def __calc_distance_by_inds(pos1_ind, pos2_ind):
        return abs(pos1_ind - pos2_ind)

    @staticmethod
    def __get_end_id(text_opinion, end_type):
        assert(isinstance(text_opinion, TextOpinion))
        is_source = end_type == EntityEndType.Source
        assert(is_source or end_type == EntityEndType.Target)
        if is_source:
            return text_opinion.SourceId
        return text_opinion.TargetId

    def __calculate_entity_positions(self):
        """ Build a dict: entity id in document -> TermPosition.

            Terms are visited in the order given by the raw terms iterator;
            the index in document counts every term, not only entities.
        """
        positions = {}

        for t_ind_in_doc, (s_ind, t_ind_in_sent, term) in enumerate(self.__iter_raw_terms_func()):

            if not isinstance(term, Entity):
                continue

            # We consider that entities within a single tree has the same positions.
            tree_entities = list(term.iter_childs())
            tree_entities.append(term)

            for tree_entity in tree_entities:
                key = self.get_document_entity(tree_entity).IdInDocument
                assert(key not in positions)

                positions[key] = TermPosition(term_ind_in_doc=t_ind_in_doc,
                                              term_ind_in_sent=t_ind_in_sent,
                                              s_ind=s_ind)

        return positions

    # endregion
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from arekit.common.entities.base import Entity
|
|
2
|
+
from arekit.common.docs.parsed.providers.base_pairs import BasePairProvider
|
|
3
|
+
from arekit.common.opinions.base import Opinion
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class OpinionPairsProvider(BasePairProvider):
    """ Pair provider that builds Opinion instances from the values of two entities.
    """

    NAME = "opinion-pairs-provider"

    @property
    def Name(self):
        return self.NAME

    def _create_pair(self, source_entity, target_entity, label):
        """ Compose an Opinion from the entity values and the given label.
        """
        assert(isinstance(source_entity, Entity))
        assert(isinstance(target_entity, Entity))

        source_value = source_entity.Value
        target_value = target_entity.Value

        return Opinion(source_value=source_value,
                       target_value=target_value,
                       label=label)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from arekit.common.entities.collection import EntityCollection
|
|
4
|
+
from arekit.common.docs.entity import DocumentEntity
|
|
5
|
+
from arekit.common.docs.parsed.providers.base_pairs import BasePairProvider
|
|
6
|
+
from arekit.common.opinions.base import Opinion
|
|
7
|
+
from arekit.common.text_opinions.base import TextOpinion
|
|
8
|
+
from arekit.common.labels.provider.constant import ConstantLabelProvider
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TextOpinionPairsProvider(BasePairProvider):
    """ Document Related text opinion provider.

        Builds TextOpinion instances for pairs of document entities and
        keeps an EntityCollection of the document entities for lookups by value.
    """

    NAME = "text-opinion-pairs-provider"

    def __init__(self, value_to_group_id_func):
        """ value_to_group_id_func:
                forwarded to the EntityCollection created in init_parsed_doc.
        """
        super(TextOpinionPairsProvider, self).__init__()
        self.__value_to_group_id_func = value_to_group_id_func
        # Both are assigned in init_parsed_doc.
        self.__doc_id = None
        self.__entities_collection = None

    @property
    def Name(self):
        return self.NAME

    def _create_pair(self, source_entity, target_entity, label):
        """ Compose a TextOpinion for the current document from two document entities.
            The text opinion identifier is left as None.
        """
        assert(isinstance(source_entity, DocumentEntity))
        assert(isinstance(target_entity, DocumentEntity))

        return TextOpinion(doc_id=self.__doc_id,
                           source_id=source_entity.IdInDocument,
                           target_id=target_entity.IdInDocument,
                           label=label,
                           text_opinion_id=None)

    def init_parsed_doc(self, parsed_doc):
        """ Bind the provider to a parsed document: remember its related document id
            and index the document entities in an EntityCollection.
        """
        super(TextOpinionPairsProvider, self).init_parsed_doc(parsed_doc)
        self.__doc_id = parsed_doc.RelatedDocID
        self.__entities_collection = EntityCollection(
            entities=list(self._doc_entities),
            value_to_group_id_func=self.__value_to_group_id_func)

    def iter_from_opinion(self, opinion, debug=False):
        """ Provides text-level opinion extraction by document-level opinions
            (Opinion class instances), for a particular document (doc_id),
            with the related entity collection.

            opinion: Opinion
                document-level opinion; its label is assigned to every pair.
            debug: bool
                when True, an info message is logged if no entity matches
                the source or the target value of the opinion.

            Yields nothing when entities for either end are not found.
        """
        assert(isinstance(opinion, Opinion))

        key = EntityCollection.KeyType.BY_SYNONYMS
        source_entities = self.__entities_collection.try_get_entities(opinion.SourceValue, group_key=key)
        target_entities = self.__entities_collection.try_get_entities(opinion.TargetValue, group_key=key)

        # The later `yield from` already makes this function a generator,
        # so a plain `return` is enough to produce an empty iteration.
        if source_entities is None:
            if debug:
                logger.info("Appropriate entity for '{}'->'...' has not been found".format(
                    opinion.SourceValue))
            return

        if target_entities is None:
            if debug:
                logger.info("Appropriate entity for '...'->'{}' has not been found".format(
                    opinion.TargetValue))
            return

        label_provider = ConstantLabelProvider(label_instance=opinion.Label)

        yield from self._iter_from_entities(
            src_entity_doc_ids=[e.IdInDocument for e in source_entities],
            tgt_entity_doc_ids=[e.IdInDocument for e in target_entities],
            label_provider=label_provider)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from arekit.common.docs.parsed.base import ParsedDocument
|
|
2
|
+
from arekit.common.docs.parsed.providers.base import BaseParsedDocumentServiceProvider
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ParsedDocumentService(object):
    """ A parsed document together with a set of providers, addressable by name.
    """

    def __init__(self, parsed_doc, providers):
        """ parsed_doc: ParsedDocument
            providers: list of BaseParsedDocumentServiceProvider
                every provider must have a unique Name; each one is
                initialized with parsed_doc right after registration.
        """
        assert(isinstance(parsed_doc, ParsedDocument))
        assert(isinstance(providers, list))

        self.__parsed_doc = parsed_doc
        self.__providers = {}

        for provider in providers:
            assert(isinstance(provider, BaseParsedDocumentServiceProvider))

            name = provider.Name
            assert(name not in self.__providers)

            # Register under the provider name, then post-initialize with the document.
            self.__providers[name] = provider
            provider.init_parsed_doc(self.__parsed_doc)

    @property
    def ParsedDocument(self):
        return self.__parsed_doc

    def get_provider(self, name):
        """ Provider registered under the given name (KeyError when absent).
        """
        return self.__providers[name]
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class TermPosition:
    """ Position of a term described by three indices: the term index in the document,
        the term index in its sentence, and the sentence index.
    """

    def __init__(self, term_ind_in_doc, term_ind_in_sent, s_ind):
        self.__t_ind_in_doc = term_ind_in_doc
        self.__t_ind_in_sent = term_ind_in_sent
        self.__s_ind = s_ind

    def get_index(self, position_type):
        """ Return the index selected by position_type (a TermPositionTypes member);
            None for a member without a stored index.
        """
        assert(isinstance(position_type, TermPositionTypes))

        index_by_type = {
            TermPositionTypes.IndexInDocument: self.__t_ind_in_doc,
            TermPositionTypes.IndexInSentence: self.__t_ind_in_sent,
            TermPositionTypes.SentenceIndex: self.__s_ind,
        }

        return index_by_type.get(position_type)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class TermPositionTypes(Enum):
    """ Kinds of index that describe where a term is located.
    """

    # Index of the term in the whole document
    # (the document is considered as a sequence of terms).
    IndexInDocument = 1

    # Index of the term within its sentence.
    IndexInSentence = 2

    # Index of the sentence in the whole document.
    SentenceIndex = 3
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from arekit.common.docs.base import Document
|
|
2
|
+
from arekit.common.docs.parsed.base import ParsedDocument
|
|
3
|
+
from arekit.common.pipeline.context import PipelineContext
|
|
4
|
+
from arekit.common.text.parser import BaseTextParser
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DocumentParser(object):
    """ Runs a text parser over every sentence of a document and collects
        the results into a ParsedDocument.
    """

    @staticmethod
    def __get_sent(doc, sent_ind):
        return doc.get_sentence(sent_ind)

    @staticmethod
    def parse(doc, text_parser, parent_ppl_ctx=None):
        """ doc: Document
            text_parser: BaseTextParser
                run once per sentence, in sentence order.
            parent_ppl_ctx: PipelineContext or None
                forwarded to every parser run as the parent context.

            returns: ParsedDocument with the document ID and the parsed sentences.
        """
        assert(isinstance(doc, Document))
        assert(isinstance(text_parser, BaseTextParser))
        assert(parent_ppl_ctx is None or isinstance(parent_ppl_ctx, PipelineContext))

        parsed_sentences = []
        for sent_ind in range(doc.SentencesCount):
            text = DocumentParser.__get_sent(doc, sent_ind).Text
            params = DocumentParser.__create_ppl_params(doc=doc, sent_ind=sent_ind)
            parsed = text_parser.run(input_data=text,
                                     params_dict=params,
                                     parent_ctx=parent_ppl_ctx)
            parsed_sentences.append(parsed)

        return ParsedDocument(doc_id=doc.ID,
                              parsed_sentences=parsed_sentences)

    @staticmethod
    def __create_ppl_params(doc, sent_ind):
        """ Parameters handed to the text parser for a single sentence.
        """
        assert(isinstance(doc, Document))

        params = {}
        params["s_ind"] = sent_ind                                    # sentence index. (as Metadata)
        params["doc_id"] = doc.ID                                     # document index. (as Metadata)
        params["sentence"] = DocumentParser.__get_sent(doc, sent_ind)  # Required for special sources.
        return params
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
|
|
2
|
+
class BaseDocumentSentence(object):
    """ Holder of a single sentence content.
    """

    def __init__(self, text):
        # Stored as given; no conversion or validation is applied.
        self.__text = text

    @property
    def Text(self):
        """ Sentence content exactly as passed to the constructor.
            May be of any type, for example:
                - str: original text as string
                - list of words: separated by words/tokens
        """
        return self.__text
|
|
File without changes
|