arekit 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/__init__.py +0 -0
- arekit/common/__init__.py +0 -0
- arekit/common/bound.py +48 -0
- arekit/common/context/__init__.py +0 -0
- arekit/common/context/terms_mapper.py +51 -0
- arekit/common/context/token.py +16 -0
- arekit/common/data/__init__.py +0 -0
- arekit/common/data/const.py +21 -0
- arekit/common/data/doc_provider.py +6 -0
- arekit/common/data/input/__init__.py +0 -0
- arekit/common/data/input/providers/__init__.py +0 -0
- arekit/common/data/input/providers/columns/__init__.py +0 -0
- arekit/common/data/input/providers/columns/base.py +9 -0
- arekit/common/data/input/providers/columns/sample.py +59 -0
- arekit/common/data/input/providers/const.py +3 -0
- arekit/common/data/input/providers/contents.py +9 -0
- arekit/common/data/input/providers/instances/__init__.py +0 -0
- arekit/common/data/input/providers/instances/base.py +14 -0
- arekit/common/data/input/providers/instances/multiple.py +27 -0
- arekit/common/data/input/providers/instances/single.py +8 -0
- arekit/common/data/input/providers/label/__init__.py +0 -0
- arekit/common/data/input/providers/label/base.py +24 -0
- arekit/common/data/input/providers/label/binary.py +11 -0
- arekit/common/data/input/providers/label/multiple.py +15 -0
- arekit/common/data/input/providers/rows/__init__.py +0 -0
- arekit/common/data/input/providers/rows/base.py +64 -0
- arekit/common/data/input/providers/rows/samples.py +227 -0
- arekit/common/data/input/providers/sample/__init__.py +0 -0
- arekit/common/data/input/providers/sample/cropped.py +43 -0
- arekit/common/data/input/providers/text/__init__.py +0 -0
- arekit/common/data/input/providers/text/single.py +49 -0
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +68 -0
- arekit/common/data/input/repositories/sample.py +22 -0
- arekit/common/data/input/sample.py +66 -0
- arekit/common/data/input/terms_mapper.py +88 -0
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/__init__.py +0 -0
- arekit/common/data/storages/base.py +109 -0
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +26 -0
- arekit/common/docs/__init__.py +0 -0
- arekit/common/docs/base.py +30 -0
- arekit/common/docs/entities_grouping.py +16 -0
- arekit/common/docs/entity.py +18 -0
- arekit/common/docs/objects_parser.py +37 -0
- arekit/common/docs/parsed/__init__.py +0 -0
- arekit/common/docs/parsed/base.py +101 -0
- arekit/common/docs/parsed/providers/__init__.py +0 -0
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/docs/parsed/providers/base_pairs.py +51 -0
- arekit/common/docs/parsed/providers/entity_service.py +175 -0
- arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
- arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parsed/term_position.py +42 -0
- arekit/common/docs/parser.py +34 -0
- arekit/common/docs/sentence.py +14 -0
- arekit/common/entities/__init__.py +0 -0
- arekit/common/entities/base.py +51 -0
- arekit/common/entities/collection.py +72 -0
- arekit/common/entities/str_fmt.py +8 -0
- arekit/common/entities/types.py +9 -0
- arekit/common/experiment/__init__.py +0 -0
- arekit/common/experiment/api/__init__.py +0 -0
- arekit/common/experiment/api/base_samples_io.py +20 -0
- arekit/common/experiment/data_type.py +17 -0
- arekit/common/frames/__init__.py +0 -0
- arekit/common/frames/connotations/__init__.py +0 -0
- arekit/common/frames/connotations/descriptor.py +17 -0
- arekit/common/frames/connotations/provider.py +4 -0
- arekit/common/frames/text_variant.py +43 -0
- arekit/common/frames/variants/__init__.py +0 -0
- arekit/common/frames/variants/base.py +21 -0
- arekit/common/frames/variants/collection.py +60 -0
- arekit/common/labels/__init__.py +0 -0
- arekit/common/labels/base.py +19 -0
- arekit/common/labels/provider/__init__.py +0 -0
- arekit/common/labels/provider/base.py +7 -0
- arekit/common/labels/provider/constant.py +14 -0
- arekit/common/labels/scaler/__init__.py +0 -0
- arekit/common/labels/scaler/base.py +85 -0
- arekit/common/labels/scaler/sentiment.py +7 -0
- arekit/common/labels/scaler/single.py +10 -0
- arekit/common/labels/str_fmt.py +55 -0
- arekit/common/linkage/__init__.py +0 -0
- arekit/common/linkage/base.py +44 -0
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +9 -0
- arekit/common/linkage/text_opinions.py +22 -0
- arekit/common/log_utils.py +29 -0
- arekit/common/model/__init__.py +0 -0
- arekit/common/model/labeling/__init__.py +0 -0
- arekit/common/model/labeling/base.py +24 -0
- arekit/common/model/labeling/modes.py +8 -0
- arekit/common/model/labeling/single.py +24 -0
- arekit/common/opinions/__init__.py +0 -0
- arekit/common/opinions/annot/__init__.py +0 -0
- arekit/common/opinions/annot/algo/__init__.py +0 -0
- arekit/common/opinions/annot/algo/base.py +4 -0
- arekit/common/opinions/annot/algo/pair_based.py +99 -0
- arekit/common/opinions/annot/algo/predefined.py +16 -0
- arekit/common/opinions/annot/algo_based.py +55 -0
- arekit/common/opinions/annot/base.py +15 -0
- arekit/common/opinions/base.py +74 -0
- arekit/common/opinions/collection.py +150 -0
- arekit/common/opinions/enums.py +6 -0
- arekit/common/opinions/provider.py +4 -0
- arekit/common/opinions/writer.py +4 -0
- arekit/common/pipeline/__init__.py +0 -0
- arekit/common/pipeline/base.py +25 -0
- arekit/common/pipeline/context.py +36 -0
- arekit/common/pipeline/conts.py +2 -0
- arekit/common/pipeline/items/__init__.py +0 -0
- arekit/common/pipeline/items/base.py +12 -0
- arekit/common/pipeline/items/flatten.py +14 -0
- arekit/common/pipeline/items/handle.py +17 -0
- arekit/common/pipeline/items/iter.py +11 -0
- arekit/common/pipeline/items/map.py +11 -0
- arekit/common/pipeline/items/map_nested.py +13 -0
- arekit/common/synonyms/__init__.py +0 -0
- arekit/common/synonyms/base.py +151 -0
- arekit/common/synonyms/grouping.py +21 -0
- arekit/common/text/__init__.py +0 -0
- arekit/common/text/enums.py +12 -0
- arekit/common/text/parsed.py +42 -0
- arekit/common/text/parser.py +12 -0
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +4 -0
- arekit/common/text/partitioning/str.py +36 -0
- arekit/common/text/partitioning/terms.py +35 -0
- arekit/common/text/stemmer.py +16 -0
- arekit/common/text_opinions/__init__.py +0 -0
- arekit/common/text_opinions/base.py +105 -0
- arekit/common/utils.py +129 -0
- arekit/contrib/__init__.py +0 -0
- arekit/contrib/bert/__init__.py +0 -0
- arekit/contrib/bert/input/__init__.py +0 -0
- arekit/contrib/bert/input/providers/__init__.py +0 -0
- arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
- arekit/contrib/bert/input/providers/text_pair.py +62 -0
- arekit/contrib/bert/terms/__init__.py +0 -0
- arekit/contrib/bert/terms/mapper.py +20 -0
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +149 -0
- arekit/contrib/networks/embedding_io.py +18 -0
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +6 -0
- arekit/contrib/networks/input/ctx_serialization.py +28 -0
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +29 -0
- arekit/contrib/networks/input/embedding/offsets.py +55 -0
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +129 -0
- arekit/contrib/networks/input/providers/term_connotation.py +23 -0
- arekit/contrib/networks/input/providers/text.py +24 -0
- arekit/contrib/networks/input/rows_parser.py +47 -0
- arekit/contrib/networks/input/term_types.py +13 -0
- arekit/contrib/networks/input/terms_mapping.py +60 -0
- arekit/contrib/networks/vectorizer.py +6 -0
- arekit/contrib/prompt/__init__.py +0 -0
- arekit/contrib/prompt/sample.py +61 -0
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +84 -0
- arekit/contrib/source/brat/doc.py +28 -0
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +13 -0
- arekit/contrib/source/brat/entities/entity.py +42 -0
- arekit/contrib/source/brat/entities/parser.py +53 -0
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +19 -0
- arekit/contrib/source/brat/relation.py +32 -0
- arekit/contrib/source/brat/sentence.py +69 -0
- arekit/contrib/source/brat/sentences_reader.py +128 -0
- arekit/contrib/source/download.py +41 -0
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +55 -0
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +74 -0
- arekit/contrib/source/nerel/io_utils.py +62 -0
- arekit/contrib/source/nerel/labels.py +241 -0
- arekit/contrib/source/nerel/reader.py +46 -0
- arekit/contrib/source/nerel/utils.py +24 -0
- arekit/contrib/source/nerel/versions.py +12 -0
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +62 -0
- arekit/contrib/source/nerelbio/labels.py +265 -0
- arekit/contrib/source/nerelbio/reader.py +8 -0
- arekit/contrib/source/nerelbio/versions.py +8 -0
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +36 -0
- arekit/contrib/source/ruattitudes/doc.py +51 -0
- arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
- arekit/contrib/source/ruattitudes/io_utils.py +56 -0
- arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
- arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
- arekit/contrib/source/ruattitudes/reader.py +268 -0
- arekit/contrib/source/ruattitudes/sentence.py +73 -0
- arekit/contrib/source/ruattitudes/synonyms.py +17 -0
- arekit/contrib/source/ruattitudes/text_object.py +59 -0
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +157 -0
- arekit/contrib/source/rusentiframes/effect.py +24 -0
- arekit/contrib/source/rusentiframes/io_utils.py +19 -0
- arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
- arekit/contrib/source/rusentiframes/polarity.py +35 -0
- arekit/contrib/source/rusentiframes/role.py +15 -0
- arekit/contrib/source/rusentiframes/state.py +24 -0
- arekit/contrib/source/rusentiframes/types.py +42 -0
- arekit/contrib/source/rusentiframes/value.py +2 -0
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +3 -0
- arekit/contrib/source/rusentrel/docs_reader.py +51 -0
- arekit/contrib/source/rusentrel/entities.py +26 -0
- arekit/contrib/source/rusentrel/io_utils.py +125 -0
- arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
- arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
- arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
- arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
- arekit/contrib/source/rusentrel/synonyms.py +17 -0
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +52 -0
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +31 -0
- arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
- arekit/contrib/source/sentinerel/io_utils.py +87 -0
- arekit/contrib/source/sentinerel/labels.py +53 -0
- arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
- arekit/contrib/source/sentinerel/reader.py +42 -0
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +19 -0
- arekit/contrib/source/zip_utils.py +47 -0
- arekit/contrib/utils/__init__.py +0 -0
- arekit/contrib/utils/bert/__init__.py +0 -0
- arekit/contrib/utils/bert/samplers.py +17 -0
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
- arekit/contrib/utils/data/__init__.py +0 -0
- arekit/contrib/utils/data/contents/__init__.py +0 -0
- arekit/contrib/utils/data/contents/opinions.py +37 -0
- arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +7 -0
- arekit/contrib/utils/data/readers/csv_pd.py +38 -0
- arekit/contrib/utils/data/readers/jsonl.py +15 -0
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +50 -0
- arekit/contrib/utils/data/storages/__init__.py +0 -0
- arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
- arekit/contrib/utils/data/storages/pandas_based.py +123 -0
- arekit/contrib/utils/data/storages/row_cache.py +48 -0
- arekit/contrib/utils/data/writers/__init__.py +0 -0
- arekit/contrib/utils/data/writers/base.py +27 -0
- arekit/contrib/utils/data/writers/csv_native.py +63 -0
- arekit/contrib/utils/data/writers/csv_pd.py +40 -0
- arekit/contrib/utils/data/writers/json_opennre.py +132 -0
- arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
- arekit/contrib/utils/download.py +77 -0
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +58 -0
- arekit/contrib/utils/embeddings/tokens.py +30 -0
- arekit/contrib/utils/entities/__init__.py +0 -0
- arekit/contrib/utils/entities/filter.py +7 -0
- arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit/contrib/utils/entities/formatters/str_display.py +11 -0
- arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
- arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit/contrib/utils/io_utils/embedding.py +72 -0
- arekit/contrib/utils/io_utils/opinions.py +37 -0
- arekit/contrib/utils/io_utils/samples.py +79 -0
- arekit/contrib/utils/io_utils/utils.py +39 -0
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +41 -0
- arekit/contrib/utils/lexicons/relation.py +42 -0
- arekit/contrib/utils/lexicons/rusentilex.py +37 -0
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +83 -0
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +22 -0
- arekit/contrib/utils/np_utils/npz_utils.py +13 -0
- arekit/contrib/utils/np_utils/vocab.py +20 -0
- arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
- arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
- arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
- arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
- arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
- arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
- arekit/contrib/utils/processing/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +12 -0
- arekit/contrib/utils/processing/languages/pos.py +23 -0
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
- arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
- arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
- arekit/contrib/utils/processing/languages/ru/number.py +23 -0
- arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +12 -0
- arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
- arekit/contrib/utils/processing/pos/russian.py +10 -0
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +127 -0
- arekit/contrib/utils/resources.py +25 -0
- arekit/contrib/utils/serializer.py +43 -0
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
- arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit/contrib/utils/synonyms/simple.py +15 -0
- arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +93 -0
- arekit/contrib/utils/vectorizers/random_norm.py +39 -0
- arekit/download_data.py +11 -0
- arekit-0.24.0.dist-info/LICENSE +21 -0
- arekit-0.24.0.dist-info/METADATA +23 -0
- arekit-0.24.0.dist-info/RECORD +374 -0
- arekit-0.24.0.dist-info/WHEEL +5 -0
- arekit-0.24.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from arekit.common.entities.base import Entity
|
|
2
|
+
from arekit.common.entities.types import OpinionEntityType
|
|
3
|
+
from arekit.contrib.utils.entities.filter import EntityFilter
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class EntityHelper(object):
    """ Catalogue of the named entity type tags, exposed as string constants.

        The value of every constant is equal to its own name
        (for instance, PERSON == "PERSON").
        As stated by the original author, the tags are based on
        the OntoNotes5 collection:
        https://catalog.ldc.upenn.edu/LDC2013T19
    """

    # People and the groups they belong to.
    PERSON = "PERSON"
    FAMILY = "FAMILY"
    PROFESSION = "PROFESSION"
    NATIONALITY = "NATIONALITY"
    ORGANIZATION = "ORGANIZATION"

    # Places.
    CITY = "CITY"
    COUNTRY = "COUNTRY"
    DISTRICT = "DISTRICT"
    FACILITY = "FACILITY"
    LOCATION = "LOCATION"
    STATE_OR_PROVINCE = "STATE_OR_PROVINCE"

    # Time and quantities.
    AGE = "AGE"
    DATE = "DATE"
    TIME = "TIME"
    MONEY = "MONEY"
    NUMBER = "NUMBER"
    ORDINAL = "ORDINAL"
    PERCENT = "PERCENT"

    # Culture, beliefs and law.
    AWARD = "AWARD"
    IDEOLOGY = "IDEOLOGY"
    LANGUAGE = "LANGUAGE"
    LAW = "LAW"
    RELIGION = "RELIGION"
    WORK_OF_ART = "WORK_OF_ART"

    # Events and everything else.
    CRIME = "CRIME"
    DISEASE = "DISEASE"
    EVENT = "EVENT"
    PENALTY = "PENALTY"
    PRODUCT = "PRODUCT"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class SentiNerelEntityFilter(EntityFilter):
    """ Entity filter for the sentiment related extraction task
        over the SentiNEREL dataset.
    """

    def is_ignored(self, entity, e_type):
        """ Decide whether the entity has to be skipped for the given opinion end.

            For both Subject and Object the entity is kept only when its type is one of:
            [PERSON, ORGANIZATION, COUNTRY, PROFESSION].
            For any other `e_type` value the entity is always ignored.

            Parameters:
                entity: Entity
                    Entity to be checked; only its `Type` is consulted.
                e_type: OpinionEntityType
                    Opinion end the entity is considered for.

            Returns: bool
                True when the entity has to be ignored.
        """
        assert(isinstance(entity, Entity))
        assert(isinstance(e_type, OpinionEntityType))

        # Subject and Object share the very same rule,
        # so anything that is neither of them is rejected up front.
        is_opinion_end = e_type == OpinionEntityType.Subject or e_type == OpinionEntityType.Object
        if not is_opinion_end:
            return True

        allowed_types = (EntityHelper.PERSON,
                         EntityHelper.ORGANIZATION,
                         EntityHelper.COUNTRY,
                         EntityHelper.PROFESSION)

        return entity.Type not in allowed_types
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
+
from arekit.common.experiment.data_type import DataType
|
|
3
|
+
from arekit.common.labels.base import NoLabel
|
|
4
|
+
from arekit.common.labels.provider.constant import ConstantLabelProvider
|
|
5
|
+
from arekit.common.opinions.annot.algo.pair_based import PairBasedOpinionAnnotationAlgorithm
|
|
6
|
+
from arekit.common.opinions.collection import OpinionCollection
|
|
7
|
+
from arekit.common.synonyms.base import SynonymsCollection
|
|
8
|
+
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
|
|
9
|
+
from arekit.contrib.source.sentinerel.io_utils import SentiNerelVersions, SentiNerelIOUtils
|
|
10
|
+
from arekit.contrib.utils.pipelines.sources.sentinerel.doc_provider import SentiNERELDocProvider
|
|
11
|
+
from arekit.contrib.utils.pipelines.sources.sentinerel.labels_fmt import SentiNERELSentimentLabelFormatter
|
|
12
|
+
from arekit.contrib.utils.pipelines.text_opinion.annot.algo_based import AlgorithmBasedTextOpinionAnnotator
|
|
13
|
+
from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
|
|
14
|
+
from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
|
|
15
|
+
from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
|
|
16
|
+
from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
|
|
17
|
+
from arekit.contrib.utils.sources.sentinerel.text_opinion.prof_per_org_filter import \
|
|
18
|
+
ProfessionAsCharacteristicSentimentTextOpinionFilter
|
|
19
|
+
from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def create_text_opinion_extraction_pipeline(sentinerel_version,
                                            text_parser,
                                            custom_text_opinion_filters=None,
                                            label_formatter=SentiNERELSentimentLabelFormatter(),
                                            no_label=NoLabel(),
                                            terms_per_context=50,
                                            doc_provider=None,
                                            dist_in_sentences=0,
                                            docs_limit=None):
    """ Compose the sample generation pipelines (one per data type)
        for the SentiNEREL documents.

        SentiNEREL is a collection that becomes a part of the:
        1. Attitude extraction studies (AREkit focused studies):
           https://github.com/nicolay-r/SentiNEREL-attitude-extraction
        2. RuSentNE-2023 competitions under CODALAB platform (github page):
           https://github.com/dialogue-evaluation/RuSentNE-evaluation

        Parameters:
            sentinerel_version: SentiNerelVersions
                Version of the SentiNEREL collection.
            text_parser:
                The way of how the text is processed; forwarded as is to every pipeline.
            custom_text_opinion_filters: list or None
                Extra text opinion filters, appended after the two built-in ones.
            label_formatter:
                Formatter for labels which allows to: limit set of labels, and perform
                its conversion from string to actual python type.
                It is handed to the annotator of the predefined opinions.
            no_label:
                Label handed to the annotators that cover the remaining entity pairs.
            terms_per_context: int
                Amount of terms that we consider in between the Object and Subject;
                serves both the NoLabel annotators and the distance based filter.
            doc_provider: DocumentProvider or None
                In case of None the default initialization is performed: the dataset
                split is read and the SentiNEREL document provider is created from it.
            dist_in_sentences: int
                Distance in sentences, forwarded to the NoLabel annotators.
            docs_limit:
                Forwarded to the dataset split reading; it is consulted only
                within the default initialization.

        Returns: dict, or tuple of (dict, data_folding)
            Pipelines per every data type (Train, Test, Etalon, Dev).
            The data folding accompanies them only when the default initialization
            produced one (i.e. it is not None).
    """
    assert(isinstance(sentinerel_version, SentiNerelVersions))
    assert(isinstance(doc_provider, DocumentProvider) or doc_provider is None)
    assert(isinstance(custom_text_opinion_filters, list) or custom_text_opinion_filters is None)

    data_folding = None

    if doc_provider is None:
        # Default Initialization: rely on the split shipped with the collection.
        filenames_by_ids, data_folding = SentiNerelIOUtils.read_dataset_split(
            version=sentinerel_version,
            docs_limit=docs_limit)
        doc_provider = SentiNERELDocProvider(filename_by_id=filenames_by_ids,
                                             version=sentinerel_version)

    # Train and test annotators are configured identically,
    # yet these are two separate instances.
    nolabel_annot_args = dict(terms_per_context=terms_per_context,
                              dist_in_sents=dist_in_sentences,
                              no_label=no_label)
    train_neut_annot = create_nolabel_text_opinion_annotator(**nolabel_annot_args)
    test_neut_annot = create_nolabel_text_opinion_annotator(**nolabel_annot_args)

    # Built-in filters go first, the custom ones are appended afterwards.
    text_opinion_filters = [ProfessionAsCharacteristicSentimentTextOpinionFilter(),
                            DistanceLimitedTextOpinionFilter(terms_per_context)]
    if custom_text_opinion_filters is not None:
        text_opinion_filters.extend(custom_text_opinion_filters)

    predefined_annot = PredefinedTextOpinionAnnotator(doc_provider, label_formatter)

    # Arguments that every pipeline below has in common.
    common_args = dict(text_parser=text_parser,
                       doc_provider=doc_provider,
                       text_opinion_filters=text_opinion_filters)

    pipelines = {
        DataType.Train: create_main_pipeline(
            annotators=[predefined_annot, train_neut_annot], **common_args),
        DataType.Test: create_main_pipeline(
            annotators=[test_neut_annot], **common_args),
        DataType.Etalon: create_etalon_pipeline(
            predefined_annot=predefined_annot, **common_args),
        DataType.Dev: create_etalon_with_no_label_pipeline(
            annotators=[predefined_annot, train_neut_annot], **common_args),
    }

    # Without the default data-folding there is nothing extra to provide.
    if data_folding is None:
        return pipelines

    # The default data-folding has been set up, and it is needed further on.
    return pipelines, data_folding
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def create_nolabel_text_opinion_annotator(terms_per_context, no_label, dist_in_sents=0, synonyms=None):
    """ Create the core annotator, which provides all the entity pairs
        that could be revealed from the document.

        Parameters:
            terms_per_context: int
                Amount of terms that we consider in between the Object and Subject.
            no_label:
                Label wrapped into the constant label provider of the annotation algorithm.
            dist_in_sents: int
                Distance in sentences in between the objects.
            synonyms: SynonymsCollection or None
                In case of None, a stemmer based (Mystem), non read-only
                collection is created.

        Returns:
            AlgorithmBasedTextOpinionAnnotator
    """
    assert(isinstance(terms_per_context, int))
    assert(isinstance(synonyms, SynonymsCollection) or synonyms is None)
    assert(isinstance(dist_in_sents, int))

    if synonyms is None:
        synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)

    def value_to_group_id(value):
        # Values that are missed in synonyms become registered on the fly.
        return SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
            synonyms=synonyms, value=value)

    def entity_index(brat_entity):
        return brat_entity.ID

    def create_empty_collection():
        return OpinionCollection(synonyms=synonyms,
                                 error_on_duplicates=True,
                                 error_on_synonym_end_missed=False)

    pair_based_algo = PairBasedOpinionAnnotationAlgorithm(
        dist_in_sents=dist_in_sents,
        dist_in_terms_bound=terms_per_context,
        label_provider=ConstantLabelProvider(no_label),
        entity_index_func=entity_index)

    return AlgorithmBasedTextOpinionAnnotator(
        value_to_group_id_func=value_to_group_id,
        annot_algo=pair_based_algo,
        create_empty_collection_func=create_empty_collection)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def create_main_pipeline(text_parser, doc_provider, annotators, text_opinion_filters):
    """ Assemble the text opinion extraction pipeline for the given annotators.

        For the train data the annotators are expected to be the predefined
        annotations along with the automatic annotation of other pairs
        with a NoLabel.

        Parameters:
            text_parser:
                Forwarded as is to the pipeline.
            doc_provider:
                Its `by_id` method is utilized for obtaining documents by their ids.
            annotators: list
                Annotators to be applied within the pipeline.
            text_opinion_filters: list
                Filters to be applied within the pipeline.
    """
    get_doc_by_id = doc_provider.by_id

    def entity_index(brat_entity):
        # Entities are indexed by their identifiers.
        return brat_entity.ID

    return text_opinion_extraction_pipeline(get_doc_by_id_func=get_doc_by_id,
                                            text_parser=text_parser,
                                            annotators=annotators,
                                            entity_index_func=entity_index,
                                            text_opinion_filters=text_opinion_filters)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def create_etalon_pipeline(text_parser, doc_provider, predefined_annot, text_opinion_filters):
    """ Exactly the main pipeline, restricted to the predefined annotation:
        no "NoLabel" annotation is performed here
        (we are interested only in sentiment attitudes).
    """
    # The predefined annotator is the only one involved.
    etalon_annotators = [predefined_annot]

    return create_main_pipeline(
        text_parser=text_parser,
        doc_provider=doc_provider,
        annotators=etalon_annotators,
        text_opinion_filters=text_opinion_filters)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def create_etalon_with_no_label_pipeline(annotators, text_parser, doc_provider, text_opinion_filters):
    """ Plain delegation to the main pipeline: all the arguments
        are forwarded without changes (the same pipeline as for training data).
    """
    main_pipeline_args = dict(text_parser=text_parser,
                              doc_provider=doc_provider,
                              annotators=annotators,
                              text_opinion_filters=text_opinion_filters)

    return create_main_pipeline(**main_pipeline_args)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from arekit.common.labels.base import NoLabel
|
|
2
|
+
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
3
|
+
from arekit.contrib.source.sentinerel import labels
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SentiNERELAnyLabelFormatter(StringLabelsFormatter):
    """ String formatter that maps the relation type names listed below
        onto the related label types of the `labels` module.
    """

    def __init__(self):
        # Pairs are listed in the order in which they appear in the mapping.
        name_to_label = [
            ("OPINION_BELONGS_TO", labels.OpinionBelongsTo),
            ("OPINION_RELATES_TO", labels.OpinionRelatesTo),
            ("NEG_EFFECT_FROM", labels.NegEffectFrom),
            ("POS_EFFECT_FROM", labels.PosEffectFrom),
            ("NEG_STATE_FROM", labels.NegStateFrom),
            ("POS_STATE_FROM", labels.PosStateFrom),
            ("NEGATIVE_TO", labels.NegativeTo),
            ("POSITIVE_TO", labels.PositiveTo),
            ("STATE_BELONGS_TO", labels.StateBelongsTo),
            ("POS_AUTHOR_FROM", labels.PosAuthorFrom),
            ("NEG_AUTHOR_FROM", labels.NegAuthorFrom),
            ("ALTERNATIVE_NAME", labels.AlternativeName),
            ("ORIGINS_FROM", labels.OriginsFrom),
        ]

        super(SentiNERELAnyLabelFormatter, self).__init__(stol=dict(name_to_label))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class SentiNERELSentimentLabelFormatter(StringLabelsFormatter):
    """ String formatter limited to the two sentiment relation types:
        "NEGATIVE_TO" and "POSITIVE_TO".
    """

    def __init__(self):
        name_to_label = [
            ("NEGATIVE_TO", labels.NegativeTo),
            ("POSITIVE_TO", labels.PositiveTo),
        ]

        super(SentiNERELSentimentLabelFormatter, self).__init__(stol=dict(name_to_label))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class SentiNERELPosNegNeuRelationsLabelFormatter(StringLabelsFormatter):
    """ String formatter for the sentiment relation types ("NEGATIVE_TO",
        "POSITIVE_TO") extended with "NEUTRAL", which is mapped onto NoLabel.
    """

    def __init__(self):
        name_to_label = [
            ("NEUTRAL", NoLabel),
            ("NEGATIVE_TO", labels.NegativeTo),
            ("POSITIVE_TO", labels.PositiveTo),
        ]

        super(SentiNERELPosNegNeuRelationsLabelFormatter, self).__init__(stol=dict(name_to_label))
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from arekit.common.docs.parsed.providers.text_opinion_pairs import TextOpinionPairsProvider
|
|
2
|
+
from arekit.common.docs.parsed.service import ParsedDocumentService
|
|
3
|
+
from arekit.common.opinions.annot.algo_based import AlgorithmBasedOpinionAnnotator
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class AlgorithmBasedTextOpinionAnnotator(AlgorithmBasedOpinionAnnotator):
    """ Extension of the AlgorithmBasedOpinionAnnotator which converts
        every annotated opinion into TextOpinions.
    """

    def __init__(self, value_to_group_id_func, annot_algo, create_empty_collection_func,
                 get_doc_existed_opinions_func=None):
        """ value_to_group_id_func: func
                callable that is passed to the TextOpinionPairsProvider.

            get_doc_existed_opinions_func: func or None
                function that provides existed opinions for a document;
                if None, then we consider an absence of the existed document-level opinions.
        """
        assert(callable(value_to_group_id_func))

        base = super(AlgorithmBasedTextOpinionAnnotator, self)
        base.__init__(annot_algo=annot_algo,
                      create_empty_collection_func=create_empty_collection_func,
                      get_doc_existed_opinions_func=get_doc_existed_opinions_func)

        self.__value_to_group_id_func = value_to_group_id_func

    def __create_service(self, parsed_doc):
        # The service keeps a single provider: the one of the text opinion pairs.
        pairs_provider = TextOpinionPairsProvider(self.__value_to_group_id_func)
        return ParsedDocumentService(parsed_doc=parsed_doc, providers=[pairs_provider])

    def annotate_collection(self, parsed_doc):
        """ Iterates text opinions that correspond to the opinions
            annotated by the base class for the `parsed_doc`.
        """
        pairs_provider = self.__create_service(parsed_doc).get_provider(TextOpinionPairsProvider.NAME)
        opinions = super(AlgorithmBasedTextOpinionAnnotator, self).annotate_collection(parsed_doc)

        for opinion in opinions:
            yield from pairs_provider.iter_from_opinion(opinion)
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
+
from arekit.common.labels.str_fmt import StringLabelsFormatter
|
|
3
|
+
from arekit.common.docs.parsed.base import ParsedDocument
|
|
4
|
+
from arekit.common.docs.parsed.providers.base import BaseParsedDocumentServiceProvider
|
|
5
|
+
from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider
|
|
6
|
+
from arekit.common.docs.parsed.service import ParsedDocumentService
|
|
7
|
+
from arekit.common.opinions.annot.base import BaseOpinionAnnotator
|
|
8
|
+
from arekit.contrib.source.brat.doc import BratDocument
|
|
9
|
+
from arekit.contrib.source.brat.opinions.converter import BratRelationConverter
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class PredefinedTextOpinionAnnotator(BaseOpinionAnnotator):
    """ Brat-based text-opinion annotator (converter).
        Converts the pre-annotated Relations of BRAT-documents into TextOpinions.
    """

    def __init__(self, doc_provider, label_formatter, keep_any_type=False, entity_index_func=None):
        """
            doc_provider: DocumentProvider
                source of the documents, requested via by_id(doc_id).

            label_formatter: String Labels Formatter
                required for conversion.

            keep_any_type: bool
                when True, relations are converted regardless of whether
                their type is supported by the label formatter;
                otherwise only the supported ones are considered.

            entity_index_func: func(entity) -> ID, or None
                a way of how we provide an external entity ID;
                when None, the `ID` of an entity is used.
        """
        assert(isinstance(doc_provider, DocumentProvider))
        assert(isinstance(label_formatter, StringLabelsFormatter))
        assert(callable(entity_index_func) or entity_index_func is None)
        super(PredefinedTextOpinionAnnotator, self).__init__()

        def default_entity_index(brat_entity):
            return brat_entity.ID

        self.__doc_provider = doc_provider
        self.__label_formatter = label_formatter
        self.__keep_any_type = keep_any_type
        self.__entity_index_func = entity_index_func
        if entity_index_func is None:
            self.__entity_index_func = default_entity_index

    @staticmethod
    def __convert_entity_id(doc, origin_entity_id, esp):
        """ Provides the in-document ID of the entity referred by `origin_entity_id`,
            or None when the entity is missed in the document or in the `esp` provider.
        """
        assert(isinstance(doc, BratDocument))
        assert(isinstance(origin_entity_id, int))
        assert(isinstance(esp, BaseParsedDocumentServiceProvider))

        # Due to the complexity of entities, some of them might be nested,
        # and therefore some entities might be discarded.
        if doc.contains_entity(origin_entity_id):
            origin_entity = doc.get_entity_by_id(origin_entity_id)
            if esp.contains_entity(origin_entity):
                return esp.get_document_entity(origin_entity).IdInDocument

        return None

    def _annot_collection_core(self, parsed_doc):
        """ Iterates text opinions converted from the relations of the
            document related to the `parsed_doc`.
        """
        assert(isinstance(parsed_doc, ParsedDocument))

        entity_provider = EntityServiceProvider(self.__entity_index_func)
        pns = ParsedDocumentService(parsed_doc=parsed_doc, providers=[entity_provider])
        esp = pns.get_provider(EntityServiceProvider.NAME)
        doc = self.__doc_provider.by_id(parsed_doc.RelatedDocID)

        def to_document_entity_id(origin_id):
            return PredefinedTextOpinionAnnotator.__convert_entity_id(
                doc=doc, origin_entity_id=origin_id, esp=esp)

        for brat_relation in doc.Relations:

            is_supported = self.__label_formatter.supports_value(brat_relation.Type)
            if not (is_supported or self.__keep_any_type):
                continue

            text_opinion = BratRelationConverter.to_text_opinion(
                brat_relation=brat_relation,
                doc_id=parsed_doc.RelatedDocID,
                label_formatter=self.__label_formatter)

            internal_opinion = text_opinion.try_convert(
                other=text_opinion,
                convert_entity_id_func=to_document_entity_id)

            # Conversion results in None are not a part of the output.
            if internal_opinion is not None:
                yield internal_opinion
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from arekit.common.linkage.meta import MetaEmptyLinkedDataWrapper
|
|
2
|
+
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
|
|
3
|
+
from arekit.common.docs.parsed.base import ParsedDocument
|
|
4
|
+
from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider
|
|
5
|
+
from arekit.common.docs.parsed.service import ParsedDocumentService
|
|
6
|
+
from arekit.common.docs.parser import DocumentParser
|
|
7
|
+
from arekit.common.pipeline.base import BasePipeline
|
|
8
|
+
from arekit.common.pipeline.items.flatten import FlattenIterPipelineItem
|
|
9
|
+
from arekit.common.pipeline.items.map import MapPipelineItem
|
|
10
|
+
from arekit.common.pipeline.items.map_nested import MapNestedPipelineItem
|
|
11
|
+
from arekit.common.text.parser import BaseTextParser
|
|
12
|
+
from arekit.common.text_opinions.base import TextOpinion
|
|
13
|
+
from arekit.contrib.utils.pipelines.text_opinion.filters.base import TextOpinionFilter
|
|
14
|
+
from arekit.contrib.utils.pipelines.text_opinion.filters.limitation import FrameworkLimitationsTextOpinionFilter
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def __iter_text_opinion_linkages(parsed_doc, annotators, entity_index_func,
                                 text_opinion_filters, use_meta):
    """ Iterates linkages, each wrapping a single text opinion provided by
        `annotators` for the `parsed_doc`, which passes all the
        `text_opinion_filters` and whose (source, target) pair was not met before.

        use_meta: bool
            this is mainly for tqdm and other console parameters to stay up-to-date
            with the state in the case we do not have that much output results
            across multiple amount of documents.
    """
    assert(isinstance(annotators, list))
    assert(isinstance(parsed_doc, ParsedDocument))
    assert(isinstance(text_opinion_filters, list))
    assert(isinstance(use_meta, bool))

    service = ParsedDocumentService(parsed_doc=parsed_doc, providers=[EntityServiceProvider(entity_index_func)])
    esp = service.get_provider(EntityServiceProvider.NAME)

    def __passes_filters(text_opinion):
        # Filters are checked one by one up to the first rejection.
        for opinion_filter in text_opinion_filters:
            assert(isinstance(opinion_filter, TextOpinionFilter))
            if not opinion_filter.filter(text_opinion=text_opinion, parsed_doc=parsed_doc,
                                         entity_service_provider=esp):
                return False
        return True

    seen_pair_ids = set()

    for annotator in annotators:
        for text_opinion in annotator.annotate_collection(parsed_doc=parsed_doc):
            assert(isinstance(text_opinion, TextOpinion))

            if not __passes_filters(text_opinion):
                continue

            pair_id = "{}_{}".format(text_opinion.SourceId, text_opinion.TargetId)

            # We reject the pairs which were already obtained earlier,
            # so the first occurrence of a pair is the one that is kept.
            if pair_id in seen_pair_ids:
                continue
            seen_pair_ids.add(pair_id)

            linkage = TextOpinionsLinkage([text_opinion])
            linkage.set_tag(service)
            yield linkage

    # This is the case to consider the end of the document.
    if use_meta:
        yield MetaEmptyLinkedDataWrapper(doc_id=parsed_doc.RelatedDocID)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def text_opinion_extraction_pipeline(text_parser, get_doc_by_id_func, annotators, entity_index_func,
                                     text_opinion_filters=None, use_meta_between_docs=True):
    """ Composes the pipeline: (doc_id) -> (doc) -> (parsed_doc) -> linkages.

        text_opinion_filters: list or None
            extra filters, applied after the FrameworkLimitationsTextOpinionFilter
            which is always the first one.

        use_meta_between_docs: bool
            passed as `use_meta` into the iterator of text opinion linkages.
    """
    assert(isinstance(text_parser, BaseTextParser))
    assert(callable(get_doc_by_id_func))
    assert(isinstance(annotators, list))
    assert(isinstance(text_opinion_filters, list) or text_opinion_filters is None)
    assert(isinstance(use_meta_between_docs, bool))

    # A new list, so the one passed by the caller remains untouched.
    actual_text_opinion_filters = [FrameworkLimitationsTextOpinionFilter()]
    if text_opinion_filters is not None:
        actual_text_opinion_filters.extend(text_opinion_filters)

    def fetch_doc(doc_id):
        return get_doc_by_id_func(doc_id)

    def parse_doc(doc, ppl_ctx):
        return DocumentParser.parse(doc=doc, text_parser=text_parser, parent_ppl_ctx=ppl_ctx)

    def iter_linkages(parsed_doc):
        return __iter_text_opinion_linkages(
            annotators=annotators, parsed_doc=parsed_doc, entity_index_func=entity_index_func,
            text_opinion_filters=actual_text_opinion_filters, use_meta=use_meta_between_docs)

    stages = [
        # (doc_id) -> (doc)
        MapPipelineItem(map_func=fetch_doc),

        # (doc, ppl_ctx) -> (parsed_doc)
        MapNestedPipelineItem(map_func=parse_doc),

        # (parsed_doc) -> (text_opinions)
        MapPipelineItem(map_func=iter_linkages),

        # linkages[] -> linkages
        FlattenIterPipelineItem(),
    ]

    return BasePipeline(stages)
|
|
File without changes
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from arekit.common.data.input.sample import InputSampleBase
|
|
2
|
+
from arekit.contrib.utils.pipelines.text_opinion.filters.base import TextOpinionFilter
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DistanceLimitedTextOpinionFilter(TextOpinionFilter):
    """ Filter whose decision is the result of
        InputSampleBase.check_ability_to_create_sample
        for the window of `terms_per_context` size.
    """

    def __init__(self, terms_per_context):
        super(DistanceLimitedTextOpinionFilter, self).__init__()
        self.__terms_per_context = terms_per_context

    def filter(self, text_opinion, parsed_doc, entity_service_provider):
        # Note: `parsed_doc` is not involved in the check.
        check_args = dict(entity_service=entity_service_provider,
                          text_opinion=text_opinion,
                          window_size=self.__terms_per_context)
        return InputSampleBase.check_ability_to_create_sample(**check_args)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from arekit.common.entities.types import OpinionEntityType
|
|
2
|
+
from arekit.common.docs.parsed.base import ParsedDocument
|
|
3
|
+
from arekit.common.text_opinions.base import TextOpinion
|
|
4
|
+
|
|
5
|
+
from arekit.contrib.utils.entities.filter import EntityFilter
|
|
6
|
+
from arekit.contrib.utils.pipelines.text_opinion.filters.base import TextOpinionFilter
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class EntityBasedTextOpinionFilter(TextOpinionFilter):
    """ Rejects text opinions whose source (as a Subject) or target
        (as an Object) entity is ignored by the entity filter.
        Without an entity filter, every text opinion is accepted.
    """

    def __init__(self, entity_filter):
        super(EntityBasedTextOpinionFilter, self).__init__()
        assert(isinstance(entity_filter, EntityFilter) or entity_filter is None)
        self.__entity_filter = entity_filter

    def filter(self, text_opinion, parsed_doc, entity_service_provider):
        assert(isinstance(text_opinion, TextOpinion))
        assert(isinstance(parsed_doc, ParsedDocument))

        entity_filter = self.__entity_filter
        if entity_filter is None:
            return True

        # NOTE(review): entities are taken from the `_doc_entities` attribute of the provider.
        source = entity_service_provider._doc_entities[text_opinion.SourceId]
        if entity_filter.is_ignored(source, OpinionEntityType.Subject):
            return False

        # The target is considered only when the source was not ignored.
        target = entity_service_provider._doc_entities[text_opinion.TargetId]
        return not entity_filter.is_ignored(target, OpinionEntityType.Object)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from arekit.common.docs.parsed.base import ParsedDocument
|
|
2
|
+
from arekit.common.docs.parsed.term_position import TermPositionTypes
|
|
3
|
+
from arekit.common.text_opinions.base import TextOpinion
|
|
4
|
+
from arekit.contrib.utils.pipelines.text_opinion.filters.base import TextOpinionFilter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FrameworkLimitationsTextOpinionFilter(TextOpinionFilter):
    """ Note: this is an internal class, there is no need to
        adopt it from the outside of the AREkit.
        It is required to hide and provide the known limitations.
    """

    def filter(self, text_opinion, parsed_doc, entity_service_provider):
        """ Accepts only those text opinions whose source and target
            entities have the same sentence index.
        """
        assert(isinstance(text_opinion, TextOpinion))
        assert(isinstance(parsed_doc, ParsedDocument))

        def sentence_index(entity_id):
            return entity_service_provider.get_entity_position(
                entity_id, position_type=TermPositionTypes.SentenceIndex)

        # AREkit does not provide a support for multi-sentence opinions at present.
        is_multi_sentence = sentence_index(text_opinion.SourceId) != sentence_index(text_opinion.TargetId)
        return not is_multi_sentence
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from enum import IntEnum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class PartOfSpeechType(IntEnum):
    """ Integer-valued enumeration of the part-of-speech tags.

        Values are consecutive, starting from 1; the last two members,
        `Unknown` and `Empty`, are the special ones.
    """

    # Part-of-speech tags.
    NOUN = 1
    ADV = 2
    ADVPRO = 3
    ANUM = 4
    APRO = 5
    COM = 6
    CONJ = 7
    INTJ = 8
    NUM = 9
    PART = 10
    PR = 11
    ADJ = 12
    SPRO = 13
    VERB = 14

    # Special value; presumably a tag that could not be recognized -- TODO confirm.
    Unknown = 15

    # Special value; presumably an absence of a tag -- TODO confirm.
    Empty = 16
|
|
File without changes
|