arekit 0.24.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit-0.24.0/LICENSE +21 -0
- arekit-0.24.0/PKG-INFO +19 -0
- arekit-0.24.0/README.md +56 -0
- arekit-0.24.0/arekit/__init__.py +0 -0
- arekit-0.24.0/arekit/common/__init__.py +0 -0
- arekit-0.24.0/arekit/common/bound.py +48 -0
- arekit-0.24.0/arekit/common/context/__init__.py +0 -0
- arekit-0.24.0/arekit/common/context/terms_mapper.py +51 -0
- arekit-0.24.0/arekit/common/context/token.py +16 -0
- arekit-0.24.0/arekit/common/data/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/const.py +21 -0
- arekit-0.24.0/arekit/common/data/doc_provider.py +6 -0
- arekit-0.24.0/arekit/common/data/input/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/input/providers/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/input/providers/columns/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/input/providers/columns/base.py +9 -0
- arekit-0.24.0/arekit/common/data/input/providers/columns/sample.py +59 -0
- arekit-0.24.0/arekit/common/data/input/providers/const.py +3 -0
- arekit-0.24.0/arekit/common/data/input/providers/contents.py +9 -0
- arekit-0.24.0/arekit/common/data/input/providers/instances/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/input/providers/instances/base.py +14 -0
- arekit-0.24.0/arekit/common/data/input/providers/instances/multiple.py +27 -0
- arekit-0.24.0/arekit/common/data/input/providers/instances/single.py +8 -0
- arekit-0.24.0/arekit/common/data/input/providers/label/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/input/providers/label/base.py +24 -0
- arekit-0.24.0/arekit/common/data/input/providers/label/binary.py +11 -0
- arekit-0.24.0/arekit/common/data/input/providers/label/multiple.py +15 -0
- arekit-0.24.0/arekit/common/data/input/providers/rows/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/input/providers/rows/base.py +64 -0
- arekit-0.24.0/arekit/common/data/input/providers/rows/samples.py +227 -0
- arekit-0.24.0/arekit/common/data/input/providers/sample/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/input/providers/sample/cropped.py +43 -0
- arekit-0.24.0/arekit/common/data/input/providers/text/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/input/providers/text/single.py +49 -0
- arekit-0.24.0/arekit/common/data/input/repositories/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/input/repositories/base.py +68 -0
- arekit-0.24.0/arekit/common/data/input/repositories/sample.py +22 -0
- arekit-0.24.0/arekit/common/data/input/sample.py +66 -0
- arekit-0.24.0/arekit/common/data/input/terms_mapper.py +88 -0
- arekit-0.24.0/arekit/common/data/rows_fmt.py +82 -0
- arekit-0.24.0/arekit/common/data/rows_parser.py +43 -0
- arekit-0.24.0/arekit/common/data/storages/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/storages/base.py +109 -0
- arekit-0.24.0/arekit/common/data/views/__init__.py +0 -0
- arekit-0.24.0/arekit/common/data/views/samples.py +26 -0
- arekit-0.24.0/arekit/common/docs/__init__.py +0 -0
- arekit-0.24.0/arekit/common/docs/base.py +30 -0
- arekit-0.24.0/arekit/common/docs/entities_grouping.py +16 -0
- arekit-0.24.0/arekit/common/docs/entity.py +18 -0
- arekit-0.24.0/arekit/common/docs/objects_parser.py +37 -0
- arekit-0.24.0/arekit/common/docs/parsed/__init__.py +0 -0
- arekit-0.24.0/arekit/common/docs/parsed/base.py +101 -0
- arekit-0.24.0/arekit/common/docs/parsed/providers/__init__.py +0 -0
- arekit-0.24.0/arekit/common/docs/parsed/providers/base.py +68 -0
- arekit-0.24.0/arekit/common/docs/parsed/providers/base_pairs.py +51 -0
- arekit-0.24.0/arekit/common/docs/parsed/providers/entity_service.py +175 -0
- arekit-0.24.0/arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
- arekit-0.24.0/arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
- arekit-0.24.0/arekit/common/docs/parsed/service.py +31 -0
- arekit-0.24.0/arekit/common/docs/parsed/term_position.py +42 -0
- arekit-0.24.0/arekit/common/docs/parser.py +34 -0
- arekit-0.24.0/arekit/common/docs/sentence.py +14 -0
- arekit-0.24.0/arekit/common/entities/__init__.py +0 -0
- arekit-0.24.0/arekit/common/entities/base.py +51 -0
- arekit-0.24.0/arekit/common/entities/collection.py +72 -0
- arekit-0.24.0/arekit/common/entities/str_fmt.py +8 -0
- arekit-0.24.0/arekit/common/entities/types.py +9 -0
- arekit-0.24.0/arekit/common/experiment/__init__.py +0 -0
- arekit-0.24.0/arekit/common/experiment/api/__init__.py +0 -0
- arekit-0.24.0/arekit/common/experiment/api/base_samples_io.py +20 -0
- arekit-0.24.0/arekit/common/experiment/data_type.py +17 -0
- arekit-0.24.0/arekit/common/frames/__init__.py +0 -0
- arekit-0.24.0/arekit/common/frames/connotations/__init__.py +0 -0
- arekit-0.24.0/arekit/common/frames/connotations/descriptor.py +17 -0
- arekit-0.24.0/arekit/common/frames/connotations/provider.py +4 -0
- arekit-0.24.0/arekit/common/frames/text_variant.py +43 -0
- arekit-0.24.0/arekit/common/frames/variants/__init__.py +0 -0
- arekit-0.24.0/arekit/common/frames/variants/base.py +21 -0
- arekit-0.24.0/arekit/common/frames/variants/collection.py +60 -0
- arekit-0.24.0/arekit/common/labels/__init__.py +0 -0
- arekit-0.24.0/arekit/common/labels/base.py +19 -0
- arekit-0.24.0/arekit/common/labels/provider/__init__.py +0 -0
- arekit-0.24.0/arekit/common/labels/provider/base.py +7 -0
- arekit-0.24.0/arekit/common/labels/provider/constant.py +14 -0
- arekit-0.24.0/arekit/common/labels/scaler/__init__.py +0 -0
- arekit-0.24.0/arekit/common/labels/scaler/base.py +85 -0
- arekit-0.24.0/arekit/common/labels/scaler/sentiment.py +7 -0
- arekit-0.24.0/arekit/common/labels/scaler/single.py +10 -0
- arekit-0.24.0/arekit/common/labels/str_fmt.py +55 -0
- arekit-0.24.0/arekit/common/linkage/__init__.py +0 -0
- arekit-0.24.0/arekit/common/linkage/base.py +44 -0
- arekit-0.24.0/arekit/common/linkage/meta.py +23 -0
- arekit-0.24.0/arekit/common/linkage/opinions.py +9 -0
- arekit-0.24.0/arekit/common/linkage/text_opinions.py +22 -0
- arekit-0.24.0/arekit/common/log_utils.py +29 -0
- arekit-0.24.0/arekit/common/model/__init__.py +0 -0
- arekit-0.24.0/arekit/common/model/labeling/__init__.py +0 -0
- arekit-0.24.0/arekit/common/model/labeling/base.py +24 -0
- arekit-0.24.0/arekit/common/model/labeling/modes.py +8 -0
- arekit-0.24.0/arekit/common/model/labeling/single.py +24 -0
- arekit-0.24.0/arekit/common/opinions/__init__.py +0 -0
- arekit-0.24.0/arekit/common/opinions/annot/__init__.py +0 -0
- arekit-0.24.0/arekit/common/opinions/annot/algo/__init__.py +0 -0
- arekit-0.24.0/arekit/common/opinions/annot/algo/base.py +4 -0
- arekit-0.24.0/arekit/common/opinions/annot/algo/pair_based.py +99 -0
- arekit-0.24.0/arekit/common/opinions/annot/algo/predefined.py +16 -0
- arekit-0.24.0/arekit/common/opinions/annot/algo_based.py +55 -0
- arekit-0.24.0/arekit/common/opinions/annot/base.py +15 -0
- arekit-0.24.0/arekit/common/opinions/base.py +74 -0
- arekit-0.24.0/arekit/common/opinions/collection.py +150 -0
- arekit-0.24.0/arekit/common/opinions/enums.py +6 -0
- arekit-0.24.0/arekit/common/opinions/provider.py +4 -0
- arekit-0.24.0/arekit/common/opinions/writer.py +4 -0
- arekit-0.24.0/arekit/common/pipeline/__init__.py +0 -0
- arekit-0.24.0/arekit/common/pipeline/base.py +25 -0
- arekit-0.24.0/arekit/common/pipeline/context.py +36 -0
- arekit-0.24.0/arekit/common/pipeline/conts.py +2 -0
- arekit-0.24.0/arekit/common/pipeline/items/__init__.py +0 -0
- arekit-0.24.0/arekit/common/pipeline/items/base.py +12 -0
- arekit-0.24.0/arekit/common/pipeline/items/flatten.py +14 -0
- arekit-0.24.0/arekit/common/pipeline/items/handle.py +17 -0
- arekit-0.24.0/arekit/common/pipeline/items/iter.py +11 -0
- arekit-0.24.0/arekit/common/pipeline/items/map.py +11 -0
- arekit-0.24.0/arekit/common/pipeline/items/map_nested.py +13 -0
- arekit-0.24.0/arekit/common/synonyms/__init__.py +0 -0
- arekit-0.24.0/arekit/common/synonyms/base.py +151 -0
- arekit-0.24.0/arekit/common/synonyms/grouping.py +21 -0
- arekit-0.24.0/arekit/common/text/__init__.py +0 -0
- arekit-0.24.0/arekit/common/text/enums.py +12 -0
- arekit-0.24.0/arekit/common/text/parsed.py +42 -0
- arekit-0.24.0/arekit/common/text/parser.py +12 -0
- arekit-0.24.0/arekit/common/text/partitioning/__init__.py +0 -0
- arekit-0.24.0/arekit/common/text/partitioning/base.py +4 -0
- arekit-0.24.0/arekit/common/text/partitioning/str.py +36 -0
- arekit-0.24.0/arekit/common/text/partitioning/terms.py +35 -0
- arekit-0.24.0/arekit/common/text/stemmer.py +16 -0
- arekit-0.24.0/arekit/common/text_opinions/__init__.py +0 -0
- arekit-0.24.0/arekit/common/text_opinions/base.py +105 -0
- arekit-0.24.0/arekit/common/utils.py +129 -0
- arekit-0.24.0/arekit/contrib/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/bert/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/bert/input/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/bert/input/providers/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
- arekit-0.24.0/arekit/contrib/bert/input/providers/text_pair.py +62 -0
- arekit-0.24.0/arekit/contrib/bert/terms/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/bert/terms/mapper.py +20 -0
- arekit-0.24.0/arekit/contrib/networks/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/networks/embedding.py +149 -0
- arekit-0.24.0/arekit/contrib/networks/embedding_io.py +18 -0
- arekit-0.24.0/arekit/contrib/networks/input/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/networks/input/const.py +6 -0
- arekit-0.24.0/arekit/contrib/networks/input/ctx_serialization.py +28 -0
- arekit-0.24.0/arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/networks/input/embedding/matrix.py +29 -0
- arekit-0.24.0/arekit/contrib/networks/input/embedding/offsets.py +55 -0
- arekit-0.24.0/arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
- arekit-0.24.0/arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/networks/input/providers/sample.py +129 -0
- arekit-0.24.0/arekit/contrib/networks/input/providers/term_connotation.py +23 -0
- arekit-0.24.0/arekit/contrib/networks/input/providers/text.py +24 -0
- arekit-0.24.0/arekit/contrib/networks/input/rows_parser.py +47 -0
- arekit-0.24.0/arekit/contrib/networks/input/term_types.py +13 -0
- arekit-0.24.0/arekit/contrib/networks/input/terms_mapping.py +60 -0
- arekit-0.24.0/arekit/contrib/networks/vectorizer.py +6 -0
- arekit-0.24.0/arekit/contrib/prompt/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/prompt/sample.py +61 -0
- arekit-0.24.0/arekit/contrib/source/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/brat/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/brat/annot.py +84 -0
- arekit-0.24.0/arekit/contrib/source/brat/doc.py +28 -0
- arekit-0.24.0/arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/brat/entities/compound.py +13 -0
- arekit-0.24.0/arekit/contrib/source/brat/entities/entity.py +42 -0
- arekit-0.24.0/arekit/contrib/source/brat/entities/parser.py +53 -0
- arekit-0.24.0/arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/brat/opinions/converter.py +19 -0
- arekit-0.24.0/arekit/contrib/source/brat/relation.py +32 -0
- arekit-0.24.0/arekit/contrib/source/brat/sentence.py +69 -0
- arekit-0.24.0/arekit/contrib/source/brat/sentences_reader.py +128 -0
- arekit-0.24.0/arekit/contrib/source/download.py +41 -0
- arekit-0.24.0/arekit/contrib/source/nerel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/nerel/entities.py +55 -0
- arekit-0.24.0/arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/nerel/folding/fixed.py +74 -0
- arekit-0.24.0/arekit/contrib/source/nerel/io_utils.py +62 -0
- arekit-0.24.0/arekit/contrib/source/nerel/labels.py +241 -0
- arekit-0.24.0/arekit/contrib/source/nerel/reader.py +46 -0
- arekit-0.24.0/arekit/contrib/source/nerel/utils.py +24 -0
- arekit-0.24.0/arekit/contrib/source/nerel/versions.py +12 -0
- arekit-0.24.0/arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/nerelbio/io_utils.py +62 -0
- arekit-0.24.0/arekit/contrib/source/nerelbio/labels.py +265 -0
- arekit-0.24.0/arekit/contrib/source/nerelbio/reader.py +8 -0
- arekit-0.24.0/arekit/contrib/source/nerelbio/versions.py +8 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/collection.py +36 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/doc.py +51 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/io_utils.py +56 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/reader.py +268 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/sentence.py +73 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/synonyms.py +17 -0
- arekit-0.24.0/arekit/contrib/source/ruattitudes/text_object.py +59 -0
- arekit-0.24.0/arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/rusentiframes/collection.py +157 -0
- arekit-0.24.0/arekit/contrib/source/rusentiframes/effect.py +24 -0
- arekit-0.24.0/arekit/contrib/source/rusentiframes/io_utils.py +19 -0
- arekit-0.24.0/arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
- arekit-0.24.0/arekit/contrib/source/rusentiframes/polarity.py +35 -0
- arekit-0.24.0/arekit/contrib/source/rusentiframes/role.py +15 -0
- arekit-0.24.0/arekit/contrib/source/rusentiframes/state.py +24 -0
- arekit-0.24.0/arekit/contrib/source/rusentiframes/types.py +42 -0
- arekit-0.24.0/arekit/contrib/source/rusentiframes/value.py +2 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/const.py +3 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/docs_reader.py +51 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/entities.py +26 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/io_utils.py +125 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/synonyms.py +17 -0
- arekit-0.24.0/arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/sentinerel/entities.py +52 -0
- arekit-0.24.0/arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/sentinerel/folding/factory.py +31 -0
- arekit-0.24.0/arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
- arekit-0.24.0/arekit/contrib/source/sentinerel/io_utils.py +87 -0
- arekit-0.24.0/arekit/contrib/source/sentinerel/labels.py +53 -0
- arekit-0.24.0/arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
- arekit-0.24.0/arekit/contrib/source/sentinerel/reader.py +42 -0
- arekit-0.24.0/arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/synonyms/utils.py +19 -0
- arekit-0.24.0/arekit/contrib/source/zip_utils.py +47 -0
- arekit-0.24.0/arekit/contrib/utils/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/bert/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/bert/samplers.py +17 -0
- arekit-0.24.0/arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
- arekit-0.24.0/arekit/contrib/utils/data/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/contents/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/contents/opinions.py +37 -0
- arekit-0.24.0/arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit-0.24.0/arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
- arekit-0.24.0/arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/readers/base.py +7 -0
- arekit-0.24.0/arekit/contrib/utils/data/readers/csv_pd.py +38 -0
- arekit-0.24.0/arekit/contrib/utils/data/readers/jsonl.py +15 -0
- arekit-0.24.0/arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/service/balance.py +50 -0
- arekit-0.24.0/arekit/contrib/utils/data/storages/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
- arekit-0.24.0/arekit/contrib/utils/data/storages/pandas_based.py +123 -0
- arekit-0.24.0/arekit/contrib/utils/data/storages/row_cache.py +48 -0
- arekit-0.24.0/arekit/contrib/utils/data/writers/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/writers/base.py +27 -0
- arekit-0.24.0/arekit/contrib/utils/data/writers/csv_native.py +63 -0
- arekit-0.24.0/arekit/contrib/utils/data/writers/csv_pd.py +40 -0
- arekit-0.24.0/arekit/contrib/utils/data/writers/json_opennre.py +132 -0
- arekit-0.24.0/arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
- arekit-0.24.0/arekit/contrib/utils/download.py +77 -0
- arekit-0.24.0/arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/embeddings/rusvectores.py +58 -0
- arekit-0.24.0/arekit/contrib/utils/embeddings/tokens.py +30 -0
- arekit-0.24.0/arekit/contrib/utils/entities/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/entities/filter.py +7 -0
- arekit-0.24.0/arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/entities/formatters/str_display.py +11 -0
- arekit-0.24.0/arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
- arekit-0.24.0/arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/io_utils/embedding.py +72 -0
- arekit-0.24.0/arekit/contrib/utils/io_utils/opinions.py +37 -0
- arekit-0.24.0/arekit/contrib/utils/io_utils/samples.py +79 -0
- arekit-0.24.0/arekit/contrib/utils/io_utils/utils.py +39 -0
- arekit-0.24.0/arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/lexicons/lexicon.py +41 -0
- arekit-0.24.0/arekit/contrib/utils/lexicons/relation.py +42 -0
- arekit-0.24.0/arekit/contrib/utils/lexicons/rusentilex.py +37 -0
- arekit-0.24.0/arekit/contrib/utils/nn/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/nn/rows.py +83 -0
- arekit-0.24.0/arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/np_utils/embedding.py +22 -0
- arekit-0.24.0/arekit/contrib/utils/np_utils/npz_utils.py +13 -0
- arekit-0.24.0/arekit/contrib/utils/np_utils/vocab.py +20 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
- arekit-0.24.0/arekit/contrib/utils/processing/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/mods.py +12 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/pos.py +23 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/number.py +23 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
- arekit-0.24.0/arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
- arekit-0.24.0/arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/pos/base.py +12 -0
- arekit-0.24.0/arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
- arekit-0.24.0/arekit/contrib/utils/processing/pos/russian.py +10 -0
- arekit-0.24.0/arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/text/tokens.py +127 -0
- arekit-0.24.0/arekit/contrib/utils/resources.py +25 -0
- arekit-0.24.0/arekit/contrib/utils/serializer.py +43 -0
- arekit-0.24.0/arekit/contrib/utils/sources/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
- arekit-0.24.0/arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/synonyms/simple.py +15 -0
- arekit-0.24.0/arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
- arekit-0.24.0/arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/vectorizers/bpe.py +93 -0
- arekit-0.24.0/arekit/contrib/utils/vectorizers/random_norm.py +39 -0
- arekit-0.24.0/arekit/download_data.py +11 -0
- arekit-0.24.0/arekit.egg-info/PKG-INFO +19 -0
- arekit-0.24.0/arekit.egg-info/SOURCES.txt +377 -0
- arekit-0.24.0/arekit.egg-info/dependency_links.txt +1 -0
- arekit-0.24.0/arekit.egg-info/requires.txt +4 -0
- arekit-0.24.0/arekit.egg-info/top_level.txt +1 -0
- arekit-0.24.0/setup.cfg +4 -0
- arekit-0.24.0/setup.py +36 -0
arekit-0.24.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2018 Nicolay Rusnachenko
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
arekit-0.24.0/PKG-INFO
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: arekit
|
|
3
|
+
Version: 0.24.0
|
|
4
|
+
Summary: Library devoted to Document level Attitude and Relation Extraction for text objects with entity-linking (EL) API support
|
|
5
|
+
Home-page: https://github.com/nicolay-r/AREkit
|
|
6
|
+
Author: Nicolay Rusnachenko
|
|
7
|
+
Author-email: rusnicolay@gmail.com
|
|
8
|
+
License: MIT License
|
|
9
|
+
Keywords: natural language processing,relation extraction,sentiment analysis
|
|
10
|
+
Platform: UNKNOWN
|
|
11
|
+
Classifier: Programming Language :: Python
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
13
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
15
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
|
|
18
|
+
UNKNOWN
|
|
19
|
+
|
arekit-0.24.0/README.md
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# AREkit 0.24.0
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<img src="logo.png"/>
|
|
7
|
+
</p>
|
|
8
|
+
|
|
9
|
+
**AREkit** (Attitude and Relation Extraction Toolkit) --
|
|
10
|
+
is a python toolkit, devoted to document level Attitude and Relation Extraction between text objects from mass-media news.
|
|
11
|
+
|
|
12
|
+
## Description
|
|
13
|
+
|
|
14
|
+
This toolkit aims to solve data preparation problems in Relation Extraction related tasks, considering such factors as:
|
|
15
|
+
* 🔗 EL (entity-linking) API support for objects,
|
|
16
|
+
* ➰ avoidance of cyclic connections,
|
|
17
|
+
* :straight_ruler: distance consideration between relation participants (in `terms` or `sentences`),
|
|
18
|
+
* 📑 relations annotations and filtering rules,
|
|
19
|
+
* *️⃣ entities formatting or masking, and more.
|
|
20
|
+
|
|
21
|
+
Using AREkit you may focus on preparing and experimenting with your ML models by shifting all the data-preparation work onto the toolset of this project for:
|
|
22
|
+
[neural-networks](https://github.com/nicolay-r/AREkit/wiki/Sampling-for-Neural-Network),
|
|
23
|
+
[language-models](https://github.com/nicolay-r/AREkit/wiki/Sampling-for-BERT),
|
|
24
|
+
[ChatGPT](https://github.com/nicolay-r/AREkit/wiki/Sampling-for-ChatGPT).
|
|
25
|
+
|
|
26
|
+
In order to do so, we provide:
|
|
27
|
+
* :file_folder: API for external [collection binding](https://github.com/nicolay-r/AREkit/wiki/Binding-a-Custom-Source) (native support of [BRAT](https://brat.nlplab.org/)-based exported annotations)
|
|
28
|
+
* ➿ [pipelines](https://github.com/nicolay-r/AREkit/wiki/Pipelines:-Text-Opinion-Annotation) and iterators for handling large-scale collections serialization without out-of-memory issues.
|
|
29
|
+
* evaluators which allow you to assess your trained model.
|
|
30
|
+
|
|
31
|
+
AREkit is very close to the open-source framework [SeqIO](https://github.com/google/seqio) proposed by [Google](https://github.com/google)
|
|
32
|
+
for data-preprocessing, evaluation, for sequence models.
|
|
33
|
+
While SeqIO is dedicated to conversion/pre-processing of datasets of any type,
|
|
34
|
+
this project proposes pipelines creation from the very raw or preannotated (BRAT-based) texts, including the solutions for problems mentioned above.
|
|
35
|
+
|
|
36
|
+
The core functionality includes
|
|
37
|
+
(1) API for document presentation with EL (Entity Linking, i.e. Object Synonymy) support
|
|
38
|
+
for sentence level relations preparation (dubbed as contexts)
|
|
39
|
+
(2) API for contexts extraction
|
|
40
|
+
(3) relations transferring from sentence-level onto document-level, and more.
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
1. Install required dependencies
|
|
45
|
+
```bash
|
|
46
|
+
pip install git+https://github.com/nicolay-r/AREkit.git@0.24.0-rc
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
2. Download Resources
|
|
50
|
+
```bash
|
|
51
|
+
python -m arekit.download_data
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Usage
|
|
55
|
+
Please follow the wiki page
|
|
56
|
+
[Tutorials List](https://github.com/nicolay-r/AREkit/wiki/Tutorials).
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
class Bound:
    """ A span described by a start position and a length.

        The span covers positions [Position, Position + Length), i.e. the end
        is treated as exclusive by the methods of this class.
    """

    def __init__(self, pos, length):
        """ pos: int
                start position of the span.
            length: int
                amount of positions covered by the span.
        """
        assert(isinstance(pos, int))
        assert(isinstance(length, int))
        self.__pos = pos
        self.__length = length

    # region properties

    @property
    def Position(self):
        """ Start position of the span. """
        return self.__pos

    @property
    def Length(self):
        """ Amount of positions covered by the span. """
        return self.__length

    # endregion

    def itersects_with(self, other):
        """ Checks whether this span overlaps with the `other` one.

            NOTE: the method name is misspelled; it is kept for backward
            compatibility. Prefer `intersects_with` in new code.

            other: Bound (any object exposing `Position` and `Length`)
            returns: bool
        """
        begin = self.__pos
        end = self.__pos + self.__length
        other_begin = other.Position
        # Last position that belongs to `other` (inclusive end).
        other_last = other.Position + other.Length - 1

        # `other` starts inside of this span.
        if begin <= other_begin < end:
            return True

        # `other` ends inside of this span.
        if begin <= other_last < end:
            return True

        # `other` starts before this span and reaches (at least) its end,
        # i.e. this span is entirely covered by `other`.
        return other_begin < begin and end <= other_last

    def intersects_with(self, other):
        """ Correctly spelled alias of `itersects_with`.

            Delegates to `itersects_with`, so subclasses that override the
            original method keep a consistent behavior under both names.
        """
        return self.itersects_with(other)

    def intersect(self, other):
        """ Returns a new Bound that covers both this span and `other`.

            NOTE: despite the name, the result is the smallest span that
            contains both operands (from the minimal begin up to the maximal
            end), rather than their common part.

            other: Bound (any object exposing `Position` and `Length`)
            returns: Bound
        """
        begin = self.__pos
        end = self.__pos + self.__length
        other_begin = other.Position
        other_end = other.Position + other.Length
        actual_begin = min(begin, other_begin)
        actual_length = max(end, other_end) - actual_begin
        return Bound(pos=actual_begin, length=actual_length)

    def contains(self, other):
        """ Checks whether `other` lies entirely within this span
            (coinciding borders are allowed).

            other: Bound (any object exposing `Position` and `Length`)
            returns: bool
        """
        begin = self.__pos
        end = self.__pos + self.__length
        other_begin = other.Position
        other_end = other.Position + other.Length
        return begin <= other_begin and end >= other_end
|
|
File without changes
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
|
|
3
|
+
from arekit.common.context.token import Token
|
|
4
|
+
from arekit.common.entities.base import Entity
|
|
5
|
+
from arekit.common.frames.text_variant import TextFrameVariant
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TextTermsMapper(object):
    """ Base mapper which forwards every term of a sequence
        to the handler that corresponds to the type of the term.
        Handlers are expected to be implemented in derived classes.
    """

    def iter_mapped(self, terms):
        """ Iterates over the mapped terms of the given sequence.
            Terms whose handler results in None are skipped.

            terms: Iterable
                sequence of str, Token, TextFrameVariant or Entity instances.
        """
        assert isinstance(terms, Iterable)

        self._before_mapping()

        for index, term in enumerate(terms):
            mapped = self.__map_single(index, term)
            if mapped is None:
                continue
            yield mapped

        self._after_mapping()

    def __map_single(self, index, term):
        """ Selects the handler by the type of the term; the order of checks matters.
        """
        if isinstance(term, str):
            return self.map_word(index, term)
        if isinstance(term, Token):
            return self.map_token(index, term)
        if isinstance(term, TextFrameVariant):
            return self.map_text_frame_variant(index, term)
        if isinstance(term, Entity):
            return self.map_entity(index, term)
        raise Exception("Unsupported type {}".format(term))

    def _before_mapping(self):
        """ Hook called once before the first term is handled. """
        pass

    def _after_mapping(self):
        """ Hook called once after the last term has been handled. """
        pass

    def map_word(self, w_ind, word):
        """ Handler of str terms. """
        raise NotImplementedError()

    def map_token(self, t_ind, token):
        """ Handler of Token terms. """
        raise NotImplementedError()

    def map_text_frame_variant(self, fv_ind, text_frame_variant):
        """ Handler of TextFrameVariant terms. """
        raise NotImplementedError()

    def map_entity(self, e_ind, entity):
        """ Handler of Entity terms. """
        raise NotImplementedError()
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
class Token:
    """
    Pair of the original term and the token value assigned to it,
    for example: term=',', token_value='<[COMMA]>'
    """

    def __init__(self, term, token_value):
        """
        term: str
            original value.
        token_value: str
            resulted token value.
        """
        assert isinstance(token_value, str)
        assert isinstance(term, str)
        self.__token_value = token_value
        self.__meta_value = term

    def get_meta_value(self):
        """ Provides the original term. """
        return self.__meta_value

    def get_token_value(self):
        """ Provides the resulted token value. """
        return self.__token_value
|
|
File without changes
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Names of the common columns.
ID = "id"
DOC_ID = "doc_id"
TEXT = "text_a"
LABEL_UINT = "label_uint"
LABEL_STR = "label_str"

# Global identifier of the opinion in the sampled data.
OPINION_ID = "opinion_id"
OPINION_LINKAGE_ID = "linkage_id"

# Fields with the attitude ends (indices, INT).
S_IND = "s_ind"
T_IND = "t_ind"

# Sentence index.
SENT_IND = "sent_ind"

# Entity parameters.
ENTITY_VALUES = "entity_values"
ENTITY_TYPES = "entity_types"
ENTITIES = "entities"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from arekit.common.data import const
|
|
2
|
+
from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class SampleColumnsProvider(BaseColumnsProvider):
    """
    Describes columns of the samples.
    Label columns are included only when `store_labels` is set.
    """

    def __init__(self, store_labels):
        super().__init__()
        self.__store_labels = store_labels
        # Has to be assigned via `set_text_column_names`
        # before the list of columns is requested.
        self.__text_column_names = None

    # region properties

    @property
    def StoreLabels(self):
        return self.__store_labels

    @property
    def TextColumnNames(self):
        return self.__text_column_names

    # endregion

    def get_columns_list_with_types(self):
        """
        Extends the columns declared by the base provider with the following:
            [id, doc_id, label_uint*, label_str*, <text columns>,
             s_ind, t_ind, opinion_id, linkage_id]
        where columns marked with (*) appear only when labels are stored.
        """
        columns = super().get_columns_list_with_types()

        columns.extend([
            (const.ID, str),
            (const.DOC_ID, str),
        ])

        # labels
        if self.__store_labels:
            columns.extend([
                (const.LABEL_UINT, 'int32'),
                (const.LABEL_STR, str),
            ])

        # text columns
        columns.extend((name, str) for name in self.__text_column_names)

        # attitude ends indices, followed by
        # opinion-extraction task related fields
        columns.extend([
            (const.S_IND, 'int32'),
            (const.T_IND, 'int32'),
            (const.OPINION_ID, 'int32'),
            (const.OPINION_LINKAGE_ID, 'int32'),
        ])

        return columns

    def set_text_column_names(self, text_column_names):
        """
        text_column_names: list
            names of the columns that keep text contents.
        """
        assert isinstance(text_column_names, list)
        self.__text_column_names = text_column_names
|
|
File without changes
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from arekit.common.linkage.base import LinkedDataWrapper
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BaseLinkedDataInstancesProvider(object):
    """ Base provider of the instances composed from the linked data.
    """

    def iter_instances(self, linked_data):
        """ Expected to be implemented in derived classes. """
        raise NotImplementedError()

    @staticmethod
    def provide_label(linked_data):
        """ Provides the label of the first element of the linkage.

            linked_data: LinkedDataWrapper
        """
        assert isinstance(linked_data, LinkedDataWrapper)
        first_element = linked_data.First
        return first_element.Label
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from arekit.common.data.input.providers.instances.base import BaseLinkedDataInstancesProvider
|
|
2
|
+
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
|
|
3
|
+
from arekit.common.text_opinions.base import TextOpinion
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class MultipleInstancesLinkedTextOpinionsProvider(BaseLinkedDataInstancesProvider):
    """ Provides one linkage instance per every supported label.
    """

    def __init__(self, supported_labels):
        """
        supported_labels: list
            labels to be assigned one by one.
        """
        assert isinstance(supported_labels, list)
        self.__supported_labels = supported_labels

    def iter_instances(self, linked_data):
        """ Yields a copy of the linkage for every supported label,
            where the label is assigned to the first text opinion.
        """
        yield from (self.__modify_first_and_copy_linked_wrap(linked_data, label)
                    for label in self.__supported_labels)

    @staticmethod
    def __modify_first_and_copy_linked_wrap(text_opinions_linkage, label):
        """ Composes a new linkage in which the first text opinion is replaced
            with its copy that has the given label; the rest elements are kept.
        """
        assert isinstance(text_opinions_linkage, TextOpinionsLinkage)

        opinions = list(text_opinions_linkage)

        relabeled = TextOpinion.create_copy(other=opinions[0])
        relabeled.set_label(label=label)
        opinions[0] = relabeled

        return TextOpinionsLinkage(linked_data=opinions)
|
|
File without changes
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from arekit.common.labels.scaler.base import BaseLabelScaler
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class LabelProvider(object):
    """ Base provider of the output labels, built on top of a label scaler.
    """

    def __init__(self, label_scaler):
        """
        label_scaler: BaseLabelScaler
        """
        assert isinstance(label_scaler, BaseLabelScaler)
        self.__label_scaler = label_scaler

    @property
    def LabelScaler(self):
        """ Scaler passed on initialization. """
        return self.__label_scaler

    @property
    def SupportedLabels(self):
        """ Ordered supported labels, as reported by the scaler. """
        scaler = self.__label_scaler
        return scaler.ordered_suppoted_labels()

    @property
    def OutputLabelsUint(self):
        """ Expected to be implemented in derived classes. """
        raise NotImplementedError()

    def calculate_output_uint_label(self, expected_uint_label, etalon_uint_label):
        """ Expected to be implemented in derived classes. """
        raise NotImplementedError()
|
|
24
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from arekit.common.data.input.providers.label.base import LabelProvider
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BinaryLabelProvider(LabelProvider):
    """ Provider with two output labels: 1 for a match, 0 otherwise.
    """

    def calculate_output_uint_label(self, expected_uint_label, etalon_uint_label):
        """ Results in 1 when the expected label equals the etalon one, otherwise 0.
        """
        if expected_uint_label == etalon_uint_label:
            return 1
        return 0

    @property
    def OutputLabelsUint(self):
        """ All the possible output labels. """
        return [0, 1]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from arekit.common.data.input.providers.label.base import LabelProvider
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class MultipleLabelProvider(LabelProvider):
    """ Provider which keeps the expected label as the output one.
    """

    def __init__(self, label_scaler):
        super().__init__(label_scaler=label_scaler)

    def calculate_output_uint_label(self, expected_uint_label, etalon_uint_label):
        """ The etalon label is not considered: the expected label is returned as is.
        """
        return expected_uint_label

    @property
    def OutputLabelsUint(self):
        """ Uint values of all the supported labels, in the order of the latter. """
        to_uint = self.LabelScaler.label_to_uint
        return list(map(to_uint, self.SupportedLabels))
|
|
15
|
+
|
|
File without changes
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from arekit.common.data.input.providers.contents import ContentsProvider
|
|
6
|
+
from arekit.common.linkage.base import LinkedDataWrapper
|
|
7
|
+
from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider
|
|
8
|
+
from arekit.common.linkage.meta import MetaEmptyLinkedDataWrapper
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BaseRowProvider(object):
    """ Base class of the providers that compose rows from the linked data
        of documents (rows are meant to be filled into BaseRowsStorage).
    """

    def __init__(self):
        # Counter of the iterated rows; exists only while `iter_by_rows` is running.
        self.__rows_counter = None

    # region protected methods

    # TODO. This might be also generalized.
    # TODO. Idle-mode is also a implementation and task specific parameter, i.e. might be removed from here.
    def _provide_rows(self, parsed_doc, entity_service, text_opinion_linkage, idle_mode):
        """ Expected to be implemented in derived classes. """
        raise NotImplementedError()

    def _count_row(self):
        """ Registers one more iterated row.

            returns: amount of rows registered before this call.
        """
        counter = self.__rows_counter
        index = counter["rows_iterated"]
        counter["rows_iterated"] = index + 1
        return index

    # endregion

    def __iter_rows(self, linked_data, idle_mode):
        """ Requests rows for the linked data, using the services
            provided by the tag of the latter.
        """
        service = linked_data.Tag
        parsed_doc = service.ParsedDocument
        entity_service = service.get_provider(EntityServiceProvider.NAME)
        return self._provide_rows(parsed_doc=parsed_doc,
                                  entity_service=entity_service,
                                  text_opinion_linkage=linked_data,
                                  idle_mode=idle_mode)

    def iter_by_rows(self, contents_provider, doc_ids_iter, idle_mode):
        """ Iterates over pairs (related document id, row data)
            for all the linked data provided for the given documents.

            contents_provider: ContentsProvider
            doc_ids_iter: Iterable
            idle_mode: bool
        """
        assert isinstance(contents_provider, ContentsProvider)
        assert isinstance(doc_ids_iter, Iterable)

        self.__rows_counter = Counter()

        for linked_data in contents_provider.from_doc_ids(doc_ids=doc_ids_iter, idle_mode=idle_mode):
            assert isinstance(linked_data, LinkedDataWrapper)

            if not isinstance(linked_data, MetaEmptyLinkedDataWrapper):
                # Consider the actual rows of the related linked data.
                rows = self.__iter_rows(linked_data=linked_data, idle_mode=idle_mode)
            elif idle_mode:
                # In the case of the IDLE mode we do not consider the meta-data.
                rows = []
            else:
                # Consider the actual linked data instance.
                rows = [linked_data]

            for row in rows:
                yield linked_data.RelatedDocID, row

        self.__rows_counter = None
|