arekit 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/__init__.py +0 -0
- arekit/common/__init__.py +0 -0
- arekit/common/bound.py +48 -0
- arekit/common/context/__init__.py +0 -0
- arekit/common/context/terms_mapper.py +51 -0
- arekit/common/context/token.py +16 -0
- arekit/common/data/__init__.py +0 -0
- arekit/common/data/const.py +21 -0
- arekit/common/data/doc_provider.py +6 -0
- arekit/common/data/input/__init__.py +0 -0
- arekit/common/data/input/providers/__init__.py +0 -0
- arekit/common/data/input/providers/columns/__init__.py +0 -0
- arekit/common/data/input/providers/columns/base.py +9 -0
- arekit/common/data/input/providers/columns/sample.py +59 -0
- arekit/common/data/input/providers/const.py +3 -0
- arekit/common/data/input/providers/contents.py +9 -0
- arekit/common/data/input/providers/instances/__init__.py +0 -0
- arekit/common/data/input/providers/instances/base.py +14 -0
- arekit/common/data/input/providers/instances/multiple.py +27 -0
- arekit/common/data/input/providers/instances/single.py +8 -0
- arekit/common/data/input/providers/label/__init__.py +0 -0
- arekit/common/data/input/providers/label/base.py +24 -0
- arekit/common/data/input/providers/label/binary.py +11 -0
- arekit/common/data/input/providers/label/multiple.py +15 -0
- arekit/common/data/input/providers/rows/__init__.py +0 -0
- arekit/common/data/input/providers/rows/base.py +64 -0
- arekit/common/data/input/providers/rows/samples.py +227 -0
- arekit/common/data/input/providers/sample/__init__.py +0 -0
- arekit/common/data/input/providers/sample/cropped.py +43 -0
- arekit/common/data/input/providers/text/__init__.py +0 -0
- arekit/common/data/input/providers/text/single.py +49 -0
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +68 -0
- arekit/common/data/input/repositories/sample.py +22 -0
- arekit/common/data/input/sample.py +66 -0
- arekit/common/data/input/terms_mapper.py +88 -0
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/__init__.py +0 -0
- arekit/common/data/storages/base.py +109 -0
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +26 -0
- arekit/common/docs/__init__.py +0 -0
- arekit/common/docs/base.py +30 -0
- arekit/common/docs/entities_grouping.py +16 -0
- arekit/common/docs/entity.py +18 -0
- arekit/common/docs/objects_parser.py +37 -0
- arekit/common/docs/parsed/__init__.py +0 -0
- arekit/common/docs/parsed/base.py +101 -0
- arekit/common/docs/parsed/providers/__init__.py +0 -0
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/docs/parsed/providers/base_pairs.py +51 -0
- arekit/common/docs/parsed/providers/entity_service.py +175 -0
- arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
- arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parsed/term_position.py +42 -0
- arekit/common/docs/parser.py +34 -0
- arekit/common/docs/sentence.py +14 -0
- arekit/common/entities/__init__.py +0 -0
- arekit/common/entities/base.py +51 -0
- arekit/common/entities/collection.py +72 -0
- arekit/common/entities/str_fmt.py +8 -0
- arekit/common/entities/types.py +9 -0
- arekit/common/experiment/__init__.py +0 -0
- arekit/common/experiment/api/__init__.py +0 -0
- arekit/common/experiment/api/base_samples_io.py +20 -0
- arekit/common/experiment/data_type.py +17 -0
- arekit/common/frames/__init__.py +0 -0
- arekit/common/frames/connotations/__init__.py +0 -0
- arekit/common/frames/connotations/descriptor.py +17 -0
- arekit/common/frames/connotations/provider.py +4 -0
- arekit/common/frames/text_variant.py +43 -0
- arekit/common/frames/variants/__init__.py +0 -0
- arekit/common/frames/variants/base.py +21 -0
- arekit/common/frames/variants/collection.py +60 -0
- arekit/common/labels/__init__.py +0 -0
- arekit/common/labels/base.py +19 -0
- arekit/common/labels/provider/__init__.py +0 -0
- arekit/common/labels/provider/base.py +7 -0
- arekit/common/labels/provider/constant.py +14 -0
- arekit/common/labels/scaler/__init__.py +0 -0
- arekit/common/labels/scaler/base.py +85 -0
- arekit/common/labels/scaler/sentiment.py +7 -0
- arekit/common/labels/scaler/single.py +10 -0
- arekit/common/labels/str_fmt.py +55 -0
- arekit/common/linkage/__init__.py +0 -0
- arekit/common/linkage/base.py +44 -0
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +9 -0
- arekit/common/linkage/text_opinions.py +22 -0
- arekit/common/log_utils.py +29 -0
- arekit/common/model/__init__.py +0 -0
- arekit/common/model/labeling/__init__.py +0 -0
- arekit/common/model/labeling/base.py +24 -0
- arekit/common/model/labeling/modes.py +8 -0
- arekit/common/model/labeling/single.py +24 -0
- arekit/common/opinions/__init__.py +0 -0
- arekit/common/opinions/annot/__init__.py +0 -0
- arekit/common/opinions/annot/algo/__init__.py +0 -0
- arekit/common/opinions/annot/algo/base.py +4 -0
- arekit/common/opinions/annot/algo/pair_based.py +99 -0
- arekit/common/opinions/annot/algo/predefined.py +16 -0
- arekit/common/opinions/annot/algo_based.py +55 -0
- arekit/common/opinions/annot/base.py +15 -0
- arekit/common/opinions/base.py +74 -0
- arekit/common/opinions/collection.py +150 -0
- arekit/common/opinions/enums.py +6 -0
- arekit/common/opinions/provider.py +4 -0
- arekit/common/opinions/writer.py +4 -0
- arekit/common/pipeline/__init__.py +0 -0
- arekit/common/pipeline/base.py +25 -0
- arekit/common/pipeline/context.py +36 -0
- arekit/common/pipeline/conts.py +2 -0
- arekit/common/pipeline/items/__init__.py +0 -0
- arekit/common/pipeline/items/base.py +12 -0
- arekit/common/pipeline/items/flatten.py +14 -0
- arekit/common/pipeline/items/handle.py +17 -0
- arekit/common/pipeline/items/iter.py +11 -0
- arekit/common/pipeline/items/map.py +11 -0
- arekit/common/pipeline/items/map_nested.py +13 -0
- arekit/common/synonyms/__init__.py +0 -0
- arekit/common/synonyms/base.py +151 -0
- arekit/common/synonyms/grouping.py +21 -0
- arekit/common/text/__init__.py +0 -0
- arekit/common/text/enums.py +12 -0
- arekit/common/text/parsed.py +42 -0
- arekit/common/text/parser.py +12 -0
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +4 -0
- arekit/common/text/partitioning/str.py +36 -0
- arekit/common/text/partitioning/terms.py +35 -0
- arekit/common/text/stemmer.py +16 -0
- arekit/common/text_opinions/__init__.py +0 -0
- arekit/common/text_opinions/base.py +105 -0
- arekit/common/utils.py +129 -0
- arekit/contrib/__init__.py +0 -0
- arekit/contrib/bert/__init__.py +0 -0
- arekit/contrib/bert/input/__init__.py +0 -0
- arekit/contrib/bert/input/providers/__init__.py +0 -0
- arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
- arekit/contrib/bert/input/providers/text_pair.py +62 -0
- arekit/contrib/bert/terms/__init__.py +0 -0
- arekit/contrib/bert/terms/mapper.py +20 -0
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +149 -0
- arekit/contrib/networks/embedding_io.py +18 -0
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +6 -0
- arekit/contrib/networks/input/ctx_serialization.py +28 -0
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +29 -0
- arekit/contrib/networks/input/embedding/offsets.py +55 -0
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +129 -0
- arekit/contrib/networks/input/providers/term_connotation.py +23 -0
- arekit/contrib/networks/input/providers/text.py +24 -0
- arekit/contrib/networks/input/rows_parser.py +47 -0
- arekit/contrib/networks/input/term_types.py +13 -0
- arekit/contrib/networks/input/terms_mapping.py +60 -0
- arekit/contrib/networks/vectorizer.py +6 -0
- arekit/contrib/prompt/__init__.py +0 -0
- arekit/contrib/prompt/sample.py +61 -0
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +84 -0
- arekit/contrib/source/brat/doc.py +28 -0
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +13 -0
- arekit/contrib/source/brat/entities/entity.py +42 -0
- arekit/contrib/source/brat/entities/parser.py +53 -0
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +19 -0
- arekit/contrib/source/brat/relation.py +32 -0
- arekit/contrib/source/brat/sentence.py +69 -0
- arekit/contrib/source/brat/sentences_reader.py +128 -0
- arekit/contrib/source/download.py +41 -0
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +55 -0
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +74 -0
- arekit/contrib/source/nerel/io_utils.py +62 -0
- arekit/contrib/source/nerel/labels.py +241 -0
- arekit/contrib/source/nerel/reader.py +46 -0
- arekit/contrib/source/nerel/utils.py +24 -0
- arekit/contrib/source/nerel/versions.py +12 -0
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +62 -0
- arekit/contrib/source/nerelbio/labels.py +265 -0
- arekit/contrib/source/nerelbio/reader.py +8 -0
- arekit/contrib/source/nerelbio/versions.py +8 -0
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +36 -0
- arekit/contrib/source/ruattitudes/doc.py +51 -0
- arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
- arekit/contrib/source/ruattitudes/io_utils.py +56 -0
- arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
- arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
- arekit/contrib/source/ruattitudes/reader.py +268 -0
- arekit/contrib/source/ruattitudes/sentence.py +73 -0
- arekit/contrib/source/ruattitudes/synonyms.py +17 -0
- arekit/contrib/source/ruattitudes/text_object.py +59 -0
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +157 -0
- arekit/contrib/source/rusentiframes/effect.py +24 -0
- arekit/contrib/source/rusentiframes/io_utils.py +19 -0
- arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
- arekit/contrib/source/rusentiframes/polarity.py +35 -0
- arekit/contrib/source/rusentiframes/role.py +15 -0
- arekit/contrib/source/rusentiframes/state.py +24 -0
- arekit/contrib/source/rusentiframes/types.py +42 -0
- arekit/contrib/source/rusentiframes/value.py +2 -0
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +3 -0
- arekit/contrib/source/rusentrel/docs_reader.py +51 -0
- arekit/contrib/source/rusentrel/entities.py +26 -0
- arekit/contrib/source/rusentrel/io_utils.py +125 -0
- arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
- arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
- arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
- arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
- arekit/contrib/source/rusentrel/synonyms.py +17 -0
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +52 -0
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +31 -0
- arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
- arekit/contrib/source/sentinerel/io_utils.py +87 -0
- arekit/contrib/source/sentinerel/labels.py +53 -0
- arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
- arekit/contrib/source/sentinerel/reader.py +42 -0
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +19 -0
- arekit/contrib/source/zip_utils.py +47 -0
- arekit/contrib/utils/__init__.py +0 -0
- arekit/contrib/utils/bert/__init__.py +0 -0
- arekit/contrib/utils/bert/samplers.py +17 -0
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
- arekit/contrib/utils/data/__init__.py +0 -0
- arekit/contrib/utils/data/contents/__init__.py +0 -0
- arekit/contrib/utils/data/contents/opinions.py +37 -0
- arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +7 -0
- arekit/contrib/utils/data/readers/csv_pd.py +38 -0
- arekit/contrib/utils/data/readers/jsonl.py +15 -0
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +50 -0
- arekit/contrib/utils/data/storages/__init__.py +0 -0
- arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
- arekit/contrib/utils/data/storages/pandas_based.py +123 -0
- arekit/contrib/utils/data/storages/row_cache.py +48 -0
- arekit/contrib/utils/data/writers/__init__.py +0 -0
- arekit/contrib/utils/data/writers/base.py +27 -0
- arekit/contrib/utils/data/writers/csv_native.py +63 -0
- arekit/contrib/utils/data/writers/csv_pd.py +40 -0
- arekit/contrib/utils/data/writers/json_opennre.py +132 -0
- arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
- arekit/contrib/utils/download.py +77 -0
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +58 -0
- arekit/contrib/utils/embeddings/tokens.py +30 -0
- arekit/contrib/utils/entities/__init__.py +0 -0
- arekit/contrib/utils/entities/filter.py +7 -0
- arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit/contrib/utils/entities/formatters/str_display.py +11 -0
- arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
- arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit/contrib/utils/io_utils/embedding.py +72 -0
- arekit/contrib/utils/io_utils/opinions.py +37 -0
- arekit/contrib/utils/io_utils/samples.py +79 -0
- arekit/contrib/utils/io_utils/utils.py +39 -0
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +41 -0
- arekit/contrib/utils/lexicons/relation.py +42 -0
- arekit/contrib/utils/lexicons/rusentilex.py +37 -0
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +83 -0
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +22 -0
- arekit/contrib/utils/np_utils/npz_utils.py +13 -0
- arekit/contrib/utils/np_utils/vocab.py +20 -0
- arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
- arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
- arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
- arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
- arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
- arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
- arekit/contrib/utils/processing/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +12 -0
- arekit/contrib/utils/processing/languages/pos.py +23 -0
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
- arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
- arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
- arekit/contrib/utils/processing/languages/ru/number.py +23 -0
- arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +12 -0
- arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
- arekit/contrib/utils/processing/pos/russian.py +10 -0
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +127 -0
- arekit/contrib/utils/resources.py +25 -0
- arekit/contrib/utils/serializer.py +43 -0
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
- arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit/contrib/utils/synonyms/simple.py +15 -0
- arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +93 -0
- arekit/contrib/utils/vectorizers/random_norm.py +39 -0
- arekit/download_data.py +11 -0
- arekit-0.24.0.dist-info/LICENSE +21 -0
- arekit-0.24.0.dist-info/METADATA +23 -0
- arekit-0.24.0.dist-info/RECORD +374 -0
- arekit-0.24.0.dist-info/WHEEL +5 -0
- arekit-0.24.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from os import path
|
|
3
|
+
from os.path import basename, join
|
|
4
|
+
|
|
5
|
+
import enum
|
|
6
|
+
|
|
7
|
+
from arekit.contrib.source.sentinerel.folding.factory import SentiNERELFoldingFactory
|
|
8
|
+
from arekit.contrib.source.zip_utils import ZipArchiveUtils
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SentiNerelVersions(Enum):
    """ Enumeration of the versions of the collection which are supported.
        The value of every item is the version tag written as a string.
    """

    # The very first release.
    V1 = "v1_0"
    # Annotation of the second half of the texts has been updated (September 2022).
    V2 = "v2_0"
    # Annotation of the first half of the texts has been updated (October 2022).
    # This version became a source of the RuSentNE-2023 competition:
    # https://github.com/dialogue-evaluation/RuSentNE-evaluation
    V21 = "v2_1"


# The version which is selected by default.
DEFAULT_VERSION = SentiNerelVersions.V21
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class SentiNerelIOUtils(ZipArchiveUtils):
    """ Provides paths and readers for the contents of the SentiNEREL
        collection, which is kept in a zip archive.

        NOTE: names of the archive members are composed with "/" explicitly.
        Zip member names always use forward slashes, while `os.path.join`
        yields backslashes on Windows, so a member composed that way
        could not be found in the archive.
    """

    # Name of the root folder inside of the archive.
    inner_root = "sentiment_dataset"

    @staticmethod
    def get_archive_filepath(version):
        """ version: str
                version tag, i.e. the value of the version enum item.
            returns: str
                path of the archive in the data root directory.
        """
        # This one is a filesystem path, therefore the OS-specific join is correct here.
        return path.join(SentiNerelIOUtils.get_data_root(), "sentinerel-{}.zip".format(version))

    @staticmethod
    def __inner_path(*parts):
        """ Composes a name of the archive member out of its parts.
        """
        return "/".join(parts)

    @staticmethod
    def get_annotation_innerpath(filename):
        """ filename: str
                name of the document without extension.
            returns: str
                inner path of the `.ann` file of the document.
        """
        assert(isinstance(filename, str))
        return SentiNerelIOUtils.__inner_path(SentiNerelIOUtils.inner_root, "{}.ann".format(filename))

    @staticmethod
    def get_doc_innerpath(filename):
        """ filename: str
                name of the document without extension.
            returns: str
                inner path of the `.txt` file of the document.
        """
        assert(isinstance(filename, str))
        return SentiNerelIOUtils.__inner_path(SentiNerelIOUtils.inner_root, "{}.txt".format(filename))

    @staticmethod
    def __iter_filenames_from_dataset(folder_name, version):
        """ Iterates over the names (directory and extension are cropped)
            of the `.txt` archive members whose path mentions `folder_name`.
        """
        assert(isinstance(version, enum.Enum))
        assert(isinstance(folder_name, str))

        txt_extension = ".txt"

        for filename in SentiNerelIOUtils.iter_filenames_from_zip(version):

            # Only the texts of the documents are considered.
            if not filename.endswith(txt_extension):
                continue

            # Crop extension.
            filename = filename[:-len(txt_extension)]

            # NOTE: this is a substring check, not a prefix one.
            if folder_name not in filename:
                continue

            yield basename(filename)

    # region public methods

    @staticmethod
    def iter_collection_filenames(version=DEFAULT_VERSION):
        """ Iterates over pairs (doc_id, filename), where doc_id is
            an index of the filename in the order of the archive listing.
        """
        filenames_it = SentiNerelIOUtils.__iter_filenames_from_dataset(
            folder_name=SentiNerelIOUtils.inner_root, version=version)

        for doc_id, filename in enumerate(filenames_it):
            yield doc_id, filename

    @staticmethod
    def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
        """ Provides a fixed split of the dataset onto
            `test` and `training` part:
            https://github.com/nicolay-r/SentiNEREL-attitude-extraction

            docs_limit: passed as `limit` to the folding factory.
        """
        return SentiNerelIOUtils.read_from_zip(
            inner_path=SentiNerelIOUtils.__inner_path(SentiNerelIOUtils.inner_root, "split_fixed.txt"),
            process_func=lambda f: SentiNERELFoldingFactory.create_fixed_folding(file=f, limit=docs_limit),
            version=version)

    # endregion
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from arekit.common.labels.base import Label
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class OpinionBelongsTo(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class OpinionRelatesTo(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class NegEffectFrom(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class NegStateFrom(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class PosEffectFrom(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class PosAuthorFrom(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class NegAuthorFrom(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class PosStateFrom(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class NegativeTo(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class PositiveTo(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class AlternativeName(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class StateBelongsTo(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """


class OriginsFrom(Label):
    """ SentiNEREL label type; adds nothing on top of `Label`. """
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from collections import OrderedDict
|
|
2
|
+
|
|
3
|
+
from arekit.common.labels.scaler.base import BaseLabelScaler
|
|
4
|
+
from arekit.contrib.source.sentinerel import labels
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class SentiNerelLabelScaler(BaseLabelScaler):
    """ Label scaler over the label types declared in the SentiNEREL `labels` module.
        Every label is mapped onto its position in the list below; the very
        same mapping is passed to the base class both as `int` and `uint` one.
    """

    def __init__(self):

        # The order matters: position in the list becomes the value of the label.
        ordered_label_types = [
            labels.OpinionBelongsTo,
            labels.OpinionRelatesTo,
            labels.NegEffectFrom,
            labels.PosEffectFrom,
            labels.NegStateFrom,
            labels.PosStateFrom,
            labels.NegativeTo,
            labels.PositiveTo,
            labels.StateBelongsTo,
            labels.PosAuthorFrom,
            labels.NegAuthorFrom,
            labels.AlternativeName,
            labels.OriginsFrom,
        ]

        # NOTE: despite the name of the field, keys are label instances and values are uints.
        self.__uint_to_label_dict = OrderedDict(
            (label_type(), index) for index, label_type in enumerate(ordered_label_types))

        super(SentiNerelLabelScaler, self).__init__(int_dict=self.__uint_to_label_dict,
                                                    uint_dict=self.__uint_to_label_dict)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from arekit.contrib.source.brat.annot import BratAnnotationParser
|
|
2
|
+
from arekit.contrib.source.brat.doc import BratDocument
|
|
3
|
+
from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
|
|
4
|
+
from arekit.contrib.source.sentinerel.entities import SentiNerelEntityCollection
|
|
5
|
+
from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils, DEFAULT_VERSION
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SentiNerelDocReader(object):
    """ Reader of the SentiNEREL documents and their BRAT annotation,
        both taken from the archive of the collection.
    """

    @staticmethod
    def read_text_relations(filename, version):
        """ Reads the relations declared in the annotation file of the document.

            filename: str
                name of the document without extension.
            returns: list
                contents of the "relations" entry of the parsed annotation.
        """
        assert(isinstance(filename, str))

        def parse_relations(input_file):
            annotations = BratAnnotationParser.parse_annotations(
                input_file=input_file, encoding='utf-8-sig')
            return list(annotations["relations"])

        return SentiNerelIOUtils.read_from_zip(
            inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
            process_func=parse_relations,
            version=version)

    @staticmethod
    def read_document(filename, doc_id, version=DEFAULT_VERSION, entities_to_ignore=None):
        """ Composes the BRAT document out of its sentences, entities and relations.

            filename: str
                name of the document without extension.
            doc_id: int
                identifier assigned to the resulted document.
            entities_to_ignore: list or None
                entity types to be ignored; the predefined list is used in the case of None.
        """
        assert(isinstance(filename, str))
        assert(isinstance(doc_id, int))

        # TODO. #398 issue -- in some cases entities might be nested. Therefore we limit the set
        # TODO. of the potential named entities.
        if entities_to_ignore is None:
            eti = ["EFFECT_NEG", "EFFECT_POS", "ARGUMENT_NEG", "ARGUMENT_POS", "EVENT"]
        else:
            eti = entities_to_ignore

        entities = SentiNerelEntityCollection.read_collection(
            filename=filename, version=version, entities_to_ignore=eti)
        text_relations = SentiNerelDocReader.read_text_relations(filename=filename, version=version)

        def compose_document(input_file):
            # Entities and relations are already read at the moment of the call.
            sentences = BratDocumentSentencesReader.from_file(input_file=input_file, entities=entities)
            return BratDocument(doc_id=doc_id, sentences=sentences, text_relations=text_relations)

        return SentiNerelIOUtils.read_from_zip(
            inner_path=SentiNerelIOUtils.get_doc_innerpath(filename=filename),
            process_func=compose_document,
            version=version)
|
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from arekit.common.utils import progress_bar_defined
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def iter_synonym_groups(input_file, sep=",", desc=""):
    """ Iterates over the groups of synonyms: every line of the file
        is a single group, with the values separated by `sep`.

        input_file: opened file; its lines might be either `str` or `bytes`.
        desc: description passed to the progress bar.

        NOTE: lines are split as is, i.e. line endings are not cropped.
    """
    all_lines = input_file.readlines()

    tracked_lines = progress_bar_defined(all_lines, total=len(all_lines), desc=desc, unit="opins")

    for raw_line in tracked_lines:
        # Lines of a file opened in binary mode are decoded first.
        text = raw_line.decode() if isinstance(raw_line, bytes) else raw_line
        yield text.split(sep)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import zipfile
|
|
2
|
+
|
|
3
|
+
import enum
|
|
4
|
+
|
|
5
|
+
from arekit.common import utils
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ZipArchiveUtils(object):
    """ Base utilities for reading contents of a versioned zip archive.
        Inheritors declare where the archive of a particular version is kept.
    """

    @staticmethod
    def get_archive_filepath(version):
        """ Expected to be overridden: returns the path of the archive
            for the given version value.
        """
        raise NotImplementedError()

    @classmethod
    def read_from_zip(cls, inner_path, process_func, version):
        """ Opens the archive member and returns the result of `process_func` for it.

            inner_path: str
                name of the member inside of the archive.
            process_func:
                func which receives a file reader
            version: enum.Enum
                its value is passed to `get_archive_filepath`.
        """
        assert(isinstance(inner_path, str))
        assert(callable(process_func))
        assert(isinstance(version, enum.Enum))

        archive_path = cls.get_archive_filepath(version.value)

        with zipfile.ZipFile(archive_path, "r") as archive:
            with archive.open(inner_path, mode='r') as member_file:
                return process_func(member_file)

    @classmethod
    def iter_from_zip(cls, inner_path, process_func, version):
        """ Generator flavour of the `read_from_zip`: yields everything
            that `process_func` provides for the opened member, while
            the archive remains opened during the iteration.
        """
        assert(isinstance(inner_path, str))
        assert(callable(process_func))
        assert(isinstance(version, enum.Enum))

        archive_path = cls.get_archive_filepath(version.value)

        with zipfile.ZipFile(archive_path, "r") as archive:
            with archive.open(inner_path, mode='r') as member_file:
                for item in process_func(member_file):
                    yield item

    @classmethod
    def iter_filenames_from_zip(cls, version):
        """ Returns an iterator over the names of all the archive members.
        """
        assert(isinstance(version, enum.Enum))

        archive_path = cls.get_archive_filepath(version.value)

        with zipfile.ZipFile(archive_path, "r") as archive:
            # The list of names is complete before the archive gets closed.
            member_names = archive.namelist()
            return iter(member_names)

    @staticmethod
    def get_data_root():
        """ Returns the default download directory.
        """
        return utils.get_default_download_dir()
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from arekit.common.data.input.providers.label.multiple import MultipleLabelProvider
|
|
2
|
+
from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
|
|
3
|
+
from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
|
|
4
|
+
from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
|
|
5
|
+
from arekit.contrib.bert.input.providers.text_pair import PairTextProvider
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def create_sample_provider(label_scaler, text_terms_mapper, text_b_prompt=None):
    """ Composes the provider of the sample rows.

        label_scaler: passed to the `MultipleLabelProvider`.
        text_terms_mapper: OpinionContainingTextTermsMapper
        text_b_prompt: when given, texts are provided by `PairTextProvider`
            with this prompt; otherwise `BaseSingleTextProvider` is used.
    """
    assert(isinstance(text_terms_mapper, OpinionContainingTextTermsMapper))

    if text_b_prompt is None:
        text_provider = BaseSingleTextProvider(text_terms_mapper=text_terms_mapper)
    else:
        text_provider = PairTextProvider(text_b_prompt=text_b_prompt,
                                         text_terms_mapper=text_terms_mapper)

    return BaseSampleRowProvider(
        text_provider=text_provider,
        label_provider=MultipleLabelProvider(label_scaler=label_scaler))
|
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from arekit.common.frames.connotations.provider import FrameConnotationProvider
|
|
2
|
+
from arekit.contrib.source.rusentiframes.collection import RuSentiFramesCollection
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class RuSentiFramesConnotationProvider(FrameConnotationProvider):
    """ Connotation provider which relies on the A0->A1 label type
        of the RuSentiFrames collection.
        The details on the related collection are available at:
        https://github.com/nicolay-r/RuSentiFrames

        Papers:
            [1] Natalia Loukachevitch, Nicolay Rusnachenko: Sentiment Frames
                for Attitude Extraction in Russian, 2020
            [2] Distant Supervision for Sentiment Attitude Extraction, 2019
    """

    def __init__(self, collection):
        """ collection: RuSentiFramesCollection
                collection that is queried for the frame polarities.
        """
        assert(isinstance(collection, RuSentiFramesCollection))
        self.__collection = collection

    def try_provide(self, frame_id):
        """ Requests the collection for the polarity of the frame
            from the 'a0' role towards the 'a1' role; the outcome of
            `try_get_frame_polarity` is returned as is.
        """
        polarity = self.__collection.try_get_frame_polarity(
            frame_id=frame_id, role_src='a0', role_dest='a1')
        return polarity
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from arekit.common.data.input.providers.const import IDLE_MODE
|
|
2
|
+
from arekit.common.data.input.providers.contents import ContentsProvider
|
|
3
|
+
from arekit.common.linkage.base import LinkedDataWrapper
|
|
4
|
+
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
|
|
5
|
+
from arekit.common.pipeline.base import BasePipeline
|
|
6
|
+
from arekit.common.text_opinions.base import TextOpinion
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class InputTextOpinionProvider(ContentsProvider):
    """ Provides the linkages produced by the pipeline, assigning
        sequential identifiers to the text opinions on the way.
    """

    def __init__(self, pipeline):
        """ NOTE: it is important that the output of the pipeline
            results in TextOpinionLinkage instances.
            pipeline: id -> ... -> TextOpinionLinkage[]
        """
        assert(isinstance(pipeline, BasePipeline))
        self.__pipeline = pipeline
        # Assigned once the iteration over documents starts.
        self.__current_id = None

    def __assign_ids(self, linkage):
        """ Assigns the consecutive identifiers to every
            text opinion of the linkage.
        """
        assert(isinstance(linkage, TextOpinionsLinkage))
        for opinion_id, text_opinion in enumerate(linkage, start=self.__current_id):
            assert(isinstance(text_opinion, TextOpinion))
            text_opinion.set_text_opinion_id(opinion_id)
            # Keeping the counter up to date after every assigned opinion.
            self.__current_id = opinion_id + 1

    def from_doc_ids(self, doc_ids, idle_mode=False):
        """ Runs the pipeline for the given documents and yields every
            resulted linkage; the identifiers counter restarts from zero.
        """
        self.__current_id = 0
        params_dict = {IDLE_MODE: idle_mode}
        for linkage in self.__pipeline.run(doc_ids, params_dict=params_dict):
            assert(isinstance(linkage, LinkedDataWrapper))
            # Identifiers are assigned only for the text opinion linkages.
            if isinstance(linkage, TextOpinionsLinkage):
                self.__assign_ids(linkage)
            yield linkage
|
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from arekit.common.data.doc_provider import DocumentProvider
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DictionaryBasedDocumentProvider(DocumentProvider):
    """ Document provider backed by a dictionary of documents.
    """

    def __init__(self, d):
        """ d: dict
                documents which are looked up by their identifiers.
        """
        assert(isinstance(d, dict))
        super().__init__()
        self.__d = d

    def by_id(self, doc_id):
        """ Returns the dictionary entry for the given integer
            identifier; a missing identifier results in KeyError.
        """
        assert(isinstance(doc_id, int))
        document = self.__d[doc_id]
        return document
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from os.path import join
|
|
2
|
+
|
|
3
|
+
from arekit.common.data.doc_provider import DocumentProvider
|
|
4
|
+
from arekit.common.docs.base import Document
|
|
5
|
+
from arekit.common.docs.sentence import BaseDocumentSentence
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DirectoryFilesDocProvider(DocumentProvider):
    """ Document provider based on the list of file names
        located within the particular directory.
    """

    def __init__(self, dir_path, file_names=None, sentence_parser=None):
        """
            dir_path: str
                path to the root directory.
            file_names: list or None
                names of the files related to documents; the position
                in this list serves as the document identifier in `by_id`.
            sentence_parser: callable or None
                how the contents are supposed to be separated onto sentences:
                str -> list(str)
                When None, the contents are split by lines and
                every line is stripped.
        """
        assert(isinstance(dir_path, str))
        assert(file_names is None or isinstance(file_names, list))
        assert(sentence_parser is None or callable(sentence_parser))

        def split_by_lines(text):
            # Line-split sentence parser, utilized by default.
            return [t.strip() for t in text.split('\n')]

        self.__dir_path = dir_path
        self.__file_names = file_names
        self.__sentence_parser = split_by_lines if sentence_parser is None else sentence_parser

    def __read_doc(self, doc_id, contents):
        """ Composes a single document out of the text contents.
        """
        # Every parsed sentence is wrapped into the sentence instance.
        sentences = [BaseDocumentSentence(text)
                     for text in self.__sentence_parser(contents)]

        return Document(doc_id=doc_id, sentences=sentences)

    def by_id(self, doc_id):
        """ Reads the file which corresponds to the given position in the
            list of file names and composes the document out of it.
            NOTE: the file name serves as the identifier of the result document.
        """
        file_name = self.__file_names[doc_id]
        file_path = join(self.__dir_path, file_name)
        with open(file_path, "r") as f:
            contents = f.read()
            return self.__read_doc(doc_id=file_name, contents=contents)

    def __len__(self):
        """ Amount of the file names the provider was initialized with.
        """
        return len(self.__file_names)
|
|
File without changes
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
|
|
3
|
+
from arekit.contrib.utils.data.readers.base import BaseReader
|
|
4
|
+
from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PandasCsvReader(BaseReader):
    """ Represents a CSV-based reader, implemented via pandas API.
    """

    def __init__(self, sep='\t', header='infer', compression='infer', encoding='utf-8', col_types=None,
                 custom_extension=None):
        """ sep, header, compression, encoding:
                forwarded to the same-named parameters of `pandas.read_csv`.
            col_types: dict or None
                special assignation of types for certain columns,
                forwarded as `dtype`; None results in an empty dictionary.
            custom_extension: str or None
                overrides the default extension when provided.
        """
        self.__sep = sep
        self.__compression = compression
        self.__encoding = encoding
        self.__header = header
        self.__custom_extension = custom_extension
        self.__col_types = dict() if col_types is None else col_types

    def extension(self):
        """ Extension of the target files: ".tsv.gz" unless the
            custom one has been provided.
        """
        if self.__custom_extension is None:
            return ".tsv.gz"
        return self.__custom_extension

    def __from_csv(self, filepath):
        """ Reads the data frame by means of `pandas.read_csv`.
        """
        # Pandas is imported lazily, at the moment of the actual reading.
        pandas_module = importlib.import_module("pandas")
        read_options = {
            "sep": self.__sep,
            "encoding": self.__encoding,
            "compression": self.__compression,
            "dtype": self.__col_types,
            "header": self.__header,
        }
        return pandas_module.read_csv(filepath, **read_options)

    def read(self, target):
        """ Reads the target and wraps the result data frame
            into the pandas-based rows storage.
        """
        return PandasBasedRowsStorage(self.__from_csv(filepath=target))
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from arekit.contrib.utils.data.readers.base import BaseReader
|
|
2
|
+
from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class JsonlReader(BaseReader):
    """ Reader of the line-separated JSON (".jsonl") files.
    """

    def extension(self):
        """ Extension of the target files.
        """
        return ".jsonl"

    def read(self, target):
        """ Reads all the lines of the target and wraps them, unparsed,
            into the JSONL-based rows storage.
        """
        with open(target, "r") as f:
            # Lines are kept as raw strings, including their line endings.
            rows = f.readlines()
        return JsonlBasedRowsStorage(rows)
|
|
File without changes
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import gc
|
|
2
|
+
import importlib
|
|
3
|
+
from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class PandasBasedStorageBalancing(object):
    """ Balancing of the pandas-based rows storages.
    """

    @staticmethod
    def create_balanced_from(storage, column_name, free_origin=True):
        """ Performs oversampled balancing: every group of rows that shares
            the same value of the column is extended with the rows sampled
            (with replacement) from this group, up to the size of the largest group.

            Note: it is quite important to remove previously created storage
            in order to avoid memory leaking.

            storage: PandasBasedRowsStorage
                storage contents to be balanced.

            column_name: str
                column utilized for balancing.

            free_origin: bool
                indicates whether there is a need to release the resources
                utilized for the original storage.

            returns: PandasBasedRowsStorage
                new storage in which the sampled rows are followed by
                the rows of the original storage.
        """
        assert(isinstance(storage, PandasBasedRowsStorage))

        original_df = storage.DataFrame

        max_size = original_df[column_name].value_counts().max()

        # For every group: rows that are missing to reach the largest group size.
        dframes = [group.sample(max_size - len(group), replace=True)
                   for _, group in original_df.groupby(column_name)]

        pd = importlib.import_module("pandas")
        balanced_df = pd.concat(dframes + [original_df])

        # Dropping the local references onto the temporary created dataframes
        # and onto the original one. Deleting a loop variable does not release
        # the list items, so the names themselves are unbound, which is
        # required for the garbage collection below to be able to reclaim them.
        del dframes
        del original_df

        # Marking the original dataframe as released
        # in terms of the allocated memory for it.
        if free_origin:
            storage.free()

        gc.collect()

        return PandasBasedRowsStorage(df=balanced_df)
|
|
File without changes
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from arekit.common.data.storages.base import BaseRowsStorage
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class JsonlBasedRowsStorage(BaseRowsStorage):
    """ Rows storage in which every row is kept as a JSON-serialized string.
    """

    def __init__(self, rows):
        """ rows: list
                JSON-serialized rows; parsed lazily, on iteration.
        """
        assert(isinstance(rows, list))
        self.__rows = rows

    def _iter_rows(self):
        """ Yields pairs of the row index and the parsed content of the row.
        """
        for row_index, serialized_row in enumerate(self.__rows):
            assert(isinstance(serialized_row, str))
            parsed_row = json.loads(serialized_row)
            yield row_index, parsed_row

    def _get_rows_count(self):
        """ Amount of the rows kept in the storage.
        """
        rows_count = len(self.__rows)
        return rows_count
|