arekit 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/__init__.py +0 -0
- arekit/common/__init__.py +0 -0
- arekit/common/bound.py +48 -0
- arekit/common/context/__init__.py +0 -0
- arekit/common/context/terms_mapper.py +51 -0
- arekit/common/context/token.py +16 -0
- arekit/common/data/__init__.py +0 -0
- arekit/common/data/const.py +21 -0
- arekit/common/data/doc_provider.py +6 -0
- arekit/common/data/input/__init__.py +0 -0
- arekit/common/data/input/providers/__init__.py +0 -0
- arekit/common/data/input/providers/columns/__init__.py +0 -0
- arekit/common/data/input/providers/columns/base.py +9 -0
- arekit/common/data/input/providers/columns/sample.py +59 -0
- arekit/common/data/input/providers/const.py +3 -0
- arekit/common/data/input/providers/contents.py +9 -0
- arekit/common/data/input/providers/instances/__init__.py +0 -0
- arekit/common/data/input/providers/instances/base.py +14 -0
- arekit/common/data/input/providers/instances/multiple.py +27 -0
- arekit/common/data/input/providers/instances/single.py +8 -0
- arekit/common/data/input/providers/label/__init__.py +0 -0
- arekit/common/data/input/providers/label/base.py +24 -0
- arekit/common/data/input/providers/label/binary.py +11 -0
- arekit/common/data/input/providers/label/multiple.py +15 -0
- arekit/common/data/input/providers/rows/__init__.py +0 -0
- arekit/common/data/input/providers/rows/base.py +64 -0
- arekit/common/data/input/providers/rows/samples.py +227 -0
- arekit/common/data/input/providers/sample/__init__.py +0 -0
- arekit/common/data/input/providers/sample/cropped.py +43 -0
- arekit/common/data/input/providers/text/__init__.py +0 -0
- arekit/common/data/input/providers/text/single.py +49 -0
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +68 -0
- arekit/common/data/input/repositories/sample.py +22 -0
- arekit/common/data/input/sample.py +66 -0
- arekit/common/data/input/terms_mapper.py +88 -0
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/__init__.py +0 -0
- arekit/common/data/storages/base.py +109 -0
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +26 -0
- arekit/common/docs/__init__.py +0 -0
- arekit/common/docs/base.py +30 -0
- arekit/common/docs/entities_grouping.py +16 -0
- arekit/common/docs/entity.py +18 -0
- arekit/common/docs/objects_parser.py +37 -0
- arekit/common/docs/parsed/__init__.py +0 -0
- arekit/common/docs/parsed/base.py +101 -0
- arekit/common/docs/parsed/providers/__init__.py +0 -0
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/docs/parsed/providers/base_pairs.py +51 -0
- arekit/common/docs/parsed/providers/entity_service.py +175 -0
- arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
- arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parsed/term_position.py +42 -0
- arekit/common/docs/parser.py +34 -0
- arekit/common/docs/sentence.py +14 -0
- arekit/common/entities/__init__.py +0 -0
- arekit/common/entities/base.py +51 -0
- arekit/common/entities/collection.py +72 -0
- arekit/common/entities/str_fmt.py +8 -0
- arekit/common/entities/types.py +9 -0
- arekit/common/experiment/__init__.py +0 -0
- arekit/common/experiment/api/__init__.py +0 -0
- arekit/common/experiment/api/base_samples_io.py +20 -0
- arekit/common/experiment/data_type.py +17 -0
- arekit/common/frames/__init__.py +0 -0
- arekit/common/frames/connotations/__init__.py +0 -0
- arekit/common/frames/connotations/descriptor.py +17 -0
- arekit/common/frames/connotations/provider.py +4 -0
- arekit/common/frames/text_variant.py +43 -0
- arekit/common/frames/variants/__init__.py +0 -0
- arekit/common/frames/variants/base.py +21 -0
- arekit/common/frames/variants/collection.py +60 -0
- arekit/common/labels/__init__.py +0 -0
- arekit/common/labels/base.py +19 -0
- arekit/common/labels/provider/__init__.py +0 -0
- arekit/common/labels/provider/base.py +7 -0
- arekit/common/labels/provider/constant.py +14 -0
- arekit/common/labels/scaler/__init__.py +0 -0
- arekit/common/labels/scaler/base.py +85 -0
- arekit/common/labels/scaler/sentiment.py +7 -0
- arekit/common/labels/scaler/single.py +10 -0
- arekit/common/labels/str_fmt.py +55 -0
- arekit/common/linkage/__init__.py +0 -0
- arekit/common/linkage/base.py +44 -0
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +9 -0
- arekit/common/linkage/text_opinions.py +22 -0
- arekit/common/log_utils.py +29 -0
- arekit/common/model/__init__.py +0 -0
- arekit/common/model/labeling/__init__.py +0 -0
- arekit/common/model/labeling/base.py +24 -0
- arekit/common/model/labeling/modes.py +8 -0
- arekit/common/model/labeling/single.py +24 -0
- arekit/common/opinions/__init__.py +0 -0
- arekit/common/opinions/annot/__init__.py +0 -0
- arekit/common/opinions/annot/algo/__init__.py +0 -0
- arekit/common/opinions/annot/algo/base.py +4 -0
- arekit/common/opinions/annot/algo/pair_based.py +99 -0
- arekit/common/opinions/annot/algo/predefined.py +16 -0
- arekit/common/opinions/annot/algo_based.py +55 -0
- arekit/common/opinions/annot/base.py +15 -0
- arekit/common/opinions/base.py +74 -0
- arekit/common/opinions/collection.py +150 -0
- arekit/common/opinions/enums.py +6 -0
- arekit/common/opinions/provider.py +4 -0
- arekit/common/opinions/writer.py +4 -0
- arekit/common/pipeline/__init__.py +0 -0
- arekit/common/pipeline/base.py +25 -0
- arekit/common/pipeline/context.py +36 -0
- arekit/common/pipeline/conts.py +2 -0
- arekit/common/pipeline/items/__init__.py +0 -0
- arekit/common/pipeline/items/base.py +12 -0
- arekit/common/pipeline/items/flatten.py +14 -0
- arekit/common/pipeline/items/handle.py +17 -0
- arekit/common/pipeline/items/iter.py +11 -0
- arekit/common/pipeline/items/map.py +11 -0
- arekit/common/pipeline/items/map_nested.py +13 -0
- arekit/common/synonyms/__init__.py +0 -0
- arekit/common/synonyms/base.py +151 -0
- arekit/common/synonyms/grouping.py +21 -0
- arekit/common/text/__init__.py +0 -0
- arekit/common/text/enums.py +12 -0
- arekit/common/text/parsed.py +42 -0
- arekit/common/text/parser.py +12 -0
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +4 -0
- arekit/common/text/partitioning/str.py +36 -0
- arekit/common/text/partitioning/terms.py +35 -0
- arekit/common/text/stemmer.py +16 -0
- arekit/common/text_opinions/__init__.py +0 -0
- arekit/common/text_opinions/base.py +105 -0
- arekit/common/utils.py +129 -0
- arekit/contrib/__init__.py +0 -0
- arekit/contrib/bert/__init__.py +0 -0
- arekit/contrib/bert/input/__init__.py +0 -0
- arekit/contrib/bert/input/providers/__init__.py +0 -0
- arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
- arekit/contrib/bert/input/providers/text_pair.py +62 -0
- arekit/contrib/bert/terms/__init__.py +0 -0
- arekit/contrib/bert/terms/mapper.py +20 -0
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +149 -0
- arekit/contrib/networks/embedding_io.py +18 -0
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +6 -0
- arekit/contrib/networks/input/ctx_serialization.py +28 -0
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +29 -0
- arekit/contrib/networks/input/embedding/offsets.py +55 -0
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +129 -0
- arekit/contrib/networks/input/providers/term_connotation.py +23 -0
- arekit/contrib/networks/input/providers/text.py +24 -0
- arekit/contrib/networks/input/rows_parser.py +47 -0
- arekit/contrib/networks/input/term_types.py +13 -0
- arekit/contrib/networks/input/terms_mapping.py +60 -0
- arekit/contrib/networks/vectorizer.py +6 -0
- arekit/contrib/prompt/__init__.py +0 -0
- arekit/contrib/prompt/sample.py +61 -0
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +84 -0
- arekit/contrib/source/brat/doc.py +28 -0
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +13 -0
- arekit/contrib/source/brat/entities/entity.py +42 -0
- arekit/contrib/source/brat/entities/parser.py +53 -0
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +19 -0
- arekit/contrib/source/brat/relation.py +32 -0
- arekit/contrib/source/brat/sentence.py +69 -0
- arekit/contrib/source/brat/sentences_reader.py +128 -0
- arekit/contrib/source/download.py +41 -0
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +55 -0
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +74 -0
- arekit/contrib/source/nerel/io_utils.py +62 -0
- arekit/contrib/source/nerel/labels.py +241 -0
- arekit/contrib/source/nerel/reader.py +46 -0
- arekit/contrib/source/nerel/utils.py +24 -0
- arekit/contrib/source/nerel/versions.py +12 -0
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +62 -0
- arekit/contrib/source/nerelbio/labels.py +265 -0
- arekit/contrib/source/nerelbio/reader.py +8 -0
- arekit/contrib/source/nerelbio/versions.py +8 -0
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +36 -0
- arekit/contrib/source/ruattitudes/doc.py +51 -0
- arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
- arekit/contrib/source/ruattitudes/io_utils.py +56 -0
- arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
- arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
- arekit/contrib/source/ruattitudes/reader.py +268 -0
- arekit/contrib/source/ruattitudes/sentence.py +73 -0
- arekit/contrib/source/ruattitudes/synonyms.py +17 -0
- arekit/contrib/source/ruattitudes/text_object.py +59 -0
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +157 -0
- arekit/contrib/source/rusentiframes/effect.py +24 -0
- arekit/contrib/source/rusentiframes/io_utils.py +19 -0
- arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
- arekit/contrib/source/rusentiframes/polarity.py +35 -0
- arekit/contrib/source/rusentiframes/role.py +15 -0
- arekit/contrib/source/rusentiframes/state.py +24 -0
- arekit/contrib/source/rusentiframes/types.py +42 -0
- arekit/contrib/source/rusentiframes/value.py +2 -0
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +3 -0
- arekit/contrib/source/rusentrel/docs_reader.py +51 -0
- arekit/contrib/source/rusentrel/entities.py +26 -0
- arekit/contrib/source/rusentrel/io_utils.py +125 -0
- arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
- arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
- arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
- arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
- arekit/contrib/source/rusentrel/synonyms.py +17 -0
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +52 -0
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +31 -0
- arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
- arekit/contrib/source/sentinerel/io_utils.py +87 -0
- arekit/contrib/source/sentinerel/labels.py +53 -0
- arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
- arekit/contrib/source/sentinerel/reader.py +42 -0
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +19 -0
- arekit/contrib/source/zip_utils.py +47 -0
- arekit/contrib/utils/__init__.py +0 -0
- arekit/contrib/utils/bert/__init__.py +0 -0
- arekit/contrib/utils/bert/samplers.py +17 -0
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
- arekit/contrib/utils/data/__init__.py +0 -0
- arekit/contrib/utils/data/contents/__init__.py +0 -0
- arekit/contrib/utils/data/contents/opinions.py +37 -0
- arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +7 -0
- arekit/contrib/utils/data/readers/csv_pd.py +38 -0
- arekit/contrib/utils/data/readers/jsonl.py +15 -0
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +50 -0
- arekit/contrib/utils/data/storages/__init__.py +0 -0
- arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
- arekit/contrib/utils/data/storages/pandas_based.py +123 -0
- arekit/contrib/utils/data/storages/row_cache.py +48 -0
- arekit/contrib/utils/data/writers/__init__.py +0 -0
- arekit/contrib/utils/data/writers/base.py +27 -0
- arekit/contrib/utils/data/writers/csv_native.py +63 -0
- arekit/contrib/utils/data/writers/csv_pd.py +40 -0
- arekit/contrib/utils/data/writers/json_opennre.py +132 -0
- arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
- arekit/contrib/utils/download.py +77 -0
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +58 -0
- arekit/contrib/utils/embeddings/tokens.py +30 -0
- arekit/contrib/utils/entities/__init__.py +0 -0
- arekit/contrib/utils/entities/filter.py +7 -0
- arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit/contrib/utils/entities/formatters/str_display.py +11 -0
- arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
- arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit/contrib/utils/io_utils/embedding.py +72 -0
- arekit/contrib/utils/io_utils/opinions.py +37 -0
- arekit/contrib/utils/io_utils/samples.py +79 -0
- arekit/contrib/utils/io_utils/utils.py +39 -0
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +41 -0
- arekit/contrib/utils/lexicons/relation.py +42 -0
- arekit/contrib/utils/lexicons/rusentilex.py +37 -0
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +83 -0
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +22 -0
- arekit/contrib/utils/np_utils/npz_utils.py +13 -0
- arekit/contrib/utils/np_utils/vocab.py +20 -0
- arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
- arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
- arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
- arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
- arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
- arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
- arekit/contrib/utils/processing/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +12 -0
- arekit/contrib/utils/processing/languages/pos.py +23 -0
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
- arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
- arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
- arekit/contrib/utils/processing/languages/ru/number.py +23 -0
- arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +12 -0
- arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
- arekit/contrib/utils/processing/pos/russian.py +10 -0
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +127 -0
- arekit/contrib/utils/resources.py +25 -0
- arekit/contrib/utils/serializer.py +43 -0
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
- arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit/contrib/utils/synonyms/simple.py +15 -0
- arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +93 -0
- arekit/contrib/utils/vectorizers/random_norm.py +39 -0
- arekit/download_data.py +11 -0
- arekit-0.24.0.dist-info/LICENSE +21 -0
- arekit-0.24.0.dist-info/METADATA +23 -0
- arekit-0.24.0.dist-info/RECORD +374 -0
- arekit-0.24.0.dist-info/WHEEL +5 -0
- arekit-0.24.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from arekit.common.data.input.providers.label.multiple import MultipleLabelProvider
|
|
2
|
+
from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class CroppedSampleRowProvider(BaseSampleRowProvider):
    """ Sample provider which has `crop_window_size` that allows to slice
        the potentially large samples and guarantee the presence of
        attitude inside.
    """

    def __init__(self, crop_window_size, label_scaler, text_provider):
        """ crop_window_size: int
                positive amount of terms to keep in every cropped sample.
            label_scaler:
                wrapped into `MultipleLabelProvider` and passed to the base provider.
            text_provider:
                passed to the base provider as is.
        """
        assert(isinstance(crop_window_size, int) and crop_window_size > 0)
        super(CroppedSampleRowProvider, self).__init__(label_provider=MultipleLabelProvider(label_scaler),
                                                       text_provider=text_provider)
        self.__crop_window_size = crop_window_size

    @staticmethod
    def __calc_window_bounds(window_size, s_ind, t_ind, input_length):
        """ Finds the leftmost window of `window_size` terms that covers
            both `s_ind` and `t_ind`.

            returns: [_from, _to)
                `_to` is always `_from + window_size`; it is not clamped by
                `input_length`, so it may exceed the amount of terms.
            raises: ValueError
                when there is no window that covers both indices, i.e. one of
                the indices is negative or they are `window_size` or more
                terms apart (the step-by-step search would never stop here).
        """
        assert(isinstance(s_ind, int))
        assert(isinstance(t_ind, int))
        assert(isinstance(input_length, int))
        assert(input_length >= s_ind and input_length >= t_ind)

        left = min(s_ind, t_ind)
        right = max(s_ind, t_ind)

        if left < 0 or right - left >= window_size:
            raise ValueError(
                "Indices s_ind={} and t_ind={} cannot be covered by a window of size {}".format(
                    s_ind, t_ind, window_size))

        # The window starts at 0 and is shifted to the right only as far
        # as it is required to reach the rightmost index.
        _from = max(0, right - window_size + 1)
        return _from, _from + window_size

    def _provide_sentence_terms(self, parsed_doc, sentence_ind, s_ind, t_ind):
        """ Crops the sentence terms provided by the base implementation.

            returns: tuple
                (cropped list of terms, source index, target index), where
                the indices returned by the base implementation are shifted
                by the left bound of the crop window.
        """
        terms_iter, src_ind, tgt_ind = super(CroppedSampleRowProvider, self)._provide_sentence_terms(
            parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
        terms = list(terms_iter)
        _from, _to = self.__calc_window_bounds(window_size=self.__crop_window_size,
                                               s_ind=s_ind, t_ind=t_ind, input_length=len(terms))
        return terms[_from:_to], src_ind - _from, tgt_ind - _from
|
|
File without changes
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from arekit.common.data import const
|
|
2
|
+
from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
|
|
3
|
+
from arekit.common.labels.base import Label
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BaseSingleTextProvider(object):
    """ Composes a single text value from sentence terms and stores it
        into the `TEXT_A` column of a row.
    """

    TEXT_A = const.TEXT
    TERMS_SEPARATOR = " "

    def __init__(self, text_terms_mapper):
        """ text_terms_mapper: OpinionContainingTextTermsMapper
                mapper utilized to convert sentence terms into mapped data.
        """
        assert(isinstance(text_terms_mapper, OpinionContainingTextTermsMapper))
        self._mapper = text_terms_mapper

    def iter_columns(self):
        """ Iterates over names of the text columns filled by this provider.
        """
        yield from (BaseSingleTextProvider.TEXT_A,)

    @staticmethod
    def _process_text(text):
        """ Removes leading and trailing whitespaces from the composed text.
        """
        assert(isinstance(text, str))
        stripped = text.strip()
        return stripped

    def _mapped_data_to_str(self, m_data):
        """ Converts mapped data into the text piece; by default returned as is.
        """
        return m_data

    def _handle_mapped_data(self, m_data):
        """ Hook for optional handling of every mapped data; does nothing by default.
        """
        pass

    def _handle_terms_and_compose_text(self, sentence_terms):
        """ Maps every term, passes the mapped data to the handling hook,
            and joins the resulted pieces with `TERMS_SEPARATOR`.
        """
        assert(isinstance(sentence_terms, list))

        pieces = []
        mapped_iter = self._mapper.iter_mapped(sentence_terms)

        for mapped in mapped_iter:
            piece = self._mapped_data_to_str(m_data=mapped)
            pieces.append(piece)
            self._handle_mapped_data(m_data=mapped)

        separator = self.TERMS_SEPARATOR
        return separator.join(pieces)

    def add_text_in_row(self, set_text_func, sentence_terms, s_ind, t_ind, expected_label):
        """ Declares source and target indices for the mapper, composes the
            text out of `sentence_terms` and passes it into `set_text_func`
            under the `TEXT_A` column.
        """
        assert(callable(set_text_func))
        assert(isinstance(sentence_terms, list))
        assert(isinstance(expected_label, Label))

        mapper = self._mapper
        mapper.set_s_ind(s_ind)
        mapper.set_t_ind(t_ind)

        column_name = self.TEXT_A
        composed = self._handle_terms_and_compose_text(sentence_terms)
        set_text_func(column=column_name, value=self._process_text(text=composed))
|
|
File without changes
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
|
|
2
|
+
from arekit.common.data.input.providers.contents import ContentsProvider
|
|
3
|
+
from arekit.common.data.input.providers.rows.base import BaseRowProvider
|
|
4
|
+
from arekit.common.data.storages.base import BaseRowsStorage
|
|
5
|
+
from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
|
|
6
|
+
from arekit.contrib.utils.data.writers.base import BaseWriter
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BaseInputRepository(object):
    """ Binds columns provider, rows provider and storage together in order
        to fill the storage with rows and optionally write them.
    """

    def __init__(self, columns_provider, rows_provider, storage):
        """ columns_provider: BaseColumnsProvider
            rows_provider: BaseRowProvider
            storage: BaseRowsStorage
        """
        assert(isinstance(columns_provider, BaseColumnsProvider))
        assert(isinstance(rows_provider, BaseRowProvider))
        assert(isinstance(storage, BaseRowsStorage))

        self._columns_provider = columns_provider
        self._rows_provider = rows_provider
        self._storage = storage

        # Do setup operations.
        self._setup_columns_provider()
        self._setup_rows_provider()

    # region protected methods

    def _setup_columns_provider(self):
        """ Hook for the columns provider setup; does nothing by default.
        """
        pass

    def _setup_rows_provider(self):
        """ Hook for the rows provider setup; does nothing by default.
        """
        pass

    # endregion

    def populate(self, contents_provider, doc_ids, desc="", writer=None, target=None):
        """ Fills the storage with the rows provided for the documents of `doc_ids`.

            contents_provider: ContentsProvider
            doc_ids: list
                documents to iterate rows for.
            desc: str
                description passed to the storage `fill` method.
            writer: BaseWriter or None
            target: str or None
                when both `writer` and `target` are declared, the target is
                opened before filling, `writer.commit_line` is invoked by the
                row handler, and the target is closed afterwards.
        """
        assert(isinstance(contents_provider, ContentsProvider))
        assert(isinstance(self._storage, BaseRowsStorage))
        assert(isinstance(doc_ids, list))
        assert(isinstance(writer, BaseWriter) or writer is None)
        assert(isinstance(target, str) or target is None)

        def iter_rows(idle_mode):
            return self._rows_provider.iter_by_rows(
                contents_provider=contents_provider,
                doc_ids_iter=doc_ids,
                idle_mode=idle_mode)

        self._storage.init_empty(columns_provider=self._columns_provider)

        is_async_write_mode_on = writer is not None and target is not None

        if is_async_write_mode_on:
            writer.open_target(target)

        try:
            self._storage.fill(iter_rows,
                               columns_provider=self._columns_provider,
                               row_handler=lambda: writer.commit_line(self._storage) if is_async_write_mode_on else None,
                               desc=desc)
        finally:
            # The opened target has to be closed even when filling fails,
            # otherwise it remains opened.
            if is_async_write_mode_on:
                writer.close_target()

    def push(self, writer, target, free_storage=True):
        """ Writes the whole storage into `target`, unless the storage
            is a `RowCacheStorage`, and optionally frees the storage.
        """
        if not isinstance(self._storage, RowCacheStorage):
            writer.write_all(self._storage, target)

        # After writing we free the contents of the storage.
        if free_storage:
            self._storage.free()
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
|
|
4
|
+
from arekit.common.data.input.repositories.base import BaseInputRepository
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class BaseInputSamplesRepository(BaseInputRepository):
    """ Input repository of samples, which synchronizes the settings
        between the rows provider and the columns provider.
    """

    def _setup_columns_provider(self):
        """ Passes the names of the text columns, declared by the text
            provider of the rows provider, into the columns provider.
        """
        text_provider = self._rows_provider.TextProvider
        names = [column_name for column_name in text_provider.iter_columns()]
        self._columns_provider.set_text_column_names(names)

    def _setup_rows_provider(self):
        """ Passes the `StoreLabels` value of the columns provider
            into the rows provider.
        """
        rows_provider = self._rows_provider
        assert(isinstance(rows_provider, BaseSampleRowProvider))
        store_labels = self._columns_provider.StoreLabels
        rows_provider.set_store_labels(store_labels)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from collections import OrderedDict
|
|
2
|
+
|
|
3
|
+
from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider, DistanceType
|
|
4
|
+
from arekit.common.text_opinions.base import TextOpinion
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class InputSampleBase(object):
    """
    A single sample (context) of a model, kept as an ordered
    collection of (key, value) pairs.
    """

    def __init__(self, shift_index_dbg, input_sample_id, values):
        """ shift_index_dbg: int
            input_sample_id: str
            values: list
                pairs that are kept in an `OrderedDict`.
        """
        assert(isinstance(shift_index_dbg, int))
        assert(isinstance(input_sample_id, str))
        assert(isinstance(values, list))
        self._shift_index_dbg = shift_index_dbg
        self.__input_sample_id = input_sample_id
        self.__values = OrderedDict(values)

    # region properties

    @property
    def ID(self):
        return self.__input_sample_id

    # endregion

    @staticmethod
    def check_ability_to_create_sample(entity_service, window_size, text_opinion):
        """
        Main text_opinion filtering rules: the opinion ends differ,
        the distance between them in terms is less than `window_size`,
        and both ends are in the same sentence.
        """
        assert(isinstance(entity_service, EntityServiceProvider))
        assert(isinstance(text_opinion, TextOpinion))
        assert(isinstance(window_size, int) and window_size > 0)

        # Every check is evaluated; the verdict is composed afterwards.
        ends_differ = bool(text_opinion.SourceId != text_opinion.TargetId)

        terms_dist = entity_service.calc_dist_between_text_opinion_ends(
            text_opinion=text_opinion,
            distance_type=DistanceType.InTerms)
        fits_window = bool(
            InputSampleBase._check_ends_could_be_fitted_in_window(terms_dist, window_size))

        sents_dist = entity_service.calc_dist_between_text_opinion_ends(
            text_opinion=text_opinion,
            distance_type=DistanceType.InSentences)
        same_sentence = bool(sents_dist == 0)

        return ends_differ and fits_window and same_sentence

    @staticmethod
    def _check_ends_could_be_fitted_in_window(actual_dist, window):
        """ Whether the distance is strictly less than the window.
        """
        return actual_dist < window

    def __iter__(self):
        """ Iterates over the (key, value) pairs in their original order.
        """
        yield from self.__values.items()
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
from arekit.common.context.terms_mapper import TextTermsMapper
|
|
2
|
+
from arekit.common.context.token import Token
|
|
3
|
+
from arekit.common.entities.base import Entity
|
|
4
|
+
from arekit.common.entities.str_fmt import StringEntitiesFormatter
|
|
5
|
+
from arekit.common.entities.types import OpinionEntityType
|
|
6
|
+
from arekit.common.frames.text_variant import TextFrameVariant
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class OpinionContainingTextTermsMapper(TextTermsMapper):
    """
    Provides an ability to setup s_obj, t_obj
    The latter might be utilized with synonyms collection
    """

    def __init__(self, entity_formatter):
        """ entity_formatter: StringEntitiesFormatter
                formatter utilized in `map_entity` to convert an entity into string.
        """
        assert(isinstance(entity_formatter, StringEntitiesFormatter))
        self.__entities_formatter = entity_formatter
        # Indices of the subject / object terms; declared via set_s_ind / set_t_ind.
        self.__s_ind = None
        self.__t_ind = None
        # Group indices of the subject / object entities; calculated in iter_mapped.
        self.__s_group = None
        self.__t_group = None

    @property
    def StringEntitiesFormatter(self):
        return self.__entities_formatter

    def __syn_group(self, entity):
        """ Provides the GroupIndex of the entity, or None when entity is None.
            Note: here we guarantee that entity has GroupIndex.
        """
        # None is a legal input: iter_mapped passes it when the related index
        # is not declared, therefore the type check is limited to actual values.
        assert(entity is None or isinstance(entity, Entity))
        return entity.GroupIndex if entity is not None else None

    def set_s_ind(self, s_ind):
        """ Declares the index of the subject term.
        """
        assert(isinstance(s_ind, int))
        self.__s_ind = s_ind

    def set_t_ind(self, t_ind):
        """ Declares the index of the object term.
        """
        assert(isinstance(t_ind, int))
        self.__t_ind = t_ind

    def _after_mapping(self):
        """ In order to prevent bugs.
            Every index should be declared before mapping.
        """
        self.__s_ind = None
        self.__t_ind = None

    def iter_mapped(self, terms):
        """ Calculates groups of the declared subject / object terms and
            then delegates the mapping to the base class.
        """
        # Indexing requires a sequence. A one-shot iterator is materialized once
        # and the very same list is passed further, so the base class does not
        # receive an already exhausted iterator.
        terms_list = terms if isinstance(terms, list) else list(terms)
        self.__s_group = self.__syn_group(terms_list[self.__s_ind] if self.__s_ind is not None else None)
        self.__t_group = self.__syn_group(terms_list[self.__t_ind] if self.__t_ind is not None else None)
        return super(OpinionContainingTextTermsMapper, self).iter_mapped(terms_list)

    def map_entity(self, e_ind, entity):
        """ Chooses the OpinionEntityType of the entity and formats the entity
            by means of the entity formatter.

            The exact index match (subject, then object) has priority over
            the match by group (subject group, then object group).
        """
        entity_type = OpinionEntityType.Other
        if e_ind == self.__s_ind:
            entity_type = OpinionEntityType.Subject
        elif e_ind == self.__t_ind:
            entity_type = OpinionEntityType.Object
        elif self.__is_in_same_group(self.__syn_group(entity), self.__s_group):
            entity_type = OpinionEntityType.SynonymSubject
        elif self.__is_in_same_group(self.__syn_group(entity), self.__t_group):
            entity_type = OpinionEntityType.SynonymObject

        return self.__entities_formatter.to_string(original_value=entity,
                                                   entity_type=entity_type)

    @staticmethod
    def __is_in_same_group(g1, g2):
        """ returns: bool
                True only when both groups are known and equal.
        """
        if g1 is None or g2 is None:
            # In such scenario we cannot guarantee
            # that g1 and g2 belong to the same group.
            return False

        return g1 == g2

    def map_word(self, w_ind, word):
        return word.strip()

    def map_text_frame_variant(self, fv_ind, text_frame_variant):
        assert(isinstance(text_frame_variant, TextFrameVariant))
        return text_frame_variant.Variant.get_value().strip()

    def map_token(self, t_ind, token):
        assert(isinstance(token, Token))
        return token.get_meta_value()
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from arekit.common.data import const
|
|
2
|
+
from arekit.common.utils import filter_whitespaces, split_by_whitespaces
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def process_values_list(value, args_sep):
    """ Splits the value into the list of parts using `args_sep` as a separator.
    """
    parts = value.split(args_sep)
    return parts
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def process_indices_list(value, no_value_func, args_sep):
    """ Converts the value into the list of integer indices.

        value:
            string of indices joined by `args_sep`, or a single integer index.
        no_value_func: func
            called (without arguments) to produce the result for an empty value.
        args_sep: str
            separator of the indices in the string representation.

        returns: list of int, or the result of `no_value_func()` for an
            empty value (None, empty string, etc.).
    """
    import numbers

    # Integer zero is falsy, but it is a valid single index, so it must not
    # be treated as an absent value. Booleans are intentionally excluded.
    is_integer = isinstance(value, numbers.Integral) and not isinstance(value, bool)

    if not value and not is_integer:
        return no_value_func()

    return [int(v) for v in str(value).split(args_sep)]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def process_text(value):
    """ The core method of the input text processing.
        A string is split by whitespaces first; a list is taken as it is.
        In both cases the terms are passed through `filter_whitespaces`.
    """
    assert(isinstance(value, (str, list)))

    if isinstance(value, str):
        terms = list(split_by_whitespaces(value))
    else:
        terms = value

    return filter_whitespaces(terms)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def create_base_column_value_fmt(no_value_func=lambda: None, args_sep=","):
    """ Composes the dictionary which maps every base column (keys are taken
        from `const`) onto the pair of callables: "writer" and "parser".

        no_value_func: func
            passed to the parser of the entity indices as the provider of
            the result for an empty value.
        args_sep: str
            separator utilized to join / split the list-like column values.
    """

    def as_is(value):
        return value

    def as_int(value):
        return int(value)

    def write_display_values(entities):
        # The separator is dropped from the values to keep the joined string splittable.
        return args_sep.join([e.DisplayValue.replace(args_sep, '') for e in entities])

    def write_types(entities):
        return args_sep.join([e.Type.replace(args_sep, '') for e in entities])

    def write_indices(entity_inds):
        return args_sep.join(entity_inds)

    def parse_values(value):
        return process_values_list(value, args_sep=args_sep)

    def parse_indices(value):
        return process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)

    def parse_text(value):
        return process_text(value)

    fmt = dict()
    fmt[const.ID] = {"writer": as_is, "parser": as_is}
    fmt[const.DOC_ID] = {"writer": as_is, "parser": as_is}
    fmt[const.S_IND] = {"writer": as_is, "parser": as_int}
    fmt[const.T_IND] = {"writer": as_is, "parser": as_int}
    fmt[const.SENT_IND] = {"writer": as_is, "parser": as_int}
    fmt[const.OPINION_ID] = {"writer": as_is, "parser": as_int}
    fmt[const.OPINION_LINKAGE_ID] = {"writer": as_is, "parser": as_int}
    fmt[const.ENTITY_VALUES] = {"writer": write_display_values, "parser": parse_values}
    fmt[const.ENTITY_TYPES] = {"writer": write_types, "parser": parse_values}
    fmt[const.ENTITIES] = {"writer": write_indices, "parser": parse_indices}
    fmt[const.TEXT] = {"writer": as_is, "parser": parse_text}
    fmt[const.LABEL_UINT] = {"writer": as_is, "parser": as_int}
    return fmt
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def create_base_column_fmt(fmt_type, args_sep=","):
    """ Composes the dictionary which maps every base column onto a single
        callable, selected by `fmt_type` from the related value formatters
        of `create_base_column_value_fmt`.

        fmt_type: str
            key of the callable within the value formatters of a column.
        args_sep: str
            separator, passed to `create_base_column_value_fmt`.
    """
    assert(isinstance(fmt_type, str))
    value_fmts = create_base_column_value_fmt(args_sep=args_sep)
    return {column: fmts[fmt_type] for column, fmts in value_fmts.items()}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
class ParsedSampleRow(object):
    """ Keeps the values of a sample row, converted by the column formatters.
    """

    def __init__(self, row, columns_fmts, no_value_func):
        """ row: dict
                pairs of ("field_name", value)
            columns_fmts: list
                dictionaries; each one maps a field name onto the callable
                which is applied to the value of that field.
            no_value_func: func
                called to produce the result for an absent parameter.
        """
        assert(isinstance(row, dict))
        assert(isinstance(columns_fmts, list))
        assert(callable(no_value_func))

        self.__uint_label = None
        self.__params = {}
        self.__no_value = no_value_func

        for field_name, raw_value in row.items():
            for fmt in columns_fmts:
                assert(isinstance(fmt, dict))
                if field_name in fmt:
                    # Only the first formatter that knows the field is applied;
                    # fields unknown to every formatter are not kept.
                    self.__params[field_name] = fmt[field_name](raw_value)
                    break

    def __value_or_none(self, key):
        if key in self.__params:
            return self.__params[key]
        return self.__no_value()

    def __getitem__(self, item):
        """ Provides the parsed value of the field, or the result of the
            `no_value_func` call when the field is None or not parsed.
        """
        assert (isinstance(item, str) or item is None)
        if item not in self.__params or item is None:
            return self.__no_value()
        return self.__params[item]

    @classmethod
    def parse(cls, row, columns_fmts, no_value_func):
        """ Creates an instance from the given row; arguments are the same
            as for the constructor.
        """
        return cls(row=row, columns_fmts=columns_fmts, no_value_func=no_value_func)
|
|
File without changes
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import gc
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
|
|
5
|
+
from arekit.common.linkage.meta import MetaEmptyLinkedDataWrapper
|
|
6
|
+
from arekit.common.utils import progress_bar_conditional
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseRowsStorage(object):
    """ Base class of the rows storage.

        Declares the operations to be implemented by the particular storage
        (every one raises NotImplementedError here) and provides the `fill`
        method which populates the storage row by row.
    """

    # region protected methods

    def _begin_filling_row(self, row_ind):
        """ Hook, called by `fill` before the values of the row are set.
            Does nothing by default.
        """
        pass

    # endregion

    # region abstract methods

    # NOTE: `NotImplemented` is a constant for the rich comparison protocol and
    # it is not callable; the proper exception type is `NotImplementedError`.

    def _set_row_value(self, row_ind, column, value):
        raise NotImplementedError()

    def _iter_rows(self):
        """ returns: tuple(int, list)
                provides the index (int) and the related content of the row (dict)
        """
        raise NotImplementedError()

    def _get_rows_count(self):
        raise NotImplementedError()

    def find_by_value(self, column_name, value):
        raise NotImplementedError()

    def find_first_by_value(self, column_name, value):
        raise NotImplementedError()

    def iter_column_values(self, column_name, dtype=None):
        raise NotImplementedError()

    def get_row(self, row_index):
        raise NotImplementedError()

    def get_cell(self, row_index, column_name):
        raise NotImplementedError()

    def init_empty(self, columns_provider):
        raise NotImplementedError()

    def iter_shuffled(self):
        raise NotImplementedError()

    def iter_column_names(self):
        raise NotImplementedError()

    def iter_column_types(self):
        raise NotImplementedError()

    # endregion

    # region public methods

    def fill(self, iter_rows_func, columns_provider, row_handler=None, rows_count=None, desc=""):
        """ Populates the storage with the rows provided by `iter_rows_func`.

            iter_rows_func: func
                called with a single `False` argument; expected to provide
                an iterable of pairs (doc_id, row_values), where row_values
                is a dictionary of (column, value).
            columns_provider: BaseColumnsProvider
                only the type of this argument is checked here.
            row_handler: func or None
                called without arguments after every row is filled.
            rows_count: int or None
                passed as `total` to the progress bar.
            desc: str
                description for the progress bar.
        """
        assert(callable(iter_rows_func))
        assert(isinstance(columns_provider, BaseColumnsProvider))
        assert(callable(row_handler) or row_handler is None)

        doc_ids_seen = set()

        def postfix_func(item):
            doc_id, _ = item
            doc_ids_seen.add(doc_id)
            return {
                "docs_seen": len(doc_ids_seen),
                "doc_now": str(doc_id)
            }

        pbar_it = progress_bar_conditional(
            iterable=iter_rows_func(False),
            # We skip meta information data.
            condition_func=lambda item: not isinstance(item[1], MetaEmptyLinkedDataWrapper),
            postfix_func=postfix_func,
            desc="{fmt}".format(fmt=desc),
            total=rows_count)

        for row_index, item in enumerate(pbar_it):
            _, row_values = item
            self._begin_filling_row(row_index)
            for column, value in row_values.items():
                self._set_row_value(row_ind=row_index,
                                    column=column,
                                    value=value)
            if row_handler is not None:
                row_handler()

    def free(self):
        """ Triggers the garbage collection.
        """
        gc.collect()

    # endregion

    # region base methods

    def __iter__(self):
        return self._iter_rows()

    def __len__(self):
        return self._get_rows_count()

    # endregion
|
|
File without changes
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from arekit.common.data import const
|
|
2
|
+
from arekit.common.data.storages.base import BaseRowsStorage
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# TODO. This is a particular type of view, and expected to be off the core.
|
|
6
|
+
class LinkedSamplesStorageView(object):
    """ Provides the rows of the storage grouped into lists by opinion id.
    """

    def iter_from_storage(self, storage):
        """ Yields lists of rows; consecutive rows that share the same value
            of the `const.OPINION_ID` column fall into the same list.

            storage: BaseRowsStorage
        """
        assert(isinstance(storage, BaseRowsStorage))

        # Marks the state in which the id of the current group is not known yet.
        undefined = -1

        group_rows = []
        group_opinion_id = undefined

        for row_index, opinion_id in enumerate(storage.iter_column_values(const.OPINION_ID)):

            if group_opinion_id == undefined:
                # No boundary is reported while the id of the group is unknown.
                group_opinion_id = opinion_id
            elif opinion_id != group_opinion_id:
                # The id has changed: the collected group is complete.
                yield group_rows
                group_rows = []
                group_opinion_id = opinion_id

            group_rows.append(storage.get_row(row_index))

        # The last group has no following row to trigger the boundary.
        if len(group_rows) > 0:
            yield group_rows
|
|
File without changes
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
class Document(object):
    """ Document as an identifier with the related list of sentences.
    """

    def __init__(self, doc_id, sentences):
        """ doc_id: identifier of the document.
            sentences: list
        """
        assert(isinstance(sentences, list))
        self.__id = doc_id
        self._sentences = sentences

    # region properties

    @property
    def ID(self):
        return self.__id

    @property
    def SentencesCount(self):
        """ Provides total amount of sentences within a doc
            At present is useful for:
                - CV-splitters, which may rely on sentences count.
                - Text parsing.
        """
        amount = len(self._sentences)
        return amount

    # endregion

    def iter_sentences(self):
        """ Iterates over the sentences in the order they are stored.
        """
        yield from self._sentences

    def get_sentence(self, s_ind):
        """ Provides the sentence by its index in the list of sentences.
        """
        sentence = self._sentences[s_ind]
        return sentence
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from arekit.common.entities.base import Entity
|
|
2
|
+
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class EntitiesGroupingPipelineItem(BasePipelineItem):
    """ Pipeline item that assigns a group index to every entity of the input.
    """

    def __init__(self, value_to_group_id_func):
        """ value_to_group_id_func: func
                called with the value of an entity; the result is set
                as the group index of that entity.
        """
        assert(callable(value_to_group_id_func))
        self.__value_to_group_id_func = value_to_group_id_func

    def apply_core(self, input_data, pipeline_ctx):
        """ input_data: list
                terms; only the instances of Entity are affected.
        """
        assert(isinstance(input_data, list))

        for term in input_data:
            if not isinstance(term, Entity):
                continue
            term.set_group_index(self.__value_to_group_id_func(term.Value))
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from arekit.common.entities.base import Entity
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DocumentEntity(Entity):
    """ Entity which additionally keeps its identifier within a document.
    """

    def __init__(self, value, display_value, e_type, childs, id_in_doc, group_index):
        """ id_in_doc: Id, utilized within the internal services
        """
        super().__init__(value=value,
                         display_value=display_value,
                         e_type=e_type,
                         group_index=group_index,
                         childs=childs)
        self.__id = id_in_doc

    @property
    def IdInDocument(self):
        return self.__id
|