arekit 0.25.0__tar.gz → 0.25.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arekit-0.25.0 → arekit-0.25.1}/PKG-INFO +4 -5
- {arekit-0.25.0 → arekit-0.25.1}/README.md +4 -4
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/storages/base.py +4 -15
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parser.py +3 -30
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/base.py +1 -1
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/utils.py +11 -8
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/storages/pandas_based.py +2 -17
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/storages/row_cache.py +2 -1
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/storages/sqlite_based.py +2 -1
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/extraction.py +5 -4
- {arekit-0.25.0 → arekit-0.25.1}/arekit.egg-info/PKG-INFO +4 -5
- {arekit-0.25.0 → arekit-0.25.1}/arekit.egg-info/SOURCES.txt +1 -74
- {arekit-0.25.0 → arekit-0.25.1}/arekit.egg-info/requires.txt +0 -1
- {arekit-0.25.0 → arekit-0.25.1}/setup.py +2 -3
- arekit-0.25.0/arekit/common/data/input/repositories/base.py +0 -68
- arekit-0.25.0/arekit/common/data/input/repositories/sample.py +0 -22
- arekit-0.25.0/arekit/common/data/views/samples.py +0 -26
- arekit-0.25.0/arekit/common/service/sqlite.py +0 -36
- arekit-0.25.0/arekit/contrib/networks/embedding.py +0 -149
- arekit-0.25.0/arekit/contrib/networks/embedding_io.py +0 -18
- arekit-0.25.0/arekit/contrib/networks/input/const.py +0 -6
- arekit-0.25.0/arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit-0.25.0/arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit-0.25.0/arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit-0.25.0/arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit-0.25.0/arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit-0.25.0/arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit-0.25.0/arekit/contrib/networks/input/providers/text.py +0 -24
- arekit-0.25.0/arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit-0.25.0/arekit/contrib/networks/input/term_types.py +0 -13
- arekit-0.25.0/arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit-0.25.0/arekit/contrib/networks/vectorizer.py +0 -6
- arekit-0.25.0/arekit/contrib/utils/data/readers/base.py +0 -7
- arekit-0.25.0/arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit-0.25.0/arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit-0.25.0/arekit/contrib/utils/data/readers/sqlite.py +0 -14
- arekit-0.25.0/arekit/contrib/utils/data/service/balance.py +0 -50
- arekit-0.25.0/arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit-0.25.0/arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit-0.25.0/arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit-0.25.0/arekit/contrib/utils/data/writers/sqlite_native.py +0 -114
- arekit-0.25.0/arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit-0.25.0/arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit-0.25.0/arekit/contrib/utils/entities/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit-0.25.0/arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit-0.25.0/arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit-0.25.0/arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit-0.25.0/arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/pipelines/items/sampling/base.py +0 -94
- arekit-0.25.0/arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -55
- arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -33
- arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -105
- arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/translator.py +0 -136
- arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/processing/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit-0.25.0/arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit-0.25.0/arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit-0.25.0/arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit-0.25.0/arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit-0.25.0/arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit-0.25.0/arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit-0.25.0/arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit-0.25.0/arekit/contrib/utils/serializer.py +0 -42
- arekit-0.25.0/arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit-0.25.0/arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit-0.25.0/arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- {arekit-0.25.0 → arekit-0.25.1}/LICENSE +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/bound.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/context/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/context/terms_mapper.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/context/token.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/const.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/doc_provider.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/columns/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/columns/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/columns/sample.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/const.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/contents.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/multiple.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/single.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/binary.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/multiple.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/rows/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/rows/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/rows/samples.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/sample/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/sample/cropped.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/text/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/providers/text/single.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/sample.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/input/terms_mapper.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/rows_fmt.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/data/rows_parser.py +0 -0
- {arekit-0.25.0/arekit/common/data/input/repositories → arekit-0.25.1/arekit/common/data/storages}/__init__.py +0 -0
- {arekit-0.25.0/arekit/common/data/storages → arekit-0.25.1/arekit/common/docs}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/entities_grouping.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/entity.py +0 -0
- {arekit-0.25.0/arekit/common/data/views → arekit-0.25.1/arekit/common/docs/parsed}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/base.py +0 -0
- {arekit-0.25.0/arekit/common/docs → arekit-0.25.1/arekit/common/docs/parsed/providers}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/base_pairs.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/entity_service.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/opinion_pairs.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/text_opinion_pairs.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/service.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/parsed/term_position.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/docs/sentence.py +0 -0
- {arekit-0.25.0/arekit/common/docs/parsed → arekit-0.25.1/arekit/common/entities}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/entities/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/entities/collection.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/entities/str_fmt.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/entities/types.py +0 -0
- {arekit-0.25.0/arekit/common/docs/parsed/providers → arekit-0.25.1/arekit/common/experiment}/__init__.py +0 -0
- {arekit-0.25.0/arekit/common/entities → arekit-0.25.1/arekit/common/experiment/api}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/experiment/api/base_samples_io.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/experiment/data_type.py +0 -0
- {arekit-0.25.0/arekit/common/experiment → arekit-0.25.1/arekit/common/frames}/__init__.py +0 -0
- {arekit-0.25.0/arekit/common/experiment/api → arekit-0.25.1/arekit/common/frames/connotations}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/frames/connotations/descriptor.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/frames/connotations/provider.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/frames/text_variant.py +0 -0
- {arekit-0.25.0/arekit/common/frames → arekit-0.25.1/arekit/common/frames/variants}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/frames/variants/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/frames/variants/collection.py +0 -0
- {arekit-0.25.0/arekit/common/frames/connotations → arekit-0.25.1/arekit/common/labels}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/base.py +0 -0
- {arekit-0.25.0/arekit/common/frames/variants → arekit-0.25.1/arekit/common/labels/provider}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/provider/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/provider/constant.py +0 -0
- {arekit-0.25.0/arekit/common/labels → arekit-0.25.1/arekit/common/labels/scaler}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/scaler/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/scaler/sentiment.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/scaler/single.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/labels/str_fmt.py +0 -0
- {arekit-0.25.0/arekit/common/labels/provider → arekit-0.25.1/arekit/common/linkage}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/linkage/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/linkage/meta.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/linkage/opinions.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/linkage/text_opinions.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/log_utils.py +0 -0
- {arekit-0.25.0/arekit/common/labels/scaler → arekit-0.25.1/arekit/common/model}/__init__.py +0 -0
- {arekit-0.25.0/arekit/common/linkage → arekit-0.25.1/arekit/common/model/labeling}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/model/labeling/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/model/labeling/modes.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/model/labeling/single.py +0 -0
- {arekit-0.25.0/arekit/common/model → arekit-0.25.1/arekit/common/opinions}/__init__.py +0 -0
- {arekit-0.25.0/arekit/common/model/labeling → arekit-0.25.1/arekit/common/opinions/annot}/__init__.py +0 -0
- {arekit-0.25.0/arekit/common/opinions → arekit-0.25.1/arekit/common/opinions/annot/algo}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo/pair_based.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo/predefined.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo_based.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/annot/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/collection.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/enums.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/provider.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/opinions/writer.py +0 -0
- {arekit-0.25.0/arekit/common/opinions/annot → arekit-0.25.1/arekit/common/pipeline}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/batching.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/context.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/conts.py +0 -0
- {arekit-0.25.0/arekit/common/opinions/annot/algo → arekit-0.25.1/arekit/common/pipeline/items}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/flatten.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/handle.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/iter.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/map.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/items/map_nested.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/pipeline/utils.py +0 -0
- {arekit-0.25.0/arekit/common/pipeline → arekit-0.25.1/arekit/common/synonyms}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/synonyms/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/synonyms/grouping.py +0 -0
- {arekit-0.25.0/arekit/common/pipeline/items → arekit-0.25.1/arekit/common/text}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/text/enums.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/text/parsed.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/text/partitioning.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/text/stemmer.py +0 -0
- {arekit-0.25.0/arekit/common/service → arekit-0.25.1/arekit/common/text_opinions}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/common/text_opinions/base.py +0 -0
- {arekit-0.25.0/arekit/common/synonyms → arekit-0.25.1/arekit/contrib}/__init__.py +0 -0
- {arekit-0.25.0/arekit/common/text → arekit-0.25.1/arekit/contrib/bert}/__init__.py +0 -0
- {arekit-0.25.0/arekit/common/text_opinions → arekit-0.25.1/arekit/contrib/bert/input}/__init__.py +0 -0
- {arekit-0.25.0/arekit/contrib → arekit-0.25.1/arekit/contrib/bert/input/providers}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/bert/input/providers/cropped_sample.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/bert/input/providers/text_pair.py +0 -0
- {arekit-0.25.0/arekit/contrib/bert → arekit-0.25.1/arekit/contrib/bert/terms}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/bert/terms/mapper.py +0 -0
- {arekit-0.25.0/arekit/contrib/bert/input → arekit-0.25.1/arekit/contrib/prompt}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/prompt/sample.py +0 -0
- {arekit-0.25.0/arekit/contrib/bert/input/providers → arekit-0.25.1/arekit/contrib/utils}/__init__.py +0 -0
- {arekit-0.25.0/arekit/contrib/bert/terms → arekit-0.25.1/arekit/contrib/utils/bert}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/bert/samplers.py +0 -0
- {arekit-0.25.0/arekit/contrib/networks → arekit-0.25.1/arekit/contrib/utils/data}/__init__.py +0 -0
- {arekit-0.25.0/arekit/contrib/networks/input → arekit-0.25.1/arekit/contrib/utils/data/contents}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/contents/opinions.py +0 -0
- {arekit-0.25.0/arekit/contrib/networks/input/embedding → arekit-0.25.1/arekit/contrib/utils/data/doc_provider}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/doc_provider/dict_based.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/doc_provider/dir_based.py +0 -0
- {arekit-0.25.0/arekit/contrib/networks/input/formatters → arekit-0.25.1/arekit/contrib/utils/data/storages}/__init__.py +0 -0
- {arekit-0.25.0/arekit/contrib/networks/input/providers → arekit-0.25.1/arekit/contrib/utils/data/writers}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/data/writers/base.py +0 -0
- {arekit-0.25.0/arekit/contrib/prompt → arekit-0.25.1/arekit/contrib/utils/entities}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/entities/filter.py +0 -0
- {arekit-0.25.0/arekit/contrib/utils → arekit-0.25.1/arekit/contrib/utils/entities/formatters}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/entities/formatters/str_display.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +0 -0
- {arekit-0.25.0/arekit/contrib/utils/bert → arekit-0.25.1/arekit/contrib/utils/io_utils}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/io_utils/utils.py +0 -0
- {arekit-0.25.0/arekit/contrib/utils/data → arekit-0.25.1/arekit/contrib/utils/pipelines}/__init__.py +0 -0
- {arekit-0.25.0/arekit/contrib/utils/data/contents → arekit-0.25.1/arekit/contrib/utils/pipelines/items}/__init__.py +0 -0
- {arekit-0.25.0/arekit/contrib/utils/data/doc_provider → arekit-0.25.1/arekit/contrib/utils/pipelines/items/text}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/items/text/entities_default.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/items/text/frames.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/opinion_collections.py +0 -0
- {arekit-0.25.0/arekit/contrib/utils/data/readers → arekit-0.25.1/arekit/contrib/utils/pipelines/text_opinion}/__init__.py +0 -0
- {arekit-0.25.0/arekit/contrib/utils/data/service → arekit-0.25.1/arekit/contrib/utils/pipelines/text_opinion/annot}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +0 -0
- {arekit-0.25.0/arekit/contrib/utils/data/storages → arekit-0.25.1/arekit/contrib/utils/pipelines/text_opinion/filters}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/base.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +0 -0
- {arekit-0.25.0/arekit/contrib/utils/data/writers → arekit-0.25.1/arekit/contrib/utils/processing}/__init__.py +0 -0
- {arekit-0.25.0/arekit/contrib/utils/embeddings → arekit-0.25.1/arekit/contrib/utils/synonyms}/__init__.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/synonyms/simple.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit/contrib/utils/synonyms/stemmer_based.py +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit.egg-info/dependency_links.txt +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/arekit.egg-info/top_level.txt +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/logo.png +0 -0
- {arekit-0.25.0 → arekit-0.25.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: arekit
|
|
3
|
-
Version: 0.25.0
|
|
3
|
+
Version: 0.25.1
|
|
4
4
|
Summary: Document level Attitude and Relation Extraction toolkit (AREkit) for sampling and prompting mass-media news into datasets for ML-model training
|
|
5
5
|
Home-page: https://github.com/nicolay-r/AREkit
|
|
6
6
|
Author: Nicolay Rusnachenko
|
|
@@ -18,9 +18,8 @@ License-File: LICENSE
|
|
|
18
18
|
Requires-Dist: tqdm
|
|
19
19
|
Requires-Dist: enum34==1.1.10
|
|
20
20
|
Requires-Dist: numpy>=1.14.5
|
|
21
|
-
Requires-Dist: pymystem3==0.2.0
|
|
22
21
|
|
|
23
|
-
# AREkit 0.25.0
|
|
22
|
+
# AREkit 0.25.1
|
|
24
23
|
|
|
25
24
|

|
|
26
25
|
|
|
@@ -34,7 +33,7 @@ is a python toolkit, devoted to document level Attitude and Relation Extraction
|
|
|
34
33
|
## Description
|
|
35
34
|
|
|
36
35
|
|
|
37
|
-
This toolkit aims at memory-effective data processing in Relation Extraction (RE) related tasks.
|
|
36
|
+
This toolkit aims at memory-effective data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
|
|
38
37
|
|
|
39
38
|
<p align="center">
|
|
40
39
|
<img src="docs/arekit-pipeline-concept.png"/>
|
|
@@ -60,7 +59,7 @@ for sentence level relations preparation (dubbed as contexts);
|
|
|
60
59
|
## Installation
|
|
61
60
|
|
|
62
61
|
```bash
|
|
63
|
-
pip install git+https://github.com/nicolay-r/AREkit.git@0.25.0
|
|
62
|
+
pip install git+https://github.com/nicolay-r/AREkit.git@0.25.1-rc
|
|
64
63
|
```
|
|
65
64
|
|
|
66
65
|
## Usage
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# AREkit 0.25.0
|
|
1
|
+
# AREkit 0.25.1
|
|
2
2
|
|
|
3
3
|

|
|
4
4
|
|
|
@@ -12,7 +12,7 @@ is a python toolkit, devoted to document level Attitude and Relation Extraction
|
|
|
12
12
|
## Description
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
This toolkit aims at memory-effective data processing in Relation Extraction (RE) related tasks.
|
|
15
|
+
This toolkit aims at memory-effective data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
|
|
16
16
|
|
|
17
17
|
<p align="center">
|
|
18
18
|
<img src="docs/arekit-pipeline-concept.png"/>
|
|
@@ -38,7 +38,7 @@ for sentence level relations preparation (dubbed as contexts);
|
|
|
38
38
|
## Installation
|
|
39
39
|
|
|
40
40
|
```bash
|
|
41
|
-
pip install git+https://github.com/nicolay-r/AREkit.git@0.25.0
|
|
41
|
+
pip install git+https://github.com/nicolay-r/AREkit.git@0.25.1-rc
|
|
42
42
|
```
|
|
43
43
|
|
|
44
44
|
## Usage
|
|
@@ -57,4 +57,4 @@ if you use or extend our work, please cite as follows:
|
|
|
57
57
|
year={2024},
|
|
58
58
|
organization={Springer}
|
|
59
59
|
}
|
|
60
|
-
```
|
|
60
|
+
```
|
|
@@ -10,6 +10,9 @@ logger = logging.getLogger(__name__)
|
|
|
10
10
|
|
|
11
11
|
class BaseRowsStorage(object):
|
|
12
12
|
|
|
13
|
+
def __init__(self, log_out=None):
|
|
14
|
+
self.__log_out = log_out
|
|
15
|
+
|
|
13
16
|
# region protected methods
|
|
14
17
|
|
|
15
18
|
def _begin_filling_row(self, row_ind):
|
|
@@ -31,27 +34,12 @@ class BaseRowsStorage(object):
|
|
|
31
34
|
def _get_rows_count(self):
|
|
32
35
|
raise NotImplemented()
|
|
33
36
|
|
|
34
|
-
def find_by_value(self, column_name, value):
|
|
35
|
-
raise NotImplemented()
|
|
36
|
-
|
|
37
|
-
def find_first_by_value(self, column_name, value):
|
|
38
|
-
raise NotImplemented()
|
|
39
|
-
|
|
40
|
-
def iter_column_values(self, column_name, dtype=None):
|
|
41
|
-
raise NotImplemented()
|
|
42
|
-
|
|
43
37
|
def get_row(self, row_index):
|
|
44
38
|
raise NotImplemented()
|
|
45
39
|
|
|
46
|
-
def get_cell(self, row_index, column_name):
|
|
47
|
-
raise NotImplemented()
|
|
48
|
-
|
|
49
40
|
def init_empty(self, columns_provider):
|
|
50
41
|
raise NotImplemented()
|
|
51
42
|
|
|
52
|
-
def iter_shuffled(self):
|
|
53
|
-
raise NotImplemented()
|
|
54
|
-
|
|
55
43
|
def iter_column_names(self):
|
|
56
44
|
raise NotImplemented()
|
|
57
45
|
|
|
@@ -81,6 +69,7 @@ class BaseRowsStorage(object):
|
|
|
81
69
|
condition_func=lambda item: not isinstance(item[1], MetaEmptyLinkedDataWrapper),
|
|
82
70
|
postfix_func=postfix_func,
|
|
83
71
|
desc="{fmt}".format(fmt=desc),
|
|
72
|
+
file=self.__log_out,
|
|
84
73
|
total=rows_count)
|
|
85
74
|
|
|
86
75
|
for row_index, item in enumerate(pbar_it):
|
|
@@ -1,42 +1,14 @@
|
|
|
1
|
-
from tqdm import tqdm
|
|
2
1
|
from arekit.common.docs.base import Document
|
|
3
2
|
from arekit.common.docs.parsed.base import ParsedDocument
|
|
4
|
-
from arekit.common.pipeline.base import BasePipelineLauncher
|
|
5
3
|
from arekit.common.pipeline.batching import BatchingPipelineLauncher
|
|
6
4
|
from arekit.common.pipeline.context import PipelineContext
|
|
7
5
|
from arekit.common.pipeline.utils import BatchIterator
|
|
8
6
|
from arekit.common.text.parsed import BaseParsedText
|
|
7
|
+
from arekit.common.utils import progress_bar_defined
|
|
9
8
|
|
|
10
9
|
|
|
11
10
|
class DocumentParsers(object):
|
|
12
11
|
|
|
13
|
-
@staticmethod
|
|
14
|
-
def parse(doc, pipeline_items, parent_ppl_ctx=None, src_key="input", show_progress=False):
|
|
15
|
-
""" This document parser is based on single text parts (sentences)
|
|
16
|
-
that passes sequentially through the pipeline of transformations.
|
|
17
|
-
"""
|
|
18
|
-
assert(isinstance(doc, Document))
|
|
19
|
-
assert(isinstance(pipeline_items, list))
|
|
20
|
-
assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None)
|
|
21
|
-
|
|
22
|
-
parsed_sentences = []
|
|
23
|
-
|
|
24
|
-
data_it = range(doc.SentencesCount)
|
|
25
|
-
progress_it = tqdm(data_it, disable=not show_progress)
|
|
26
|
-
|
|
27
|
-
for sent_ind in progress_it:
|
|
28
|
-
|
|
29
|
-
# Composing the context from a single sentence.
|
|
30
|
-
ctx = PipelineContext({src_key: doc.get_sentence(sent_ind)}, parent_ctx=parent_ppl_ctx)
|
|
31
|
-
|
|
32
|
-
# Apply all the operations.
|
|
33
|
-
BasePipelineLauncher.run(pipeline=pipeline_items, pipeline_ctx=ctx, src_key=src_key)
|
|
34
|
-
|
|
35
|
-
# Collecting the result.
|
|
36
|
-
parsed_sentences.append(BaseParsedText(terms=ctx.provide("result")))
|
|
37
|
-
|
|
38
|
-
return ParsedDocument(doc_id=doc.ID, parsed_sentences=parsed_sentences)
|
|
39
|
-
|
|
40
12
|
@staticmethod
|
|
41
13
|
def parse_batch(doc, pipeline_items, batch_size, parent_ppl_ctx=None, src_key="input", show_progress=False):
|
|
42
14
|
""" This document parser is based on batch of sentences.
|
|
@@ -49,7 +21,8 @@ class DocumentParsers(object):
|
|
|
49
21
|
parsed_sentences = []
|
|
50
22
|
|
|
51
23
|
data_it = BatchIterator(data_iter=iter(range(doc.SentencesCount)), batch_size=batch_size)
|
|
52
|
-
progress_it =
|
|
24
|
+
progress_it = progress_bar_defined(data_it, total=round(doc.SentencesCount / batch_size),
|
|
25
|
+
disable=not show_progress)
|
|
53
26
|
|
|
54
27
|
for batch in progress_it:
|
|
55
28
|
|
|
@@ -2,7 +2,7 @@ from arekit.common.pipeline.context import PipelineContext
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
class BasePipelineItem(object):
|
|
5
|
-
""" Single pipeline item that might be
|
|
5
|
+
""" Single pipeline item that might be instantiated and embedded into pipeline.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
def __init__(self, src_key="result", result_key="result", src_func=None):
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import sys
|
|
2
1
|
import os
|
|
3
2
|
from tqdm import tqdm
|
|
4
3
|
|
|
@@ -27,14 +26,14 @@ def split_by_whitespaces(text):
|
|
|
27
26
|
return text.split()
|
|
28
27
|
|
|
29
28
|
|
|
30
|
-
def progress_bar(iterable, total, desc="", unit="it"):
|
|
29
|
+
def progress_bar(iterable, total, desc="", unit="it", file=None, disable=False):
|
|
31
30
|
if total is not None:
|
|
32
|
-
return progress_bar_defined(iterable=iterable, total=total, desc=desc, unit=unit)
|
|
31
|
+
return progress_bar_defined(iterable=iterable, total=total, desc=desc, unit=unit, file=file, disable=disable)
|
|
33
32
|
else:
|
|
34
|
-
return progress_bar_iter(iterable=iterable, desc=desc, unit=unit)
|
|
33
|
+
return progress_bar_iter(iterable=iterable, desc=desc, unit=unit, file=file, disable=disable)
|
|
35
34
|
|
|
36
35
|
|
|
37
|
-
def progress_bar_conditional(iterable, condition_func, total, postfix_func=None, desc="", unit="it"):
|
|
36
|
+
def progress_bar_conditional(iterable, condition_func, total, postfix_func=None, desc="", unit="it", file=None):
|
|
38
37
|
""" This progress-bar updates only on the
|
|
39
38
|
specific conditions during the iteration process.
|
|
40
39
|
"""
|
|
@@ -47,7 +46,7 @@ def progress_bar_conditional(iterable, condition_func, total, postfix_func=None,
|
|
|
47
46
|
yield 0
|
|
48
47
|
|
|
49
48
|
pbar_it = progress_bar(iterable=__iter_infinite_placeholder(),
|
|
50
|
-
desc=desc, unit=unit, total=total)
|
|
49
|
+
desc=desc, unit=unit, total=total, file=file)
|
|
51
50
|
element = iter(pbar_it)
|
|
52
51
|
|
|
53
52
|
# Initialize with 0.
|
|
@@ -65,7 +64,7 @@ def progress_bar_conditional(iterable, condition_func, total, postfix_func=None,
|
|
|
65
64
|
pbar_it.set_postfix(postfix_func(item))
|
|
66
65
|
|
|
67
66
|
|
|
68
|
-
def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it"):
|
|
67
|
+
def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it", file=None, disable=False):
|
|
69
68
|
return tqdm(iterable=iterable,
|
|
70
69
|
total=total,
|
|
71
70
|
desc=desc,
|
|
@@ -73,13 +72,17 @@ def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it"):
|
|
|
73
72
|
position=0,
|
|
74
73
|
leave=True,
|
|
75
74
|
unit=unit,
|
|
75
|
+
file=file,
|
|
76
|
+
disable=disable,
|
|
76
77
|
miniters=total / miniters if total is not None else total)
|
|
77
78
|
|
|
78
79
|
|
|
79
|
-
def progress_bar_iter(iterable, desc="", unit='it'):
|
|
80
|
+
def progress_bar_iter(iterable, desc="", unit='it', file=None, disable=False):
|
|
80
81
|
return tqdm(iterable=iterable,
|
|
81
82
|
desc=desc,
|
|
82
83
|
position=0,
|
|
83
84
|
leave=True,
|
|
84
85
|
ncols=120,
|
|
86
|
+
file=file,
|
|
87
|
+
disable=disable,
|
|
85
88
|
unit=unit)
|
|
@@ -5,8 +5,9 @@ from arekit.common.data.storages.base import BaseRowsStorage
|
|
|
5
5
|
|
|
6
6
|
class JsonlBasedRowsStorage(BaseRowsStorage):
|
|
7
7
|
|
|
8
|
-
def __init__(self, rows):
|
|
8
|
+
def __init__(self, rows, **kwargs):
|
|
9
9
|
assert(isinstance(rows, list))
|
|
10
|
+
super(JsonlBasedRowsStorage, self).__init__(**kwargs)
|
|
10
11
|
self.__rows = rows
|
|
11
12
|
|
|
12
13
|
def _iter_rows(self):
|
|
@@ -12,7 +12,8 @@ class PandasBasedRowsStorage(BaseRowsStorage):
|
|
|
12
12
|
based on the pandas DataFrames.
|
|
13
13
|
"""
|
|
14
14
|
|
|
15
|
-
def __init__(self, df=None):
|
|
15
|
+
def __init__(self, df=None, **kwargs):
|
|
16
|
+
super(PandasBasedRowsStorage, self).__init__(**kwargs)
|
|
16
17
|
self._df = df
|
|
17
18
|
|
|
18
19
|
@property
|
|
@@ -96,26 +97,10 @@ class PandasBasedRowsStorage(BaseRowsStorage):
|
|
|
96
97
|
def get_row(self, row_index):
|
|
97
98
|
return self._df.iloc[row_index]
|
|
98
99
|
|
|
99
|
-
def get_cell(self, row_index, column_name):
|
|
100
|
-
return self._df.iloc[row_index][column_name]
|
|
101
|
-
|
|
102
|
-
def iter_column_values(self, column_name, dtype=None):
|
|
103
|
-
values = self._df[column_name]
|
|
104
|
-
if dtype is None:
|
|
105
|
-
return values
|
|
106
|
-
return values.astype(dtype)
|
|
107
|
-
|
|
108
|
-
def find_by_value(self, column_name, value):
|
|
109
|
-
return self.__filter(column_name=column_name, value=value)
|
|
110
|
-
|
|
111
100
|
def init_empty(self, columns_provider):
|
|
112
101
|
cols_with_types = columns_provider.get_columns_list_with_types()
|
|
113
102
|
self._df = self.__create_empty(cols_with_types)
|
|
114
103
|
|
|
115
|
-
def iter_shuffled(self):
|
|
116
|
-
shuffled_df = self._df.sample(frac=1)
|
|
117
|
-
return self.__iter_rows_core(shuffled_df)
|
|
118
|
-
|
|
119
104
|
def free(self):
|
|
120
105
|
del self._df
|
|
121
106
|
super(PandasBasedRowsStorage, self).free()
|
|
@@ -6,13 +6,14 @@ class RowCacheStorage(BaseRowsStorage):
|
|
|
6
6
|
""" Row Caching storage kernel, based on python dictionary.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
def __init__(self, force_collect_columns=None):
|
|
9
|
+
def __init__(self, force_collect_columns=None, **kwargs):
|
|
10
10
|
""" This is a particular/related solution for the following issue:
|
|
11
11
|
https://github.com/nicolay-r/AREkit/issues/464
|
|
12
12
|
force_collect_columns: list
|
|
13
13
|
columns that supposed to be additionally considered in output.
|
|
14
14
|
"""
|
|
15
15
|
assert(isinstance(force_collect_columns, list) or force_collect_columns is None)
|
|
16
|
+
super(RowCacheStorage, self).__init__(**kwargs)
|
|
16
17
|
self.__f = None
|
|
17
18
|
self.__row_cache = {}
|
|
18
19
|
self.__column_names = []
|
|
@@ -4,7 +4,8 @@ from arekit.common.data.storages.base import BaseRowsStorage
|
|
|
4
4
|
|
|
5
5
|
class SQliteBasedRowsStorage(BaseRowsStorage):
|
|
6
6
|
|
|
7
|
-
def __init__(self, path, table_name):
|
|
7
|
+
def __init__(self, path, table_name, **kwargs):
|
|
8
|
+
super(SQliteBasedRowsStorage, self).__init__(**kwargs)
|
|
8
9
|
self.__path = path
|
|
9
10
|
self.__table_name = table_name
|
|
10
11
|
self.__conn = None
|
|
@@ -15,7 +15,7 @@ from arekit.contrib.utils.pipelines.text_opinion.filters.limitation import Frame
|
|
|
15
15
|
def __iter_text_opinion_linkages(parsed_doc, annotators, entity_index_func,
|
|
16
16
|
text_opinion_filters, use_meta):
|
|
17
17
|
""" use_meta: bool
|
|
18
|
-
this is mainly for
|
|
18
|
+
this is mainly for the progress-bar and other console parameters to stay up-to-date
|
|
19
19
|
with the state in the case we do not have that much output results
|
|
20
20
|
across multiple amount of documents.
|
|
21
21
|
"""
|
|
@@ -62,12 +62,13 @@ def __iter_text_opinion_linkages(parsed_doc, annotators, entity_index_func,
|
|
|
62
62
|
yield MetaEmptyLinkedDataWrapper(doc_id=parsed_doc.RelatedDocID)
|
|
63
63
|
|
|
64
64
|
|
|
65
|
-
def text_opinion_extraction_pipeline(pipeline_items, get_doc_by_id_func, annotators, entity_index_func,
|
|
65
|
+
def text_opinion_extraction_pipeline(pipeline_items, get_doc_by_id_func, annotators, entity_index_func, batch_size,
|
|
66
66
|
text_opinion_filters=None, use_meta_between_docs=True):
|
|
67
67
|
assert(callable(get_doc_by_id_func))
|
|
68
68
|
assert(isinstance(annotators, list))
|
|
69
69
|
assert(isinstance(text_opinion_filters, list) or text_opinion_filters is None)
|
|
70
70
|
assert(isinstance(use_meta_between_docs, bool))
|
|
71
|
+
assert(isinstance(batch_size, int) and batch_size > 0)
|
|
71
72
|
|
|
72
73
|
extra_filters = [] if text_opinion_filters is None else text_opinion_filters
|
|
73
74
|
actual_text_opinion_filters = [FrameworkLimitationsTextOpinionFilter()] + extra_filters
|
|
@@ -77,8 +78,8 @@ def text_opinion_extraction_pipeline(pipeline_items, get_doc_by_id_func, annotat
|
|
|
77
78
|
MapPipelineItem(map_func=lambda doc_id: get_doc_by_id_func(doc_id)),
|
|
78
79
|
|
|
79
80
|
# (doc, ppl_ctx) -> (parsed_doc)
|
|
80
|
-
MapNestedPipelineItem(map_func=lambda doc, ppl_ctx: DocumentParsers.parse(
|
|
81
|
-
doc=doc, pipeline_items=pipeline_items, parent_ppl_ctx=ppl_ctx)),
|
|
81
|
+
MapNestedPipelineItem(map_func=lambda doc, ppl_ctx: DocumentParsers.parse_batch(
|
|
82
|
+
doc=doc, pipeline_items=pipeline_items, parent_ppl_ctx=ppl_ctx, batch_size=batch_size)),
|
|
82
83
|
|
|
83
84
|
# (parsed_doc) -> (text_opinions)
|
|
84
85
|
MapPipelineItem(map_func=lambda parsed_doc: __iter_text_opinion_linkages(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: arekit
|
|
3
|
-
Version: 0.25.0
|
|
3
|
+
Version: 0.25.1
|
|
4
4
|
Summary: Document level Attitude and Relation Extraction toolkit (AREkit) for sampling and prompting mass-media news into datasets for ML-model training
|
|
5
5
|
Home-page: https://github.com/nicolay-r/AREkit
|
|
6
6
|
Author: Nicolay Rusnachenko
|
|
@@ -18,9 +18,8 @@ License-File: LICENSE
|
|
|
18
18
|
Requires-Dist: tqdm
|
|
19
19
|
Requires-Dist: enum34==1.1.10
|
|
20
20
|
Requires-Dist: numpy>=1.14.5
|
|
21
|
-
Requires-Dist: pymystem3==0.2.0
|
|
22
21
|
|
|
23
|
-
# AREkit 0.25.0
|
|
22
|
+
# AREkit 0.25.1
|
|
24
23
|
|
|
25
24
|

|
|
26
25
|
|
|
@@ -34,7 +33,7 @@ is a python toolkit, devoted to document level Attitude and Relation Extraction
|
|
|
34
33
|
## Description
|
|
35
34
|
|
|
36
35
|
|
|
37
|
-
This toolkit aims at memory-effective data processing in Relation Extraction (RE) related tasks.
|
|
36
|
+
This toolkit aims at memory-effective data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
|
|
38
37
|
|
|
39
38
|
<p align="center">
|
|
40
39
|
<img src="docs/arekit-pipeline-concept.png"/>
|
|
@@ -60,7 +59,7 @@ for sentence level relations preparation (dubbed as contexts);
|
|
|
60
59
|
## Installation
|
|
61
60
|
|
|
62
61
|
```bash
|
|
63
|
-
pip install git+https://github.com/nicolay-r/AREkit.git@0.25.0-rc
|
|
62
|
+
pip install git+https://github.com/nicolay-r/AREkit.git@0.25.1-rc
|
|
64
63
|
```
|
|
65
64
|
|
|
66
65
|
## Usage
|
|
@@ -44,13 +44,8 @@ arekit/common/data/input/providers/sample/__init__.py
|
|
|
44
44
|
arekit/common/data/input/providers/sample/cropped.py
|
|
45
45
|
arekit/common/data/input/providers/text/__init__.py
|
|
46
46
|
arekit/common/data/input/providers/text/single.py
|
|
47
|
-
arekit/common/data/input/repositories/__init__.py
|
|
48
|
-
arekit/common/data/input/repositories/base.py
|
|
49
|
-
arekit/common/data/input/repositories/sample.py
|
|
50
47
|
arekit/common/data/storages/__init__.py
|
|
51
48
|
arekit/common/data/storages/base.py
|
|
52
|
-
arekit/common/data/views/__init__.py
|
|
53
|
-
arekit/common/data/views/samples.py
|
|
54
49
|
arekit/common/docs/__init__.py
|
|
55
50
|
arekit/common/docs/base.py
|
|
56
51
|
arekit/common/docs/entities_grouping.py
|
|
@@ -130,8 +125,6 @@ arekit/common/pipeline/items/handle.py
|
|
|
130
125
|
arekit/common/pipeline/items/iter.py
|
|
131
126
|
arekit/common/pipeline/items/map.py
|
|
132
127
|
arekit/common/pipeline/items/map_nested.py
|
|
133
|
-
arekit/common/service/__init__.py
|
|
134
|
-
arekit/common/service/sqlite.py
|
|
135
128
|
arekit/common/synonyms/__init__.py
|
|
136
129
|
arekit/common/synonyms/base.py
|
|
137
130
|
arekit/common/synonyms/grouping.py
|
|
@@ -150,29 +143,9 @@ arekit/contrib/bert/input/providers/cropped_sample.py
|
|
|
150
143
|
arekit/contrib/bert/input/providers/text_pair.py
|
|
151
144
|
arekit/contrib/bert/terms/__init__.py
|
|
152
145
|
arekit/contrib/bert/terms/mapper.py
|
|
153
|
-
arekit/contrib/networks/__init__.py
|
|
154
|
-
arekit/contrib/networks/embedding.py
|
|
155
|
-
arekit/contrib/networks/embedding_io.py
|
|
156
|
-
arekit/contrib/networks/vectorizer.py
|
|
157
|
-
arekit/contrib/networks/input/__init__.py
|
|
158
|
-
arekit/contrib/networks/input/const.py
|
|
159
|
-
arekit/contrib/networks/input/ctx_serialization.py
|
|
160
|
-
arekit/contrib/networks/input/rows_parser.py
|
|
161
|
-
arekit/contrib/networks/input/term_types.py
|
|
162
|
-
arekit/contrib/networks/input/terms_mapping.py
|
|
163
|
-
arekit/contrib/networks/input/embedding/__init__.py
|
|
164
|
-
arekit/contrib/networks/input/embedding/matrix.py
|
|
165
|
-
arekit/contrib/networks/input/embedding/offsets.py
|
|
166
|
-
arekit/contrib/networks/input/formatters/__init__.py
|
|
167
|
-
arekit/contrib/networks/input/formatters/pos_mapper.py
|
|
168
|
-
arekit/contrib/networks/input/providers/__init__.py
|
|
169
|
-
arekit/contrib/networks/input/providers/sample.py
|
|
170
|
-
arekit/contrib/networks/input/providers/term_connotation.py
|
|
171
|
-
arekit/contrib/networks/input/providers/text.py
|
|
172
146
|
arekit/contrib/prompt/__init__.py
|
|
173
147
|
arekit/contrib/prompt/sample.py
|
|
174
148
|
arekit/contrib/utils/__init__.py
|
|
175
|
-
arekit/contrib/utils/serializer.py
|
|
176
149
|
arekit/contrib/utils/bert/__init__.py
|
|
177
150
|
arekit/contrib/utils/bert/samplers.py
|
|
178
151
|
arekit/contrib/utils/data/__init__.py
|
|
@@ -181,13 +154,6 @@ arekit/contrib/utils/data/contents/opinions.py
|
|
|
181
154
|
arekit/contrib/utils/data/doc_provider/__init__.py
|
|
182
155
|
arekit/contrib/utils/data/doc_provider/dict_based.py
|
|
183
156
|
arekit/contrib/utils/data/doc_provider/dir_based.py
|
|
184
|
-
arekit/contrib/utils/data/readers/__init__.py
|
|
185
|
-
arekit/contrib/utils/data/readers/base.py
|
|
186
|
-
arekit/contrib/utils/data/readers/csv_pd.py
|
|
187
|
-
arekit/contrib/utils/data/readers/jsonl.py
|
|
188
|
-
arekit/contrib/utils/data/readers/sqlite.py
|
|
189
|
-
arekit/contrib/utils/data/service/__init__.py
|
|
190
|
-
arekit/contrib/utils/data/service/balance.py
|
|
191
157
|
arekit/contrib/utils/data/storages/__init__.py
|
|
192
158
|
arekit/contrib/utils/data/storages/jsonl_based.py
|
|
193
159
|
arekit/contrib/utils/data/storages/pandas_based.py
|
|
@@ -195,38 +161,19 @@ arekit/contrib/utils/data/storages/row_cache.py
|
|
|
195
161
|
arekit/contrib/utils/data/storages/sqlite_based.py
|
|
196
162
|
arekit/contrib/utils/data/writers/__init__.py
|
|
197
163
|
arekit/contrib/utils/data/writers/base.py
|
|
198
|
-
arekit/contrib/utils/data/writers/csv_native.py
|
|
199
|
-
arekit/contrib/utils/data/writers/csv_pd.py
|
|
200
|
-
arekit/contrib/utils/data/writers/json_opennre.py
|
|
201
|
-
arekit/contrib/utils/data/writers/sqlite_native.py
|
|
202
|
-
arekit/contrib/utils/embeddings/__init__.py
|
|
203
|
-
arekit/contrib/utils/embeddings/rusvectores.py
|
|
204
|
-
arekit/contrib/utils/embeddings/tokens.py
|
|
205
164
|
arekit/contrib/utils/entities/__init__.py
|
|
206
165
|
arekit/contrib/utils/entities/filter.py
|
|
207
166
|
arekit/contrib/utils/entities/formatters/__init__.py
|
|
208
167
|
arekit/contrib/utils/entities/formatters/str_display.py
|
|
209
168
|
arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py
|
|
210
169
|
arekit/contrib/utils/io_utils/__init__.py
|
|
211
|
-
arekit/contrib/utils/io_utils/embedding.py
|
|
212
170
|
arekit/contrib/utils/io_utils/utils.py
|
|
213
|
-
arekit/contrib/utils/np_utils/__init__.py
|
|
214
|
-
arekit/contrib/utils/np_utils/embedding.py
|
|
215
|
-
arekit/contrib/utils/np_utils/npz_utils.py
|
|
216
|
-
arekit/contrib/utils/np_utils/vocab.py
|
|
217
171
|
arekit/contrib/utils/pipelines/__init__.py
|
|
218
172
|
arekit/contrib/utils/pipelines/opinion_collections.py
|
|
219
173
|
arekit/contrib/utils/pipelines/items/__init__.py
|
|
220
|
-
arekit/contrib/utils/pipelines/items/sampling/__init__.py
|
|
221
|
-
arekit/contrib/utils/pipelines/items/sampling/base.py
|
|
222
|
-
arekit/contrib/utils/pipelines/items/sampling/networks.py
|
|
223
174
|
arekit/contrib/utils/pipelines/items/text/__init__.py
|
|
224
175
|
arekit/contrib/utils/pipelines/items/text/entities_default.py
|
|
225
176
|
arekit/contrib/utils/pipelines/items/text/frames.py
|
|
226
|
-
arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py
|
|
227
|
-
arekit/contrib/utils/pipelines/items/text/frames_negation.py
|
|
228
|
-
arekit/contrib/utils/pipelines/items/text/tokenizer.py
|
|
229
|
-
arekit/contrib/utils/pipelines/items/text/translator.py
|
|
230
177
|
arekit/contrib/utils/pipelines/text_opinion/__init__.py
|
|
231
178
|
arekit/contrib/utils/pipelines/text_opinion/extraction.py
|
|
232
179
|
arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py
|
|
@@ -237,26 +184,6 @@ arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py
|
|
|
237
184
|
arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py
|
|
238
185
|
arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py
|
|
239
186
|
arekit/contrib/utils/processing/__init__.py
|
|
240
|
-
arekit/contrib/utils/processing/languages/__init__.py
|
|
241
|
-
arekit/contrib/utils/processing/languages/mods.py
|
|
242
|
-
arekit/contrib/utils/processing/languages/pos.py
|
|
243
|
-
arekit/contrib/utils/processing/languages/ru/__init__.py
|
|
244
|
-
arekit/contrib/utils/processing/languages/ru/cases.py
|
|
245
|
-
arekit/contrib/utils/processing/languages/ru/constants.py
|
|
246
|
-
arekit/contrib/utils/processing/languages/ru/mods.py
|
|
247
|
-
arekit/contrib/utils/processing/languages/ru/number.py
|
|
248
|
-
arekit/contrib/utils/processing/languages/ru/pos_service.py
|
|
249
|
-
arekit/contrib/utils/processing/lemmatization/__init__.py
|
|
250
|
-
arekit/contrib/utils/processing/lemmatization/mystem.py
|
|
251
|
-
arekit/contrib/utils/processing/pos/__init__.py
|
|
252
|
-
arekit/contrib/utils/processing/pos/base.py
|
|
253
|
-
arekit/contrib/utils/processing/pos/mystem_wrap.py
|
|
254
|
-
arekit/contrib/utils/processing/pos/russian.py
|
|
255
|
-
arekit/contrib/utils/processing/text/__init__.py
|
|
256
|
-
arekit/contrib/utils/processing/text/tokens.py
|
|
257
187
|
arekit/contrib/utils/synonyms/__init__.py
|
|
258
188
|
arekit/contrib/utils/synonyms/simple.py
|
|
259
|
-
arekit/contrib/utils/synonyms/stemmer_based.py
|
|
260
|
-
arekit/contrib/utils/vectorizers/__init__.py
|
|
261
|
-
arekit/contrib/utils/vectorizers/bpe.py
|
|
262
|
-
arekit/contrib/utils/vectorizers/random_norm.py
|
|
189
|
+
arekit/contrib/utils/synonyms/stemmer_based.py
|
|
@@ -15,7 +15,7 @@ def get_requirements(filenames):
|
|
|
15
15
|
|
|
16
16
|
setup(
|
|
17
17
|
name='arekit',
|
|
18
|
-
version='0.25.0',
|
|
18
|
+
version='0.25.1',
|
|
19
19
|
python_requires=">=3.6",
|
|
20
20
|
description='Document level Attitude and Relation Extraction toolkit (AREkit)'
|
|
21
21
|
' for sampling and prompting mass-media news into datasets for ML-model training',
|
|
@@ -35,7 +35,6 @@ setup(
|
|
|
35
35
|
keywords='natural language processing, relation extraction, sentiment analysis',
|
|
36
36
|
packages=find_packages(),
|
|
37
37
|
install_requires=get_requirements([
|
|
38
|
-
'dependencies.txt',
|
|
39
|
-
'arekit/contrib/utils/dependencies.txt']),
|
|
38
|
+
'dependencies.txt']),
|
|
40
39
|
data_files=["logo.png"],
|
|
41
40
|
)
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
|
|
2
|
-
from arekit.common.data.input.providers.contents import ContentsProvider
|
|
3
|
-
from arekit.common.data.input.providers.rows.base import BaseRowProvider
|
|
4
|
-
from arekit.common.data.storages.base import BaseRowsStorage
|
|
5
|
-
from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
|
|
6
|
-
from arekit.contrib.utils.data.writers.base import BaseWriter
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class BaseInputRepository(object):
|
|
10
|
-
|
|
11
|
-
def __init__(self, columns_provider, rows_provider, storage):
|
|
12
|
-
assert(isinstance(columns_provider, BaseColumnsProvider))
|
|
13
|
-
assert(isinstance(rows_provider, BaseRowProvider))
|
|
14
|
-
assert(isinstance(storage, BaseRowsStorage))
|
|
15
|
-
|
|
16
|
-
self._columns_provider = columns_provider
|
|
17
|
-
self._rows_provider = rows_provider
|
|
18
|
-
self._storage = storage
|
|
19
|
-
|
|
20
|
-
# Do setup operations.
|
|
21
|
-
self._setup_columns_provider()
|
|
22
|
-
self._setup_rows_provider()
|
|
23
|
-
|
|
24
|
-
# region protected methods
|
|
25
|
-
|
|
26
|
-
def _setup_columns_provider(self):
|
|
27
|
-
pass
|
|
28
|
-
|
|
29
|
-
def _setup_rows_provider(self):
|
|
30
|
-
pass
|
|
31
|
-
|
|
32
|
-
# endregion
|
|
33
|
-
|
|
34
|
-
def populate(self, contents_provider, doc_ids, desc="", writer=None, target=None):
|
|
35
|
-
assert(isinstance(contents_provider, ContentsProvider))
|
|
36
|
-
assert(isinstance(self._storage, BaseRowsStorage))
|
|
37
|
-
assert(isinstance(doc_ids, list))
|
|
38
|
-
assert(isinstance(writer, BaseWriter) or writer is None)
|
|
39
|
-
assert(isinstance(target, str) or target is None)
|
|
40
|
-
|
|
41
|
-
def iter_rows(idle_mode):
|
|
42
|
-
return self._rows_provider.iter_by_rows(
|
|
43
|
-
contents_provider=contents_provider,
|
|
44
|
-
doc_ids_iter=doc_ids,
|
|
45
|
-
idle_mode=idle_mode)
|
|
46
|
-
|
|
47
|
-
self._storage.init_empty(columns_provider=self._columns_provider)
|
|
48
|
-
|
|
49
|
-
is_async_write_mode_on = writer is not None and target is not None
|
|
50
|
-
|
|
51
|
-
if is_async_write_mode_on:
|
|
52
|
-
writer.open_target(target)
|
|
53
|
-
|
|
54
|
-
self._storage.fill(lambda idle_mode: iter_rows(idle_mode),
|
|
55
|
-
columns_provider=self._columns_provider,
|
|
56
|
-
row_handler=lambda: writer.commit_line(self._storage) if is_async_write_mode_on else None,
|
|
57
|
-
desc=desc)
|
|
58
|
-
|
|
59
|
-
if is_async_write_mode_on:
|
|
60
|
-
writer.close_target()
|
|
61
|
-
|
|
62
|
-
def push(self, writer, target, free_storage=True):
|
|
63
|
-
if not isinstance(self._storage, RowCacheStorage):
|
|
64
|
-
writer.write_all(self._storage, target)
|
|
65
|
-
|
|
66
|
-
# After writing we free the contents of the storage.
|
|
67
|
-
if free_storage:
|
|
68
|
-
self._storage.free()
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
|
|
3
|
-
from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
|
|
4
|
-
from arekit.common.data.input.repositories.base import BaseInputRepository
|
|
5
|
-
|
|
6
|
-
logger = logging.getLogger(__name__)
|
|
7
|
-
logging.basicConfig(level=logging.INFO)
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class BaseInputSamplesRepository(BaseInputRepository):
|
|
11
|
-
|
|
12
|
-
def _setup_rows_provider(self):
|
|
13
|
-
""" Setup store labels.
|
|
14
|
-
"""
|
|
15
|
-
assert(isinstance(self._rows_provider, BaseSampleRowProvider))
|
|
16
|
-
self._rows_provider.set_store_labels(self._columns_provider.StoreLabels)
|
|
17
|
-
|
|
18
|
-
def _setup_columns_provider(self):
|
|
19
|
-
""" Setup text column names.
|
|
20
|
-
"""
|
|
21
|
-
text_column_names = list(self._rows_provider.TextProvider.iter_columns())
|
|
22
|
-
self._columns_provider.set_text_column_names(text_column_names)
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
from arekit.common.data import const
|
|
2
|
-
from arekit.common.data.storages.base import BaseRowsStorage
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
# TODO. This is a particular type of view, and expected to be off the core.
|
|
6
|
-
class LinkedSamplesStorageView(object):
|
|
7
|
-
|
|
8
|
-
def iter_from_storage(self, storage):
|
|
9
|
-
assert(isinstance(storage, BaseRowsStorage))
|
|
10
|
-
undefined = -1
|
|
11
|
-
|
|
12
|
-
linked = []
|
|
13
|
-
current_opinion_id = undefined
|
|
14
|
-
for row_index, opinion_id in enumerate(storage.iter_column_values(const.OPINION_ID)):
|
|
15
|
-
if current_opinion_id != undefined:
|
|
16
|
-
if opinion_id != current_opinion_id:
|
|
17
|
-
yield linked
|
|
18
|
-
linked = []
|
|
19
|
-
current_opinion_id = opinion_id
|
|
20
|
-
else:
|
|
21
|
-
current_opinion_id = opinion_id
|
|
22
|
-
|
|
23
|
-
linked.append(storage.get_row(row_index))
|
|
24
|
-
|
|
25
|
-
if len(linked) > 0:
|
|
26
|
-
yield linked
|