arekit 0.24.0__tar.gz → 0.25.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit-0.25.1/PKG-INFO +81 -0
- arekit-0.25.1/README.md +60 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/storages/base.py +4 -15
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/entities_grouping.py +2 -1
- arekit-0.25.1/arekit/common/docs/parser.py +39 -0
- arekit-0.25.1/arekit/common/pipeline/base.py +21 -0
- arekit-0.25.1/arekit/common/pipeline/batching.py +28 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/pipeline/context.py +5 -1
- arekit-0.25.1/arekit/common/pipeline/items/base.py +49 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/pipeline/items/flatten.py +5 -1
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/pipeline/items/handle.py +2 -1
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/pipeline/items/iter.py +2 -1
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/pipeline/items/map.py +2 -1
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/pipeline/items/map_nested.py +4 -0
- arekit-0.25.1/arekit/common/pipeline/utils.py +32 -0
- arekit-0.24.0/arekit/common/text/partitioning/str.py → arekit-0.25.1/arekit/common/text/partitioning.py +14 -9
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/utils.py +11 -52
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/data/contents/opinions.py +13 -3
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/data/storages/pandas_based.py +2 -17
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/data/storages/row_cache.py +8 -2
- arekit-0.25.1/arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/io_utils/utils.py +1 -18
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
- arekit-0.25.1/arekit.egg-info/PKG-INFO +81 -0
- arekit-0.25.1/arekit.egg-info/SOURCES.txt +189 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit.egg-info/requires.txt +0 -1
- arekit-0.25.1/logo.png +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/setup.py +9 -5
- arekit-0.24.0/PKG-INFO +0 -19
- arekit-0.24.0/README.md +0 -56
- arekit-0.24.0/arekit/common/data/input/repositories/base.py +0 -68
- arekit-0.24.0/arekit/common/data/input/repositories/sample.py +0 -22
- arekit-0.24.0/arekit/common/data/views/samples.py +0 -26
- arekit-0.24.0/arekit/common/docs/objects_parser.py +0 -37
- arekit-0.24.0/arekit/common/docs/parser.py +0 -34
- arekit-0.24.0/arekit/common/pipeline/base.py +0 -25
- arekit-0.24.0/arekit/common/pipeline/items/base.py +0 -12
- arekit-0.24.0/arekit/common/text/parser.py +0 -12
- arekit-0.24.0/arekit/common/text/partitioning/base.py +0 -4
- arekit-0.24.0/arekit/common/text/partitioning/terms.py +0 -35
- arekit-0.24.0/arekit/contrib/networks/embedding.py +0 -149
- arekit-0.24.0/arekit/contrib/networks/embedding_io.py +0 -18
- arekit-0.24.0/arekit/contrib/networks/input/const.py +0 -6
- arekit-0.24.0/arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit-0.24.0/arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit-0.24.0/arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit-0.24.0/arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit-0.24.0/arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit-0.24.0/arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit-0.24.0/arekit/contrib/networks/input/providers/text.py +0 -24
- arekit-0.24.0/arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit-0.24.0/arekit/contrib/networks/input/term_types.py +0 -13
- arekit-0.24.0/arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit-0.24.0/arekit/contrib/networks/vectorizer.py +0 -6
- arekit-0.24.0/arekit/contrib/source/brat/annot.py +0 -84
- arekit-0.24.0/arekit/contrib/source/brat/doc.py +0 -28
- arekit-0.24.0/arekit/contrib/source/brat/entities/compound.py +0 -13
- arekit-0.24.0/arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit-0.24.0/arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit-0.24.0/arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit-0.24.0/arekit/contrib/source/brat/relation.py +0 -32
- arekit-0.24.0/arekit/contrib/source/brat/sentence.py +0 -69
- arekit-0.24.0/arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit-0.24.0/arekit/contrib/source/download.py +0 -41
- arekit-0.24.0/arekit/contrib/source/nerel/entities.py +0 -55
- arekit-0.24.0/arekit/contrib/source/nerel/folding/fixed.py +0 -74
- arekit-0.24.0/arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit-0.24.0/arekit/contrib/source/nerel/labels.py +0 -241
- arekit-0.24.0/arekit/contrib/source/nerel/reader.py +0 -46
- arekit-0.24.0/arekit/contrib/source/nerel/utils.py +0 -24
- arekit-0.24.0/arekit/contrib/source/nerel/versions.py +0 -12
- arekit-0.24.0/arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit-0.24.0/arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit-0.24.0/arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit-0.24.0/arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit-0.24.0/arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit-0.24.0/arekit/contrib/source/ruattitudes/doc.py +0 -51
- arekit-0.24.0/arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
- arekit-0.24.0/arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit-0.24.0/arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit-0.24.0/arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit-0.24.0/arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit-0.24.0/arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit-0.24.0/arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit-0.24.0/arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit-0.24.0/arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit-0.24.0/arekit/contrib/source/ruattitudes/text_object.py +0 -59
- arekit-0.24.0/arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit-0.24.0/arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit-0.24.0/arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit-0.24.0/arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit-0.24.0/arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit-0.24.0/arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit-0.24.0/arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit-0.24.0/arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit-0.24.0/arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit-0.24.0/arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/const.py +0 -3
- arekit-0.24.0/arekit/contrib/source/rusentrel/docs_reader.py +0 -51
- arekit-0.24.0/arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit-0.24.0/arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit-0.24.0/arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit-0.24.0/arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit-0.24.0/arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit-0.24.0/arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit-0.24.0/arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit-0.24.0/arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit-0.24.0/arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit-0.24.0/arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/sentinerel/folding/factory.py +0 -31
- arekit-0.24.0/arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
- arekit-0.24.0/arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit-0.24.0/arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit-0.24.0/arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit-0.24.0/arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit-0.24.0/arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/source/synonyms/utils.py +0 -19
- arekit-0.24.0/arekit/contrib/source/zip_utils.py +0 -47
- arekit-0.24.0/arekit/contrib/utils/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/bert/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit-0.24.0/arekit/contrib/utils/data/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/contents/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/readers/base.py +0 -7
- arekit-0.24.0/arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit-0.24.0/arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit-0.24.0/arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/service/balance.py +0 -50
- arekit-0.24.0/arekit/contrib/utils/data/storages/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/writers/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit-0.24.0/arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit-0.24.0/arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit-0.24.0/arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
- arekit-0.24.0/arekit/contrib/utils/download.py +0 -77
- arekit-0.24.0/arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit-0.24.0/arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit-0.24.0/arekit/contrib/utils/entities/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit-0.24.0/arekit/contrib/utils/io_utils/opinions.py +0 -37
- arekit-0.24.0/arekit/contrib/utils/io_utils/samples.py +0 -79
- arekit-0.24.0/arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/lexicons/lexicon.py +0 -41
- arekit-0.24.0/arekit/contrib/utils/lexicons/relation.py +0 -42
- arekit-0.24.0/arekit/contrib/utils/lexicons/rusentilex.py +0 -37
- arekit-0.24.0/arekit/contrib/utils/nn/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/nn/rows.py +0 -83
- arekit-0.24.0/arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit-0.24.0/arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit-0.24.0/arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit-0.24.0/arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
- arekit-0.24.0/arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
- arekit-0.24.0/arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit-0.24.0/arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit-0.24.0/arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit-0.24.0/arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit-0.24.0/arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit-0.24.0/arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit-0.24.0/arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit-0.24.0/arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit-0.24.0/arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit-0.24.0/arekit/contrib/utils/resources.py +0 -25
- arekit-0.24.0/arekit/contrib/utils/serializer.py +0 -43
- arekit-0.24.0/arekit/contrib/utils/sources/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit-0.24.0/arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit-0.24.0/arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit-0.24.0/arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- arekit-0.24.0/arekit/download_data.py +0 -11
- arekit-0.24.0/arekit.egg-info/PKG-INFO +0 -19
- arekit-0.24.0/arekit.egg-info/SOURCES.txt +0 -377
- {arekit-0.24.0 → arekit-0.25.1}/LICENSE +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/bound.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/context/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/context/terms_mapper.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/context/token.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/const.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/doc_provider.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/columns/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/columns/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/columns/sample.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/const.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/contents.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/multiple.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/instances/single.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/binary.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/label/multiple.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/rows/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/rows/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/rows/samples.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/sample/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/sample/cropped.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/text/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/providers/text/single.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/sample.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/input/terms_mapper.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/rows_fmt.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/data/rows_parser.py +0 -0
- {arekit-0.24.0/arekit/common/data/input/repositories → arekit-0.25.1/arekit/common/data/storages}/__init__.py +0 -0
- {arekit-0.24.0/arekit/common/data/storages → arekit-0.25.1/arekit/common/docs}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/entity.py +0 -0
- {arekit-0.24.0/arekit/common/data/views → arekit-0.25.1/arekit/common/docs/parsed}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/parsed/base.py +0 -0
- {arekit-0.24.0/arekit/common/docs → arekit-0.25.1/arekit/common/docs/parsed/providers}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/base_pairs.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/entity_service.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/opinion_pairs.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/parsed/providers/text_opinion_pairs.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/parsed/service.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/parsed/term_position.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/docs/sentence.py +0 -0
- {arekit-0.24.0/arekit/common/docs/parsed → arekit-0.25.1/arekit/common/entities}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/entities/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/entities/collection.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/entities/str_fmt.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/entities/types.py +0 -0
- {arekit-0.24.0/arekit/common/docs/parsed/providers → arekit-0.25.1/arekit/common/experiment}/__init__.py +0 -0
- {arekit-0.24.0/arekit/common/entities → arekit-0.25.1/arekit/common/experiment/api}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/experiment/api/base_samples_io.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/experiment/data_type.py +0 -0
- {arekit-0.24.0/arekit/common/experiment → arekit-0.25.1/arekit/common/frames}/__init__.py +0 -0
- {arekit-0.24.0/arekit/common/experiment/api → arekit-0.25.1/arekit/common/frames/connotations}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/frames/connotations/descriptor.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/frames/connotations/provider.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/frames/text_variant.py +0 -0
- {arekit-0.24.0/arekit/common/frames → arekit-0.25.1/arekit/common/frames/variants}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/frames/variants/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/frames/variants/collection.py +0 -0
- {arekit-0.24.0/arekit/common/frames/connotations → arekit-0.25.1/arekit/common/labels}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/labels/base.py +0 -0
- {arekit-0.24.0/arekit/common/frames/variants → arekit-0.25.1/arekit/common/labels/provider}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/labels/provider/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/labels/provider/constant.py +0 -0
- {arekit-0.24.0/arekit/common/labels → arekit-0.25.1/arekit/common/labels/scaler}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/labels/scaler/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/labels/scaler/sentiment.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/labels/scaler/single.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/labels/str_fmt.py +0 -0
- {arekit-0.24.0/arekit/common/labels/provider → arekit-0.25.1/arekit/common/linkage}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/linkage/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/linkage/meta.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/linkage/opinions.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/linkage/text_opinions.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/log_utils.py +0 -0
- {arekit-0.24.0/arekit/common/labels/scaler → arekit-0.25.1/arekit/common/model}/__init__.py +0 -0
- {arekit-0.24.0/arekit/common/linkage → arekit-0.25.1/arekit/common/model/labeling}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/model/labeling/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/model/labeling/modes.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/model/labeling/single.py +0 -0
- {arekit-0.24.0/arekit/common/model → arekit-0.25.1/arekit/common/opinions}/__init__.py +0 -0
- {arekit-0.24.0/arekit/common/model/labeling → arekit-0.25.1/arekit/common/opinions/annot}/__init__.py +0 -0
- {arekit-0.24.0/arekit/common/opinions → arekit-0.25.1/arekit/common/opinions/annot/algo}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo/pair_based.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo/predefined.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/opinions/annot/algo_based.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/opinions/annot/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/opinions/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/opinions/collection.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/opinions/enums.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/opinions/provider.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/opinions/writer.py +0 -0
- {arekit-0.24.0/arekit/common/opinions/annot → arekit-0.25.1/arekit/common/pipeline}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/pipeline/conts.py +0 -0
- {arekit-0.24.0/arekit/common/opinions/annot/algo → arekit-0.25.1/arekit/common/pipeline/items}/__init__.py +0 -0
- {arekit-0.24.0/arekit/common/pipeline → arekit-0.25.1/arekit/common/synonyms}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/synonyms/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/synonyms/grouping.py +0 -0
- {arekit-0.24.0/arekit/common/pipeline/items → arekit-0.25.1/arekit/common/text}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/text/enums.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/text/parsed.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/text/stemmer.py +0 -0
- {arekit-0.24.0/arekit/common/synonyms → arekit-0.25.1/arekit/common/text_opinions}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/common/text_opinions/base.py +0 -0
- {arekit-0.24.0/arekit/common/text → arekit-0.25.1/arekit/contrib}/__init__.py +0 -0
- {arekit-0.24.0/arekit/common/text/partitioning → arekit-0.25.1/arekit/contrib/bert}/__init__.py +0 -0
- {arekit-0.24.0/arekit/common/text_opinions → arekit-0.25.1/arekit/contrib/bert/input}/__init__.py +0 -0
- {arekit-0.24.0/arekit/contrib → arekit-0.25.1/arekit/contrib/bert/input/providers}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/bert/input/providers/cropped_sample.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/bert/input/providers/text_pair.py +0 -0
- {arekit-0.24.0/arekit/contrib/bert → arekit-0.25.1/arekit/contrib/bert/terms}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/bert/terms/mapper.py +0 -0
- {arekit-0.24.0/arekit/contrib/bert/input → arekit-0.25.1/arekit/contrib/prompt}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/prompt/sample.py +0 -0
- {arekit-0.24.0/arekit/contrib/bert/input/providers → arekit-0.25.1/arekit/contrib/utils}/__init__.py +0 -0
- {arekit-0.24.0/arekit/contrib/bert/terms → arekit-0.25.1/arekit/contrib/utils/bert}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/bert/samplers.py +0 -0
- {arekit-0.24.0/arekit/contrib/networks → arekit-0.25.1/arekit/contrib/utils/data}/__init__.py +0 -0
- {arekit-0.24.0/arekit/contrib/networks/input → arekit-0.25.1/arekit/contrib/utils/data/contents}/__init__.py +0 -0
- {arekit-0.24.0/arekit/contrib/networks/input/embedding → arekit-0.25.1/arekit/contrib/utils/data/doc_provider}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/data/doc_provider/dict_based.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/data/doc_provider/dir_based.py +0 -0
- {arekit-0.24.0/arekit/contrib/networks/input/formatters → arekit-0.25.1/arekit/contrib/utils/data/storages}/__init__.py +0 -0
- {arekit-0.24.0/arekit/contrib/networks/input/providers → arekit-0.25.1/arekit/contrib/utils/data/writers}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/data/writers/base.py +0 -0
- {arekit-0.24.0/arekit/contrib/prompt → arekit-0.25.1/arekit/contrib/utils/entities}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/entities/filter.py +0 -0
- {arekit-0.24.0/arekit/contrib/source → arekit-0.25.1/arekit/contrib/utils/entities/formatters}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/entities/formatters/str_display.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +0 -0
- {arekit-0.24.0/arekit/contrib/source/brat → arekit-0.25.1/arekit/contrib/utils/io_utils}/__init__.py +0 -0
- {arekit-0.24.0/arekit/contrib/source/brat/entities → arekit-0.25.1/arekit/contrib/utils/pipelines}/__init__.py +0 -0
- {arekit-0.24.0/arekit/contrib/source/brat/opinions → arekit-0.25.1/arekit/contrib/utils/pipelines/items}/__init__.py +0 -0
- {arekit-0.24.0/arekit/contrib/source/nerel → arekit-0.25.1/arekit/contrib/utils/pipelines/items/text}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/opinion_collections.py +0 -0
- {arekit-0.24.0/arekit/contrib/source/nerel/folding → arekit-0.25.1/arekit/contrib/utils/pipelines/text_opinion}/__init__.py +0 -0
- {arekit-0.24.0/arekit/contrib/source/nerelbio → arekit-0.25.1/arekit/contrib/utils/pipelines/text_opinion/annot}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +0 -0
- {arekit-0.24.0/arekit/contrib/source/ruattitudes → arekit-0.25.1/arekit/contrib/utils/pipelines/text_opinion/filters}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/base.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +0 -0
- {arekit-0.24.0/arekit/contrib/source/ruattitudes/entity → arekit-0.25.1/arekit/contrib/utils/processing}/__init__.py +0 -0
- {arekit-0.24.0/arekit/contrib/source/ruattitudes/opinions → arekit-0.25.1/arekit/contrib/utils/synonyms}/__init__.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/synonyms/simple.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit/contrib/utils/synonyms/stemmer_based.py +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit.egg-info/dependency_links.txt +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/arekit.egg-info/top_level.txt +0 -0
- {arekit-0.24.0 → arekit-0.25.1}/setup.cfg +0 -0
arekit-0.25.1/PKG-INFO
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: arekit
|
|
3
|
+
Version: 0.25.1
|
|
4
|
+
Summary: Document level Attitude and Relation Extraction toolkit (AREkit) for sampling and prompting mass-media news into datasets for ML-model training
|
|
5
|
+
Home-page: https://github.com/nicolay-r/AREkit
|
|
6
|
+
Author: Nicolay Rusnachenko
|
|
7
|
+
Author-email: rusnicolay@gmail.com
|
|
8
|
+
License: MIT License
|
|
9
|
+
Keywords: natural language processing,relation extraction,sentiment analysis
|
|
10
|
+
Classifier: Programming Language :: Python
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
12
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
14
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
15
|
+
Requires-Python: >=3.6
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: tqdm
|
|
19
|
+
Requires-Dist: enum34==1.1.10
|
|
20
|
+
Requires-Dist: numpy>=1.14.5
|
|
21
|
+
|
|
22
|
+
# AREkit 0.25.1
|
|
23
|
+
|
|
24
|
+

|
|
25
|
+
|
|
26
|
+
<p align="center">
|
|
27
|
+
<img src="logo.png"/>
|
|
28
|
+
</p>
|
|
29
|
+
|
|
30
|
+
**AREkit** (Attitude and Relation Extraction Toolkit) --
|
|
31
|
+
is a python toolkit, devoted to document level Attitude and Relation Extraction between text objects from mass-media news.
|
|
32
|
+
|
|
33
|
+
## Description
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
This toolkit aims at memory-effective data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
|
|
37
|
+
|
|
38
|
+
<p align="center">
|
|
39
|
+
<img src="docs/arekit-pipeline-concept.png"/>
|
|
40
|
+
</p>
|
|
41
|
+
|
|
42
|
+
> Figure: AREkit pipelines design. More on
|
|
43
|
+
> **[ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction](https://link.springer.com/chapter/10.1007/978-3-031-56069-9_23)** paper
|
|
44
|
+
|
|
45
|
+
In particular, this framework serves the following features:
|
|
46
|
+
* ➿ [pipelines](https://github.com/nicolay-r/AREkit/wiki/Pipelines:-Text-Opinion-Annotation) and iterators for handling large-scale collections serialization without out-of-memory issues.
|
|
47
|
+
* 🔗 EL (entity-linking) API support for objects,
|
|
48
|
+
* ➰ avoidance of cyclic connections,
|
|
49
|
+
* :straight_ruler: distance consideration between relation participants (in `terms` or `sentences`),
|
|
50
|
+
* 📑 relations annotations and filtering rules,
|
|
51
|
+
* *️⃣ entities formatting or masking, and more.
|
|
52
|
+
|
|
53
|
+
The core functionality includes:
|
|
54
|
+
* API for document presentation with EL (Entity Linking, i.e. Object Synonymy) support
|
|
55
|
+
for sentence level relations preparation (dubbed as contexts);
|
|
56
|
+
* API for contexts extraction;
|
|
57
|
+
* Relations transferring from sentence-level onto document-level, and more.
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install git+https://github.com/nicolay-r/AREkit.git@0.25.1-rc
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Usage
|
|
66
|
+
|
|
67
|
+
Please follow the **[tutorial section on project Wiki](https://github.com/nicolay-r/AREkit/wiki/Tutorials)** for more details.
|
|
68
|
+
|
|
69
|
+
## How to cite
|
|
70
|
+
Great research is also accompanied by faithful references.
|
|
71
|
+
If you use or extend our work, please cite as follows:
|
|
72
|
+
|
|
73
|
+
```bibtex
|
|
74
|
+
@inproceedings{rusnachenko2024arelight,
|
|
75
|
+
title={ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction},
|
|
76
|
+
author={Rusnachenko, Nicolay and Liang, Huizhi and Kolomeets, Maxim and Shi, Lei},
|
|
77
|
+
booktitle={European Conference on Information Retrieval},
|
|
78
|
+
year={2024},
|
|
79
|
+
organization={Springer}
|
|
80
|
+
}
|
|
81
|
+
```
|
arekit-0.25.1/README.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# AREkit 0.25.1
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<img src="logo.png"/>
|
|
7
|
+
</p>
|
|
8
|
+
|
|
9
|
+
**AREkit** (Attitude and Relation Extraction Toolkit) --
|
|
10
|
+
is a python toolkit, devoted to document level Attitude and Relation Extraction between text objects from mass-media news.
|
|
11
|
+
|
|
12
|
+
## Description
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
This toolkit aims at memory-effective data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
|
|
16
|
+
|
|
17
|
+
<p align="center">
|
|
18
|
+
<img src="docs/arekit-pipeline-concept.png"/>
|
|
19
|
+
</p>
|
|
20
|
+
|
|
21
|
+
> Figure: AREkit pipelines design. More on
|
|
22
|
+
> **[ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction](https://link.springer.com/chapter/10.1007/978-3-031-56069-9_23)** paper
|
|
23
|
+
|
|
24
|
+
In particular, this framework serves the following features:
|
|
25
|
+
* ➿ [pipelines](https://github.com/nicolay-r/AREkit/wiki/Pipelines:-Text-Opinion-Annotation) and iterators for handling large-scale collections serialization without out-of-memory issues.
|
|
26
|
+
* 🔗 EL (entity-linking) API support for objects,
|
|
27
|
+
* ➰ avoidance of cyclic connections,
|
|
28
|
+
* :straight_ruler: distance consideration between relation participants (in `terms` or `sentences`),
|
|
29
|
+
* 📑 relations annotations and filtering rules,
|
|
30
|
+
* *️⃣ entities formatting or masking, and more.
|
|
31
|
+
|
|
32
|
+
The core functionality includes:
|
|
33
|
+
* API for document presentation with EL (Entity Linking, i.e. Object Synonymy) support
|
|
34
|
+
for sentence level relations preparation (dubbed as contexts);
|
|
35
|
+
* API for contexts extraction;
|
|
36
|
+
* Relations transferring from sentence-level onto document-level, and more.
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install git+https://github.com/nicolay-r/AREkit.git@0.25.1-rc
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Usage
|
|
45
|
+
|
|
46
|
+
Please follow the **[tutorial section on project Wiki](https://github.com/nicolay-r/AREkit/wiki/Tutorials)** for more details.
|
|
47
|
+
|
|
48
|
+
## How to cite
|
|
49
|
+
Great research is also accompanied by faithful references.
|
|
50
|
+
If you use or extend our work, please cite as follows:
|
|
51
|
+
|
|
52
|
+
```bibtex
|
|
53
|
+
@inproceedings{rusnachenko2024arelight,
|
|
54
|
+
title={ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction},
|
|
55
|
+
author={Rusnachenko, Nicolay and Liang, Huizhi and Kolomeets, Maxim and Shi, Lei},
|
|
56
|
+
booktitle={European Conference on Information Retrieval},
|
|
57
|
+
year={2024},
|
|
58
|
+
organization={Springer}
|
|
59
|
+
}
|
|
60
|
+
```
|
|
@@ -10,6 +10,9 @@ logger = logging.getLogger(__name__)
|
|
|
10
10
|
|
|
11
11
|
class BaseRowsStorage(object):
|
|
12
12
|
|
|
13
|
+
def __init__(self, log_out=None):
|
|
14
|
+
self.__log_out = log_out
|
|
15
|
+
|
|
13
16
|
# region protected methods
|
|
14
17
|
|
|
15
18
|
def _begin_filling_row(self, row_ind):
|
|
@@ -31,27 +34,12 @@ class BaseRowsStorage(object):
|
|
|
31
34
|
def _get_rows_count(self):
|
|
32
35
|
raise NotImplemented()
|
|
33
36
|
|
|
34
|
-
def find_by_value(self, column_name, value):
|
|
35
|
-
raise NotImplemented()
|
|
36
|
-
|
|
37
|
-
def find_first_by_value(self, column_name, value):
|
|
38
|
-
raise NotImplemented()
|
|
39
|
-
|
|
40
|
-
def iter_column_values(self, column_name, dtype=None):
|
|
41
|
-
raise NotImplemented()
|
|
42
|
-
|
|
43
37
|
def get_row(self, row_index):
|
|
44
38
|
raise NotImplemented()
|
|
45
39
|
|
|
46
|
-
def get_cell(self, row_index, column_name):
|
|
47
|
-
raise NotImplemented()
|
|
48
|
-
|
|
49
40
|
def init_empty(self, columns_provider):
|
|
50
41
|
raise NotImplemented()
|
|
51
42
|
|
|
52
|
-
def iter_shuffled(self):
|
|
53
|
-
raise NotImplemented()
|
|
54
|
-
|
|
55
43
|
def iter_column_names(self):
|
|
56
44
|
raise NotImplemented()
|
|
57
45
|
|
|
@@ -81,6 +69,7 @@ class BaseRowsStorage(object):
|
|
|
81
69
|
condition_func=lambda item: not isinstance(item[1], MetaEmptyLinkedDataWrapper),
|
|
82
70
|
postfix_func=postfix_func,
|
|
83
71
|
desc="{fmt}".format(fmt=desc),
|
|
72
|
+
file=self.__log_out,
|
|
84
73
|
total=rows_count)
|
|
85
74
|
|
|
86
75
|
for row_index, item in enumerate(pbar_it):
|
|
@@ -4,8 +4,9 @@ from arekit.common.pipeline.items.base import BasePipelineItem
|
|
|
4
4
|
|
|
5
5
|
class EntitiesGroupingPipelineItem(BasePipelineItem):
|
|
6
6
|
|
|
7
|
-
def __init__(self, value_to_group_id_func):
|
|
7
|
+
def __init__(self, value_to_group_id_func, **kwargs):
|
|
8
8
|
assert(callable(value_to_group_id_func))
|
|
9
|
+
super(EntitiesGroupingPipelineItem, self).__init__(**kwargs)
|
|
9
10
|
self.__value_to_group_id_func = value_to_group_id_func
|
|
10
11
|
|
|
11
12
|
def apply_core(self, input_data, pipeline_ctx):
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from arekit.common.docs.base import Document
|
|
2
|
+
from arekit.common.docs.parsed.base import ParsedDocument
|
|
3
|
+
from arekit.common.pipeline.batching import BatchingPipelineLauncher
|
|
4
|
+
from arekit.common.pipeline.context import PipelineContext
|
|
5
|
+
from arekit.common.pipeline.utils import BatchIterator
|
|
6
|
+
from arekit.common.text.parsed import BaseParsedText
|
|
7
|
+
from arekit.common.utils import progress_bar_defined
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DocumentParsers(object):
    """ Document parsing routines built on top of pipelines.
    """

    @staticmethod
    def parse_batch(doc, pipeline_items, batch_size, parent_ppl_ctx=None, src_key="input", show_progress=False):
        """ Parse the document by passing its sentences through the pipeline in batches.

            doc: Document
                source document; its sentences are fetched by index via `get_sentence`.
            pipeline_items: list
                pipeline items applied to every batch by `BatchingPipelineLauncher`.
            batch_size: int
                positive amount of sentences per batch (the last batch might be shorter).
            parent_ppl_ctx: PipelineContext or None
                optional parent context attached to the context of every batch.
            src_key: str
                context key under which the batch of sentences is stored
                and from which the first pipeline item is forced to read.
            show_progress: bool
                whether the progress bar should be displayed.

            returns: ParsedDocument
                composed from the values found under the "result" key
                of the context once the pipeline has been applied to a batch.
        """
        assert(isinstance(batch_size, int) and batch_size > 0)
        assert(isinstance(doc, Document))
        assert(isinstance(pipeline_items, list))
        assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None)

        parsed_sentences = []

        sentences_count = doc.SentencesCount

        # Ceiling division: the trailing partial batch is a batch too.
        # NOTE: `round` under-counts here, e.g. 5 sentences with batch_size=2
        # result in 3 batches, while round(2.5) == 2.
        batches_count = (sentences_count + batch_size - 1) // batch_size

        data_it = BatchIterator(data_iter=iter(range(sentences_count)), batch_size=batch_size)
        progress_it = progress_bar_defined(data_it, total=batches_count,
                                           disable=not show_progress)

        for batch in progress_it:

            # Composing the context from the batch of sentences.
            ctx = PipelineContext({src_key: [doc.get_sentence(s_ind) for s_ind in batch]},
                                  parent_ctx=parent_ppl_ctx)

            # Apply all the operations.
            BatchingPipelineLauncher.run(pipeline=pipeline_items, pipeline_ctx=ctx, src_key=src_key)

            # Collecting the result: one parsed text per entry of the batch result.
            parsed_sentences += [BaseParsedText(terms=result) for result in ctx.provide("result")]

        return ParsedDocument(doc_id=doc.ID, parsed_sentences=parsed_sentences)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from arekit.common.pipeline.context import PipelineContext
|
|
2
|
+
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BasePipelineLauncher:
    """ Sequentially applies pipeline items, sharing the data between them
        by means of the pipeline context.
    """

    @staticmethod
    def run(pipeline, pipeline_ctx, src_key=None, has_input=True):
        """ Launch every non-None item of the pipeline, one after another.

            pipeline: list
                pipeline items; None entries are skipped.
            pipeline_ctx: PipelineContext
                context the items read their sources from; the result of every
                item is written back under the `ResultKey` of that item.
            src_key: str or None
                when given, the first launched item is forced to read from this key.
            has_input: bool
                when False, the first launched item receives None as its input.

            returns: PipelineContext
                the very same context instance that was passed.
        """
        assert(isinstance(pipeline, list))
        assert(isinstance(pipeline_ctx, PipelineContext))
        assert(isinstance(src_key, str) or src_key is None)

        launched_items = (itm for itm in pipeline if itm is not None)

        for position, ppl_item in enumerate(launched_items):
            assert(isinstance(ppl_item, BasePipelineItem))

            is_first = position == 0

            if is_first and not has_input:
                # Nothing to provide for the very first item.
                source = None
            else:
                # Only the first item might be redirected onto the forced key.
                forced_key = src_key if is_first else None
                source = ppl_item.get_source(pipeline_ctx, force_key=forced_key)

            outcome = ppl_item.apply(input_data=source, pipeline_ctx=pipeline_ctx)
            pipeline_ctx.update(param=ppl_item.ResultKey, value=outcome, is_new_key=False)

        return pipeline_ctx
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from arekit.common.pipeline.context import PipelineContext
|
|
2
|
+
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BatchingPipelineLauncher:
    """ Applies pipeline items towards batches of data stored in the pipeline context.
    """

    @staticmethod
    def run(pipeline, pipeline_ctx, src_key=None):
        """ Launch every non-None item of the pipeline over the batch.

            pipeline: list
                pipeline items; None entries are skipped.
            pipeline_ctx: PipelineContext
                context that keeps the batch; the result of every item is
                written back under the `ResultKey` of that item.
            src_key: str or None
                when given, the first launched item is forced to read from this key.

            returns: PipelineContext
                the very same context instance that was passed.
        """
        assert(isinstance(pipeline, list))
        assert(isinstance(pipeline_ctx, PipelineContext))
        assert(isinstance(src_key, str) or src_key is None)

        launched_items = (itm for itm in pipeline if itm is not None)

        for position, ppl_item in enumerate(launched_items):
            assert (isinstance(ppl_item, BasePipelineItem))

            # The source function is not called for the whole batch,
            # instead it is applied below to every entry of the batch.
            forced_key = src_key if position == 0 else None
            raw_batch = ppl_item.get_source(pipeline_ctx, call_func=False, force_key=forced_key)

            entry_func = ppl_item._src_func
            prepared = []
            for entry in raw_batch:
                prepared.append(entry if entry_func is None else entry_func(entry))

            if ppl_item.SupportBatching:
                # The item handles the whole batch within a single call.
                outcome = list(ppl_item.apply(input_data=prepared, pipeline_ctx=pipeline_ctx))
            else:
                # The item is applied entry by entry.
                outcome = []
                for input_data in prepared:
                    outcome.append(ppl_item.apply(input_data=input_data, pipeline_ctx=pipeline_ctx))

            pipeline_ctx.update(param=ppl_item.ResultKey, value=outcome, is_new_key=False)

        return pipeline_ctx
|
|
@@ -13,6 +13,8 @@ class PipelineContext(object):
|
|
|
13
13
|
self._d[PARENT_CTX] = parent_ctx
|
|
14
14
|
|
|
15
15
|
def __provide(self, param):
|
|
16
|
+
if param not in self._d:
|
|
17
|
+
raise Exception(f"Key `{param}` is not in dictionary.\n{self._d}")
|
|
16
18
|
return self._d[param]
|
|
17
19
|
|
|
18
20
|
# region public
|
|
@@ -23,7 +25,9 @@ class PipelineContext(object):
|
|
|
23
25
|
def provide_or_none(self, param):
|
|
24
26
|
return self.__provide(param) if param in self._d else None
|
|
25
27
|
|
|
26
|
-
def update(self, param, value):
|
|
28
|
+
def update(self, param, value, is_new_key=False):
|
|
29
|
+
if is_new_key and param in self._d:
|
|
30
|
+
raise Exception(f"Key `{param}` is already presented in pipeline context dictionary.")
|
|
27
31
|
self._d[param] = value
|
|
28
32
|
|
|
29
33
|
# endregion
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from arekit.common.pipeline.context import PipelineContext
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BasePipelineItem(object):
    """ A single processing step that can be instantiated and placed into a pipeline.
    """

    def __init__(self, src_key="result", result_key="result", src_func=None):
        """ src_key: str or None
                context key to read the input from; None stands for the absent source.
            result_key: str
                key exposed through the `ResultKey` property.
            src_func: callable or None
                optional function applied towards the extracted source data.
        """
        assert(src_key is None or isinstance(src_key, str))
        assert(src_func is None or callable(src_func))
        self.__src_key = src_key
        self.__result_key = result_key
        self._src_func = src_func

    @property
    def ResultKey(self):
        return self.__result_key

    @property
    def SupportBatching(self):
        """ Batching is not supported unless this property is overridden.
        """
        return False

    def get_source(self, src_ctx, call_func=True, force_key=None):
        """ Extract the input data of this item from the context.

            src_ctx: PipelineContext
                context to read the data from.
            call_func: bool
                whether the source function (if any) should be applied.
            force_key: str or None
                key to be used instead of the source key of the item.

            returns: None for the item without a source key, otherwise the
                extracted (and optionally transformed) data.
        """
        assert(isinstance(src_ctx, PipelineContext))

        # The item without the source key has nothing to extract.
        if self.__src_key is None:
            return None

        lookup_key = force_key if force_key is not None else self.__src_key
        extracted = src_ctx.provide(lookup_key)

        if call_func and self._src_func is not None:
            return self._src_func(extracted)

        return extracted

    def apply_core(self, input_data, pipeline_ctx):
        """ Nothing is done by default."""
        return None

    def apply(self, input_data, pipeline_ctx=None):
        """ Process the input and provide the data for the further pipeline items:
            the output of `apply_core`, or the original input when there is no output.
        """
        processed = self.apply_core(input_data=input_data, pipeline_ctx=pipeline_ctx)
        if processed is None:
            return input_data
        return processed
|
|
@@ -5,10 +5,14 @@ class FlattenIterPipelineItem(BasePipelineItem):
|
|
|
5
5
|
""" Considered to flat iterations of items that represent iterations.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
+
def __init__(self, **kwargs):
|
|
9
|
+
super(FlattenIterPipelineItem, self).__init__(**kwargs)
|
|
10
|
+
pass
|
|
11
|
+
|
|
8
12
|
def __flat_iter(self, iter_data):
|
|
9
13
|
for iter_item in iter_data:
|
|
10
14
|
for item in iter_item:
|
|
11
15
|
yield item
|
|
12
16
|
|
|
13
17
|
def apply_core(self, input_data, pipeline_ctx):
|
|
14
|
-
return self.__flat_iter(input_data)
|
|
18
|
+
return self.__flat_iter(input_data)
|
|
@@ -3,8 +3,9 @@ from arekit.common.pipeline.items.base import BasePipelineItem
|
|
|
3
3
|
|
|
4
4
|
class HandleIterPipelineItem(BasePipelineItem):
|
|
5
5
|
|
|
6
|
-
def __init__(self, handle_func=None):
|
|
6
|
+
def __init__(self, handle_func=None, **kwargs):
|
|
7
7
|
assert(callable(handle_func))
|
|
8
|
+
super(HandleIterPipelineItem, self).__init__(**kwargs)
|
|
8
9
|
self.__handle_func = handle_func
|
|
9
10
|
|
|
10
11
|
def __updated_data(self, items_iter):
|
|
@@ -3,8 +3,9 @@ from arekit.common.pipeline.items.base import BasePipelineItem
|
|
|
3
3
|
|
|
4
4
|
class FilterPipelineItem(BasePipelineItem):
|
|
5
5
|
|
|
6
|
-
def __init__(self, filter_func=None):
|
|
6
|
+
def __init__(self, filter_func=None, **kwargs):
|
|
7
7
|
assert(callable(filter_func))
|
|
8
|
+
super(FilterPipelineItem, self).__init__(**kwargs)
|
|
8
9
|
self.__filter_func = filter_func
|
|
9
10
|
|
|
10
11
|
def apply_core(self, input_data, pipeline_ctx):
|
|
@@ -3,8 +3,9 @@ from arekit.common.pipeline.items.base import BasePipelineItem
|
|
|
3
3
|
|
|
4
4
|
class MapPipelineItem(BasePipelineItem):
|
|
5
5
|
|
|
6
|
-
def __init__(self, map_func=None):
|
|
6
|
+
def __init__(self, map_func=None, **kwargs):
|
|
7
7
|
assert(callable(map_func))
|
|
8
|
+
super(MapPipelineItem, self).__init__(**kwargs)
|
|
8
9
|
self._map_func = map_func
|
|
9
10
|
|
|
10
11
|
def apply_core(self, input_data, pipeline_ctx):
|
|
@@ -9,5 +9,9 @@ class MapNestedPipelineItem(MapPipelineItem):
|
|
|
9
9
|
suppose to be mapped with the passed pipeline context.
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
|
+
def __init__(self, **kwargs):
|
|
13
|
+
super(MapNestedPipelineItem, self).__init__(**kwargs)
|
|
14
|
+
pass
|
|
15
|
+
|
|
12
16
|
def apply_core(self, input_data, pipeline_ctx):
|
|
13
17
|
return map(lambda item: self._map_func(item, pipeline_ctx), input_data)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
class BatchIterator:
    """ Groups the elements of the source data into lists (batches) of a fixed size.
        The last batch might be shorter when the source is exhausted earlier.
    """

    def __init__(self, data_iter, batch_size, end_value=None):
        """ data_iter: iterator or iterable
                source of the elements; iterators are consumed as they are,
                while other iterables (list, range, etc.) are wrapped with `iter`.
            batch_size: int
                positive amount of elements per batch.
            end_value: callable or None
                when given, its result is returned on every `next` call that
                happens after the source is exhausted, instead of raising
                StopIteration. NOTE: in that case a plain `for` loop over this
                iterator never ends on its own.
        """
        assert(isinstance(batch_size, int) and batch_size > 0)
        assert(callable(end_value) or end_value is None)
        # `next` is only defined for iterators, so anything else is converted first.
        self.__data_iter = data_iter if hasattr(data_iter, "__next__") else iter(data_iter)
        # Amount of batches returned so far.
        self.__index = 0
        self.__batch_size = batch_size
        self.__end_value = end_value

    def __iter__(self):
        return self

    def __next__(self):
        """ Return the next batch as a list of up to `batch_size` elements.
        """
        buffer = []
        while len(buffer) < self.__batch_size:
            try:
                buffer.append(next(self.__data_iter))
            except StopIteration:
                break

        if buffer:
            self.__index += 1
            return buffer

        # The source is exhausted and there is nothing left to return.
        if self.__end_value is None:
            raise StopIteration

        return self.__end_value()
|
|
@@ -1,28 +1,34 @@
|
|
|
1
1
|
from collections.abc import Iterable
|
|
2
2
|
|
|
3
3
|
from arekit.common.bound import Bound
|
|
4
|
-
from arekit.common.text.partitioning.base import BasePartitioning
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
class
|
|
6
|
+
class Partitioning(object):
|
|
8
7
|
""" NOTE: considering that provided parts
|
|
9
8
|
has no intersections between each other
|
|
10
9
|
"""
|
|
11
10
|
|
|
11
|
+
list_reg_types = {
|
|
12
|
+
"str": lambda p, item: p.append(item),
|
|
13
|
+
"list": lambda p, item: p.extend(item)
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
def __init__(self, text_fmt):
|
|
17
|
+
assert(isinstance(text_fmt, str) and text_fmt in self.list_reg_types)
|
|
18
|
+
self.__reg_part = self.list_reg_types[text_fmt]
|
|
19
|
+
|
|
12
20
|
def provide(self, text, parts_it):
|
|
13
|
-
assert(isinstance(text, str))
|
|
14
21
|
assert(isinstance(parts_it, Iterable))
|
|
15
22
|
|
|
16
|
-
start = 0
|
|
17
23
|
parts = []
|
|
24
|
+
start = 0
|
|
25
|
+
|
|
18
26
|
for value, bound in parts_it:
|
|
19
27
|
assert(isinstance(bound, Bound))
|
|
20
28
|
assert(bound.Position >= start)
|
|
21
29
|
|
|
22
30
|
# Release everything till the current value position.
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
parts.append(part)
|
|
31
|
+
self.__reg_part(p=parts, item=text[start:bound.Position])
|
|
26
32
|
|
|
27
33
|
# Release the entity value.
|
|
28
34
|
parts.extend([value])
|
|
@@ -30,7 +36,6 @@ class StringPartitioning(BasePartitioning):
|
|
|
30
36
|
start = bound.Position + bound.Length
|
|
31
37
|
|
|
32
38
|
# Release everything after the last entity.
|
|
33
|
-
|
|
34
|
-
parts.extend([last_part])
|
|
39
|
+
self.__reg_part(p=parts, item=text[start:len(text)])
|
|
35
40
|
|
|
36
41
|
return parts
|
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
import sys
|
|
2
1
|
import os
|
|
3
|
-
import requests
|
|
4
2
|
from tqdm import tqdm
|
|
5
3
|
|
|
6
4
|
|
|
@@ -28,14 +26,14 @@ def split_by_whitespaces(text):
|
|
|
28
26
|
return text.split()
|
|
29
27
|
|
|
30
28
|
|
|
31
|
-
def progress_bar(iterable, total, desc="", unit="it"):
|
|
29
|
+
def progress_bar(iterable, total, desc="", unit="it", file=None, disable=False):
|
|
32
30
|
if total is not None:
|
|
33
|
-
return progress_bar_defined(iterable=iterable, total=total, desc=desc, unit=unit)
|
|
31
|
+
return progress_bar_defined(iterable=iterable, total=total, desc=desc, unit=unit, file=file, disable=disable)
|
|
34
32
|
else:
|
|
35
|
-
return progress_bar_iter(iterable=iterable, desc=desc, unit=unit)
|
|
33
|
+
return progress_bar_iter(iterable=iterable, desc=desc, unit=unit, file=file, disable=disable)
|
|
36
34
|
|
|
37
35
|
|
|
38
|
-
def progress_bar_conditional(iterable, condition_func, total, postfix_func=None, desc="", unit="it"):
|
|
36
|
+
def progress_bar_conditional(iterable, condition_func, total, postfix_func=None, desc="", unit="it", file=None):
|
|
39
37
|
""" This progress-bar updates only on the
|
|
40
38
|
specific conditions during the iteration process.
|
|
41
39
|
"""
|
|
@@ -48,7 +46,7 @@ def progress_bar_conditional(iterable, condition_func, total, postfix_func=None,
|
|
|
48
46
|
yield 0
|
|
49
47
|
|
|
50
48
|
pbar_it = progress_bar(iterable=__iter_infinite_placeholder(),
|
|
51
|
-
desc=desc, unit=unit, total=total)
|
|
49
|
+
desc=desc, unit=unit, total=total, file=file)
|
|
52
50
|
element = iter(pbar_it)
|
|
53
51
|
|
|
54
52
|
# Initialize with 0.
|
|
@@ -66,7 +64,7 @@ def progress_bar_conditional(iterable, condition_func, total, postfix_func=None,
|
|
|
66
64
|
pbar_it.set_postfix(postfix_func(item))
|
|
67
65
|
|
|
68
66
|
|
|
69
|
-
def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it"):
|
|
67
|
+
def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it", file=None, disable=False):
|
|
70
68
|
return tqdm(iterable=iterable,
|
|
71
69
|
total=total,
|
|
72
70
|
desc=desc,
|
|
@@ -74,56 +72,17 @@ def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it"):
|
|
|
74
72
|
position=0,
|
|
75
73
|
leave=True,
|
|
76
74
|
unit=unit,
|
|
75
|
+
file=file,
|
|
76
|
+
disable=disable,
|
|
77
77
|
miniters=total / miniters if total is not None else total)
|
|
78
78
|
|
|
79
79
|
|
|
80
|
-
def progress_bar_iter(iterable, desc="", unit='it'):
|
|
80
|
+
def progress_bar_iter(iterable, desc="", unit='it', file=None, disable=False):
|
|
81
81
|
return tqdm(iterable=iterable,
|
|
82
82
|
desc=desc,
|
|
83
83
|
position=0,
|
|
84
84
|
leave=True,
|
|
85
85
|
ncols=120,
|
|
86
|
+
file=file,
|
|
87
|
+
disable=disable,
|
|
86
88
|
unit=unit)
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def get_default_download_dir():
|
|
90
|
-
""" Refered to NLTK toolkit approach
|
|
91
|
-
https://github.com/nltk/nltk/blob/8e771679cee1b4a9540633cc3ea17f4421ffd6c0/nltk/downloader.py#L1051
|
|
92
|
-
"""
|
|
93
|
-
|
|
94
|
-
# On Windows, use %APPDATA%
|
|
95
|
-
if sys.platform == "win32" and "APPDATA" in os.environ:
|
|
96
|
-
homedir = os.environ["APPDATA"]
|
|
97
|
-
|
|
98
|
-
# Otherwise, install in the user's home directory.
|
|
99
|
-
else:
|
|
100
|
-
homedir = os.path.expanduser("~/")
|
|
101
|
-
if homedir == "~/":
|
|
102
|
-
raise ValueError("Could not find a default download directory")
|
|
103
|
-
|
|
104
|
-
return os.path.join(homedir, ".arekit")
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
def download(dest_file_path, source_url):
|
|
108
|
-
""" Refered to https://github.com/nicolay-r/ner-bilstm-crf-tensorflow/blob/master/ner/utils.py
|
|
109
|
-
Simple http file downloader
|
|
110
|
-
"""
|
|
111
|
-
print(('Downloading from {src} to {dest}'.format(src=source_url, dest=dest_file_path)))
|
|
112
|
-
|
|
113
|
-
sys.stdout.flush()
|
|
114
|
-
datapath = os.path.dirname(dest_file_path)
|
|
115
|
-
|
|
116
|
-
if not os.path.exists(datapath):
|
|
117
|
-
os.makedirs(datapath, mode=0o755)
|
|
118
|
-
|
|
119
|
-
dest_file_path = os.path.abspath(dest_file_path)
|
|
120
|
-
|
|
121
|
-
r = requests.get(source_url, stream=True)
|
|
122
|
-
total_length = int(r.headers.get('content-length', 0))
|
|
123
|
-
|
|
124
|
-
with open(dest_file_path, 'wb') as f:
|
|
125
|
-
pbar = tqdm(total=total_length, unit='B', unit_scale=True)
|
|
126
|
-
for chunk in r.iter_content(chunk_size=32 * 1024):
|
|
127
|
-
if chunk: # filter out keep-alive new chunks
|
|
128
|
-
pbar.update(len(chunk))
|
|
129
|
-
f.write(chunk)
|
|
@@ -2,7 +2,8 @@ from arekit.common.data.input.providers.const import IDLE_MODE
|
|
|
2
2
|
from arekit.common.data.input.providers.contents import ContentsProvider
|
|
3
3
|
from arekit.common.linkage.base import LinkedDataWrapper
|
|
4
4
|
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
|
|
5
|
-
from arekit.common.pipeline.base import
|
|
5
|
+
from arekit.common.pipeline.base import BasePipelineLauncher
|
|
6
|
+
from arekit.common.pipeline.context import PipelineContext
|
|
6
7
|
from arekit.common.text_opinions.base import TextOpinion
|
|
7
8
|
|
|
8
9
|
|
|
@@ -13,7 +14,7 @@ class InputTextOpinionProvider(ContentsProvider):
|
|
|
13
14
|
results in a TextOpinionLinkage instances.
|
|
14
15
|
pipeline: id -> ... -> TextOpinionLinkage[]
|
|
15
16
|
"""
|
|
16
|
-
assert(isinstance(pipeline,
|
|
17
|
+
assert(isinstance(pipeline, list))
|
|
17
18
|
self.__pipeline = pipeline
|
|
18
19
|
self.__current_id = None
|
|
19
20
|
|
|
@@ -30,7 +31,16 @@ class InputTextOpinionProvider(ContentsProvider):
|
|
|
30
31
|
|
|
31
32
|
def from_doc_ids(self, doc_ids, idle_mode=False):
|
|
32
33
|
self.__current_id = 0
|
|
33
|
-
|
|
34
|
+
|
|
35
|
+
ctx = PipelineContext(d={
|
|
36
|
+
"result": doc_ids,
|
|
37
|
+
IDLE_MODE: idle_mode
|
|
38
|
+
})
|
|
39
|
+
|
|
40
|
+
# Launching pipeline with the passed context
|
|
41
|
+
BasePipelineLauncher.run(pipeline=self.__pipeline, pipeline_ctx=ctx)
|
|
42
|
+
|
|
43
|
+
for linkage in ctx.provide("result"):
|
|
34
44
|
assert(isinstance(linkage, LinkedDataWrapper))
|
|
35
45
|
if isinstance(linkage, TextOpinionsLinkage):
|
|
36
46
|
self.__assign_ids(linkage)
|
|
@@ -5,8 +5,9 @@ from arekit.common.data.storages.base import BaseRowsStorage
|
|
|
5
5
|
|
|
6
6
|
class JsonlBasedRowsStorage(BaseRowsStorage):
|
|
7
7
|
|
|
8
|
-
def __init__(self, rows):
|
|
8
|
+
def __init__(self, rows, **kwargs):
|
|
9
9
|
assert(isinstance(rows, list))
|
|
10
|
+
super(JsonlBasedRowsStorage, self).__init__(**kwargs)
|
|
10
11
|
self.__rows = rows
|
|
11
12
|
|
|
12
13
|
def _iter_rows(self):
|