arekit 0.23.1__tar.gz → 0.25.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit-0.25.0/PKG-INFO +82 -0
- arekit-0.25.0/README.md +60 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/context/terms_mapper.py +2 -2
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/const.py +5 -4
- arekit-0.23.1/arekit/common/experiment/api/ops_doc.py → arekit-0.25.0/arekit/common/data/doc_provider.py +1 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/columns/sample.py +6 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/instances/base.py +1 -1
- arekit-0.25.0/arekit/common/data/input/providers/rows/base.py +64 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/rows/samples.py +57 -55
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/sample/cropped.py +2 -2
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/sample.py +1 -1
- arekit-0.25.0/arekit/common/data/rows_fmt.py +82 -0
- arekit-0.25.0/arekit/common/data/rows_parser.py +43 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/storages/base.py +23 -18
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/views/samples.py +2 -8
- {arekit-0.23.1/arekit/common/news → arekit-0.25.0/arekit/common/docs}/base.py +2 -2
- {arekit-0.23.1/arekit/common/news → arekit-0.25.0/arekit/common/docs}/entities_grouping.py +2 -1
- {arekit-0.23.1/arekit/common/news → arekit-0.25.0/arekit/common/docs}/entity.py +2 -1
- {arekit-0.23.1/arekit/common/news → arekit-0.25.0/arekit/common/docs}/parsed/base.py +5 -5
- arekit-0.25.0/arekit/common/docs/parsed/providers/base.py +68 -0
- {arekit-0.23.1/arekit/common/news → arekit-0.25.0/arekit/common/docs}/parsed/providers/base_pairs.py +2 -2
- {arekit-0.23.1/arekit/common/news → arekit-0.25.0/arekit/common/docs}/parsed/providers/entity_service.py +27 -22
- {arekit-0.23.1/arekit/common/news → arekit-0.25.0/arekit/common/docs}/parsed/providers/opinion_pairs.py +2 -2
- {arekit-0.23.1/arekit/common/news → arekit-0.25.0/arekit/common/docs}/parsed/providers/text_opinion_pairs.py +6 -6
- arekit-0.25.0/arekit/common/docs/parsed/service.py +31 -0
- arekit-0.25.0/arekit/common/docs/parser.py +66 -0
- {arekit-0.23.1/arekit/common/news → arekit-0.25.0/arekit/common/docs}/sentence.py +1 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/entities/base.py +11 -2
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/experiment/api/base_samples_io.py +1 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/frames/variants/collection.py +2 -2
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/linkage/base.py +2 -2
- arekit-0.25.0/arekit/common/linkage/meta.py +23 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/linkage/opinions.py +1 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/linkage/text_opinions.py +2 -2
- arekit-0.25.0/arekit/common/opinions/annot/algo/base.py +4 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/opinions/annot/algo/pair_based.py +15 -13
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/opinions/annot/algo/predefined.py +4 -4
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/opinions/annot/algo_based.py +5 -5
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/opinions/annot/base.py +3 -3
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/opinions/base.py +7 -7
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/opinions/collection.py +3 -3
- arekit-0.25.0/arekit/common/pipeline/base.py +21 -0
- arekit-0.25.0/arekit/common/pipeline/batching.py +28 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/pipeline/context.py +5 -1
- arekit-0.25.0/arekit/common/pipeline/items/base.py +49 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/pipeline/items/flatten.py +5 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/pipeline/items/handle.py +2 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/pipeline/items/iter.py +2 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/pipeline/items/map.py +2 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/pipeline/items/map_nested.py +4 -0
- arekit-0.25.0/arekit/common/pipeline/utils.py +32 -0
- arekit-0.25.0/arekit/common/service/sqlite.py +36 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/synonyms/base.py +2 -2
- arekit-0.23.1/arekit/common/text/partitioning/str.py → arekit-0.25.0/arekit/common/text/partitioning.py +16 -11
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/text_opinions/base.py +11 -11
- arekit-0.25.0/arekit/common/utils.py +85 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/embedding.py +3 -3
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/embedding_io.py +5 -5
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/input/const.py +0 -2
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/input/providers/sample.py +15 -29
- arekit-0.25.0/arekit/contrib/networks/input/rows_parser.py +47 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/prompt/sample.py +18 -16
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/contents/opinions.py +17 -5
- arekit-0.25.0/arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- {arekit-0.23.1/arekit/contrib/utils/data/doc_ops → arekit-0.25.0/arekit/contrib/utils/data/doc_provider}/dir_based.py +7 -7
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/readers/base.py +3 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/readers/csv_pd.py +10 -4
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/readers/jsonl.py +3 -0
- arekit-0.25.0/arekit/contrib/utils/data/readers/sqlite.py +14 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/service/balance.py +0 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/storages/pandas_based.py +3 -5
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/storages/row_cache.py +18 -6
- arekit-0.25.0/arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/writers/base.py +5 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/writers/csv_native.py +3 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/writers/csv_pd.py +3 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/writers/json_opennre.py +31 -13
- arekit-0.25.0/arekit/contrib/utils/data/writers/sqlite_native.py +114 -0
- arekit-0.25.0/arekit/contrib/utils/io_utils/embedding.py +72 -0
- arekit-0.25.0/arekit/contrib/utils/io_utils/utils.py +22 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/items/sampling/base.py +31 -26
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/items/sampling/networks.py +7 -10
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +3 -3
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/items/text/tokenizer.py +3 -5
- arekit-0.25.0/arekit/contrib/utils/pipelines/items/text/translator.py +136 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/opinion_collections.py +5 -5
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +7 -7
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/text_opinion/extraction.py +34 -22
- arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +1 -1
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +3 -3
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +4 -4
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/serializer.py +4 -23
- arekit-0.25.0/arekit.egg-info/PKG-INFO +82 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit.egg-info/SOURCES.txt +33 -177
- arekit-0.25.0/logo.png +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/setup.py +9 -4
- arekit-0.23.1/PKG-INFO +0 -19
- arekit-0.23.1/README.md +0 -77
- arekit-0.23.1/arekit/common/data/input/providers/rows/base.py +0 -41
- arekit-0.23.1/arekit/common/data/row_ids/base.py +0 -79
- arekit-0.23.1/arekit/common/data/row_ids/binary.py +0 -38
- arekit-0.23.1/arekit/common/data/row_ids/multiple.py +0 -14
- arekit-0.23.1/arekit/common/folding/base.py +0 -36
- arekit-0.23.1/arekit/common/folding/fixed.py +0 -42
- arekit-0.23.1/arekit/common/folding/nofold.py +0 -15
- arekit-0.23.1/arekit/common/folding/united.py +0 -46
- arekit-0.23.1/arekit/common/news/objects_parser.py +0 -37
- arekit-0.23.1/arekit/common/news/parsed/providers/base.py +0 -48
- arekit-0.23.1/arekit/common/news/parsed/service.py +0 -31
- arekit-0.23.1/arekit/common/news/parser.py +0 -34
- arekit-0.23.1/arekit/common/opinions/annot/algo/base.py +0 -4
- arekit-0.23.1/arekit/common/pipeline/base.py +0 -25
- arekit-0.23.1/arekit/common/pipeline/items/base.py +0 -12
- arekit-0.23.1/arekit/common/text/parser.py +0 -12
- arekit-0.23.1/arekit/common/text/partitioning/base.py +0 -4
- arekit-0.23.1/arekit/common/text/partitioning/terms.py +0 -35
- arekit-0.23.1/arekit/common/utils.py +0 -98
- arekit-0.23.1/arekit/contrib/networks/input/rows_parser.py +0 -134
- arekit-0.23.1/arekit/contrib/source/brat/annot.py +0 -83
- arekit-0.23.1/arekit/contrib/source/brat/entities/compound.py +0 -33
- arekit-0.23.1/arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit-0.23.1/arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit-0.23.1/arekit/contrib/source/brat/news.py +0 -28
- arekit-0.23.1/arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit-0.23.1/arekit/contrib/source/brat/relation.py +0 -32
- arekit-0.23.1/arekit/contrib/source/brat/sentence.py +0 -69
- arekit-0.23.1/arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit-0.23.1/arekit/contrib/source/download.py +0 -41
- arekit-0.23.1/arekit/contrib/source/nerel/entities.py +0 -55
- arekit-0.23.1/arekit/contrib/source/nerel/folding/fixed.py +0 -75
- arekit-0.23.1/arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit-0.23.1/arekit/contrib/source/nerel/labels.py +0 -241
- arekit-0.23.1/arekit/contrib/source/nerel/reader.py +0 -46
- arekit-0.23.1/arekit/contrib/source/nerel/utils.py +0 -24
- arekit-0.23.1/arekit/contrib/source/nerel/versions.py +0 -12
- arekit-0.23.1/arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit-0.23.1/arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit-0.23.1/arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit-0.23.1/arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit-0.23.1/arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit-0.23.1/arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit-0.23.1/arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit-0.23.1/arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit-0.23.1/arekit/contrib/source/ruattitudes/news.py +0 -51
- arekit-0.23.1/arekit/contrib/source/ruattitudes/news_brat.py +0 -44
- arekit-0.23.1/arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit-0.23.1/arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit-0.23.1/arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit-0.23.1/arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit-0.23.1/arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit-0.23.1/arekit/contrib/source/ruattitudes/text_object.py +0 -57
- arekit-0.23.1/arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit-0.23.1/arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit-0.23.1/arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit-0.23.1/arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit-0.23.1/arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit-0.23.1/arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit-0.23.1/arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit-0.23.1/arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit-0.23.1/arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit-0.23.1/arekit/contrib/source/rusentrel/const.py +0 -3
- arekit-0.23.1/arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit-0.23.1/arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit-0.23.1/arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit-0.23.1/arekit/contrib/source/rusentrel/news_reader.py +0 -51
- arekit-0.23.1/arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit-0.23.1/arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit-0.23.1/arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit-0.23.1/arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit-0.23.1/arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit-0.23.1/arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit-0.23.1/arekit/contrib/source/sentinerel/folding/factory.py +0 -32
- arekit-0.23.1/arekit/contrib/source/sentinerel/folding/fixed.py +0 -73
- arekit-0.23.1/arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit-0.23.1/arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit-0.23.1/arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit-0.23.1/arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit-0.23.1/arekit/contrib/source/synonyms/utils.py +0 -19
- arekit-0.23.1/arekit/contrib/source/zip_utils.py +0 -47
- arekit-0.23.1/arekit/contrib/utils/bert/rows.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/bert/text_b_rus.py +0 -18
- arekit-0.23.1/arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit-0.23.1/arekit/contrib/utils/cv/doc_stat/base.py +0 -37
- arekit-0.23.1/arekit/contrib/utils/cv/doc_stat/sentence.py +0 -12
- arekit-0.23.1/arekit/contrib/utils/cv/splitters/base.py +0 -4
- arekit-0.23.1/arekit/contrib/utils/cv/splitters/default.py +0 -53
- arekit-0.23.1/arekit/contrib/utils/cv/splitters/statistical.py +0 -57
- arekit-0.23.1/arekit/contrib/utils/cv/two_class.py +0 -77
- arekit-0.23.1/arekit/contrib/utils/data/doc_ops/dict_based.py +0 -13
- arekit-0.23.1/arekit/contrib/utils/data/ext.py +0 -31
- arekit-0.23.1/arekit/contrib/utils/data/storages/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/data/views/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/data/views/linkages/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/data/views/linkages/base.py +0 -58
- arekit-0.23.1/arekit/contrib/utils/data/views/linkages/multilabel.py +0 -48
- arekit-0.23.1/arekit/contrib/utils/data/views/linkages/utils.py +0 -24
- arekit-0.23.1/arekit/contrib/utils/data/views/opinions.py +0 -14
- arekit-0.23.1/arekit/contrib/utils/data/writers/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/download.py +0 -78
- arekit-0.23.1/arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/entities/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/entities/formatters/str_rus_cased_fmt.py +0 -78
- arekit-0.23.1/arekit/contrib/utils/entities/formatters/str_rus_nocased_fmt.py +0 -15
- arekit-0.23.1/arekit/contrib/utils/entities/formatters/str_simple_fmt.py +0 -24
- arekit-0.23.1/arekit/contrib/utils/entities/formatters/str_simple_uppercase_fmt.py +0 -21
- arekit-0.23.1/arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/io_utils/embedding.py +0 -80
- arekit-0.23.1/arekit/contrib/utils/io_utils/opinions.py +0 -39
- arekit-0.23.1/arekit/contrib/utils/io_utils/samples.py +0 -78
- arekit-0.23.1/arekit/contrib/utils/io_utils/utils.py +0 -43
- arekit-0.23.1/arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/lexicons/lexicon.py +0 -43
- arekit-0.23.1/arekit/contrib/utils/lexicons/relation.py +0 -45
- arekit-0.23.1/arekit/contrib/utils/lexicons/rusentilex.py +0 -34
- arekit-0.23.1/arekit/contrib/utils/nn/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/nn/rows.py +0 -83
- arekit-0.23.1/arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/items/sampling/bert.py +0 -5
- arekit-0.23.1/arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit-0.23.1/arekit/contrib/utils/pipelines/items/to_output.py +0 -101
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py +0 -27
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -59
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/nerel_bio/doc_ops.py +0 -29
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -59
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/ruattitudes/doc_ops.py +0 -56
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -19
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -58
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/rusentrel/doc_ops.py +0 -21
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -100
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/sentinerel/doc_ops.py +0 -29
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -175
- arekit-0.23.1/arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit-0.23.1/arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit-0.23.1/arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/pipelines/text_opinion/filters/base.py +0 -4
- arekit-0.23.1/arekit/contrib/utils/processing/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/resources.py +0 -26
- arekit-0.23.1/arekit/contrib/utils/sources/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit-0.23.1/arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit-0.23.1/arekit/contrib/utils/utils_folding.py +0 -19
- arekit-0.23.1/arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit-0.23.1/arekit/download_data.py +0 -11
- arekit-0.23.1/arekit.egg-info/PKG-INFO +0 -19
- {arekit-0.23.1 → arekit-0.25.0}/LICENSE +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/bound.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/context/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/context/token.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/columns/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/columns/base.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/const.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/contents.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/instances/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/instances/multiple.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/instances/single.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/label/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/label/base.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/label/binary.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/label/multiple.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/rows/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/sample/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/text/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/providers/text/single.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/repositories/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/repositories/base.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/repositories/sample.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/data/input/terms_mapper.py +0 -0
- {arekit-0.23.1/arekit/common/data/row_ids → arekit-0.25.0/arekit/common/data/storages}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/data/storages → arekit-0.25.0/arekit/common/data/views}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/data/views → arekit-0.25.0/arekit/common/docs}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/entities → arekit-0.25.0/arekit/common/docs/parsed}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/experiment → arekit-0.25.0/arekit/common/docs/parsed/providers}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/news → arekit-0.25.0/arekit/common/docs}/parsed/term_position.py +0 -0
- {arekit-0.23.1/arekit/common/experiment/api → arekit-0.25.0/arekit/common/entities}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/entities/collection.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/entities/str_fmt.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/entities/types.py +0 -0
- {arekit-0.23.1/arekit/common/folding → arekit-0.25.0/arekit/common/experiment}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/frames → arekit-0.25.0/arekit/common/experiment/api}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/experiment/data_type.py +0 -0
- {arekit-0.23.1/arekit/common/frames/connotations → arekit-0.25.0/arekit/common/frames}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/frames/variants → arekit-0.25.0/arekit/common/frames/connotations}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/frames/connotations/descriptor.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/frames/connotations/provider.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/frames/text_variant.py +0 -0
- {arekit-0.23.1/arekit/common/labels → arekit-0.25.0/arekit/common/frames/variants}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/frames/variants/base.py +0 -0
- {arekit-0.23.1/arekit/common/labels/provider → arekit-0.25.0/arekit/common/labels}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/labels/base.py +0 -0
- {arekit-0.23.1/arekit/common/labels/scaler → arekit-0.25.0/arekit/common/labels/provider}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/labels/provider/base.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/labels/provider/constant.py +0 -0
- {arekit-0.23.1/arekit/common/linkage → arekit-0.25.0/arekit/common/labels/scaler}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/labels/scaler/base.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/labels/scaler/sentiment.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/labels/scaler/single.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/labels/str_fmt.py +0 -0
- {arekit-0.23.1/arekit/common/model → arekit-0.25.0/arekit/common/linkage}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/log_utils.py +0 -0
- {arekit-0.23.1/arekit/common/model/labeling → arekit-0.25.0/arekit/common/model}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/news → arekit-0.25.0/arekit/common/model/labeling}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/model/labeling/base.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/model/labeling/modes.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/model/labeling/single.py +0 -0
- {arekit-0.23.1/arekit/common/news/parsed → arekit-0.25.0/arekit/common/opinions}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/news/parsed/providers → arekit-0.25.0/arekit/common/opinions/annot}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/opinions → arekit-0.25.0/arekit/common/opinions/annot/algo}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/opinions/enums.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/opinions/provider.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/opinions/writer.py +0 -0
- {arekit-0.23.1/arekit/common/opinions/annot → arekit-0.25.0/arekit/common/pipeline}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/pipeline/conts.py +0 -0
- {arekit-0.23.1/arekit/common/opinions/annot/algo → arekit-0.25.0/arekit/common/pipeline/items}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/pipeline → arekit-0.25.0/arekit/common/service}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/pipeline/items → arekit-0.25.0/arekit/common/synonyms}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/synonyms/grouping.py +0 -0
- {arekit-0.23.1/arekit/common/synonyms → arekit-0.25.0/arekit/common/text}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/text/enums.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/text/parsed.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/common/text/stemmer.py +0 -0
- {arekit-0.23.1/arekit/common/text → arekit-0.25.0/arekit/common/text_opinions}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/text/partitioning → arekit-0.25.0/arekit/contrib}/__init__.py +0 -0
- {arekit-0.23.1/arekit/common/text_opinions → arekit-0.25.0/arekit/contrib/bert}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib → arekit-0.25.0/arekit/contrib/bert/input}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/bert → arekit-0.25.0/arekit/contrib/bert/input/providers}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/bert/input/providers/cropped_sample.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/bert/input/providers/text_pair.py +0 -0
- {arekit-0.23.1/arekit/contrib/bert/input → arekit-0.25.0/arekit/contrib/bert/terms}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/bert/terms/mapper.py +0 -0
- {arekit-0.23.1/arekit/contrib/bert/input/providers → arekit-0.25.0/arekit/contrib/networks}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/bert/terms → arekit-0.25.0/arekit/contrib/networks/input}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/input/ctx_serialization.py +0 -0
- {arekit-0.23.1/arekit/contrib/networks → arekit-0.25.0/arekit/contrib/networks/input/embedding}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/input/embedding/matrix.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/input/embedding/offsets.py +0 -0
- {arekit-0.23.1/arekit/contrib/networks/input → arekit-0.25.0/arekit/contrib/networks/input/formatters}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/input/formatters/pos_mapper.py +0 -0
- {arekit-0.23.1/arekit/contrib/networks/input/embedding → arekit-0.25.0/arekit/contrib/networks/input/providers}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/input/providers/term_connotation.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/input/providers/text.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/input/term_types.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/input/terms_mapping.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/networks/vectorizer.py +0 -0
- {arekit-0.23.1/arekit/contrib/networks/input/formatters → arekit-0.25.0/arekit/contrib/prompt}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/networks/input/providers → arekit-0.25.0/arekit/contrib/utils}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/prompt → arekit-0.25.0/arekit/contrib/utils/bert}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/bert/samplers.py +0 -0
- {arekit-0.23.1/arekit/contrib/source → arekit-0.25.0/arekit/contrib/utils/data}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/brat → arekit-0.25.0/arekit/contrib/utils/data/contents}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/brat/entities → arekit-0.25.0/arekit/contrib/utils/data/doc_provider}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/brat/opinions → arekit-0.25.0/arekit/contrib/utils/data/readers}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/nerel → arekit-0.25.0/arekit/contrib/utils/data/service}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/nerel/folding → arekit-0.25.0/arekit/contrib/utils/data/storages}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/data/storages/jsonl_based.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/nerelbio → arekit-0.25.0/arekit/contrib/utils/data/writers}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/ruattitudes → arekit-0.25.0/arekit/contrib/utils/embeddings}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/embeddings/rusvectores.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/embeddings/tokens.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/ruattitudes/entity → arekit-0.25.0/arekit/contrib/utils/entities}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/entities/filter.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/ruattitudes/opinions → arekit-0.25.0/arekit/contrib/utils/entities/formatters}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/entities/formatters/str_display.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/rusentiframes → arekit-0.25.0/arekit/contrib/utils/io_utils}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/rusentrel → arekit-0.25.0/arekit/contrib/utils/np_utils}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/np_utils/embedding.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/np_utils/npz_utils.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/np_utils/vocab.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/rusentrel/opinions → arekit-0.25.0/arekit/contrib/utils/pipelines}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/sentinerel → arekit-0.25.0/arekit/contrib/utils/pipelines/items}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/sentinerel/folding → arekit-0.25.0/arekit/contrib/utils/pipelines/items/sampling}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/source/synonyms → arekit-0.25.0/arekit/contrib/utils/pipelines/items/text}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/utils → arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/utils/bert → arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/annot}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/utils/connotations → arekit-0.25.0/arekit/contrib/utils/pipelines/text_opinion/filters}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/utils/cv → arekit-0.25.0/arekit/contrib/utils/processing}/__init__.py +0 -0
- {arekit-0.23.1/arekit/contrib/utils/cv/doc_stat → arekit-0.25.0/arekit/contrib/utils/processing/languages}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/languages/mods.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/languages/pos.py +0 -0
- {arekit-0.23.1/arekit/contrib/utils/cv/splitters → arekit-0.25.0/arekit/contrib/utils/processing/languages/ru}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/languages/ru/cases.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/languages/ru/constants.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/languages/ru/mods.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/languages/ru/number.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -0
- {arekit-0.23.1/arekit/contrib/utils/data → arekit-0.25.0/arekit/contrib/utils/processing/lemmatization}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/lemmatization/mystem.py +0 -0
- {arekit-0.23.1/arekit/contrib/utils/data/contents → arekit-0.25.0/arekit/contrib/utils/processing/pos}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/pos/base.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/pos/russian.py +0 -0
- {arekit-0.23.1/arekit/contrib/utils/data/doc_ops → arekit-0.25.0/arekit/contrib/utils/processing/text}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/processing/text/tokens.py +0 -0
- {arekit-0.23.1/arekit/contrib/utils/data/readers → arekit-0.25.0/arekit/contrib/utils/synonyms}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/synonyms/simple.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/synonyms/stemmer_based.py +0 -0
- {arekit-0.23.1/arekit/contrib/utils/data/service → arekit-0.25.0/arekit/contrib/utils/vectorizers}/__init__.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/vectorizers/bpe.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit/contrib/utils/vectorizers/random_norm.py +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit.egg-info/dependency_links.txt +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit.egg-info/requires.txt +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/arekit.egg-info/top_level.txt +0 -0
- {arekit-0.23.1 → arekit-0.25.0}/setup.cfg +0 -0
arekit-0.25.0/PKG-INFO
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: arekit
|
|
3
|
+
Version: 0.25.0
|
|
4
|
+
Summary: Document level Attitude and Relation Extraction toolkit (AREkit) for sampling and prompting mass-media news into datasets for ML-model training
|
|
5
|
+
Home-page: https://github.com/nicolay-r/AREkit
|
|
6
|
+
Author: Nicolay Rusnachenko
|
|
7
|
+
Author-email: rusnicolay@gmail.com
|
|
8
|
+
License: MIT License
|
|
9
|
+
Keywords: natural language processing,relation extraction,sentiment analysis
|
|
10
|
+
Classifier: Programming Language :: Python
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
12
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
14
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
15
|
+
Requires-Python: >=3.6
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: tqdm
|
|
19
|
+
Requires-Dist: enum34==1.1.10
|
|
20
|
+
Requires-Dist: numpy>=1.14.5
|
|
21
|
+
Requires-Dist: pymystem3==0.2.0
|
|
22
|
+
|
|
23
|
+
# AREkit 0.25.0
|
|
24
|
+
|
|
25
|
+

|
|
26
|
+
|
|
27
|
+
<p align="center">
|
|
28
|
+
<img src="logo.png"/>
|
|
29
|
+
</p>
|
|
30
|
+
|
|
31
|
+
**AREkit** (Attitude and Relation Extraction Toolkit) --
|
|
32
|
+
is a Python toolkit devoted to document-level Attitude and Relation Extraction between text objects in mass-media news.
|
|
33
|
+
|
|
34
|
+
## Description
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
This toolkit aims at memory-efficient data processing in Relation Extraction (RE) related tasks.
|
|
38
|
+
|
|
39
|
+
<p align="center">
|
|
40
|
+
<img src="docs/arekit-pipeline-concept.png"/>
|
|
41
|
+
</p>
|
|
42
|
+
|
|
43
|
+
> Figure: AREkit pipelines design. More on
|
|
44
|
+
> **[ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction](https://link.springer.com/chapter/10.1007/978-3-031-56069-9_23)** paper
|
|
45
|
+
|
|
46
|
+
In particular, this framework serves the following features:
|
|
47
|
+
* ➿ [pipelines](https://github.com/nicolay-r/AREkit/wiki/Pipelines:-Text-Opinion-Annotation) and iterators for handling large-scale collections serialization without out-of-memory issues.
|
|
48
|
+
* 🔗 EL (entity-linking) API support for objects,
|
|
49
|
+
* ➰ avoidance of cyclic connections,
|
|
50
|
+
* :straight_ruler: distance consideration between relation participants (in `terms` or `sentences`),
|
|
51
|
+
* 📑 relations annotations and filtering rules,
|
|
52
|
+
* *️⃣ entities formatting or masking, and more.
|
|
53
|
+
|
|
54
|
+
The core functionality includes:
|
|
55
|
+
* API for document presentation with EL (Entity Linking, i.e. Object Synonymy) support
|
|
56
|
+
for sentence level relations preparation (dubbed as contexts);
|
|
57
|
+
* API for contexts extraction;
|
|
58
|
+
* Relations transferring from sentence-level onto document-level, and more.
|
|
59
|
+
|
|
60
|
+
## Installation
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install git+https://github.com/nicolay-r/AREkit.git@0.25.0-rc
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Usage
|
|
67
|
+
|
|
68
|
+
Please follow the **[tutorial section on project Wiki](https://github.com/nicolay-r/AREkit/wiki/Tutorials)** for more details.
|
|
69
|
+
|
|
70
|
+
## How to cite
|
|
71
|
+
Great research is always accompanied by faithful references.
|
|
72
|
+
If you use or extend our work, please cite as follows:
|
|
73
|
+
|
|
74
|
+
```bibtex
|
|
75
|
+
@inproceedings{rusnachenko2024arelight,
|
|
76
|
+
title={ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction},
|
|
77
|
+
author={Rusnachenko, Nicolay and Liang, Huizhi and Kolomeets, Maxim and Shi, Lei},
|
|
78
|
+
booktitle={European Conference on Information Retrieval},
|
|
79
|
+
year={2024},
|
|
80
|
+
organization={Springer}
|
|
81
|
+
}
|
|
82
|
+
```
|
arekit-0.25.0/README.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# AREkit 0.25.0
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<img src="logo.png"/>
|
|
7
|
+
</p>
|
|
8
|
+
|
|
9
|
+
**AREkit** (Attitude and Relation Extraction Toolkit) --
|
|
10
|
+
is a Python toolkit devoted to document-level Attitude and Relation Extraction between text objects in mass-media news.
|
|
11
|
+
|
|
12
|
+
## Description
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
This toolkit aims at memory-efficient data processing in Relation Extraction (RE) related tasks.
|
|
16
|
+
|
|
17
|
+
<p align="center">
|
|
18
|
+
<img src="docs/arekit-pipeline-concept.png"/>
|
|
19
|
+
</p>
|
|
20
|
+
|
|
21
|
+
> Figure: AREkit pipelines design. More on
|
|
22
|
+
> **[ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction](https://link.springer.com/chapter/10.1007/978-3-031-56069-9_23)** paper
|
|
23
|
+
|
|
24
|
+
In particular, this framework serves the following features:
|
|
25
|
+
* ➿ [pipelines](https://github.com/nicolay-r/AREkit/wiki/Pipelines:-Text-Opinion-Annotation) and iterators for handling large-scale collections serialization without out-of-memory issues.
|
|
26
|
+
* 🔗 EL (entity-linking) API support for objects,
|
|
27
|
+
* ➰ avoidance of cyclic connections,
|
|
28
|
+
* :straight_ruler: distance consideration between relation participants (in `terms` or `sentences`),
|
|
29
|
+
* 📑 relations annotations and filtering rules,
|
|
30
|
+
* *️⃣ entities formatting or masking, and more.
|
|
31
|
+
|
|
32
|
+
The core functionality includes:
|
|
33
|
+
* API for document presentation with EL (Entity Linking, i.e. Object Synonymy) support
|
|
34
|
+
for sentence level relations preparation (dubbed as contexts);
|
|
35
|
+
* API for contexts extraction;
|
|
36
|
+
* Relations transferring from sentence-level onto document-level, and more.
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install git+https://github.com/nicolay-r/AREkit.git@0.25.0-rc
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Usage
|
|
45
|
+
|
|
46
|
+
Please follow the **[tutorial section on project Wiki](https://github.com/nicolay-r/AREkit/wiki/Tutorials)** for more details.
|
|
47
|
+
|
|
48
|
+
## How to cite
|
|
49
|
+
Great research is always accompanied by faithful references.
|
|
50
|
+
If you use or extend our work, please cite as follows:
|
|
51
|
+
|
|
52
|
+
```bibtex
|
|
53
|
+
@inproceedings{rusnachenko2024arelight,
|
|
54
|
+
title={ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction},
|
|
55
|
+
author={Rusnachenko, Nicolay and Liang, Huizhi and Kolomeets, Maxim and Shi, Lei},
|
|
56
|
+
booktitle={European Conference on Information Retrieval},
|
|
57
|
+
year={2024},
|
|
58
|
+
organization={Springer}
|
|
59
|
+
}
|
|
60
|
+
```
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import
|
|
1
|
+
from collections.abc import Iterable
|
|
2
2
|
|
|
3
3
|
from arekit.common.context.token import Token
|
|
4
4
|
from arekit.common.entities.base import Entity
|
|
@@ -10,7 +10,7 @@ class TextTermsMapper(object):
|
|
|
10
10
|
def iter_mapped(self, terms):
|
|
11
11
|
""" Performs mapping operation of each terms in a sequence
|
|
12
12
|
"""
|
|
13
|
-
assert(isinstance(terms,
|
|
13
|
+
assert(isinstance(terms, Iterable))
|
|
14
14
|
|
|
15
15
|
self._before_mapping()
|
|
16
16
|
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
ID = 'id'
|
|
2
2
|
DOC_ID = 'doc_id'
|
|
3
3
|
TEXT = 'text_a'
|
|
4
|
-
|
|
4
|
+
LABEL_UINT = 'label_uint'
|
|
5
|
+
LABEL_STR = 'label_str'
|
|
5
6
|
|
|
6
|
-
#
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
# Global identifier of the opinion in the sampled data.
|
|
8
|
+
OPINION_ID = "opinion_id"
|
|
9
|
+
OPINION_LINKAGE_ID = "linkage_id"
|
|
9
10
|
|
|
10
11
|
# Corresponds to fields with attitude ends. (indices, INT)
|
|
11
12
|
S_IND = 's_ind'
|
|
@@ -37,7 +37,8 @@ class SampleColumnsProvider(BaseColumnsProvider):
|
|
|
37
37
|
|
|
38
38
|
# insert labels
|
|
39
39
|
if self.__store_labels:
|
|
40
|
-
dtypes_list.append((const.
|
|
40
|
+
dtypes_list.append((const.LABEL_UINT, 'int32'))
|
|
41
|
+
dtypes_list.append((const.LABEL_STR, str))
|
|
41
42
|
|
|
42
43
|
# insert text columns
|
|
43
44
|
for col_name in self.__text_column_names:
|
|
@@ -47,6 +48,10 @@ class SampleColumnsProvider(BaseColumnsProvider):
|
|
|
47
48
|
dtypes_list.append((const.S_IND, 'int32'))
|
|
48
49
|
dtypes_list.append((const.T_IND, 'int32'))
|
|
49
50
|
|
|
51
|
+
# opinion-extraction task related fields
|
|
52
|
+
dtypes_list.append((const.OPINION_ID, 'int32'))
|
|
53
|
+
dtypes_list.append((const.OPINION_LINKAGE_ID, 'int32'))
|
|
54
|
+
|
|
50
55
|
return dtypes_list
|
|
51
56
|
|
|
52
57
|
def set_text_column_names(self, text_column_names):
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from arekit.common.data.input.providers.contents import ContentsProvider
|
|
6
|
+
from arekit.common.linkage.base import LinkedDataWrapper
|
|
7
|
+
from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider
|
|
8
|
+
from arekit.common.linkage.meta import MetaEmptyLinkedDataWrapper
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BaseRowProvider(object):
|
|
14
|
+
""" Base provider for rows that are supposed to be filled into BaseRowsStorage.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
def __init__(self):
|
|
18
|
+
self.__rows_counter = None
|
|
19
|
+
|
|
20
|
+
# region protected methods
|
|
21
|
+
|
|
22
|
+
# TODO. This might be also generalized.
|
|
23
|
+
# TODO. Idle-mode is also an implementation- and task-specific parameter, i.e. it might be removed from here.
|
|
24
|
+
def _provide_rows(self, parsed_doc, entity_service, text_opinion_linkage, idle_mode):
|
|
25
|
+
raise NotImplementedError()
|
|
26
|
+
|
|
27
|
+
def _count_row(self):
|
|
28
|
+
index = self.__rows_counter["rows_iterated"]
|
|
29
|
+
self.__rows_counter["rows_iterated"] += 1
|
|
30
|
+
return index
|
|
31
|
+
|
|
32
|
+
# endregion
|
|
33
|
+
|
|
34
|
+
def __iter_rows(self, linked_data, idle_mode):
|
|
35
|
+
parsed_doc_service = linked_data.Tag
|
|
36
|
+
return self._provide_rows(parsed_doc=parsed_doc_service.ParsedDocument,
|
|
37
|
+
entity_service=parsed_doc_service.get_provider(EntityServiceProvider.NAME),
|
|
38
|
+
text_opinion_linkage=linked_data,
|
|
39
|
+
idle_mode=idle_mode)
|
|
40
|
+
|
|
41
|
+
def iter_by_rows(self, contents_provider, doc_ids_iter, idle_mode):
|
|
42
|
+
assert(isinstance(contents_provider, ContentsProvider))
|
|
43
|
+
assert(isinstance(doc_ids_iter, Iterable))
|
|
44
|
+
|
|
45
|
+
self.__rows_counter = Counter()
|
|
46
|
+
|
|
47
|
+
for linked_data in contents_provider.from_doc_ids(doc_ids=doc_ids_iter, idle_mode=idle_mode):
|
|
48
|
+
assert(isinstance(linked_data, LinkedDataWrapper))
|
|
49
|
+
|
|
50
|
+
if isinstance(linked_data, MetaEmptyLinkedDataWrapper):
|
|
51
|
+
if idle_mode:
|
|
52
|
+
# In the case of the IDLE mode we do not consider the meta-data.
|
|
53
|
+
data_it = []
|
|
54
|
+
else:
|
|
55
|
+
# Consider the actual linked data instance.
|
|
56
|
+
data_it = [linked_data]
|
|
57
|
+
else:
|
|
58
|
+
# Consider the actual rows of the related linked data.
|
|
59
|
+
data_it = self.__iter_rows(linked_data=linked_data, idle_mode=idle_mode)
|
|
60
|
+
|
|
61
|
+
for data in data_it:
|
|
62
|
+
yield linked_data.RelatedDocID, data
|
|
63
|
+
|
|
64
|
+
self.__rows_counter = None
|
|
@@ -8,15 +8,14 @@ from arekit.common.data.input.providers.label.binary import BinaryLabelProvider
|
|
|
8
8
|
from arekit.common.data.input.providers.label.multiple import MultipleLabelProvider
|
|
9
9
|
from arekit.common.data.input.providers.rows.base import BaseRowProvider
|
|
10
10
|
from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
|
|
11
|
-
from arekit.common.data.
|
|
12
|
-
from arekit.common.data.row_ids.multiple import MultipleIDProvider
|
|
11
|
+
from arekit.common.data.rows_fmt import create_base_column_fmt
|
|
13
12
|
from arekit.common.entities.base import Entity
|
|
14
13
|
from arekit.common.labels.base import Label
|
|
15
14
|
|
|
16
15
|
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
|
|
17
|
-
from arekit.common.
|
|
18
|
-
from arekit.common.
|
|
19
|
-
from arekit.common.
|
|
16
|
+
from arekit.common.docs.parsed.base import ParsedDocument
|
|
17
|
+
from arekit.common.docs.parsed.providers.entity_service import EntityEndType, EntityServiceProvider
|
|
18
|
+
from arekit.common.docs.parsed.term_position import TermPositionTypes
|
|
20
19
|
from arekit.common.text_opinions.base import TextOpinion
|
|
21
20
|
|
|
22
21
|
|
|
@@ -34,9 +33,9 @@ class BaseSampleRowProvider(BaseRowProvider):
|
|
|
34
33
|
|
|
35
34
|
self._label_provider = label_provider
|
|
36
35
|
self.__text_provider = text_provider
|
|
37
|
-
self.__row_ids_provider = self.__create_row_ids_provider(label_provider)
|
|
38
36
|
self.__instances_provider = self.__create_instances_provider(label_provider)
|
|
39
37
|
self.__store_labels = None
|
|
38
|
+
self._val_fmt = create_base_column_fmt(fmt_type="writer")
|
|
40
39
|
|
|
41
40
|
# region properties
|
|
42
41
|
|
|
@@ -52,56 +51,67 @@ class BaseSampleRowProvider(BaseRowProvider):
|
|
|
52
51
|
|
|
53
52
|
# region protected methods
|
|
54
53
|
|
|
55
|
-
def _provide_sentence_terms(self,
|
|
56
|
-
terms_iter =
|
|
54
|
+
def _provide_sentence_terms(self, parsed_doc, sentence_ind, s_ind, t_ind):
|
|
55
|
+
terms_iter = parsed_doc.iter_sentence_terms(sentence_index=sentence_ind, return_id=False)
|
|
57
56
|
return list(terms_iter), s_ind, t_ind
|
|
58
57
|
|
|
59
58
|
# TODO. This is a very task-specific description, too many data provided.
|
|
60
59
|
# TODO. Switch this API to dict of params
|
|
61
60
|
def _fill_row_core(self, row, text_opinion_linkage, index_in_linked, etalon_label,
|
|
62
|
-
|
|
61
|
+
parsed_doc, sentence_ind, s_ind, t_ind):
|
|
63
62
|
assert(isinstance(self.__store_labels, bool))
|
|
64
63
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
row[const.ID] = self.__row_ids_provider.create_sample_id(
|
|
69
|
-
linked_opinions=text_opinion_linkage,
|
|
70
|
-
index_in_linked=index_in_linked,
|
|
71
|
-
label_scaler=self._label_provider.LabelScaler)
|
|
64
|
+
sentence_terms, actual_s_ind, actual_t_ind = self._provide_sentence_terms(
|
|
65
|
+
parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
|
|
72
66
|
|
|
73
|
-
|
|
67
|
+
# Entity indices from the related context.
|
|
68
|
+
entities = list(filter(lambda term: isinstance(term, Entity), sentence_terms))
|
|
74
69
|
|
|
75
|
-
|
|
70
|
+
# Values mapping.
|
|
71
|
+
vm = {
|
|
72
|
+
const.ID: self._count_row(),
|
|
73
|
+
const.OPINION_ID: text_opinion_linkage.First.TextOpinionID,
|
|
74
|
+
const.OPINION_LINKAGE_ID: index_in_linked,
|
|
75
|
+
const.DOC_ID: text_opinion_linkage.First.DocID,
|
|
76
|
+
const.SENT_IND: sentence_ind,
|
|
77
|
+
const.ENTITY_VALUES: entities,
|
|
78
|
+
const.ENTITY_TYPES: entities,
|
|
79
|
+
const.ENTITIES: [str(i) for i, t in enumerate(sentence_terms) if isinstance(t, Entity)],
|
|
80
|
+
const.S_IND: actual_s_ind,
|
|
81
|
+
const.T_IND: actual_t_ind,
|
|
82
|
+
const.LABEL_UINT: None,
|
|
83
|
+
const.LABEL_STR: None
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
# Compose text value.
|
|
87
|
+
def __assign_value(column, value):
|
|
88
|
+
vm[column] = value
|
|
76
89
|
|
|
77
90
|
expected_label = text_opinion_linkage.get_linked_label()
|
|
78
91
|
|
|
79
|
-
if self.__store_labels:
|
|
80
|
-
row[const.LABEL] = self._label_provider.calculate_output_uint_label(
|
|
81
|
-
expected_uint_label=self._label_provider.LabelScaler.label_to_uint(expected_label),
|
|
82
|
-
etalon_uint_label=self._label_provider.LabelScaler.label_to_uint(etalon_label))
|
|
83
|
-
|
|
84
|
-
sentence_terms, actual_s_ind, actual_t_ind = self._provide_sentence_terms(
|
|
85
|
-
parsed_news=parsed_news, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
|
|
86
|
-
|
|
87
92
|
self.__text_provider.add_text_in_row(
|
|
88
|
-
set_text_func=
|
|
89
|
-
|
|
90
|
-
s_ind=actual_s_ind,
|
|
91
|
-
t_ind=actual_t_ind,
|
|
93
|
+
set_text_func=__assign_value, sentence_terms=sentence_terms,
|
|
94
|
+
s_ind=actual_s_ind, t_ind=actual_t_ind,
|
|
92
95
|
expected_label=expected_label)
|
|
93
96
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
97
|
+
if self.__store_labels:
|
|
98
|
+
l2i = self._label_provider.LabelScaler.label_to_uint
|
|
99
|
+
ui2l = self._label_provider.LabelScaler.uint_to_label
|
|
100
|
+
uint_label = self._label_provider.calculate_output_uint_label(
|
|
101
|
+
expected_uint_label=l2i(expected_label), etalon_uint_label=l2i(etalon_label))
|
|
102
|
+
vm[const.LABEL_UINT] = uint_label
|
|
103
|
+
vm[const.LABEL_STR] = type(ui2l(uint_label)).__name__
|
|
100
104
|
|
|
101
|
-
row
|
|
102
|
-
|
|
105
|
+
self._apply_row_data(row=row, vm=vm, val_fmt=self._val_fmt)
|
|
106
|
+
|
|
107
|
+
@staticmethod
|
|
108
|
+
def _apply_row_data(row, vm, val_fmt):
|
|
109
|
+
for k, v in vm.items():
|
|
110
|
+
if v is None:
|
|
111
|
+
continue
|
|
112
|
+
row[k] = v if k not in val_fmt else val_fmt[k](v)
|
|
103
113
|
|
|
104
|
-
def _provide_rows(self,
|
|
114
|
+
def _provide_rows(self, parsed_doc, entity_service, text_opinion_linkage, idle_mode):
|
|
105
115
|
assert(isinstance(idle_mode, bool))
|
|
106
116
|
|
|
107
117
|
row_dict = OrderedDict()
|
|
@@ -109,7 +119,7 @@ class BaseSampleRowProvider(BaseRowProvider):
|
|
|
109
119
|
for index_in_linked in range(len(text_opinion_linkage)):
|
|
110
120
|
|
|
111
121
|
rows_it = self.__provide_rows(
|
|
112
|
-
|
|
122
|
+
parsed_doc=parsed_doc,
|
|
113
123
|
entity_service=entity_service,
|
|
114
124
|
row_dict=row_dict,
|
|
115
125
|
text_opinion_linkage=text_opinion_linkage,
|
|
@@ -123,36 +133,28 @@ class BaseSampleRowProvider(BaseRowProvider):
|
|
|
123
133
|
|
|
124
134
|
# region private methods
|
|
125
135
|
|
|
126
|
-
@staticmethod
|
|
127
|
-
def __create_row_ids_provider(label_provider):
|
|
128
|
-
# TODO. #376 related. This should be removed after refactoring, because
|
|
129
|
-
# TODO. we consider an ordinary IDs, that not based on the other data.
|
|
130
|
-
if isinstance(label_provider, BinaryLabelProvider):
|
|
131
|
-
return BinaryIDProvider()
|
|
132
|
-
if isinstance(label_provider, MultipleLabelProvider):
|
|
133
|
-
return MultipleIDProvider()
|
|
134
|
-
|
|
135
136
|
@staticmethod
|
|
136
137
|
def __create_instances_provider(label_provider):
|
|
137
|
-
# TODO. #473 related:
|
|
138
|
+
# TODO. #473 related: these label providers are based on text opinion extraction task!
|
|
138
139
|
if isinstance(label_provider, BinaryLabelProvider):
|
|
139
140
|
return MultipleInstancesLinkedTextOpinionsProvider(label_provider.SupportedLabels)
|
|
140
141
|
if isinstance(label_provider, MultipleLabelProvider):
|
|
141
142
|
return SingleInstanceLinkedDataProvider()
|
|
142
143
|
|
|
143
|
-
def __provide_rows(self, row_dict,
|
|
144
|
+
def __provide_rows(self, row_dict, parsed_doc, entity_service,
|
|
144
145
|
text_opinion_linkage, index_in_linked, idle_mode):
|
|
145
146
|
"""
|
|
146
147
|
Providing Rows depending on row_id_formatter type
|
|
147
148
|
"""
|
|
148
|
-
assert(isinstance(
|
|
149
|
+
assert(isinstance(parsed_doc, ParsedDocument))
|
|
149
150
|
assert(isinstance(row_dict, OrderedDict))
|
|
150
151
|
assert(isinstance(text_opinion_linkage, TextOpinionsLinkage))
|
|
151
152
|
|
|
152
153
|
etalon_label = self.__instances_provider.provide_label(text_opinion_linkage)
|
|
153
154
|
for instance in self.__instances_provider.iter_instances(text_opinion_linkage):
|
|
154
155
|
yield self.__create_row(row=row_dict,
|
|
155
|
-
|
|
156
|
+
row_id=0,
|
|
157
|
+
parsed_doc=parsed_doc,
|
|
156
158
|
entity_service=entity_service,
|
|
157
159
|
text_opinions_linkage=instance,
|
|
158
160
|
index_in_linked=index_in_linked,
|
|
@@ -160,7 +162,7 @@ class BaseSampleRowProvider(BaseRowProvider):
|
|
|
160
162
|
etalon_label=etalon_label,
|
|
161
163
|
idle_mode=idle_mode)
|
|
162
164
|
|
|
163
|
-
def __create_row(self, row,
|
|
165
|
+
def __create_row(self, row, row_id, parsed_doc, entity_service, text_opinions_linkage,
|
|
164
166
|
index_in_linked, etalon_label, idle_mode):
|
|
165
167
|
"""
|
|
166
168
|
Composing row in following format:
|
|
@@ -196,7 +198,7 @@ class BaseSampleRowProvider(BaseRowProvider):
|
|
|
196
198
|
raise Exception("Limitation: Multi-Sentence text_opinions are not supported.")
|
|
197
199
|
|
|
198
200
|
self._fill_row_core(row=row,
|
|
199
|
-
|
|
201
|
+
parsed_doc=parsed_doc,
|
|
200
202
|
sentence_ind=source_s_ind,
|
|
201
203
|
text_opinion_linkage=text_opinions_linkage,
|
|
202
204
|
index_in_linked=index_in_linked,
|
|
@@ -34,9 +34,9 @@ class CroppedSampleRowProvider(BaseSampleRowProvider):
|
|
|
34
34
|
|
|
35
35
|
return _from, _to
|
|
36
36
|
|
|
37
|
-
def _provide_sentence_terms(self,
|
|
37
|
+
def _provide_sentence_terms(self, parsed_doc, sentence_ind, s_ind, t_ind):
|
|
38
38
|
terms_iter, src_ind, tgt_ind = super(CroppedSampleRowProvider, self)._provide_sentence_terms(
|
|
39
|
-
|
|
39
|
+
parsed_doc=parsed_doc, sentence_ind=sentence_ind, s_ind=s_ind, t_ind=t_ind)
|
|
40
40
|
terms = list(terms_iter)
|
|
41
41
|
_from, _to = self.__calc_window_bounds(window_size=self.__crop_window_size,
|
|
42
42
|
s_ind=s_ind, t_ind=t_ind, input_length=len(terms))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from collections import OrderedDict
|
|
2
2
|
|
|
3
|
-
from arekit.common.
|
|
3
|
+
from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider, DistanceType
|
|
4
4
|
from arekit.common.text_opinions.base import TextOpinion
|
|
5
5
|
|
|
6
6
|
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from arekit.common.data import const
|
|
2
|
+
from arekit.common.utils import filter_whitespaces, split_by_whitespaces
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def process_values_list(value, args_sep):
|
|
6
|
+
return value.split(args_sep)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def process_indices_list(value, no_value_func, args_sep):
|
|
10
|
+
return no_value_func() if not value else [int(v) for v in str(value).split(args_sep)]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def process_text(value):
|
|
14
|
+
""" The core method of the input text processing.
|
|
15
|
+
"""
|
|
16
|
+
assert(isinstance(value, str) or isinstance(value, list))
|
|
17
|
+
return filter_whitespaces([term for term in split_by_whitespaces(value)]
|
|
18
|
+
if isinstance(value, str) else value)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def create_base_column_value_fmt(no_value_func=lambda: None, args_sep=","):
|
|
22
|
+
|
|
23
|
+
self_func = lambda value: value
|
|
24
|
+
|
|
25
|
+
return {
|
|
26
|
+
const.ID: {
|
|
27
|
+
"writer": self_func,
|
|
28
|
+
"parser": self_func
|
|
29
|
+
},
|
|
30
|
+
const.DOC_ID: {
|
|
31
|
+
"writer": self_func,
|
|
32
|
+
"parser": self_func,
|
|
33
|
+
},
|
|
34
|
+
const.S_IND: {
|
|
35
|
+
"writer": self_func,
|
|
36
|
+
"parser": lambda value: int(value)
|
|
37
|
+
},
|
|
38
|
+
const.T_IND: {
|
|
39
|
+
"writer": self_func,
|
|
40
|
+
"parser": lambda value: int(value)
|
|
41
|
+
},
|
|
42
|
+
const.SENT_IND: {
|
|
43
|
+
"writer": self_func,
|
|
44
|
+
"parser": lambda value: int(value)
|
|
45
|
+
},
|
|
46
|
+
const.OPINION_ID: {
|
|
47
|
+
"writer": self_func,
|
|
48
|
+
"parser": lambda value: int(value)
|
|
49
|
+
},
|
|
50
|
+
const.OPINION_LINKAGE_ID: {
|
|
51
|
+
"writer": self_func,
|
|
52
|
+
"parser": lambda value: int(value)
|
|
53
|
+
},
|
|
54
|
+
const.ENTITY_VALUES: {
|
|
55
|
+
"writer": lambda entities: args_sep.join([e.DisplayValue.replace(args_sep, '') for e in entities]),
|
|
56
|
+
"parser": lambda value: process_values_list(value, args_sep=args_sep),
|
|
57
|
+
},
|
|
58
|
+
const.ENTITY_TYPES: {
|
|
59
|
+
"writer": lambda entities: args_sep.join([e.Type.replace(args_sep, '') for e in entities]),
|
|
60
|
+
"parser": lambda value: process_values_list(value, args_sep=args_sep)
|
|
61
|
+
},
|
|
62
|
+
const.ENTITIES: {
|
|
63
|
+
"writer": lambda entity_inds: args_sep.join(entity_inds),
|
|
64
|
+
"parser": lambda value: process_indices_list(value, no_value_func=no_value_func, args_sep=args_sep)
|
|
65
|
+
},
|
|
66
|
+
const.TEXT: {
|
|
67
|
+
"writer": self_func,
|
|
68
|
+
"parser": lambda value: process_text(value)
|
|
69
|
+
},
|
|
70
|
+
const.LABEL_UINT: {
|
|
71
|
+
"writer": self_func,
|
|
72
|
+
"parser": lambda value: int(value)
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def create_base_column_fmt(fmt_type, args_sep=","):
|
|
78
|
+
assert(isinstance(fmt_type, str))
|
|
79
|
+
d = create_base_column_value_fmt(args_sep=args_sep)
|
|
80
|
+
for k, v in d.items():
|
|
81
|
+
d[k] = v[fmt_type]
|
|
82
|
+
return d
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
class ParsedSampleRow(object):
|
|
2
|
+
""" Provides a parsed information for a sample row.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
def __init__(self, row, columns_fmts, no_value_func):
|
|
6
|
+
""" row: dict
|
|
7
|
+
dict of the pairs ("field_name", value)
|
|
8
|
+
columns_fmts: list
|
|
9
|
+
list of the formatters, where every formatter represent a dictionary.
|
|
10
|
+
no_value_func: func
|
|
11
|
+
a function returning the default value that conveys the absence of the parameter-value.
|
|
12
|
+
"""
|
|
13
|
+
assert(isinstance(row, dict))
|
|
14
|
+
assert(isinstance(columns_fmts, list))
|
|
15
|
+
assert(callable(no_value_func))
|
|
16
|
+
|
|
17
|
+
self.__uint_label = None
|
|
18
|
+
self.__params = {}
|
|
19
|
+
self.__no_value = no_value_func
|
|
20
|
+
|
|
21
|
+
for key, value in row.items():
|
|
22
|
+
|
|
23
|
+
for columns_fmt in columns_fmts:
|
|
24
|
+
assert(isinstance(columns_fmt, dict))
|
|
25
|
+
|
|
26
|
+
if key not in columns_fmt:
|
|
27
|
+
continue
|
|
28
|
+
|
|
29
|
+
self.__params[key] = columns_fmt[key](value)
|
|
30
|
+
break
|
|
31
|
+
|
|
32
|
+
def __value_or_none(self, key):
|
|
33
|
+
return self.__params[key] if key in self.__params else self.__no_value()
|
|
34
|
+
|
|
35
|
+
def __getitem__(self, item):
|
|
36
|
+
assert (isinstance(item, str) or item is None)
|
|
37
|
+
if item not in self.__params:
|
|
38
|
+
return self.__no_value()
|
|
39
|
+
return self.__params[item] if item is not None else self.__no_value()
|
|
40
|
+
|
|
41
|
+
@classmethod
|
|
42
|
+
def parse(cls, row, columns_fmts, no_value_func):
|
|
43
|
+
return cls(row=row, columns_fmts=columns_fmts, no_value_func=no_value_func)
|