arekit 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/__init__.py +0 -0
- arekit/common/__init__.py +0 -0
- arekit/common/bound.py +48 -0
- arekit/common/context/__init__.py +0 -0
- arekit/common/context/terms_mapper.py +51 -0
- arekit/common/context/token.py +16 -0
- arekit/common/data/__init__.py +0 -0
- arekit/common/data/const.py +21 -0
- arekit/common/data/doc_provider.py +6 -0
- arekit/common/data/input/__init__.py +0 -0
- arekit/common/data/input/providers/__init__.py +0 -0
- arekit/common/data/input/providers/columns/__init__.py +0 -0
- arekit/common/data/input/providers/columns/base.py +9 -0
- arekit/common/data/input/providers/columns/sample.py +59 -0
- arekit/common/data/input/providers/const.py +3 -0
- arekit/common/data/input/providers/contents.py +9 -0
- arekit/common/data/input/providers/instances/__init__.py +0 -0
- arekit/common/data/input/providers/instances/base.py +14 -0
- arekit/common/data/input/providers/instances/multiple.py +27 -0
- arekit/common/data/input/providers/instances/single.py +8 -0
- arekit/common/data/input/providers/label/__init__.py +0 -0
- arekit/common/data/input/providers/label/base.py +24 -0
- arekit/common/data/input/providers/label/binary.py +11 -0
- arekit/common/data/input/providers/label/multiple.py +15 -0
- arekit/common/data/input/providers/rows/__init__.py +0 -0
- arekit/common/data/input/providers/rows/base.py +64 -0
- arekit/common/data/input/providers/rows/samples.py +227 -0
- arekit/common/data/input/providers/sample/__init__.py +0 -0
- arekit/common/data/input/providers/sample/cropped.py +43 -0
- arekit/common/data/input/providers/text/__init__.py +0 -0
- arekit/common/data/input/providers/text/single.py +49 -0
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +68 -0
- arekit/common/data/input/repositories/sample.py +22 -0
- arekit/common/data/input/sample.py +66 -0
- arekit/common/data/input/terms_mapper.py +88 -0
- arekit/common/data/rows_fmt.py +82 -0
- arekit/common/data/rows_parser.py +43 -0
- arekit/common/data/storages/__init__.py +0 -0
- arekit/common/data/storages/base.py +109 -0
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +26 -0
- arekit/common/docs/__init__.py +0 -0
- arekit/common/docs/base.py +30 -0
- arekit/common/docs/entities_grouping.py +16 -0
- arekit/common/docs/entity.py +18 -0
- arekit/common/docs/objects_parser.py +37 -0
- arekit/common/docs/parsed/__init__.py +0 -0
- arekit/common/docs/parsed/base.py +101 -0
- arekit/common/docs/parsed/providers/__init__.py +0 -0
- arekit/common/docs/parsed/providers/base.py +68 -0
- arekit/common/docs/parsed/providers/base_pairs.py +51 -0
- arekit/common/docs/parsed/providers/entity_service.py +175 -0
- arekit/common/docs/parsed/providers/opinion_pairs.py +20 -0
- arekit/common/docs/parsed/providers/text_opinion_pairs.py +78 -0
- arekit/common/docs/parsed/service.py +31 -0
- arekit/common/docs/parsed/term_position.py +42 -0
- arekit/common/docs/parser.py +34 -0
- arekit/common/docs/sentence.py +14 -0
- arekit/common/entities/__init__.py +0 -0
- arekit/common/entities/base.py +51 -0
- arekit/common/entities/collection.py +72 -0
- arekit/common/entities/str_fmt.py +8 -0
- arekit/common/entities/types.py +9 -0
- arekit/common/experiment/__init__.py +0 -0
- arekit/common/experiment/api/__init__.py +0 -0
- arekit/common/experiment/api/base_samples_io.py +20 -0
- arekit/common/experiment/data_type.py +17 -0
- arekit/common/frames/__init__.py +0 -0
- arekit/common/frames/connotations/__init__.py +0 -0
- arekit/common/frames/connotations/descriptor.py +17 -0
- arekit/common/frames/connotations/provider.py +4 -0
- arekit/common/frames/text_variant.py +43 -0
- arekit/common/frames/variants/__init__.py +0 -0
- arekit/common/frames/variants/base.py +21 -0
- arekit/common/frames/variants/collection.py +60 -0
- arekit/common/labels/__init__.py +0 -0
- arekit/common/labels/base.py +19 -0
- arekit/common/labels/provider/__init__.py +0 -0
- arekit/common/labels/provider/base.py +7 -0
- arekit/common/labels/provider/constant.py +14 -0
- arekit/common/labels/scaler/__init__.py +0 -0
- arekit/common/labels/scaler/base.py +85 -0
- arekit/common/labels/scaler/sentiment.py +7 -0
- arekit/common/labels/scaler/single.py +10 -0
- arekit/common/labels/str_fmt.py +55 -0
- arekit/common/linkage/__init__.py +0 -0
- arekit/common/linkage/base.py +44 -0
- arekit/common/linkage/meta.py +23 -0
- arekit/common/linkage/opinions.py +9 -0
- arekit/common/linkage/text_opinions.py +22 -0
- arekit/common/log_utils.py +29 -0
- arekit/common/model/__init__.py +0 -0
- arekit/common/model/labeling/__init__.py +0 -0
- arekit/common/model/labeling/base.py +24 -0
- arekit/common/model/labeling/modes.py +8 -0
- arekit/common/model/labeling/single.py +24 -0
- arekit/common/opinions/__init__.py +0 -0
- arekit/common/opinions/annot/__init__.py +0 -0
- arekit/common/opinions/annot/algo/__init__.py +0 -0
- arekit/common/opinions/annot/algo/base.py +4 -0
- arekit/common/opinions/annot/algo/pair_based.py +99 -0
- arekit/common/opinions/annot/algo/predefined.py +16 -0
- arekit/common/opinions/annot/algo_based.py +55 -0
- arekit/common/opinions/annot/base.py +15 -0
- arekit/common/opinions/base.py +74 -0
- arekit/common/opinions/collection.py +150 -0
- arekit/common/opinions/enums.py +6 -0
- arekit/common/opinions/provider.py +4 -0
- arekit/common/opinions/writer.py +4 -0
- arekit/common/pipeline/__init__.py +0 -0
- arekit/common/pipeline/base.py +25 -0
- arekit/common/pipeline/context.py +36 -0
- arekit/common/pipeline/conts.py +2 -0
- arekit/common/pipeline/items/__init__.py +0 -0
- arekit/common/pipeline/items/base.py +12 -0
- arekit/common/pipeline/items/flatten.py +14 -0
- arekit/common/pipeline/items/handle.py +17 -0
- arekit/common/pipeline/items/iter.py +11 -0
- arekit/common/pipeline/items/map.py +11 -0
- arekit/common/pipeline/items/map_nested.py +13 -0
- arekit/common/synonyms/__init__.py +0 -0
- arekit/common/synonyms/base.py +151 -0
- arekit/common/synonyms/grouping.py +21 -0
- arekit/common/text/__init__.py +0 -0
- arekit/common/text/enums.py +12 -0
- arekit/common/text/parsed.py +42 -0
- arekit/common/text/parser.py +12 -0
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +4 -0
- arekit/common/text/partitioning/str.py +36 -0
- arekit/common/text/partitioning/terms.py +35 -0
- arekit/common/text/stemmer.py +16 -0
- arekit/common/text_opinions/__init__.py +0 -0
- arekit/common/text_opinions/base.py +105 -0
- arekit/common/utils.py +129 -0
- arekit/contrib/__init__.py +0 -0
- arekit/contrib/bert/__init__.py +0 -0
- arekit/contrib/bert/input/__init__.py +0 -0
- arekit/contrib/bert/input/providers/__init__.py +0 -0
- arekit/contrib/bert/input/providers/cropped_sample.py +17 -0
- arekit/contrib/bert/input/providers/text_pair.py +62 -0
- arekit/contrib/bert/terms/__init__.py +0 -0
- arekit/contrib/bert/terms/mapper.py +20 -0
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +149 -0
- arekit/contrib/networks/embedding_io.py +18 -0
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +6 -0
- arekit/contrib/networks/input/ctx_serialization.py +28 -0
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +29 -0
- arekit/contrib/networks/input/embedding/offsets.py +55 -0
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +22 -0
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +129 -0
- arekit/contrib/networks/input/providers/term_connotation.py +23 -0
- arekit/contrib/networks/input/providers/text.py +24 -0
- arekit/contrib/networks/input/rows_parser.py +47 -0
- arekit/contrib/networks/input/term_types.py +13 -0
- arekit/contrib/networks/input/terms_mapping.py +60 -0
- arekit/contrib/networks/vectorizer.py +6 -0
- arekit/contrib/prompt/__init__.py +0 -0
- arekit/contrib/prompt/sample.py +61 -0
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +84 -0
- arekit/contrib/source/brat/doc.py +28 -0
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +13 -0
- arekit/contrib/source/brat/entities/entity.py +42 -0
- arekit/contrib/source/brat/entities/parser.py +53 -0
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +19 -0
- arekit/contrib/source/brat/relation.py +32 -0
- arekit/contrib/source/brat/sentence.py +69 -0
- arekit/contrib/source/brat/sentences_reader.py +128 -0
- arekit/contrib/source/download.py +41 -0
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +55 -0
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +74 -0
- arekit/contrib/source/nerel/io_utils.py +62 -0
- arekit/contrib/source/nerel/labels.py +241 -0
- arekit/contrib/source/nerel/reader.py +46 -0
- arekit/contrib/source/nerel/utils.py +24 -0
- arekit/contrib/source/nerel/versions.py +12 -0
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +62 -0
- arekit/contrib/source/nerelbio/labels.py +265 -0
- arekit/contrib/source/nerelbio/reader.py +8 -0
- arekit/contrib/source/nerelbio/versions.py +8 -0
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +36 -0
- arekit/contrib/source/ruattitudes/doc.py +51 -0
- arekit/contrib/source/ruattitudes/doc_brat.py +44 -0
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +7 -0
- arekit/contrib/source/ruattitudes/io_utils.py +56 -0
- arekit/contrib/source/ruattitudes/labels_fmt.py +12 -0
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +28 -0
- arekit/contrib/source/ruattitudes/opinions/converter.py +37 -0
- arekit/contrib/source/ruattitudes/reader.py +268 -0
- arekit/contrib/source/ruattitudes/sentence.py +73 -0
- arekit/contrib/source/ruattitudes/synonyms.py +17 -0
- arekit/contrib/source/ruattitudes/text_object.py +59 -0
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +157 -0
- arekit/contrib/source/rusentiframes/effect.py +24 -0
- arekit/contrib/source/rusentiframes/io_utils.py +19 -0
- arekit/contrib/source/rusentiframes/labels_fmt.py +22 -0
- arekit/contrib/source/rusentiframes/polarity.py +35 -0
- arekit/contrib/source/rusentiframes/role.py +15 -0
- arekit/contrib/source/rusentiframes/state.py +24 -0
- arekit/contrib/source/rusentiframes/types.py +42 -0
- arekit/contrib/source/rusentiframes/value.py +2 -0
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +3 -0
- arekit/contrib/source/rusentrel/docs_reader.py +51 -0
- arekit/contrib/source/rusentrel/entities.py +26 -0
- arekit/contrib/source/rusentrel/io_utils.py +125 -0
- arekit/contrib/source/rusentrel/labels_fmt.py +12 -0
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +30 -0
- arekit/contrib/source/rusentrel/opinions/converter.py +40 -0
- arekit/contrib/source/rusentrel/opinions/provider.py +54 -0
- arekit/contrib/source/rusentrel/opinions/writer.py +42 -0
- arekit/contrib/source/rusentrel/synonyms.py +17 -0
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +52 -0
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +31 -0
- arekit/contrib/source/sentinerel/folding/fixed.py +70 -0
- arekit/contrib/source/sentinerel/io_utils.py +87 -0
- arekit/contrib/source/sentinerel/labels.py +53 -0
- arekit/contrib/source/sentinerel/labels_scaler.py +30 -0
- arekit/contrib/source/sentinerel/reader.py +42 -0
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +19 -0
- arekit/contrib/source/zip_utils.py +47 -0
- arekit/contrib/utils/__init__.py +0 -0
- arekit/contrib/utils/bert/__init__.py +0 -0
- arekit/contrib/utils/bert/samplers.py +17 -0
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +23 -0
- arekit/contrib/utils/data/__init__.py +0 -0
- arekit/contrib/utils/data/contents/__init__.py +0 -0
- arekit/contrib/utils/data/contents/opinions.py +37 -0
- arekit/contrib/utils/data/doc_provider/__init__.py +0 -0
- arekit/contrib/utils/data/doc_provider/dict_based.py +13 -0
- arekit/contrib/utils/data/doc_provider/dir_based.py +53 -0
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +7 -0
- arekit/contrib/utils/data/readers/csv_pd.py +38 -0
- arekit/contrib/utils/data/readers/jsonl.py +15 -0
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +50 -0
- arekit/contrib/utils/data/storages/__init__.py +0 -0
- arekit/contrib/utils/data/storages/jsonl_based.py +18 -0
- arekit/contrib/utils/data/storages/pandas_based.py +123 -0
- arekit/contrib/utils/data/storages/row_cache.py +48 -0
- arekit/contrib/utils/data/writers/__init__.py +0 -0
- arekit/contrib/utils/data/writers/base.py +27 -0
- arekit/contrib/utils/data/writers/csv_native.py +63 -0
- arekit/contrib/utils/data/writers/csv_pd.py +40 -0
- arekit/contrib/utils/data/writers/json_opennre.py +132 -0
- arekit/contrib/utils/data/writers/sqlite_native.py +110 -0
- arekit/contrib/utils/download.py +77 -0
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +58 -0
- arekit/contrib/utils/embeddings/tokens.py +30 -0
- arekit/contrib/utils/entities/__init__.py +0 -0
- arekit/contrib/utils/entities/filter.py +7 -0
- arekit/contrib/utils/entities/formatters/__init__.py +0 -0
- arekit/contrib/utils/entities/formatters/str_display.py +11 -0
- arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py +15 -0
- arekit/contrib/utils/io_utils/__init__.py +0 -0
- arekit/contrib/utils/io_utils/embedding.py +72 -0
- arekit/contrib/utils/io_utils/opinions.py +37 -0
- arekit/contrib/utils/io_utils/samples.py +79 -0
- arekit/contrib/utils/io_utils/utils.py +39 -0
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +41 -0
- arekit/contrib/utils/lexicons/relation.py +42 -0
- arekit/contrib/utils/lexicons/rusentilex.py +37 -0
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +83 -0
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +22 -0
- arekit/contrib/utils/np_utils/npz_utils.py +13 -0
- arekit/contrib/utils/np_utils/vocab.py +20 -0
- arekit/contrib/utils/pipelines/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +99 -0
- arekit/contrib/utils/pipelines/items/sampling/networks.py +54 -0
- arekit/contrib/utils/pipelines/items/text/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/text/entities_default.py +23 -0
- arekit/contrib/utils/pipelines/items/text/frames.py +86 -0
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +36 -0
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +32 -0
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +10 -0
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +107 -0
- arekit/contrib/utils/pipelines/items/text/translator.py +135 -0
- arekit/contrib/utils/pipelines/opinion_collections.py +85 -0
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +27 -0
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +65 -0
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +60 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +64 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +79 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +56 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +20 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +65 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +21 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +107 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +29 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +62 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +180 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +50 -0
- arekit/contrib/utils/pipelines/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py +34 -0
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +88 -0
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +93 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py +0 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/base.py +4 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py +16 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py +29 -0
- arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py +26 -0
- arekit/contrib/utils/processing/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +12 -0
- arekit/contrib/utils/processing/languages/pos.py +23 -0
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +78 -0
- arekit/contrib/utils/processing/languages/ru/constants.py +6 -0
- arekit/contrib/utils/processing/languages/ru/mods.py +13 -0
- arekit/contrib/utils/processing/languages/ru/number.py +23 -0
- arekit/contrib/utils/processing/languages/ru/pos_service.py +36 -0
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +51 -0
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +12 -0
- arekit/contrib/utils/processing/pos/mystem_wrap.py +134 -0
- arekit/contrib/utils/processing/pos/russian.py +10 -0
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +127 -0
- arekit/contrib/utils/resources.py +25 -0
- arekit/contrib/utils/serializer.py +43 -0
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +63 -0
- arekit/contrib/utils/synonyms/__init__.py +0 -0
- arekit/contrib/utils/synonyms/simple.py +15 -0
- arekit/contrib/utils/synonyms/stemmer_based.py +38 -0
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +93 -0
- arekit/contrib/utils/vectorizers/random_norm.py +39 -0
- arekit/download_data.py +11 -0
- arekit-0.24.0.dist-info/LICENSE +21 -0
- arekit-0.24.0.dist-info/METADATA +23 -0
- arekit-0.24.0.dist-info/RECORD +374 -0
- arekit-0.24.0.dist-info/WHEEL +5 -0
- arekit-0.24.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class MapPipelineItem(BasePipelineItem):
    """ Pipeline item which applies a single callable to every element of the input.
    """

    def __init__(self, map_func=None):
        """ map_func: callable
                invoked with one input element at a time;
                NOTE: the default `None` value does not pass the callable check below.
        """
        is_callable = callable(map_func)
        assert is_callable
        self._map_func = map_func

    def apply_core(self, input_data, pipeline_ctx):
        """ Returns a lazy `map` of the stored callable over `input_data`.
            The `pipeline_ctx` is accepted, but it is not consulted here.
        """
        transform = self._map_func
        return map(transform, input_data)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from arekit.common.pipeline.items.map import MapPipelineItem
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class MapNestedPipelineItem(MapPipelineItem):
    """ This type is intended for describing nested pipelines,
        which might be required in parameters of the parent pipeline-contexts.

        Data is treated as a sequence, in which every element is
        supposed to be mapped together with the passed pipeline context.
    """

    def apply_core(self, input_data, pipeline_ctx):
        """ Returns a lazy `map`, in which the stored callable receives
            every element of `input_data` followed by `pipeline_ctx`.
        """

        def map_with_ctx(element):
            # The context is forwarded as the second positional argument.
            return self._map_func(element, pipeline_ctx)

        return map(map_with_ctx, input_data)
|
|
File without changes
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
|
|
3
|
+
from arekit.common import log_utils
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SynonymsCollection(object):
    """ Collection of synonym groups, where every group is a list of string values.

        Every value is addressed through a synonym id (sid). The way a sid is
        created from a value is left to subclasses, which are expected to implement
        `_create_internal_sid` (used while filling the collection in constructor) and
        `_create_external_sid` (used by all the public lookups and by `add_synonym_value`).
    """

    def __init__(self, iter_group_values_lists=None, is_read_only=True, debug=False):
        """ iter_group_values_lists: iterable or None
                every element is a group, i.e. an iterable of string values;
                None is treated as an empty list of groups.
            is_read_only: bool
                whether the relation collection could be expanded or not
            debug: bool
                utilized for logging the salient information during usage.
        """
        assert(isinstance(iter_group_values_lists, Iterable) or iter_group_values_lists is None)
        assert(isinstance(is_read_only, bool))
        assert(isinstance(debug, bool))

        # Assumes to be filled
        # sid -> index of the related group in `__by_index`.
        self.__by_sid = {}
        # group index -> list of the values of this group.
        self.__by_index = []

        self.__is_read_only = is_read_only
        self.__debug = debug
        self.__fill(iter_group_values_lists=[] if iter_group_values_lists is None else iter_group_values_lists)

    # region properties

    @property
    def IsReadOnly(self):
        """ The `is_read_only` flag passed to the constructor.
        """
        return self.__is_read_only

    # endregion

    # region public 'add' methods

    def add_synonym_value(self, value):
        """ Registers `value` as a new group which consists of this single value.

            Raises Exception when the external sid of the value is already
            registered, or when the collection is read only.
        """
        assert(isinstance(value, str))

        # The message is passed as `str`: encoding it into utf-8 (a python-2 leftover)
        # results in a `bytes` exception argument, which is printed as b'...'.
        if self.__contains_synonym_value(value):
            raise Exception("Collection already contains synonyms '{}'".format(value))

        if self.__is_read_only:
            raise Exception("Failed to add '{}'. Synonym collection is read only!".format(value))

        sid = self._create_external_sid(value)
        # The new group is appended to the end, so its index equals the current groups count.
        self.__by_sid[sid] = self.__get_groups_count()
        self.__by_index.append([value])

    # endregion

    # region public 'contains' methods

    def contains_synonym_value(self, value):
        """ Whether the external sid of `value` is registered in the collection.
        """
        return self.__contains_synonym_value(value)

    # endregion

    # region public 'get' methods

    def get_synonym_group_index(self, value):
        """ NOTE: Before use this, please take a look at the grouping (see #327 issue).
            It is better to use that class API rather than pass that method for `value_to_group_id_func`

            Returns the index of the group related to the external sid of `value`;
            raises KeyError when this sid is not registered.
        """
        assert(isinstance(value, str))
        return self.__get_group_index(value)

    # endregion

    # region public 'create' methods

    def create_synonym_id(self, value):
        """ Public access to `_create_external_sid`.
        """
        return self._create_external_sid(value)

    # endregion

    # region protected methods

    def _contains_sid(self, v_id):
        """ Whether the sid is registered in the collection.
        """
        return v_id in self.__by_sid

    def _create_internal_sid(self, value):
        """ Utilized during filling stage.
            Expected to be implemented in subclasses.
        """
        raise NotImplementedError()

    def _create_external_sid(self, value):
        """ Utilized by the public lookups and by `add_synonym_value`.
            Expected to be implemented in subclasses.
        """
        raise NotImplementedError()

    # endregion

    # region public 'iter' methods

    def iter_synonym_values(self, value):
        """ Iterates over the values of the group which `value` belongs to;
            raises KeyError when the external sid of `value` is not registered.
        """
        assert(isinstance(value, str))
        return iter(self.__by_index[self.__get_group_index(value)])

    def iter_by_index(self):
        """ Iterates over the groups (lists of values) in the order of their indices.
        """
        return iter(self.__by_index)

    def iter_group(self, group_index):
        """ Iterates over the values of the group with the given index.
        """
        assert(isinstance(group_index, int))
        return iter(self.__by_index[group_index])

    # endregion

    # region private methods

    def __fill(self, iter_group_values_lists):
        """ Registers every provided group; called from constructor.
        """
        for group in iter_group_values_lists:
            self.__process_group(group)

    def __process_group(self, group_values_list):
        """ Registers a single group: every value is stripped and
            mapped onto the index of the group being created.
        """
        group_index = len(self.__by_index)
        synonym_list = []

        for synonym_value in group_values_list:

            value = synonym_value.strip()

            sid = self._create_internal_sid(value)

            # NOTE(review): an already registered sid is skipped only in the debug mode.
            # Otherwise the value is added into this group as well and the sid is
            # re-mapped onto this group. Presumably the skip is expected in both modes;
            # kept as is, since changing it affects results -- confirm.
            if self._contains_sid(sid) and self.__debug:
                log_utils.log_synonym_existed(value)
                continue

            synonym_list.append(value)
            self.__by_sid[sid] = group_index

        self.__by_index.append(synonym_list)

    def __get_groups_count(self):
        return len(self.__by_index)

    def __get_group_index(self, value):
        sid = self._create_external_sid(value)
        return self.__by_sid[sid]

    def __contains_synonym_value(self, value):
        return self._contains_sid(self._create_external_sid(value))

    # endregion

    # region overridden methods

    def __len__(self):
        """ Amount of groups in the collection.
        """
        return len(self.__by_index)

    # endregion
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from arekit.common.synonyms.base import SynonymsCollection
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SynonymsCollectionValuesGroupingProviders:
    """ Providers of the group index for a value, based on a synonyms collection.
    """

    @staticmethod
    def provide_existed_or_register_missed_value(synonyms, value):
        """ Grouping with a potential expansion: a value which is missing
            in `synonyms` is added first, and then its group index is returned.
        """
        assert isinstance(synonyms, SynonymsCollection)

        is_known = synonyms.contains_synonym_value(value)
        if not is_known:
            synonyms.add_synonym_value(value)

        group_index = synonyms.get_synonym_group_index(value)
        return group_index

    @staticmethod
    def provide_existed_value(synonyms, value):
        """ Grouping which relies only on values already existing in `synonyms`:
            the lookup is delegated to `get_synonym_group_index` as is.
        """
        group_index = synonyms.get_synonym_group_index(value)
        return group_index
|
|
File without changes
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from arekit.common.text.enums import TermFormat
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BaseParsedText(object):
    """
    Represents a processed text with extra parameters
    that were used during parsing.

    The terms are kept as a list; only the `TermFormat.Raw`
    format is supported by this base implementation.
    """

    # region constructors

    def __init__(self, terms):
        """ terms: list
                terms of the parsed text; the list is stored as is (not copied).
        """
        assert(isinstance(terms, list))
        self._terms = terms

    # endregion

    def get_term(self, index, term_format):
        """ Returns the term at `index` among the terms of the given format.
        """
        assert(isinstance(term_format, TermFormat))
        terms = self._get_terms(term_format)
        return terms[index]

    def iter_terms(self, term_format, filter=None):
        """ Iterates over the terms of the given format.

            term_format: TermFormat
            filter: callable or None
                predicate which receives a term; terms for which it returns
                a false value are skipped. None keeps every term.
        """
        assert(isinstance(term_format, TermFormat))
        assert(callable(filter) or filter is None)
        terms = self._get_terms(term_format)
        for term in terms:
            # `filter` here is the predicate parameter, not the builtin, so its result is
            # used directly: wrapping it into list(...) fails for a bool result.
            if filter is not None and not filter(term):
                continue
            yield term

    # region private methods

    def _get_terms(self, term_format):
        """ Returns the stored list of terms; only `TermFormat.Raw` is accepted here.
        """
        assert(isinstance(term_format, TermFormat))
        assert(term_format == TermFormat.Raw)
        return self._terms

    # endregion

    def __len__(self):
        """ Amount of the stored terms.
        """
        return len(self._terms)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from arekit.common.pipeline.base import BasePipeline
|
|
2
|
+
from arekit.common.text.parsed import BaseParsedText
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BaseTextParser(BasePipeline):
    """ Pipeline whose result is wrapped into `BaseParsedText`.
    """

    def run(self, input_data, params_dict=None, parent_ctx=None):
        """ Runs the base pipeline with the same arguments and
            passes its output as `terms` into `BaseParsedText`.
        """
        terms = super().run(input_data=input_data,
                            params_dict=params_dict,
                            parent_ctx=parent_ctx)

        parsed_text = BaseParsedText(terms=terms)
        return parsed_text
|
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
|
|
3
|
+
from arekit.common.bound import Bound
|
|
4
|
+
from arekit.common.text.partitioning.base import BasePartitioning
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class StringPartitioning(BasePartitioning):
    """ Splits a string by the provided bounds, replacing every bounded span with a value.

        NOTE: considering that provided parts
        has no intersections between each other
    """

    def provide(self, text, parts_it):
        """ text: str
            parts_it: iterable of (value, Bound) pairs;
                every bound is expected to start not earlier than the end of the previous one.

            Returns a list in which the text pieces between the bounds (kept as
            single strings, possibly empty) alternate with the provided values,
            followed by the rest of the text after the last bound.
        """
        assert isinstance(text, str)
        assert isinstance(parts_it, Iterable)

        cursor = 0
        pieces = []
        for value, bound in parts_it:
            assert isinstance(bound, Bound)
            assert bound.Position >= cursor

            # Text between the previous bound and the current one.
            pieces.append(text[cursor:bound.Position])

            # The value which stands for the bounded span.
            pieces.append(value)

            cursor = bound.Position + bound.Length

        # Everything after the last bound.
        pieces.append(text[cursor:])

        return pieces
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
|
|
3
|
+
from arekit.common.bound import Bound
|
|
4
|
+
from arekit.common.text.partitioning.base import BasePartitioning
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TermsPartitioning(BasePartitioning):
    """ Splits a list of terms by the provided bounds, replacing every bounded span with a value.

        NOTE: considering that provided parts
        has no intersections between each other
    """

    def provide(self, text, parts_it):
        """ text: list
                list of terms.
            parts_it: iterable of (value, Bound) pairs;
                every bound is expected to start not earlier than the end of the previous one.

            Returns a flat list: the terms between the bounds are kept one by one,
            while every bounded span is replaced with the related value.
        """
        assert isinstance(text, list)
        assert isinstance(parts_it, Iterable)

        cursor = 0
        result = []
        for value, bound in parts_it:
            assert isinstance(bound, Bound)
            assert bound.Position >= cursor

            # Release everything till the current value position.
            result.extend(text[cursor:bound.Position])

            # Release the entity value.
            result.append(value)

            cursor = bound.Position + bound.Length

        # Release everything after the last entity.
        result.extend(text[cursor:])

        return result
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
class Stemmer:
    """
    Interface: every method raises NotImplementedError here and
    is expected to be provided by an implementation.
    """

    def lemmatize_to_list(self, text):
        """ Presumably returns lemmas of `text` as a list -- to be defined by an implementation.
        """
        raise NotImplementedError

    def lemmatize_to_str(self, text):
        """ Presumably returns lemmatized `text` as a string -- to be defined by an implementation.
        """
        raise NotImplementedError

    def is_adjective(self, pos_type):
        """ Presumably tells whether `pos_type` denotes an adjective -- to be defined by an implementation.
        """
        raise NotImplementedError

    def is_noun(self, pos_type):
        """ Presumably tells whether `pos_type` denotes a noun -- to be defined by an implementation.
        """
        raise NotImplementedError
|
|
File without changes
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from arekit.common.labels.base import Label
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class TextOpinion(object):
    """
    A relation found in a document and composed between two named entities
    (a source and a target), annotated with a label.
    The label may be replaced afterwards via `set_label`.

    NOTE: it is important to keep document level IDs. (designed for that)
    """

    # region constructors

    def __init__(self, doc_id, text_opinion_id, source_id, target_id, label):
        """ doc_id: identifier of the related document
            text_opinion_id: identifier of this opinion;
                might be None and assigned later via `set_text_opinion_id`
            source_id: document level object id
            target_id: document level object id
            label: Label
        """
        self.__doc_id = doc_id
        self.__text_opinion_id = text_opinion_id
        self.__source_id = source_id
        self.__target_id = target_id

        # The label is assigned through the validating setter.
        self.__modifiable_label = None
        self.__set_label_core(label)

    @classmethod
    def create_copy(cls, other, keep_text_opinion_id=True):
        """ Creates a copy of the `other` opinion, optionally
            dropping its text opinion identifier.
        """
        assert(isinstance(other, TextOpinion))
        assert(isinstance(keep_text_opinion_id, bool))
        return cls.__try_create_copy_core(other=other,
                                          keep_text_opinion_id=keep_text_opinion_id)

    @staticmethod
    def try_convert(other, convert_entity_id_func):
        """ Creates a copy of `other` opinion with different id of opinion participants.
            The text opinion identifier is not kept in the result.
            Returns None when any of the participant ids is converted into None.
            Use cases: required for BaseParsedDocumentServiceProvider, when we decided to bring the outside
            opinion into one which is based on DocumentEntities.
        """
        assert(isinstance(other, TextOpinion))
        assert(callable(convert_entity_id_func))
        return TextOpinion.__try_create_copy_core(
            other=other,
            convert_entity_id_func=convert_entity_id_func,
            keep_text_opinion_id=False)

    @staticmethod
    def __try_create_copy_core(other, convert_entity_id_func=lambda part_id: part_id, keep_text_opinion_id=True):
        """ Tries to compose a copy by considering an optional id conversion,
            and identification keeping.
            convert_entity_id_func:
                func(id) -> id
            returns: TextOpinion or None
                None in the case when conversion yields None for source or target.
        """
        assert(callable(convert_entity_id_func))

        converted_source = convert_entity_id_func(other.SourceId)
        converted_target = convert_entity_id_func(other.TargetId)

        if converted_source is not None and converted_target is not None:
            copied_opinion_id = other.__text_opinion_id if keep_text_opinion_id else None
            return TextOpinion(doc_id=other.__doc_id,
                               text_opinion_id=copied_opinion_id,
                               source_id=converted_source,
                               target_id=converted_target,
                               label=other.Label)

        # At least one participant could not be converted.
        return None

    def __set_label_core(self, label):
        assert(isinstance(label, Label))
        self.__modifiable_label = label

    # endregion

    # region properties

    @property
    def Label(self):
        return self.__modifiable_label

    @property
    def DocID(self):
        return self.__doc_id

    @property
    def TextOpinionID(self):
        return self.__text_opinion_id

    @property
    def SourceId(self):
        return self.__source_id

    @property
    def TargetId(self):
        return self.__target_id

    # endregion

    # region public methods

    def set_text_opinion_id(self, text_opinion_id):
        """ Assigns the identifier; allowed only while it is not set yet. """
        assert(self.__text_opinion_id is None)
        self.__text_opinion_id = text_opinion_id

    def set_label(self, label):
        """ Replaces the current label with the given Label instance. """
        self.__set_label_core(label)

    # endregion
|
arekit/common/utils.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
import requests
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def create_dir_if_not_exists(filepath):
    """ Makes sure that the directory part of `filepath` exists.

        filepath: str
            path to a file; only its directory component
            (os.path.dirname) is created, the file itself is not touched.

        Does nothing when `filepath` has no directory component
        or when the target path already exists.
    """
    target_dir = os.path.dirname(filepath)

    # Check whether string is empty.
    if not target_dir:
        return

    # Attempt the creation directly instead of checking for existence first:
    # a separate check leaves a window in which another process may create
    # the directory, making the subsequent `makedirs` call fail.
    try:
        os.makedirs(target_dir)
    except FileExistsError:
        # The path is already there, which is the state we were asked for.
        pass
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def filter_whitespaces(terms):
    """ Strips every term and drops the ones that become empty.

        terms: iterable of str
        returns: list of stripped, non-empty terms (original order is kept).
    """
    stripped_terms = (term.strip() for term in terms)
    return [stripped for stripped in stripped_terms if stripped]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def split_by_whitespaces(text):
    """
    Separates the text onto words, treating any run of
    whitespace characters as a single separator.

    text: str
    returns: list of str
    """
    assert(isinstance(text, str))
    words = text.split()
    return words
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def progress_bar(iterable, total, desc="", unit="it"):
    """ Wraps the iterable into a progress bar.

        When `total` is None, the bar without a predefined total is used
        (progress_bar_iter); otherwise the one with the known total
        (progress_bar_defined).
    """
    if total is None:
        return progress_bar_iter(iterable=iterable, desc=desc, unit=unit)

    return progress_bar_defined(iterable=iterable, total=total, desc=desc, unit=unit)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def progress_bar_conditional(iterable, condition_func, total, postfix_func=None, desc="", unit="it"):
    """ This progress-bar updates only on the
        specific conditions during the iteration process.

        iterable: items to be passed through (every item is yielded as is)
        condition_func: func(item) -> bool
            the bar is advanced only for the items it accepts.
        total: expected amount of the bar updates (might be None)
        postfix_func: func(item) -> postfix data, or None
            when provided, its result is passed to `set_postfix` of the bar
            once the consumer asks for the next item.
    """
    assert(callable(condition_func))
    assert(postfix_func is None or callable(postfix_func))

    # The bar wraps an artificial endless source of zeros, so that
    # it could be advanced manually, independently of `iterable`.
    def __endless_zeros():
        while True:
            yield 0

    bar = progress_bar(iterable=__endless_zeros(),
                       desc=desc, unit=unit, total=total)
    ticker = iter(bar)

    # Initialize with 0.
    next(ticker)

    for item in iterable:

        # Optionally update progress bar with the next state.
        if condition_func(item):
            next(ticker)

        yield item

        # Optionally provide meta-information.
        if postfix_func is not None:
            bar.set_postfix(postfix_func(item))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def progress_bar_defined(iterable, total, miniters=200, desc="", unit="it"):
    """ Creates a tqdm progress bar for the case of the known `total`.

        miniters: divider of `total`; the tqdm `miniters` option is set to
            total / miniters, or left as None when `total` is None.
    """
    tqdm_miniters = None if total is None else total / miniters

    return tqdm(iterable=iterable,
                total=total,
                desc=desc,
                unit=unit,
                miniters=tqdm_miniters,
                ncols=120,
                position=0,
                leave=True)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def progress_bar_iter(iterable, desc="", unit='it'):
    """ Creates a tqdm progress bar without a predefined total. """
    bar_options = dict(desc=desc,
                       unit=unit,
                       ncols=120,
                       position=0,
                       leave=True)
    return tqdm(iterable=iterable, **bar_options)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def get_default_download_dir():
    """ Provides the default directory path for the downloaded data:
        the `.arekit` folder inside of %APPDATA% (Windows, when the variable is set)
        or inside of the user's home directory otherwise.

        Raises ValueError when the home directory cannot be expanded.

        Refered to NLTK toolkit approach
        https://github.com/nltk/nltk/blob/8e771679cee1b4a9540633cc3ea17f4421ffd6c0/nltk/downloader.py#L1051
    """
    folder_name = ".arekit"

    # On Windows, use %APPDATA%
    if sys.platform == "win32" and "APPDATA" in os.environ:
        return os.path.join(os.environ["APPDATA"], folder_name)

    # Otherwise, install in the user's home directory.
    home = os.path.expanduser("~/")
    if home == "~/":
        # The path is returned unchanged when the expansion fails.
        raise ValueError("Could not find a default download directory")

    return os.path.join(home, folder_name)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def download(dest_file_path, source_url):
    """ Refered to https://github.com/nicolay-r/ner-bilstm-crf-tensorflow/blob/master/ner/utils.py
        Simple http file downloader

        dest_file_path: str
            where to store the downloaded content; missing parent
            directories are created (mode 0o755).
        source_url: str
            URL of the resource to be fetched.

        Raises requests.HTTPError when the server responds with an error status,
        so that an error page is never stored as the requested file.
    """
    print(('Downloading from {src} to {dest}'.format(src=source_url, dest=dest_file_path)))

    sys.stdout.flush()

    # Resolve the absolute path first: for a bare file name the directory part
    # would be an empty string, which cannot be passed to `makedirs`.
    dest_file_path = os.path.abspath(dest_file_path)
    datapath = os.path.dirname(dest_file_path)

    # `exist_ok` removes the gap between the existence check and the creation.
    os.makedirs(datapath, mode=0o755, exist_ok=True)

    # The response is used as a context manager to release the connection
    # even when writing fails in the middle of the stream.
    with requests.get(source_url, stream=True) as r:
        r.raise_for_status()
        total_length = int(r.headers.get('content-length', 0))

        with open(dest_file_path, 'wb') as f, \
                tqdm(total=total_length, unit='B', unit_scale=True) as pbar:
            for chunk in r.iter_content(chunk_size=32 * 1024):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from arekit.common.data.input.providers.sample.cropped import CroppedSampleRowProvider
|
|
2
|
+
from arekit.common.data.input.providers.text.single import BaseSingleTextProvider
|
|
3
|
+
from arekit.contrib.bert.input.providers.text_pair import PairTextProvider
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CroppedBertSampleRowProvider(CroppedSampleRowProvider):
    """ Cropped sample row provider which selects the text provider
        depending on whether the `text_b_template` is given.
    """

    def __init__(self, crop_window_size, label_scaler, text_terms_mapper, text_b_template):
        """ text_b_template: when None, a BaseSingleTextProvider is used;
                otherwise it is passed as `text_b_prompt` to a PairTextProvider.
            The remaining arguments are forwarded to the text provider
            (text_terms_mapper) and to the base class (crop_window_size, label_scaler).
        """
        if text_b_template is None:
            text_provider = BaseSingleTextProvider(text_terms_mapper=text_terms_mapper)
        else:
            text_provider = PairTextProvider(text_b_prompt=text_b_template,
                                             text_terms_mapper=text_terms_mapper)

        super().__init__(crop_window_size=crop_window_size,
                         label_scaler=label_scaler,
                         text_provider=text_provider)
|