spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two versions listed above.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
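The listing above shows that the old flat modules of 2.6.3rc1 (sparknlp/annotator.py, sparknlp/base.py, sparknlp/pretrained.py, ...) were removed and replaced by per-feature subpackages in 6.2.1. As a rough orientation, here is a minimal sketch of a pipeline written against the 6.2.1 layout; the example data and column names are illustrative and not taken from the diff.

# Sketch of a pipeline against the 6.2.1 package layout (illustrative data).
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer
from pyspark.ml import Pipeline

spark = sparknlp.start()  # Spark session with the Spark NLP jar attached

data = spark.createDataFrame([("Spark NLP was restructured into subpackages.",)], ["text"])

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

model = Pipeline(stages=[document_assembler, tokenizer]).fit(data)
model.transform(data).select("token.result").show(truncate=False)

Because each subpackage's __init__.py re-exports its classes, the familiar `from sparknlp.annotator import ...` style of import keeps working. A selection of the added files is reproduced in full below.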
sparknlp/partition/partition_transformer.py
ADDED
@@ -0,0 +1,200 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the PartitionTransformer class for reading various types of documents into chunks."""
+from sparknlp.common import *
+from sparknlp.partition.partition_properties import *
+
+
+class PartitionTransformer(
+    AnnotatorModel,
+    HasEmailReaderProperties,
+    HasExcelReaderProperties,
+    HasHTMLReaderProperties,
+    HasPowerPointProperties,
+    HasTextReaderProperties,
+    HasChunkerProperties
+):
+    """
+    The PartitionTransformer annotator allows you to use the Partition feature more smoothly
+    within existing Spark NLP workflows, enabling seamless reuse of your pipelines.
+
+    It supports reading from files, URLs, in-memory strings, or byte arrays, and works
+    within a Spark NLP pipeline.
+
+    Supported formats include:
+    - Plain text
+    - HTML
+    - Word (.doc/.docx)
+    - Excel (.xls/.xlsx)
+    - PowerPoint (.ppt/.pptx)
+    - Email files (.eml, .msg)
+    - PDFs
+
+    Parameters
+    ----------
+    inputCols : list of str
+        Names of input columns (typically from DocumentAssembler).
+    outputCol : str
+        Name of the column to store the output.
+    contentType : str
+        The type of content: e.g., "text", "url", "file", etc.
+    headers : dict, optional
+        Headers to be used if content type is a URL.
+
+    Examples
+    --------
+    >>> dataset = spark.createDataFrame([
+    ...     ("https://www.blizzard.com",),
+    ... ], ["text"])
+
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+
+    >>> partition = PartitionTransformer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("partition") \\
+    ...     .setContentType("url") \\
+    ...     .setHeaders({"Accept-Language": "es-ES"})
+
+    >>> pipeline = Pipeline(stages=[documentAssembler, partition])
+    >>> pipelineModel = pipeline.fit(dataset)
+    >>> resultDf = pipelineModel.transform(dataset)
+    >>> resultDf.show()
+    +--------------------+--------------------+--------------------+
+    |                text|            document|           partition|
+    +--------------------+--------------------+--------------------+
+    |https://www.blizz...|[{Title, Juegos d...|[{document, 0, 16...|
+    |https://www.googl...|[{Title, Gmail Im...|[{document, 0, 28...|
+    +--------------------+--------------------+--------------------+
+    """
+
+    name = "PartitionTransformer"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    contentPath = Param(
+        Params._dummy(),
+        "contentPath",
+        "Path to the content source",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentPath(self, value):
+        return self._set(contentPath=value)
+
+    def getContentPath(self):
+        return self.getOrDefault(self.contentPath)
+
+    contentType = Param(
+        Params._dummy(),
+        "contentType",
+        "Set the content type to load following MIME specification",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentType(self, value):
+        return self._set(contentType=value)
+
+    def getContentType(self):
+        return self.getOrDefault(self.contentType)
+
+    storeContent = Param(
+        Params._dummy(),
+        "storeContent",
+        "Whether to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreContent(self, value):
+        return self._set(storeContent=value)
+
+    def getStoreContent(self):
+        return self.getOrDefault(self.storeContent)
+
+    titleFontSize = Param(
+        Params._dummy(),
+        "titleFontSize",
+        "Minimum font size threshold used as part of heuristic rules to detect title elements based on formatting (e.g., bold, centered, capitalized).",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTitleFontSize(self, value):
+        return self._set(titleFontSize=value)
+
+    def getTitleFontSize(self):
+        return self.getOrDefault(self.titleFontSize)
+
+    inferTableStructure = Param(
+        Params._dummy(),
+        "inferTableStructure",
+        "Whether to generate an HTML table representation from structured table content. When enabled, a full <table> element is added alongside cell-level elements, based on row and column layout.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setInferTableStructure(self, value):
+        return self._set(inferTableStructure=value)
+
+    def getInferTableStructure(self):
+        return self.getOrDefault(self.inferTableStructure)
+
+    includePageBreaks = Param(
+        Params._dummy(),
+        "includePageBreaks",
+        "Whether to detect and tag content with page break metadata. In Word documents, this includes manual and section breaks. In Excel files, this includes page breaks based on column boundaries.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIncludePageBreaks(self, value):
+        return self._set(includePageBreaks=value)
+
+    def getIncludePageBreaks(self):
+        return self.getOrDefault(self.includePageBreaks)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
+                 java_model=None):
+        super(PartitionTransformer, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        DOUBLE_PARAGRAPH_PATTERN = r"(?:\s*\n\s*){2,}"
+
+        self._setDefault(
+            contentPath="",
+            contentType="text/plain",
+            storeContent=False,
+            titleFontSize=9,
+            inferTableStructure=False,
+            includePageBreaks=False,
+            addAttachmentContent=False,
+            cellSeparator="\t",
+            appendCells=False,
+            timeout=0,
+            includeSlideNotes=False,
+            titleLengthSize=50,
+            groupBrokenParagraphs=False,
+            paragraphSplit=DOUBLE_PARAGRAPH_PATTERN,
+            shortLineWordThreshold=5,
+            maxLineCount=2000,
+            threshold=0.1,
+            chunkingStrategy="",
+            maxCharacters=100,
+            newAfterNChars=-1,
+            overlap=0,
+            combineTextUnderNChars=0,
+            overlapAll=False
+        )
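A short sketch of how the PartitionTransformer added above could be placed in a pipeline. It reuses the URL flow from the class docstring and only calls setters defined in this file (plus the inherited setInputCols/setOutputCol); the dataset and an active `spark` session are assumed for illustration.

# Illustrative only: extends the docstring's URL example with the extra
# parameters defined in this file; assumes an active `spark` session.
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.partition.partition_transformer import PartitionTransformer

urls = spark.createDataFrame([("https://www.blizzard.com",)], ["text"])

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

partition = PartitionTransformer() \
    .setInputCols(["document"]) \
    .setOutputCol("partition") \
    .setContentType("url") \
    .setStoreContent(True) \
    .setInferTableStructure(True) \
    .setIncludePageBreaks(True)

result = Pipeline(stages=[document_assembler, partition]).fit(urls).transform(urls)
result.select("partition").show(truncate=False)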
sparknlp/pretrained/__init__.py
ADDED
@@ -0,0 +1,17 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module for pretrained pipelines and resources."""
+from sparknlp.pretrained.pretrained_pipeline import *
+from sparknlp.pretrained.resource_downloader import *
+from sparknlp.pretrained.utils import *
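The star imports above expose the classes defined in this subpackage directly from sparknlp.pretrained; a brief usage note (the pipeline name is taken from the docstrings in the files below):

from sparknlp.pretrained import PretrainedPipeline, ResourceDownloader

ResourceDownloader.showPublicPipelines(lang="en")                # list public pipelines
pipeline = PretrainedPipeline("explain_document_dl", lang="en")  # download and load one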
sparknlp/pretrained/pretrained_pipeline.py
ADDED
@@ -0,0 +1,158 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the PretrainedPipeline."""
+
+from pyspark.ml import PipelineModel
+from pyspark.sql import DataFrame
+
+from sparknlp.base import LightPipeline
+from sparknlp.pretrained.resource_downloader import ResourceDownloader
+
+
+class PretrainedPipeline:
+    """Loads a Represents a fully constructed and trained Spark NLP pipeline,
+    ready to be used.
+
+    This way, a whole pipeline can be defined in 1 line. Additionally, the
+    :class:`.LightPipeline` version of the model can be retrieved with member
+    :attr:`.light_model`.
+
+    For more extended examples see the `Pipelines page
+    <https://sparknlp.org/docs/en/pipelines>`_ and our `Github Model
+    Repository <https://github.com/JohnSnowLabs/spark-nlp-models>`_ for
+    available pipeline models.
+
+    Parameters
+    ----------
+    name : str
+        Name of the PretrainedPipeline. These can be gathered from the Pipelines
+        Page.
+    lang : str, optional
+        Langauge of the model, by default 'en'
+    remote_loc : str, optional
+        Link to the remote location of the model (if it was already downloaded),
+        by default None
+    parse_embeddings : bool, optional
+        Whether to parse embeddings, by default False
+    disk_location : str , optional
+        Path to locally stored PretrainedPipeline, by default None
+    """
+
+    def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
+        if not disk_location:
+            self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
+        else:
+            self.model = PipelineModel.load(disk_location)
+        self.light_model = LightPipeline(self.model, parse_embeddings)
+
+    @staticmethod
+    def from_disk(path, parse_embeddings=False):
+        return PretrainedPipeline(None, None, None, parse_embeddings, path)
+
+    def annotate(self, target, column=None):
+        """Annotates the data provided, extracting the results.
+
+        The data should be either a list or a str.
+
+        Parameters
+        ----------
+        target : list or str
+            The data to be annotated
+
+        Returns
+        -------
+        List[dict] or dict
+            The result of the annotation
+
+        Examples
+        --------
+        >>> from sparknlp.pretrained import PretrainedPipeline
+        >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+        >>> result = explain_document_pipeline.annotate('U.N. official Ekeus heads for Baghdad.')
+        >>> result.keys()
+        dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])
+        >>> result["ner"]
+        ['B-ORG', 'O', 'O', 'B-PER', 'O', 'O', 'B-LOC', 'O']
+        """
+
+        annotations = self.light_model.annotate(target)
+        return annotations
+
+    def fullAnnotate(self, target, optional_target=""):
+        """Annotates the data provided into `Annotation` type results.
+
+        The data should be either a list or a str.
+
+        Parameters
+        ----------
+        target : list or str
+            The data to be annotated
+
+        Returns
+        -------
+        List[Annotation]
+            The result of the annotation
+
+        Examples
+        --------
+        >>> from sparknlp.pretrained import PretrainedPipeline
+        >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
+        >>> result = explain_document_pipeline.fullAnnotate('U.N. official Ekeus heads for Baghdad.')
+        >>> result[0].keys()
+        dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])
+        >>> result[0]["ner"]
+        [Annotation(named_entity, 0, 2, B-ORG, {'word': 'U.N'}),
+        Annotation(named_entity, 3, 3, O, {'word': '.'}),
+        Annotation(named_entity, 5, 12, O, {'word': 'official'}),
+        Annotation(named_entity, 14, 18, B-PER, {'word': 'Ekeus'}),
+        Annotation(named_entity, 20, 24, O, {'word': 'heads'}),
+        Annotation(named_entity, 26, 28, O, {'word': 'for'}),
+        Annotation(named_entity, 30, 36, B-LOC, {'word': 'Baghdad'}),
+        Annotation(named_entity, 37, 37, O, {'word': '.'})]
+        """
+        annotations = self.light_model.fullAnnotate(target, optional_target)
+        return annotations
+
+    def fullAnnotateImage(self, path_to_image):
+        """Annotates the data provided into `Annotation` type results.
+
+        The data should be either a list or a str.
+
+        Parameters
+        ----------
+        path_to_image : list or str
+            Source path of image, list of paths to images
+
+        Returns
+        -------
+        List[AnnotationImage]
+            The result of the annotation
+        """
+        pipeline = self.light_model
+        return pipeline.fullAnnotateImage(path_to_image)
+
+    def transform(self, data):
+        """Transforms the input dataset with Spark.
+
+        Parameters
+        ----------
+        data : :class:`pyspark.sql.DataFrame`
+            input dataset
+
+        Returns
+        -------
+        :class:`pyspark.sql.DataFrame`
+            transformed dataset
+        """
+        return self.model.transform(data)
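Besides the docstring examples, the class above also supports loading from local disk and running on a Spark DataFrame. A hedged sketch of those entry points; the path and data are hypothetical and an active `spark` session is assumed.

# Illustrative sketch of the from_disk / annotate / transform entry points above.
from sparknlp.pretrained import PretrainedPipeline

# Load a pipeline previously saved to disk instead of downloading it
pipeline = PretrainedPipeline.from_disk("/tmp/explain_document_dl_en")

# LightPipeline-style annotation of a plain string
print(pipeline.annotate("U.N. official Ekeus heads for Baghdad.")["ner"])

# Or run the underlying PipelineModel on a Spark DataFrame
df = spark.createDataFrame([("U.N. official Ekeus heads for Baghdad.",)], ["text"])
pipeline.transform(df).select("ner.result").show(truncate=False)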
sparknlp/pretrained/resource_downloader.py
ADDED
@@ -0,0 +1,216 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the ResourceDownloader."""
+
+import sys
+import threading
+
+from py4j.protocol import Py4JJavaError
+from pyspark.ml import PipelineModel
+
+import sparknlp.internal as _internal
+from sparknlp.pretrained.utils import printProgress
+
+
+class ResourceDownloader(object):
+    """Downloads and manages resources, pretrained models/pipelines.
+
+    Usually you will not need to use this class directly. It is called by the
+    `pretrained()` function of annotators.
+
+    However, you can use this class to list the available pretrained resources.
+
+    Examples
+    --------
+    If you want to list all NerDLModels for the english language you can run:
+
+    >>> ResourceDownloader.showPublicModels("NerDLModel", "en")
+    +-------------+------+---------+
+    | Model       | lang | version |
+    +-------------+------+---------+
+    | onto_100    | en   | 2.1.0   |
+    | onto_300    | en   | 2.1.0   |
+    | ner_dl_bert | en   | 2.2.0   |
+    | ...         | ...  | ...     |
+
+
+    Similarly for Pipelines:
+
+    >>> ResourceDownloader.showPublicPipelines("en")
+    +------------------+------+---------+
+    | Pipeline         | lang | version |
+    +------------------+------+---------+
+    | dependency_parse | en   | 2.0.2   |
+    | check_spelling   | en   | 2.1.0   |
+    | match_datetime   | en   | 2.1.0   |
+    | ...              | ...  | ...     |
+
+    """
+
+    @staticmethod
+    def downloadModel(reader, name, language, remote_loc=None, j_dwn='PythonResourceDownloader'):
+        """Downloads and loads a model with the default downloader. Usually this method
+        does not need to be called directly, as it is called by the `pretrained()`
+        method of the annotator.
+
+        Parameters
+        ----------
+        reader : obj
+            Class to read the model for
+        name : str
+            Name of the pretrained model
+        language : str
+            Language of the model
+        remote_loc : str, optional
+            Directory of the Spark NLP Folder, by default None
+        j_dwn : str, optional
+            Which java downloader to use, by default 'PythonResourceDownloader'
+
+        Returns
+        -------
+        AnnotatorModel
+            Loaded pretrained annotator/pipeline
+        """
+        print(name + " download started this may take some time.")
+        file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
+        if file_size == "-1":
+            print("Can not find the model to download please check the name!")
+        else:
+            print("Approximate size to download " + file_size)
+            stop_threads = False
+            t1 = threading.Thread(target=printProgress, args=(lambda: stop_threads,))
+            t1.start()
+            try:
+                j_obj = _internal._DownloadModel(reader.name, name, language, remote_loc, j_dwn).apply()
+            except Py4JJavaError as e:
+                sys.stdout.write("\n" + str(e))
+                raise e
+            finally:
+                stop_threads = True
+                t1.join()
+
+            return reader(classname=None, java_model=j_obj)
+
+    @staticmethod
+    def downloadModelDirectly(name, remote_loc="public/models", unzip=True):
+        """Downloads a model directly to the cache folder.
+        You can use to copy-paste the s3 URI from the model hub and download the model.
+        For available s3 URI and models, please see the `Models Hub <https://sparknlp.org/models>`__.
+        Parameters
+        ----------
+        name : str
+            Name of the model or s3 URI
+        remote_loc : str, optional
+            Directory of the remote Spark NLP Folder, by default "public/models"
+        unzip : Bool, optional
+            Used to unzip model, by default 'True'
+        """
+        _internal._DownloadModelDirectly(name, remote_loc, unzip).apply()
+
+
+    @staticmethod
+    def downloadPipeline(name, language, remote_loc=None):
+        """Downloads and loads a pipeline with the default downloader.
+
+        Parameters
+        ----------
+        name : str
+            Name of the pipeline
+        language : str
+            Language of the pipeline
+        remote_loc : str, optional
+            Directory of the remote Spark NLP Folder, by default None
+
+        Returns
+        -------
+        PipelineModel
+            The loaded pipeline
+        """
+        print(name + " download started this may take some time.")
+        file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
+        if file_size == "-1":
+            print("Can not find the model to download please check the name!")
+        else:
+            print("Approx size to download " + file_size)
+            stop_threads = False
+            t1 = threading.Thread(target=printProgress, args=(lambda: stop_threads,))
+            t1.start()
+            try:
+                j_obj = _internal._DownloadPipeline(name, language, remote_loc).apply()
+                jmodel = PipelineModel._from_java(j_obj)
+            finally:
+                stop_threads = True
+                t1.join()
+
+            return jmodel
+
+    @staticmethod
+    def clearCache(name, language, remote_loc=None):
+        """Clears the cache entry of a model.
+
+        Parameters
+        ----------
+        name : str
+            Name of the model
+        language : en
+            Language of the model
+        remote_loc : str, optional
+            Directory of the remote Spark NLP Folder, by default None
+        """
+        _internal._ClearCache(name, language, remote_loc).apply()
+
+    @staticmethod
+    def showPublicModels(annotator=None, lang=None, version=None):
+        """Prints all pretrained models for a particular annotator model, that are
+        compatible with a version of Spark NLP. If any of the optional arguments are not
+        set, the filter is not considered.
+
+        Parameters
+        ----------
+        annotator : str, optional
+            Name of the annotator to filer, by default None
+        lang : str, optional
+            Language of the models to filter, by default None
+        version : str, optional
+            Version of Spark NLP to filter, by default None
+        """
+        print(_internal._ShowPublicModels(annotator, lang, version).apply())
+
+    @staticmethod
+    def showPublicPipelines(lang=None, version=None):
+        """Prints all pretrained models for a particular annotator model, that are
+        compatible with a version of Spark NLP. If any of the optional arguments are not
+        set, the filter is not considered.
+
+        Parameters
+        ----------
+        lang : str, optional
+            Language of the models to filter, by default None
+        version : str, optional
+            Version of Spark NLP to filter, by default None
+        """
+        print(_internal._ShowPublicPipelines(lang, version).apply())
+
+    @staticmethod
+    def showUnCategorizedResources():
+        """Shows models or pipelines in the metadata which has not been categorized yet.
+        """
+        print(_internal._ShowUnCategorizedResources().apply())
+
+    @staticmethod
+    def showAvailableAnnotators():
+        """Shows all available annotators in Spark NLP.
+        """
+        print(_internal._ShowAvailableAnnotators().apply())
+
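For completeness, an illustrative sketch of the explicit download and cache-management calls defined above; the model and pipeline names are the same examples used in the docstrings.

# Illustrative only: explicit download and cache management via the static methods above.
from sparknlp.pretrained import ResourceDownloader

ResourceDownloader.showPublicModels("NerDLModel", "en")                   # browse models
model = ResourceDownloader.downloadPipeline("explain_document_dl", "en")  # returns a PipelineModel
ResourceDownloader.clearCache("explain_document_dl", "en")                # drop it from the local cache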
sparknlp/pretrained/utils.py
ADDED
@@ -0,0 +1,35 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains utilities for pretrained annotators and pipelines."""
+
+import sys
+import time
+
+
+def printProgress(stop):
+    states = [' | ', ' / ', ' — ', ' \\ ']
+    nextc = 0
+    while True:
+        sys.stdout.write('\r[{}]'.format(states[nextc]))
+        sys.stdout.flush()
+        time.sleep(2.5)
+        nextc = nextc + 1 if nextc < 3 else 0
+        if stop():
+            sys.stdout.write('\r[{}]'.format('OK!'))
+            sys.stdout.flush()
+            break
+
+    sys.stdout.write('\n')
+    return
+
sparknlp/reader/__init__.py
ADDED
@@ -0,0 +1,15 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module for reading different files types."""
+from sparknlp.reader.sparknlp_reader import *
sparknlp/reader/enums.py
ADDED
@@ -0,0 +1,19 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from enum import Enum
+
+class TextStripperType(Enum):
+    """Text Stripper Type"""
+    PDF_TEXT_STRIPPER = "PDFTextStripper"
+    PDF_LAYOUT_TEXT_STRIPPER = "PDFLayoutTextStripper"