spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/reader/pdf_to_text.py
@@ -0,0 +1,190 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark import keyword_only
+from pyspark.ml.param import Param, Params, TypeConverters
+from pyspark.ml.param.shared import HasInputCol, HasOutputCol
+from pyspark.ml.util import JavaMLReadable, JavaMLWritable
+from pyspark.ml.wrapper import JavaTransformer
+
+from sparknlp.reader.enums import TextStripperType
+
+
+class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
+                JavaMLReadable, JavaMLWritable):
+    """
+    Extract text from PDF documents as either a single string or multiple strings per page.
+    Input is a column with binary content of PDF files. Output is a column with extracted text,
+    with options to include page numbers or split pages.
+
+    Parameters
+    ----------
+    pageNumCol : str, optional
+        Page number output column name.
+    partitionNum : int, optional
+        Number of partitions (default is 0).
+    storeSplittedPdf : bool, optional
+        Whether to store content of split PDFs (default is False).
+    splitPage : bool, optional
+        Enable/disable splitting per page (default is True).
+    onlyPageNum : bool, optional
+        Whether to extract only page numbers (default is False).
+    textStripper : str or TextStripperType, optional
+        Defines layout and formatting type.
+    sort : bool, optional
+        Enable/disable sorting content per page (default is False).
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.reader import *
+    >>> from pyspark.ml import Pipeline
+    >>> pdf_path = "Documents/files/pdf"
+    >>> data_frame = spark.read.format("binaryFile").load(pdf_path)
+    >>> pdf_to_text = PdfToText().setStoreSplittedPdf(True)
+    >>> pipeline = Pipeline(stages=[pdf_to_text])
+    >>> pipeline_model = pipeline.fit(data_frame)
+    >>> pdf_df = pipeline_model.transform(data_frame)
+    >>> pdf_df.show()
+    +--------------------+--------------------+
+    |                path|    modificationTime|
+    +--------------------+--------------------+
+    |file:/Users/paula...|2025-05-15 11:33:...|
+    |file:/Users/paula...|2025-05-15 11:33:...|
+    +--------------------+--------------------+
+    >>> pdf_df.printSchema()
+    root
+     |-- path: string (nullable = true)
+     |-- modificationTime: timestamp (nullable = true)
+     |-- length: long (nullable = true)
+     |-- text: string (nullable = true)
+     |-- height_dimension: integer (nullable = true)
+     |-- width_dimension: integer (nullable = true)
+     |-- content: binary (nullable = true)
+     |-- exception: string (nullable = true)
+     |-- pagenum: integer (nullable = true)
+    """
+    pageNumCol = Param(Params._dummy(), "pageNumCol",
+                       "Page number output column name.",
+                       typeConverter=TypeConverters.toString)
+
+    partitionNum = Param(Params._dummy(), "partitionNum",
+                         "Number of partitions.",
+                         typeConverter=TypeConverters.toInt)
+
+    storeSplittedPdf = Param(Params._dummy(), "storeSplittedPdf",
+                             "Force to store splitted pdf.",
+                             typeConverter=TypeConverters.toBoolean)
+
+    splitPage = Param(Params._dummy(), "splitPage",
+                      "Param for enable/disable splitting document per page",
+                      typeConverter=TypeConverters.toBoolean)
+
+    textStripper = Param(Params._dummy(), "textStripper",
+                         "Text stripper type used for output layout and formatting",
+                         typeConverter=TypeConverters.toString)
+
+    sort = Param(Params._dummy(), "sort",
+                 "Param for enable/disable sort lines",
+                 typeConverter=TypeConverters.toBoolean)
+
+    onlyPageNum = Param(Params._dummy(), "onlyPageNum",
+                        "Force to extract only number of pages",
+                        typeConverter=TypeConverters.toBoolean)
+
+    extractCoordinates = Param(Params._dummy(), "extractCoordinates",
+                               "Force extract coordinates of text.",
+                               typeConverter=TypeConverters.toBoolean)
+
+    normalizeLigatures = Param(Params._dummy(), "normalizeLigatures",
+                               "Whether to convert ligature chars such as 'fl' into its corresponding chars (e.g., {'f', 'l'}).",
+                               typeConverter=TypeConverters.toBoolean)
+
+    @keyword_only
+    def __init__(self):
+        """
+        __init__(self)
+        """
+        super(PdfToText, self).__init__()
+        self._java_obj = self._new_java_obj("com.johnsnowlabs.reader.PdfToText", self.uid)
+
+    def setInputCol(self, value):
+        """
+        Sets the value of :py:attr:`inputCol`.
+        """
+        return self._set(inputCol=value)
+
+    def setOutputCol(self, value):
+        """
+        Sets the value of :py:attr:`outputCol`.
+        """
+        return self._set(outputCol=value)
+
+    def setPageNumCol(self, value):
+        """
+        Sets the value of :py:attr:`pageNumCol`.
+        """
+        return self._set(pageNumCol=value)
+
+    def setPartitionNum(self, value):
+        """
+        Sets the value of :py:attr:`partitionNum`.
+        """
+        return self._set(partitionNum=value)
+
+    def setStoreSplittedPdf(self, value):
+        """
+        Sets the value of :py:attr:`storeSplittedPdf`.
+        """
+        return self._set(storeSplittedPdf=value)
+
+    def setSplitPage(self, value):
+        """
+        Sets the value of :py:attr:`splitPage`.
+        """
+        return self._set(splitPage=value)
+
+    def setOnlyPageNum(self, value):
+        """
+        Sets the value of :py:attr:`onlyPageNum`.
+        """
+        return self._set(onlyPageNum=value)
+
+    def setTextStripper(self, value):
+        """
+        Sets the value of :py:attr:`textStripper`.
+        """
+        if isinstance(value, TextStripperType):
+            value = value.value
+        if value not in [i.value for i in TextStripperType]:
+            type_value = type(value)
+            raise ValueError(f"Param textStripper must be a 'TextStripperType' enum but got {type_value}.")
+        return self._set(textStripper=str(value))
+
+    def setSort(self, value):
+        """
+        Sets the value of :py:attr:`sort`.
+        """
+        return self._set(sort=value)
+
+    def setExtractCoordinates(self, value):
+        """
+        Sets the value of :py:attr:`extractCoordinates`.
+        """
+        return self._set(extractCoordinates=value)
+
+    def setNormalizeLigatures(self, value):
+        """
+        Sets the value of :py:attr:`normalizeLigatures`.
+        """
+        return self._set(normalizeLigatures=value)
sparknlp/reader/reader2doc.py
@@ -0,0 +1,124 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark import keyword_only
+
+from sparknlp.common import AnnotatorType
+from sparknlp.internal import AnnotatorTransformer
+from sparknlp.partition.partition_properties import *
+
+
+class Reader2Doc(
+    AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
+    HasEmailReaderProperties,
+    HasExcelReaderProperties,
+    HasPowerPointProperties,
+    HasTextReaderProperties
+):
+    """
+    The Reader2Doc annotator lets you plug file reading directly into existing
+    Spark NLP workflows, enabling seamless reuse of your pipelines.
+
+    Reader2Doc can be used for extracting structured content from various document types
+    using Spark NLP readers. It supports reading from many file types and returns parsed
+    output as a structured Spark DataFrame.
+
+    Supported formats include:
+
+    - Plain text
+    - HTML
+    - Word (.doc/.docx)
+    - Excel (.xls/.xlsx)
+    - PowerPoint (.ppt/.pptx)
+    - Email files (.eml, .msg)
+    - PDFs
+
+    Examples
+    --------
+    >>> from johnsnowlabs.reader import Reader2Doc
+    >>> from johnsnowlabs.nlp.base import DocumentAssembler
+    >>> from pyspark.ml import Pipeline
+    >>> # Initialize Reader2Doc for PDF files
+    >>> reader2doc = Reader2Doc() \\
+    ...     .setContentType("application/pdf") \\
+    ...     .setContentPath(f"{pdf_directory}/")
+    >>> # Build the pipeline with the Reader2Doc stage
+    >>> pipeline = Pipeline(stages=[reader2doc])
+    >>> # Fit the pipeline to an empty DataFrame
+    >>> pipeline_model = pipeline.fit(empty_data_set)
+    >>> result_df = pipeline_model.transform(empty_data_set)
+    >>> # Show the resulting DataFrame
+    >>> result_df.show()
+    +------------------------------------------------------------------------------------------------------------------------------------+
+    |document                                                                                                                            |
+    +------------------------------------------------------------------------------------------------------------------------------------+
+    |[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}]                 |
+    |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
+    |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
+    +------------------------------------------------------------------------------------------------------------------------------------+
+    """
+
+    name = "Reader2Doc"
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    excludeNonText = Param(
+        Params._dummy(),
+        "excludeNonText",
+        "Whether to exclude non-text content from the output. Default is False.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setExcludeNonText(self, value):
+        """Sets whether to exclude non-text content from the output.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to exclude non-text content from the output. Default is False.
+        """
+        return self._set(excludeNonText=value)
+
+    joinString = Param(
+        Params._dummy(),
+        "joinString",
+        "If outputAsDocument is true, specifies the string used to join elements into a single document.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setJoinString(self, value):
+        """
+        If outputAsDocument is true, sets the string used to join elements into a single document.
+        """
+        return self._set(joinString=value)
+
+    @keyword_only
+    def __init__(self):
+        super(Reader2Doc, self).__init__(classname="com.johnsnowlabs.reader.Reader2Doc")
+        self._setDefault(
+            outputCol="document",
+            explodeDocs=False,
+            contentType="",
+            flattenOutput=False,
+            outputAsDocument=True,
+            outputFormat="plain-text",
+            excludeNonText=False,
+            joinString="\n"
+        )
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
sparknlp/reader/reader2image.py
@@ -0,0 +1,136 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark import keyword_only
+from pyspark.ml.param import TypeConverters, Params, Param
+
+from sparknlp.common import AnnotatorType
+from sparknlp.internal import AnnotatorTransformer
+from sparknlp.partition.partition_properties import *
+
+class Reader2Image(
+    AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
+    HasPdfProperties
+):
+    """
+    The Reader2Image annotator lets you read files containing images directly within existing
+    Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Image can be used for
+    extracting structured image content from various document types using Spark NLP readers. It supports
+    reading from many file types and returns parsed output as a structured Spark DataFrame.
+
+    Supported formats include HTML and Markdown.
+
+    == Example ==
+    This example demonstrates how to load HTML files with images and process them into a structured
+    Spark DataFrame using Reader2Image.
+
+    Expected output:
+    +-------------------+--------------------+
+    |           fileName|               image|
+    +-------------------+--------------------+
+    |example-images.html|[{image, example-...|
+    |example-images.html|[{image, example-...|
+    +-------------------+--------------------+
+
+    Schema:
+    root
+     |-- fileName: string (nullable = true)
+     |-- image: array (nullable = false)
+     |    |-- element: struct (containsNull = true)
+     |    |    |-- annotatorType: string (nullable = true)
+     |    |    |-- origin: string (nullable = true)
+     |    |    |-- height: integer (nullable = false)
+     |    |    |-- width: integer (nullable = false)
+     |    |    |-- nChannels: integer (nullable = false)
+     |    |    |-- mode: integer (nullable = false)
+     |    |    |-- result: binary (nullable = true)
+     |    |    |-- metadata: map (nullable = true)
+     |    |    |    |-- key: string
+     |    |    |    |-- value: string (valueContainsNull = true)
+     |    |    |-- text: string (nullable = true)
+    """
+
+    name = "Reader2Image"
+    outputAnnotatorType = AnnotatorType.IMAGE
+
+    userMessage = Param(
+        Params._dummy(),
+        "userMessage",
+        "Custom user message.",
+        typeConverter=TypeConverters.toString
+    )
+
+    promptTemplate = Param(
+        Params._dummy(),
+        "promptTemplate",
+        "Format of the output prompt.",
+        typeConverter=TypeConverters.toString
+    )
+
+    customPromptTemplate = Param(
+        Params._dummy(),
+        "customPromptTemplate",
+        "Custom prompt template for image models.",
+        typeConverter=TypeConverters.toString
+    )
+
+    @keyword_only
+    def __init__(self):
+        super(Reader2Image, self).__init__(classname="com.johnsnowlabs.reader.Reader2Image")
+        self._setDefault(
+            contentType="",
+            outputFormat="image",
+            explodeDocs=True,
+            userMessage="Describe this image",
+            promptTemplate="qwen2vl-chat",
+            readAsImage=True,
+            customPromptTemplate="",
+            ignoreExceptions=True
+        )
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setUserMessage(self, value: str):
+        """Sets custom user message.
+
+        Parameters
+        ----------
+        value : str
+            Custom user message to include.
+        """
+        return self._set(userMessage=value)
+
+    def setPromptTemplate(self, value: str):
+        """Sets format of the output prompt.
+
+        Parameters
+        ----------
+        value : str
+            Prompt template format.
+        """
+        return self._set(promptTemplate=value)
+
+    def setCustomPromptTemplate(self, value: str):
+        """Sets custom prompt template for image models.
+
+        Parameters
+        ----------
+        value : str
+            Custom prompt template string.
+        """
+        return self._set(customPromptTemplate=value)
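
The Reader2Image docstring above describes the HTML-with-images flow but only shows the expected output. Below is a minimal, hedged usage sketch (not part of the diff): `html_directory` is a hypothetical folder of HTML files, and the setContentType/setContentPath setters are assumed to come from the shared reader-properties mixins, as in the Reader2Doc and ReaderAssembler examples elsewhere in this diff.

import sparknlp
from sparknlp.reader.reader2image import Reader2Image
from pyspark.ml import Pipeline

spark = sparknlp.start()

# Hypothetical input folder of HTML pages containing <img> tags.
reader2image = Reader2Image() \
    .setContentType("text/html") \
    .setContentPath("html_directory/")

# Readers are transformers, so an empty DataFrame is enough to fit the pipeline.
empty_df = spark.createDataFrame([], "text string")
pipeline_model = Pipeline(stages=[reader2image]).fit(empty_df)
result_df = pipeline_model.transform(empty_df)

# One row per extracted image, matching the fileName/image schema shown in the docstring.
result_df.select("fileName", "image").show()
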
sparknlp/reader/reader2table.py
@@ -0,0 +1,44 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark import keyword_only
+
+from sparknlp.common import AnnotatorType
+from sparknlp.internal import AnnotatorTransformer
+from sparknlp.partition.partition_properties import *
+
+
+class Reader2Table(
+    AnnotatorTransformer,
+    HasReaderProperties,
+    HasEmailReaderProperties,
+    HasExcelReaderProperties,
+    HasHTMLReaderProperties,
+    HasPowerPointProperties,
+    HasTextReaderProperties
+):
+    name = 'Reader2Table'
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    @keyword_only
+    def __init__(self):
+        super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
+        self._setDefault(outputCol="document", outputFormat="json-table", inferTableStructure=True,
+                         outputAsDocument=False)
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
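
Reader2Table ships without a usage docstring in this hunk, so here is a small, hedged sketch of how it could be used (not part of the diff). It mirrors the Reader2Doc and ReaderAssembler examples and assumes a hypothetical `table_directory` of HTML files containing tables, plus the setContentType/setContentPath setters provided by the shared reader-properties mixins; with the defaults set in __init__, parsed tables land as JSON in the "document" column.

import sparknlp
from sparknlp.reader.reader2table import Reader2Table
from pyspark.ml import Pipeline

spark = sparknlp.start()

# Hypothetical folder of HTML files containing <table> elements.
reader2table = Reader2Table() \
    .setContentType("text/html") \
    .setContentPath("table_directory/")

empty_df = spark.createDataFrame([], "text string")
result_df = Pipeline(stages=[reader2table]).fit(empty_df).transform(empty_df)

# Each row holds a table serialized as JSON (outputFormat="json-table", inferTableStructure=True).
result_df.select("document").show(truncate=False)
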
sparknlp/reader/reader_assembler.py
@@ -0,0 +1,159 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark import keyword_only
+
+from sparknlp.common import AnnotatorType
+from sparknlp.internal import AnnotatorTransformer
+from sparknlp.partition.partition_properties import *
+
+class ReaderAssembler(
+    AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
+    HasEmailReaderProperties,
+    HasExcelReaderProperties,
+    HasPowerPointProperties,
+    HasTextReaderProperties,
+    HasPdfProperties
+):
+    """
+    The ReaderAssembler annotator provides a unified interface for combining multiple Spark NLP
+    readers (such as Reader2Doc, Reader2Table, and Reader2Image) into a single, configurable
+    component. It automatically orchestrates the execution of different readers based on input type,
+    configured priorities, and fallback strategies, allowing you to handle diverse content formats
+    without manually chaining multiple readers in your pipeline.
+
+    ReaderAssembler simplifies the process of building flexible pipelines capable of ingesting and
+    processing documents, tables, and images in a consistent way. It handles reader selection,
+    ordering, and fault-tolerance internally, ensuring that pipelines remain concise, robust, and
+    easy to maintain.
+
+    Examples
+    --------
+    >>> from johnsnowlabs.reader import ReaderAssembler
+    >>> from pyspark.ml import Pipeline
+    >>>
+    >>> reader_assembler = ReaderAssembler() \\
+    ...     .setContentType("text/html") \\
+    ...     .setContentPath("/table-image.html") \\
+    ...     .setOutputCol("document")
+    >>>
+    >>> pipeline = Pipeline(stages=[reader_assembler])
+    >>> pipeline_model = pipeline.fit(empty_data_set)
+    >>> result_df = pipeline_model.transform(empty_data_set)
+    >>>
+    >>> result_df.show()
+    +--------+--------------------+--------------------+--------------------+---------+
+    |fileName|       document_text|      document_table|      document_image|exception|
+    +--------+--------------------+--------------------+--------------------+---------+
+    |    null|[{'document', 0, 26...|[{'document', 0, 50...|[{'image', , 5, 5, ...|     null|
+    +--------+--------------------+--------------------+--------------------+---------+
+
+    This annotator is especially useful when working with heterogeneous input data — for example,
+    when a dataset includes PDFs, spreadsheets, and images — allowing Spark NLP to automatically
+    invoke the appropriate reader for each file type while preserving a unified schema in the output.
+    """
+
+
+    name = 'ReaderAssembler'
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    excludeNonText = Param(
+        Params._dummy(),
+        "excludeNonText",
+        "Whether to exclude non-text content from the output. Default is False.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    userMessage = Param(
+        Params._dummy(),
+        "userMessage",
+        "Custom user message.",
+        typeConverter=TypeConverters.toString
+    )
+
+    promptTemplate = Param(
+        Params._dummy(),
+        "promptTemplate",
+        "Format of the output prompt.",
+        typeConverter=TypeConverters.toString
+    )
+
+    customPromptTemplate = Param(
+        Params._dummy(),
+        "customPromptTemplate",
+        "Custom prompt template for image models.",
+        typeConverter=TypeConverters.toString
+    )
+
+    @keyword_only
+    def __init__(self):
+        super(ReaderAssembler, self).__init__(classname="com.johnsnowlabs.reader.ReaderAssembler")
+        self._setDefault(contentType="",
+                         explodeDocs=False,
+                         userMessage="Describe this image",
+                         promptTemplate="qwen2vl-chat",
+                         readAsImage=True,
+                         customPromptTemplate="",
+                         ignoreExceptions=True,
+                         flattenOutput=False,
+                         titleThreshold=18)
+
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setExcludeNonText(self, value):
+        """Sets whether to exclude non-text content from the output.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to exclude non-text content from the output. Default is False.
+        """
+        return self._set(excludeNonText=value)
+
+    def setUserMessage(self, value: str):
+        """Sets custom user message.
+
+        Parameters
+        ----------
+        value : str
+            Custom user message to include.
+        """
+        return self._set(userMessage=value)
+
+    def setPromptTemplate(self, value: str):
+        """Sets format of the output prompt.
+
+        Parameters
+        ----------
+        value : str
+            Prompt template format.
+        """
+        return self._set(promptTemplate=value)
+
+    def setCustomPromptTemplate(self, value: str):
+        """Sets custom prompt template for image models.
+
+        Parameters
+        ----------
+        value : str
+            Custom prompt template string.
+        """
+        return self._set(customPromptTemplate=value)