spark-nlp 6.0.2__tar.gz → 6.0.4__tar.gz
This diff compares the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/PKG-INFO +5 -5
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/README.md +4 -4
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/setup.py +1 -1
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/spark_nlp.egg-info/PKG-INFO +5 -5
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/spark_nlp.egg-info/SOURCES.txt +3 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/__init__.py +1 -1
- spark_nlp-6.0.4/sparknlp/annotator/dataframe_optimizer.py +216 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/__init__.py +2 -0
- spark_nlp-6.0.4/sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- spark_nlp-6.0.4/sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/internal/__init__.py +18 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/partition/partition_properties.py +63 -1
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/partition/partition_transformer.py +11 -7
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/reader/pdf_to_text.py +34 -1
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/reader/sparknlp_reader.py +45 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/util.py +26 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/com/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/com/johnsnowlabs/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/com/johnsnowlabs/ml/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/com/johnsnowlabs/ml/ai/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/com/johnsnowlabs/nlp/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/setup.cfg +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/spark_nlp.egg-info/dependency_links.txt +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/spark_nlp.egg-info/top_level.txt +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotation.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotation_audio.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotation_image.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/audio/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/audio/hubert_for_ctc.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/audio/wav2vec2_for_ctc.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/audio/whisper_for_ctc.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/chunk2_doc.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/chunker.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/albert_for_question_answering.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/albert_for_token_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/bert_for_question_answering.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/bert_for_token_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/classifier_dl.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/multi_classifier_dl.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/sentiment_dl.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cleaners/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cleaners/cleaner.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cleaners/extractor.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/coref/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/coref/spanbert_coref.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/blip_for_question_answering.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/clip_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/convnext_for_image_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/florence2_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/gemma3_for_multimodal.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/internvl_for_multimodal.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/janus_for_multimodal.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/llava_for_multimodal.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/mllama_for_multimodal.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/paligemma_for_multimodal.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/phi3_vision_for_multimodal.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/qwen2vl_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/smolvlm_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/swin_for_image_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/cv/vit_for_image_classification.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/date2_chunk.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/dependency/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/dependency/dependency_parser.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/dependency/typed_dependency_parser.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/document_character_text_splitter.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/document_normalizer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/document_token_splitter.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/document_token_splitter_test.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/albert_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/auto_gguf_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/bert_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/bert_sentence_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/bge_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/camembert_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/chunk_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/deberta_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/distil_bert_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/doc2vec.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/e5_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/elmo_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/instructor_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/longformer_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/mpnet_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/mxbai_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/nomic_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/roberta_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/sentence_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/snowflake_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/uae_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/universal_sentence_encoder.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/word2vec.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/word_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/xlnet_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/er/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/er/entity_ruler.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/graph_extraction.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/keyword_extraction/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/ld_dl/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/ld_dl/language_detector_dl.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/lemmatizer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/matcher/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/matcher/big_text_matcher.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/matcher/date_matcher.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/matcher/multi_date_matcher.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/matcher/regex_matcher.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/matcher/text_matcher.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/n_gram_generator.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/ner/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/ner/ner_approach.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/ner/ner_converter.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/ner/ner_crf.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/ner/ner_dl.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/ner/ner_overwriter.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/ner/zero_shot_ner_model.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/normalizer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/openai/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/openai/openai_completion.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/openai/openai_embeddings.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/param/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/param/classifier_encoder.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/param/evaluation_dl_params.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/pos/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/pos/perceptron.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/sentence/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/sentence/sentence_detector.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/sentence/sentence_detector_dl.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/sentiment/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/sentiment/sentiment_detector.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/sentiment/vivekn_sentiment.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/auto_gguf_model.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/bart_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/cohere_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/cpm_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/gpt2_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/llama2_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/llama3_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/m2m100_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/marian_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/mistral_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/nllb_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/olmo_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/phi2_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/phi3_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/qwen_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/starcoder_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/seq2seq/t5_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/similarity/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/similarity/document_similarity_ranker.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/spell_check/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/spell_check/context_spell_checker.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/spell_check/norvig_sweeting.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/spell_check/symmetric_delete.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/stemmer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/stop_words_cleaner.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/tf_ner_dl_graph_builder.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/token/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/token/chunk_tokenizer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/token/recursive_tokenizer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/token/regex_tokenizer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/token/tokenizer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/token2_chunk.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/ws/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/ws/word_segmenter.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/audio_assembler.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/doc2_chunk.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/document_assembler.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/embeddings_finisher.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/finisher.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/graph_finisher.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/has_recursive_fit.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/has_recursive_transform.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/image_assembler.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/light_pipeline.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/multi_document_assembler.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/prompt_assembler.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/recursive_pipeline.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/table_assembler.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/base/token_assembler.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/annotator_approach.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/annotator_model.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/annotator_properties.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/annotator_type.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/coverage_result.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/match_strategy.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/properties.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/read_as.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/recursive_annotator_approach.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/storage.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/common/utils.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/functions.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/internal/annotator_java_ml.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/internal/annotator_transformer.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/internal/extended_java_wrapper.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/internal/params_getters_setters.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/internal/recursive.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/logging/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/logging/comet.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/partition/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/partition/partition.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/pretrained/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/pretrained/pretrained_pipeline.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/pretrained/resource_downloader.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/pretrained/utils.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/reader/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/reader/enums.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/graph_builders.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders_1x/graph_builders.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/conll.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/conllu.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/pos.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/pub_tator.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/spacy_to_annotation.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/training/tfgraphs.py +0 -0
- {spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/upload_to_hub.py +0 -0
{spark_nlp-6.0.2 → spark_nlp-6.0.4}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spark-nlp
-Version: 6.0.2
+Version: 6.0.4
 Summary: John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment.
 Home-page: https://github.com/JohnSnowLabs/spark-nlp
 Author: John Snow Labs
@@ -102,7 +102,7 @@ $ java -version
 $ conda create -n sparknlp python=3.7 -y
 $ conda activate sparknlp
 # spark-nlp by default is based on pyspark 3.x
-$ pip install spark-nlp==6.0.2 pyspark==3.3.1
+$ pip install spark-nlp==6.0.4 pyspark==3.3.1
 ```
 
 In Python console or Jupyter `Python3` kernel:
@@ -168,7 +168,7 @@ For a quick example of using pipelines and models take a look at our official [d
 
 ### Apache Spark Support
 
-Spark NLP *6.0.2* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
+Spark NLP *6.0.4* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
 
 | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
 |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
@@ -198,7 +198,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
 
 ### Databricks Support
 
-Spark NLP 6.0.2 has been tested and is compatible with the following runtimes:
+Spark NLP 6.0.4 has been tested and is compatible with the following runtimes:
 
 | **CPU** | **GPU** |
 |--------------------|--------------------|
@@ -215,7 +215,7 @@ We are compatible with older runtimes. For a full list check databricks support
 
 ### EMR Support
 
-Spark NLP 6.0.2 has been tested and is compatible with the following EMR releases:
+Spark NLP 6.0.4 has been tested and is compatible with the following EMR releases:
 
 | **EMR Release** |
 |--------------------|
````
{spark_nlp-6.0.2 → spark_nlp-6.0.4}/README.md

````diff
@@ -63,7 +63,7 @@ $ java -version
 $ conda create -n sparknlp python=3.7 -y
 $ conda activate sparknlp
 # spark-nlp by default is based on pyspark 3.x
-$ pip install spark-nlp==6.0.2 pyspark==3.3.1
+$ pip install spark-nlp==6.0.4 pyspark==3.3.1
 ```
 
 In Python console or Jupyter `Python3` kernel:
@@ -129,7 +129,7 @@ For a quick example of using pipelines and models take a look at our official [d
 
 ### Apache Spark Support
 
-Spark NLP *6.0.2* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
+Spark NLP *6.0.4* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
 
 | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
 |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
@@ -159,7 +159,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
 
 ### Databricks Support
 
-Spark NLP 6.0.2 has been tested and is compatible with the following runtimes:
+Spark NLP 6.0.4 has been tested and is compatible with the following runtimes:
 
 | **CPU** | **GPU** |
 |--------------------|--------------------|
@@ -176,7 +176,7 @@ We are compatible with older runtimes. For a full list check databricks support
 
 ### EMR Support
 
-Spark NLP 6.0.2 has been tested and is compatible with the following EMR releases:
+Spark NLP 6.0.4 has been tested and is compatible with the following EMR releases:
 
 | **EMR Release** |
 |--------------------|
````
{spark_nlp-6.0.2 → spark_nlp-6.0.4}/setup.py

```diff
@@ -41,7 +41,7 @@ setup(
     # project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
 
-    version='6.0.2', # Required
+    version='6.0.4', # Required
 
     # This is a one-line description or tagline of what your project does. This
     # corresponds to the 'Summary' metadata field:
```
{spark_nlp-6.0.2 → spark_nlp-6.0.4}/spark_nlp.egg-info/PKG-INFO

Identical to the `PKG-INFO` diff above — the egg-info file is a verbatim copy of the package metadata, so the same five hunks apply.
{spark_nlp-6.0.2 → spark_nlp-6.0.4}/spark_nlp.egg-info/SOURCES.txt

```diff
@@ -20,6 +20,7 @@ sparknlp/util.py
 sparknlp/annotator/__init__.py
 sparknlp/annotator/chunk2_doc.py
 sparknlp/annotator/chunker.py
+sparknlp/annotator/dataframe_optimizer.py
 sparknlp/annotator/date2_chunk.py
 sparknlp/annotator/document_character_text_splitter.py
 sparknlp/annotator/document_normalizer.py
@@ -121,9 +122,11 @@ sparknlp/annotator/embeddings/deberta_embeddings.py
 sparknlp/annotator/embeddings/distil_bert_embeddings.py
 sparknlp/annotator/embeddings/doc2vec.py
 sparknlp/annotator/embeddings/e5_embeddings.py
+sparknlp/annotator/embeddings/e5v_embeddings.py
 sparknlp/annotator/embeddings/elmo_embeddings.py
 sparknlp/annotator/embeddings/instructor_embeddings.py
 sparknlp/annotator/embeddings/longformer_embeddings.py
+sparknlp/annotator/embeddings/minilm_embeddings.py
 sparknlp/annotator/embeddings/mpnet_embeddings.py
 sparknlp/annotator/embeddings/mxbai_embeddings.py
 sparknlp/annotator/embeddings/nomic_embeddings.py
```
spark_nlp-6.0.4/sparknlp/annotator/dataframe_optimizer.py (new file, +216)

```python
# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pyspark.ml import Transformer
from pyspark.ml.param.shared import *
from pyspark.sql import DataFrame
from typing import Any

# Custom converter for string-to-string dictionaries
def toStringDict(value):
    if not isinstance(value, dict):
        raise TypeError("Expected a dictionary of strings.")
    return {str(k): str(v) for k, v in value.items()}

class DataFrameOptimizer(Transformer):
    """
    Optimizes a Spark DataFrame by repartitioning, optionally caching, and persisting it to disk.

    This transformer is intended to improve performance for Spark NLP pipelines or when preparing
    data for export. It allows partition tuning via `numPartitions` directly, or indirectly using
    `executorCores` and `numWorkers`. The DataFrame can also be persisted in a specified format
    (`csv`, `json`, or `parquet`) with additional writer options.

    Parameters
    ----------
    executorCores : int, optional
        Number of cores per Spark executor (used to compute number of partitions if `numPartitions` is not set).
    numWorkers : int, optional
        Number of executor nodes (used to compute number of partitions if `numPartitions` is not set).
    numPartitions : int, optional
        Target number of partitions for the DataFrame (overrides calculation via cores × workers).
    doCache : bool, default False
        Whether to cache the DataFrame after repartitioning.
    persistPath : str, optional
        Path to save the DataFrame output (if persistence is enabled).
    persistFormat : str, optional
        Format to persist the DataFrame in: one of `'csv'`, `'json'`, or `'parquet'`.
    outputOptions : dict, optional
        Dictionary of options for the DataFrameWriter (e.g., `{"compression": "snappy"}` for parquet).

    Examples
    --------
    >>> optimizer = DataFrameOptimizer() \\
    ...     .setExecutorCores(4) \\
    ...     .setNumWorkers(5) \\
    ...     .setDoCache(True) \\
    ...     .setPersistPath("/tmp/out") \\
    ...     .setPersistFormat("parquet") \\
    ...     .setOutputOptions({"compression": "snappy"})

    >>> optimized_df = optimizer.transform(input_df)

    Notes
    -----
    - You must specify either `numPartitions`, or both `executorCores` and `numWorkers`.
    - Schema is preserved; no columns are modified or removed.
    """

    executorCores = Param(
        Params._dummy(),
        "executorCores",
        "Number of cores per executor",
        typeConverter=TypeConverters.toInt
    )
    numWorkers = Param(
        Params._dummy(),
        "numWorkers",
        "Number of Spark workers",
        typeConverter=TypeConverters.toInt
    )
    numPartitions = Param(
        Params._dummy(),
        "numPartitions",
        "Total number of partitions (overrides executorCores * numWorkers)",
        typeConverter=TypeConverters.toInt
    )
    doCache = Param(
        Params._dummy(),
        "doCache",
        "Whether to cache the DataFrame",
        typeConverter=TypeConverters.toBoolean
    )

    persistPath = Param(
        Params._dummy(),
        "persistPath",
        "Optional path to persist the DataFrame",
        typeConverter=TypeConverters.toString
    )
    persistFormat = Param(
        Params._dummy(),
        "persistFormat",
        "Format to persist: parquet, json, csv",
        typeConverter=TypeConverters.toString
    )

    outputOptions = Param(
        Params._dummy(),
        "outputOptions",
        "Additional writer options",
        typeConverter=toStringDict
    )

    def __init__(self):
        super().__init__()
        self._setDefault(
            doCache=False,
            persistFormat="none",
            numPartitions=1,
            executorCores=1,
            numWorkers=1
        )

    # Parameter setters
    def setExecutorCores(self, value: int):
        """Set the number of executor cores."""
        return self._set(executorCores=value)

    def setNumWorkers(self, value: int):
        """Set the number of Spark workers."""
        return self._set(numWorkers=value)

    def setNumPartitions(self, value: int):
        """Set the total number of partitions (overrides cores * workers)."""
        return self._set(numPartitions=value)

    def setDoCache(self, value: bool):
        """Set whether to cache the DataFrame."""
        return self._set(doCache=value)

    def setPersistPath(self, value: str):
        """Set the path where the DataFrame should be persisted."""
        return self._set(persistPath=value)

    def setPersistFormat(self, value: str):
        """Set the format to persist the DataFrame (parquet, json, csv)."""
        return self._set(persistFormat=value)

    def setOutputOptions(self, value: dict):
        """Set additional writer options (e.g. for csv headers)."""
        return self._set(outputOptions=value)

    # Optional bulk setter
    def setParams(self, **kwargs: Any):
        for param, value in kwargs.items():
            self._set(**{param: value})
        return self

    def _transform(self, dataset: DataFrame) -> DataFrame:
        self._validate_params()
        part_count = self.getOrDefault(self.numPartitions)
        cores = self.getOrDefault(self.executorCores)
        workers = self.getOrDefault(self.numWorkers)
        if cores is None or workers is None:
            raise ValueError("Provide either numPartitions or both executorCores and numWorkers")
        if part_count == 1:
            part_count = cores * workers

        optimized_df = dataset.repartition(part_count)

        if self.getOrDefault(self.doCache):
            optimized_df = optimized_df.cache()

        format = self.getOrDefault(self.persistFormat).lower()
        if format != "none":
            path = self.getOrDefault(self.persistPath)
            if not path:
                raise ValueError("persistPath must be set when persistFormat is not 'none'")
            writer = optimized_df.write.mode("overwrite")
            if self.isDefined(self.outputOptions):
                writer = writer.options(**self.getOrDefault(self.outputOptions))
            if format == "parquet":
                writer.parquet(path)
            elif format == "json":
                writer.json(path)
            elif format == "csv":
                writer.csv(path)
            else:
                raise ValueError(f"Unsupported format: {format}")

        return optimized_df

    def _validate_params(self):
        if self.isDefined(self.executorCores):
            val = self.getOrDefault(self.executorCores)
            if val <= 0:
                raise ValueError("executorCores must be > 0")

        if self.isDefined(self.numWorkers):
            val = self.getOrDefault(self.numWorkers)
            if val <= 0:
                raise ValueError("numWorkers must be > 0")

        if self.isDefined(self.numPartitions):
            val = self.getOrDefault(self.numPartitions)
            if val <= 0:
                raise ValueError("numPartitions must be > 0")

        if self.isDefined(self.persistPath) and not self.isDefined(self.persistFormat):
            raise ValueError("persistFormat must be defined when persistPath is set")
```
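Since `DataFrameOptimizer` is a plain `pyspark.ml.Transformer`, it can sit ahead of other stages in a `Pipeline`. Below is a minimal sketch of that, assuming a running SparkSession with Spark NLP 6.0.4 installed; the cluster sizing values are illustrative, and the direct module import path is taken from SOURCES.txt above (whether the class is also re-exported from the package root is not shown in this diff).

```python
# Minimal sketch (assumptions noted above): repartition a DataFrame
# before a Spark NLP stage. Core/worker counts are illustrative.
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator.dataframe_optimizer import DataFrameOptimizer

spark = sparknlp.start()
input_df = spark.createDataFrame([("Spark NLP rocks.",)], ["text"])

# numPartitions is left at its default, so partitions = cores * workers.
optimizer = DataFrameOptimizer() \
    .setExecutorCores(4) \
    .setNumWorkers(2) \
    .setDoCache(True)

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

pipeline = Pipeline(stages=[optimizer, documentAssembler])
result = pipeline.fit(input_df).transform(input_df)
print(result.rdd.getNumPartitions())  # 4 * 2 = 8 partitions
```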
{spark_nlp-6.0.2 → spark_nlp-6.0.4}/sparknlp/annotator/embeddings/__init__.py

```diff
@@ -25,6 +25,7 @@ from sparknlp.annotator.embeddings.elmo_embeddings import *
 from sparknlp.annotator.embeddings.e5_embeddings import *
 from sparknlp.annotator.embeddings.instructor_embeddings import *
 from sparknlp.annotator.embeddings.longformer_embeddings import *
+from sparknlp.annotator.embeddings.minilm_embeddings import *
 from sparknlp.annotator.embeddings.mpnet_embeddings import *
 from sparknlp.annotator.embeddings.roberta_embeddings import *
 from sparknlp.annotator.embeddings.roberta_sentence_embeddings import *
@@ -41,3 +42,4 @@ from sparknlp.annotator.embeddings.mxbai_embeddings import *
 from sparknlp.annotator.embeddings.snowflake_embeddings import *
 from sparknlp.annotator.embeddings.nomic_embeddings import *
 from sparknlp.annotator.embeddings.auto_gguf_embeddings import *
+from sparknlp.annotator.embeddings.e5v_embeddings import *
```
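These wildcard imports are what surface the new annotators to user code. A quick sanity check follows; it assumes the top-level `sparknlp/annotator/__init__.py` re-exports the embeddings package (as in prior releases, and consistent with it being unchanged in this diff), and it assumes `minilm_embeddings.py` — whose content is not shown in this excerpt — defines a `MiniLMEmbeddings` class.

```python
# Hypothetical sanity check; class names per the assumptions above.
from sparknlp.annotator import E5VEmbeddings, MiniLMEmbeddings

print(E5VEmbeddings.name)  # "E5VEmbeddings", per the new module below
```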
spark_nlp-6.0.4/sparknlp/annotator/embeddings/e5v_embeddings.py (new file, +138)

```python
# Copyright 2017-2024 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sparknlp.common import *

class E5VEmbeddings(AnnotatorModel,
                    HasBatchedAnnotateImage,
                    HasImageFeatureProperties,
                    HasEngine,
                    HasRescaleFactor):
    """Universal multimodal embeddings using the E5-V model (see https://huggingface.co/royokong/e5-v).

    E5-V bridges the modality gap between different input types (text, image) and demonstrates strong
    performance in multimodal embeddings, even without fine-tuning. It also supports a single-modality
    training approach, where the model is trained exclusively on text pairs, often yielding better
    performance than multimodal training.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion object:

    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
    ...     .setInputCols(["image_assembler"]) \
    ...     .setOutputCol("e5v")

    The default model is ``"e5v_int4"``, if no name is provided.

    For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Question+Answering>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``IMAGE``              ``SENTENCE_EMBEDDINGS``
    ====================== ======================

    Examples
    --------
    Image + Text Embedding:

    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> image_df = spark.read.format("image").option("dropInvalid", value = True).load(imageFolder)
    >>> imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
    >>> test_df = image_df.withColumn("text", lit(imagePrompt))
    >>> imageAssembler = ImageAssembler() \
    ...     .setInputCol("image") \
    ...     .setOutputCol("image_assembler")
    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
    ...     .setInputCols(["image_assembler"]) \
    ...     .setOutputCol("e5v")
    >>> pipeline = Pipeline().setStages([
    ...     imageAssembler,
    ...     e5vEmbeddings
    ... ])
    >>> result = pipeline.fit(test_df).transform(test_df)
    >>> result.select("e5v.embeddings").show(truncate = False)

    Text-Only Embedding:

    >>> from sparknlp.util import EmbeddingsDataFrameUtils
    >>> textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
    >>> textDesc = "A cat sitting in a box."
    >>> nullImageDF = spark.createDataFrame(spark.sparkContext.parallelize([EmbeddingsDataFrameUtils.emptyImageRow]), EmbeddingsDataFrameUtils.imageSchema)
    >>> textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))
    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
    ...     .setInputCols(["image"]) \
    ...     .setOutputCol("e5v")
    >>> result = e5vEmbeddings.transform(textDF)
    >>> result.select("e5v.embeddings").show(truncate = False)
    """

    name = "E5VEmbeddings"

    inputAnnotatorTypes = [AnnotatorType.IMAGE]
    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.E5VEmbeddings", java_model=None):
        """Initializes the E5VEmbeddings annotator.

        Parameters
        ----------
        classname : str, optional
            The Java class name of the annotator, by default "com.johnsnowlabs.nlp.embeddings.E5VEmbeddings"
        java_model : Optional[java.lang.Object], optional
            A pre-initialized Java model, by default None
        """
        super(E5VEmbeddings, self).__init__(classname=classname, java_model=java_model)
        self._setDefault()

    @staticmethod
    def loadSavedModel(folder, spark_session, use_openvino=False):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession
        use_openvino : bool, optional
            Whether to use OpenVINO engine, by default False

        Returns
        -------
        E5VEmbeddings
            The restored model
        """
        from sparknlp.internal import _E5VEmbeddingsLoader
        jModel = _E5VEmbeddingsLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
        return E5VEmbeddings(java_model=jModel)

    @staticmethod
    def pretrained(name="e5v_int4", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "e5v_int4"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use Spark NLP's repositories otherwise.

        Returns
        -------
        E5VEmbeddings
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(E5VEmbeddings, name, lang, remote_loc)
```