spark-nlp 5.2.2.tar.gz → 5.3.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of spark-nlp might be problematic.
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/PKG-INFO +89 -82
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/README.md +88 -81
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/setup.py +1 -1
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/spark_nlp.egg-info/PKG-INFO +89 -82
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/spark_nlp.egg-info/SOURCES.txt +5 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/__init__.py +2 -2
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/__init__.py +4 -1
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +3 -3
- spark-nlp-5.3.0/sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +206 -0
- spark-nlp-5.3.0/sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- spark-nlp-5.3.0/sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/er/entity_ruler.py +1 -1
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/seq2seq/__init__.py +2 -0
- spark-nlp-5.3.0/sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- spark-nlp-5.3.0/sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/similarity/document_similarity_ranker.py +19 -1
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/finisher.py +1 -1
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/light_pipeline.py +1 -1
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/multi_document_assembler.py +1 -1
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/internal/__init__.py +33 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/com/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/com/johnsnowlabs/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/com/johnsnowlabs/nlp/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/setup.cfg +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/spark_nlp.egg-info/dependency_links.txt +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/spark_nlp.egg-info/top_level.txt +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotation.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotation_audio.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotation_image.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/audio/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/audio/hubert_for_ctc.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/audio/wav2vec2_for_ctc.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/audio/whisper_for_ctc.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/chunk2_doc.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/chunker.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/albert_for_question_answering.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/albert_for_token_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/bert_for_question_answering.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/bert_for_token_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/classifier_dl.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/multi_classifier_dl.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/sentiment_dl.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/coref/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/coref/spanbert_coref.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/cv/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/cv/clip_for_zero_shot_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/cv/convnext_for_image_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/cv/swin_for_image_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/cv/vit_for_image_classification.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/date2_chunk.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/dependency/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/dependency/dependency_parser.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/dependency/typed_dependency_parser.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/document_character_text_splitter.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/document_normalizer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/document_token_splitter.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/document_token_splitter_test.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/albert_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/bert_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/bert_sentence_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/bge_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/camembert_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/chunk_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/deberta_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/distil_bert_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/doc2vec.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/e5_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/elmo_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/instructor_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/longformer_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/mpnet_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/roberta_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/sentence_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/universal_sentence_encoder.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/word2vec.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/word_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/embeddings/xlnet_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/er/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/graph_extraction.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/keyword_extraction/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/ld_dl/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/ld_dl/language_detector_dl.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/lemmatizer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/matcher/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/matcher/big_text_matcher.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/matcher/date_matcher.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/matcher/multi_date_matcher.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/matcher/regex_matcher.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/matcher/text_matcher.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/n_gram_generator.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/ner/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/ner/ner_approach.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/ner/ner_converter.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/ner/ner_crf.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/ner/ner_dl.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/ner/ner_overwriter.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/ner/zero_shot_ner_model.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/normalizer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/openai/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/openai/openai_completion.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/openai/openai_embeddings.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/param/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/param/classifier_encoder.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/param/evaluation_dl_params.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/pos/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/pos/perceptron.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/sentence/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/sentence/sentence_detector.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/sentence/sentence_detector_dl.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/sentiment/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/sentiment/sentiment_detector.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/sentiment/vivekn_sentiment.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/seq2seq/bart_transformer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/seq2seq/gpt2_transformer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/seq2seq/marian_transformer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/seq2seq/t5_transformer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/similarity/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/spell_check/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/spell_check/context_spell_checker.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/spell_check/norvig_sweeting.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/spell_check/symmetric_delete.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/stemmer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/stop_words_cleaner.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/token/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/token/chunk_tokenizer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/token/recursive_tokenizer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/token/regex_tokenizer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/token/tokenizer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/token2_chunk.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/ws/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/annotator/ws/word_segmenter.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/audio_assembler.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/doc2_chunk.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/document_assembler.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/embeddings_finisher.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/graph_finisher.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/has_recursive_fit.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/has_recursive_transform.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/image_assembler.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/recursive_pipeline.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/table_assembler.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/base/token_assembler.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/annotator_approach.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/annotator_model.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/annotator_properties.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/annotator_type.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/coverage_result.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/match_strategy.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/properties.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/read_as.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/recursive_annotator_approach.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/storage.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/common/utils.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/functions.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/internal/annotator_java_ml.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/internal/annotator_transformer.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/internal/extended_java_wrapper.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/internal/params_getters_setters.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/internal/recursive.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/logging/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/logging/comet.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/pretrained/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/pretrained/pretrained_pipeline.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/pretrained/resource_downloader.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/pretrained/utils.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/graph_builders.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders_1x/graph_builders.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/conll.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/conllu.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/pos.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/pub_tator.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/spacy_to_annotation.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/training/tfgraphs.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/upload_to_hub.py +0 -0
- {spark-nlp-5.2.2 → spark-nlp-5.3.0}/sparknlp/util.py +0 -0
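
Five new Python annotator modules ship in this release: deberta_for_zero_shot_classification.py, mpnet_for_question_answering.py, mpnet_for_sequence_classification.py, llama2_transformer.py, and m2m100_transformer.py. As a minimal sketch of how one of them slots into a pipeline — the class name is inferred from the new file name, and the no-argument `pretrained()` default is an assumption to verify against the 5.3.0 documentation and Models Hub:

```python
# Hedged sketch: MPNet extractive question answering in Spark NLP 5.3.0.
# Mirrors the established *ForQuestionAnswering pattern; verify the class
# and pretrained model names against the official 5.3.0 docs.
from pyspark.ml import Pipeline
import sparknlp
from sparknlp.base import MultiDocumentAssembler
from sparknlp.annotator import MPNetForQuestionAnswering

spark = sparknlp.start()

# Question answering annotators take a (question, context) pair of documents.
document_assembler = MultiDocumentAssembler() \
    .setInputCols(["question", "context"]) \
    .setOutputCols(["document_question", "document_context"])

span_classifier = MPNetForQuestionAnswering.pretrained() \
    .setInputCols(["document_question", "document_context"]) \
    .setOutputCol("answer")

pipeline = Pipeline(stages=[document_assembler, span_classifier])

data = spark.createDataFrame(
    [["What is my name?", "My name is Clara and I live in Berkeley."]]
).toDF("question", "context")

pipeline.fit(data).transform(data).select("answer.result").show(truncate=False)
```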
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: spark-nlp
-Version: 5.2.2
+Version: 5.3.0
 Summary: John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment.
 Home-page: https://github.com/JohnSnowLabs/spark-nlp
 Author: John Snow Labs
@@ -51,10 +51,10 @@ Description-Content-Type: text/markdown
 
 Spark NLP is a state-of-the-art Natural Language Processing library built on top of Apache Spark. It provides **simple**, **performant** & **accurate** NLP annotations for machine learning pipelines that **scale** easily in a distributed
 environment.
-Spark NLP comes with **
+Spark NLP comes with **36000+** pretrained **pipelines** and **models** in more than **200+** languages.
 It also offers tasks such as **Tokenization**, **Word Segmentation**, **Part-of-Speech Tagging**, Word and Sentence **Embeddings**, **Named Entity Recognition**, **Dependency Parsing**, **Spell Checking**, **Text Classification**, **Sentiment Analysis**, **Token Classification**, **Machine Translation** (+180 languages), **Summarization**, **Question Answering**, **Table Question Answering**, **Text Generation**, **Image Classification**, **Image to Text (captioning)**, **Automatic Speech Recognition**, **Zero-Shot Learning**, and many more [NLP tasks](#features).
 
-**Spark NLP** is the only open-source NLP library in **production** that offers state-of-the-art transformers such as **BERT**, **CamemBERT**, **ALBERT**, **ELECTRA**, **XLNet**, **DistilBERT**, **RoBERTa**, **DeBERTa**, **XLM-RoBERTa**, **Longformer**, **ELMO**, **Universal Sentence Encoder**, **
+**Spark NLP** is the only open-source NLP library in **production** that offers state-of-the-art transformers such as **BERT**, **CamemBERT**, **ALBERT**, **ELECTRA**, **XLNet**, **DistilBERT**, **RoBERTa**, **DeBERTa**, **XLM-RoBERTa**, **Longformer**, **ELMO**, **Universal Sentence Encoder**, **Llama-2**, **M2M100**, **BART**, **Instructor**, **E5**, **Google T5**, **MarianMT**, **OpenAI GPT2**, **Vision Transformers (ViT)**, **OpenAI Whisper**, and many more not only to **Python** and **R**, but also to **JVM** ecosystem (**Java**, **Scala**, and **Kotlin**) at **scale** by extending **Apache Spark** natively.
 
 ## Project's website
 
@@ -143,42 +143,34 @@ documentation and examples
 - BERT Sentence Embeddings (TF Hub & HuggingFace models)
 - RoBerta Sentence Embeddings (HuggingFace models)
 - XLM-RoBerta Sentence Embeddings (HuggingFace models)
--
+- INSTRUCTOR Embeddings (HuggingFace models)
 - E5 Embeddings (HuggingFace models)
 - MPNet Embeddings (HuggingFace models)
 - OpenAI Embeddings
-- Sentence Embeddings
-- Chunk Embeddings
+- Sentence & Chunk Embeddings
 - Unsupervised keywords extraction
 - Language Detection & Identification (up to 375 languages)
-- Multi-class Sentiment analysis (Deep learning)
-- Multi-label Sentiment analysis (Deep learning)
+- Multi-class & Multi-label Sentiment analysis (Deep learning)
 - Multi-class Text Classification (Deep learning)
-- BERT for Token & Sequence Classification
-- DistilBERT for Token & Sequence Classification
-- CamemBERT for Token & Sequence Classification
-- ALBERT for Token & Sequence Classification
-- RoBERTa for Token & Sequence Classification
-- DeBERTa for Token & Sequence Classification
-- XLM-RoBERTa for Token & Sequence Classification
+- BERT for Token & Sequence Classification & Question Answering
+- DistilBERT for Token & Sequence Classification & Question Answering
+- CamemBERT for Token & Sequence Classification & Question Answering
+- ALBERT for Token & Sequence Classification & Question Answering
+- RoBERTa for Token & Sequence Classification & Question Answering
+- DeBERTa for Token & Sequence Classification & Question Answering
+- XLM-RoBERTa for Token & Sequence Classification & Question Answering
+- Longformer for Token & Sequence Classification & Question Answering
+- MPNet for Token & Sequence Classification & Question Answering
 - XLNet for Token & Sequence Classification
-- Longformer for Token & Sequence Classification
-- BERT for Token & Sequence Classification
-- BERT for Question Answering
-- CamemBERT for Question Answering
-- DistilBERT for Question Answering
-- ALBERT for Question Answering
-- RoBERTa for Question Answering
-- DeBERTa for Question Answering
-- XLM-RoBERTa for Question Answering
-- Longformer for Question Answering
-- Table Question Answering (TAPAS)
 - Zero-Shot NER Model
 - Zero-Shot Text Classification by Transformers (ZSL)
 - Neural Machine Translation (MarianMT)
+- Many-to-Many multilingual translation model (Facebook M2M100)
+- Table Question Answering (TAPAS)
 - Text-To-Text Transfer Transformer (Google T5)
 - Generative Pre-trained Transformer 2 (OpenAI GPT2)
 - Seq2Seq for NLG, Translation, and Comprehension (Facebook BART)
+- Chat and Conversational LLMs (Facebook Llama-2)
 - Vision Transformer (Google ViT)
 - Swin Image Classification (Microsoft Swin Transformer)
 - ConvNext Image Classification (Facebook ConvNext)
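Among the feature-list additions in the hunk above, the M2M100 many-to-many translation model and Llama-2 are new in 5.3.0. A hedged sketch of the M2M100 annotator, with the class name taken from the new seq2seq/m2m100_transformer.py module and the setter names assumed from Spark NLP's seq2seq conventions (verify against the 5.3.0 API reference):

```python
# Hedged sketch: many-to-many translation with the new M2M100 annotator.
from pyspark.ml import Pipeline
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import M2M100Transformer

spark = sparknlp.start()

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("documents")

# Source and target languages are set explicitly for M2M100-style models.
m2m100 = M2M100Transformer.pretrained() \
    .setInputCols(["documents"]) \
    .setSrcLang("fr") \
    .setTgtLang("en") \
    .setOutputCol("generation")

pipeline = Pipeline(stages=[document_assembler, m2m100])
data = spark.createDataFrame([["La vie est belle."]]).toDF("text")
pipeline.fit(data).transform(data).select("generation.result").show(truncate=False)
```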
@@ -191,7 +183,7 @@ documentation and examples
 - Easy ONNX and TensorFlow integrations
 - GPU Support
 - Full integration with Spark ML functions
-- +
+- +30000 pre-trained models in +200 languages!
 - +6000 pre-trained pipelines in +200 languages!
 - Multi-lingual NER models: Arabic, Bengali, Chinese, Danish, Dutch, English, Finnish, French, German, Hebrew, Italian,
   Japanese, Korean, Norwegian, Persian, Polish, Portuguese, Russian, Spanish, Swedish, Urdu, and more.
@@ -205,7 +197,7 @@ To use Spark NLP you need the following requirements:
 
 **GPU (optional):**
 
-Spark NLP 5.
+Spark NLP 5.3.0 is built with ONNX 1.17.0 and TensorFlow 2.7.1 deep learning engines. The following minimum NVIDIA® software is required for GPU support:
 
 - NVIDIA® GPU drivers version 450.80.02 or higher
 - CUDA® Toolkit 11.2
@@ -221,7 +213,7 @@ $ java -version
 $ conda create -n sparknlp python=3.7 -y
 $ conda activate sparknlp
 # spark-nlp by default is based on pyspark 3.x
-$ pip install spark-nlp==5.
+$ pip install spark-nlp==5.3.0 pyspark==3.3.1
 ```
 
 In Python console or Jupyter `Python3` kernel:
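The quick-start hunk above stops at the README's prompt line; the unchanged lines that follow it start a session the standard way. Shown here for convenience (standard Spark NLP usage, not part of this diff):

```python
# The standard quick-start the elided README lines continue with:
import sparknlp

spark = sparknlp.start()   # starts Spark with the matching spark-nlp package
print(sparknlp.version())  # e.g. '5.3.0'
print(spark.version)
```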
@@ -266,11 +258,12 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh
 
 ## Apache Spark Support
 
-Spark NLP *5.
+Spark NLP *5.3.0* has been built on top of Apache Spark 3.4 while fully supporting Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
 
 | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
 |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
-| 5.
+| 5.3.x | YES | YES | YES | YES | YES | YES | NO | NO |
+| 5.2.x | YES | YES | YES | YES | YES | YES | NO | NO |
 | 5.1.x | Partially | YES | YES | YES | YES | YES | NO | NO |
 | 5.0.x | YES | YES | YES | YES | YES | YES | NO | NO |
 | 4.4.x | YES | YES | YES | YES | YES | YES | NO | NO |
@@ -291,6 +284,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github
 
 | Spark NLP | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10| Scala 2.11 | Scala 2.12 |
 |-----------|------------|------------|------------|------------|------------|------------|------------|
+| 5.3.x | NO | YES | YES | YES | YES | NO | YES |
 | 5.2.x | NO | YES | YES | YES | YES | NO | YES |
 | 5.1.x | NO | YES | YES | YES | YES | NO | YES |
 | 5.0.x | NO | YES | YES | YES | YES | NO | YES |
@@ -308,7 +302,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github
 
 ## Databricks Support
 
-Spark NLP 5.2.2 has been tested and is compatible with the following runtimes:
+Spark NLP 5.3.0 has been tested and is compatible with the following runtimes:
 
 **CPU:**
 
@@ -350,6 +344,10 @@ Spark NLP 5.2.2 has been tested and is compatible with the following runtimes:
 - 14.0 ML
 - 14.1
 - 14.1 ML
+- 14.2
+- 14.2 ML
+- 14.3
+- 14.3 ML
 
 **GPU:**
 
@@ -372,10 +370,12 @@ Spark NLP 5.2.2 has been tested and is compatible with the following runtimes:
 - 13.3 ML & GPU
 - 14.0 ML & GPU
 - 14.1 ML & GPU
+- 14.2 ML & GPU
+- 14.3 ML & GPU
 
 ## EMR Support
 
-Spark NLP 5.2.2 has been tested and is compatible with the following EMR releases:
+Spark NLP 5.3.0 has been tested and is compatible with the following EMR releases:
 
 - emr-6.2.0
 - emr-6.3.0
@@ -391,8 +391,11 @@ Spark NLP 5.2.2 has been tested and is compatible with the following EMR release
 - emr-6.12.0
 - emr-6.13.0
 - emr-6.14.0
+- emr-6.15.0
+- emr-7.0.0
 
 Full list of [Amazon EMR 6.x releases](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-release-6x.html)
+Full list of [Amazon EMR 7.x releases](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-release-7x.html)
 
 NOTE: The EMR 6.1.0 and 6.1.1 are not supported.
 
@@ -422,11 +425,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x,
 ```sh
 # CPU
 
-spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2
+spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0
 
-pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2
+pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0
 
-spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2
+spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0
 ```
 
 The `spark-nlp` has been published to
@@ -435,11 +438,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s
 ```sh
 # GPU
 
-spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.2
+spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.3.0
 
-pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.2
+pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.3.0
 
-spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.2
+spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.3.0
 
 ```
 
@@ -449,11 +452,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s
 ```sh
 # AArch64
 
-spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.2
+spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.3.0
 
-pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.2
+pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.3.0
 
-spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.2
+spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.3.0
 
 ```
 
@@ -463,11 +466,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s
 ```sh
 # M1/M2 (Apple Silicon)
 
-spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.2
+spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.3.0
 
-pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.2
+pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.3.0
 
-spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.2
+spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.3.0
 
 ```
 
@@ -481,7 +484,7 @@ set in your SparkSession:
 spark-shell \
   --driver-memory 16g \
   --conf spark.kryoserializer.buffer.max=2000M \
-  --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2
+  --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0
 ```
 
 ## Scala
@@ -499,7 +502,7 @@ coordinates:
 <dependency>
     <groupId>com.johnsnowlabs.nlp</groupId>
     <artifactId>spark-nlp_2.12</artifactId>
-    <version>5.2.2</version>
+    <version>5.3.0</version>
 </dependency>
 ```
 
@@ -510,7 +513,7 @@ coordinates:
 <dependency>
     <groupId>com.johnsnowlabs.nlp</groupId>
     <artifactId>spark-nlp-gpu_2.12</artifactId>
-    <version>5.2.2</version>
+    <version>5.3.0</version>
 </dependency>
 ```
 
@@ -521,7 +524,7 @@ coordinates:
 <dependency>
     <groupId>com.johnsnowlabs.nlp</groupId>
     <artifactId>spark-nlp-aarch64_2.12</artifactId>
-    <version>5.2.2</version>
+    <version>5.3.0</version>
 </dependency>
 ```
 
@@ -532,7 +535,7 @@ coordinates:
 <dependency>
     <groupId>com.johnsnowlabs.nlp</groupId>
     <artifactId>spark-nlp-silicon_2.12</artifactId>
-    <version>5.2.2</version>
+    <version>5.3.0</version>
 </dependency>
 ```
 
@@ -542,28 +545,28 @@ coordinates:
 
 ```sbtshell
 // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp
-libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.2.2"
+libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.3.0"
 ```
 
 **spark-nlp-gpu:**
 
 ```sbtshell
 // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu
-libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.2.2"
+libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.3.0"
 ```
 
 **spark-nlp-aarch64:**
 
 ```sbtshell
 // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64
-libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.2.2"
+libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.3.0"
 ```
 
 **spark-nlp-silicon:**
 
 ```sbtshell
 // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon
-libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.2.2"
+libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.3.0"
 ```
 
 Maven
@@ -585,7 +588,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through
 Pip:
 
 ```bash
-pip install spark-nlp==5.2.2
+pip install spark-nlp==5.3.0
 ```
 
 Conda:
@@ -614,7 +617,7 @@ spark = SparkSession.builder
     .config("spark.driver.memory", "16G")
     .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
-    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2")
+    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0")
     .getOrCreate()
 ```
 
@@ -685,7 +688,7 @@ Use either one of the following options
 - Add the following Maven Coordinates to the interpreter's library list
 
 ```bash
-com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2
+com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0
 ```
 
 - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is
@@ -696,7 +699,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2
 Apart from the previous step, install the python module through pip
 
 ```bash
-pip install spark-nlp==5.2.2
+pip install spark-nlp==5.3.0
 ```
 
 Or you can install `spark-nlp` from inside Zeppelin by using Conda:
@@ -724,7 +727,7 @@ launch the Jupyter from the same Python environment:
 $ conda create -n sparknlp python=3.8 -y
 $ conda activate sparknlp
 # spark-nlp by default is based on pyspark 3.x
-$ pip install spark-nlp==5.
+$ pip install spark-nlp==5.3.0 pyspark==3.3.1 jupyter
 $ jupyter notebook
 ```
 
@@ -741,7 +744,7 @@ export PYSPARK_PYTHON=python3
 export PYSPARK_DRIVER_PYTHON=jupyter
 export PYSPARK_DRIVER_PYTHON_OPTS=notebook
 
-pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2
+pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0
 ```
 
 Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp`
@@ -768,7 +771,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi
 # -s is for spark-nlp
 # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage
 # by default they are set to the latest
-!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.2
+!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.3.0
 ```
 
 [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb)
@@ -791,7 +794,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi
 # -s is for spark-nlp
 # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage
 # by default they are set to the latest
-!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.2
+!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.3.0
 ```
 
 [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live
@@ -810,9 +813,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP
 
 3. In `Libraries` tab inside your cluster you need to follow these steps:
 
-    3.1. Install New -> PyPI -> `spark-nlp==5.2.2` -> Install
+    3.1. Install New -> PyPI -> `spark-nlp==5.3.0` -> Install
 
-    3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2` -> Install
+    3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0` -> Install
 
 4. Now you can attach your notebook to the cluster and use Spark NLP!
 
@@ -863,7 +866,7 @@ A sample of your software configuration in JSON on S3 (must be public access):
         "spark.kryoserializer.buffer.max": "2000M",
         "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
         "spark.driver.maxResultSize": "0",
-        "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2"
+        "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0"
     }
 }]
 ```
@@ -872,7 +875,7 @@ A sample of AWS CLI to launch EMR cluster:
 
 ```.sh
 aws emr create-cluster \
---name "Spark NLP 5.2.2" \
+--name "Spark NLP 5.3.0" \
 --release-label emr-6.2.0 \
 --applications Name=Hadoop Name=Spark Name=Hive \
 --instance-type m4.4xlarge \
@@ -936,7 +939,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \
   --enable-component-gateway \
   --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \
   --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \
-  --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2
+  --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0
 ```
 
 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI.
@@ -947,16 +950,20 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \
 
 You can change the following Spark NLP configurations via Spark Configuration:
 
-| Property Name
-|
-| `spark.jsl.settings.pretrained.cache_folder`
-| `spark.jsl.settings.storage.cluster_tmp_dir`
-| `spark.jsl.settings.annotator.log_folder`
-| `spark.jsl.settings.aws.credentials.access_key_id`
-| `spark.jsl.settings.aws.credentials.secret_access_key`
-| `spark.jsl.settings.aws.credentials.session_token`
-| `spark.jsl.settings.aws.s3_bucket`
-| `spark.jsl.settings.aws.region`
+| Property Name | Default | Meaning |
+|---------------|---------|---------|
+| `spark.jsl.settings.pretrained.cache_folder` | `~/cache_pretrained` | The location to download and extract pretrained `Models` and `Pipelines`. By default, it will be in the user's home directory under `cache_pretrained` |
+| `spark.jsl.settings.storage.cluster_tmp_dir` | `hadoop.tmp.dir` | The location to use on a cluster for temporary files such as unpacking indexes for WordEmbeddings. By default, this is the location of `hadoop.tmp.dir` set via Hadoop configuration for Apache Spark. NOTE: `S3` is not supported and it must be local, HDFS, or DBFS |
+| `spark.jsl.settings.annotator.log_folder` | `~/annotator_logs` | The location to save logs from annotators during training such as `NerDLApproach`, `ClassifierDLApproach`, `SentimentDLApproach`, `MultiClassifierDLApproach`, etc. By default, it will be in the user's home directory under `annotator_logs` |
+| `spark.jsl.settings.aws.credentials.access_key_id` | `None` | Your AWS access key to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` |
+| `spark.jsl.settings.aws.credentials.secret_access_key` | `None` | Your AWS secret access key to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` |
+| `spark.jsl.settings.aws.credentials.session_token` | `None` | Your AWS MFA session token to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` |
+| `spark.jsl.settings.aws.s3_bucket` | `None` | Your AWS S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` |
+| `spark.jsl.settings.aws.region` | `None` | Your AWS region to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` |
+| `spark.jsl.settings.onnx.gpuDeviceId` | `0` | Constructs CUDA execution provider options for the specified non-negative device id. |
+| `spark.jsl.settings.onnx.intraOpNumThreads` | `6` | Sets the size of the CPU thread pool used for executing a single graph, if executing on a CPU. |
+| `spark.jsl.settings.onnx.optimizationLevel` | `ALL_OPT` | Sets the optimization level of this options object, overriding the old setting. |
+| `spark.jsl.settings.onnx.executionMode` | `SEQUENTIAL` | Sets the execution mode of this options object, overriding the old setting. |
 
 ### How to set Spark NLP Configuration
 
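The four `spark.jsl.settings.onnx.*` properties are new in this release. A minimal sketch of overriding them at session start, using only the property names from the table above (the values shown are the documented defaults):

```python
# Hedged sketch: tuning the new ONNX runtime settings via SparkSession config.
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark NLP") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0") \
    .config("spark.jsl.settings.onnx.gpuDeviceId", "0") \
    .config("spark.jsl.settings.onnx.intraOpNumThreads", "6") \
    .config("spark.jsl.settings.onnx.optimizationLevel", "ALL_OPT") \
    .config("spark.jsl.settings.onnx.executionMode", "SEQUENTIAL") \
    .getOrCreate()
```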
@@ -975,7 +982,7 @@ spark = SparkSession.builder
     .config("spark.kryoserializer.buffer.max", "2000m")
     .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained")
     .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage")
-    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2")
+    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0")
     .getOrCreate()
 ```
 
@@ -989,7 +996,7 @@ spark-shell \
   --conf spark.kryoserializer.buffer.max=2000M \
   --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \
   --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \
-  --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2
+  --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0
 ```
 
 **pyspark:**
@@ -1002,7 +1009,7 @@ pyspark \
   --conf spark.kryoserializer.buffer.max=2000M \
   --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \
   --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \
-  --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.2
+  --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0
 ```
 
 **Databricks:**
@@ -1274,7 +1281,7 @@ spark = SparkSession.builder
     .config("spark.driver.memory", "16G")
     .config("spark.driver.maxResultSize", "0")
     .config("spark.kryoserializer.buffer.max", "2000M")
-    .config("spark.jars", "/tmp/spark-nlp-assembly-5.2.2.jar")
+    .config("spark.jars", "/tmp/spark-nlp-assembly-5.3.0.jar")
     .getOrCreate()
 ```
 
@@ -1283,7 +1290,7 @@ spark = SparkSession.builder
 version (3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x)
 - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need
   to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (
-  i.e., `hdfs:///tmp/spark-nlp-assembly-5.2.2.jar`)
+  i.e., `hdfs:///tmp/spark-nlp-assembly-5.3.0.jar`)
 
 Example of using pretrained Models and Pipelines in offline:
 
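The final hunk ends just before the README's offline example. For context, the offline pattern the README goes on to show loads a downloaded model folder by path instead of calling `pretrained()` — a hedged sketch with an illustrative path (not taken from this diff):

```python
# Hedged sketch of offline loading: download a model from the Models Hub,
# put the extracted folder on HDFS/DBFS, then load it by path.
from sparknlp.annotator import PerceptronModel

pos = PerceptronModel.load("hdfs:///models/pos_anc_en/") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("pos")
```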