spark-nlp 6.2.2__tar.gz → 6.2.2.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/PKG-INFO +5 -5
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/README.md +4 -4
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/setup.py +1 -1
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/spark_nlp.egg-info/PKG-INFO +5 -5
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/__init__.py +11 -6
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ner/ner_dl.py +0 -5
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ner/ner_dl_graph_checker.py +15 -71
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/partition/partition_properties.py +6 -146
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/reader/reader2doc.py +1 -18
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/reader/reader2table.py +1 -2
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/com/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/com/johnsnowlabs/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/com/johnsnowlabs/ml/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/com/johnsnowlabs/ml/ai/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/com/johnsnowlabs/nlp/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/setup.cfg +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/spark_nlp.egg-info/SOURCES.txt +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/spark_nlp.egg-info/dependency_links.txt +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/spark_nlp.egg-info/top_level.txt +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotation.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotation_audio.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotation_image.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/audio/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/audio/hubert_for_ctc.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/audio/wav2vec2_for_ctc.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/audio/whisper_for_ctc.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/chunk2_doc.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/chunker.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/albert_for_question_answering.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/albert_for_token_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/bert_for_question_answering.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/bert_for_token_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/classifier_dl.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/multi_classifier_dl.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/sentiment_dl.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cleaners/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cleaners/cleaner.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cleaners/extractor.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/coref/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/coref/spanbert_coref.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/blip_for_question_answering.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/clip_for_zero_shot_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/convnext_for_image_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/florence2_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/gemma3_for_multimodal.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/internvl_for_multimodal.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/janus_for_multimodal.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/llava_for_multimodal.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/mllama_for_multimodal.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/paligemma_for_multimodal.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/phi3_vision_for_multimodal.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/qwen2vl_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/smolvlm_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/swin_for_image_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/cv/vit_for_image_classification.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/dataframe_optimizer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/date2_chunk.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/dependency/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/dependency/dependency_parser.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/dependency/typed_dependency_parser.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/document_character_text_splitter.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/document_normalizer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/document_token_splitter.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/document_token_splitter_test.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/albert_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/auto_gguf_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/bert_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/bert_sentence_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/bge_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/camembert_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/chunk_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/deberta_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/distil_bert_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/doc2vec.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/e5_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/e5v_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/elmo_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/instructor_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/longformer_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/minilm_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/mpnet_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/mxbai_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/nomic_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/roberta_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/sentence_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/snowflake_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/uae_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/universal_sentence_encoder.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/word2vec.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/word_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/embeddings/xlnet_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/er/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/er/entity_ruler.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/graph_extraction.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/keyword_extraction/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ld_dl/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ld_dl/language_detector_dl.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/lemmatizer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/matcher/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/matcher/big_text_matcher.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/matcher/date_matcher.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/matcher/multi_date_matcher.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/matcher/regex_matcher.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/matcher/text_matcher.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/n_gram_generator.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ner/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ner/ner_approach.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ner/ner_converter.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ner/ner_crf.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ner/ner_overwriter.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ner/zero_shot_ner_model.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/normalizer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/openai/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/openai/openai_completion.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/openai/openai_embeddings.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/param/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/param/classifier_encoder.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/param/evaluation_dl_params.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/pos/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/pos/perceptron.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/sentence/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/sentence/sentence_detector.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/sentence/sentence_detector_dl.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/sentiment/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/sentiment/sentiment_detector.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/sentiment/vivekn_sentiment.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/auto_gguf_model.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/auto_gguf_reranker.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/bart_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/cohere_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/cpm_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/gpt2_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/llama2_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/llama3_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/m2m100_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/marian_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/mistral_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/nllb_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/olmo_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/phi2_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/phi3_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/phi4_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/qwen_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/starcoder_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/seq2seq/t5_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/similarity/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/similarity/document_similarity_ranker.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/spell_check/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/spell_check/context_spell_checker.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/spell_check/norvig_sweeting.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/spell_check/symmetric_delete.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/stemmer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/stop_words_cleaner.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/tf_ner_dl_graph_builder.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/token/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/token/chunk_tokenizer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/token/recursive_tokenizer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/token/regex_tokenizer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/token/tokenizer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/token2_chunk.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ws/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/annotator/ws/word_segmenter.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/audio_assembler.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/doc2_chunk.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/document_assembler.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/embeddings_finisher.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/finisher.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/gguf_ranking_finisher.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/graph_finisher.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/has_recursive_fit.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/has_recursive_transform.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/image_assembler.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/light_pipeline.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/multi_document_assembler.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/prompt_assembler.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/recursive_pipeline.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/table_assembler.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/base/token_assembler.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/annotator_approach.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/annotator_model.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/annotator_properties.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/annotator_type.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/completion_post_processing.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/coverage_result.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/match_strategy.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/properties.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/read_as.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/recursive_annotator_approach.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/storage.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/common/utils.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/functions.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/internal/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/internal/annotator_java_ml.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/internal/annotator_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/internal/extended_java_wrapper.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/internal/params_getters_setters.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/internal/recursive.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/logging/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/logging/comet.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/partition/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/partition/partition.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/partition/partition_transformer.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/pretrained/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/pretrained/pretrained_pipeline.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/pretrained/resource_downloader.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/pretrained/utils.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/reader/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/reader/enums.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/reader/pdf_to_text.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/reader/reader2image.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/reader/reader_assembler.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/reader/sparknlp_reader.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/graph_builders.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders_1x/graph_builders.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/conll.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/conllu.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/pos.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/pub_tator.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/spacy_to_annotation.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/training/tfgraphs.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/upload_to_hub.py +0 -0
- {spark_nlp-6.2.2 → spark_nlp-6.2.2.dev2}/sparknlp/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: spark-nlp
|
|
3
|
-
Version: 6.2.2
|
|
3
|
+
Version: 6.2.2.dev2
|
|
4
4
|
Summary: John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment.
|
|
5
5
|
Home-page: https://github.com/JohnSnowLabs/spark-nlp
|
|
6
6
|
Author: John Snow Labs
|
|
@@ -102,7 +102,7 @@ $ java -version
|
|
|
102
102
|
$ conda create -n sparknlp python=3.7 -y
|
|
103
103
|
$ conda activate sparknlp
|
|
104
104
|
# spark-nlp by default is based on pyspark 3.x
|
|
105
|
-
$ pip install spark-nlp==6.2.
|
|
105
|
+
$ pip install spark-nlp==6.2.0 pyspark==3.3.1
|
|
106
106
|
```
|
|
107
107
|
|
|
108
108
|
In Python console or Jupyter `Python3` kernel:
|
|
@@ -168,7 +168,7 @@ For a quick example of using pipelines and models take a look at our official [d
|
|
|
168
168
|
|
|
169
169
|
### Apache Spark Support
|
|
170
170
|
|
|
171
|
-
Spark NLP *6.2.
|
|
171
|
+
Spark NLP *6.2.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
|
|
172
172
|
|
|
173
173
|
| Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
|
|
174
174
|
|-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
|
|
@@ -198,7 +198,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
|
|
|
198
198
|
|
|
199
199
|
### Databricks Support
|
|
200
200
|
|
|
201
|
-
Spark NLP 6.2.
|
|
201
|
+
Spark NLP 6.2.0 has been tested and is compatible with the following runtimes:
|
|
202
202
|
|
|
203
203
|
| **CPU** | **GPU** |
|
|
204
204
|
|--------------------|--------------------|
|
|
@@ -216,7 +216,7 @@ We are compatible with older runtimes. For a full list check databricks support
|
|
|
216
216
|
|
|
217
217
|
### EMR Support
|
|
218
218
|
|
|
219
|
-
Spark NLP 6.2.
|
|
219
|
+
Spark NLP 6.2.0 has been tested and is compatible with the following EMR releases:
|
|
220
220
|
|
|
221
221
|
| **EMR Release** |
|
|
222
222
|
|--------------------|
|
|
@@ -63,7 +63,7 @@ $ java -version
|
|
|
63
63
|
$ conda create -n sparknlp python=3.7 -y
|
|
64
64
|
$ conda activate sparknlp
|
|
65
65
|
# spark-nlp by default is based on pyspark 3.x
|
|
66
|
-
$ pip install spark-nlp==6.2.
|
|
66
|
+
$ pip install spark-nlp==6.2.0 pyspark==3.3.1
|
|
67
67
|
```
|
|
68
68
|
|
|
69
69
|
In Python console or Jupyter `Python3` kernel:
|
|
@@ -129,7 +129,7 @@ For a quick example of using pipelines and models take a look at our official [d
|
|
|
129
129
|
|
|
130
130
|
### Apache Spark Support
|
|
131
131
|
|
|
132
|
-
Spark NLP *6.2.
|
|
132
|
+
Spark NLP *6.2.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
|
|
133
133
|
|
|
134
134
|
| Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
|
|
135
135
|
|-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
|
|
@@ -159,7 +159,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
|
|
|
159
159
|
|
|
160
160
|
### Databricks Support
|
|
161
161
|
|
|
162
|
-
Spark NLP 6.2.
|
|
162
|
+
Spark NLP 6.2.0 has been tested and is compatible with the following runtimes:
|
|
163
163
|
|
|
164
164
|
| **CPU** | **GPU** |
|
|
165
165
|
|--------------------|--------------------|
|
|
@@ -177,7 +177,7 @@ We are compatible with older runtimes. For a full list check databricks support
|
|
|
177
177
|
|
|
178
178
|
### EMR Support
|
|
179
179
|
|
|
180
|
-
Spark NLP 6.2.
|
|
180
|
+
Spark NLP 6.2.0 has been tested and is compatible with the following EMR releases:
|
|
181
181
|
|
|
182
182
|
| **EMR Release** |
|
|
183
183
|
|--------------------|
|
|
@@ -41,7 +41,7 @@ setup(
|
|
|
41
41
|
# project code, see
|
|
42
42
|
# https://packaging.python.org/en/latest/single_source_version.html
|
|
43
43
|
|
|
44
|
-
version='6.2.
|
|
44
|
+
version='6.2.2dev2', # Required
|
|
45
45
|
|
|
46
46
|
# This is a one-line description or tagline of what your project does. This
|
|
47
47
|
# corresponds to the 'Summary' metadata field:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: spark-nlp
|
|
3
|
-
Version: 6.2.2
|
|
3
|
+
Version: 6.2.2.dev2
|
|
4
4
|
Summary: John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment.
|
|
5
5
|
Home-page: https://github.com/JohnSnowLabs/spark-nlp
|
|
6
6
|
Author: John Snow Labs
|
|
@@ -102,7 +102,7 @@ $ java -version
|
|
|
102
102
|
$ conda create -n sparknlp python=3.7 -y
|
|
103
103
|
$ conda activate sparknlp
|
|
104
104
|
# spark-nlp by default is based on pyspark 3.x
|
|
105
|
-
$ pip install spark-nlp==6.2.
|
|
105
|
+
$ pip install spark-nlp==6.2.0 pyspark==3.3.1
|
|
106
106
|
```
|
|
107
107
|
|
|
108
108
|
In Python console or Jupyter `Python3` kernel:
|
|
@@ -168,7 +168,7 @@ For a quick example of using pipelines and models take a look at our official [d
|
|
|
168
168
|
|
|
169
169
|
### Apache Spark Support
|
|
170
170
|
|
|
171
|
-
Spark NLP *6.2.
|
|
171
|
+
Spark NLP *6.2.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
|
|
172
172
|
|
|
173
173
|
| Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
|
|
174
174
|
|-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
|
|
@@ -198,7 +198,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
|
|
|
198
198
|
|
|
199
199
|
### Databricks Support
|
|
200
200
|
|
|
201
|
-
Spark NLP 6.2.
|
|
201
|
+
Spark NLP 6.2.0 has been tested and is compatible with the following runtimes:
|
|
202
202
|
|
|
203
203
|
| **CPU** | **GPU** |
|
|
204
204
|
|--------------------|--------------------|
|
|
@@ -216,7 +216,7 @@ We are compatible with older runtimes. For a full list check databricks support
|
|
|
216
216
|
|
|
217
217
|
### EMR Support
|
|
218
218
|
|
|
219
|
-
Spark NLP 6.2.
|
|
219
|
+
Spark NLP 6.2.0 has been tested and is compatible with the following EMR releases:
|
|
220
220
|
|
|
221
221
|
| **EMR Release** |
|
|
222
222
|
|--------------------|
|
|
@@ -66,7 +66,7 @@ sys.modules['com.johnsnowlabs.ml.ai'] = annotator
|
|
|
66
66
|
annotators = annotator
|
|
67
67
|
embeddings = annotator
|
|
68
68
|
|
|
69
|
-
__version__ = "6.2.2"
|
|
69
|
+
__version__ = "6.2.2-dev2"
|
|
70
70
|
|
|
71
71
|
|
|
72
72
|
def start(gpu=False,
|
|
@@ -78,7 +78,8 @@ def start(gpu=False,
|
|
|
78
78
|
cluster_tmp_dir="",
|
|
79
79
|
params=None,
|
|
80
80
|
real_time_output=False,
|
|
81
|
-
output_level=1
|
|
81
|
+
output_level=1,
|
|
82
|
+
scala213=False):
|
|
82
83
|
"""Starts a PySpark instance with default parameters for Spark NLP.
|
|
83
84
|
|
|
84
85
|
The default parameters would result in the equivalent of:
|
|
@@ -122,6 +123,8 @@ def start(gpu=False,
|
|
|
122
123
|
Whether to read and print JVM output in real time, by default False
|
|
123
124
|
output_level : int, optional
|
|
124
125
|
Output level for logs, by default 1
|
|
126
|
+
scala213 : bool, optional
|
|
127
|
+
Whether to use Scala 2.13 build of Spark NLP, by default False (Scala 2.12)
|
|
125
128
|
|
|
126
129
|
Notes
|
|
127
130
|
-----
|
|
@@ -159,12 +162,13 @@ def start(gpu=False,
|
|
|
159
162
|
self.serializer, self.serializer_max_buffer = "org.apache.spark.serializer.KryoSerializer", "2000M"
|
|
160
163
|
self.driver_max_result_size = "0"
|
|
161
164
|
# Spark NLP on CPU or GPU
|
|
162
|
-
|
|
163
|
-
self.
|
|
165
|
+
scala_version = "2.13" if scala213 else "2.12"
|
|
166
|
+
self.maven_spark3 = f"com.johnsnowlabs.nlp:spark-nlp_{scala_version}:{current_version}"
|
|
167
|
+
self.maven_gpu_spark3 = f"com.johnsnowlabs.nlp:spark-nlp-gpu_{scala_version}:{current_version}"
|
|
164
168
|
# Spark NLP on Apple Silicon
|
|
165
|
-
self.maven_silicon = "com.johnsnowlabs.nlp:spark-nlp-
|
|
169
|
+
self.maven_silicon = f"com.johnsnowlabs.nlp:spark-nlp-silicon_{scala_version}:{current_version}"
|
|
166
170
|
# Spark NLP on Linux Aarch64
|
|
167
|
-
self.maven_aarch64 = "com.johnsnowlabs.nlp:spark-nlp-
|
|
171
|
+
self.maven_aarch64 = f"com.johnsnowlabs.nlp:spark-nlp-aarch64_{scala_version}:{current_version}"
|
|
168
172
|
|
|
169
173
|
def start_without_realtime_output():
|
|
170
174
|
builder = SparkSession.builder \
|
|
@@ -318,4 +322,5 @@ def version():
|
|
|
318
322
|
str
|
|
319
323
|
The current Spark NLP version.
|
|
320
324
|
"""
|
|
325
|
+
|
|
321
326
|
return __version__
|
|
@@ -41,11 +41,6 @@ class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams):
|
|
|
41
41
|
- a WordEmbeddingsModel (any embeddings can be chosen, e.g. BertEmbeddings
|
|
42
42
|
for BERT based embeddings).
|
|
43
43
|
|
|
44
|
-
By default, collects all data points into memory for training. For larger datasets, use
|
|
45
|
-
``setEnableMemoryOptimizer(true)``. This will optimize memory usage during training at the cost
|
|
46
|
-
of speed. Note that this annotator will use as much memory as the largest partition of the
|
|
47
|
-
input dataset, so we recommend repartitioning to batch sizes.
|
|
48
|
-
|
|
49
44
|
Setting a test dataset to monitor model metrics can be done with
|
|
50
45
|
``.setTestDataset``. The method expects a path to a parquet file containing a
|
|
51
46
|
dataframe that has the same required columns as the training dataframe. The
|
|
@@ -13,10 +13,10 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
"""Contains classes for NerDL."""
|
|
15
15
|
|
|
16
|
-
from pyspark.ml.util import JavaMLReadable
|
|
17
|
-
|
|
18
|
-
import sparknlp.internal as _internal
|
|
19
16
|
from sparknlp.common import *
|
|
17
|
+
import sparknlp.internal as _internal
|
|
18
|
+
from pyspark.ml.util import JavaMLWritable
|
|
19
|
+
from pyspark.ml.wrapper import JavaEstimator
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class NerDLGraphChecker(
|
|
@@ -28,9 +28,6 @@ class NerDLGraphChecker(
|
|
|
28
28
|
computations/training is done. This annotator is useful for custom training cases, where
|
|
29
29
|
specialized graphs are needed.
|
|
30
30
|
|
|
31
|
-
This annotator will fill graph hyperparameters as metadata in the label column, which will be
|
|
32
|
-
available for NerDLApproach, saving computations.
|
|
33
|
-
|
|
34
31
|
Important: This annotator should be used or positioned before any embedding or NerDLApproach
|
|
35
32
|
annotators in the pipeline and will process the whole dataset to extract the required graph parameters.
|
|
36
33
|
|
|
@@ -205,18 +202,17 @@ class NerDLGraphChecker(
|
|
|
205
202
|
# self._setDefault()
|
|
206
203
|
|
|
207
204
|
def _create_model(self, java_model):
|
|
208
|
-
return NerDLGraphCheckerModel(
|
|
205
|
+
return NerDLGraphCheckerModel()
|
|
209
206
|
|
|
210
207
|
|
|
211
208
|
class NerDLGraphCheckerModel(
|
|
212
209
|
JavaModel,
|
|
213
210
|
JavaMLWritable,
|
|
214
|
-
JavaMLReadable,
|
|
215
211
|
_internal.ParamsGettersSetters,
|
|
216
212
|
):
|
|
217
|
-
"""
|
|
218
|
-
|
|
219
|
-
checks
|
|
213
|
+
"""
|
|
214
|
+
Resulting model from NerDLGraphChecker, that does not perform any transformations, as the
|
|
215
|
+
checks are done during the ``fit`` phase. It acts as the identity.
|
|
220
216
|
|
|
221
217
|
This annotator should never be used directly.
|
|
222
218
|
"""
|
|
@@ -228,66 +224,14 @@ class NerDLGraphCheckerModel(
|
|
|
228
224
|
|
|
229
225
|
@keyword_only
|
|
230
226
|
def __init__(
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
227
|
+
self,
|
|
228
|
+
classname="com.johnsnowlabs.nlp.annotators.ner.dl.NerDLGraphCheckerModel",
|
|
229
|
+
java_model=None,
|
|
234
230
|
):
|
|
235
|
-
|
|
236
|
-
|
|
231
|
+
super(NerDLGraphCheckerModel, self).__init__(java_model=java_model)
|
|
232
|
+
if classname and not java_model:
|
|
233
|
+
self.__class__._java_class_name = classname
|
|
234
|
+
self._java_obj = self._new_java_obj(classname, self.uid)
|
|
237
235
|
if java_model is not None:
|
|
238
|
-
super(NerDLGraphCheckerModel, self).__init__(java_model=java_model)
|
|
239
|
-
self._java_obj = java_model
|
|
240
236
|
self._transfer_params_from_java()
|
|
241
|
-
|
|
242
|
-
super(NerDLGraphCheckerModel, self).__init__()
|
|
243
|
-
self.__class__._java_class_name = classname
|
|
244
|
-
self._java_obj = self._new_java_obj(classname)
|
|
245
|
-
|
|
246
|
-
# Metadata keys for graph parameters
|
|
247
|
-
graphParamsMetadataKey = "NerDLGraphCheckerParams"
|
|
248
|
-
embeddingsDimKey = "embeddingsDim"
|
|
249
|
-
labelsKey = "labels"
|
|
250
|
-
charsKey = "chars"
|
|
251
|
-
dsLenKey = "dsLen"
|
|
252
|
-
|
|
253
|
-
labelColumn = Param(
|
|
254
|
-
Params._dummy(),
|
|
255
|
-
"labelColumn",
|
|
256
|
-
"Column with label per each token",
|
|
257
|
-
typeConverter=TypeConverters.toString,
|
|
258
|
-
)
|
|
259
|
-
|
|
260
|
-
embeddingsDim = Param(
|
|
261
|
-
Params._dummy(),
|
|
262
|
-
"embeddingsDim",
|
|
263
|
-
"Dimensionality of embeddings",
|
|
264
|
-
typeConverter=TypeConverters.toInt,
|
|
265
|
-
)
|
|
266
|
-
|
|
267
|
-
labels = Param(
|
|
268
|
-
Params._dummy(),
|
|
269
|
-
"labels",
|
|
270
|
-
"Labels in the dataset",
|
|
271
|
-
typeConverter=TypeConverters.toListString,
|
|
272
|
-
)
|
|
273
|
-
|
|
274
|
-
chars = Param(
|
|
275
|
-
Params._dummy(),
|
|
276
|
-
"chars",
|
|
277
|
-
"Set of characters in the dataset",
|
|
278
|
-
typeConverter=TypeConverters.toListString,
|
|
279
|
-
)
|
|
280
|
-
|
|
281
|
-
graphFolder = Param(
|
|
282
|
-
Params._dummy(),
|
|
283
|
-
"graphFolder",
|
|
284
|
-
"Folder path that contain external graph files",
|
|
285
|
-
typeConverter=TypeConverters.toString,
|
|
286
|
-
)
|
|
287
|
-
|
|
288
|
-
dsLen = Param(
|
|
289
|
-
Params._dummy(),
|
|
290
|
-
"dsLen",
|
|
291
|
-
"Length of the training dataset.",
|
|
292
|
-
typeConverter=TypeConverters.toInt,
|
|
293
|
-
)
|
|
237
|
+
# self._setDefault(lazyAnnotator=False)
|
|
@@ -17,6 +17,7 @@ from pyspark.ml.param import Param, Params, TypeConverters
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
class HasReaderProperties(Params):
|
|
20
|
+
|
|
20
21
|
inputCol = Param(
|
|
21
22
|
Params._dummy(),
|
|
22
23
|
"inputCol",
|
|
@@ -244,8 +245,8 @@ class HasReaderProperties(Params):
|
|
|
244
245
|
"""
|
|
245
246
|
return self._set(outputAsDocument=value)
|
|
246
247
|
|
|
247
|
-
|
|
248
248
|
class HasEmailReaderProperties(Params):
|
|
249
|
+
|
|
249
250
|
addAttachmentContent = Param(
|
|
250
251
|
Params._dummy(),
|
|
251
252
|
"addAttachmentContent",
|
|
@@ -277,6 +278,7 @@ class HasEmailReaderProperties(Params):
|
|
|
277
278
|
|
|
278
279
|
|
|
279
280
|
class HasExcelReaderProperties(Params):
|
|
281
|
+
|
|
280
282
|
cellSeparator = Param(
|
|
281
283
|
Params._dummy(),
|
|
282
284
|
"cellSeparator",
|
|
@@ -335,8 +337,8 @@ class HasExcelReaderProperties(Params):
|
|
|
335
337
|
"""
|
|
336
338
|
return self.getOrDefault(self.appendCells)
|
|
337
339
|
|
|
338
|
-
|
|
339
340
|
class HasHTMLReaderProperties(Params):
|
|
341
|
+
|
|
340
342
|
timeout = Param(
|
|
341
343
|
Params._dummy(),
|
|
342
344
|
"timeout",
|
|
@@ -393,8 +395,8 @@ class HasHTMLReaderProperties(Params):
|
|
|
393
395
|
"""
|
|
394
396
|
return self._set(outputFormat=value)
|
|
395
397
|
|
|
396
|
-
|
|
397
398
|
class HasPowerPointProperties(Params):
|
|
399
|
+
|
|
398
400
|
includeSlideNotes = Param(
|
|
399
401
|
Params._dummy(),
|
|
400
402
|
"includeSlideNotes",
|
|
@@ -424,8 +426,8 @@ class HasPowerPointProperties(Params):
|
|
|
424
426
|
"""
|
|
425
427
|
return self.getOrDefault(self.includeSlideNotes)
|
|
426
428
|
|
|
427
|
-
|
|
428
429
|
class HasTextReaderProperties(Params):
|
|
430
|
+
|
|
429
431
|
titleLengthSize = Param(
|
|
430
432
|
Params._dummy(),
|
|
431
433
|
"titleLengthSize",
|
|
@@ -434,28 +436,9 @@ class HasTextReaderProperties(Params):
|
|
|
434
436
|
)
|
|
435
437
|
|
|
436
438
|
def setTitleLengthSize(self, value):
|
|
437
|
-
"""Set the maximum character length used to identify title blocks.
|
|
438
|
-
|
|
439
|
-
Parameters
|
|
440
|
-
----------
|
|
441
|
-
value : int
|
|
442
|
-
Maximum number of characters a text block can have to be considered a title.
|
|
443
|
-
|
|
444
|
-
Returns
|
|
445
|
-
-------
|
|
446
|
-
self
|
|
447
|
-
The instance with updated `titleLengthSize` parameter.
|
|
448
|
-
"""
|
|
449
439
|
return self._set(titleLengthSize=value)
|
|
450
440
|
|
|
451
441
|
def getTitleLengthSize(self):
|
|
452
|
-
"""Get the configured maximum title length.
|
|
453
|
-
|
|
454
|
-
Returns
|
|
455
|
-
-------
|
|
456
|
-
int
|
|
457
|
-
The maximum character length used to detect title blocks.
|
|
458
|
-
"""
|
|
459
442
|
return self.getOrDefault(self.titleLengthSize)
|
|
460
443
|
|
|
461
444
|
groupBrokenParagraphs = Param(
|
|
@@ -466,28 +449,9 @@ class HasTextReaderProperties(Params):
|
|
|
466
449
|
)
|
|
467
450
|
|
|
468
451
|
def setGroupBrokenParagraphs(self, value):
|
|
469
|
-
"""Enable or disable grouping of broken paragraphs.
|
|
470
|
-
|
|
471
|
-
Parameters
|
|
472
|
-
----------
|
|
473
|
-
value : bool
|
|
474
|
-
True to merge fragmented lines into paragraphs, False to leave lines as-is.
|
|
475
|
-
|
|
476
|
-
Returns
|
|
477
|
-
-------
|
|
478
|
-
self
|
|
479
|
-
The instance with updated `groupBrokenParagraphs` parameter.
|
|
480
|
-
"""
|
|
481
452
|
return self._set(groupBrokenParagraphs=value)
|
|
482
453
|
|
|
483
454
|
def getGroupBrokenParagraphs(self):
|
|
484
|
-
"""Get whether broken paragraph grouping is enabled.
|
|
485
|
-
|
|
486
|
-
Returns
|
|
487
|
-
-------
|
|
488
|
-
bool
|
|
489
|
-
True if grouping of broken paragraphs is enabled, False otherwise.
|
|
490
|
-
"""
|
|
491
455
|
return self.getOrDefault(self.groupBrokenParagraphs)
|
|
492
456
|
|
|
493
457
|
paragraphSplit = Param(
|
|
@@ -498,28 +462,9 @@ class HasTextReaderProperties(Params):
|
|
|
498
462
|
)
|
|
499
463
|
|
|
500
464
|
def setParagraphSplit(self, value):
|
|
501
|
-
"""Set the regex pattern used to split paragraphs when grouping broken paragraphs.
|
|
502
|
-
|
|
503
|
-
Parameters
|
|
504
|
-
----------
|
|
505
|
-
value : str
|
|
506
|
-
Regular expression string used to detect paragraph boundaries.
|
|
507
|
-
|
|
508
|
-
Returns
|
|
509
|
-
-------
|
|
510
|
-
self
|
|
511
|
-
The instance with updated `paragraphSplit` parameter.
|
|
512
|
-
"""
|
|
513
465
|
return self._set(paragraphSplit=value)
|
|
514
466
|
|
|
515
467
|
def getParagraphSplit(self):
|
|
516
|
-
"""Get the paragraph-splitting regex pattern.
|
|
517
|
-
|
|
518
|
-
Returns
|
|
519
|
-
-------
|
|
520
|
-
str
|
|
521
|
-
The regex pattern used to detect paragraph boundaries.
|
|
522
|
-
"""
|
|
523
468
|
return self.getOrDefault(self.paragraphSplit)
|
|
524
469
|
|
|
525
470
|
shortLineWordThreshold = Param(
|
|
@@ -530,28 +475,9 @@ class HasTextReaderProperties(Params):
|
|
|
530
475
|
)
|
|
531
476
|
|
|
532
477
|
def setShortLineWordThreshold(self, value):
|
|
533
|
-
"""Set the maximum word count for a line to be considered short.
|
|
534
|
-
|
|
535
|
-
Parameters
|
|
536
|
-
----------
|
|
537
|
-
value : int
|
|
538
|
-
Number of words under which a line is considered 'short'.
|
|
539
|
-
|
|
540
|
-
Returns
|
|
541
|
-
-------
|
|
542
|
-
self
|
|
543
|
-
The instance with updated `shortLineWordThreshold` parameter.
|
|
544
|
-
"""
|
|
545
478
|
return self._set(shortLineWordThreshold=value)
|
|
546
479
|
|
|
547
480
|
def getShortLineWordThreshold(self):
|
|
548
|
-
"""Get the short line word threshold.
|
|
549
|
-
|
|
550
|
-
Returns
|
|
551
|
-
-------
|
|
552
|
-
int
|
|
553
|
-
Word count threshold for short lines used in paragraph grouping.
|
|
554
|
-
"""
|
|
555
481
|
return self.getOrDefault(self.shortLineWordThreshold)
|
|
556
482
|
|
|
557
483
|
maxLineCount = Param(
|
|
@@ -562,28 +488,9 @@ class HasTextReaderProperties(Params):
|
|
|
562
488
|
)
|
|
563
489
|
|
|
564
490
|
def setMaxLineCount(self, value):
|
|
565
|
-
"""Set the maximum number of lines to inspect when estimating paragraph layout.
|
|
566
|
-
|
|
567
|
-
Parameters
|
|
568
|
-
----------
|
|
569
|
-
value : int
|
|
570
|
-
Maximum number of lines to evaluate for layout heuristics.
|
|
571
|
-
|
|
572
|
-
Returns
|
|
573
|
-
-------
|
|
574
|
-
self
|
|
575
|
-
The instance with updated `maxLineCount` parameter.
|
|
576
|
-
"""
|
|
577
491
|
return self._set(maxLineCount=value)
|
|
578
492
|
|
|
579
493
|
def getMaxLineCount(self):
|
|
580
|
-
"""Get the maximum number of lines used for layout heuristics.
|
|
581
|
-
|
|
582
|
-
Returns
|
|
583
|
-
-------
|
|
584
|
-
int
|
|
585
|
-
The configured maximum number of lines to consider.
|
|
586
|
-
"""
|
|
587
494
|
return self.getOrDefault(self.maxLineCount)
|
|
588
495
|
|
|
589
496
|
threshold = Param(
|
|
@@ -594,58 +501,11 @@ class HasTextReaderProperties(Params):
|
|
|
594
501
|
)
|
|
595
502
|
|
|
596
503
|
def setThreshold(self, value):
|
|
597
|
-
"""Set the empty-line ratio threshold for paragraph grouping decision.
|
|
598
|
-
|
|
599
|
-
Parameters
|
|
600
|
-
----------
|
|
601
|
-
value : float
|
|
602
|
-
Ratio (0.0-1.0) of empty lines used to switch grouping strategies.
|
|
603
|
-
|
|
604
|
-
Returns
|
|
605
|
-
-------
|
|
606
|
-
self
|
|
607
|
-
The instance with updated `threshold` parameter.
|
|
608
|
-
"""
|
|
609
504
|
return self._set(threshold=value)
|
|
610
505
|
|
|
611
506
|
def getThreshold(self):
|
|
612
|
-
"""Get the configured empty-line threshold ratio.
|
|
613
|
-
|
|
614
|
-
Returns
|
|
615
|
-
-------
|
|
616
|
-
float
|
|
617
|
-
The ratio used to decide paragraph grouping strategy.
|
|
618
|
-
"""
|
|
619
507
|
return self.getOrDefault(self.threshold)
|
|
620
508
|
|
|
621
|
-
extractTagAttributes = Param(
|
|
622
|
-
Params._dummy(),
|
|
623
|
-
"extractTagAttributes",
|
|
624
|
-
"Extract attribute values into separate lines when parsing tag-based formats (e.g., HTML or XML).",
|
|
625
|
-
typeConverter=TypeConverters.toListString
|
|
626
|
-
)
|
|
627
|
-
|
|
628
|
-
def setExtractTagAttributes(self, attributes: list[str]):
|
|
629
|
-
"""
|
|
630
|
-
Specify which tag attributes should have their values extracted as text when parsing
|
|
631
|
-
tag-based formats (e.g., HTML or XML).
|
|
632
|
-
|
|
633
|
-
:param attributes: list of attribute names to extract
|
|
634
|
-
:return: this instance with the updated `extractTagAttributes` parameter
|
|
635
|
-
"""
|
|
636
|
-
return self._set(extractTagAttributes=attributes)
|
|
637
|
-
|
|
638
|
-
def getExtractTagAttributes(self):
|
|
639
|
-
"""Get the list of tag attribute names configured to be extracted.
|
|
640
|
-
|
|
641
|
-
Returns
|
|
642
|
-
-------
|
|
643
|
-
list[str]
|
|
644
|
-
The attribute names whose values will be extracted as text.
|
|
645
|
-
"""
|
|
646
|
-
return self.getOrDefault(self.extractTagAttributes)
|
|
647
|
-
|
|
648
|
-
|
|
649
509
|
class HasChunkerProperties(Params):
|
|
650
510
|
|
|
651
511
|
chunkingStrategy = Param(
|
|
@@ -91,19 +91,6 @@ class Reader2Doc(
|
|
|
91
91
|
"""
|
|
92
92
|
return self._set(excludeNonText=value)
|
|
93
93
|
|
|
94
|
-
joinString = Param(
|
|
95
|
-
Params._dummy(),
|
|
96
|
-
"joinString",
|
|
97
|
-
"If outputAsDocument is true, specifies the string used to join elements into a single document.",
|
|
98
|
-
typeConverter=TypeConverters.toString
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
def setJoinString(self, value):
|
|
102
|
-
"""
|
|
103
|
-
If outputAsDocument is true, specifies the string used to join elements into a single
|
|
104
|
-
"""
|
|
105
|
-
return self._set(joinString=value)
|
|
106
|
-
|
|
107
94
|
@keyword_only
|
|
108
95
|
def __init__(self):
|
|
109
96
|
super(Reader2Doc, self).__init__(classname="com.johnsnowlabs.reader.Reader2Doc")
|
|
@@ -112,12 +99,8 @@ class Reader2Doc(
|
|
|
112
99
|
explodeDocs=False,
|
|
113
100
|
contentType="",
|
|
114
101
|
flattenOutput=False,
|
|
115
|
-
|
|
116
|
-
outputFormat="plain-text",
|
|
117
|
-
excludeNonText=False,
|
|
118
|
-
joinString="\n"
|
|
102
|
+
titleThreshold=18
|
|
119
103
|
)
|
|
120
|
-
|
|
121
104
|
@keyword_only
|
|
122
105
|
def setParams(self):
|
|
123
106
|
kwargs = self._input_kwargs
|
|
@@ -35,8 +35,7 @@ class Reader2Table(
|
|
|
35
35
|
@keyword_only
|
|
36
36
|
def __init__(self):
|
|
37
37
|
super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
|
|
38
|
-
self._setDefault(outputCol="document"
|
|
39
|
-
outputAsDocument=False)
|
|
38
|
+
self._setDefault(outputCol="document")
|
|
40
39
|
|
|
41
40
|
@keyword_only
|
|
42
41
|
def setParams(self):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|