spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +81 -28
- sparknlp/annotation.py +3 -2
- sparknlp/annotator/__init__.py +6 -0
- sparknlp/annotator/audio/__init__.py +2 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/{base → annotator}/chunk2_doc.py +4 -7
- sparknlp/annotator/chunker.py +1 -2
- sparknlp/annotator/classifier_dl/__init__.py +17 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/spanbert_coref.py +4 -18
- sparknlp/annotator/cv/__init__.py +15 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/dependency_parser.py +2 -3
- sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +37 -1
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +11 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
- sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
- sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
- sparknlp/annotator/embeddings/doc2vec.py +7 -1
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
- sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
- sparknlp/annotator/embeddings/word2vec.py +7 -1
- sparknlp/annotator/embeddings/word_embeddings.py +4 -5
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
- sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
- sparknlp/annotator/er/entity_ruler.py +37 -23
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
- sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
- sparknlp/annotator/lemmatizer.py +3 -4
- sparknlp/annotator/matcher/date_matcher.py +35 -3
- sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
- sparknlp/annotator/matcher/regex_matcher.py +3 -3
- sparknlp/annotator/matcher/text_matcher.py +2 -3
- sparknlp/annotator/n_gram_generator.py +1 -2
- sparknlp/annotator/ner/__init__.py +3 -1
- sparknlp/annotator/ner/ner_converter.py +18 -0
- sparknlp/annotator/ner/ner_crf.py +4 -5
- sparknlp/annotator/ner/ner_dl.py +10 -5
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +2 -2
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +2 -2
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/pos/perceptron.py +6 -7
- sparknlp/annotator/sentence/sentence_detector.py +2 -2
- sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
- sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
- sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
- sparknlp/annotator/seq2seq/__init__.py +17 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
- sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
- sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
- sparknlp/annotator/stemmer.py +2 -3
- sparknlp/annotator/stop_words_cleaner.py +3 -4
- sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
- sparknlp/annotator/token/__init__.py +0 -1
- sparknlp/annotator/token/recursive_tokenizer.py +2 -3
- sparknlp/annotator/token/tokenizer.py +2 -3
- sparknlp/annotator/ws/word_segmenter.py +35 -10
- sparknlp/base/__init__.py +2 -3
- sparknlp/base/doc2_chunk.py +0 -3
- sparknlp/base/document_assembler.py +5 -5
- sparknlp/base/embeddings_finisher.py +14 -2
- sparknlp/base/finisher.py +15 -4
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/image_assembler.py +69 -0
- sparknlp/base/light_pipeline.py +53 -21
- sparknlp/base/multi_document_assembler.py +9 -13
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/token_assembler.py +1 -2
- sparknlp/common/__init__.py +2 -0
- sparknlp/common/annotator_type.py +1 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +914 -9
- sparknlp/internal/__init__.py +841 -116
- sparknlp/internal/annotator_java_ml.py +1 -1
- sparknlp/internal/annotator_transformer.py +3 -0
- sparknlp/logging/comet.py +2 -2
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/pretrained_pipeline.py +1 -1
- sparknlp/pretrained/resource_downloader.py +126 -2
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +1 -0
- sparknlp/training/conll.py +8 -2
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/util.py +26 -0
- spark_nlp-4.2.6.dist-info/METADATA +0 -1256
- spark_nlp-4.2.6.dist-info/RECORD +0 -196
- {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
- /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
sparknlp/annotator/spell_check/norvig_sweeting.py CHANGED
@@ -29,9 +29,6 @@ class NorvigSweetingApproach(AnnotatorApproach):
 
     For instantiated/pretrained models, see :class:`.NorvigSweetingModel`.
 
-    For extended examples of usage, see the `Spark NLP Workshop
-    <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.
-
     ====================== ======================
     Input Annotation types Output Annotation type
     ====================== ======================
@@ -270,11 +267,11 @@ class NorvigSweetingModel(AnnotatorModel):
 
     The default model is ``"spellcheck_norvig"``, if no name is provided. For
     available pretrained models please see the `Models Hub
-    <https://
+    <https://sparknlp.org/models?task=Spell+Check>`__.
 
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
sparknlp/annotator/spell_check/symmetric_delete.py CHANGED
@@ -212,7 +212,7 @@ class SymmetricDeleteModel(AnnotatorModel):
 
     The default model is ``"spellcheck_sd"``, if no name is provided. For
     available pretrained models please see the `Models Hub
-    <https://
+    <https://sparknlp.org/models?task=Spell+Check>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
sparknlp/annotator/stemmer.py CHANGED
@@ -19,8 +19,8 @@ class Stemmer(AnnotatorModel):
     """Returns hard-stems out of words with the objective of retrieving the
     meaningful part of the word.
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -77,4 +77,3 @@ class Stemmer(AnnotatorModel):
         self._setDefault(
             language="english"
         )
-
sparknlp/annotator/stop_words_cleaner.py CHANGED
@@ -34,10 +34,10 @@ class StopWordsCleaner(AnnotatorModel):
     This will load the default pretrained model ``"stopwords_en"``.
 
     For available pretrained models please see the `Models Hub
-    <https://
+    <https://sparknlp.org/models?task=Stop+Words+Removal>`__.
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stop-words/StopWordsCleaner.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -188,4 +188,3 @@ class StopWordsCleaner(AnnotatorModel):
         """
         from sparknlp.pretrained import ResourceDownloader
         return ResourceDownloader.downloadModel(StopWordsCleaner, name, lang, remote_loc)
-
sparknlp/annotator/token/__init__.py CHANGED
@@ -16,5 +16,4 @@
 from sparknlp.annotator.token.chunk_tokenizer import *
 from sparknlp.annotator.token.recursive_tokenizer import *
 from sparknlp.annotator.token.regex_tokenizer import *
-from sparknlp.annotator.token.token2_chunk import *
 from sparknlp.annotator.token.tokenizer import *
sparknlp/annotator/token/recursive_tokenizer.py CHANGED
@@ -28,8 +28,8 @@ class RecursiveTokenizer(AnnotatorApproach):
     - ``infixes``: Strings that will be split when found at the middle of token.
     - ``whitelist``: Whitelist of strings not to split
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -203,4 +203,3 @@ class RecursiveTokenizerModel(AnnotatorModel):
             classname=classname,
             java_model=java_model
         )
-
sparknlp/annotator/token/tokenizer.py CHANGED
@@ -27,8 +27,8 @@ class Tokenizer(AnnotatorApproach):
     Identifies tokens with tokenization open standards. A few rules will help
     customizing it if defaults do not fit user needs.
 
-    For extended examples of usage see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -559,4 +559,3 @@ class TokenizerModel(AnnotatorModel):
         """
         from sparknlp.pretrained import ResourceDownloader
         return ResourceDownloader.downloadModel(TokenizerModel, name, lang, remote_loc)
-
sparknlp/annotator/ws/word_segmenter.py CHANGED
@@ -20,11 +20,27 @@ class WordSegmenterApproach(AnnotatorApproach):
     """Trains a WordSegmenter which tokenizes non-english or non-whitespace
     separated texts.
 
-    Many languages are not whitespace separated and their sentences are a
-
-
-
-
+    Many languages are not whitespace separated and their sentences are a concatenation
+    of many symbols, like Korean, Japanese or Chinese. Without understanding the
+    language, splitting the words into their corresponding tokens is impossible. The
+    WordSegmenter is trained to understand these languages and split them into
+    semantically correct parts.
+
+    This annotator is based on the paper Chinese Word Segmentation as Character Tagging
+    [1]. Word segmentation is treated as a tagging problem. Each character is tagged
+    as one of four different labels: LL (left boundary), RR (right boundary), MM (middle)
+    and LR (word by itself). The label depends on the position of the word in the
+    sentence. LL tagged words will combine with the word on the right. Likewise, RR
+    tagged words combine with words on the left. MM tagged words are treated as the
+    middle of the word and combine with either side. LR tagged words are words by
+    themselves.
+
+    Example (from [1], Example 3(a) (raw), 3(b) (tagged), 3(c) (translation)):
+    - 上海 计划 到 本 世纪 末 实现 人均 国内 生产 总值 五千 美元
+    - 上/LL 海/RR 计/LL 划/RR 到/LR 本/LR 世/LL 纪/RR 末/LR 实/LL 现/RR 人/LL 均/RR
+      国/LL 内/RR 生/LL 产/RR 总/LL 值/RR 五/LL 千/RR 美/LL 元/RR
+    - Shanghai plans to reach the goal of 5,000 dollars in per capita GDP by the end
+      of the century.
 
     For instantiated/pretrained models, see :class:`.WordSegmenterModel`.
 
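The LL/RR/MM/LR scheme described in the new docstring is easy to sanity-check outside Spark NLP. Below is a minimal, hypothetical Python sketch (not part of the package; the helper name `decode_tags` is made up for illustration) of how such character tags combine into words:

```python
def decode_tags(chars, tags):
    """Combine characters into words according to LL/RR/MM/LR tags:
    LL opens a word, MM continues it, RR closes it, LR stands alone."""
    words, current = [], ""
    for ch, tag in zip(chars, tags):
        if tag == "LR":        # a word by itself
            if current:        # flush any unterminated word first
                words.append(current)
                current = ""
            words.append(ch)
        elif tag == "RR":      # right boundary closes the current word
            words.append(current + ch)
            current = ""
        else:                  # LL or MM extends the current word
            current += ch
    if current:
        words.append(current)
    return words

# First tokens of the docstring example: 上海 (Shanghai), 计划 (plans), 到 (to)
print(decode_tags(list("上海计划到"), ["LL", "RR", "LL", "RR", "LR"]))
# -> ['上海', '计划', '到']
```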
@@ -37,8 +53,17 @@ class WordSegmenterApproach(AnnotatorApproach):
     The helper class :class:`.POS` might be useful to read training data into
     data frames.
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/chinese/word_segmentation>`__.
+
+    References
+    ----------
+
+    `[1] <https://aclanthology.org/O03-4002.pdf>`__ Xue, Nianwen. “Chinese Word
+    Segmentation as Character Tagging.” International Journal of Computational
+    Linguistics & Chinese Language Processing, Volume 8, Number 1, February 2003:
+    Special Issue on Word Formation and Chinese Language Processing, 2003, pp. 29-48.
+    ACLWeb, https://aclanthology.org/O03-4002.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -282,10 +307,10 @@ class WordSegmenterModel(AnnotatorModel):
 
     The default model is ``"wordseg_pku"``, default language is ``"zh"``, if no
     values are provided. For available pretrained models please see the `Models
-    Hub <https://
+    Hub <https://sparknlp.org/models?task=Word+Segmentation>`__.
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/jupyter/annotation/chinese/word_segmentation/words_segmenter_demo.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
sparknlp/base/__init__.py CHANGED
@@ -12,13 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Module of base Spark NLP annotators."""
-
-from sparknlp.base.chunk2_doc import *
 from sparknlp.base.doc2_chunk import *
 from sparknlp.base.document_assembler import *
 from sparknlp.base.multi_document_assembler import *
 from sparknlp.base.embeddings_finisher import *
 from sparknlp.base.finisher import *
+from sparknlp.base.gguf_ranking_finisher import *
 from sparknlp.base.graph_finisher import *
 from sparknlp.base.has_recursive_fit import *
 from sparknlp.base.has_recursive_transform import *
@@ -28,4 +27,4 @@ from sparknlp.base.token_assembler import *
 from sparknlp.base.image_assembler import *
 from sparknlp.base.audio_assembler import *
 from sparknlp.base.table_assembler import *
-
+from sparknlp.base.prompt_assembler import *
sparknlp/base/doc2_chunk.py CHANGED
@@ -29,9 +29,6 @@ class Doc2Chunk(AnnotatorTransformer, AnnotatorProperties):
     ``StringType`` or ``ArrayType[StringType]`` (using setIsArray). Useful for
     annotators that require a CHUNK type input.
 
-    For more extended examples on document pre-processing see the
-    `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
-
     ====================== ======================
     Input Annotation types Output Annotation type
     ====================== ======================
sparknlp/base/document_assembler.py CHANGED
@@ -24,13 +24,13 @@ class DocumentAssembler(AnnotatorTransformer):
     """Prepares data into a format that is processable by Spark NLP.
 
     This is the entry point for every Spark NLP pipeline. The
-    `DocumentAssembler`
-
-
-
+    `DocumentAssembler` reads ``String`` columns. Additionally,
+    :meth:`.setCleanupMode` can be used to pre-process the
+    text (Default: ``disabled``). For possible options please refer to the
+    parameters section.
 
     For more extended examples on document pre-processing see the
-    `
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-assembler/Loading_Documents_With_DocumentAssembler.ipynb>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
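As a usage note for the docstring above, a minimal entry-point sketch using the ``setCleanupMode`` option it mentions might look like this (``shrink``, one of the documented modes, collapses repeated whitespace; the sample data is made up):

```python
import sparknlp
from sparknlp.base import DocumentAssembler

spark = sparknlp.start()

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document") \
    .setCleanupMode("shrink")  # collapse repeated whitespace and newlines

data = spark.createDataFrame([["Spark  NLP   is  an  open-source  library."]]).toDF("text")
documentAssembler.transform(data).select("document.result").show(truncate=False)
```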
sparknlp/base/embeddings_finisher.py CHANGED
@@ -34,7 +34,8 @@ class EmbeddingsFinisher(AnnotatorTransformer):
     require a ``featureCol``.
 
     For more extended examples see the
-    `
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb
+    >`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -127,7 +128,8 @@ class EmbeddingsFinisher(AnnotatorTransformer):
         super(EmbeddingsFinisher, self).__init__(classname="com.johnsnowlabs.nlp.EmbeddingsFinisher")
         self._setDefault(
             cleanAnnotations=False,
-            outputAsVector=False
+            outputAsVector=False,
+            outputCols=[]
         )
 
     @keyword_only
@@ -187,3 +189,13 @@ class EmbeddingsFinisher(AnnotatorTransformer):
 
         return self._set(outputAsVector=value)
 
+    def getInputCols(self):
+        """Gets input columns name of annotations."""
+        return self.getOrDefault(self.inputCols)
+
+    def getOutputCols(self):
+        """Gets output columns name of annotations."""
+        if len(self.getOrDefault(self.outputCols)) == 0:
+            return ["finished_" + input_col for input_col in self.getInputCols()]
+        else:
+            return self.getOrDefault(self.outputCols)
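The new ``outputCols=[]`` default together with ``getOutputCols`` means output names are derived from the inputs when none are set. A small sketch of that behavior (column names here are arbitrary):

```python
import sparknlp
from sparknlp.base import EmbeddingsFinisher

spark = sparknlp.start()

finisher = EmbeddingsFinisher() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputAsVector(True)

# No outputCols set: names fall back to "finished_" + each input column
print(finisher.getOutputCols())  # ['finished_sentence_embeddings']

# Explicit outputCols take precedence over the derived defaults
finisher.setOutputCols(["embeddings_vector"])
print(finisher.getOutputCols())  # ['embeddings_vector']
```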
sparknlp/base/finisher.py CHANGED
@@ -25,7 +25,8 @@ class Finisher(AnnotatorTransformer):
     outputs annotation(s) values into ``String``.
 
     For more extended examples on document pre-processing see the
-    `
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb
+    >`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -97,7 +98,6 @@ class Finisher(AnnotatorTransformer):
     includeMetadata = Param(Params._dummy(), "includeMetadata", "annotation metadata format", typeConverter=TypeConverters.toBoolean)
     outputAsArray = Param(Params._dummy(), "outputAsArray", "finisher generates an Array with the results instead of string", typeConverter=TypeConverters.toBoolean)
     parseEmbeddingsVectors = Param(Params._dummy(), "parseEmbeddingsVectors", "whether to include embeddings vectors in the process", typeConverter=TypeConverters.toBoolean)
-
     name = "Finisher"
 
     @keyword_only
@@ -109,7 +109,8 @@ class Finisher(AnnotatorTransformer):
             outputAsArray=True,
             parseEmbeddingsVectors=False,
             valueSplitSymbol="#",
-            annotationSplitSymbol="@"
+            annotationSplitSymbol="@",
+            outputCols=[]
         )
 
     @keyword_only
@@ -122,7 +123,7 @@ class Finisher(AnnotatorTransformer):
 
         Parameters
         ----------
-        *value : str
+        *value : List[str]
             Input columns for the annotator
         """
         if len(value) == 1 and type(value[0]) == list:
@@ -204,3 +205,13 @@ class Finisher(AnnotatorTransformer):
         """
         return self._set(parseEmbeddingsVectors=value)
 
+    def getInputCols(self):
+        """Gets input columns name of annotations."""
+        return self.getOrDefault(self.inputCols)
+
+    def getOutputCols(self):
+        """Gets output columns name of annotations."""
+        if len(self.getOrDefault(self.outputCols)) == 0:
+            return ["finished_" + input_col for input_col in self.getInputCols()]
+        else:
+            return self.getOrDefault(self.outputCols)
sparknlp/base/gguf_ranking_finisher.py ADDED
@@ -0,0 +1,234 @@
+# Copyright 2017-2024 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the GGUFRankingFinisher."""
+
+from pyspark import keyword_only
+from pyspark.ml.param import TypeConverters, Params, Param
+from sparknlp.internal import AnnotatorTransformer
+
+
+class GGUFRankingFinisher(AnnotatorTransformer):
+    """Finisher for AutoGGUFReranker outputs that provides ranking capabilities
+    including top-k selection, sorting by relevance score, and score normalization.
+
+    This finisher processes the output of AutoGGUFReranker, which contains documents with
+    relevance scores in their metadata. It provides several options for post-processing:
+
+    - Top-k selection: Select only the top k documents by relevance score
+    - Score thresholding: Filter documents by minimum relevance score
+    - Min-max scaling: Normalize relevance scores to 0-1 range
+    - Sorting: Sort documents by relevance score in descending order
+    - Ranking: Add rank information to document metadata
+
+    The finisher preserves the document annotation structure while adding ranking information
+    to the metadata and optionally filtering/sorting the documents.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/finisher/gguf_ranking_finisher_example.py>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DOCUMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+    inputCols
+        Name of input annotation columns containing reranked documents
+    outputCol
+        Name of output annotation column containing ranked documents, by default "ranked_documents"
+    topK
+        Maximum number of top documents to return based on relevance score (-1 for no limit), by default -1
+    minRelevanceScore
+        Minimum relevance score threshold for filtering documents, by default Double.MinValue
+    minMaxScaling
+        Whether to apply min-max scaling to normalize relevance scores to 0-1 range, by default False
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> reranker = AutoGGUFReranker.pretrained() \\
+    ...     .setInputCols("document") \\
+    ...     .setOutputCol("reranked_documents") \\
+    ...     .setQuery("A man is eating pasta.")
+    >>> finisher = GGUFRankingFinisher() \\
+    ...     .setInputCols("reranked_documents") \\
+    ...     .setOutputCol("ranked_documents") \\
+    ...     .setTopK(3) \\
+    ...     .setMinMaxScaling(True)
+    >>> pipeline = Pipeline().setStages([documentAssembler, reranker, finisher])
+    >>> data = spark.createDataFrame([
+    ...     ("A man is eating food.",),
+    ...     ("A man is eating a piece of bread.",),
+    ...     ("The girl is carrying a baby.",),
+    ...     ("A man is riding a horse.",)
+    ... ], ["text"])
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("ranked_documents").show(truncate=False)
+    # Documents will be sorted by relevance with rank information in metadata
+    """
+
+    name = "GGUFRankingFinisher"
+
+    inputCols = Param(Params._dummy(),
+                      "inputCols",
+                      "Name of input annotation columns containing reranked documents",
+                      typeConverter=TypeConverters.toListString)
+
+    outputCol = Param(Params._dummy(),
+                      "outputCol",
+                      "Name of output annotation column containing ranked documents",
+                      typeConverter=TypeConverters.toListString)
+
+    topK = Param(Params._dummy(),
+                 "topK",
+                 "Maximum number of top documents to return based on relevance score (-1 for no limit)",
+                 typeConverter=TypeConverters.toInt)
+
+    minRelevanceScore = Param(Params._dummy(),
+                              "minRelevanceScore",
+                              "Minimum relevance score threshold for filtering documents",
+                              typeConverter=TypeConverters.toFloat)
+
+    minMaxScaling = Param(Params._dummy(),
+                          "minMaxScaling",
+                          "Whether to apply min-max scaling to normalize relevance scores to 0-1 range",
+                          typeConverter=TypeConverters.toBoolean)
+
+    @keyword_only
+    def __init__(self):
+        super(GGUFRankingFinisher, self).__init__(
+            classname="com.johnsnowlabs.nlp.finisher.GGUFRankingFinisher")
+        self._setDefault(
+            topK=-1,
+            minRelevanceScore=float('-inf'),  # Equivalent to Double.MinValue
+            minMaxScaling=False,
+            outputCol=["ranked_documents"]
+        )
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setInputCols(self, *value):
+        """Sets input annotation column names.
+
+        Parameters
+        ----------
+        value : List[str]
+            Input annotation column names containing reranked documents
+        """
+        if len(value) == 1 and isinstance(value[0], list):
+            return self._set(inputCols=value[0])
+        else:
+            return self._set(inputCols=list(value))
+
+    def getInputCols(self):
+        """Gets input annotation column names.
+
+        Returns
+        -------
+        List[str]
+            Input annotation column names
+        """
+        return self.getOrDefault(self.inputCols)
+
+    def setOutputCol(self, value):
+        """Sets output annotation column name.
+
+        Parameters
+        ----------
+        value : str
+            Output annotation column name
+        """
+        return self._set(outputCol=[value])
+
+    def getOutputCol(self):
+        """Gets output annotation column name.
+
+        Returns
+        -------
+        str
+            Output annotation column name
+        """
+        output_cols = self.getOrDefault(self.outputCol)
+        return output_cols[0] if output_cols else "ranked_documents"
+
+    def setTopK(self, value):
+        """Sets maximum number of top documents to return.
+
+        Parameters
+        ----------
+        value : int
+            Maximum number of top documents to return (-1 for no limit)
+        """
+        return self._set(topK=value)
+
+    def getTopK(self):
+        """Gets maximum number of top documents to return.
+
+        Returns
+        -------
+        int
+            Maximum number of top documents to return
+        """
+        return self.getOrDefault(self.topK)
+
+    def setMinRelevanceScore(self, value):
+        """Sets minimum relevance score threshold.
+
+        Parameters
+        ----------
+        value : float
+            Minimum relevance score threshold
+        """
+        return self._set(minRelevanceScore=value)
+
+    def getMinRelevanceScore(self):
+        """Gets minimum relevance score threshold.
+
+        Returns
+        -------
+        float
+            Minimum relevance score threshold
+        """
+        return self.getOrDefault(self.minRelevanceScore)
+
+    def setMinMaxScaling(self, value):
+        """Sets whether to apply min-max scaling.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to apply min-max scaling to normalize scores
+        """
+        return self._set(minMaxScaling=value)
+
+    def getMinMaxScaling(self):
+        """Gets whether to apply min-max scaling.
+
+        Returns
+        -------
+        bool
+            Whether min-max scaling is enabled
+        """
+        return self.getOrDefault(self.minMaxScaling)
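A note on the ``minMaxScaling`` option in the new file above: min-max normalization maps each relevance score into the 0-1 range via ``(s - min) / (max - min)``. A plain-Python sketch of that arithmetic (the tie case where all scores are equal is handled here by returning 1.0, which is an assumption, not necessarily the finisher's exact behavior):

```python
def min_max_scale(scores):
    """Normalize scores to [0, 1]: (s - min) / (max - min)."""
    lo, hi = min(scores), max(scores)
    if hi == lo:  # all scores equal; avoid division by zero (assumed behavior)
        return [1.0 for _ in scores]
    return [(s - lo) / (hi - lo) for s in scores]

print(min_max_scale([-2.1, 0.4, 3.7]))  # [0.0, 0.431..., 1.0]
```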
sparknlp/base/image_assembler.py CHANGED
@@ -15,6 +15,8 @@
 
 from pyspark import keyword_only
 from pyspark.ml.param import TypeConverters, Params, Param
+from pyspark.sql import SparkSession, DataFrame
+from pyspark.sql.functions import regexp_replace, col
 
 from sparknlp.common import AnnotatorType
 from sparknlp.internal import AnnotatorTransformer
@@ -65,6 +67,7 @@ class ImageAssembler(AnnotatorTransformer):
     outputAnnotatorType = AnnotatorType.IMAGE
 
     inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString)
+    textCol = Param(Params._dummy(), "textCol", "text column name", typeConverter=TypeConverters.toString)
     outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString)
     name = 'ImageAssembler'
 
@@ -101,3 +104,69 @@ class ImageAssembler(AnnotatorTransformer):
     def getOutputCol(self):
         """Gets output column name of annotations."""
         return self.getOrDefault(self.outputCol)
+
+    def setTextCol(self, value):
+        """Sets an optional text column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of an optional input text column
+        """
+        return self._set(inputCol=value)
+
+    @classmethod
+    def loadImagesAsBytes(cls, spark: SparkSession, path: str):
+        """
+        Loads images from a given path and returns them as raw bytes, instead of the default
+        OpenCV-compatible format. Supported image types include JPEG, PNG, GIF, and BMP.
+
+        Multimodal inference with llama.cpp requires raw bytes as input.
+
+        Parameters
+        ----------
+        spark : SparkSession
+            The active SparkSession.
+        path : str
+            The path to the images. Supported image types are JPEG, PNG, GIF, and BMP.
+
+        Returns
+        -------
+        DataFrame
+            A DataFrame containing the images as raw bytes along with their metadata.
+        """
+
+        # Replace the path separator in the `origin` field and `path` column, so that they match
+        def replace_path(column_name: str):
+            return regexp_replace(col(column_name), ":///", ":/")
+
+        # Load the images as metadata with the default Spark image format
+        data = (
+            spark.read.format("image")
+            .option("dropInvalid", True)
+            .load(path)
+            .withColumn(
+                "image", col("image").withField("origin", replace_path("image.origin"))
+            )
+        )
+
+        # Load the images as raw binary files
+        image_bytes = (
+            spark.read.format("binaryFile")
+            .option("pathGlobFilter", "*.{jpeg,jpg,png,gif,bmp,JPEG,JPG,PNG,GIF,BMP}")
+            .option("dropInvalid", True)
+            .load(path)
+            .withColumn("path", replace_path("path"))
+        )
+
+        # Join the two datasets on the file path
+        df_joined = data.join(
+            image_bytes, data["image.origin"] == image_bytes["path"], "inner"
+        )
+
+        # Replace the `data` field of the `image` column with raw bytes
+        df_image_replaced = df_joined.withColumn(
+            "image", df_joined["image"].withField("data", df_joined["content"])
+        )
+
+        return df_image_replaced
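A possible way to combine the new ``loadImagesAsBytes`` helper with the assembler itself (a sketch; the folder path is a placeholder):

```python
import sparknlp
from sparknlp.base import ImageAssembler

spark = sparknlp.start()

# Load JPEG/PNG/GIF/BMP files as raw bytes plus Spark image metadata
image_df = ImageAssembler.loadImagesAsBytes(spark, "file:///tmp/images")

imageAssembler = ImageAssembler() \
    .setInputCol("image") \
    .setOutputCol("image_assembler")

imageAssembler.transform(image_df).select("image_assembler.origin").show(truncate=False)
```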