PyPI - spark-nlp - Versions diffs - 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl - Mend

spark-nlp: 4.2.6 (py2.py3-none-any.whl) → 6.2.1 (py2.py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (221) hide show

com/johnsnowlabs/ml/__init__.py +0 -0
com/johnsnowlabs/ml/ai/__init__.py +10 -0
spark_nlp-6.2.1.dist-info/METADATA +362 -0
spark_nlp-6.2.1.dist-info/RECORD +292 -0
{spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
sparknlp/__init__.py +81 -28
sparknlp/annotation.py +3 -2
sparknlp/annotator/__init__.py +6 -0
sparknlp/annotator/audio/__init__.py +2 -0
sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
sparknlp/{base → annotator}/chunk2_doc.py +4 -7
sparknlp/annotator/chunker.py +1 -2
sparknlp/annotator/classifier_dl/__init__.py +17 -0
sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
sparknlp/annotator/cleaners/__init__.py +15 -0
sparknlp/annotator/cleaners/cleaner.py +202 -0
sparknlp/annotator/cleaners/extractor.py +191 -0
sparknlp/annotator/coref/spanbert_coref.py +4 -18
sparknlp/annotator/cv/__init__.py +15 -0
sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
sparknlp/annotator/cv/florence2_transformer.py +180 -0
sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
sparknlp/annotator/dataframe_optimizer.py +216 -0
sparknlp/annotator/date2_chunk.py +88 -0
sparknlp/annotator/dependency/dependency_parser.py +2 -3
sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
sparknlp/annotator/document_character_text_splitter.py +228 -0
sparknlp/annotator/document_normalizer.py +37 -1
sparknlp/annotator/document_token_splitter.py +175 -0
sparknlp/annotator/document_token_splitter_test.py +85 -0
sparknlp/annotator/embeddings/__init__.py +11 -0
sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
sparknlp/annotator/embeddings/doc2vec.py +7 -1
sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
sparknlp/annotator/embeddings/word2vec.py +7 -1
sparknlp/annotator/embeddings/word_embeddings.py +4 -5
sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
sparknlp/annotator/er/entity_ruler.py +37 -23
sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
sparknlp/annotator/lemmatizer.py +3 -4
sparknlp/annotator/matcher/date_matcher.py +35 -3
sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
sparknlp/annotator/matcher/regex_matcher.py +3 -3
sparknlp/annotator/matcher/text_matcher.py +2 -3
sparknlp/annotator/n_gram_generator.py +1 -2
sparknlp/annotator/ner/__init__.py +3 -1
sparknlp/annotator/ner/ner_converter.py +18 -0
sparknlp/annotator/ner/ner_crf.py +4 -5
sparknlp/annotator/ner/ner_dl.py +10 -5
sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
sparknlp/annotator/ner/ner_overwriter.py +2 -2
sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
sparknlp/annotator/normalizer.py +2 -2
sparknlp/annotator/openai/__init__.py +16 -0
sparknlp/annotator/openai/openai_completion.py +349 -0
sparknlp/annotator/openai/openai_embeddings.py +106 -0
sparknlp/annotator/pos/perceptron.py +6 -7
sparknlp/annotator/sentence/sentence_detector.py +2 -2
sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
sparknlp/annotator/seq2seq/__init__.py +17 -0
sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
sparknlp/annotator/similarity/__init__.py +0 -0
sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
sparknlp/annotator/stemmer.py +2 -3
sparknlp/annotator/stop_words_cleaner.py +3 -4
sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
sparknlp/annotator/token/__init__.py +0 -1
sparknlp/annotator/token/recursive_tokenizer.py +2 -3
sparknlp/annotator/token/tokenizer.py +2 -3
sparknlp/annotator/ws/word_segmenter.py +35 -10
sparknlp/base/__init__.py +2 -3
sparknlp/base/doc2_chunk.py +0 -3
sparknlp/base/document_assembler.py +5 -5
sparknlp/base/embeddings_finisher.py +14 -2
sparknlp/base/finisher.py +15 -4
sparknlp/base/gguf_ranking_finisher.py +234 -0
sparknlp/base/image_assembler.py +69 -0
sparknlp/base/light_pipeline.py +53 -21
sparknlp/base/multi_document_assembler.py +9 -13
sparknlp/base/prompt_assembler.py +207 -0
sparknlp/base/token_assembler.py +1 -2
sparknlp/common/__init__.py +2 -0
sparknlp/common/annotator_type.py +1 -0
sparknlp/common/completion_post_processing.py +37 -0
sparknlp/common/match_strategy.py +33 -0
sparknlp/common/properties.py +914 -9
sparknlp/internal/__init__.py +841 -116
sparknlp/internal/annotator_java_ml.py +1 -1
sparknlp/internal/annotator_transformer.py +3 -0
sparknlp/logging/comet.py +2 -2
sparknlp/partition/__init__.py +16 -0
sparknlp/partition/partition.py +244 -0
sparknlp/partition/partition_properties.py +902 -0
sparknlp/partition/partition_transformer.py +200 -0
sparknlp/pretrained/pretrained_pipeline.py +1 -1
sparknlp/pretrained/resource_downloader.py +126 -2
sparknlp/reader/__init__.py +15 -0
sparknlp/reader/enums.py +19 -0
sparknlp/reader/pdf_to_text.py +190 -0
sparknlp/reader/reader2doc.py +124 -0
sparknlp/reader/reader2image.py +136 -0
sparknlp/reader/reader2table.py +44 -0
sparknlp/reader/reader_assembler.py +159 -0
sparknlp/reader/sparknlp_reader.py +461 -0
sparknlp/training/__init__.py +1 -0
sparknlp/training/conll.py +8 -2
sparknlp/training/spacy_to_annotation.py +57 -0
sparknlp/util.py +26 -0
spark_nlp-4.2.6.dist-info/METADATA +0 -1256
spark_nlp-4.2.6.dist-info/RECORD +0 -196
{spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
/sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0

sparknlp/annotator/cleaners/extractor.py ADDED Viewed

@@ -0,0 +1,191 @@
+#  Copyright 2017-2025 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""Contains classes for Extractor."""
+from sparknlp.common import *
+class Extractor(AnnotatorModel):
+    """Python wrapper for the ``Extractor`` annotator of the cleaners package.
+    Consumes ``DOCUMENT`` annotations and emits ``CHUNK`` annotations. This
+    class only declares the parameters and their setters; the extraction
+    itself is performed by the JVM class named by ``classname`` in
+    ``__init__`` and is not visible from this file.
+    No Python-side defaults are set here (there is no ``_setDefault`` call);
+    presumably the defaults come from the JVM class -- TODO confirm.
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``CHUNK``
+    ====================== ======================
+    Parameters
+    ----------
+    emailDateTimeTzPattern
+        Date-time pattern for email timestamps, including time zone
+        formatting
+    emailAddress
+        Pattern for email addresses
+    ipAddressPattern
+        Pattern for IP addresses
+    ipAddressNamePattern
+        Pattern for IP addresses with names
+    mapiIdPattern
+        Pattern for MAPI IDs
+    usPhoneNumbersPattern
+        Pattern for US phone numbers
+    imageUrlPattern
+        Pattern for image URLs
+    textPattern
+        Pattern for text after and before
+    extractorMode
+        Which extraction to run. Possible values: ``email_date``,
+        ``email_address``, ``ip_address``, ``ip_address_name``, ``mapi_id``,
+        ``us_phone_numbers``, ``image_urls``, ``bullets``, ``text_after``,
+        ``text_before``
+    index
+        Index of the pattern to extract in text after or before
+    """
+    name = "Extractor"
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+    outputAnnotatorType = AnnotatorType.CHUNK
+    # --- Pattern params: each one is a string holding a pattern. ---
+    emailDateTimeTzPattern = Param(Params._dummy(),
+                                   "emailDateTimeTzPattern",
+                                   "Specifies the date-time pattern for email timestamps, including time zone formatting.",
+                                   typeConverter=TypeConverters.toString)
+    emailAddress = Param(
+        Params._dummy(),
+        "emailAddress",
+        "Specifies the pattern for email addresses.",
+        typeConverter=TypeConverters.toString
+    )
+    ipAddressPattern = Param(
+        Params._dummy(),
+        "ipAddressPattern",
+        "Specifies the pattern for IP addresses.",
+        typeConverter=TypeConverters.toString
+    )
+    ipAddressNamePattern = Param(
+        Params._dummy(),
+        "ipAddressNamePattern",
+        "Specifies the pattern for IP addresses with names.",
+        typeConverter=TypeConverters.toString
+    )
+    mapiIdPattern = Param(
+        Params._dummy(),
+        "mapiIdPattern",
+        "Specifies the pattern for MAPI IDs.",
+        typeConverter=TypeConverters.toString
+    )
+    usPhoneNumbersPattern = Param(
+        Params._dummy(),
+        "usPhoneNumbersPattern",
+        "Specifies the pattern for US phone numbers.",
+        typeConverter=TypeConverters.toString
+    )
+    imageUrlPattern = Param(
+        Params._dummy(),
+        "imageUrlPattern",
+        "Specifies the pattern for image URLs.",
+        typeConverter=TypeConverters.toString
+    )
+    textPattern = Param(
+        Params._dummy(),
+        "textPattern",
+        "Specifies the pattern for text after and before.",
+        typeConverter=TypeConverters.toString
+    )
+    # --- Mode selection and the index used by the text_after/text_before modes. ---
+    extractorMode = Param(
+        Params._dummy(),
+        "extractorMode",
+        "possible values: " +
+        "email_date, email_address, ip_address, ip_address_name, mapi_id, us_phone_numbers, image_urls, bullets, text_after, text_before",
+        typeConverter=TypeConverters.toString
+    )
+    index = Param(
+        Params._dummy(),
+        "index",
+        "Specifies the index of the pattern to extract in text after or before",
+        typeConverter=TypeConverters.toInt
+    )
+    # --- Setters: each stores its value via ``_set`` and returns the result. ---
+    def setEmailDateTimeTzPattern(self, value):
+        """Sets the date-time pattern for email timestamps, including time zone formatting.
+        Parameters
+        ----------
+        value : str
+            Specifies the date-time pattern for email timestamps, including time zone formatting.
+        """
+        return self._set(emailDateTimeTzPattern=value)
+    def setEmailAddress(self, value):
+        """Sets the pattern for email addresses.
+        Parameters
+        ----------
+        value : str
+            Specifies the pattern for email addresses.
+        """
+        return self._set(emailAddress=value)
+    def setIpAddressPattern(self, value):
+        """Sets the pattern for IP addresses.
+        Parameters
+        ----------
+        value : str
+            Specifies the pattern for IP addresses.
+        """
+        return self._set(ipAddressPattern=value)
+    def setIpAddressNamePattern(self, value):
+        """Sets the pattern for IP addresses with names.
+        Parameters
+        ----------
+        value : str
+            Specifies the pattern for IP addresses with names.
+        """
+        return self._set(ipAddressNamePattern=value)
+    def setMapiIdPattern(self, value):
+        """Sets the pattern for MAPI IDs.
+        Parameters
+        ----------
+        value : str
+            Specifies the pattern for MAPI IDs.
+        """
+        return self._set(mapiIdPattern=value)
+    def setUsPhoneNumbersPattern(self, value):
+        """Sets the pattern for US phone numbers.
+        Parameters
+        ----------
+        value : str
+            Specifies the pattern for US phone numbers.
+        """
+        return self._set(usPhoneNumbersPattern=value)
+    def setImageUrlPattern(self, value):
+        """Sets the pattern for image URLs.
+        Parameters
+        ----------
+        value : str
+            Specifies the pattern for image URLs.
+        """
+        return self._set(imageUrlPattern=value)
+    def setTextPattern(self, value):
+        """Sets the pattern for text after and before.
+        Parameters
+        ----------
+        value : str
+            Specifies the pattern for text after and before.
+        """
+        return self._set(textPattern=value)
+    def setExtractorMode(self, value):
+        """Sets the extractor mode.
+        Parameters
+        ----------
+        value : str
+            One of the values listed in the ``extractorMode`` param
+            description: email_date, email_address, ip_address,
+            ip_address_name, mapi_id, us_phone_numbers, image_urls, bullets,
+            text_after, text_before. The value is not checked against that
+            list in this method.
+        """
+        return self._set(extractorMode=value)
+    def setIndex(self, value):
+        """Sets the index of the pattern to extract in text after or before.
+        Parameters
+        ----------
+        value : int
+            Specifies the index of the pattern to extract in text after or before.
+        """
+        return self._set(index=value)
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cleaners.Extractor", java_model=None):
+        """Creates the wrapper; both arguments are forwarded unchanged to the parent constructor.
+        Parameters
+        ----------
+        classname : str, optional
+            Fully qualified name of the backing JVM class
+        java_model : optional
+            Passed through to ``AnnotatorModel.__init__``, by default None
+        """
+        super(Extractor, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )

sparknlp/annotator/coref/spanbert_coref.py CHANGED Viewed

@@ -20,7 +20,8 @@ class SpanBertCorefModel(AnnotatorModel,
                          HasEmbeddingsProperties,
                          HasCaseSensitiveProperties,
                          HasStorageRef,
-                         HasEngine):
+                         HasEngine,
+                         HasMaxSentenceLengthLimit):
     """
     A coreference resolution model based on SpanBert.
@@ -38,10 +39,10 @@ class SpanBertCorefModel(AnnotatorModel,
     The default model is ``"spanbert_base_coref"``, if no name is provided. For available
     pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?q=coref>`__.
+    <https://sparknlp.org/models?q=coref>`__.
     For extended examples of usage, see the
-    `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/coreference-resolution/Coreference_Resolution_SpanBertCorefModel.ipynb>`__.
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/coreference-resolution/Coreference_Resolution_SpanBertCorefModel.ipynb>`__.
     ====================== ======================
     Input Annotation types Output Annotation type
@@ -114,11 +115,6 @@ class SpanBertCorefModel(AnnotatorModel,
     outputAnnotatorType = AnnotatorType.DEPENDENCY
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
     maxSegmentLength = Param(Params._dummy(),
                              "maxSegmentLength",
                              "Max segment length",
@@ -144,16 +140,6 @@ class SpanBertCorefModel(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process.
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
     def setMaxSegmentLength(self, value):
         """Sets max segment length

sparknlp/annotator/cv/__init__.py CHANGED Viewed

@@ -12,3 +12,18 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 from sparknlp.annotator.cv.vit_for_image_classification import *
+from sparknlp.annotator.cv.swin_for_image_classification import *
+from sparknlp.annotator.cv.convnext_for_image_classification import *
+from sparknlp.annotator.cv.vision_encoder_decoder_for_image_captioning import *
+from sparknlp.annotator.cv.clip_for_zero_shot_classification import *
+from sparknlp.annotator.cv.blip_for_question_answering import *
+from sparknlp.annotator.cv.janus_for_multimodal import *
+from sparknlp.annotator.cv.mllama_for_multimodal import *
+from sparknlp.annotator.cv.qwen2vl_transformer import *
+from sparknlp.annotator.cv.llava_for_multimodal import *
+from sparknlp.annotator.cv.phi3_vision_for_multimodal import *
+from sparknlp.annotator.cv.smolvlm_transformer import *
+from sparknlp.annotator.cv.paligemma_for_multimodal import *
+from sparknlp.annotator.cv.gemma3_for_multimodal import *
+from sparknlp.annotator.cv.internvl_for_multimodal import *
+from sparknlp.annotator.cv.florence2_transformer import *

sparknlp/annotator/cv/blip_for_question_answering.py ADDED Viewed

@@ -0,0 +1,172 @@
+#  Copyright 2017-2024 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+from sparknlp.common import *
+class BLIPForQuestionAnswering(AnnotatorModel,
+                               HasBatchedAnnotateImage,
+                               HasImageFeatureProperties,
+                               HasEngine,
+                               HasCandidateLabelsProperties,
+                               HasRescaleFactor):
+    """BLIPForQuestionAnswering can load BLIP models for visual question answering.
+    The model consists of a vision encoder, a text encoder as well as a text decoder.
+    The vision encoder will encode the input image, the text encoder will encode the input question together
+    with the encoding of the image, and the text decoder will output the answer to the question.
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+    >>> visualQAClassifier = BLIPForQuestionAnswering.pretrained() \\
+    ...     .setInputCols(["image_assembler"]) \\
+    ...     .setOutputCol("answer")
+    The default model is ``"blip_vqa_base"``, if no name is
+    provided.
+    For available pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?task=Question+Answering>`__.
+    To see which models are compatible and how to import them see
+    `Import Transformers into Spark NLP 🚀
+    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``IMAGE``              ``DOCUMENT``
+    ====================== ======================
+    Parameters
+    ----------
+    batchSize
+        Batch size. Larger values allow faster processing but require more
+        memory, by default 2
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+    maxSentenceLength
+        Max sentence length to process, by default 50
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> from pyspark.sql.functions import lit
+    >>> image_df = spark.read.format("image").load(path=images_path)
+    >>> test_df = image_df.withColumn("text", lit("What's this picture about?"))
+    >>> imageAssembler = ImageAssembler() \\
+    ...     .setInputCol("image") \\
+    ...     .setOutputCol("image_assembler")
+    >>> visualQAClassifier = BLIPForQuestionAnswering.pretrained() \\
+    ...     .setInputCols("image_assembler") \\
+    ...     .setOutputCol("answer") \\
+    ...     .setSize(384)
+    >>> pipeline = Pipeline().setStages([
+    ...     imageAssembler,
+    ...     visualQAClassifier
+    ... ])
+    >>> result = pipeline.fit(test_df).transform(test_df)
+    >>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
+    +--------------------------------------+------+
+    |origin                                |result|
+    +--------------------------------------+------+
+    |[file:///content/images/cat_image.jpg]|[cats]|
+    +--------------------------------------+------+
+    """
+    name = "BLIPForQuestionAnswering"
+    inputAnnotatorTypes = [AnnotatorType.IMAGE]
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with "
+                             "config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+    maxSentenceLength = Param(Params._dummy(),
+                              "maxSentenceLength",
+                              "Maximum sentence length that the annotator will process. Above this, the sentence is skipped",
+                              typeConverter=TypeConverters.toInt)
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+    def setMaxSentenceLength(self, value):
+        """Sets maximum sentence length that the annotator will process, by
+        default 50.
+        Parameters
+        ----------
+        value : int
+            Maximum sentence length that the annotator will process
+        """
+        return self._set(maxSentenceLength=value)
+    def setMaxSentenceSize(self, value):
+        """Sets maximum sentence length that the annotator will process, by
+        default 50.
+        Alias of :meth:`setMaxSentenceLength`, kept so that existing callers
+        of the original setter name keep working.
+        Parameters
+        ----------
+        value : int
+            Maximum sentence length that the annotator will process
+        """
+        return self.setMaxSentenceLength(value)
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestionAnswering",
+                 java_model=None):
+        """Creates the wrapper and registers the Python-side defaults.
+        Parameters
+        ----------
+        classname : str, optional
+            Fully qualified name of the backing JVM class
+        java_model : optional
+            Passed through to ``AnnotatorModel.__init__``, by default None
+        """
+        super(BLIPForQuestionAnswering, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=2,
+            size=384,
+            maxSentenceLength=50
+        )
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+        Returns
+        -------
+        BLIPForQuestionAnswering
+            The restored model
+        """
+        # Imported locally, as in the original, rather than at module import time.
+        from sparknlp.internal import _BLIPForQuestionAnswering
+        jModel = _BLIPForQuestionAnswering(folder, spark_session._jsparkSession)._java_obj
+        return BLIPForQuestionAnswering(java_model=jModel)
+    @staticmethod
+    def pretrained(name="blip_vqa_base", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default
+            "blip_vqa_base"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+        Returns
+        -------
+        BLIPForQuestionAnswering
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(BLIPForQuestionAnswering, name, lang, remote_loc)

sparknlp/annotator/cv/clip_for_zero_shot_classification.py ADDED Viewed

@@ -0,0 +1,193 @@
+#  Copyright 2017-2022 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""Contains classes concerning CLIPForZeroShotClassification."""
+from sparknlp.common import *
+class CLIPForZeroShotClassification(AnnotatorModel,
+                                    HasBatchedAnnotateImage,
+                                    HasImageFeatureProperties,
+                                    HasEngine,
+                                    HasCandidateLabelsProperties,
+                                    HasRescaleFactor):
+    """Zero Shot Image Classifier based on CLIP.
+    CLIP (Contrastive Language-Image Pre-Training) is a neural network that was trained on image
+    and text pairs. It has the ability to predict images without training on any hard-coded
+    labels. This makes it very flexible, as labels can be provided during inference. This is
+    similar to the zero-shot capabilities of the GPT-2 and 3 models.
+    Pretrained models can be loaded with ``pretrained`` of the companion object:
+    .. code-block:: python
+        imageClassifier = CLIPForZeroShotClassification.pretrained() \\
+            .setInputCols(["image_assembler"]) \\
+            .setOutputCol("label")
+    The default model is ``"zero_shot_classifier_clip_vit_base_patch32"``, if no name is provided.
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?task=Zero-Shot+Classification>`__.
+    Models from the HuggingFace 🤗 Transformers library are also compatible with Spark NLP 🚀. To
+    see which models are compatible and how to import them see
+    https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and to see more extended
+    examples, see
+    `CLIPForZeroShotClassificationTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/CLIPForZeroShotClassificationTestSpec.scala>`__.
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``IMAGE``              ``CATEGORY``
+    ====================== ======================
+    Parameters
+    ----------
+    batchSize
+        Batch size, by default `2`.
+    candidateLabels
+        Array of labels for classification
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> imageDF = spark.read \\
+    ...     .format("image") \\
+    ...     .option("dropInvalid", value = True) \\
+    ...     .load("src/test/resources/image/")
+    >>> imageAssembler = ImageAssembler() \\
+    ...     .setInputCol("image") \\
+    ...     .setOutputCol("image_assembler")
+    >>> candidateLabels = [
+    ...     "a photo of a bird",
+    ...     "a photo of a cat",
+    ...     "a photo of a dog",
+    ...     "a photo of a hen",
+    ...     "a photo of a hippo",
+    ...     "a photo of a room",
+    ...     "a photo of a tractor",
+    ...     "a photo of an ostrich",
+    ...     "a photo of an ox"]
+    >>> imageClassifier = CLIPForZeroShotClassification \\
+    ...     .pretrained() \\
+    ...     .setInputCols(["image_assembler"]) \\
+    ...     .setOutputCol("label") \\
+    ...     .setCandidateLabels(candidateLabels)
+    >>> pipeline = Pipeline().setStages([imageAssembler, imageClassifier])
+    >>> pipelineDF = pipeline.fit(imageDF).transform(imageDF)
+    >>> pipelineDF \\
+    ...   .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "label.result") \\
+    ...   .show(truncate=False)
+    +-----------------+-----------------------+
+    |image_name       |result                 |
+    +-----------------+-----------------------+
+    |palace.JPEG      |[a photo of a room]    |
+    |egyptian_cat.jpeg|[a photo of a cat]     |
+    |hippopotamus.JPEG|[a photo of a hippo]   |
+    |hen.JPEG         |[a photo of a hen]     |
+    |ostrich.JPEG     |[a photo of an ostrich]|
+    |junco.JPEG       |[a photo of a bird]    |
+    |bluetick.jpg     |[a photo of a dog]     |
+    |chihuahua.jpg    |[a photo of a dog]     |
+    |tractor.JPEG     |[a photo of a tractor] |
+    |ox.JPEG          |[a photo of an ox]     |
+    +-----------------+-----------------------+
+    """
+    name = "CLIPForZeroShotClassification"
+    inputAnnotatorTypes = [AnnotatorType.IMAGE]
+    outputAnnotatorType = AnnotatorType.CATEGORY
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with "
+                             "config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+    def getCandidateLabels(self):
+        """
+        Returns labels used to train this model
+        """
+        return self._call_java("getCandidateLabels")
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.CLIPForZeroShotClassification",
+                 java_model=None):
+        super(CLIPForZeroShotClassification, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=2,
+            doNormalize=True,
+            doRescale=True,
+            doResize=True,
+            imageMean=[0.48145466, 0.4578275, 0.40821073],
+            imageStd=[0.26862954, 0.26130258, 0.27577711],
+            resample=2,
+            rescaleFactor=1 / 255.0,
+            size=224
+        )
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+        Returns
+        -------
+        CLIPForZeroShotClassification
+            The restored model
+        """
+        from sparknlp.internal import _CLIPForZeroShotClassification
+        jModel = _CLIPForZeroShotClassification(folder, spark_session._jsparkSession)._java_obj
+        return CLIPForZeroShotClassification(java_model=jModel)
+    @staticmethod
+    def pretrained(name="zero_shot_classifier_clip_vit_base_patch32", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default
+            "image_classifier_vit_base_patch16_224"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+        Returns
+        -------
+        CLIPForZeroShotClassification
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(CLIPForZeroShotClassification, name, lang, remote_loc)

spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

spark-nlp 4.2.6py2.py3-none-any.whl → 6.2.1py2.py3-none-any.whl