spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
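
One move worth noting in the list above: entry 13 relocates chunk2_doc.py from sparknlp/base to sparknlp/annotator. A minimal, version-tolerant import sketch (assuming the Chunk2Doc class name itself is unchanged, which the small +4/-7 line delta suggests but does not prove):

# Hedged sketch: Chunk2Doc moved from sparknlp.base to sparknlp.annotator
# (entry 13 above). Old imports can be kept working with a fallback.
try:
    from sparknlp.annotator import Chunk2Doc   # layout in 6.x
except ImportError:
    from sparknlp.base import Chunk2Doc        # layout in 4.2.x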
sparknlp/annotator/embeddings/mpnet_embeddings.py
@@ -0,0 +1,192 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for MPNetEmbeddings."""
+
+from sparknlp.common import *
+
+
+class MPNetEmbeddings(AnnotatorModel,
+                      HasEmbeddingsProperties,
+                      HasCaseSensitiveProperties,
+                      HasStorageRef,
+                      HasBatchedAnnotate,
+                      HasMaxSentenceLengthLimit):
+    """Sentence embeddings using MPNet.
+
+    MPNet adopts a novel pre-training method, named masked and permuted language modeling,
+    to inherit the advantages of masked language modeling and permuted language modeling for
+    natural language understanding.
+
+    Note that this annotator is only supported for Spark versions 3.4 and up.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = MPNetEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("mpnet_embeddings")
+
+    The default model is ``"all_mpnet_base_v2"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=MPNet>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+
+    References
+    ----------
+    `MPNet: Masked and Permuted Pre-training for Language Understanding <https://arxiv.org/pdf/2004.09297>`__
+
+    https://github.com/microsoft/MPNet
+
+    **Paper abstract**
+
+    *BERT adopts masked language modeling (MLM) for pre-training and is one of the most successful pre-training models.
+    Since BERT neglects dependency among predicted tokens, XLNet introduces permuted language modeling (PLM) for
+    pre-training to address this problem. However, XLNet does not leverage the full position information of a sentence
+    and thus suffers from position discrepancy between pre-training and fine-tuning. In this paper, we propose MPNet,
+    a novel pre-training method that inherits the advantages of BERT and XLNet and avoids their limitations. MPNet
+    leverages the dependency among predicted tokens through permuted language modeling (vs. MLM in BERT), and takes
+    auxiliary position information as input to make the model see a full sentence and thus reducing the position
+    discrepancy (vs. PLM in XLNet). We pre-train MPNet on a large-scale dataset (over 160GB text corpora) and fine-tune
+    on a variety of down-streaming tasks (GLUE, SQuAD, etc). Experimental results show that MPNet outperforms MLM and
+    PLM by a large margin, and achieves better results on these tasks compared with previous state-of-the-art
+    pre-trained methods (e.g., BERT, XLNet, RoBERTa) under the same model setting.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = MPNetEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("mpnet_embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["mpnet_embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["This is an example sentence"], ["Each sentence is converted"]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[[0.022502584, -0.078291744, -0.023030775, -0.0051000593, -0.080340415, 0.039...|
+    |[[0.041702367, 0.0010974605, -0.015534201, 0.07092203, -0.0017729357, 0.04661...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "MPNetEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.MPNetEmbeddings", java_model=None):
+        super(MPNetEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            dimension=768,
+            batchSize=8,
+            maxSentenceLength=512,
+            caseSensitive=False,
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        MPNetEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _MPNetLoader
+        jModel = _MPNetLoader(folder, spark_session._jsparkSession)._java_obj
+        return MPNetEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="all_mpnet_base_v2", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "all_mpnet_base_v2"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        MPNetEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(MPNetEmbeddings, name, lang, remote_loc)
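
Besides pretrained(), the new MPNetEmbeddings exposes loadSavedModel for locally exported models. A minimal sketch of that path, with hypothetical folder names, assuming the folder holds a model exported in the format Spark NLP expects:

import sparknlp
from sparknlp.annotator import MPNetEmbeddings

spark = sparknlp.start()

# Load a locally exported model instead of downloading a pretrained one
# ("/models/mpnet_export" is a hypothetical path).
embeddings = MPNetEmbeddings.loadSavedModel("/models/mpnet_export", spark) \
    .setInputCols(["document"]) \
    .setOutputCol("mpnet_embeddings")

# Persist it as a regular Spark NLP model for later reuse.
embeddings.write().overwrite().save("/models/mpnet_spark_nlp")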
sparknlp/annotator/embeddings/mxbai_embeddings.py
@@ -0,0 +1,184 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for MxbaiEmbeddings."""
+
+from sparknlp.common import *
+
+
+class MxbaiEmbeddings(AnnotatorModel,
+                      HasEmbeddingsProperties,
+                      HasCaseSensitiveProperties,
+                      HasStorageRef,
+                      HasBatchedAnnotate,
+                      HasMaxSentenceLengthLimit):
+    """Sentence embeddings using Mxbai Embeddings.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = MxbaiEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("Mxbai_embeddings")
+
+    The default model is ``"mxbai_large_v1"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=Mxbai>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 1024
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    poolingStrategy
+        Pooling strategy to use for sentence embeddings, by default "cls"
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = MxbaiEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols("embeddings") \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["hello world"], ["hello moon"]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[0.50387806, 0.5861606, 0.35129607, -0.76046336, -0.32446072, -0.117674336, 0...|
+    |[0.6660665, 0.961762, 0.24854276, -0.1018044, -0.6569202, 0.027635604, 0.1915...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "MxbaiEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    poolingStrategy = Param(Params._dummy(),
+                            "poolingStrategy",
+                            "Pooling strategy to use for sentence embeddings",
+                            TypeConverters.toString)
+
+    def setPoolingStrategy(self, value):
+        """Sets the pooling strategy to use for sentence embeddings.
+
+        Available pooling strategies for sentence embeddings are:
+
+        - ``"cls"``: leading ``[CLS]`` token
+        - ``"cls_avg"``: leading ``[CLS]`` token + mean of all other tokens
+        - ``"last"``: embeddings of the last token in the sequence
+        - ``"avg"``: mean of all tokens
+        - ``"max"``: max of all embedding features of the entire token sequence
+        - an integer number (passed as a string), which represents the index of
+          the token to use as the embedding
+
+        Parameters
+        ----------
+        value : str
+            Pooling strategy to use for sentence embeddings
+        """
+        valid_strategies = {"cls", "cls_avg", "last", "avg", "max"}
+        if value in valid_strategies or value.isdigit():
+            return self._set(poolingStrategy=value)
+        else:
+            raise ValueError(f"Invalid pooling strategy: {value}. "
+                             f"Valid strategies are: {', '.join(valid_strategies)} or an integer.")
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.MxbaiEmbeddings", java_model=None):
+        super(MxbaiEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            dimension=1024,
+            batchSize=8,
+            maxSentenceLength=512,
+            caseSensitive=False,
+            poolingStrategy="cls"
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        MxbaiEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _MxbaiEmbeddingsLoader
+        jModel = _MxbaiEmbeddingsLoader(folder, spark_session._jsparkSession)._java_obj
+        return MxbaiEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="mxbai_large_v1", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "mxbai_large_v1"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        MxbaiEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(MxbaiEmbeddings, name, lang, remote_loc)
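
MxbaiEmbeddings adds a configurable pooling strategy. A short sketch of how the validation in setPoolingStrategy above behaves: the accepted values are the five named strategies or a digit string giving a token index.

from sparknlp.annotator import MxbaiEmbeddings

emb = MxbaiEmbeddings.pretrained() \
    .setInputCols(["document"]) \
    .setOutputCol("embeddings")

emb.setPoolingStrategy("avg")    # mean over all token embeddings
emb.setPoolingStrategy("0")      # digit string: pool from the token at index 0
# emb.setPoolingStrategy("mean") # raises ValueError: not a valid strategy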
sparknlp/annotator/embeddings/nomic_embeddings.py
@@ -0,0 +1,181 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for NomicEmbeddings."""
+
+from sparknlp.common import *
+
+
+class NomicEmbeddings(AnnotatorModel,
+                      HasEmbeddingsProperties,
+                      HasCaseSensitiveProperties,
+                      HasStorageRef,
+                      HasBatchedAnnotate,
+                      HasMaxSentenceLengthLimit):
+    """Sentence embeddings using NomicEmbeddings.
+
+    nomic-embed-text-v1 is an 8192-context-length text encoder that surpasses the
+    performance of OpenAI text-embedding-ada-002 and text-embedding-3-small on
+    short- and long-context tasks.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = NomicEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("nomic_embeddings")
+
+    The default model is ``"nomic_embed_v1"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=Nomic>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+
+    References
+    ----------
+    `Nomic Embed: Training a Reproducible Long Context Text Embedder <https://arxiv.org/pdf/2402.01613>`__
+
+    https://github.com/nomic-ai/contrastors
+
+    **Paper abstract**
+
+    *This technical report describes the training of nomic-embed-text-v1, the
+    first fully reproducible, open-source, open-weights, open-data, 8192 context
+    length English text embedding model that outperforms both OpenAI Ada-002 and
+    OpenAI text-embedding-3-small on short and long-context tasks. We release
+    the training code and model weights under an Apache 2 license. In contrast
+    with other open-source models, we release a training data loader with 235
+    million curated text pairs that allows for the full replication of
+    nomic-embed-text-v1. You can find code and data to replicate the model at
+    https://github.com/nomic-ai/contrastors.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = NomicEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("nomic_embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["nomic_embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([
+    ...     ["query: how much protein should a female eat"],
+    ...     ["passage: As a general guideline, the CDC's average requirement of protein "
+    ...      "for women ages 19 to 70 is 46 grams per day. But, as you can see from this "
+    ...      "chart, you'll need to increase that if you're expecting or training for a "
+    ...      "marathon. Check out the chart below to see how much protein you should be "
+    ...      "eating each day."],
+    ... ]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...|
+    |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "NomicEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.NomicEmbeddings", java_model=None):
+        super(NomicEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            dimension=768,
+            batchSize=8,
+            maxSentenceLength=512,
+            caseSensitive=False,
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session, use_openvino=False):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+        use_openvino : bool, optional
+            Whether to use the OpenVINO backend, by default False
+
+        Returns
+        -------
+        NomicEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _NomicLoader
+        jModel = _NomicLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+        return NomicEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="nomic_embed_v1", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "nomic_embed_v1"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        NomicEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(NomicEmbeddings, name, lang, remote_loc)
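
NomicEmbeddings.loadSavedModel carries the new use_openvino flag, and the RoBerta hunks below add the same parameter. A minimal sketch with a hypothetical export path:

import sparknlp
from sparknlp.annotator import NomicEmbeddings

spark = sparknlp.start()

# use_openvino=True requests the OpenVINO backend for inference;
# "/models/nomic_export" is a hypothetical path to a locally exported model.
nomic = NomicEmbeddings.loadSavedModel("/models/nomic_export", spark, use_openvino=True) \
    .setInputCols(["document"]) \
    .setOutputCol("nomic_embeddings")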
sparknlp/annotator/embeddings/roberta_embeddings.py
@@ -21,7 +21,8 @@ class RoBertaEmbeddings(AnnotatorModel,
                         HasCaseSensitiveProperties,
                         HasStorageRef,
                         HasBatchedAnnotate,
-                        HasEngine):
+                        HasEngine,
+                        HasMaxSentenceLengthLimit):
     """Creates word embeddings using RoBERTa.
 
     The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT
@@ -42,10 +43,10 @@ class RoBertaEmbeddings(AnnotatorModel,
 
     The default model is ``"roberta_base"``, if no name is provided. For
     available pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?task=Embeddings>`__.
+    <https://sparknlp.org/models?task=Embeddings>`__.
 
-    For extended examples of usage, see the `Spark NLP Workshop
-    <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBERTa.ipynb>`__.
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBERTa.ipynb>`__.
     To see which models are compatible and how to import them see
     `Import Transformers into Spark NLP 🚀
     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
@@ -151,11 +152,6 @@ class RoBertaEmbeddings(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -171,16 +167,6 @@ class RoBertaEmbeddings(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.RoBertaEmbeddings", java_model=None):
         super(RoBertaEmbeddings, self).__init__(
@@ -195,7 +181,7 @@ class RoBertaEmbeddings(AnnotatorModel,
         )
 
     @staticmethod
-    def loadSavedModel(folder, spark_session):
+    def loadSavedModel(folder, spark_session, use_openvino=False):
         """Loads a locally saved model.
 
         Parameters
@@ -204,6 +190,8 @@ class RoBertaEmbeddings(AnnotatorModel,
             Folder of the saved model
         spark_session : pyspark.sql.SparkSession
            The current SparkSession
+        use_openvino : bool
+            Use OpenVINO backend
 
         Returns
         -------
@@ -211,7 +199,7 @@ class RoBertaEmbeddings(AnnotatorModel,
            The restored model
        """
        from sparknlp.internal import _RoBertaLoader
-       jModel = _RoBertaLoader(folder, spark_session._jsparkSession)._java_obj
+       jModel = _RoBertaLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
        return RoBertaEmbeddings(java_model=jModel)
 
     @staticmethod
sparknlp/annotator/embeddings/roberta_sentence_embeddings.py
@@ -17,11 +17,12 @@ from sparknlp.common import *
 
 
 class RoBertaSentenceEmbeddings(AnnotatorModel,
-                                 HasEmbeddingsProperties,
-                                 HasCaseSensitiveProperties,
-                                 HasStorageRef,
-                                 HasBatchedAnnotate,
-                                 HasEngine):
+                                HasEmbeddingsProperties,
+                                HasCaseSensitiveProperties,
+                                HasStorageRef,
+                                HasBatchedAnnotate,
+                                HasEngine,
+                                HasMaxSentenceLengthLimit):
     """Sentence-level embeddings using RoBERTa. The RoBERTa model was proposed in RoBERTa: A Robustly Optimized BERT
     Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy,
     Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. It builds on
@@ -39,7 +40,7 @@ class RoBertaSentenceEmbeddings(AnnotatorModel,
     The default model is ``"sent_roberta_base"``, if no name is provided.
 
     For available pretrained models please see the
-    `Models Hub <https://nlp.johnsnowlabs.com/models?task=Embeddings>`__.
+    `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
 
     ====================== =======================
     Input Annotation types Output Annotation type
@@ -119,11 +120,6 @@ class RoBertaSentenceEmbeddings(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -139,16 +135,6 @@ class RoBertaSentenceEmbeddings(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.RoBertaSentenceEmbeddings", java_model=None):
         super(RoBertaSentenceEmbeddings, self).__init__(
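
The net effect of the two RoBerta hunks above: the hand-rolled maxSentenceLength Param and setter are replaced by the shared HasMaxSentenceLengthLimit mixin, so caller code should be unchanged. A sketch under that assumption (the mixin is expected to supply the same setMaxSentenceLength signature, now with an upper-bound check):

from sparknlp.annotator import RoBertaEmbeddings

roberta = RoBertaEmbeddings.pretrained() \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings") \
    .setMaxSentenceLength(512)  # same call as in 4.2.6, now provided by the mixin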