spark-nlp 4.2.6 (py2.py3-none-any.whl) → 6.2.1 (py2.py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
sparknlp/annotator/embeddings/bert_sentence_embeddings.py

@@ -17,11 +17,12 @@ from sparknlp.common import *
 
 
 class BertSentenceEmbeddings(AnnotatorModel,
-                             HasEmbeddingsProperties,
-                             HasCaseSensitiveProperties,
-                             HasStorageRef,
-                             HasBatchedAnnotate,
-                             HasEngine):
+                             HasEmbeddingsProperties,
+                             HasCaseSensitiveProperties,
+                             HasStorageRef,
+                             HasBatchedAnnotate,
+                             HasEngine,
+                             HasMaxSentenceLengthLimit):
     """Sentence-level embeddings using BERT. BERT (Bidirectional Encoder
     Representations from Transformers) provides dense vector representations for
     natural language by using a deep, pre-trained neural network with the
@@ -38,10 +39,10 @@ class BertSentenceEmbeddings(AnnotatorModel,
     The default model is ``"sent_small_bert_L2_768"``, if no name is provided.
 
     For available pretrained models please see the
-    `Models Hub <https://nlp.johnsnowlabs.com/models?task=Embeddings>`__.
+    `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
 
     For extended examples of usage, see the
-    `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BERT%20Sentence.ipynb>`__.
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BERT%20Sentence.ipynb>`__.
 
     ====================== =======================
     Input Annotation types Output Annotation type
@@ -133,11 +134,6 @@ class BertSentenceEmbeddings(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     isLong = Param(Params._dummy(),
                    "isLong",
                    "Use Long type instead of Int type for inputs buffer - Some Bert models require Long instead of Int.",
@@ -158,16 +154,6 @@ class BertSentenceEmbeddings(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     def setIsLong(self, value):
         """Sets whether to use Long type instead of Int type for inputs buffer.
 
@@ -194,7 +180,7 @@ class BertSentenceEmbeddings(AnnotatorModel,
         )
 
     @staticmethod
-    def loadSavedModel(folder, spark_session):
+    def loadSavedModel(folder, spark_session, use_openvino=False):
         """Loads a locally saved model.
 
         Parameters
@@ -203,6 +189,8 @@ class BertSentenceEmbeddings(AnnotatorModel,
             Folder of the saved model
         spark_session : pyspark.sql.SparkSession
             The current SparkSession
+        use_openvino: bool
+            Use OpenVINO backend
 
         Returns
         -------
@@ -210,7 +198,7 @@ class BertSentenceEmbeddings(AnnotatorModel,
             The restored model
         """
         from sparknlp.internal import _BertSentenceLoader
-        jModel = _BertSentenceLoader(folder, spark_session._jsparkSession)._java_obj
+        jModel = _BertSentenceLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
         return BertSentenceEmbeddings(java_model=jModel)
 
     @staticmethod
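The new use_openvino flag is threaded through to the Java-side loader, so a locally exported model can be run on the OpenVINO backend instead of the default one. A minimal sketch of the changed entry point, assuming a model has already been exported to a local folder (the path below is hypothetical):

>>> import sparknlp
>>> from sparknlp.annotator import BertSentenceEmbeddings
>>> spark = sparknlp.start()
>>> embeddings = BertSentenceEmbeddings.loadSavedModel(
...     "/tmp/exported_bert_sentence_model",  # hypothetical export folder
...     spark,
...     use_openvino=True) \
...     .setInputCols(["sentence"]) \
...     .setOutputCol("sentence_embeddings")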
sparknlp/annotator/embeddings/bge_embeddings.py (new file)

@@ -0,0 +1,199 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for BGEEmbeddings."""
+
+from sparknlp.common import *
+
+
+class BGEEmbeddings(AnnotatorModel,
+                    HasEmbeddingsProperties,
+                    HasCaseSensitiveProperties,
+                    HasStorageRef,
+                    HasBatchedAnnotate,
+                    HasMaxSentenceLengthLimit,
+                    HasClsTokenProperties):
+    """Sentence embeddings using BGE.
+
+    BGE, or BAAI General Embeddings, is a model that can map any text to a low-dimensional dense
+    vector which can be used for tasks like retrieval, classification, clustering, or semantic search.
+
+    Note that this annotator is only supported for Spark Versions 3.4 and up.
+
+    Pretrained models can be loaded with `pretrained` of the companion object:
+
+    >>> embeddings = BGEEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("bge_embeddings")
+
+
+    The default model is ``"bge_base"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=BGE>`__.
+
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+
+    **References**
+
+    `C-Pack: Packaged Resources To Advance General Chinese Embedding <https://arxiv.org/pdf/2309.07597>`__
+    `BGE Github Repository <https://github.com/FlagOpen/FlagEmbedding>`__
+
+    **Paper abstract**
+
+    *We introduce C-Pack, a package of resources that significantly advance the field of general
+    Chinese embeddings. C-Pack includes three critical resources.
+    1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.
+    2) C-MTP is a massive text embedding dataset curated from labeled and unlabeled Chinese corpora
+    for training embedding models.
+    3) C-TEM is a family of embedding models covering multiple sizes.
+    Our models outperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the
+    time of the release. We also integrate and optimize the entire suite of training methods for
+    C-TEM. Along with our resources on general Chinese embedding, we release our data and models for
+    English text embeddings. The English models achieve state-of-the-art performance on the MTEB
+    benchmark; meanwhile, our released English data is 2 times larger than the Chinese data. All
+    these resources are made publicly available at https://github.com/FlagOpen/FlagEmbedding.*
+
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+    useCLSToken
+        Whether to use the CLS token for sentence embeddings, by default True
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = BGEEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("bge_embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["bge_embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["query: how much protein should a female eat",
+    ...     "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + \\
+    ...     "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + \\
+    ...     "marathon. Check out the chart below to see how much protein you should be eating each day.",
+    ... ]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...|
+    |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "BGEEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.BGEEmbeddings", java_model=None):
+        super(BGEEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            dimension=768,
+            batchSize=8,
+            maxSentenceLength=512,
+            caseSensitive=False,
+            useCLSToken=True
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        BGEEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _BGELoader
+        jModel = _BGELoader(folder, spark_session._jsparkSession)._java_obj
+        return BGEEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="bge_small_en_v1.5", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "bge_small_en_v1.5"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        BGEEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(BGEEmbeddings, name, lang, remote_loc)
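Besides the docstring example above, a specific checkpoint can be selected by passing its Models Hub name to pretrained, and the CLS-token pooling contributed by the new HasClsTokenProperties mixin can be toggled explicitly. A short sketch, assuming the standard setter generated for the useCLSToken param:

>>> embeddings = BGEEmbeddings.pretrained("bge_small_en_v1.5", "en") \
...     .setInputCols(["document"]) \
...     .setOutputCol("bge_embeddings") \
...     .setUseCLSToken(True)  # pool sentence vectors from the CLS token (default True)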
sparknlp/annotator/embeddings/camembert_embeddings.py

@@ -21,7 +21,8 @@ class CamemBertEmbeddings(AnnotatorModel,
                           HasCaseSensitiveProperties,
                           HasStorageRef,
                           HasBatchedAnnotate,
-                          HasEngine):
+                          HasEngine,
+                          HasMaxSentenceLengthLimit):
     """The CamemBERT model was proposed in CamemBERT: a Tasty French Language Model by
     Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent
     Romary, Éric Villemonte de la Clergerie, Djamé Seddah, and Benoît Sagot.
@@ -39,10 +40,10 @@ class CamemBertEmbeddings(AnnotatorModel,
     The default model is ``"camembert_base"``, if no name is provided.
 
     For available pretrained models please see the
-    `Models Hub <https://nlp.johnsnowlabs.com/models?task=Embeddings>`__.
+    `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
 
     For extended examples of usage, see the
-    `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/blogposts/3.NER_with_BERT.ipynb>`__
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_bert.ipynb>`__
     and the
     `CamemBertEmbeddingsTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddingsTestSpec.scala>`__.
 
@@ -143,13 +144,6 @@ class CamemBertEmbeddings(AnnotatorModel,
         TypeConverters.toListInt,
     )
 
-    maxSentenceLength = Param(
-        Params._dummy(),
-        "maxSentenceLength",
-        "Max sentence length to process",
-        typeConverter=TypeConverters.toInt,
-    )
-
     def setConfigProtoBytes(self, b):
         """Sets configProto from tensorflow, serialized into byte array.
 
@@ -160,16 +154,6 @@ class CamemBertEmbeddings(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.CamemBertEmbeddings", java_model=None):
         super(CamemBertEmbeddings, self).__init__(
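This refactoring repeats across the embedding annotators below: the copy-pasted maxSentenceLength Param and setMaxSentenceLength setter are deleted in favor of the shared HasMaxSentenceLengthLimit mixin, so existing user code keeps working unchanged. A minimal sketch of the unchanged call site (the mixin presumably also enforces an upper bound on the value, as its name suggests):

>>> embeddings = CamemBertEmbeddings.pretrained() \
...     .setInputCols(["document", "token"]) \
...     .setOutputCol("camembert_embeddings") \
...     .setMaxSentenceLength(512)  # setter now inherited from HasMaxSentenceLengthLimit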
sparknlp/annotator/embeddings/chunk_embeddings.py

@@ -21,7 +21,7 @@ class ChunkEmbeddings(AnnotatorModel):
     chunk embeddings from either Chunker, NGramGenerator, or NerConverter
     outputs.
 
-    For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/3.SparkNLP_Pretrained_Models.ipynb>`__.
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/embeddings/ChunkEmbeddings.ipynb>`__.
 
     ========================== ======================
     Input Annotation types     Output Annotation type
@@ -147,4 +147,3 @@ class ChunkEmbeddings(AnnotatorModel):
             aggregation/pooling.
         """
         return self._set(skipOOV=value)
-
sparknlp/annotator/embeddings/deberta_embeddings.py

@@ -20,7 +20,8 @@ class DeBertaEmbeddings(AnnotatorModel,
                         HasCaseSensitiveProperties,
                         HasStorageRef,
                         HasBatchedAnnotate,
-                        HasEngine):
+                        HasEngine,
+                        HasMaxSentenceLengthLimit):
     """The DeBERTa model was proposed in DeBERTa: Decoding-enhanced BERT with
     Disentangled Attention by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu
     Chen. It is based on Google’s BERT model released in 2018 and Facebook’s
@@ -141,11 +142,6 @@ class DeBertaEmbeddings(AnnotatorModel,
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                              TypeConverters.toListInt)
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     def setConfigProtoBytes(self, b):
         """Sets configProto from tensorflow, serialized into byte array.
 
@@ -156,16 +152,6 @@ class DeBertaEmbeddings(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.DeBertaEmbeddings", java_model=None):
         super(DeBertaEmbeddings, self).__init__(
sparknlp/annotator/embeddings/distil_bert_embeddings.py

@@ -21,7 +21,8 @@ class DistilBertEmbeddings(AnnotatorModel,
                            HasCaseSensitiveProperties,
                            HasStorageRef,
                            HasBatchedAnnotate,
-                           HasEngine):
+                           HasEngine,
+                           HasMaxSentenceLengthLimit):
     """DistilBERT is a small, fast, cheap and light Transformer model trained by
     distilling BERT base. It has 40% less parameters than ``bert-base-uncased``,
     runs 60% faster while preserving over 95% of BERT's performances as measured
@@ -37,10 +38,10 @@ class DistilBertEmbeddings(AnnotatorModel,
 
     The default model is ``"distilbert_base_cased"``, if no name is provided.
     For available pretrained models please see the
-    `Models Hub <https://nlp.johnsnowlabs.com/models?task=Embeddings>`__.
+    `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
 
-    For extended examples of usage, see the `Spark NLP Workshop
-    <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBERT.ipynb>`__.
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBERT.ipynb>`__.
     To see which models are compatible and how to import them see
     `Import Transformers into Spark NLP 🚀
     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
@@ -149,11 +150,6 @@ class DistilBertEmbeddings(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -169,16 +165,6 @@ class DistilBertEmbeddings(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.DistilBertEmbeddings", java_model=None):
         super(DistilBertEmbeddings, self).__init__(
sparknlp/annotator/embeddings/doc2vec.py

@@ -31,7 +31,7 @@ class Doc2VecApproach(AnnotatorApproach, HasStorageRef, HasEnableCachingProperti
 
     For instantiated/pretrained models, see :class:`.Doc2VecModel`.
 
-    For available pretrained models please see the `Models Hub <https://nlp.johnsnowlabs.com/models>`__.
+    For available pretrained models please see the `Models Hub <https://sparknlp.org/models>`__.
 
     ====================== =======================
     Input Annotation types Output Annotation type
@@ -344,3 +344,9 @@ class Doc2VecModel(AnnotatorModel, HasStorageRef, HasEmbeddingsProperties):
         from sparknlp.pretrained import ResourceDownloader
         return ResourceDownloader.downloadModel(Doc2VecModel, name, lang, remote_loc)
 
+    def getVectors(self):
+        """
+        Returns the vector representation of the words as a dataframe
+        with two fields, word and vector.
+        """
+        return self._call_java("getVectors")
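The new getVectors accessor simply forwards to the Java model via _call_java and returns a plain DataFrame. A short usage sketch based on the schema described in the docstring (fields word and vector; the lookup word is just an illustration):

>>> model = Doc2VecModel.pretrained()
>>> vectors = model.getVectors()
>>> vectors.printSchema()  # two fields: word (string) and vector
>>> vectors.filter("word = 'protein'").show(truncate=False)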
sparknlp/annotator/embeddings/e5_embeddings.py (new file)

@@ -0,0 +1,195 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for E5Embeddings."""
+
+from sparknlp.common import *
+
+
+class E5Embeddings(AnnotatorModel,
+                   HasEmbeddingsProperties,
+                   HasCaseSensitiveProperties,
+                   HasStorageRef,
+                   HasBatchedAnnotate,
+                   HasMaxSentenceLengthLimit):
+    """Sentence embeddings using E5.
+
+    E5 is a weakly supervised text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.).
+    Note that this annotator is only supported for Spark Versions 3.4 and up.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = E5Embeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("e5_embeddings")
+
+
+    The default model is ``"e5_small"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=E5>`__.
+
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+
+    References
+    ----------
+    `Text Embeddings by Weakly-Supervised Contrastive Pre-training <https://arxiv.org/pdf/2212.03533>`__
+
+    https://github.com/microsoft/unilm/tree/master/e5
+
+    **Paper abstract**
+
+    *This paper presents E5, a family of state-of-the-art text embeddings that transfer
+    well to a wide range of tasks. The model is trained in a contrastive manner with
+    weak supervision signals from our curated large-scale text pair dataset (called
+    CCPairs). E5 can be readily used as a general-purpose embedding model for any
+    tasks requiring a single-vector representation of texts such as retrieval, clustering,
+    and classification, achieving strong performance in both zero-shot and fine-tuned
+    settings. We conduct extensive evaluations on 56 datasets from the BEIR and
+    MTEB benchmarks. For zero-shot settings, E5 is the first model that outperforms
+    the strong BM25 baseline on the BEIR retrieval benchmark without using any
+    labeled data. When fine-tuned, E5 obtains the best results on the MTEB benchmark,
+    beating existing embedding models with 40× more parameters.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = E5Embeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("e5_embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["e5_embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["query: how much protein should a female eat",
+    ...     "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + \\
+    ...     "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + \\
+    ...     "marathon. Check out the chart below to see how much protein you should be eating each day.",
+    ... ]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...|
+    |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "E5Embeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.E5Embeddings", java_model=None):
+        super(E5Embeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            dimension=768,
+            batchSize=8,
+            maxSentenceLength=512,
+            caseSensitive=False,
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session, use_openvino=False):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+        use_openvino : bool
+            Use OpenVINO backend
+
+        Returns
+        -------
+        E5Embeddings
+            The restored model
+        """
+        from sparknlp.internal import _E5Loader
+        jModel = _E5Loader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+        return E5Embeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="e5_small", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "e5_small"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        E5Embeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(E5Embeddings, name, lang, remote_loc)
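The "query: " and "passage: " prefixes in the example are part of E5's training setup, so keeping them at inference time matters for retrieval quality. Continuing from the docstring example, the finished vectors can be compared directly, e.g. with a cosine similarity computed in numpy (a sketch; assumes the two exploded rows are the query and the passage embeddings):

>>> import numpy as np
>>> vecs = [row.v.toArray() for row in
...         result.selectExpr("explode(finished_embeddings) as v").collect()]
>>> q, p = vecs[0], vecs[1]  # query vector, passage vector
>>> float(np.dot(q, p) / (np.linalg.norm(q) * np.linalg.norm(p)))  # cosine similarity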