spark-nlp 5.5.1__py2.py3-none-any.whl → 5.5.3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of spark-nlp has been flagged as potentially problematic.

@@ -44,7 +44,7 @@ class CPMTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
     ...     .setOutputCol("generation")
 
 
-    The default model is ``"llam2-7b"``, if no name is provided. For available
+    The default model is ``"mini_cpm_2b_8bit"``, if no name is provided. For available
     pretrained models please see the `Models Hub
     <https://sparknlp.org/models?q=cpm>`__.
 
@@ -104,7 +104,7 @@ class CPMTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
     >>> documentAssembler = DocumentAssembler() \\
     ...     .setInputCol("text") \\
     ...     .setOutputCol("documents")
-    >>> cpm = CPMTransformer.pretrained("llama_2_7b_chat_hf_int4") \\
+    >>> cpm = CPMTransformer.pretrained("mini_cpm_2b_8bit","xx") \\
     ...     .setInputCols(["documents"]) \\
     ...     .setMaxOutputLength(50) \\
     ...     .setOutputCol("generation")
@@ -299,15 +299,15 @@ class CPMTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
         return CPMTransformer(java_model=jModel)
 
     @staticmethod
-    def pretrained(name="llama_2_7b_chat_hf_int4", lang="en", remote_loc=None):
+    def pretrained(name="mini_cpm_2b_8bit", lang="xx", remote_loc=None):
         """Downloads and loads a pretrained model.
 
         Parameters
         ----------
         name : str, optional
-            Name of the pretrained model, by default "llama_2_7b_chat_hf_int4"
+            Name of the pretrained model, by default "mini_cpm_2b_8bit"
         lang : str, optional
-            Language of the pretrained model, by default "en"
+            Language of the pretrained model, by default "xx"
         remote_loc : str, optional
             Optional remote address of the resource, by default None. Will use
             Spark NLPs repositories otherwise.
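
Across these three hunks, the CPM default moves from a Llama 2 checkpoint to MiniCPM under the multilingual "xx" locale. A minimal usage sketch, assuming Spark NLP 5.5.3 (the input text is illustrative):

    # A minimal sketch: with the new defaults, pretrained() with no
    # arguments resolves to "mini_cpm_2b_8bit" / "xx".
    import sparknlp
    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import CPMTransformer

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("documents")

    # Equivalent to CPMTransformer.pretrained("mini_cpm_2b_8bit", "xx")
    cpm = CPMTransformer.pretrained() \
        .setInputCols(["documents"]) \
        .setMaxOutputLength(50) \
        .setOutputCol("generation")

    data = spark.createDataFrame([["Write a short greeting."]]).toDF("text")
    Pipeline(stages=[documentAssembler, cpm]).fit(data) \
        .transform(data).select("generation.result").show(truncate=False)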
@@ -32,7 +32,7 @@ class NLLBTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
     ...     .setOutputCol("generation")
 
 
-    The default model is ``"nllb_418M"``, if no name is provided. For available
+    The default model is ``"nllb_distilled_600M_8int"``, if no name is provided. For available
     pretrained models please see the `Models Hub
     <https://sparknlp.org/models?q=nllb>`__.
 
@@ -164,7 +164,7 @@ class NLLBTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
     >>> documentAssembler = DocumentAssembler() \\
     ...     .setInputCol("text") \\
     ...     .setOutputCol("documents")
-    >>> nllb = NLLBTransformer.pretrained("nllb_418M") \\
+    >>> nllb = NLLBTransformer.pretrained("nllb_distilled_600M_8int") \\
     ...     .setInputCols(["documents"]) \\
     ...     .setMaxOutputLength(50) \\
     ...     .setOutputCol("generation") \\
@@ -398,13 +398,13 @@ class NLLBTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
         return NLLBTransformer(java_model=jModel)
 
     @staticmethod
-    def pretrained(name="nllb_418M", lang="xx", remote_loc=None):
+    def pretrained(name="nllb_distilled_600M_8int", lang="xx", remote_loc=None):
         """Downloads and loads a pretrained model.
 
         Parameters
         ----------
         name : str, optional
-            Name of the pretrained model, by default "nllb_418M"
+            Name of the pretrained model, by default "nllb_distilled_600M_8int"
         lang : str, optional
             Language of the pretrained model, by default "en"
         remote_loc : str, optional
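
For NLLB, the default checkpoint moves from the 418M model to the 8-bit distilled 600M variant. Since NLLB is a translation model, a realistic sketch also sets source and target languages; the `setSrcLang`/`setTgtLang` setters and the FLORES-style codes below are assumptions not shown in this hunk:

    # A minimal sketch, assuming Spark NLP 5.5.3 and NLLB's srcLang/tgtLang
    # parameters: translate English to French with the new default model.
    import sparknlp
    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import NLLBTransformer

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("documents")

    nllb = NLLBTransformer.pretrained() \
        .setInputCols(["documents"]) \
        .setMaxOutputLength(50) \
        .setSrcLang("eng_Latn") \
        .setTgtLang("fra_Latn") \
        .setOutputCol("generation")

    data = spark.createDataFrame([["Hello, how are you?"]]).toDF("text")
    Pipeline(stages=[documentAssembler, nllb]).fit(data) \
        .transform(data).select("generation.result").show(truncate=False)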
@@ -37,7 +37,7 @@ class Phi3Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
     ...     .setOutputCol("generation")
 
 
-    The default model is ``"phi3"``, if no name is provided. For available
+    The default model is ``phi_3_mini_128k_instruct``, if no name is provided. For available
     pretrained models please see the `Models Hub
     <https://sparknlp.org/models?q=phi3>`__.
 
@@ -112,7 +112,7 @@ class Phi3Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
     >>> documentAssembler = DocumentAssembler() \\
     ...     .setInputCol("text") \\
     ...     .setOutputCol("documents")
-    >>> phi3 = Phi3Transformer.pretrained("phi3") \\
+    >>> phi3 = Phi3Transformer.pretrained(phi_3_mini_128k_instruct) \\
     ...     .setInputCols(["documents"]) \\
     ...     .setMaxOutputLength(50) \\
     ...     .setOutputCol("generation")
@@ -308,13 +308,13 @@ class Phi3Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
         return Phi3Transformer(java_model=jModel)
 
     @staticmethod
-    def pretrained(name="phi3", lang="en", remote_loc=None):
+    def pretrained(name="phi_3_mini_128k_instruct", lang="en", remote_loc=None):
         """Downloads and loads a pretrained model.
 
         Parameters
         ----------
         name : str, optional
-            Name of the pretrained model, by default "phi3"
+            Name of the pretrained model, by default phi_3_mini_128k_instruct
         lang : str, optional
             Language of the pretrained model, by default "en"
         remote_loc : str, optional
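
Note that the added Phi-3 docs drop the quotation marks around the model name (``phi_3_mini_128k_instruct`` in the class docstring and `pretrained(phi_3_mini_128k_instruct)` in the doctest). The name is an ordinary string argument, so running that doctest verbatim would raise a `NameError`. A corrected sketch, assuming Spark NLP 5.5.3:

    # A minimal sketch: pass the model name as a string, unlike the
    # doctest in this hunk, which shows it unquoted.
    from sparknlp.annotator import Phi3Transformer

    phi3 = Phi3Transformer.pretrained("phi_3_mini_128k_instruct") \
        .setInputCols(["documents"]) \
        .setMaxOutputLength(50) \
        .setOutputCol("generation")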
@@ -121,7 +121,7 @@ class QwenTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
     >>> documentAssembler = DocumentAssembler() \\
     ...     .setInputCol("text") \\
     ...     .setOutputCol("documents")
-    >>> qwen = QwenTransformer.pretrained("qwen-7b") \\
+    >>> qwen = QwenTransformer.pretrained("qwen_7.5b_chat") \\
     ...     .setInputCols(["documents"]) \\
     ...     .setMaxOutputLength(50) \\
     ...     .setOutputCol("generation")
@@ -317,13 +317,13 @@ class QwenTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
         return QwenTransformer(java_model=jModel)
 
     @staticmethod
-    def pretrained(name="qwen-7b", lang="en", remote_loc=None):
+    def pretrained(name="qwen_7.5b_chat", lang="en", remote_loc=None):
         """Downloads and loads a pretrained model.
 
         Parameters
         ----------
         name : str, optional
-            Name of the pretrained model, by default "qwen-7b"
+            Name of the pretrained model, by default "qwen_7.5b_chat"
         lang : str, optional
             Language of the pretrained model, by default "en"
         remote_loc : str, optional
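
The Qwen hunks follow the same pattern, swapping the old `qwen-7b` default for `qwen_7.5b_chat`. A short sketch, assuming Spark NLP 5.5.3:

    from sparknlp.annotator import QwenTransformer

    # Equivalent to QwenTransformer.pretrained("qwen_7.5b_chat", "en")
    qwen = QwenTransformer.pretrained() \
        .setInputCols(["documents"]) \
        .setMaxOutputLength(50) \
        .setOutputCol("generation")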
@@ -65,6 +65,7 @@ class ImageAssembler(AnnotatorTransformer):
     outputAnnotatorType = AnnotatorType.IMAGE
 
     inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString)
+    textCol = Param(Params._dummy(), "textCol", "text column name", typeConverter=TypeConverters.toString)
     outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString)
     name = 'ImageAssembler'
 
@@ -101,3 +102,13 @@ class ImageAssembler(AnnotatorTransformer):
     def getOutputCol(self):
         """Gets output column name of annotations."""
         return self.getOrDefault(self.outputCol)
+
+    def setTextCol(self, value):
+        """Sets an optional text column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of an optional input text column
+        """
+        return self._set(inputCol=value)
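
One caveat worth flagging: as diffed, `setTextCol` calls `self._set(inputCol=value)` rather than `self._set(textCol=value)`, so the documented setter actually overwrites the image input column. A sketch that sets the new `textCol` Param through the generic `Params.set` instead, which sidesteps the setter (column names are illustrative):

    # A minimal sketch, assuming Spark NLP 5.5.3. setTextCol() as diffed
    # writes to inputCol rather than textCol, so setting the Param
    # directly is the safer route until that is corrected upstream.
    from sparknlp.base import ImageAssembler

    imageAssembler = ImageAssembler() \
        .setInputCol("image") \
        .setOutputCol("image_assembler")
    imageAssembler.set(imageAssembler.textCol, "text")  # bypasses the setter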
@@ -277,7 +277,7 @@ class LightPipeline:
 
         return result
 
-    def fullAnnotateImage(self, path_to_image):
+    def fullAnnotateImage(self, path_to_image, text=None):
         """Annotates the data provided into `Annotation` type results.
 
         The data should be either a list or a str.
@@ -287,27 +287,38 @@ class LightPipeline:
         path_to_image : list or str
             Source path of image, list of paths to images
 
+        text: list or str, optional
+            Optional list or str of texts. If None, defaults to empty list if path_to_image is a list, or empty string if path_to_image is a string.
+
         Returns
         -------
         List[AnnotationImage]
             The result of the annotation
         """
+        if not isinstance(path_to_image, (str, list)):
+            raise TypeError("argument for path_to_image must be 'str' or 'list[str]'")
+
+        if text is None:
+            text = "" if isinstance(path_to_image, str) else []
+
+        if type(path_to_image) != type(text):
+            raise ValueError("`path_to_image` and `text` must be of the same type")
+
         stages = self.pipeline_model.stages
         if not self._skipPipelineValidation(stages):
             self._validateStagesInputCols(stages)
 
-        if type(path_to_image) is str:
+        if isinstance(path_to_image, str):
             path_to_image = [path_to_image]
+            text = [text]
 
-        if type(path_to_image) is list:
-            result = []
+        result = []
 
-            for image_result in self._lightPipeline.fullAnnotateImageJava(path_to_image):
-                result.append(self.__buildStages(image_result))
+        for image_result in self._lightPipeline.fullAnnotateImageJava(path_to_image, text):
+            result.append(self.__buildStages(image_result))
+
+        return result
 
-            return result
-        else:
-            raise TypeError("argument for annotation may be 'str' or list[str]")
 
     def __buildStages(self, annotations_result):
         stages = {}
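
The net effect is that `fullAnnotateImage` now accepts an optional prompt per image, validates that `path_to_image` and `text` are the same type, and always returns a list. A usage sketch, assuming `pipeline_model` is a fitted `PipelineModel` whose stages accept an image plus an optional text, such as a visual question answering model (paths and questions are illustrative):

    # A minimal sketch; pipeline_model is assumed to be fitted already.
    from sparknlp.base import LightPipeline

    light_pipeline = LightPipeline(pipeline_model)

    # str input: text defaults to "" when omitted
    result = light_pipeline.fullAnnotateImage("images/bluetick.jpg")

    # list input: both arguments must then be lists
    results = light_pipeline.fullAnnotateImage(
        ["images/bluetick.jpg", "images/palace.jpg"],
        ["What breed is this dog?", "Where is this building?"],
    )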
@@ -67,6 +67,33 @@ class HasCaseSensitiveProperties:
         return self.getOrDefault(self.caseSensitive)
 
 
+class HasClsTokenProperties:
+    useCLSToken = Param(Params._dummy(),
+                        "useCLSToken",
+                        "Whether to use CLS token for pooling (true) or attention-based average pooling (false)",
+                        typeConverter=TypeConverters.toBoolean)
+
+    def setUseCLSToken(self, value):
+        """Sets whether to ignore case in tokens for embeddings matching.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to use CLS token for pooling (true) or attention-based average pooling (false)
+        """
+        return self._set(useCLSToken=value)
+
+    def getUseCLSToken(self):
+        """Gets whether to use CLS token for pooling (true) or attention-based average pooling (false)
+
+        Returns
+        -------
+        bool
+            Whether to use CLS token for pooling (true) or attention-based average pooling (false)
+        """
+        return self.getOrDefault(self.useCLSToken)
+
+
 class HasClassifierActivationProperties:
     activation = Param(Params._dummy(),
                        "activation",
@@ -1006,3 +1006,18 @@ class _SnowFlakeEmbeddingsLoader(ExtendedJavaWrapper):
         super(_SnowFlakeEmbeddingsLoader, self).__init__(
             "com.johnsnowlabs.nlp.embeddings.SnowFlakeEmbeddings.loadSavedModel", path, jspark
         )
+
+
+class _AutoGGUFEmbeddingsLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark):
+        super(_AutoGGUFEmbeddingsLoader, self).__init__(
+            "com.johnsnowlabs.nlp.embeddings.AutoGGUFEmbeddings.loadSavedModel", path, jspark)
+
+
+class _BLIPForQuestionAnswering(ExtendedJavaWrapper):
+    def __init__(self, path, jspark):
+        super(_BLIPForQuestionAnswering, self).__init__(
+            "com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestionAnswering.loadSavedModel",
+            path,
+            jspark,
+        )
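
These private wrappers follow the existing loader pattern in this file, where each backs a public `loadSavedModel` classmethod on the corresponding annotator. A sketch of the expected call sites, assuming that pattern holds here too (folder paths are illustrative):

    # A minimal sketch, assuming the usual Spark NLP loadSavedModel pattern.
    import sparknlp
    from sparknlp.annotator import AutoGGUFEmbeddings, BLIPForQuestionAnswering

    spark = sparknlp.start()

    # Load locally exported models from a folder.
    gguf_embeddings = AutoGGUFEmbeddings.loadSavedModel("exported/gguf_model", spark)
    blip = BLIPForQuestionAnswering.loadSavedModel("exported/blip_vqa", spark)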
@@ -0,0 +1,15 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module for reading different files types."""
+from sparknlp.reader.sparknlp_reader import *
@@ -0,0 +1,113 @@
+# Copyright 2017-2024 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from sparknlp.internal import ExtendedJavaWrapper
+
+
+class SparkNLPReader(ExtendedJavaWrapper):
+    """Instantiates class to read HTML, email, and document files.
+
+    Two types of input paths are supported:
+
+    - `htmlPath`: A path to a directory of HTML files or a single HTML file (e.g., `"path/html/files"`).
+    - `url`: A single URL or a set of URLs (e.g., `"https://www.wikipedia.org"`).
+
+    Parameters
+    ----------
+    spark : SparkSession
+        The active Spark session.
+    params : dict, optional
+        A dictionary with custom configurations.
+    """
+
+    def __init__(self, spark, params=None):
+        if params is None:
+            params = {}
+        super(SparkNLPReader, self).__init__("com.johnsnowlabs.reader.SparkNLPReader", params)
+        self.spark = spark
+
+    def html(self, htmlPath):
+        """Reads HTML files or URLs and returns a Spark DataFrame.
+
+        Parameters
+        ----------
+        htmlPath : str or list of str
+            Path(s) to HTML file(s) or a list of URLs.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing the parsed HTML content.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> html_df = SparkNLPReader(spark).html("https://www.wikipedia.org")
+
+        You can also use SparkNLP to simplify the process:
+
+        >>> import sparknlp
+        >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
+        >>> html_df.show(truncate=False)
+        """
+        if not isinstance(htmlPath, (str, list)) or (isinstance(htmlPath, list) and not all(isinstance(item, str) for item in htmlPath)):
+            raise TypeError("htmlPath must be a string or a list of strings")
+        jdf = self._java_obj.html(htmlPath)
+        return self.getDataFrame(self.spark, jdf)
+
+    def email(self, filePath):
+        """Reads email files and returns a Spark DataFrame.
+
+        Parameters
+        ----------
+        filePath : str
+            Path to an email file or a directory containing emails.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing parsed email data.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> email_df = SparkNLPReader(spark).email("home/user/emails-directory")
+
+        Using SparkNLP:
+
+        >>> import sparknlp
+        >>> email_df = sparknlp.read().email("home/user/emails-directory")
+        >>> email_df.show(truncate=False)
+        """
+        if not isinstance(filePath, str):
+            raise TypeError("filePath must be a string")
+        jdf = self._java_obj.email(filePath)
+        return self.getDataFrame(self.spark, jdf)
+
+    def doc(self, docPath):
+        """Reads document files and returns a Spark DataFrame.
+
+        Parameters
+        ----------
+        docPath : str
+            Path to a document file.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing parsed document content.
+        """
+        if not isinstance(docPath, str):
+            raise TypeError("docPath must be a string")
+        jdf = self._java_obj.doc(docPath)
+        return self.getDataFrame(self.spark, jdf)
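
Of the three readers in this new file, only `doc` ships without a doctest; a usage sketch in the same style as the `html` and `email` examples above (the file path is illustrative):

    # A minimal sketch, mirroring the html/email doctests in this file.
    import sparknlp

    spark = sparknlp.start()
    doc_df = sparknlp.read().doc("home/user/word-documents/example.docx")
    doc_df.show(truncate=False)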