PyPI - spark-nlp - Versions diffs - 6.0.1rc1__py2.py3-none-any.whl → 6.0.2__py2.py3-none-any.whl - Mend

spark-nlp 6.0.1rc1__py2.py3-none-any.whl → 6.0.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of spark-nlp might be problematic. Click here for more details.

Files changed (36) hide show

{spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/METADATA +13 -6
{spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/RECORD +36 -30
{spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/WHEEL +1 -1
sparknlp/__init__.py +4 -2
sparknlp/annotator/cv/__init__.py +2 -0
sparknlp/annotator/cv/florence2_transformer.py +180 -0
sparknlp/annotator/cv/gemma3_for_multimodal.py +5 -10
sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
sparknlp/annotator/cv/janus_for_multimodal.py +8 -13
sparknlp/annotator/cv/llava_for_multimodal.py +1 -1
sparknlp/annotator/cv/paligemma_for_multimodal.py +7 -7
sparknlp/annotator/cv/phi3_vision_for_multimodal.py +1 -1
sparknlp/annotator/cv/qwen2vl_transformer.py +1 -1
sparknlp/annotator/cv/smolvlm_transformer.py +7 -13
sparknlp/annotator/date2_chunk.py +1 -1
sparknlp/annotator/document_character_text_splitter.py +8 -8
sparknlp/annotator/document_token_splitter.py +7 -7
sparknlp/annotator/embeddings/bge_embeddings.py +21 -19
sparknlp/annotator/embeddings/snowflake_embeddings.py +15 -15
sparknlp/annotator/openai/openai_completion.py +3 -4
sparknlp/annotator/seq2seq/m2m100_transformer.py +1 -1
sparknlp/annotator/seq2seq/mistral_transformer.py +2 -3
sparknlp/annotator/seq2seq/nllb_transformer.py +1 -1
sparknlp/annotator/seq2seq/qwen_transformer.py +26 -25
sparknlp/annotator/spell_check/context_spell_checker.py +1 -1
sparknlp/base/prompt_assembler.py +1 -1
sparknlp/common/properties.py +7 -7
sparknlp/internal/__init__.py +19 -0
sparknlp/partition/__init__.py +16 -0
sparknlp/partition/partition.py +244 -0
sparknlp/partition/partition_properties.py +257 -0
sparknlp/partition/partition_transformer.py +196 -0
sparknlp/reader/pdf_to_text.py +50 -4
sparknlp/reader/sparknlp_reader.py +56 -52
sparknlp/training/spacy_to_annotation.py +7 -7
{spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/top_level.txt +0 -0

sparknlp/annotator/cv/internvl_for_multimodal.py ADDED Viewed

@@ -0,0 +1,280 @@
+from sparknlp.common import *
+class InternVLForMultiModal(AnnotatorModel,
+                          HasBatchedAnnotateImage,
+                          HasImageFeatureProperties,
+                          HasEngine,
+                          HasGeneratorProperties):
+    """
+    InternVLForMultiModal can load InternVL Vision models for visual question answering.
+    The model consists of a vision encoder, a text encoder, a text decoder and a model merger.
+    The vision encoder will encode the input image, the text encoder will encode the input text,
+    the model merger will merge the image and text embeddings, and the text decoder will output the answer.
+    InternVL 2.5 is an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0,
+    maintaining its core model architecture while introducing significant enhancements in training and testing
+    strategies as well as data quality. Key features include:
+    - Large context window support
+    - Multilingual support
+    - Multimodal capabilities handling both text and image inputs
+    - Optimized for deployment with int4 quantization
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
+    >>> visualQA = InternVLForMultiModal.pretrained() \\
+    ...     .setInputCols("image_assembler") \\
+    ...     .setOutputCol("answer")
+    The default model is `"internvl2_5_1b_int4"`, if no name is provided.
+    For available pretrained models, refer to the `Models Hub
+    <https://sparknlp.org/models?task=Question+Answering>`__.
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``IMAGE``              ``DOCUMENT``
+    ====================== ======================
+    Parameters
+    ----------
+    batchSize : int, optional
+        Batch size. Larger values allow faster processing but require more memory,
+        by default 1.
+    maxSentenceLength : int, optional
+        Maximum sentence length to process, by default 4096.
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> from pyspark.sql.functions import lit
+    >>> image_df = spark.read.format("image").load(path=images_path)
+    >>> test_df = image_df.withColumn(
+    ...     "text",
+    ...     lit("<|im_start|><image>\\nDescribe this image in detail.<|im_end|><|im_start|>assistant\\n")
+    ... )
+    >>> imageAssembler = ImageAssembler() \\
+    ...     .setInputCol("image") \\
+    ...     .setOutputCol("image_assembler")
+    >>> visualQA = InternVLForMultiModal.pretrained() \\
+    ...     .setInputCols("image_assembler") \\
+    ...     .setOutputCol("answer")
+    >>> pipeline = Pipeline().setStages([
+    ...     imageAssembler,
+    ...     visualQA
+    ... ])
+    >>> result = pipeline.fit(test_df).transform(test_df)
+    >>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
+    """
+    name = "InternVLForMultiModal"
+    inputAnnotatorTypes = [AnnotatorType.IMAGE]
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+    minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+                            typeConverter=TypeConverters.toInt)
+    maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+                            typeConverter=TypeConverters.toInt)
+    doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+                     typeConverter=TypeConverters.toBoolean)
+    temperature = Param(Params._dummy(), "temperature", "The value used to module the next token probabilities",
+                        typeConverter=TypeConverters.toFloat)
+    topK = Param(Params._dummy(), "topK",
+                 "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+                 typeConverter=TypeConverters.toInt)
+    topP = Param(Params._dummy(), "topP",
+                 "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+                 typeConverter=TypeConverters.toFloat)
+    repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+                              "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+                              typeConverter=TypeConverters.toFloat)
+    noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+                              "If set to int > 0, all ngrams of that size can only occur once",
+                              typeConverter=TypeConverters.toInt)
+    ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+                           "A list of token ids which are ignored in the decoder's output",
+                           typeConverter=TypeConverters.toListInt)
+    beamSize = Param(Params._dummy(), "beamSize",
+                     "The Number of beams for beam search.",
+                     typeConverter=TypeConverters.toInt)
+    def setMaxSentenceSize(self, value):
+        """Sets Maximum sentence length that the annotator will process, by
+        default 4096.
+        Parameters
+        ----------
+        value : int
+            Maximum sentence length that the annotator will process
+        """
+        return self._set(maxSentenceLength=value)
+    def setIgnoreTokenIds(self, value):
+        """A list of token ids which are ignored in the decoder's output.
+        Parameters
+        ----------
+        value : List[int]
+            The words to be filtered out
+        """
+        return self._set(ignoreTokenIds=value)
+    def setMinOutputLength(self, value):
+        """Sets minimum length of the sequence to be generated.
+        Parameters
+        ----------
+        value : int
+            Minimum length of the sequence to be generated
+        """
+        return self._set(minOutputLength=value)
+    def setMaxOutputLength(self, value):
+        """Sets maximum length of output text.
+        Parameters
+        ----------
+        value : int
+            Maximum length of output text
+        """
+        return self._set(maxOutputLength=value)
+    def setDoSample(self, value):
+        """Sets whether or not to use sampling, use greedy decoding otherwise.
+        Parameters
+        ----------
+        value : bool
+            Whether or not to use sampling; use greedy decoding otherwise
+        """
+        return self._set(doSample=value)
+    def setTemperature(self, value):
+        """Sets the value used to module the next token probabilities.
+        Parameters
+        ----------
+        value : float
+            The value used to module the next token probabilities
+        """
+        return self._set(temperature=value)
+    def setTopK(self, value):
+        """Sets the number of highest probability vocabulary tokens to keep for
+        top-k-filtering.
+        Parameters
+        ----------
+        value : int
+            Number of highest probability vocabulary tokens to keep
+        """
+        return self._set(topK=value)
+    def setTopP(self, value):
+        """Sets the top cumulative probability for vocabulary tokens.
+        If set to float < 1, only the most probable tokens with probabilities
+        that add up to ``topP`` or higher are kept for generation.
+        Parameters
+        ----------
+        value : float
+            Cumulative probability for vocabulary tokens
+        """
+        return self._set(topP=value)
+    def setRepetitionPenalty(self, value):
+        """Sets the parameter for repetition penalty. 1.0 means no penalty.
+        Parameters
+        ----------
+        value : float
+            The repetition penalty
+        References
+        ----------
+        See `Ctrl: A Conditional Transformer Language Model For Controllable
+        Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+        """
+        return self._set(repetitionPenalty=value)
+    def setNoRepeatNgramSize(self, value):
+        """Sets size of n-grams that can only occur once.
+        If set to int > 0, all ngrams of that size can only occur once.
+        Parameters
+        ----------
+        value : int
+            N-gram size can only occur once
+        """
+        return self._set(noRepeatNgramSize=value)
+    def setBeamSize(self, value):
+        """Sets the number of beam size for beam search, by default `1`.
+        Parameters
+        ----------
+        value : int
+            Number of beam size for beam search
+        """
+        return self._set(beamSize=value)
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.InternVLForMultiModal",
+                 java_model=None):
+        super(InternVLForMultiModal, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=1,
+            minOutputLength=0,
+            maxOutputLength=20,
+            doSample=False,
+            temperature=0.6,
+            topK=-1,
+            topP=0.9,
+            repetitionPenalty=1.0,
+            noRepeatNgramSize=3,
+            ignoreTokenIds=[],
+            beamSize=1
+        )
+    @staticmethod
+    def loadSavedModel(folder, spark_session, use_openvino=False):
+        """Loads a locally saved model.
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+        Returns
+        -------
+        InternVLForMultiModal
+            The restored model
+        """
+        from sparknlp.internal import _InternVLForMultiModalLoader
+        jModel = _InternVLForMultiModalLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+        return InternVLForMultiModal(java_model=jModel)
+    @staticmethod
+    def pretrained(name="internvl2_5_1b_int4", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default
+            "internvl2_5_1b_int4"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+        Returns
+        -------
+        InternVLForMultiModal
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(InternVLForMultiModal, name, lang, remote_loc)

sparknlp/annotator/cv/janus_for_multimodal.py CHANGED Viewed

@@ -36,8 +36,9 @@ class JanusForMultiModal(AnnotatorModel,
     and for image generation, it uses a tokenizer with a downsample rate of 16.
     Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
-    >>> visualQAClassifier = JanusForMultiModal.pretrained() \
-    ...     .setInputCols(["image_assembler"]) \
+    >>> visualQAClassifier = JanusForMultiModal.pretrained() \\
+    ...     .setInputCols(["image_assembler"]) \\
     ...     .setOutputCol("answer")
     The default model is `"janus_1_3b_int4"`, if no name is provided.
@@ -73,29 +74,23 @@ class JanusForMultiModal(AnnotatorModel,
     >>> from sparknlp.annotator import *
     >>> from pyspark.ml import Pipeline
     >>> from pyspark.sql.functions import lit
     >>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
     >>> test_df = image_df.withColumn(
     ...     "text",
-    ...     lit("User: <image_placeholder>Describe image in details\n\nAssistant:")
+    ...     lit("User: <image_placeholder>Describe image in details\\n\\nAssistant:")
     ... )
-    >>> imageAssembler = ImageAssembler() \
-    ...     .setInputCol("image") \
+    >>> imageAssembler = ImageAssembler() \\
+    ...     .setInputCol("image") \\
     ...     .setOutputCol("image_assembler")
-    >>> visualQAClassifier = JanusForMultiModal.pretrained() \
-    ...     .setInputCols("image_assembler") \
+    >>> visualQAClassifier = JanusForMultiModal.pretrained() \\
+    ...     .setInputCols("image_assembler") \\
     ...     .setOutputCol("answer")
     >>> pipeline = Pipeline().setStages([
     ...     imageAssembler,
     ...     visualQAClassifier
     ... ])
     >>> result = pipeline.fit(test_df).transform(test_df)
     >>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
     +--------------------------------------+----------------------------------------------------------------------+
     |origin                                |result                                                                |
     +--------------------------------------+----------------------------------------------------------------------+

sparknlp/annotator/cv/llava_for_multimodal.py CHANGED Viewed

@@ -65,7 +65,7 @@ class LLAVAForMultiModal(AnnotatorModel,
     >>> from sparknlp.annotator import *
     >>> from pyspark.ml import Pipeline
     >>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
-    >>> test_df = image_df.withColumn("text", lit("USER: \n <|image|> \n What's this picture about? \n ASSISTANT:\n"))
+    >>> test_df = image_df.withColumn("text", lit("USER: \\n <|image|> \\n What's this picture about? \\n ASSISTANT:\\n"))
     >>> imageAssembler = ImageAssembler() \\
     ...     .setInputCol("image") \\
     ...     .setOutputCol("image_assembler")

sparknlp/annotator/cv/paligemma_for_multimodal.py CHANGED Viewed

@@ -28,8 +28,8 @@ class PaliGemmaForMultiModal(AnnotatorModel,
     Pretrained models can be loaded with :meth:`.pretrained` of the companion
     object:
-    >>> visualQAClassifier = PaliGemmaForMultiModal.pretrained() \
-    ...     .setInputCols(["image_assembler"]) \
+    >>> visualQAClassifier = PaliGemmaForMultiModal.pretrained() \\
+    ...     .setInputCols(["image_assembler"]) \\
     ...     .setOutputCol("answer")
     The default model is ``"paligemma_3b_pt_224_int4"``, if no name is
@@ -59,12 +59,12 @@ class PaliGemmaForMultiModal(AnnotatorModel,
     >>> from sparknlp.annotator import *
     >>> from pyspark.ml import Pipeline
     >>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
-    >>> test_df = image_df.withColumn("text", lit("USER: \n <image> \nDescribe this image. \nASSISTANT:\n"))
-    >>> imageAssembler = ImageAssembler() \
-    ...     .setInputCol("image") \
+    >>> test_df = image_df.withColumn("text", lit("USER: \\n <image> \\nDescribe this image. \\nASSISTANT:\\n"))
+    >>> imageAssembler = ImageAssembler() \\
+    ...     .setInputCol("image") \\
     ...     .setOutputCol("image_assembler")
-    >>> visualQAClassifier = PaliGemmaForMultiModal.pretrained() \
-    ...     .setInputCols("image_assembler") \
+    >>> visualQAClassifier = PaliGemmaForMultiModal.pretrained() \\
+    ...     .setInputCols("image_assembler") \\
     ...     .setOutputCol("answer")
     >>> pipeline = Pipeline().setStages([
     ...     imageAssembler,

sparknlp/annotator/cv/phi3_vision_for_multimodal.py CHANGED Viewed

@@ -65,7 +65,7 @@ class Phi3Vision(AnnotatorModel,
     >>> from sparknlp.annotator import *
     >>> from pyspark.ml import Pipeline
     >>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
-    >>> test_df = image_df.withColumn("text", lit("<|user|> \n <|image_1|> \nWhat is unusual on this picture? <|end|>\n <|assistant|>\n"))
+    >>> test_df = image_df.withColumn("text", lit("<|user|> \\n <|image_1|> \\nWhat is unusual on this picture? <|end|>\\n <|assistant|>\\n"))
     >>> imageAssembler = ImageAssembler() \\
     ...     .setInputCol("image") \\
     ...     .setOutputCol("image_assembler")

sparknlp/annotator/cv/qwen2vl_transformer.py CHANGED Viewed

@@ -68,7 +68,7 @@ class Qwen2VLTransformer(AnnotatorModel,
     >>> from sparknlp.annotator import *
     >>> from pyspark.ml import Pipeline
     >>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
-    >>> test_df = image_df.withColumn("text", lit("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n"))
+    >>> test_df = image_df.withColumn("text", lit("<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n"))
     >>> imageAssembler = ImageAssembler() \\
     ...     .setInputCol("image") \\
     ...     .setOutputCol("image_assembler")

sparknlp/annotator/cv/smolvlm_transformer.py CHANGED Viewed

@@ -33,8 +33,8 @@ class SmolVLMTransformer(AnnotatorModel,
     while maintaining strong performance on multimodal tasks.
     Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
-    >>> visualQA = SmolVLMTransformer.pretrained() \
-    ...     .setInputCols(["image_assembler"]) \
+    >>> visualQA = SmolVLMTransformer.pretrained() \\
+    ...     .setInputCols(["image_assembler"]) \\
     ...     .setOutputCol("answer")
     The default model is `"smolvlm_instruct_int4"`, if no name is provided.
@@ -82,29 +82,23 @@ class SmolVLMTransformer(AnnotatorModel,
     >>> from sparknlp.annotator import *
     >>> from pyspark.ml import Pipeline
     >>> from pyspark.sql.functions import lit
     >>> imageDF = spark.read.format("image").load(path=images_path)
     >>> testDF = imageDF.withColumn(
     ...     "text",
-    ...     lit("<|im_start|>User:<image>Can you describe the image?<end_of_utterance>\nAssistant:")
+    ...     lit("<|im_start|>User:<image>Can you describe the image?<end_of_utterance>\\nAssistant:")
     ... )
-    >>> imageAssembler = ImageAssembler() \
-    ...     .setInputCol("image") \
+    >>> imageAssembler = ImageAssembler() \\
+    ...     .setInputCol("image") \\
     ...     .setOutputCol("image_assembler")
-    >>> visualQAClassifier = SmolVLMTransformer.pretrained() \
-    ...     .setInputCols("image_assembler") \
+    >>> visualQAClassifier = SmolVLMTransformer.pretrained() \\
+    ...     .setInputCols("image_assembler") \\
     ...     .setOutputCol("answer")
     >>> pipeline = Pipeline().setStages([
     ...     imageAssembler,
     ...     visualQAClassifier
     ... ])
     >>> result = pipeline.fit(testDF).transform(testDF)
     >>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
     +--------------------------------------+----------------------------------------------------------------------+
     |origin                                |result                                                                |
     +--------------------------------------+----------------------------------------------------------------------+

sparknlp/annotator/date2_chunk.py CHANGED Viewed

@@ -24,7 +24,7 @@ class Date2Chunk(AnnotatorModel):
     ====================== ======================
     Input Annotation types Output Annotation type
     ====================== ======================
-    ``DATE``              ``CHUNK``
+    ``DATE``               ``CHUNK``
     ====================== ======================
     Parameters

sparknlp/annotator/document_character_text_splitter.py CHANGED Viewed

@@ -55,7 +55,7 @@ class DocumentCharacterTextSplitter(AnnotatorModel):
     chunkOverlap
         Length of the overlap between text chunks , by default `0`.
     splitPatterns
-        Patterns to separate the text by in decreasing priority , by default `["\n\n", "\n", " ", ""]`.
+        Patterns to separate the text by in decreasing priority , by default `["\\n\\n", "\\n", " ", ""]`.
     patternsAreRegex
         Whether to interpret the split patterns as regular expressions , by default `False`.
     keepSeparators
@@ -94,13 +94,13 @@ class DocumentCharacterTextSplitter(AnnotatorModel):
     |                                                                          result|splits[0].begin|splits[0].end|length|
     +--------------------------------------------------------------------------------+---------------+-------------+------+
     |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...|              0|        19994| 19994|
-    |["And Mademoiselle's address?" he asked.\n\n"Is Briony Lodge, Serpentine Aven...|          19798|        39395| 19597|
-    |["How did that help you?"\n\n"It was all-important. When a woman thinks that ...|          39371|        59242| 19871|
-    |["'But,' said I, 'there would be millions of red-headed men who\nwould apply....|          59166|        77833| 18667|
-    |[My friend was an enthusiastic musician, being himself not only a\nvery capab...|          77835|        97769| 19934|
-    |["And yet I am not convinced of it," I answered. "The cases which\ncome to li...|          97771|       117248| 19477|
-    |["Well, she had a slate-coloured, broad-brimmed straw hat, with a\nfeather of...|         117250|       137242| 19992|
-    |["That sounds a little paradoxical."\n\n"But it is profoundly True. Singulari...|         137244|       157171| 19927|
+    |["And Mademoiselle's address?" he asked.\\n\\n"Is Briony Lodge, Serpentine Aven...|          19798|        39395| 19597|
+    |["How did that help you?"\\n\\n"It was all-important. When a woman thinks that ...|          39371|        59242| 19871|
+    |["'But,' said I, 'there would be millions of red-headed men who\\nwould apply....|          59166|        77833| 18667|
+    |[My friend was an enthusiastic musician, being himself not only a\\nvery capab...|          77835|        97769| 19934|
+    |["And yet I am not convinced of it," I answered. "The cases which\\ncome to li...|          97771|       117248| 19477|
+    |["Well, she had a slate-coloured, broad-brimmed straw hat, with a\\nfeather of...|         117250|       137242| 19992|
+    |["That sounds a little paradoxical."\\n\\n"But it is profoundly True. Singulari...|         137244|       157171| 19927|
     +--------------------------------------------------------------------------------+---------------+-------------+------+
     """

sparknlp/annotator/document_token_splitter.py CHANGED Viewed

@@ -88,13 +88,13 @@ class DocumentTokenSplitter(AnnotatorModel):
     |                                                                          result|begin|  end|length|tokens|
     +--------------------------------------------------------------------------------+-----+-----+------+------+
     |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...|    0| 3018|  3018|   512|
-    |[study of crime, and occupied his\nimmense faculties and extraordinary powers...| 2950| 5707|  2757|   512|
-    |[but as I have changed my clothes I can't imagine how you\ndeduce it. As to M...| 5659| 8483|  2824|   512|
-    |[quarters received. Be in your chamber then at that hour, and do\nnot take it...| 8427|11241|  2814|   512|
-    |[a pity\nto miss it."\n\n"But your client--"\n\n"Never mind him. I may want y...|11188|13970|  2782|   512|
-    |[person who employs me wishes his agent to be unknown to\nyou, and I may conf...|13918|16898|  2980|   512|
-    |[letters back."\n\n"Precisely so. But how--"\n\n"Was there a secret marriage?...|16836|19744|  2908|   512|
-    |[seven hundred in\nnotes," he said.\n\nHolmes scribbled a receipt upon a shee...|19683|22551|  2868|   512|
+    |[study of crime, and occupied his\\nimmense faculties and extraordinary powers...| 2950| 5707|  2757|   512|
+    |[but as I have changed my clothes I can't imagine how you\\ndeduce it. As to M...| 5659| 8483|  2824|   512|
+    |[quarters received. Be in your chamber then at that hour, and do\\nnot take it...| 8427|11241|  2814|   512|
+    |[a pity\\nto miss it."\\n\\n"But your client--"\\n\\n"Never mind him. I may want y...|11188|13970|  2782|   512|
+    |[person who employs me wishes his agent to be unknown to\\nyou, and I may conf...|13918|16898|  2980|   512|
+    |[letters back."\\n\\n"Precisely so. But how--"\\n\\n"Was there a secret marriage?...|16836|19744|  2908|   512|
+    |[seven hundred in\\nnotes," he said.\\n\\nHolmes scribbled a receipt upon a shee...|19683|22551|  2868|   512|
     +--------------------------------------------------------------------------------+-----+-----+------+------+
     """

sparknlp/annotator/embeddings/bge_embeddings.py CHANGED Viewed

@@ -49,23 +49,9 @@ class BGEEmbeddings(AnnotatorModel,
     ``DOCUMENT``            ``SENTENCE_EMBEDDINGS``
     ====================== ======================
-    Parameters
-    ----------
-    batchSize
-        Size of every batch , by default 8
-    dimension
-        Number of embedding dimensions, by default 768
-    caseSensitive
-        Whether to ignore case in tokens for embeddings matching, by default False
-    maxSentenceLength
-        Max sentence length to process, by default 512
-    configProtoBytes
-        ConfigProto from tensorflow, serialized into byte array.
-    useCLSToken
-        Whether to use the CLS token for sentence embeddings, by default True
-    References
-    ----------
+    **References**
     `C-Pack: Packaged Resources To Advance General Chinese Embedding <https://arxiv.org/pdf/2309.07597>`__
     `BGE Github Repository <https://github.com/FlagOpen/FlagEmbedding>`__
@@ -84,6 +70,22 @@ class BGEEmbeddings(AnnotatorModel,
     benchmark; meanwhile, our released English data is 2 times larger than the Chinese data. All
     these resources are made publicly available at https://github.com/FlagOpen/FlagEmbedding.*
+    Parameters
+    ----------
+    batchSize
+        Size of every batch , by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+    useCLSToken
+        Whether to use the CLS token for sentence embeddings, by default True
     Examples
     --------
     >>> import sparknlp
@@ -106,8 +108,8 @@ class BGEEmbeddings(AnnotatorModel,
     ...     embeddingsFinisher
     ... ])
     >>> data = spark.createDataFrame([["query: how much protein should a female eat",
-    ... "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + \
-    ... "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + \
+    ... "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + \\
+    ... "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + \\
     ... "marathon. Check out the chart below to see how much protein you should be eating each day.",
     ... ]]).toDF("text")
     >>> result = pipeline.fit(data).transform(data)

sparknlp/annotator/embeddings/snowflake_embeddings.py CHANGED Viewed

@@ -47,21 +47,7 @@ class SnowFlakeEmbeddings(AnnotatorModel,
 	``DOCUMENT``            ``SENTENCE_EMBEDDINGS``
 	====================== ======================
-	Parameters
-	----------
-	batchSize
-		Size of every batch , by default 8
-	dimension
-		Number of embedding dimensions, by default 768
-	caseSensitive
-		Whether to ignore case in tokens for embeddings matching, by default False
-	maxSentenceLength
-		Max sentence length to process, by default 512
-	configProtoBytes
-		ConfigProto from tensorflow, serialized into byte array.
-	References
-	----------
+	**References**
 	`Arctic-Embed: Scalable, Efficient, and Accurate Text Embedding Models <https://arxiv.org/abs/2405.05374>`__
 	`Snowflake Arctic-Embed Models <https://github.com/Snowflake-Labs/arctic-embed>`__
@@ -78,6 +64,20 @@ class SnowFlakeEmbeddings(AnnotatorModel,
      data curation is crucial to retrieval accuracy. A detailed technical report will be available
      shortly. *
+	Parameters
+	----------
+	batchSize
+		Size of every batch , by default 8
+	dimension
+		Number of embedding dimensions, by default 768
+	caseSensitive
+		Whether to ignore case in tokens for embeddings matching, by default False
+	maxSentenceLength
+		Max sentence length to process, by default 512
+	configProtoBytes
+		ConfigProto from tensorflow, serialized into byte array.
 	Examples
 	--------
 	>>> import sparknlp

sparknlp/annotator/openai/openai_completion.py CHANGED Viewed

@@ -63,7 +63,6 @@ class OpenAICompletion(AnnotatorModel):
    >>> from sparknlp.annotator import *
    >>> from sparknlp.common import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
@@ -83,9 +82,9 @@ class OpenAICompletion(AnnotatorModel):
    +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |completion                                                                                                                                                                                                                                                                                        |
    +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-   |[{document, 0, 258, \n\nI had the pleasure of dining at La Fiorita recently, and it was a truly delightful experience! The menu boasted a wonderful selection of classic Italian dishes, all exquisitely prepared and presented. The service staff was friendly and attentive and really, {}, []}]|
-   |[{document, 0, 227, \n\nI recently visited Barbecue Joe's for dinner and it was amazing! The menu had so many items to choose from including pulled pork, smoked turkey, brisket, pork ribs, and sandwiches. I opted for the pulled pork sandwich and let, {}, []}]                               |
-   |[{document, 0, 172, \n\n{ \n   "review": { \n      "overallRating": 4, \n      "reviewBody": "I enjoyed my meal at this restaurant. The food was flavourful, well-prepared and beautifully presented., {}, []}]                                                                                   |
+   |[{document, 0, 258, \\n\\nI had the pleasure of dining at La Fiorita recently, and it was a truly delightful experience! The menu boasted a wonderful selection of classic Italian dishes, all exquisitely prepared and presented. The service staff was friendly and attentive and really, {}, []}]|
+   |[{document, 0, 227, \\n\\nI recently visited Barbecue Joe's for dinner and it was amazing! The menu had so many items to choose from including pulled pork, smoked turkey, brisket, pork ribs, and sandwiches. I opted for the pulled pork sandwich and let, {}, []}]                               |
+   |[{document, 0, 172, \\n\\n{ \\n   "review": { \\n      "overallRating": 4, \\n      "reviewBody": "I enjoyed my meal at this restaurant. The food was flavourful, well-prepared and beautifully presented., {}, []}]                                                                                   |
    +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    """

sparknlp/annotator/seq2seq/m2m100_transformer.py CHANGED Viewed

@@ -77,7 +77,7 @@ class M2M100Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
         Target Language (Default: `fr`)
     Languages Covered
-    -----
+    -----------------
     Afrikaans (af), Amharic (am), Arabic (ar), Asturian (ast), Azerbaijani (az), Bashkir (ba),
     Belarusian (be), Bulgarian (bg), Bengali (bn), Breton (br), Bosnian (bs), Catalan; Valencian
     (ca), Cebuano (ceb), Czech (cs), Welsh (cy), Danish (da), German (de), Greeek (el), English

spark-nlp 6.0.1rc1__py2.py3-none-any.whl → 6.0.2__py2.py3-none-any.whl

Potentially problematic release.

spark-nlp 6.0.1rc1__py2.py3-none-any.whl → 6.0.2__py2.py3-none-any.whl