spark-nlp 6.1.3rc1__py2.py3-none-any.whl → 6.1.5__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,234 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the GGUFRankingFinisher."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+ from sparknlp.internal import AnnotatorTransformer
+
+
+ class GGUFRankingFinisher(AnnotatorTransformer):
+     """Finisher for AutoGGUFReranker outputs that provides ranking capabilities
+     including top-k selection, sorting by relevance score, and score normalization.
+
+     This finisher processes the output of AutoGGUFReranker, which contains documents with
+     relevance scores in their metadata. It provides several options for post-processing:
+
+     - Top-k selection: Select only the top k documents by relevance score
+     - Score thresholding: Filter documents by minimum relevance score
+     - Min-max scaling: Normalize relevance scores to 0-1 range
+     - Sorting: Sort documents by relevance score in descending order
+     - Ranking: Add rank information to document metadata
+
+     The finisher preserves the document annotation structure while adding ranking information
+     to the metadata and optionally filtering/sorting the documents.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/finisher/gguf_ranking_finisher_example.py>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     inputCols
+         Name of input annotation columns containing reranked documents
+     outputCol
+         Name of output annotation column containing ranked documents, by default "ranked_documents"
+     topK
+         Maximum number of top documents to return based on relevance score (-1 for no limit), by default -1
+     minRelevanceScore
+         Minimum relevance score threshold for filtering documents, by default ``float('-inf')`` (no filtering)
+     minMaxScaling
+         Whether to apply min-max scaling to normalize relevance scores to 0-1 range, by default False
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> reranker = AutoGGUFReranker.pretrained() \\
+     ...     .setInputCols("document") \\
+     ...     .setOutputCol("reranked_documents") \\
+     ...     .setQuery("A man is eating pasta.")
+     >>> finisher = GGUFRankingFinisher() \\
+     ...     .setInputCols("reranked_documents") \\
+     ...     .setOutputCol("ranked_documents") \\
+     ...     .setTopK(3) \\
+     ...     .setMinMaxScaling(True)
+     >>> pipeline = Pipeline().setStages([documentAssembler, reranker, finisher])
+     >>> data = spark.createDataFrame([
+     ...     ("A man is eating food.",),
+     ...     ("A man is eating a piece of bread.",),
+     ...     ("The girl is carrying a baby.",),
+     ...     ("A man is riding a horse.",)
+     ... ], ["text"])
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("ranked_documents").show(truncate=False)
+     # Documents will be sorted by relevance with rank information in metadata
+     """
+
+     name = "GGUFRankingFinisher"
+
+     inputCols = Param(Params._dummy(),
+                       "inputCols",
+                       "Name of input annotation columns containing reranked documents",
+                       typeConverter=TypeConverters.toListString)
+
+     outputCol = Param(Params._dummy(),
+                       "outputCol",
+                       "Name of output annotation column containing ranked documents",
+                       typeConverter=TypeConverters.toListString)
+
+     topK = Param(Params._dummy(),
+                  "topK",
+                  "Maximum number of top documents to return based on relevance score (-1 for no limit)",
+                  typeConverter=TypeConverters.toInt)
+
+     minRelevanceScore = Param(Params._dummy(),
+                               "minRelevanceScore",
+                               "Minimum relevance score threshold for filtering documents",
+                               typeConverter=TypeConverters.toFloat)
+
+     minMaxScaling = Param(Params._dummy(),
+                           "minMaxScaling",
+                           "Whether to apply min-max scaling to normalize relevance scores to 0-1 range",
+                           typeConverter=TypeConverters.toBoolean)
+
+     @keyword_only
+     def __init__(self):
+         super(GGUFRankingFinisher, self).__init__(
+             classname="com.johnsnowlabs.nlp.finisher.GGUFRankingFinisher")
+         self._setDefault(
+             topK=-1,
+             minRelevanceScore=float('-inf'),  # Equivalent to Double.MinValue
+             minMaxScaling=False,
+             outputCol=["ranked_documents"]
+         )
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setInputCols(self, *value):
+         """Sets input annotation column names.
+
+         Parameters
+         ----------
+         value : List[str]
+             Input annotation column names containing reranked documents
+         """
+         if len(value) == 1 and isinstance(value[0], list):
+             return self._set(inputCols=value[0])
+         else:
+             return self._set(inputCols=list(value))
+
+     def getInputCols(self):
+         """Gets input annotation column names.
+
+         Returns
+         -------
+         List[str]
+             Input annotation column names
+         """
+         return self.getOrDefault(self.inputCols)
+
+     def setOutputCol(self, value):
+         """Sets output annotation column name.
+
+         Parameters
+         ----------
+         value : str
+             Output annotation column name
+         """
+         return self._set(outputCol=[value])
+
+     def getOutputCol(self):
+         """Gets output annotation column name.
+
+         Returns
+         -------
+         str
+             Output annotation column name
+         """
+         output_cols = self.getOrDefault(self.outputCol)
+         return output_cols[0] if output_cols else "ranked_documents"
+
+     def setTopK(self, value):
+         """Sets maximum number of top documents to return.
+
+         Parameters
+         ----------
+         value : int
+             Maximum number of top documents to return (-1 for no limit)
+         """
+         return self._set(topK=value)
+
+     def getTopK(self):
+         """Gets maximum number of top documents to return.
+
+         Returns
+         -------
+         int
+             Maximum number of top documents to return
+         """
+         return self.getOrDefault(self.topK)
+
+     def setMinRelevanceScore(self, value):
+         """Sets minimum relevance score threshold.
+
+         Parameters
+         ----------
+         value : float
+             Minimum relevance score threshold
+         """
+         return self._set(minRelevanceScore=value)
+
+     def getMinRelevanceScore(self):
+         """Gets minimum relevance score threshold.
+
+         Returns
+         -------
+         float
+             Minimum relevance score threshold
+         """
+         return self.getOrDefault(self.minRelevanceScore)
+
+     def setMinMaxScaling(self, value):
+         """Sets whether to apply min-max scaling.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to apply min-max scaling to normalize scores
+         """
+         return self._set(minMaxScaling=value)
+
+     def getMinMaxScaling(self):
+         """Gets whether to apply min-max scaling.
+
+         Returns
+         -------
+         bool
+             Whether min-max scaling is enabled
+         """
+         return self.getOrDefault(self.minMaxScaling)
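
The docstring above names five post-processing steps, while the logic itself lives in the Scala class com.johnsnowlabs.nlp.finisher.GGUFRankingFinisher. As a rough pure-Python sketch of how thresholding, sorting, top-k selection, and min-max scaling compose (an illustration of the documented behavior, not the library's implementation):

    # Illustrative only: mirrors the documented steps on bare floats.
    def rank_scores(scores, top_k=-1, min_score=float("-inf"), min_max=False):
        kept = [s for s in scores if s >= min_score]  # score thresholding
        kept.sort(reverse=True)                       # sort by relevance, descending
        if top_k >= 0:
            kept = kept[:top_k]                       # top-k selection
        if min_max and kept:
            lo, hi = min(kept), max(kept)
            span = (hi - lo) or 1.0                   # guard: all scores identical
            kept = [(s - lo) / span for s in kept]    # normalize to the 0-1 range
        return list(enumerate(kept, start=1))         # attach 1-based ranks

    print(rank_scores([0.2, 0.9, 0.5, 0.1], top_k=3, min_max=True))
    # [(1, 1.0), (2, 0.428...), (3, 0.0)], up to float rounding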
@@ -13,9 +13,237 @@
  # limitations under the License.
  """Contains classes for partition properties used in reading various document types."""
  from typing import Dict
+ from pyspark.ml.param import Param, Params, TypeConverters

- from pyspark.ml.param import TypeConverters, Params, Param

+ class HasReaderProperties(Params):
+
+     inputCol = Param(
+         Params._dummy(),
+         "inputCol",
+         "input column name",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setInputCol(self, value):
+         """Sets input column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the Input Column
+         """
+         return self._set(inputCol=value)
+
+     outputCol = Param(
+         Params._dummy(),
+         "outputCol",
+         "output column name",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setOutputCol(self, value):
+         """Sets output column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the Output Column
+         """
+         return self._set(outputCol=value)
+
+     contentPath = Param(
+         Params._dummy(),
+         "contentPath",
+         "Path to the content source.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentPath(self, value: str):
+         """Sets content path.
+
+         Parameters
+         ----------
+         value : str
+             Path to the content source.
+         """
+         return self._set(contentPath=value)
+
+     contentType = Param(
+         Params._dummy(),
+         "contentType",
+         "Set the content type to load following MIME specification.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentType(self, value: str):
+         """Sets content type following MIME specification.
+
+         Parameters
+         ----------
+         value : str
+             Content type string (MIME format).
+         """
+         return self._set(contentType=value)
+
+     storeContent = Param(
+         Params._dummy(),
+         "storeContent",
+         "Whether to include the raw file content in the output DataFrame "
+         "as a separate 'content' column, alongside the structured output.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setStoreContent(self, value: bool):
+         """Sets whether to store raw file content.
+
+         Parameters
+         ----------
+         value : bool
+             True to include raw file content, False otherwise.
+         """
+         return self._set(storeContent=value)
+
+     titleFontSize = Param(
+         Params._dummy(),
+         "titleFontSize",
+         "Minimum font size threshold used as part of heuristic rules to detect "
+         "title elements based on formatting (e.g., bold, centered, capitalized).",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setTitleFontSize(self, value: int):
+         """Sets minimum font size for detecting titles.
+
+         Parameters
+         ----------
+         value : int
+             Minimum font size threshold for title detection.
+         """
+         return self._set(titleFontSize=value)
+
+     inferTableStructure = Param(
+         Params._dummy(),
+         "inferTableStructure",
+         "Whether to generate an HTML table representation from structured table content. "
+         "When enabled, a full <table> element is added alongside cell-level elements, "
+         "based on row and column layout.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setInferTableStructure(self, value: bool):
+         """Sets whether to infer table structure.
+
+         Parameters
+         ----------
+         value : bool
+             True to generate HTML table representation, False otherwise.
+         """
+         return self._set(inferTableStructure=value)
+
+     includePageBreaks = Param(
+         Params._dummy(),
+         "includePageBreaks",
+         "Whether to detect and tag content with page break metadata. "
+         "In Word documents, this includes manual and section breaks. "
+         "In Excel files, this includes page breaks based on column boundaries.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setIncludePageBreaks(self, value: bool):
+         """Sets whether to include page break metadata.
+
+         Parameters
+         ----------
+         value : bool
+             True to detect and tag page breaks, False otherwise.
+         """
+         return self._set(includePageBreaks=value)
+
+     ignoreExceptions = Param(
+         Params._dummy(),
+         "ignoreExceptions",
+         "Whether to ignore exceptions during processing.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setIgnoreExceptions(self, value: bool):
+         """Sets whether to ignore exceptions during processing.
+
+         Parameters
+         ----------
+         value : bool
+             True to ignore exceptions, False otherwise.
+         """
+         return self._set(ignoreExceptions=value)
+
+     explodeDocs = Param(
+         Params._dummy(),
+         "explodeDocs",
+         "Whether to explode the documents into separate rows.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setExplodeDocs(self, value: bool):
+         """Sets whether to explode the documents into separate rows.
+
+         Parameters
+         ----------
+         value : bool
+             True to split documents into multiple rows, False to keep them in one row.
+         """
+         return self._set(explodeDocs=value)
+
+     flattenOutput = Param(
+         Params._dummy(),
+         "flattenOutput",
+         "If true, output is flattened to plain text with minimal metadata",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setFlattenOutput(self, value):
+         """Sets whether to flatten the output to plain text with minimal metadata.
+
+         Parameters
+         ----------
+         value : bool
+             If true, output is flattened to plain text with minimal metadata
+         """
+         return self._set(flattenOutput=value)
+
+     titleThreshold = Param(
+         Params._dummy(),
+         "titleThreshold",
+         "Minimum font size threshold for title detection in PDF docs",
+         typeConverter=TypeConverters.toFloat
+     )
+
+     def setTitleThreshold(self, value):
+         """Sets the minimum font size threshold for title detection in PDF documents.
+
+         Parameters
+         ----------
+         value : float
+             Minimum font size threshold for title detection in PDF docs
+         """
+         return self._set(titleThreshold=value)
+
+     outputAsDocument = Param(
+         Params._dummy(),
+         "outputAsDocument",
+         "Whether to return all sentences joined into a single document",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setOutputAsDocument(self, value):
+         """Sets whether to return all sentences joined into a single document.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to return all sentences joined into a single document
+         """
+         return self._set(outputAsDocument=value)

  class HasEmailReaderProperties(Params):

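HasReaderProperties follows the standard pyspark.ml mixin pattern: Param objects are declared at class level with Params._dummy() as a placeholder parent, and each setter delegates to self._set, which returns the instance so calls chain. A minimal sketch of the same pattern on a toy class (ToyReader is hypothetical, not a spark-nlp class):

    from pyspark.ml.param import Param, Params, TypeConverters

    class ToyReader(Params):
        # Same declaration pattern as HasReaderProperties above.
        contentPath = Param(Params._dummy(), "contentPath",
                            "Path to the content source.",
                            typeConverter=TypeConverters.toString)

        def setContentPath(self, value: str):
            return self._set(contentPath=value)  # _set returns self, so setters chain

    reader = ToyReader().setContentPath("/data/docs")
    print(reader.getOrDefault(reader.contentPath))  # /data/docs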
@@ -144,6 +372,28 @@ class HasHTMLReaderProperties(Params):
          self._call_java("setHeadersPython", headers)
          return self

+     outputFormat = Param(
+         Params._dummy(),
+         "outputFormat",
+         "Output format for the table content. Options are 'plain-text', 'html-table', or 'json-table'. Default is 'json-table'.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setOutputFormat(self, value: str):
+         """Sets output format for the table content.
+
+         Options
+         -------
+         - 'plain-text'
+         - 'html-table'
+         - 'json-table' (default)
+
+         Parameters
+         ----------
+         value : str
+             Output format for the table content.
+         """
+         return self._set(outputFormat=value)

  class HasPowerPointProperties(Params):

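The outputFormat options are 'plain-text', 'html-table', and 'json-table' (the default). The Python typeConverter only coerces the value to str; validation of the option itself, if any, happens on the JVM side. A hypothetical caller-side guard, not part of spark-nlp:

    # Hypothetical helper: reject values outside the three documented options.
    ALLOWED_TABLE_FORMATS = {"plain-text", "html-table", "json-table"}

    def checked_output_format(value: str) -> str:
        if value not in ALLOWED_TABLE_FORMATS:
            raise ValueError(f"outputFormat must be one of "
                             f"{sorted(ALLOWED_TABLE_FORMATS)}, got {value!r}")
        return value

    # e.g. reader.setOutputFormat(checked_output_format("html-table"))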
@@ -317,3 +567,196 @@ class HasChunkerProperties(Params):

      def setOverlapAll(self, value):
          return self._set(overlapAll=value)
+
+
+ class HasPdfProperties(Params):
+
+     pageNumCol = Param(
+         Params._dummy(),
+         "pageNumCol",
+         "Page number output column name.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setPageNumCol(self, value: str):
+         """Sets page number output column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the column for page numbers.
+         """
+         return self._set(pageNumCol=value)
+
+     originCol = Param(
+         Params._dummy(),
+         "originCol",
+         "Input column name with original path of file.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setOriginCol(self, value: str):
+         """Sets input column with original file path.
+
+         Parameters
+         ----------
+         value : str
+             Column name that stores the file path.
+         """
+         return self._set(originCol=value)
+
+     partitionNum = Param(
+         Params._dummy(),
+         "partitionNum",
+         "Number of partitions.",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setPartitionNum(self, value: int):
+         """Sets number of partitions.
+
+         Parameters
+         ----------
+         value : int
+             Number of partitions to use.
+         """
+         return self._set(partitionNum=value)
+
+     storeSplittedPdf = Param(
+         Params._dummy(),
+         "storeSplittedPdf",
+         "Whether to store the byte content of the split PDF.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setStoreSplittedPdf(self, value: bool):
+         """Sets whether to store byte content of split PDF pages.
+
+         Parameters
+         ----------
+         value : bool
+             True to store PDF page bytes, False otherwise.
+         """
+         return self._set(storeSplittedPdf=value)
+
+     splitPage = Param(
+         Params._dummy(),
+         "splitPage",
+         "Enable/disable splitting per page to identify page numbers and improve performance.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setSplitPage(self, value: bool):
+         """Sets whether to split PDF into pages.
+
+         Parameters
+         ----------
+         value : bool
+             True to split per page, False otherwise.
+         """
+         return self._set(splitPage=value)
+
+     onlyPageNum = Param(
+         Params._dummy(),
+         "onlyPageNum",
+         "Extract only page numbers.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setOnlyPageNum(self, value: bool):
+         """Sets whether to extract only page numbers.
+
+         Parameters
+         ----------
+         value : bool
+             True to extract only page numbers, False otherwise.
+         """
+         return self._set(onlyPageNum=value)
+
+     textStripper = Param(
+         Params._dummy(),
+         "textStripper",
+         "Text stripper type used for output layout and formatting.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setTextStripper(self, value: str):
+         """Sets text stripper type.
+
+         Parameters
+         ----------
+         value : str
+             Text stripper type for layout and formatting.
+         """
+         return self._set(textStripper=value)
+
+     sort = Param(
+         Params._dummy(),
+         "sort",
+         "Enable/disable sorting content on the page.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setSort(self, value: bool):
+         """Sets whether to sort content on the page.
+
+         Parameters
+         ----------
+         value : bool
+             True to sort content, False otherwise.
+         """
+         return self._set(sort=value)
+
+     extractCoordinates = Param(
+         Params._dummy(),
+         "extractCoordinates",
+         "Force extract coordinates of text.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setExtractCoordinates(self, value: bool):
+         """Sets whether to extract coordinates of text.
+
+         Parameters
+         ----------
+         value : bool
+             True to extract coordinates, False otherwise.
+         """
+         return self._set(extractCoordinates=value)
+
+     normalizeLigatures = Param(
+         Params._dummy(),
+         "normalizeLigatures",
+         "Whether to convert ligature chars such as 'fl' into their corresponding chars (e.g., {'f', 'l'}).",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setNormalizeLigatures(self, value: bool):
+         """Sets whether to normalize ligatures (e.g., fl → f + l).
+
+         Parameters
+         ----------
+         value : bool
+             True to normalize ligatures, False otherwise.
+         """
+         return self._set(normalizeLigatures=value)
+
+     readAsImage = Param(
+         Params._dummy(),
+         "readAsImage",
+         "Read PDF pages as images.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setReadAsImage(self, value: bool):
+         """Sets whether to read PDF pages as images.
+
+         Parameters
+         ----------
+         value : bool
+             True to read as images, False otherwise.
+         """
+         return self._set(readAsImage=value)
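
Each HasPdfProperties setter returns self._set(...), which returns the instance, so configuration chains fluently. A minimal sketch, assuming HasPdfProperties is importable from sparknlp.partition.partition_properties (the module this diff patches); ToyPdfConfig is a hypothetical stand-in for a reader that mixes the trait in:

    # Assumed import path; ToyPdfConfig is hypothetical.
    from sparknlp.partition.partition_properties import HasPdfProperties

    class ToyPdfConfig(HasPdfProperties):
        """Hypothetical holder; real readers mix HasPdfProperties in the same way."""

    cfg = (
        ToyPdfConfig()
        .setSplitPage(True)           # one entry per page, with page numbers
        .setStoreSplittedPdf(False)   # skip keeping raw page bytes
        .setNormalizeLigatures(True)  # e.g. the 'fl' ligature becomes 'f' + 'l'
    )
    print(cfg.getOrDefault(cfg.splitPage))  # True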