spark-nlp 6.1.3__py2.py3-none-any.whl → 6.1.4__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of spark-nlp has been flagged as potentially problematic.
- {spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/METADATA +5 -5
- {spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/RECORD +9 -8
- sparknlp/__init__.py +1 -1
- sparknlp/partition/partition_properties.py +377 -1
- sparknlp/reader/reader2doc.py +2 -86
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +2 -88
- {spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/WHEEL +0 -0
- {spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spark-nlp
-Version: 6.1.3
+Version: 6.1.4
 Summary: John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment.
 Home-page: https://github.com/JohnSnowLabs/spark-nlp
 Author: John Snow Labs
@@ -102,7 +102,7 @@ $ java -version
 $ conda create -n sparknlp python=3.7 -y
 $ conda activate sparknlp
 # spark-nlp by default is based on pyspark 3.x
-$ pip install spark-nlp==6.1.3 pyspark==3.3.1
+$ pip install spark-nlp==6.1.4 pyspark==3.3.1
 ```
 
 In Python console or Jupyter `Python3` kernel:
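For reference, a quick post-upgrade smoke test (a minimal sketch; `sparknlp.start()` and `sparknlp.version()` are the library's standard entry points):

```python
import sparknlp

# Start a Spark session with the Spark NLP jars attached.
spark = sparknlp.start()

print(sparknlp.version())  # expected: 6.1.4
print(spark.version)       # the PySpark version, e.g. 3.3.1
```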
@@ -168,7 +168,7 @@ For a quick example of using pipelines and models take a look at our official [d
 
 ### Apache Spark Support
 
-Spark NLP *6.1.3* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
+Spark NLP *6.1.4* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
 
 | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
 |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
@@ -198,7 +198,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
 
 ### Databricks Support
 
-Spark NLP 6.1.3 has been tested and is compatible with the following runtimes:
+Spark NLP 6.1.4 has been tested and is compatible with the following runtimes:
 
 | **CPU** | **GPU** |
 |--------------------|--------------------|
@@ -216,7 +216,7 @@ We are compatible with older runtimes. For a full list check databricks support
 
 ### EMR Support
 
-Spark NLP 6.1.3 has been tested and is compatible with the following EMR releases:
+Spark NLP 6.1.4 has been tested and is compatible with the following EMR releases:
 
 | **EMR Release** |
 |--------------------|
@@ -3,7 +3,7 @@ com/johnsnowlabs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 com/johnsnowlabs/ml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 com/johnsnowlabs/ml/ai/__init__.py,sha256=YQiK2M7U4d8y5irPy_HB8ae0mSpqS9583MH44pnKJXc,295
 com/johnsnowlabs/nlp/__init__.py,sha256=DPIVXtONO5xXyOk-HB0-sNiHAcco17NN13zPS_6Uw8c,294
-sparknlp/__init__.py,sha256=
+sparknlp/__init__.py,sha256=LcfC7bWeae5XgjWbNbWH94LlJkBon5dA8fYnb_2NyGc,13814
 sparknlp/annotation.py,sha256=I5zOxG5vV2RfPZfqN9enT1i4mo6oBcn3Lrzs37QiOiA,5635
 sparknlp/annotation_audio.py,sha256=iRV_InSVhgvAwSRe9NTbUH9v6OGvTM-FPCpSAKVu0mE,1917
 sparknlp/annotation_image.py,sha256=xhCe8Ko-77XqWVuuYHFrjKqF6zPd8Z-RY_rmZXNwCXU,2547
@@ -241,7 +241,7 @@ sparknlp/logging/__init__.py,sha256=DoROFF5KLZe4t4Q-OHxqk1nhqbw9NQ-wb64y8icNwgw,
 sparknlp/logging/comet.py,sha256=_ZBi9-hlilCAnd4lvdYMWiq4Vqsppv8kow3k0cf-NG4,15958
 sparknlp/partition/__init__.py,sha256=L0w-yv_HnnvoKlSX5MzI2GKHW3RLLfGyq8bgWYVeKjU,749
 sparknlp/partition/partition.py,sha256=GXEAUvOea04Vc_JK0z112cAKFrJ4AEpjLJ8xlzZt6Kw,8551
-sparknlp/partition/partition_properties.py,sha256=
+sparknlp/partition/partition_properties.py,sha256=2tGdIv1NaJNaux_TTskKQHnARAwBkFctaqCcNw21Wr8,19920
 sparknlp/partition/partition_transformer.py,sha256=lRR1h-IMlHR8M0VeB50SbU39GHHF5PgMaJ42qOriS6A,6855
 sparknlp/pretrained/__init__.py,sha256=GV-x9UBK8F2_IR6zYatrzFcVJtkSUIMbxqWsxRUePmQ,793
 sparknlp/pretrained/pretrained_pipeline.py,sha256=lquxiaABuA68Rmu7csamJPqBoRJqMUO0oNHsmEZDAIs,5740
@@ -250,8 +250,9 @@ sparknlp/pretrained/utils.py,sha256=T1MrvW_DaWk_jcOjVLOea0NMFE9w8fe0ZT_5urZ_nEY,
 sparknlp/reader/__init__.py,sha256=-Toj3AIBki-zXPpV8ezFTI2LX1yP_rK2bhpoa8nBkTw,685
 sparknlp/reader/enums.py,sha256=MNGug9oJ1BBLM1Pbske13kAabalDzHa2kucF5xzFpHs,770
 sparknlp/reader/pdf_to_text.py,sha256=eWw-cwjosmcSZ9eHso0F5QQoeGBBnwsOhzhCXXvMjZA,7169
-sparknlp/reader/reader2doc.py,sha256=
-sparknlp/reader/reader2table.py,sha256=
+sparknlp/reader/reader2doc.py,sha256=87aMk8-_1NHd3bB1rxw56BQMJc6mGgtnYGXwKw2uCmU,5916
+sparknlp/reader/reader2image.py,sha256=k3gb4LEiqDV-pnD-HEaA1KHoAxXmoYys2Y817i1yvP0,4557
+sparknlp/reader/reader2table.py,sha256=pIR9r6NapUV4xdsFecadWlKTSJmRMAm36eqM9aXf13k,2416
 sparknlp/reader/sparknlp_reader.py,sha256=MJs8v_ECYaV1SOabI1L_2MkVYEDVImtwgbYypO7DJSY,20623
 sparknlp/training/__init__.py,sha256=qREi9u-5Vc2VjpL6-XZsyvu5jSEIdIhowW7_kKaqMqo,852
 sparknlp/training/conll.py,sha256=wKBiSTrjc6mjsl7Nyt6B8f4yXsDJkZb-sn8iOjix9cE,6961
@@ -283,7 +284,7 @@ sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py,sha256=R4yHFN3
 sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py,sha256=EoCSdcIjqQ3wv13MAuuWrKV8wyVBP0SbOEW41omHlR0,23189
 sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py,sha256=k5CQ7gKV6HZbZMB8cKLUJuZxoZWlP_DFWdZ--aIDwsc,2356
 sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py,sha256=pAxjWhjazSX8Vg0MFqJiuRVw1IbnQNSs-8Xp26L4nko,870
-spark_nlp-6.1.3.dist-info/METADATA,sha256=
-spark_nlp-6.1.3.dist-info/WHEEL,sha256=
-spark_nlp-6.1.3.dist-info/top_level.txt,sha256=
-spark_nlp-6.1.3.dist-info/RECORD,,
+spark_nlp-6.1.4.dist-info/METADATA,sha256=CqRyNEZCA_8F_J5vHG4GUZXRiavXyfb3tPMTStidr4c,19774
+spark_nlp-6.1.4.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+spark_nlp-6.1.4.dist-info/top_level.txt,sha256=uuytur4pyMRw2H_txNY2ZkaucZHUs22QF8-R03ch_-E,13
+spark_nlp-6.1.4.dist-info/RECORD,,
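The `sha256=` values in RECORD follow the wheel convention (PEP 427): urlsafe base64 of the raw SHA-256 digest with the `=` padding stripped. A small sketch to recompute one locally (the path assumes an unpacked 6.1.4 wheel in the current directory):

```python
import base64
import hashlib


def record_hash(path: str) -> str:
    """Recompute a RECORD-style hash for a file."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


# Expected for 6.1.4: sha256=LcfC7bWeae5XgjWbNbWH94LlJkBon5dA8fYnb_2NyGc
print(record_hash("sparknlp/__init__.py"))
```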
sparknlp/partition/partition_properties.py
CHANGED
@@ -13,8 +13,159 @@
 # limitations under the License.
 """Contains classes for partition properties used in reading various document types."""
 from typing import Dict
+from pyspark.ml.param import Param, Params, TypeConverters
 
-
+
+class HasReaderProperties(Params):
+
+    outputCol = Param(
+        Params._dummy(),
+        "outputCol",
+        "output column name",
+        typeConverter=TypeConverters.toString
+    )
+
+    contentPath = Param(
+        Params._dummy(),
+        "contentPath",
+        "Path to the content source.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentPath(self, value: str):
+        """Sets content path.
+
+        Parameters
+        ----------
+        value : str
+            Path to the content source.
+        """
+        return self._set(contentPath=value)
+
+    contentType = Param(
+        Params._dummy(),
+        "contentType",
+        "Set the content type to load following MIME specification.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentType(self, value: str):
+        """Sets content type following MIME specification.
+
+        Parameters
+        ----------
+        value : str
+            Content type string (MIME format).
+        """
+        return self._set(contentType=value)
+
+    storeContent = Param(
+        Params._dummy(),
+        "storeContent",
+        "Whether to include the raw file content in the output DataFrame "
+        "as a separate 'content' column, alongside the structured output.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreContent(self, value: bool):
+        """Sets whether to store raw file content.
+
+        Parameters
+        ----------
+        value : bool
+            True to include raw file content, False otherwise.
+        """
+        return self._set(storeContent=value)
+
+    titleFontSize = Param(
+        Params._dummy(),
+        "titleFontSize",
+        "Minimum font size threshold used as part of heuristic rules to detect "
+        "title elements based on formatting (e.g., bold, centered, capitalized).",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTitleFontSize(self, value: int):
+        """Sets minimum font size for detecting titles.
+
+        Parameters
+        ----------
+        value : int
+            Minimum font size threshold for title detection.
+        """
+        return self._set(titleFontSize=value)
+
+    inferTableStructure = Param(
+        Params._dummy(),
+        "inferTableStructure",
+        "Whether to generate an HTML table representation from structured table content. "
+        "When enabled, a full <table> element is added alongside cell-level elements, "
+        "based on row and column layout.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setInferTableStructure(self, value: bool):
+        """Sets whether to infer table structure.
+
+        Parameters
+        ----------
+        value : bool
+            True to generate HTML table representation, False otherwise.
+        """
+        return self._set(inferTableStructure=value)
+
+    includePageBreaks = Param(
+        Params._dummy(),
+        "includePageBreaks",
+        "Whether to detect and tag content with page break metadata. "
+        "In Word documents, this includes manual and section breaks. "
+        "In Excel files, this includes page breaks based on column boundaries.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIncludePageBreaks(self, value: bool):
+        """Sets whether to include page break metadata.
+
+        Parameters
+        ----------
+        value : bool
+            True to detect and tag page breaks, False otherwise.
+        """
+        return self._set(includePageBreaks=value)
+
+    ignoreExceptions = Param(
+        Params._dummy(),
+        "ignoreExceptions",
+        "Whether to ignore exceptions during processing.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIgnoreExceptions(self, value: bool):
+        """Sets whether to ignore exceptions during processing.
+
+        Parameters
+        ----------
+        value : bool
+            True to ignore exceptions, False otherwise.
+        """
+        return self._set(ignoreExceptions=value)
+
+    explodeDocs = Param(
+        Params._dummy(),
+        "explodeDocs",
+        "Whether to explode the documents into separate rows.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setExplodeDocs(self, value: bool):
+        """Sets whether to explode the documents into separate rows.
+
+        Parameters
+        ----------
+        value : bool
+            True to split documents into multiple rows, False to keep them in one row.
+        """
+        return self._set(explodeDocs=value)
 
 
 class HasEmailReaderProperties(Params):
@@ -144,6 +295,28 @@ class HasHTMLReaderProperties(Params):
         self._call_java("setHeadersPython", headers)
         return self
 
+    outputFormat = Param(
+        Params._dummy(),
+        "outputFormat",
+        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setOutputFormat(self, value: str):
+        """Sets output format for the table content.
+
+        Options
+        -------
+        - 'plain-text'
+        - 'html-table'
+        - 'json-table' (default)
+
+        Parameters
+        ----------
+        value : str
+            Output format for the table content.
+        """
+        return self._set(outputFormat=value)
 
 class HasPowerPointProperties(Params):
 
@@ -317,3 +490,206 @@ class HasChunkerProperties(Params):
 
     def setOverlapAll(self, value):
         return self._set(overlapAll=value)
+
+
+from pyspark.ml.param import Param, Params, TypeConverters
+
+
+class HasPdfProperties(Params):
+
+    pageNumCol = Param(
+        Params._dummy(),
+        "pageNumCol",
+        "Page number output column name.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setPageNumCol(self, value: str):
+        """Sets page number output column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the column for page numbers.
+        """
+        return self._set(pageNumCol=value)
+
+    originCol = Param(
+        Params._dummy(),
+        "originCol",
+        "Input column name with original path of file.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setOriginCol(self, value: str):
+        """Sets input column with original file path.
+
+        Parameters
+        ----------
+        value : str
+            Column name that stores the file path.
+        """
+        return self._set(originCol=value)
+
+    partitionNum = Param(
+        Params._dummy(),
+        "partitionNum",
+        "Number of partitions.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setPartitionNum(self, value: int):
+        """Sets number of partitions.
+
+        Parameters
+        ----------
+        value : int
+            Number of partitions to use.
+        """
+        return self._set(partitionNum=value)
+
+    storeSplittedPdf = Param(
+        Params._dummy(),
+        "storeSplittedPdf",
+        "Force to store bytes content of splitted pdf.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreSplittedPdf(self, value: bool):
+        """Sets whether to store byte content of split PDF pages.
+
+        Parameters
+        ----------
+        value : bool
+            True to store PDF page bytes, False otherwise.
+        """
+        return self._set(storeSplittedPdf=value)
+
+    splitPage = Param(
+        Params._dummy(),
+        "splitPage",
+        "Enable/disable splitting per page to identify page numbers and improve performance.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setSplitPage(self, value: bool):
+        """Sets whether to split PDF into pages.
+
+        Parameters
+        ----------
+        value : bool
+            True to split per page, False otherwise.
+        """
+        return self._set(splitPage=value)
+
+    onlyPageNum = Param(
+        Params._dummy(),
+        "onlyPageNum",
+        "Extract only page numbers.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setOnlyPageNum(self, value: bool):
+        """Sets whether to extract only page numbers.
+
+        Parameters
+        ----------
+        value : bool
+            True to extract only page numbers, False otherwise.
+        """
+        return self._set(onlyPageNum=value)
+
+    textStripper = Param(
+        Params._dummy(),
+        "textStripper",
+        "Text stripper type used for output layout and formatting.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setTextStripper(self, value: str):
+        """Sets text stripper type.
+
+        Parameters
+        ----------
+        value : str
+            Text stripper type for layout and formatting.
+        """
+        return self._set(textStripper=value)
+
+    sort = Param(
+        Params._dummy(),
+        "sort",
+        "Enable/disable sorting content on the page.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setSort(self, value: bool):
+        """Sets whether to sort content on the page.
+
+        Parameters
+        ----------
+        value : bool
+            True to sort content, False otherwise.
+        """
+        return self._set(sort=value)
+
+    extractCoordinates = Param(
+        Params._dummy(),
+        "extractCoordinates",
+        "Force extract coordinates of text.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setExtractCoordinates(self, value: bool):
+        """Sets whether to extract coordinates of text.
+
+        Parameters
+        ----------
+        value : bool
+            True to extract coordinates, False otherwise.
+        """
+        return self._set(extractCoordinates=value)
+
+    normalizeLigatures = Param(
+        Params._dummy(),
+        "normalizeLigatures",
+        "Whether to convert ligature chars such as 'fl' into its corresponding chars (e.g., {'f', 'l'}).",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setNormalizeLigatures(self, value: bool):
+        """Sets whether to normalize ligatures (e.g., fl → f + l).
+
+        Parameters
+        ----------
+        value : bool
+            True to normalize ligatures, False otherwise.
+        """
+        return self._set(normalizeLigatures=value)
+
+    readAsImage = Param(
+        Params._dummy(),
+        "readAsImage",
+        "Read PDF pages as images.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setReadAsImage(self, value: bool):
+        """Sets whether to read PDF pages as images.
+
+        Parameters
+        ----------
+        value : bool
+            True to read as images, False otherwise.
+        """
+        return self._set(readAsImage=value)
+
+    def setOutputCol(self, value):
+        """Sets output column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the Output Column
+        """
+        return self._set(outputCol=value)
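After this change, the params shared by every reader (`contentPath`, `contentType`, `outputCol`, `storeContent`, `explodeDocs`, and friends) live once in `HasReaderProperties`, and the PDF-specific ones in `HasPdfProperties`, instead of being redefined in each reader class. A toy sketch of the composition pattern (the `MyReader` class is hypothetical; this runs with plain PySpark and needs no Spark session):

```python
from sparknlp.partition.partition_properties import (
    HasPdfProperties,
    HasReaderProperties,
)


class MyReader(HasReaderProperties, HasPdfProperties):
    """Hypothetical reader composing the 6.1.4 mixins, as Reader2Image does."""


reader = MyReader()
reader.setContentPath("/tmp/docs")  # from HasReaderProperties
reader.setReadAsImage(True)         # from HasPdfProperties
reader.setOutputCol("image")        # setOutputCol is defined on HasPdfProperties
print(reader.getOrDefault(reader.contentPath))  # /tmp/docs
```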
sparknlp/reader/reader2doc.py
CHANGED
@@ -21,9 +21,10 @@ from sparknlp.partition.partition_properties import *
 
 class Reader2Doc(
     AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
-    HasHTMLReaderProperties,
     HasPowerPointProperties,
     HasTextReaderProperties
 ):
@@ -73,33 +74,6 @@ class Reader2Doc(
     name = "Reader2Doc"
     outputAnnotatorType = AnnotatorType.DOCUMENT
 
-    contentPath = Param(
-        Params._dummy(),
-        "contentPath",
-        "contentPath path to files to read",
-        typeConverter=TypeConverters.toString
-    )
-
-    outputCol = Param(
-        Params._dummy(),
-        "outputCol",
-        "output column name",
-        typeConverter=TypeConverters.toString
-    )
-
-    contentType = Param(
-        Params._dummy(),
-        "contentType",
-        "Set the content type to load following MIME specification",
-        typeConverter=TypeConverters.toString
-    )
-
-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
 
     flattenOutput = Param(
         Params._dummy(),
@@ -115,13 +89,6 @@ class Reader2Doc(
         typeConverter=TypeConverters.toFloat
     )
 
-    outputFormat = Param(
-        Params._dummy(),
-        "outputFormat",
-        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
-        typeConverter=TypeConverters.toString
-    )
-
     outputAsDocument = Param(
         Params._dummy(),
         "outputAsDocument",
@@ -151,47 +118,6 @@ class Reader2Doc(
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
-    def setContentPath(self, value):
-        """Sets content path.
-
-        Parameters
-        ----------
-        value : str
-            contentPath path to files to read
-        """
-        return self._set(contentPath=value)
-
-    def setContentType(self, value):
-        """
-        Set the content type to load following MIME specification
-
-        Parameters
-        ----------
-        value : str
-            content type to load following MIME specification
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
-        """
-        return self._set(explodeDocs=value)
-
-    def setOutputCol(self, value):
-        """Sets output column name.
-
-        Parameters
-        ----------
-        value : str
-            Name of the Output Column
-        """
-        return self._set(outputCol=value)
 
     def setFlattenOutput(self, value):
         """Sets whether to flatten the output to plain text with minimal metadata.
@@ -213,16 +139,6 @@ class Reader2Doc(
         """
         return self._set(titleThreshold=value)
 
-    def setOutputFormat(self, value):
-        """Sets the output format for the table content.
-
-        Parameters
-        ----------
-        value : str
-            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
-        """
-        return self._set(outputFormat=value)
-
     def setOutputAsDocument(self, value):
         """Sets whether to return all sentences joined into a single document.
 
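Downstream usage is unchanged by the refactor; the same setters are simply inherited now. A minimal sketch of `Reader2Doc` as a pipeline stage, following the pattern from the Spark NLP documentation (the input path is hypothetical):

```python
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.reader.reader2doc import Reader2Doc

spark = sparknlp.start()

reader2doc = (
    Reader2Doc()
    .setContentType("text/html")
    .setContentPath("/data/html/example.html")  # hypothetical path
    .setOutputCol("document")
)

# The reader pulls files from contentPath, so an empty DataFrame suffices.
empty_df = spark.createDataFrame([], "string").toDF("text")
result = Pipeline(stages=[reader2doc]).fit(empty_df).transform(empty_df)
result.selectExpr("explode(document) as doc").show(truncate=False)
```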
sparknlp/reader/reader2image.py
ADDED
@@ -0,0 +1,136 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark import keyword_only
+from pyspark.ml.param import TypeConverters, Params, Param
+
+from sparknlp.common import AnnotatorType
+from sparknlp.internal import AnnotatorTransformer
+from sparknlp.partition.partition_properties import *
+
+class Reader2Image(
+    AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
+    HasPdfProperties
+):
+    """
+    The Reader2Image annotator allows you to use the reading files with images more smoothly within existing
+    Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Image can be used for
+    extracting structured image content from various document types using Spark NLP readers. It supports
+    reading from many file types and returns parsed output as a structured Spark DataFrame.
+
+    Supported formats include HTML and Markdown.
+
+    == Example ==
+    This example demonstrates how to load HTML files with images and process them into a structured
+    Spark DataFrame using Reader2Image.
+
+    Expected output:
+    +-------------------+--------------------+
+    |           fileName|               image|
+    +-------------------+--------------------+
+    |example-images.html|[{image, example-...|
+    |example-images.html|[{image, example-...|
+    +-------------------+--------------------+
+
+    Schema:
+    root
+     |-- fileName: string (nullable = true)
+     |-- image: array (nullable = false)
+     |    |-- element: struct (containsNull = true)
+     |    |    |-- annotatorType: string (nullable = true)
+     |    |    |-- origin: string (nullable = true)
+     |    |    |-- height: integer (nullable = false)
+     |    |    |-- width: integer (nullable = false)
+     |    |    |-- nChannels: integer (nullable = false)
+     |    |    |-- mode: integer (nullable = false)
+     |    |    |-- result: binary (nullable = true)
+     |    |    |-- metadata: map (nullable = true)
+     |    |    |    |-- key: string
+     |    |    |    |-- value: string (valueContainsNull = true)
+     |    |    |-- text: string (nullable = true)
+    """
+
+    name = "Reader2Image"
+    outputAnnotatorType = AnnotatorType.IMAGE
+
+    userMessage = Param(
+        Params._dummy(),
+        "userMessage",
+        "Custom user message.",
+        typeConverter=TypeConverters.toString
+    )
+
+    promptTemplate = Param(
+        Params._dummy(),
+        "promptTemplate",
+        "Format of the output prompt.",
+        typeConverter=TypeConverters.toString
+    )
+
+    customPromptTemplate = Param(
+        Params._dummy(),
+        "customPromptTemplate",
+        "Custom prompt template for image models.",
+        typeConverter=TypeConverters.toString
+    )
+
+    @keyword_only
+    def __init__(self):
+        super(Reader2Image, self).__init__(classname="com.johnsnowlabs.reader.Reader2Image")
+        self._setDefault(
+            contentType="",
+            outputFormat="image",
+            explodeDocs=True,
+            userMessage="Describe this image",
+            promptTemplate="qwen2vl-chat",
+            readAsImage=True,
+            customPromptTemplate="",
+            ignoreExceptions=True
+        )
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setUserMessage(self, value: str):
+        """Sets custom user message.
+
+        Parameters
+        ----------
+        value : str
+            Custom user message to include.
+        """
+        return self._set(userMessage=value)
+
+    def setPromptTemplate(self, value: str):
+        """Sets format of the output prompt.
+
+        Parameters
+        ----------
+        value : str
+            Prompt template format.
+        """
+        return self._set(promptTemplate=value)
+
+    def setCustomPromptTemplate(self, value: str):
+        """Sets custom prompt template for image models.
+
+        Parameters
+        ----------
+        value : str
+            Custom prompt template string.
+        """
+        return self._set(customPromptTemplate=value)
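A minimal usage sketch for the new annotator (the HTML file is hypothetical; `userMessage`, `promptTemplate`, and `readAsImage` keep the defaults set in `__init__` unless overridden):

```python
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.reader.reader2image import Reader2Image

spark = sparknlp.start()

reader2image = (
    Reader2Image()
    .setContentType("text/html")
    .setContentPath("./example-images.html")  # hypothetical file
    .setOutputCol("image")
    .setUserMessage("Describe this image")
)

empty_df = spark.createDataFrame([], "string").toDF("text")
result = Pipeline(stages=[reader2image]).fit(empty_df).transform(empty_df)
result.printSchema()  # matches the schema shown in the docstring above
```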
sparknlp/reader/reader2table.py
CHANGED
@@ -13,14 +13,15 @@
 # limitations under the License.
 
 from pyspark import keyword_only
-from pyspark.ml.param import TypeConverters, Params, Param
 
 from sparknlp.common import AnnotatorType
 from sparknlp.internal import AnnotatorTransformer
 from sparknlp.partition.partition_properties import *
 
+
 class Reader2Table(
     AnnotatorTransformer,
+    HasReaderProperties,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
     HasHTMLReaderProperties,
@@ -31,34 +32,6 @@ class Reader2Table(
 
     outputAnnotatorType = AnnotatorType.DOCUMENT
 
-    contentPath = Param(
-        Params._dummy(),
-        "contentPath",
-        "contentPath path to files to read",
-        typeConverter=TypeConverters.toString
-    )
-
-    outputCol = Param(
-        Params._dummy(),
-        "outputCol",
-        "output column name",
-        typeConverter=TypeConverters.toString
-    )
-
-    contentType = Param(
-        Params._dummy(),
-        "contentType",
-        "Set the content type to load following MIME specification",
-        typeConverter=TypeConverters.toString
-    )
-
-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
-
     flattenOutput = Param(
         Params._dummy(),
         "flattenOutput",
@@ -73,13 +46,6 @@ class Reader2Table(
         typeConverter=TypeConverters.toFloat
     )
 
-    outputFormat = Param(
-        Params._dummy(),
-        "outputFormat",
-        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
-        typeConverter=TypeConverters.toString
-    )
-
     @keyword_only
     def __init__(self):
         super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
@@ -90,48 +56,6 @@ class Reader2Table(
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
-    def setContentPath(self, value):
-        """Sets content path.
-
-        Parameters
-        ----------
-        value : str
-            contentPath path to files to read
-        """
-        return self._set(contentPath=value)
-
-    def setContentType(self, value):
-        """
-        Set the content type to load following MIME specification
-
-        Parameters
-        ----------
-        value : str
-            content type to load following MIME specification
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
-        """
-        return self._set(explodeDocs=value)
-
-    def setOutputCol(self, value):
-        """Sets output column name.
-
-        Parameters
-        ----------
-        value : str
-            Name of the Output Column
-        """
-        return self._set(outputCol=value)
-
     def setFlattenOutput(self, value):
         """Sets whether to flatten the output to plain text with minimal metadata.
 
@@ -151,13 +75,3 @@ class Reader2Table(
             Minimum font size threshold for title detection in PDF docs
         """
         return self._set(titleThreshold=value)
-
-    def setOutputFormat(self, value):
-        """Sets the output format for the table content.
-
-        Parameters
-        ----------
-        value : str
-            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
-        """
-        return self._set(outputFormat=value)
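`Reader2Table` keeps its surface API, but `outputFormat`/`setOutputFormat` now come from the shared `HasHTMLReaderProperties` mixin, and `setOutputCol` is no longer defined on the class itself. A minimal sketch (the spreadsheet path is hypothetical):

```python
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.reader.reader2table import Reader2Table

spark = sparknlp.start()

reader2table = (
    Reader2Table()
    .setContentPath("/data/tables/example.xlsx")  # hypothetical path
    .setContentType("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
    .setOutputFormat("html-table")  # default is 'json-table'
)
# setOutputCol was removed from this class; the generic Params.set still
# assigns the outputCol param if no mixin provides a dedicated setter.
reader2table.set(reader2table.outputCol, "table")

empty_df = spark.createDataFrame([], "string").toDF("text")
result = Pipeline(stages=[reader2table]).fit(empty_df).transform(empty_df)
```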
{spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/WHEEL
File without changes
{spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/top_level.txt
File without changes