spark-nlp 6.1.3rc1__py2.py3-none-any.whl → 6.1.5__py2.py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of spark-nlp might be problematic. Click here for more details.

@@ -12,7 +12,6 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  from pyspark import keyword_only
15
- from pyspark.ml.param import TypeConverters, Params, Param
16
15
 
17
16
  from sparknlp.common import AnnotatorType
18
17
  from sparknlp.internal import AnnotatorTransformer
@@ -21,9 +20,10 @@ from sparknlp.partition.partition_properties import *
21
20
 
22
21
  class Reader2Doc(
23
22
  AnnotatorTransformer,
23
+ HasReaderProperties,
24
+ HasHTMLReaderProperties,
24
25
  HasEmailReaderProperties,
25
26
  HasExcelReaderProperties,
26
- HasHTMLReaderProperties,
27
27
  HasPowerPointProperties,
28
28
  HasTextReaderProperties
29
29
  ):
@@ -68,59 +68,28 @@ class Reader2Doc(
68
68
  |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
69
69
  |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
70
70
  +------------------------------------------------------------------------------------------------------------------------------------+
71
- """
71
+ """
72
72
 
73
73
  name = "Reader2Doc"
74
- outputAnnotatorType = AnnotatorType.DOCUMENT
75
-
76
- contentPath = Param(
77
- Params._dummy(),
78
- "contentPath",
79
- "contentPath path to files to read",
80
- typeConverter=TypeConverters.toString
81
- )
82
-
83
- outputCol = Param(
84
- Params._dummy(),
85
- "outputCol",
86
- "output column name",
87
- typeConverter=TypeConverters.toString
88
- )
89
-
90
- contentType = Param(
91
- Params._dummy(),
92
- "contentType",
93
- "Set the content type to load following MIME specification",
94
- typeConverter=TypeConverters.toString
95
- )
96
74
 
97
- explodeDocs = Param(
98
- Params._dummy(),
99
- "explodeDocs",
100
- "whether to explode the documents into separate rows",
101
- typeConverter=TypeConverters.toBoolean
102
- )
75
+ outputAnnotatorType = AnnotatorType.DOCUMENT
103
76
 
104
- flattenOutput = Param(
77
+ excludeNonText = Param(
105
78
  Params._dummy(),
106
- "flattenOutput",
107
- "If true, output is flattened to plain text with minimal metadata",
79
+ "excludeNonText",
80
+ "Whether to exclude non-text content from the output. Default is False.",
108
81
  typeConverter=TypeConverters.toBoolean
109
82
  )
110
83
 
111
- titleThreshold = Param(
112
- Params._dummy(),
113
- "titleThreshold",
114
- "Minimum font size threshold for title detection in PDF docs",
115
- typeConverter=TypeConverters.toFloat
116
- )
84
+ def setExcludeNonText(self, value):
85
+ """Sets whether to exclude non-text content from the output.
117
86
 
118
- outputFormat = Param(
119
- Params._dummy(),
120
- "outputFormat",
121
- "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
122
- typeConverter=TypeConverters.toString
123
- )
87
+ Parameters
88
+ ----------
89
+ value : bool
90
+ Whether to exclude non-text content from the output. Default is False.
91
+ """
92
+ return self._set(excludeNonText=value)
124
93
 
125
94
  @keyword_only
126
95
  def __init__(self):
@@ -136,75 +105,3 @@ class Reader2Doc(
136
105
  def setParams(self):
137
106
  kwargs = self._input_kwargs
138
107
  return self._set(**kwargs)
139
-
140
- def setContentPath(self, value):
141
- """Sets content path.
142
-
143
- Parameters
144
- ----------
145
- value : str
146
- contentPath path to files to read
147
- """
148
- return self._set(contentPath=value)
149
-
150
- def setContentType(self, value):
151
- """
152
- Set the content type to load following MIME specification
153
-
154
- Parameters
155
- ----------
156
- value : str
157
- content type to load following MIME specification
158
- """
159
- return self._set(contentType=value)
160
-
161
- def setExplodeDocs(self, value):
162
- """Sets whether to explode the documents into separate rows.
163
-
164
-
165
- Parameters
166
- ----------
167
- value : boolean
168
- Whether to explode the documents into separate rows
169
- """
170
- return self._set(explodeDocs=value)
171
-
172
- def setOutputCol(self, value):
173
- """Sets output column name.
174
-
175
- Parameters
176
- ----------
177
- value : str
178
- Name of the Output Column
179
- """
180
- return self._set(outputCol=value)
181
-
182
- def setFlattenOutput(self, value):
183
- """Sets whether to flatten the output to plain text with minimal metadata.
184
-
185
- Parameters
186
- ----------
187
- value : bool
188
- If true, output is flattened to plain text with minimal metadata
189
- """
190
- return self._set(flattenOutput=value)
191
-
192
- def setTitleThreshold(self, value):
193
- """Sets the minimum font size threshold for title detection in PDF documents.
194
-
195
- Parameters
196
- ----------
197
- value : float
198
- Minimum font size threshold for title detection in PDF docs
199
- """
200
- return self._set(titleThreshold=value)
201
-
202
- def setOutputFormat(self, value):
203
- """Sets the output format for the table content.
204
-
205
- Parameters
206
- ----------
207
- value : str
208
- Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
209
- """
210
- return self._set(outputFormat=value)
@@ -20,46 +20,84 @@ from sparknlp.partition.partition_properties import *
20
20
 
21
21
  class Reader2Image(
22
22
  AnnotatorTransformer,
23
- HasHTMLReaderProperties
23
+ HasReaderProperties,
24
+ HasHTMLReaderProperties,
25
+ HasPdfProperties
24
26
  ):
27
+ """
28
+ The Reader2Image annotator lets you read files that contain images more smoothly within existing
29
+ Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Image can be used for
30
+ extracting structured image content from various document types using Spark NLP readers. It supports
31
+ reading from many file types and returns parsed output as a structured Spark DataFrame.
32
+
33
+ Supported formats include HTML and Markdown.
34
+
35
+ == Example ==
36
+ This example demonstrates how to load HTML files with images and process them into a structured
37
+ Spark DataFrame using Reader2Image.
38
+
39
+ Expected output:
40
+ +-------------------+--------------------+
41
+ | fileName| image|
42
+ +-------------------+--------------------+
43
+ |example-images.html|[{image, example-...|
44
+ |example-images.html|[{image, example-...|
45
+ +-------------------+--------------------+
46
+
47
+ Schema:
48
+ root
49
+ |-- fileName: string (nullable = true)
50
+ |-- image: array (nullable = false)
51
+ | |-- element: struct (containsNull = true)
52
+ | | |-- annotatorType: string (nullable = true)
53
+ | | |-- origin: string (nullable = true)
54
+ | | |-- height: integer (nullable = false)
55
+ | | |-- width: integer (nullable = false)
56
+ | | |-- nChannels: integer (nullable = false)
57
+ | | |-- mode: integer (nullable = false)
58
+ | | |-- result: binary (nullable = true)
59
+ | | |-- metadata: map (nullable = true)
60
+ | | | |-- key: string
61
+ | | | |-- value: string (valueContainsNull = true)
62
+ | | |-- text: string (nullable = true)
63
+ """
64
+
25
65
  name = "Reader2Image"
26
66
  outputAnnotatorType = AnnotatorType.IMAGE
27
67
 
28
- contentPath = Param(
68
+ userMessage = Param(
29
69
  Params._dummy(),
30
- "contentPath",
31
- "contentPath path to files to read",
70
+ "userMessage",
71
+ "Custom user message.",
32
72
  typeConverter=TypeConverters.toString
33
73
  )
34
74
 
35
- outputCol = Param(
75
+ promptTemplate = Param(
36
76
  Params._dummy(),
37
- "outputCol",
38
- "output column name",
77
+ "promptTemplate",
78
+ "Format of the output prompt.",
39
79
  typeConverter=TypeConverters.toString
40
80
  )
41
81
 
42
- contentType = Param(
82
+ customPromptTemplate = Param(
43
83
  Params._dummy(),
44
- "contentType",
45
- "Set the content type to load following MIME specification",
84
+ "customPromptTemplate",
85
+ "Custom prompt template for image models.",
46
86
  typeConverter=TypeConverters.toString
47
87
  )
48
88
 
49
- explodeDocs = Param(
50
- Params._dummy(),
51
- "explodeDocs",
52
- "whether to explode the documents into separate rows",
53
- typeConverter=TypeConverters.toBoolean
54
- )
55
-
56
89
  @keyword_only
57
90
  def __init__(self):
58
91
  super(Reader2Image, self).__init__(classname="com.johnsnowlabs.reader.Reader2Image")
59
92
  self._setDefault(
60
- outputCol="document",
93
+ contentType="",
94
+ outputFormat="image",
61
95
  explodeDocs=True,
62
- contentType=""
96
+ userMessage="Describe this image",
97
+ promptTemplate="qwen2vl-chat",
98
+ readAsImage=True,
99
+ customPromptTemplate="",
100
+ ignoreExceptions=True
63
101
  )
64
102
 
65
103
  @keyword_only
@@ -67,44 +105,32 @@ class Reader2Image(
67
105
  kwargs = self._input_kwargs
68
106
  return self._set(**kwargs)
69
107
 
70
- def setContentPath(self, value):
71
- """Sets content path.
108
+ def setUserMessage(self, value: str):
109
+ """Sets custom user message.
72
110
 
73
111
  Parameters
74
112
  ----------
75
113
  value : str
76
- contentPath path to files to read
114
+ Custom user message to include.
77
115
  """
78
- return self._set(contentPath=value)
116
+ return self._set(userMessage=value)
79
117
 
80
- def setContentType(self, value):
81
- """
82
- Set the content type to load following MIME specification
118
+ def setPromptTemplate(self, value: str):
119
+ """Sets format of the output prompt.
83
120
 
84
121
  Parameters
85
122
  ----------
86
123
  value : str
87
- content type to load following MIME specification
88
- """
89
- return self._set(contentType=value)
90
-
91
- def setExplodeDocs(self, value):
92
- """Sets whether to explode the documents into separate rows.
93
-
94
-
95
- Parameters
96
- ----------
97
- value : boolean
98
- Whether to explode the documents into separate rows
124
+ Prompt template format.
99
125
  """
100
- return self._set(explodeDocs=value)
126
+ return self._set(promptTemplate=value)
101
127
 
102
- def setOutputCol(self, value):
103
- """Sets output column name.
128
+ def setCustomPromptTemplate(self, value: str):
129
+ """Sets custom prompt template for image models.
104
130
 
105
131
  Parameters
106
132
  ----------
107
133
  value : str
108
- Name of the Output Column
134
+ Custom prompt template string.
109
135
  """
110
- return self._set(outputCol=value)
136
+ return self._set(customPromptTemplate=value)
@@ -13,14 +13,15 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from pyspark import keyword_only
16
- from pyspark.ml.param import TypeConverters, Params, Param
17
16
 
18
17
  from sparknlp.common import AnnotatorType
19
18
  from sparknlp.internal import AnnotatorTransformer
20
19
  from sparknlp.partition.partition_properties import *
21
20
 
21
+
22
22
  class Reader2Table(
23
23
  AnnotatorTransformer,
24
+ HasReaderProperties,
24
25
  HasEmailReaderProperties,
25
26
  HasExcelReaderProperties,
26
27
  HasHTMLReaderProperties,
@@ -31,55 +32,6 @@ class Reader2Table(
31
32
 
32
33
  outputAnnotatorType = AnnotatorType.DOCUMENT
33
34
 
34
- contentPath = Param(
35
- Params._dummy(),
36
- "contentPath",
37
- "contentPath path to files to read",
38
- typeConverter=TypeConverters.toString
39
- )
40
-
41
- outputCol = Param(
42
- Params._dummy(),
43
- "outputCol",
44
- "output column name",
45
- typeConverter=TypeConverters.toString
46
- )
47
-
48
- contentType = Param(
49
- Params._dummy(),
50
- "contentType",
51
- "Set the content type to load following MIME specification",
52
- typeConverter=TypeConverters.toString
53
- )
54
-
55
- explodeDocs = Param(
56
- Params._dummy(),
57
- "explodeDocs",
58
- "whether to explode the documents into separate rows",
59
- typeConverter=TypeConverters.toBoolean
60
- )
61
-
62
- flattenOutput = Param(
63
- Params._dummy(),
64
- "flattenOutput",
65
- "If true, output is flattened to plain text with minimal metadata",
66
- typeConverter=TypeConverters.toBoolean
67
- )
68
-
69
- titleThreshold = Param(
70
- Params._dummy(),
71
- "titleThreshold",
72
- "Minimum font size threshold for title detection in PDF docs",
73
- typeConverter=TypeConverters.toFloat
74
- )
75
-
76
- outputFormat = Param(
77
- Params._dummy(),
78
- "outputFormat",
79
- "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
80
- typeConverter=TypeConverters.toString
81
- )
82
-
83
35
  @keyword_only
84
36
  def __init__(self):
85
37
  super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
@@ -89,75 +41,3 @@ class Reader2Table(
89
41
  def setParams(self):
90
42
  kwargs = self._input_kwargs
91
43
  return self._set(**kwargs)
92
-
93
- def setContentPath(self, value):
94
- """Sets content path.
95
-
96
- Parameters
97
- ----------
98
- value : str
99
- contentPath path to files to read
100
- """
101
- return self._set(contentPath=value)
102
-
103
- def setContentType(self, value):
104
- """
105
- Set the content type to load following MIME specification
106
-
107
- Parameters
108
- ----------
109
- value : str
110
- content type to load following MIME specification
111
- """
112
- return self._set(contentType=value)
113
-
114
- def setExplodeDocs(self, value):
115
- """Sets whether to explode the documents into separate rows.
116
-
117
-
118
- Parameters
119
- ----------
120
- value : boolean
121
- Whether to explode the documents into separate rows
122
- """
123
- return self._set(explodeDocs=value)
124
-
125
- def setOutputCol(self, value):
126
- """Sets output column name.
127
-
128
- Parameters
129
- ----------
130
- value : str
131
- Name of the Output Column
132
- """
133
- return self._set(outputCol=value)
134
-
135
- def setFlattenOutput(self, value):
136
- """Sets whether to flatten the output to plain text with minimal metadata.
137
-
138
- Parameters
139
- ----------
140
- value : bool
141
- If true, output is flattened to plain text with minimal metadata
142
- """
143
- return self._set(flattenOutput=value)
144
-
145
- def setTitleThreshold(self, value):
146
- """Sets the minimum font size threshold for title detection in PDF documents.
147
-
148
- Parameters
149
- ----------
150
- value : float
151
- Minimum font size threshold for title detection in PDF docs
152
- """
153
- return self._set(titleThreshold=value)
154
-
155
- def setOutputFormat(self, value):
156
- """Sets the output format for the table content.
157
-
158
- Parameters
159
- ----------
160
- value : str
161
- Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
162
- """
163
- return self._set(outputFormat=value)
@@ -0,0 +1,159 @@
1
+ # Copyright 2017-2025 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from pyspark import keyword_only
16
+
17
+ from sparknlp.common import AnnotatorType
18
+ from sparknlp.internal import AnnotatorTransformer
19
+ from sparknlp.partition.partition_properties import *
20
+
21
+ class ReaderAssembler(
22
+ AnnotatorTransformer,
23
+ HasReaderProperties,
24
+ HasHTMLReaderProperties,
25
+ HasEmailReaderProperties,
26
+ HasExcelReaderProperties,
27
+ HasPowerPointProperties,
28
+ HasTextReaderProperties,
29
+ HasPdfProperties
30
+ ):
31
+ """
32
+ The ReaderAssembler annotator provides a unified interface for combining multiple Spark NLP
33
+ readers (such as Reader2Doc, Reader2Table, and Reader2Image) into a single, configurable
34
+ component. It automatically orchestrates the execution of different readers based on input type,
35
+ configured priorities, and fallback strategies, allowing you to handle diverse content formats
36
+ without manually chaining multiple readers in your pipeline.
37
+
38
+ ReaderAssembler simplifies the process of building flexible pipelines capable of ingesting and
39
+ processing documents, tables, and images in a consistent way. It handles reader selection,
40
+ ordering, and fault-tolerance internally, ensuring that pipelines remain concise, robust, and
41
+ easy to maintain.
42
+
43
+ Examples
44
+ --------
45
+ >>> from johnsnowlabs.reader import ReaderAssembler
46
+ >>> from pyspark.ml import Pipeline
47
+ >>>
48
+ >>> reader_assembler = ReaderAssembler() \\
49
+ ... .setContentType("text/html") \\
50
+ ... .setContentPath("/table-image.html") \\
51
+ ... .setOutputCol("document")
52
+ >>>
53
+ >>> pipeline = Pipeline(stages=[reader_assembler])
54
+ >>> pipeline_model = pipeline.fit(empty_data_set)
55
+ >>> result_df = pipeline_model.transform(empty_data_set)
56
+ >>>
57
+ >>> result_df.show()
58
+ +--------+--------------------+--------------------+--------------------+---------+
59
+ |fileName| document_text| document_table| document_image|exception|
60
+ +--------+--------------------+--------------------+--------------------+---------+
61
+ | null|[{'document', 0, 26...|[{'document', 0, 50...|[{'image', , 5, 5, ...| null|
62
+ +--------+--------------------+--------------------+--------------------+---------+
63
+
64
+ This annotator is especially useful when working with heterogeneous input data — for example,
65
+ when a dataset includes PDFs, spreadsheets, and images — allowing Spark NLP to automatically
66
+ invoke the appropriate reader for each file type while preserving a unified schema in the output.
67
+ """
68
+
69
+
70
+ name = 'ReaderAssembler'
71
+
72
+ outputAnnotatorType = AnnotatorType.DOCUMENT
73
+
74
+ excludeNonText = Param(
75
+ Params._dummy(),
76
+ "excludeNonText",
77
+ "Whether to exclude non-text content from the output. Default is False.",
78
+ typeConverter=TypeConverters.toBoolean
79
+ )
80
+
81
+ userMessage = Param(
82
+ Params._dummy(),
83
+ "userMessage",
84
+ "Custom user message.",
85
+ typeConverter=TypeConverters.toString
86
+ )
87
+
88
+ promptTemplate = Param(
89
+ Params._dummy(),
90
+ "promptTemplate",
91
+ "Format of the output prompt.",
92
+ typeConverter=TypeConverters.toString
93
+ )
94
+
95
+ customPromptTemplate = Param(
96
+ Params._dummy(),
97
+ "customPromptTemplate",
98
+ "Custom prompt template for image models.",
99
+ typeConverter=TypeConverters.toString
100
+ )
101
+
102
+ @keyword_only
103
+ def __init__(self):
104
+ super(ReaderAssembler, self).__init__(classname="com.johnsnowlabs.reader.ReaderAssembler")
105
+ self._setDefault(contentType="",
106
+ explodeDocs=False,
107
+ userMessage="Describe this image",
108
+ promptTemplate="qwen2vl-chat",
109
+ readAsImage=True,
110
+ customPromptTemplate="",
111
+ ignoreExceptions=True,
112
+ flattenOutput=False,
113
+ titleThreshold=18)
114
+
115
+
116
+ @keyword_only
117
+ def setParams(self):
118
+ kwargs = self._input_kwargs
119
+ return self._set(**kwargs)
120
+
121
+ def setExcludeNonText(self, value):
122
+ """Sets whether to exclude non-text content from the output.
123
+
124
+ Parameters
125
+ ----------
126
+ value : bool
127
+ Whether to exclude non-text content from the output. Default is False.
128
+ """
129
+ return self._set(excludeNonText=value)
130
+
131
+ def setUserMessage(self, value: str):
132
+ """Sets custom user message.
133
+
134
+ Parameters
135
+ ----------
136
+ value : str
137
+ Custom user message to include.
138
+ """
139
+ return self._set(userMessage=value)
140
+
141
+ def setPromptTemplate(self, value: str):
142
+ """Sets format of the output prompt.
143
+
144
+ Parameters
145
+ ----------
146
+ value : str
147
+ Prompt template format.
148
+ """
149
+ return self._set(promptTemplate=value)
150
+
151
+ def setCustomPromptTemplate(self, value: str):
152
+ """Sets custom prompt template for image models.
153
+
154
+ Parameters
155
+ ----------
156
+ value : str
157
+ Custom prompt template string.
158
+ """
159
+ return self._set(customPromptTemplate=value)