spark-nlp 6.1.3rc1__py2.py3-none-any.whl → 6.1.5__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of spark-nlp might be problematic.
- {spark_nlp-6.1.3rc1.dist-info → spark_nlp-6.1.5.dist-info}/METADATA +5 -5
- {spark_nlp-6.1.3rc1.dist-info → spark_nlp-6.1.5.dist-info}/RECORD +15 -17
- sparknlp/__init__.py +1 -1
- sparknlp/annotator/ner/__init__.py +1 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +237 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +4 -4
- sparknlp/base/__init__.py +1 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/partition/partition_properties.py +444 -1
- sparknlp/reader/reader2doc.py +15 -118
- sparknlp/reader/reader2image.py +69 -43
- sparknlp/reader/reader2table.py +2 -122
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/annotator/classifier_dl/roberta_bert_for_zero_shot_classification.py +0 -225
- sparknlp/annotator/extractor.py +0 -174
- sparknlp/annotator/openai_completion.py +0 -352
- sparknlp/annotator/openai_embeddings.py +0 -132
- sparknlp/base/token2_chunk.py +0 -76
- {spark_nlp-6.1.3rc1.dist-info → spark_nlp-6.1.5.dist-info}/WHEEL +0 -0
- {spark_nlp-6.1.3rc1.dist-info → spark_nlp-6.1.5.dist-info}/top_level.txt +0 -0
sparknlp/reader/reader2doc.py
CHANGED

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from pyspark import keyword_only
-from pyspark.ml.param import TypeConverters, Params, Param

 from sparknlp.common import AnnotatorType
 from sparknlp.internal import AnnotatorTransformer
@@ -21,9 +20,10 @@ from sparknlp.partition.partition_properties import *

 class Reader2Doc(
     AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
-    HasHTMLReaderProperties,
     HasPowerPointProperties,
     HasTextReaderProperties
 ):
@@ -68,59 +68,28 @@ class Reader2Doc(
     |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
     |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
     +------------------------------------------------------------------------------------------------------------------------------------+
-    """
+    """

     name = "Reader2Doc"
-    outputAnnotatorType = AnnotatorType.DOCUMENT
-
-    contentPath = Param(
-        Params._dummy(),
-        "contentPath",
-        "contentPath path to files to read",
-        typeConverter=TypeConverters.toString
-    )
-
-    outputCol = Param(
-        Params._dummy(),
-        "outputCol",
-        "output column name",
-        typeConverter=TypeConverters.toString
-    )
-
-    contentType = Param(
-        Params._dummy(),
-        "contentType",
-        "Set the content type to load following MIME specification",
-        typeConverter=TypeConverters.toString
-    )

-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
+    outputAnnotatorType = AnnotatorType.DOCUMENT

-    flattenOutput = Param(
+    excludeNonText = Param(
         Params._dummy(),
-        "flattenOutput",
-        "If true, output is flattened to plain text with minimal metadata",
+        "excludeNonText",
+        "Whether to exclude non-text content from the output. Default is False.",
         typeConverter=TypeConverters.toBoolean
     )

-    titleThreshold = Param(
-        Params._dummy(),
-        "titleThreshold",
-        "Minimum font size threshold for title detection in PDF docs",
-        typeConverter=TypeConverters.toFloat
-    )
+    def setExcludeNonText(self, value):
+        """Sets whether to exclude non-text content from the output.

-    outputFormat = Param(
-        Params._dummy(),
-        "outputFormat",
-        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
-        typeConverter=TypeConverters.toString
-    )
+        Parameters
+        ----------
+        value : bool
+            Whether to exclude non-text content from the output. Default is False.
+        """
+        return self._set(excludeNonText=value)

     @keyword_only
     def __init__(self):
@@ -136,75 +105,3 @@ class Reader2Doc(
     def setParams(self):
         kwargs = self._input_kwargs
         return self._set(**kwargs)
-
-    def setContentPath(self, value):
-        """Sets content path.
-
-        Parameters
-        ----------
-        value : str
-            contentPath path to files to read
-        """
-        return self._set(contentPath=value)
-
-    def setContentType(self, value):
-        """
-        Set the content type to load following MIME specification
-
-        Parameters
-        ----------
-        value : str
-            content type to load following MIME specification
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
-        """
-        return self._set(explodeDocs=value)
-
-    def setOutputCol(self, value):
-        """Sets output column name.
-
-        Parameters
-        ----------
-        value : str
-            Name of the Output Column
-        """
-        return self._set(outputCol=value)
-
-    def setFlattenOutput(self, value):
-        """Sets whether to flatten the output to plain text with minimal metadata.
-
-        Parameters
-        ----------
-        value : bool
-            If true, output is flattened to plain text with minimal metadata
-        """
-        return self._set(flattenOutput=value)
-
-    def setTitleThreshold(self, value):
-        """Sets the minimum font size threshold for title detection in PDF documents.
-
-        Parameters
-        ----------
-        value : float
-            Minimum font size threshold for title detection in PDF docs
-        """
-        return self._set(titleThreshold=value)
-
-    def setOutputFormat(self, value):
-        """Sets the output format for the table content.
-
-        Parameters
-        ----------
-        value : str
-            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
-        """
-        return self._set(outputFormat=value)
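For context, a minimal usage sketch of the updated Reader2Doc (illustrative only, not part of the diff). It assumes that the parameters removed above (contentPath, contentType, outputCol, explodeDocs) are now supplied by the HasReaderProperties mixin that this release adds as a base class, and the input directory is hypothetical.

```python
# Illustrative sketch only. Assumes setContentPath/setContentType/setOutputCol now
# come from the shared HasReaderProperties mixin; the input directory is hypothetical.
import sparknlp
from sparknlp.reader.reader2doc import Reader2Doc
from pyspark.ml import Pipeline

spark = sparknlp.start()

reader2doc = (
    Reader2Doc()
    .setContentType("application/pdf")   # MIME type of the files to read
    .setContentPath("/data/pdfs")        # hypothetical input directory
    .setOutputCol("document")
    .setExcludeNonText(True)             # new parameter introduced in this diff
)

# Reader-style transformers are fitted on a dummy DataFrame; input comes from contentPath.
empty_df = spark.createDataFrame([[""]]).toDF("text")
result = Pipeline(stages=[reader2doc]).fit(empty_df).transform(empty_df)
result.select("document").show(truncate=False)
```

The class docstring above shows the expected document rows for a PDF input.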
sparknlp/reader/reader2image.py
CHANGED

@@ -20,46 +20,84 @@ from sparknlp.partition.partition_properties import *

 class Reader2Image(
     AnnotatorTransformer,
-
+    HasReaderProperties,
+    HasHTMLReaderProperties,
+    HasPdfProperties
 ):
+    """
+    The Reader2Image annotator allows you to use the reading files with images more smoothly within existing
+    Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Image can be used for
+    extracting structured image content from various document types using Spark NLP readers. It supports
+    reading from many file types and returns parsed output as a structured Spark DataFrame.
+
+    Supported formats include HTML and Markdown.
+
+    == Example ==
+    This example demonstrates how to load HTML files with images and process them into a structured
+    Spark DataFrame using Reader2Image.
+
+    Expected output:
+    +-------------------+--------------------+
+    |           fileName|               image|
+    +-------------------+--------------------+
+    |example-images.html|[{image, example-...|
+    |example-images.html|[{image, example-...|
+    +-------------------+--------------------+
+
+    Schema:
+    root
+     |-- fileName: string (nullable = true)
+     |-- image: array (nullable = false)
+     |    |-- element: struct (containsNull = true)
+     |    |    |-- annotatorType: string (nullable = true)
+     |    |    |-- origin: string (nullable = true)
+     |    |    |-- height: integer (nullable = false)
+     |    |    |-- width: integer (nullable = false)
+     |    |    |-- nChannels: integer (nullable = false)
+     |    |    |-- mode: integer (nullable = false)
+     |    |    |-- result: binary (nullable = true)
+     |    |    |-- metadata: map (nullable = true)
+     |    |    |    |-- key: string
+     |    |    |    |-- value: string (valueContainsNull = true)
+     |    |    |-- text: string (nullable = true)
+    """
+
     name = "Reader2Image"
     outputAnnotatorType = AnnotatorType.IMAGE

-
+    userMessage = Param(
         Params._dummy(),
-        "
-        "
+        "userMessage",
+        "Custom user message.",
         typeConverter=TypeConverters.toString
     )

-
+    promptTemplate = Param(
         Params._dummy(),
-        "
-        "output
+        "promptTemplate",
+        "Format of the output prompt.",
         typeConverter=TypeConverters.toString
     )

-
+    customPromptTemplate = Param(
         Params._dummy(),
-        "
-        "
+        "customPromptTemplate",
+        "Custom prompt template for image models.",
         typeConverter=TypeConverters.toString
     )

-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
-
     @keyword_only
     def __init__(self):
         super(Reader2Image, self).__init__(classname="com.johnsnowlabs.reader.Reader2Image")
         self._setDefault(
-
+            contentType="",
+            outputFormat="image",
             explodeDocs=True,
-
+            userMessage="Describe this image",
+            promptTemplate="qwen2vl-chat",
+            readAsImage=True,
+            customPromptTemplate="",
+            ignoreExceptions=True
         )

     @keyword_only
@@ -67,44 +105,32 @@ class Reader2Image(
         kwargs = self._input_kwargs
         return self._set(**kwargs)

-    def
-        """Sets
+    def setUserMessage(self, value: str):
+        """Sets custom user message.

         Parameters
         ----------
         value : str
-
+            Custom user message to include.
         """
-        return self._set(
+        return self._set(userMessage=value)

-    def
-        """
-        Set the content type to load following MIME specification
+    def setPromptTemplate(self, value: str):
+        """Sets format of the output prompt.

         Parameters
         ----------
         value : str
-
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
+            Prompt template format.
         """
-        return self._set(
+        return self._set(promptTemplate=value)

-    def
-        """Sets
+    def setCustomPromptTemplate(self, value: str):
+        """Sets custom prompt template for image models.

         Parameters
         ----------
         value : str
-
+            Custom prompt template string.
         """
-        return self._set(
+        return self._set(customPromptTemplate=value)
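As above, a hypothetical usage sketch for the new Reader2Image prompt parameters (not part of the diff). The content path is made up, setContentPath/setContentType/setOutputCol are assumed to come from the HasReaderProperties mixin added in this release, and the defaults quoted ("Describe this image", "qwen2vl-chat") are taken from the _setDefault call above.

```python
# Illustrative sketch only; the input directory is hypothetical.
import sparknlp
from sparknlp.reader.reader2image import Reader2Image
from pyspark.ml import Pipeline

spark = sparknlp.start()

reader2image = (
    Reader2Image()
    .setContentType("text/html")                  # HTML pages containing <img> elements
    .setContentPath("/data/html-with-images")     # hypothetical input directory
    .setOutputCol("image")
    .setUserMessage("List every object visible in this image")  # overrides the default message
    .setPromptTemplate("qwen2vl-chat")            # default template name from _setDefault
)

empty_df = spark.createDataFrame([[""]]).toDF("text")
images_df = Pipeline(stages=[reader2image]).fit(empty_df).transform(empty_df)
images_df.select("fileName", "image").show()
```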
sparknlp/reader/reader2table.py
CHANGED

@@ -13,14 +13,15 @@
 # limitations under the License.

 from pyspark import keyword_only
-from pyspark.ml.param import TypeConverters, Params, Param

 from sparknlp.common import AnnotatorType
 from sparknlp.internal import AnnotatorTransformer
 from sparknlp.partition.partition_properties import *

+
 class Reader2Table(
     AnnotatorTransformer,
+    HasReaderProperties,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
     HasHTMLReaderProperties,
@@ -31,55 +32,6 @@ class Reader2Table(

     outputAnnotatorType = AnnotatorType.DOCUMENT

-    contentPath = Param(
-        Params._dummy(),
-        "contentPath",
-        "contentPath path to files to read",
-        typeConverter=TypeConverters.toString
-    )
-
-    outputCol = Param(
-        Params._dummy(),
-        "outputCol",
-        "output column name",
-        typeConverter=TypeConverters.toString
-    )
-
-    contentType = Param(
-        Params._dummy(),
-        "contentType",
-        "Set the content type to load following MIME specification",
-        typeConverter=TypeConverters.toString
-    )
-
-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
-
-    flattenOutput = Param(
-        Params._dummy(),
-        "flattenOutput",
-        "If true, output is flattened to plain text with minimal metadata",
-        typeConverter=TypeConverters.toBoolean
-    )
-
-    titleThreshold = Param(
-        Params._dummy(),
-        "titleThreshold",
-        "Minimum font size threshold for title detection in PDF docs",
-        typeConverter=TypeConverters.toFloat
-    )
-
-    outputFormat = Param(
-        Params._dummy(),
-        "outputFormat",
-        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
-        typeConverter=TypeConverters.toString
-    )
-
     @keyword_only
     def __init__(self):
         super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
@@ -89,75 +41,3 @@ class Reader2Table(
     def setParams(self):
         kwargs = self._input_kwargs
         return self._set(**kwargs)
-
-    def setContentPath(self, value):
-        """Sets content path.
-
-        Parameters
-        ----------
-        value : str
-            contentPath path to files to read
-        """
-        return self._set(contentPath=value)
-
-    def setContentType(self, value):
-        """
-        Set the content type to load following MIME specification
-
-        Parameters
-        ----------
-        value : str
-            content type to load following MIME specification
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
-        """
-        return self._set(explodeDocs=value)
-
-    def setOutputCol(self, value):
-        """Sets output column name.
-
-        Parameters
-        ----------
-        value : str
-            Name of the Output Column
-        """
-        return self._set(outputCol=value)
-
-    def setFlattenOutput(self, value):
-        """Sets whether to flatten the output to plain text with minimal metadata.
-
-        Parameters
-        ----------
-        value : bool
-            If true, output is flattened to plain text with minimal metadata
-        """
-        return self._set(flattenOutput=value)
-
-    def setTitleThreshold(self, value):
-        """Sets the minimum font size threshold for title detection in PDF documents.
-
-        Parameters
-        ----------
-        value : float
-            Minimum font size threshold for title detection in PDF docs
-        """
-        return self._set(titleThreshold=value)
-
-    def setOutputFormat(self, value):
-        """Sets the output format for the table content.
-
-        Parameters
-        ----------
-        value : str
-            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
-        """
-        return self._set(outputFormat=value)
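Reader2Table loses the same duplicated Param definitions. The sketch below shows how it could now be configured, assuming the removed setters (setContentPath, setContentType, setOutputCol, setOutputFormat, ...) are provided by the expanded HasReaderProperties mixin (partition_properties.py grows by 444 lines in this release). The input path is hypothetical; the output-format options quoted come from the removed docstring.

```python
# Illustrative sketch only; setters are assumed to be inherited from HasReaderProperties
# after this change, and the input directory is hypothetical.
import sparknlp
from sparknlp.reader.reader2table import Reader2Table
from pyspark.ml import Pipeline

spark = sparknlp.start()

reader2table = (
    Reader2Table()
    .setContentType("text/html")            # HTML files containing <table> elements
    .setContentPath("/data/html-tables")    # hypothetical input directory
    .setOutputCol("table")
    .setOutputFormat("html-table")          # removed docstring lists 'plain-text', 'html-table'; default 'json-table'
)

empty_df = spark.createDataFrame([[""]]).toDF("text")
tables_df = Pipeline(stages=[reader2table]).fit(empty_df).transform(empty_df)
tables_df.select("table").show(truncate=False)
```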
sparknlp/reader/reader_assembler.py
ADDED

@@ -0,0 +1,159 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pyspark import keyword_only
+
+from sparknlp.common import AnnotatorType
+from sparknlp.internal import AnnotatorTransformer
+from sparknlp.partition.partition_properties import *
+
+class ReaderAssembler(
+    AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
+    HasEmailReaderProperties,
+    HasExcelReaderProperties,
+    HasPowerPointProperties,
+    HasTextReaderProperties,
+    HasPdfProperties
+):
+    """
+    The ReaderAssembler annotator provides a unified interface for combining multiple Spark NLP
+    readers (such as Reader2Doc, Reader2Table, and Reader2Image) into a single, configurable
+    component. It automatically orchestrates the execution of different readers based on input type,
+    configured priorities, and fallback strategies allowing you to handle diverse content formats
+    without manually chaining multiple readers in your pipeline.
+
+    ReaderAssembler simplifies the process of building flexible pipelines capable of ingesting and
+    processing documents, tables, and images in a consistent way. It handles reader selection,
+    ordering, and fault-tolerance internally, ensuring that pipelines remain concise, robust, and
+    easy to maintain.
+
+    Examples
+    --------
+    >>> from johnsnowlabs.reader import ReaderAssembler
+    >>> from pyspark.ml import Pipeline
+    >>>
+    >>> reader_assembler = ReaderAssembler() \\
+    ...     .setContentType("text/html") \\
+    ...     .setContentPath("/table-image.html") \\
+    ...     .setOutputCol("document")
+    >>>
+    >>> pipeline = Pipeline(stages=[reader_assembler])
+    >>> pipeline_model = pipeline.fit(empty_data_set)
+    >>> result_df = pipeline_model.transform(empty_data_set)
+    >>>
+    >>> result_df.show()
+    +--------+--------------------+--------------------+--------------------+---------+
+    |fileName|       document_text|      document_table|      document_image|exception|
+    +--------+--------------------+--------------------+--------------------+---------+
+    |    null|[{'document', 0, 26...|[{'document', 0, 50...|[{'image', , 5, 5, ...|     null|
+    +--------+--------------------+--------------------+--------------------+---------+
+
+    This annotator is especially useful when working with heterogeneous input data — for example,
+    when a dataset includes PDFs, spreadsheets, and images — allowing Spark NLP to automatically
+    invoke the appropriate reader for each file type while preserving a unified schema in the output.
+    """
+
+
+    name = 'ReaderAssembler'
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    excludeNonText = Param(
+        Params._dummy(),
+        "excludeNonText",
+        "Whether to exclude non-text content from the output. Default is False.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    userMessage = Param(
+        Params._dummy(),
+        "userMessage",
+        "Custom user message.",
+        typeConverter=TypeConverters.toString
+    )
+
+    promptTemplate = Param(
+        Params._dummy(),
+        "promptTemplate",
+        "Format of the output prompt.",
+        typeConverter=TypeConverters.toString
+    )
+
+    customPromptTemplate = Param(
+        Params._dummy(),
+        "customPromptTemplate",
+        "Custom prompt template for image models.",
+        typeConverter=TypeConverters.toString
+    )
+
+    @keyword_only
+    def __init__(self):
+        super(ReaderAssembler, self).__init__(classname="com.johnsnowlabs.reader.ReaderAssembler")
+        self._setDefault(contentType="",
+                         explodeDocs=False,
+                         userMessage="Describe this image",
+                         promptTemplate="qwen2vl-chat",
+                         readAsImage=True,
+                         customPromptTemplate="",
+                         ignoreExceptions=True,
+                         flattenOutput=False,
+                         titleThreshold=18)
+
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setExcludeNonText(self, value):
+        """Sets whether to exclude non-text content from the output.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to exclude non-text content from the output. Default is False.
+        """
+        return self._set(excludeNonText=value)
+
+    def setUserMessage(self, value: str):
+        """Sets custom user message.
+
+        Parameters
+        ----------
+        value : str
+            Custom user message to include.
+        """
+        return self._set(userMessage=value)
+
+    def setPromptTemplate(self, value: str):
+        """Sets format of the output prompt.
+
+        Parameters
+        ----------
+        value : str
+            Prompt template format.
+        """
+        return self._set(promptTemplate=value)
+
+    def setCustomPromptTemplate(self, value: str):
+        """Sets custom prompt template for image models.
+
+        Parameters
+        ----------
+        value : str
+            Custom prompt template string.
+        """
+        return self._set(customPromptTemplate=value)
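The docstring above already includes a doctest-style example; the sketch below only adds the image-prompt parameters that ReaderAssembler declares in this file. It imports from the module path of this file rather than the johnsnowlabs.reader path used in the docstring, the content path is hypothetical, and the per-reader output columns (document_text, document_table, document_image) are taken from the docstring's sample output.

```python
# Illustrative sketch only; the input directory is hypothetical and the values set
# below mirror the defaults declared in __init__ above.
import sparknlp
from sparknlp.reader.reader_assembler import ReaderAssembler
from pyspark.ml import Pipeline

spark = sparknlp.start()

assembler = (
    ReaderAssembler()
    .setContentType("text/html")
    .setContentPath("/data/mixed-content")   # hypothetical directory of mixed files
    .setOutputCol("document")
    .setExcludeNonText(False)                # keep non-text elements
    .setUserMessage("Describe this image")   # default declared in __init__
    .setPromptTemplate("qwen2vl-chat")       # default template name
)

empty_df = spark.createDataFrame([[""]]).toDF("text")
result = Pipeline(stages=[assembler]).fit(empty_df).transform(empty_df)
# Per the docstring example, text, table, and image outputs land in separate columns:
result.select("fileName", "document_text", "document_table", "document_image").show()
```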