spark-nlp 6.0.1rc1__py2.py3-none-any.whl → 6.0.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of spark-nlp might be problematic.
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/METADATA +13 -6
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/RECORD +36 -30
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +4 -2
- sparknlp/annotator/cv/__init__.py +2 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +5 -10
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +8 -13
- sparknlp/annotator/cv/llava_for_multimodal.py +1 -1
- sparknlp/annotator/cv/paligemma_for_multimodal.py +7 -7
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +1 -1
- sparknlp/annotator/cv/qwen2vl_transformer.py +1 -1
- sparknlp/annotator/cv/smolvlm_transformer.py +7 -13
- sparknlp/annotator/date2_chunk.py +1 -1
- sparknlp/annotator/document_character_text_splitter.py +8 -8
- sparknlp/annotator/document_token_splitter.py +7 -7
- sparknlp/annotator/embeddings/bge_embeddings.py +21 -19
- sparknlp/annotator/embeddings/snowflake_embeddings.py +15 -15
- sparknlp/annotator/openai/openai_completion.py +3 -4
- sparknlp/annotator/seq2seq/m2m100_transformer.py +1 -1
- sparknlp/annotator/seq2seq/mistral_transformer.py +2 -3
- sparknlp/annotator/seq2seq/nllb_transformer.py +1 -1
- sparknlp/annotator/seq2seq/qwen_transformer.py +26 -25
- sparknlp/annotator/spell_check/context_spell_checker.py +1 -1
- sparknlp/base/prompt_assembler.py +1 -1
- sparknlp/common/properties.py +7 -7
- sparknlp/internal/__init__.py +19 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +257 -0
- sparknlp/partition/partition_transformer.py +196 -0
- sparknlp/reader/pdf_to_text.py +50 -4
- sparknlp/reader/sparknlp_reader.py +56 -52
- sparknlp/training/spacy_to_annotation.py +7 -7
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/top_level.txt +0 -0
sparknlp/partition/partition_properties.py
ADDED
@@ -0,0 +1,257 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for partition properties used in reading various document types."""
+from typing import Dict
+
+from pyspark.ml.param import TypeConverters, Params, Param
+
+
+class HasEmailReaderProperties(Params):
+
+    addAttachmentContent = Param(
+        Params._dummy(),
+        "addAttachmentContent",
+        "Whether to extract and include the textual content of plain-text attachments in the output",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setAddAttachmentContent(self, value):
+        """
+        Sets whether to extract and include the textual content of plain-text attachments in the output.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to include text from plain-text attachments.
+        """
+        return self._set(addAttachmentContent=value)
+
+    def getAddAttachmentContent(self):
+        """
+        Gets whether to extract and include the textual content of plain-text attachments in the output.
+
+        Returns
+        -------
+        bool
+            Whether to include text from plain-text attachments.
+        """
+        return self.getOrDefault(self.addAttachmentContent)
+
+
+class HasExcelReaderProperties(Params):
+
+    cellSeparator = Param(
+        Params._dummy(),
+        "cellSeparator",
+        "String used to join cell values in a row when assembling textual output.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setCellSeparator(self, value):
+        """
+        Sets the string used to join cell values in a row when assembling textual output.
+
+        Parameters
+        ----------
+        value : str
+            Delimiter used to concatenate cell values.
+        """
+        return self._set(cellSeparator=value)
+
+    def getCellSeparator(self):
+        """
+        Gets the string used to join cell values in a row when assembling textual output.
+
+        Returns
+        -------
+        str
+            Delimiter used to concatenate cell values.
+        """
+        return self.getOrDefault(self.cellSeparator)
+
+    appendCells = Param(
+        Params._dummy(),
+        "appendCells",
+        "Whether to append all rows into a single content block instead of creating separate elements per row.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setAppendCells(self, value):
+        """
+        Sets whether to append all rows into a single content block.
+
+        Parameters
+        ----------
+        value : bool
+            True to merge rows into one block, False for individual elements.
+        """
+        return self._set(appendCells=value)
+
+    def getAppendCells(self):
+        """
+        Gets whether to append all rows into a single content block.
+
+        Returns
+        -------
+        bool
+            True to merge rows into one block, False for individual elements.
+        """
+        return self.getOrDefault(self.appendCells)
+
+class HasHTMLReaderProperties(Params):
+
+    timeout = Param(
+        Params._dummy(),
+        "timeout",
+        "Timeout value in seconds for reading remote HTML resources. Applied when fetching content from URLs.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTimeout(self, value):
+        """
+        Sets the timeout (in seconds) for reading remote HTML resources.
+
+        Parameters
+        ----------
+        value : int
+            Timeout in seconds for remote content retrieval.
+        """
+        return self._set(timeout=value)
+
+    def getTimeout(self):
+        """
+        Gets the timeout value for reading remote HTML resources.
+
+        Returns
+        -------
+        int
+            Timeout in seconds.
+        """
+        return self.getOrDefault(self.timeout)
+
+    def setHeaders(self, headers: Dict[str, str]):
+        self._call_java("setHeadersPython", headers)
+        return self
+
+
+class HasPowerPointProperties(Params):
+
+    includeSlideNotes = Param(
+        Params._dummy(),
+        "includeSlideNotes",
+        "Whether to extract speaker notes from slides. When enabled, notes are included as narrative text elements.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIncludeSlideNotes(self, value):
+        """
+        Sets whether to extract speaker notes from slides.
+
+        Parameters
+        ----------
+        value : bool
+            If True, notes are included as narrative text elements.
+        """
+        return self._set(includeSlideNotes=value)
+
+    def getIncludeSlideNotes(self):
+        """
+        Gets whether to extract speaker notes from slides.
+
+        Returns
+        -------
+        bool
+            True if notes are included as narrative text elements.
+        """
+        return self.getOrDefault(self.includeSlideNotes)
+
+class HasTextReaderProperties(Params):
+
+    titleLengthSize = Param(
+        Params._dummy(),
+        "titleLengthSize",
+        "Maximum character length used to determine if a text block qualifies as a title during parsing.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTitleLengthSize(self, value):
+        return self._set(titleLengthSize=value)
+
+    def getTitleLengthSize(self):
+        return self.getOrDefault(self.titleLengthSize)
+
+    groupBrokenParagraphs = Param(
+        Params._dummy(),
+        "groupBrokenParagraphs",
+        "Whether to merge fragmented lines into coherent paragraphs using heuristics based on line length and structure.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setGroupBrokenParagraphs(self, value):
+        return self._set(groupBrokenParagraphs=value)
+
+    def getGroupBrokenParagraphs(self):
+        return self.getOrDefault(self.groupBrokenParagraphs)
+
+    paragraphSplit = Param(
+        Params._dummy(),
+        "paragraphSplit",
+        "Regex pattern used to detect paragraph boundaries when grouping broken paragraphs.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setParagraphSplit(self, value):
+        return self._set(paragraphSplit=value)
+
+    def getParagraphSplit(self):
+        return self.getOrDefault(self.paragraphSplit)
+
+    shortLineWordThreshold = Param(
+        Params._dummy(),
+        "shortLineWordThreshold",
+        "Maximum word count for a line to be considered 'short' during broken paragraph grouping.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setShortLineWordThreshold(self, value):
+        return self._set(shortLineWordThreshold=value)
+
+    def getShortLineWordThreshold(self):
+        return self.getOrDefault(self.shortLineWordThreshold)
+
+    maxLineCount = Param(
+        Params._dummy(),
+        "maxLineCount",
+        "Maximum number of lines to evaluate when estimating paragraph layout characteristics.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setMaxLineCount(self, value):
+        return self._set(maxLineCount=value)
+
+    def getMaxLineCount(self):
+        return self.getOrDefault(self.maxLineCount)
+
+    threshold = Param(
+        Params._dummy(),
+        "threshold",
+        "Threshold ratio of empty lines used to decide between new line-based or broken-paragraph grouping.",
+        typeConverter=TypeConverters.toFloat
+    )
+
+    def setThreshold(self, value):
+        return self._set(threshold=value)
+
+    def getThreshold(self):
+        return self.getOrDefault(self.threshold)
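Each mixin above pairs a Param declaration with set/get helpers in the standard pyspark.ml Params style, delegating to _set and getOrDefault. A minimal sketch of how the traits surface on an annotator that inherits them (the PartitionTransformer added in the next file); a Spark session and the 6.0.2 wheel are assumed, and the setter values are illustrative only:

# Minimal sketch, assuming spark-nlp 6.0.2 is installed and a Spark session exists.
# PartitionTransformer (next file) inherits all five Has*ReaderProperties mixins,
# so their setters chain like any other annotator parameter.
from sparknlp.partition.partition_transformer import PartitionTransformer

partition = (
    PartitionTransformer()
    .setInputCols(["document"])
    .setOutputCol("partition")
    .setAppendCells(True)            # HasExcelReaderProperties: merge rows into one block
    .setCellSeparator(" | ")         # HasExcelReaderProperties: delimiter between cell values
    .setTimeout(30)                  # HasHTMLReaderProperties: seconds to wait on remote URLs
    .setIncludeSlideNotes(True)      # HasPowerPointProperties: pull speaker notes as narrative text
    .setGroupBrokenParagraphs(True)  # HasTextReaderProperties: re-join fragmented lines
)

# Getters read back the configured value, or the default set in __init__.
assert partition.getCellSeparator() == " | "
assert partition.getTimeout() == 30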
sparknlp/partition/partition_transformer.py
ADDED
@@ -0,0 +1,196 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the PartitionTransformer class for reading various types of documents into chunks."""
+from sparknlp.common import *
+from sparknlp.partition.partition_properties import *
+
+class PartitionTransformer(
+    AnnotatorModel,
+    HasEmailReaderProperties,
+    HasExcelReaderProperties,
+    HasHTMLReaderProperties,
+    HasPowerPointProperties,
+    HasTextReaderProperties
+):
+    """
+    The PartitionTransformer annotator allows you to use the Partition feature more smoothly
+    within existing Spark NLP workflows, enabling seamless reuse of your pipelines.
+
+    It supports reading from files, URLs, in-memory strings, or byte arrays, and works
+    within a Spark NLP pipeline.
+
+    Supported formats include:
+    - Plain text
+    - HTML
+    - Word (.doc/.docx)
+    - Excel (.xls/.xlsx)
+    - PowerPoint (.ppt/.pptx)
+    - Email files (.eml, .msg)
+    - PDFs
+
+    Parameters
+    ----------
+    inputCols : list of str
+        Names of input columns (typically from DocumentAssembler).
+    outputCol : str
+        Name of the column to store the output.
+    contentType : str
+        The type of content: e.g., "text", "url", "file", etc.
+    headers : dict, optional
+        Headers to be used if content type is a URL.
+
+    Examples
+    --------
+    >>> dataset = spark.createDataFrame([
+    ...     ("https://www.blizzard.com",),
+    ... ], ["text"])
+
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+
+    >>> partition = PartitionTransformer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("partition") \\
+    ...     .setContentType("url") \\
+    ...     .setHeaders({"Accept-Language": "es-ES"})
+
+    >>> pipeline = Pipeline(stages=[documentAssembler, partition])
+    >>> pipelineModel = pipeline.fit(dataset)
+    >>> resultDf = pipelineModel.transform(dataset)
+    >>> resultDf.show()
+    +--------------------+--------------------+--------------------+
+    |                text|            document|           partition|
+    +--------------------+--------------------+--------------------+
+    |https://www.blizz...|[{Title, Juegos d...|[{document, 0, 16...|
+    |https://www.googl...|[{Title, Gmail Im...|[{document, 0, 28...|
+    +--------------------+--------------------+--------------------+
+    """
+
+    name = "PartitionTransformer"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    contentPath = Param(
+        Params._dummy(),
+        "contentPath",
+        "Path to the content source",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentPath(self, value):
+        return self._set(contentPath=value)
+
+    def getContentPath(self):
+        return self.getOrDefault(self.contentPath)
+
+    contentType = Param(
+        Params._dummy(),
+        "contentType",
+        "Set the content type to load following MIME specification",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentType(self, value):
+        return self._set(contentType=value)
+
+    def getContentType(self):
+        return self.getOrDefault(self.contentType)
+
+    storeContent = Param(
+        Params._dummy(),
+        "storeContent",
+        "Whether to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreContent(self, value):
+        return self._set(storeContent=value)
+
+    def getStoreContent(self):
+        return self.getOrDefault(self.storeContent)
+
+    titleFontSize = Param(
+        Params._dummy(),
+        "titleFontSize",
+        "Minimum font size threshold used as part of heuristic rules to detect title elements based on formatting (e.g., bold, centered, capitalized).",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTitleFontSize(self, value):
+        return self._set(titleFontSize=value)
+
+    def getTitleFontSize(self):
+        return self.getOrDefault(self.titleFontSize)
+
+    inferTableStructure = Param(
+        Params._dummy(),
+        "inferTableStructure",
+        "Whether to generate an HTML table representation from structured table content. When enabled, a full <table> element is added alongside cell-level elements, based on row and column layout.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setInferTableStructure(self, value):
+        return self._set(inferTableStructure=value)
+
+    def getInferTableStructure(self):
+        return self.getOrDefault(self.inferTableStructure)
+
+    includePageBreaks = Param(
+        Params._dummy(),
+        "includePageBreaks",
+        "Whether to detect and tag content with page break metadata. In Word documents, this includes manual and section breaks. In Excel files, this includes page breaks based on column boundaries.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIncludePageBreaks(self, value):
+        return self._set(includePageBreaks=value)
+
+    def getIncludePageBreaks(self):
+        return self.getOrDefault(self.includePageBreaks)
+
+    # def setHeaders(self, headers: Dict[str, str]):
+    #     self._call_java("setHeadersPython", headers)
+    #     return self
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
+                 java_model=None):
+        super(PartitionTransformer, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        DOUBLE_PARAGRAPH_PATTERN = r"(?:\s*\n\s*){2,}"
+
+        self._setDefault(
+            contentPath="",
+            contentType="text/plain",
+            storeContent=False,
+            titleFontSize = 9,
+            inferTableStructure=False,
+            includePageBreaks=False,
+            addAttachmentContent=False,
+            cellSeparator="\t",
+            appendCells=False,
+            timeout=0,
+            includeSlideNotes=False,
+            titleLengthSize=50,
+            groupBrokenParagraphs=False,
+            paragraphSplit=DOUBLE_PARAGRAPH_PATTERN,
+            shortLineWordThreshold=5,
+            maxLineCount=2000,
+            threshold=0.1
+        )
sparknlp/reader/pdf_to_text.py
CHANGED
@@ -10,10 +10,56 @@ from sparknlp.reader.enums import TextStripperType
 class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
                 JavaMLReadable, JavaMLWritable):
     """
-    Extract text from
-    Input is a column with binary
-
-
+    Extract text from PDF documents as either a single string or multiple strings per page.
+    Input is a column with binary content of PDF files. Output is a column with extracted text,
+    with options to include page numbers or split pages.
+
+    Parameters
+    ----------
+    pageNumCol : str, optional
+        Page number output column name.
+    partitionNum : int, optional
+        Number of partitions (default is 0).
+    storeSplittedPdf : bool, optional
+        Whether to store content of split PDFs (default is False).
+    splitPage : bool, optional
+        Enable/disable splitting per page (default is True).
+    onlyPageNum : bool, optional
+        Whether to extract only page numbers (default is False).
+    textStripper : str or TextStripperType, optional
+        Defines layout and formatting type.
+    sort : bool, optional
+        Enable/disable sorting content per page (default is False).
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.reader import *
+    >>> from pyspark.ml import Pipeline
+    >>> pdf_path = "Documents/files/pdf"
+    >>> data_frame = spark.read.format("binaryFile").load(pdf_path)
+    >>> pdf_to_text = PdfToText().setStoreSplittedPdf(True)
+    >>> pipeline = Pipeline(stages=[pdf_to_text])
+    >>> pipeline_model = pipeline.fit(data_frame)
+    >>> pdf_df = pipeline_model.transform(data_frame)
+    >>> pdf_df.show()
+    +--------------------+--------------------+
+    |                path|    modificationTime|
+    +--------------------+--------------------+
+    |file:/Users/paula...|2025-05-15 11:33:...|
+    |file:/Users/paula...|2025-05-15 11:33:...|
+    +--------------------+--------------------+
+    >>> pdf_df.printSchema()
+    root
+     |-- path: string (nullable = true)
+     |-- modificationTime: timestamp (nullable = true)
+     |-- length: long (nullable = true)
+     |-- text: string (nullable = true)
+     |-- height_dimension: integer (nullable = true)
+     |-- width_dimension: integer (nullable = true)
+     |-- content: binary (nullable = true)
+     |-- exception: string (nullable = true)
+     |-- pagenum: integer (nullable = true)
     """
     pageNumCol = Param(Params._dummy(), "pageNumCol",
                        "Page number output column name.",