spark-nlp 6.0.1__py2.py3-none-any.whl → 6.0.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of spark-nlp might be problematic.
Files changed (36)
  1. {spark_nlp-6.0.1.dist-info → spark_nlp-6.0.2.dist-info}/METADATA +13 -6
  2. {spark_nlp-6.0.1.dist-info → spark_nlp-6.0.2.dist-info}/RECORD +36 -30
  3. {spark_nlp-6.0.1.dist-info → spark_nlp-6.0.2.dist-info}/WHEEL +1 -1
  4. sparknlp/__init__.py +4 -2
  5. sparknlp/annotator/cv/__init__.py +2 -0
  6. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  7. sparknlp/annotator/cv/gemma3_for_multimodal.py +5 -10
  8. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  9. sparknlp/annotator/cv/janus_for_multimodal.py +8 -13
  10. sparknlp/annotator/cv/llava_for_multimodal.py +1 -1
  11. sparknlp/annotator/cv/paligemma_for_multimodal.py +7 -7
  12. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +1 -1
  13. sparknlp/annotator/cv/qwen2vl_transformer.py +1 -1
  14. sparknlp/annotator/cv/smolvlm_transformer.py +7 -13
  15. sparknlp/annotator/date2_chunk.py +1 -1
  16. sparknlp/annotator/document_character_text_splitter.py +8 -8
  17. sparknlp/annotator/document_token_splitter.py +7 -7
  18. sparknlp/annotator/embeddings/bge_embeddings.py +21 -19
  19. sparknlp/annotator/embeddings/snowflake_embeddings.py +15 -15
  20. sparknlp/annotator/openai/openai_completion.py +3 -4
  21. sparknlp/annotator/seq2seq/m2m100_transformer.py +1 -1
  22. sparknlp/annotator/seq2seq/mistral_transformer.py +2 -3
  23. sparknlp/annotator/seq2seq/nllb_transformer.py +1 -1
  24. sparknlp/annotator/seq2seq/qwen_transformer.py +26 -25
  25. sparknlp/annotator/spell_check/context_spell_checker.py +1 -1
  26. sparknlp/base/prompt_assembler.py +1 -1
  27. sparknlp/common/properties.py +7 -7
  28. sparknlp/internal/__init__.py +19 -0
  29. sparknlp/partition/__init__.py +16 -0
  30. sparknlp/partition/partition.py +244 -0
  31. sparknlp/partition/partition_properties.py +257 -0
  32. sparknlp/partition/partition_transformer.py +196 -0
  33. sparknlp/reader/pdf_to_text.py +50 -4
  34. sparknlp/reader/sparknlp_reader.py +56 -52
  35. sparknlp/training/spacy_to_annotation.py +7 -7
  36. {spark_nlp-6.0.1.dist-info → spark_nlp-6.0.2.dist-info}/top_level.txt +0 -0
sparknlp/partition/partition_properties.py
@@ -0,0 +1,257 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for partition properties used in reading various document types."""
+from typing import Dict
+
+from pyspark.ml.param import TypeConverters, Params, Param
+
+
+class HasEmailReaderProperties(Params):
+
+    addAttachmentContent = Param(
+        Params._dummy(),
+        "addAttachmentContent",
+        "Whether to extract and include the textual content of plain-text attachments in the output",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setAddAttachmentContent(self, value):
+        """
+        Sets whether to extract and include the textual content of plain-text attachments in the output.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to include text from plain-text attachments.
+        """
+        return self._set(addAttachmentContent=value)
+
+    def getAddAttachmentContent(self):
+        """
+        Gets whether to extract and include the textual content of plain-text attachments in the output.
+
+        Returns
+        -------
+        bool
+            Whether to include text from plain-text attachments.
+        """
+        return self.getOrDefault(self.addAttachmentContent)
+
+
+class HasExcelReaderProperties(Params):
+
+    cellSeparator = Param(
+        Params._dummy(),
+        "cellSeparator",
+        "String used to join cell values in a row when assembling textual output.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setCellSeparator(self, value):
+        """
+        Sets the string used to join cell values in a row when assembling textual output.
+
+        Parameters
+        ----------
+        value : str
+            Delimiter used to concatenate cell values.
+        """
+        return self._set(cellSeparator=value)
+
+    def getCellSeparator(self):
+        """
+        Gets the string used to join cell values in a row when assembling textual output.
+
+        Returns
+        -------
+        str
+            Delimiter used to concatenate cell values.
+        """
+        return self.getOrDefault(self.cellSeparator)
+
+    appendCells = Param(
+        Params._dummy(),
+        "appendCells",
+        "Whether to append all rows into a single content block instead of creating separate elements per row.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setAppendCells(self, value):
+        """
+        Sets whether to append all rows into a single content block.
+
+        Parameters
+        ----------
+        value : bool
+            True to merge rows into one block, False for individual elements.
+        """
+        return self._set(appendCells=value)
+
+    def getAppendCells(self):
+        """
+        Gets whether to append all rows into a single content block.
+
+        Returns
+        -------
+        bool
+            True to merge rows into one block, False for individual elements.
+        """
+        return self.getOrDefault(self.appendCells)
+
+class HasHTMLReaderProperties(Params):
+
+    timeout = Param(
+        Params._dummy(),
+        "timeout",
+        "Timeout value in seconds for reading remote HTML resources. Applied when fetching content from URLs.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTimeout(self, value):
+        """
+        Sets the timeout (in seconds) for reading remote HTML resources.
+
+        Parameters
+        ----------
+        value : int
+            Timeout in seconds for remote content retrieval.
+        """
+        return self._set(timeout=value)
+
+    def getTimeout(self):
+        """
+        Gets the timeout value for reading remote HTML resources.
+
+        Returns
+        -------
+        int
+            Timeout in seconds.
+        """
+        return self.getOrDefault(self.timeout)
+
+    def setHeaders(self, headers: Dict[str, str]):
+        self._call_java("setHeadersPython", headers)
+        return self
+
+
+class HasPowerPointProperties(Params):
+
+    includeSlideNotes = Param(
+        Params._dummy(),
+        "includeSlideNotes",
+        "Whether to extract speaker notes from slides. When enabled, notes are included as narrative text elements.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIncludeSlideNotes(self, value):
+        """
+        Sets whether to extract speaker notes from slides.
+
+        Parameters
+        ----------
+        value : bool
+            If True, notes are included as narrative text elements.
+        """
+        return self._set(includeSlideNotes=value)
+
+    def getIncludeSlideNotes(self):
+        """
+        Gets whether to extract speaker notes from slides.
+
+        Returns
+        -------
+        bool
+            True if notes are included as narrative text elements.
+        """
+        return self.getOrDefault(self.includeSlideNotes)
+
+class HasTextReaderProperties(Params):
+
+    titleLengthSize = Param(
+        Params._dummy(),
+        "titleLengthSize",
+        "Maximum character length used to determine if a text block qualifies as a title during parsing.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTitleLengthSize(self, value):
+        return self._set(titleLengthSize=value)
+
+    def getTitleLengthSize(self):
+        return self.getOrDefault(self.titleLengthSize)
+
+    groupBrokenParagraphs = Param(
+        Params._dummy(),
+        "groupBrokenParagraphs",
+        "Whether to merge fragmented lines into coherent paragraphs using heuristics based on line length and structure.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setGroupBrokenParagraphs(self, value):
+        return self._set(groupBrokenParagraphs=value)
+
+    def getGroupBrokenParagraphs(self):
+        return self.getOrDefault(self.groupBrokenParagraphs)
+
+    paragraphSplit = Param(
+        Params._dummy(),
+        "paragraphSplit",
+        "Regex pattern used to detect paragraph boundaries when grouping broken paragraphs.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setParagraphSplit(self, value):
+        return self._set(paragraphSplit=value)
+
+    def getParagraphSplit(self):
+        return self.getOrDefault(self.paragraphSplit)
+
+    shortLineWordThreshold = Param(
+        Params._dummy(),
+        "shortLineWordThreshold",
+        "Maximum word count for a line to be considered 'short' during broken paragraph grouping.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setShortLineWordThreshold(self, value):
+        return self._set(shortLineWordThreshold=value)
+
+    def getShortLineWordThreshold(self):
+        return self.getOrDefault(self.shortLineWordThreshold)
+
+    maxLineCount = Param(
+        Params._dummy(),
+        "maxLineCount",
+        "Maximum number of lines to evaluate when estimating paragraph layout characteristics.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setMaxLineCount(self, value):
+        return self._set(maxLineCount=value)
+
+    def getMaxLineCount(self):
+        return self.getOrDefault(self.maxLineCount)
+
+    threshold = Param(
+        Params._dummy(),
+        "threshold",
+        "Threshold ratio of empty lines used to decide between new line-based or broken-paragraph grouping.",
+        typeConverter=TypeConverters.toFloat
+    )
+
+    def setThreshold(self, value):
+        return self._set(threshold=value)
+
+    def getThreshold(self):
+        return self.getOrDefault(self.threshold)
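
These mixins follow the standard pyspark.ml Params pattern: each class declares Param objects at class level, and the set/get methods route through Spark's param map (with setters returning self, so they chain). A minimal sketch of how a consumer class picks those methods up; TitleAwareReader and its defaults are hypothetical, for illustration only:

    from sparknlp.partition.partition_properties import HasTextReaderProperties

    class TitleAwareReader(HasTextReaderProperties):
        """Hypothetical class: mixing in HasTextReaderProperties yields its params."""
        def __init__(self):
            super(TitleAwareReader, self).__init__()
            # Defaults chosen for illustration; the real defaults live in
            # PartitionTransformer._setDefault (see the next file in this diff).
            self._setDefault(titleLengthSize=50, groupBrokenParagraphs=False)

    reader = TitleAwareReader()
    reader.setTitleLengthSize(80).setGroupBrokenParagraphs(True)  # setters chain via self._set
    print(reader.getTitleLengthSize())        # 80
    print(reader.getGroupBrokenParagraphs())  # True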
sparknlp/partition/partition_transformer.py
@@ -0,0 +1,196 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the PartitionTransformer class for reading various types of documents into chunks."""
+from sparknlp.common import *
+from sparknlp.partition.partition_properties import *
+
+class PartitionTransformer(
+    AnnotatorModel,
+    HasEmailReaderProperties,
+    HasExcelReaderProperties,
+    HasHTMLReaderProperties,
+    HasPowerPointProperties,
+    HasTextReaderProperties
+):
+    """
+    The PartitionTransformer annotator allows you to use the Partition feature more smoothly
+    within existing Spark NLP workflows, enabling seamless reuse of your pipelines.
+
+    It supports reading from files, URLs, in-memory strings, or byte arrays, and works
+    within a Spark NLP pipeline.
+
+    Supported formats include:
+    - Plain text
+    - HTML
+    - Word (.doc/.docx)
+    - Excel (.xls/.xlsx)
+    - PowerPoint (.ppt/.pptx)
+    - Email files (.eml, .msg)
+    - PDFs
+
+    Parameters
+    ----------
+    inputCols : list of str
+        Names of input columns (typically from DocumentAssembler).
+    outputCol : str
+        Name of the column to store the output.
+    contentType : str
+        The type of content: e.g., "text", "url", "file", etc.
+    headers : dict, optional
+        Headers to be used if content type is a URL.
+
+    Examples
+    --------
+    >>> dataset = spark.createDataFrame([
+    ...     ("https://www.blizzard.com",),
+    ... ], ["text"])
+
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+
+    >>> partition = PartitionTransformer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("partition") \\
+    ...     .setContentType("url") \\
+    ...     .setHeaders({"Accept-Language": "es-ES"})
+
+    >>> pipeline = Pipeline(stages=[documentAssembler, partition])
+    >>> pipelineModel = pipeline.fit(dataset)
+    >>> resultDf = pipelineModel.transform(dataset)
+    >>> resultDf.show()
+    +--------------------+--------------------+--------------------+
+    |                text|            document|           partition|
+    +--------------------+--------------------+--------------------+
+    |https://www.blizz...|[{Title, Juegos d...|[{document, 0, 16...|
+    |https://www.googl...|[{Title, Gmail Im...|[{document, 0, 28...|
+    +--------------------+--------------------+--------------------+
+    """
+
+    name = "PartitionTransformer"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    contentPath = Param(
+        Params._dummy(),
+        "contentPath",
+        "Path to the content source",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentPath(self, value):
+        return self._set(contentPath=value)
+
+    def getContentPath(self):
+        return self.getOrDefault(self.contentPath)
+
+    contentType = Param(
+        Params._dummy(),
+        "contentType",
+        "Set the content type to load following MIME specification",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentType(self, value):
+        return self._set(contentType=value)
+
+    def getContentType(self):
+        return self.getOrDefault(self.contentType)
+
+    storeContent = Param(
+        Params._dummy(),
+        "storeContent",
+        "Whether to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreContent(self, value):
+        return self._set(storeContent=value)
+
+    def getStoreContent(self):
+        return self.getOrDefault(self.storeContent)
+
+    titleFontSize = Param(
+        Params._dummy(),
+        "titleFontSize",
+        "Minimum font size threshold used as part of heuristic rules to detect title elements based on formatting (e.g., bold, centered, capitalized).",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTitleFontSize(self, value):
+        return self._set(titleFontSize=value)
+
+    def getTitleFontSize(self):
+        return self.getOrDefault(self.titleFontSize)
+
+    inferTableStructure = Param(
+        Params._dummy(),
+        "inferTableStructure",
+        "Whether to generate an HTML table representation from structured table content. When enabled, a full <table> element is added alongside cell-level elements, based on row and column layout.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setInferTableStructure(self, value):
+        return self._set(inferTableStructure=value)
+
+    def getInferTableStructure(self):
+        return self.getOrDefault(self.inferTableStructure)
+
+    includePageBreaks = Param(
+        Params._dummy(),
+        "includePageBreaks",
+        "Whether to detect and tag content with page break metadata. In Word documents, this includes manual and section breaks. In Excel files, this includes page breaks based on column boundaries.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIncludePageBreaks(self, value):
+        return self._set(includePageBreaks=value)
+
+    def getIncludePageBreaks(self):
+        return self.getOrDefault(self.includePageBreaks)
+
+    # def setHeaders(self, headers: Dict[str, str]):
+    #     self._call_java("setHeadersPython", headers)
+    #     return self
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
+                 java_model=None):
+        super(PartitionTransformer, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        DOUBLE_PARAGRAPH_PATTERN = r"(?:\s*\n\s*){2,}"
+
+        self._setDefault(
+            contentPath="",
+            contentType="text/plain",
+            storeContent=False,
+            titleFontSize = 9,
+            inferTableStructure=False,
+            includePageBreaks=False,
+            addAttachmentContent=False,
+            cellSeparator="\t",
+            appendCells=False,
+            timeout=0,
+            includeSlideNotes=False,
+            titleLengthSize=50,
+            groupBrokenParagraphs=False,
+            paragraphSplit=DOUBLE_PARAGRAPH_PATTERN,
+            shortLineWordThreshold=5,
+            maxLineCount=2000,
+            threshold=0.1
+        )
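
The docstring above partitions a URL; since __init__ defaults contentType to "text/plain", the same annotator also handles in-memory text, with the broken-paragraph heuristics contributed by the HasTextReaderProperties mixin. A hedged sketch under those assumptions (requires a Spark session started via sparknlp.start() so the backing JVM class is on the classpath):

    from pyspark.ml import Pipeline
    import sparknlp
    from sparknlp.base import DocumentAssembler
    from sparknlp.partition.partition_transformer import PartitionTransformer

    spark = sparknlp.start()

    # Illustrative input: a title plus a paragraph broken across lines.
    data = spark.createDataFrame(
        [("Short title\n\nA first paragraph that was\nbroken across lines.",)],
        ["text"]
    )

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    # contentType "text/plain" matches the default set in __init__ above;
    # setGroupBrokenParagraphs comes from the HasTextReaderProperties mixin.
    partition = PartitionTransformer() \
        .setInputCols(["document"]) \
        .setOutputCol("partition") \
        .setContentType("text/plain") \
        .setGroupBrokenParagraphs(True)

    result = Pipeline(stages=[documentAssembler, partition]).fit(data).transform(data)
    result.select("partition").show(truncate=False)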
sparknlp/reader/pdf_to_text.py
@@ -10,10 +10,56 @@ from sparknlp.reader.enums import TextStripperType
 class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
                 JavaMLReadable, JavaMLWritable):
     """
-    Extract text from Pdf document to single string or to several strings per each page.
-    Input is a column with binary representation of PDF document.
-    As output generate column with text and page number.
-    Explode each page as separate row if split to page enabled.
+    Extract text from PDF documents as either a single string or multiple strings per page.
+    Input is a column with binary content of PDF files. Output is a column with extracted text,
+    with options to include page numbers or split pages.
+
+    Parameters
+    ----------
+    pageNumCol : str, optional
+        Page number output column name.
+    partitionNum : int, optional
+        Number of partitions (default is 0).
+    storeSplittedPdf : bool, optional
+        Whether to store content of split PDFs (default is False).
+    splitPage : bool, optional
+        Enable/disable splitting per page (default is True).
+    onlyPageNum : bool, optional
+        Whether to extract only page numbers (default is False).
+    textStripper : str or TextStripperType, optional
+        Defines layout and formatting type.
+    sort : bool, optional
+        Enable/disable sorting content per page (default is False).
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.reader import *
+    >>> from pyspark.ml import Pipeline
+    >>> pdf_path = "Documents/files/pdf"
+    >>> data_frame = spark.read.format("binaryFile").load(pdf_path)
+    >>> pdf_to_text = PdfToText().setStoreSplittedPdf(True)
+    >>> pipeline = Pipeline(stages=[pdf_to_text])
+    >>> pipeline_model = pipeline.fit(data_frame)
+    >>> pdf_df = pipeline_model.transform(data_frame)
+    >>> pdf_df.show()
+    +--------------------+--------------------+
+    |                path|    modificationTime|
+    +--------------------+--------------------+
+    |file:/Users/paula...|2025-05-15 11:33:...|
+    |file:/Users/paula...|2025-05-15 11:33:...|
+    +--------------------+--------------------+
+    >>> pdf_df.printSchema()
+    root
+     |-- path: string (nullable = true)
+     |-- modificationTime: timestamp (nullable = true)
+     |-- length: long (nullable = true)
+     |-- text: string (nullable = true)
+     |-- height_dimension: integer (nullable = true)
+     |-- width_dimension: integer (nullable = true)
+     |-- content: binary (nullable = true)
+     |-- exception: string (nullable = true)
+     |-- pagenum: integer (nullable = true)
     """
     pageNumCol = Param(Params._dummy(), "pageNumCol",
                        "Page number output column name.",