spark-nlp 6.0.1rc1__py2.py3-none-any.whl → 6.0.3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as potentially problematic; see the registry page for details.

Files changed (39)
  1. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/METADATA +13 -6
  2. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/RECORD +39 -32
  3. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/WHEEL +1 -1
  4. sparknlp/__init__.py +4 -2
  5. sparknlp/annotator/cv/__init__.py +2 -0
  6. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  7. sparknlp/annotator/cv/gemma3_for_multimodal.py +5 -10
  8. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  9. sparknlp/annotator/cv/janus_for_multimodal.py +8 -13
  10. sparknlp/annotator/cv/llava_for_multimodal.py +1 -1
  11. sparknlp/annotator/cv/paligemma_for_multimodal.py +7 -7
  12. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +1 -1
  13. sparknlp/annotator/cv/qwen2vl_transformer.py +1 -1
  14. sparknlp/annotator/cv/smolvlm_transformer.py +7 -13
  15. sparknlp/annotator/date2_chunk.py +1 -1
  16. sparknlp/annotator/document_character_text_splitter.py +8 -8
  17. sparknlp/annotator/document_token_splitter.py +7 -7
  18. sparknlp/annotator/embeddings/__init__.py +1 -0
  19. sparknlp/annotator/embeddings/bge_embeddings.py +21 -19
  20. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  21. sparknlp/annotator/embeddings/snowflake_embeddings.py +15 -15
  22. sparknlp/annotator/openai/openai_completion.py +3 -4
  23. sparknlp/annotator/seq2seq/m2m100_transformer.py +1 -1
  24. sparknlp/annotator/seq2seq/mistral_transformer.py +2 -3
  25. sparknlp/annotator/seq2seq/nllb_transformer.py +1 -1
  26. sparknlp/annotator/seq2seq/qwen_transformer.py +26 -25
  27. sparknlp/annotator/spell_check/context_spell_checker.py +1 -1
  28. sparknlp/base/prompt_assembler.py +1 -1
  29. sparknlp/common/properties.py +7 -7
  30. sparknlp/internal/__init__.py +27 -0
  31. sparknlp/partition/__init__.py +16 -0
  32. sparknlp/partition/partition.py +244 -0
  33. sparknlp/partition/partition_properties.py +319 -0
  34. sparknlp/partition/partition_transformer.py +200 -0
  35. sparknlp/reader/pdf_to_text.py +50 -4
  36. sparknlp/reader/sparknlp_reader.py +101 -52
  37. sparknlp/training/spacy_to_annotation.py +7 -7
  38. sparknlp/util.py +26 -0
  39. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,244 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains the Partition annotator for reading and processing various document types."""
+ import sparknlp
+ from sparknlp.internal import ExtendedJavaWrapper
+
+
+ class Partition(ExtendedJavaWrapper):
+     """
+     A unified interface for extracting structured content from various document types
+     using Spark NLP readers.
+
+     This class supports reading from files, URLs, in-memory strings, or byte arrays,
+     and returns parsed output as a structured Spark DataFrame.
+
+     Supported formats include:
+
+     - Plain text
+     - HTML
+     - Word (.doc/.docx)
+     - Excel (.xls/.xlsx)
+     - PowerPoint (.ppt/.pptx)
+     - Email files (.eml, .msg)
+     - PDFs
+
+     Parameters
+     ----------
+     params : dict, optional
+         Configuration parameters, including:
+
+         - content_type : str
+             Override automatic file type detection.
+         - store_content : bool
+             Include the raw file content in the output DataFrame.
+         - timeout : int
+             Timeout in seconds for fetching HTML content.
+         - title_font_size : int
+             Font size used to identify titles.
+         - include_page_breaks : bool
+             Tag content with page-break metadata.
+         - group_broken_paragraphs : bool
+             Merge broken lines into full paragraphs.
+         - title_length_size : int
+             Maximum character length for a text block to qualify as a title.
+         - paragraph_split : str
+             Regex used to detect paragraph boundaries.
+         - short_line_word_threshold : int
+             Maximum word count for a line to be considered short.
+         - threshold : float
+             Ratio of empty lines that switches the grouping strategy.
+         - max_line_count : int
+             Maximum number of lines evaluated in paragraph analysis.
+         - include_slide_notes : bool
+             Include speaker notes in the output.
+         - infer_table_structure : bool
+             Generate HTML table structure.
+         - append_cells : bool
+             Merge Excel rows into one content block.
+         - cell_separator : str
+             String used to join cell values in a row.
+         - add_attachment_content : bool
+             Include the text of plain-text attachments.
+         - headers : dict
+             Request headers to use when fetching URLs.
+
+     Examples
+     --------
+
+     Reading Text Files
+
+     >>> txt_directory = "/content/txtfiles/reader/txt"
+     >>> partition_df = Partition(content_type="text/plain").partition(txt_directory)
+     >>> partition_df.show()
+     +--------------------+--------------------+
+     |                path|                 txt|
+     +--------------------+--------------------+
+     |file:/content/txt...|[{Title, BIG DATA...|
+     +--------------------+--------------------+
+
+     Reading Email Files
+
+     >>> partition_df = Partition().partition("./email-files/test-several-attachments.eml")
+     >>> partition_df.show()
+     +--------------------+--------------------+
+     |                path|               email|
+     +--------------------+--------------------+
+     |file:/content/ema...|[{Title, Test Sev...|
+     +--------------------+--------------------+
+
+     Reading Webpages
+
+     >>> partition_df = Partition().partition(
+     ...     "https://www.wikipedia.com",
+     ...     headers={"Accept-Language": "es-ES"}
+     ... )
+     >>> partition_df.show()
+     +--------------------+--------------------+
+     |                 url|                html|
+     +--------------------+--------------------+
+     |https://www.wikip...|[{Title, Wikipedi...|
+     +--------------------+--------------------+
+
+     For more examples, refer to:
+     `examples/python/data-preprocessing/SparkNLP_Partition_Reader_Demo.ipynb`
+     """
+
+     def __init__(self, **kwargs):
+         self.spark = sparknlp.start()
+         params = {}
+         for key, value in kwargs.items():
+             try:
+                 params[key] = str(value)
+             except Exception as e:
+                 raise ValueError(
+                     f"Invalid value for key '{key}': cannot cast {type(value)} to string. "
+                     f"Original error: {e}"
+                 )
+
+         super(Partition, self).__init__("com.johnsnowlabs.partition.Partition", params)
+
+     def partition(self, path, headers=None):
+         """
+         Reads and parses content from a URL, file, or directory path.
+
+         Parameters
+         ----------
+         path : str
+             Path to a file or directory. URLs and DFS paths are supported.
+         headers : dict, optional
+             Headers for URL requests.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             DataFrame with the parsed content.
+         """
+         if headers is None:
+             headers = {}
+         jdf = self._java_obj.partition(path, headers)
+         return self.getDataFrame(self.spark, jdf)
+
+     def partition_urls(self, path, headers=None):
+         """
+         Reads and parses content from multiple URLs.
+
+         Parameters
+         ----------
+         path : list[str]
+             List of URLs.
+         headers : dict, optional
+             Request headers for the URLs.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             DataFrame with the parsed URL content.
+
+         Examples
+         --------
+         >>> urls_df = Partition().partition_urls([
+         ...     "https://www.wikipedia.org", "https://example.com/"
+         ... ])
+         >>> urls_df.show()
+         +--------------------+--------------------+
+         |                 url|                html|
+         +--------------------+--------------------+
+         |https://www.wikip...|[{Title, Wikipedi...|
+         |https://example.com/|[{Title, Example ...|
+         +--------------------+--------------------+
+
+         >>> urls_df.printSchema()
+         root
+          |-- url: string (nullable = true)
+          |-- html: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+         """
+         if headers is None:
+             headers = {}
+         jdf = self._java_obj.partitionUrlsJava(path, headers)
+         return self.getDataFrame(self.spark, jdf)
+
+     def partition_text(self, text):
+         """
+         Parses content from a raw text string.
+
+         Parameters
+         ----------
+         text : str
+             Raw text input.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             DataFrame with the parsed text.
+
+         Examples
+         --------
+         >>> raw_text = (
+         ...     "The big brown fox\\n"
+         ...     "was walking down the lane.\\n"
+         ...     "\\n"
+         ...     "At the end of the lane,\\n"
+         ...     "the fox met a bear."
+         ... )
+         >>> text_df = Partition(group_broken_paragraphs=True).partition_text(text=raw_text)
+         >>> text_df.show()
+         +--------------------------------------+
+         |txt                                   |
+         +--------------------------------------+
+         |[{NarrativeText, The big brown fox was|
+         +--------------------------------------+
+         >>> text_df.printSchema()
+         root
+          |-- txt: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+         """
+         jdf = self._java_obj.partitionText(text)
+         return self.getDataFrame(self.spark, jdf)
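
The three entry points above share one pattern: configure the reader through keyword arguments at construction, then call the method that matches the input source. A minimal end-to-end sketch, assuming `Partition` is re-exported by the new `sparknlp/partition/__init__.py` and that a local Spark NLP session can start; the file path below is a placeholder:

from sparknlp.partition import Partition

# Files or directories: content_type overrides automatic format detection.
txt_df = Partition(content_type="text/plain").partition("/data/txt-files/")  # placeholder path
txt_df.show()

# Raw in-memory text: group_broken_paragraphs merges fragmented lines.
text_df = Partition(group_broken_paragraphs=True).partition_text(
    text="The big brown fox\nwas walking down the lane."
)

# Several URLs in one call, with optional request headers.
urls_df = Partition().partition_urls(
    ["https://www.wikipedia.org", "https://example.com/"],
    headers={"Accept-Language": "es-ES"},
)
urls_df.printSchema()

Note that every keyword argument is cast to a string in `__init__` before being handed to the JVM wrapper, which is why values that cannot be stringified raise a ValueError there.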
@@ -0,0 +1,319 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for partition properties used in reading various document types."""
+ from typing import Dict
+
+ from pyspark.ml.param import TypeConverters, Params, Param
+
+
+ class HasEmailReaderProperties(Params):
+
+     addAttachmentContent = Param(
+         Params._dummy(),
+         "addAttachmentContent",
+         "Whether to extract and include the textual content of plain-text attachments in the output",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setAddAttachmentContent(self, value):
+         """
+         Sets whether to extract and include the textual content of plain-text
+         attachments in the output.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to include text from plain-text attachments.
+         """
+         return self._set(addAttachmentContent=value)
+
+     def getAddAttachmentContent(self):
+         """
+         Gets whether to extract and include the textual content of plain-text
+         attachments in the output.
+
+         Returns
+         -------
+         bool
+             Whether to include text from plain-text attachments.
+         """
+         return self.getOrDefault(self.addAttachmentContent)
+
+
+ class HasExcelReaderProperties(Params):
+
+     cellSeparator = Param(
+         Params._dummy(),
+         "cellSeparator",
+         "String used to join cell values in a row when assembling textual output.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setCellSeparator(self, value):
+         """
+         Sets the string used to join cell values in a row when assembling textual output.
+
+         Parameters
+         ----------
+         value : str
+             Delimiter used to concatenate cell values.
+         """
+         return self._set(cellSeparator=value)
+
+     def getCellSeparator(self):
+         """
+         Gets the string used to join cell values in a row when assembling textual output.
+
+         Returns
+         -------
+         str
+             Delimiter used to concatenate cell values.
+         """
+         return self.getOrDefault(self.cellSeparator)
+
+     appendCells = Param(
+         Params._dummy(),
+         "appendCells",
+         "Whether to append all rows into a single content block instead of creating separate elements per row.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setAppendCells(self, value):
+         """
+         Sets whether to append all rows into a single content block.
+
+         Parameters
+         ----------
+         value : bool
+             True to merge rows into one block, False for individual elements.
+         """
+         return self._set(appendCells=value)
+
+     def getAppendCells(self):
+         """
+         Gets whether to append all rows into a single content block.
+
+         Returns
+         -------
+         bool
+             True to merge rows into one block, False for individual elements.
+         """
+         return self.getOrDefault(self.appendCells)
+
+
+ class HasHTMLReaderProperties(Params):
+
+     timeout = Param(
+         Params._dummy(),
+         "timeout",
+         "Timeout value in seconds for reading remote HTML resources. Applied when fetching content from URLs.",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setTimeout(self, value):
+         """
+         Sets the timeout (in seconds) for reading remote HTML resources.
+
+         Parameters
+         ----------
+         value : int
+             Timeout in seconds for remote content retrieval.
+         """
+         return self._set(timeout=value)
+
+     def getTimeout(self):
+         """
+         Gets the timeout value for reading remote HTML resources.
+
+         Returns
+         -------
+         int
+             Timeout in seconds.
+         """
+         return self.getOrDefault(self.timeout)
+
+     def setHeaders(self, headers: Dict[str, str]):
+         """
+         Sets custom request headers to use when fetching remote HTML resources.
+
+         Parameters
+         ----------
+         headers : dict
+             Mapping of header names to header values.
+         """
+         self._call_java("setHeadersPython", headers)
+         return self
+
+
+ class HasPowerPointProperties(Params):
+
+     includeSlideNotes = Param(
+         Params._dummy(),
+         "includeSlideNotes",
+         "Whether to extract speaker notes from slides. When enabled, notes are included as narrative text elements.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setIncludeSlideNotes(self, value):
+         """
+         Sets whether to extract speaker notes from slides.
+
+         Parameters
+         ----------
+         value : bool
+             If True, notes are included as narrative text elements.
+         """
+         return self._set(includeSlideNotes=value)
+
+     def getIncludeSlideNotes(self):
+         """
+         Gets whether to extract speaker notes from slides.
+
+         Returns
+         -------
+         bool
+             True if notes are included as narrative text elements.
+         """
+         return self.getOrDefault(self.includeSlideNotes)
+
+
+ class HasTextReaderProperties(Params):
+
+     titleLengthSize = Param(
+         Params._dummy(),
+         "titleLengthSize",
+         "Maximum character length used to determine if a text block qualifies as a title during parsing.",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setTitleLengthSize(self, value):
+         return self._set(titleLengthSize=value)
+
+     def getTitleLengthSize(self):
+         return self.getOrDefault(self.titleLengthSize)
+
+     groupBrokenParagraphs = Param(
+         Params._dummy(),
+         "groupBrokenParagraphs",
+         "Whether to merge fragmented lines into coherent paragraphs using heuristics based on line length and structure.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setGroupBrokenParagraphs(self, value):
+         return self._set(groupBrokenParagraphs=value)
+
+     def getGroupBrokenParagraphs(self):
+         return self.getOrDefault(self.groupBrokenParagraphs)
+
+     paragraphSplit = Param(
+         Params._dummy(),
+         "paragraphSplit",
+         "Regex pattern used to detect paragraph boundaries when grouping broken paragraphs.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setParagraphSplit(self, value):
+         return self._set(paragraphSplit=value)
+
+     def getParagraphSplit(self):
+         return self.getOrDefault(self.paragraphSplit)
+
+     shortLineWordThreshold = Param(
+         Params._dummy(),
+         "shortLineWordThreshold",
+         "Maximum word count for a line to be considered 'short' during broken paragraph grouping.",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setShortLineWordThreshold(self, value):
+         return self._set(shortLineWordThreshold=value)
+
+     def getShortLineWordThreshold(self):
+         return self.getOrDefault(self.shortLineWordThreshold)
+
+     maxLineCount = Param(
+         Params._dummy(),
+         "maxLineCount",
+         "Maximum number of lines to evaluate when estimating paragraph layout characteristics.",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setMaxLineCount(self, value):
+         return self._set(maxLineCount=value)
+
+     def getMaxLineCount(self):
+         return self.getOrDefault(self.maxLineCount)
+
+     threshold = Param(
+         Params._dummy(),
+         "threshold",
+         "Threshold ratio of empty lines used to decide between newline-based and broken-paragraph grouping.",
+         typeConverter=TypeConverters.toFloat
+     )
+
+     def setThreshold(self, value):
+         return self._set(threshold=value)
+
+     def getThreshold(self):
+         return self.getOrDefault(self.threshold)
+
+
+ class HasChunkerProperties(Params):
+
+     chunkingStrategy = Param(
+         Params._dummy(),
+         "chunkingStrategy",
+         "The chunking strategy to apply",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setChunkingStrategy(self, value):
+         return self._set(chunkingStrategy=value)
+
+     maxCharacters = Param(
+         Params._dummy(),
+         "maxCharacters",
+         "Maximum number of characters allowed per chunk",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setMaxCharacters(self, value):
+         return self._set(maxCharacters=value)
+
+     newAfterNChars = Param(
+         Params._dummy(),
+         "newAfterNChars",
+         "Insert a new chunk after N characters",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setNewAfterNChars(self, value):
+         return self._set(newAfterNChars=value)
+
+     overlap = Param(
+         Params._dummy(),
+         "overlap",
+         "Number of overlapping characters between chunks",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setOverlap(self, value):
+         return self._set(overlap=value)
+
+     combineTextUnderNChars = Param(
+         Params._dummy(),
+         "combineTextUnderNChars",
+         "Threshold under which adjacent small sections are merged",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setCombineTextUnderNChars(self, value):
+         return self._set(combineTextUnderNChars=value)
+
+     overlapAll = Param(
+         Params._dummy(),
+         "overlapAll",
+         "Apply overlap context between all sections, not just split chunks",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setOverlapAll(self, value):
+         return self._set(overlapAll=value)
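
These classes are plain PySpark `Params` mixins rather than annotators: they contribute their `Param` definitions to whatever transformer inherits them, presumably the new `PartitionTransformer` in this release. A minimal sketch of the pattern with an invented `DemoPartitionParams` class (illustrative only, not part of the package; the default values are arbitrary):

from sparknlp.partition.partition_properties import (
    HasChunkerProperties,
    HasHTMLReaderProperties,
)

class DemoPartitionParams(HasHTMLReaderProperties, HasChunkerProperties):
    """Hypothetical consumer used only to illustrate the mixin pattern."""
    def __init__(self):
        super(DemoPartitionParams, self).__init__()
        # Defaults flow through the standard Param machinery.
        self._setDefault(timeout=30, maxCharacters=512, overlap=0)

params = DemoPartitionParams()
params.setTimeout(60).setOverlap(32)  # setters chain because _set returns self
print(params.getTimeout())                        # 60
print(params.getOrDefault(params.maxCharacters))  # 512

Because `HasChunkerProperties` defines setters only, values such as `maxCharacters` are read back through the generic `getOrDefault`, as shown above.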