spark-nlp 6.0.1rc1__py2.py3-none-any.whl → 6.0.2__py2.py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/METADATA +13 -6
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/RECORD +36 -30
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +4 -2
- sparknlp/annotator/cv/__init__.py +2 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +5 -10
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +8 -13
- sparknlp/annotator/cv/llava_for_multimodal.py +1 -1
- sparknlp/annotator/cv/paligemma_for_multimodal.py +7 -7
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +1 -1
- sparknlp/annotator/cv/qwen2vl_transformer.py +1 -1
- sparknlp/annotator/cv/smolvlm_transformer.py +7 -13
- sparknlp/annotator/date2_chunk.py +1 -1
- sparknlp/annotator/document_character_text_splitter.py +8 -8
- sparknlp/annotator/document_token_splitter.py +7 -7
- sparknlp/annotator/embeddings/bge_embeddings.py +21 -19
- sparknlp/annotator/embeddings/snowflake_embeddings.py +15 -15
- sparknlp/annotator/openai/openai_completion.py +3 -4
- sparknlp/annotator/seq2seq/m2m100_transformer.py +1 -1
- sparknlp/annotator/seq2seq/mistral_transformer.py +2 -3
- sparknlp/annotator/seq2seq/nllb_transformer.py +1 -1
- sparknlp/annotator/seq2seq/qwen_transformer.py +26 -25
- sparknlp/annotator/spell_check/context_spell_checker.py +1 -1
- sparknlp/base/prompt_assembler.py +1 -1
- sparknlp/common/properties.py +7 -7
- sparknlp/internal/__init__.py +19 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +257 -0
- sparknlp/partition/partition_transformer.py +196 -0
- sparknlp/reader/pdf_to_text.py +50 -4
- sparknlp/reader/sparknlp_reader.py +56 -52
- sparknlp/training/spacy_to_annotation.py +7 -7
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/top_level.txt +0 -0
sparknlp/reader/sparknlp_reader.py

@@ -33,27 +33,30 @@ class SparkNLPReader(ExtendedJavaWrapper):
     >>> from sparknlp.reader import SparkNLPReader
     >>> reader = SparkNLPReader(spark)
 
-
+    Reading HTML
+
     >>> html_df = reader.html("https://www.wikipedia.org")
     >>> # Or with shorthand
     >>> import sparknlp
     >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
 
-
+    Reading PDF
+
     >>> pdf_df = reader.pdf("home/user/pdfs-directory")
     >>> # Or with shorthand
     >>> pdf_df = sparknlp.read().pdf("home/user/pdfs-directory")
 
-
+    Reading Email
+
     >>> email_df = reader.email("home/user/emails-directory")
     >>> # Or with shorthand
     >>> email_df = sparknlp.read().email("home/user/emails-directory")
     """
 
-    def __init__(self, spark, params=None):
+    def __init__(self, spark, params=None, headers=None):
         if params is None:
             params = {}
-        super(SparkNLPReader, self).__init__("com.johnsnowlabs.reader.SparkNLPReader", params)
+        super(SparkNLPReader, self).__init__("com.johnsnowlabs.reader.SparkNLPReader", params, headers)
         self.spark = spark
 
     def html(self, htmlPath):
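The constructor change above adds a headers argument that is forwarded to the JVM-side com.johnsnowlabs.reader.SparkNLPReader. A minimal sketch of the new signature in use, assuming headers is a dict of HTTP request headers applied when fetching remote content (the diff shows the parameter but not its semantics, so that reading, and the header values below, are assumptions):

import sparknlp
from sparknlp.reader import SparkNLPReader

spark = sparknlp.start()

# Both dicts are forwarded as-is to the JVM reader; the values here are
# illustrative assumptions, not documented defaults.
params = {"titleFontSize": "12"}
headers = {"User-Agent": "Mozilla/5.0"}

reader = SparkNLPReader(spark, params=params, headers=headers)
html_df = reader.html("https://www.wikipedia.org")
html_df.printSchema()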
@@ -72,7 +75,7 @@ class SparkNLPReader(ExtendedJavaWrapper):
     Examples
     --------
     >>> from sparknlp.reader import SparkNLPReader
-    >>> html_df = SparkNLPReader(
+    >>> html_df = SparkNLPReader().html("https://www.wikipedia.org")
 
     You can also use SparkNLP to simplify the process:
 
@@ -86,7 +89,6 @@ class SparkNLPReader(ExtendedJavaWrapper):
     |https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
     +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
     >>> html_df.printSchema()
-
     root
     |-- url: string (nullable = true)
     |-- html: array (nullable = true)
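Since this hunk prints the top-level schema (a url string plus an html array), a short sketch of post-processing the reader output with stock PySpark is safe to give; only explode and show are used, both standard pyspark.sql functions:

import sparknlp
from pyspark.sql import functions as F

spark = sparknlp.start()
html_df = sparknlp.read().html("https://www.wikipedia.org")

# One input row per URL; explode the element array to get one row per element.
elements = html_df.select("url", F.explode("html").alias("element"))
elements.show(truncate=False)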
@@ -125,13 +127,12 @@ class SparkNLPReader(ExtendedJavaWrapper):
 
     >>> import sparknlp
     >>> email_df = sparknlp.read().email("home/user/emails-directory")
-    >>> email_df.show(
-
-
-
-
-
-    +----------------------------------------------------------------------------------------------------+
+    >>> email_df.show()
+    +---------------------------------------------------+
+    |email                                              |
+    +---------------------------------------------------+
+    |[{Title, Email Text Attachments, {sent_to -> Danilo|
+    +---------------------------------------------------+
     >>> email_df.printSchema()
     root
     |-- path: string (nullable = true)
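The repaired example output shows a single email column whose elements carry metadata such as sent_to, and the schema below it confirms a path column. A hedged sketch of summarising the reader output using only those two documented columns:

import sparknlp
from pyspark.sql import functions as F

spark = sparknlp.start()
email_df = sparknlp.read().email("home/user/emails-directory")

# One row per source file; size() counts the extracted elements per email.
email_df.select("path", F.size("email").alias("n_elements")).show(truncate=False)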
@@ -170,16 +171,17 @@ class SparkNLPReader(ExtendedJavaWrapper):
     >>> doc_df = SparkNLPReader().doc(spark, "home/user/word-directory")
 
     You can use SparkNLP for one line of code
+
     >>> import sparknlp
     >>> doc_df = sparknlp.read().doc("home/user/word-directory")
-    >>> doc_df.show(
-
-
-
-
-
-
-    >>>
+    >>> doc_df.show()
+    +-------------------------------------------------+
+    |doc                                              |
+    +-------------------------------------------------+
+    |[{Table, Header Col 1, {}}, {Table, Header Col 2,|
+    +-------------------------------------------------+
+
+    >>> doc_df.printSchema()
     root
     |-- path: string (nullable = true)
     |-- content: array (nullable = true)
@@ -224,27 +226,27 @@ class SparkNLPReader(ExtendedJavaWrapper):
     >>> xlsDf = SparkNLPReader().xls(spark, "home/user/excel-directory")
 
     You can use SparkNLP for one line of code
+
     >>> import sparknlp
     >>> xlsDf = sparknlp.read().xls("home/user/excel-directory")
-    >>> xlsDf.show(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    | | | |-- value: string (valueContainsNull = true)
+    >>> xlsDf.show()
+    +--------------------------------------------+
+    |xls                                         |
+    +--------------------------------------------+
+    |[{Title, Financial performance, {SheetNam}}]|
+    +--------------------------------------------+
+
+    >>> xlsDf.printSchema()
+    root
+    |-- path: string (nullable = true)
+    |-- content: binary (nullable = true)
+    |-- xls: array (nullable = true)
+    |    |-- element: struct (containsNull = true)
+    |    |    |-- elementType: string (nullable = true)
+    |    |    |-- content: string (nullable = true)
+    |    |    |-- metadata: map (nullable = true)
+    |    |    |    |-- key: string
+    |    |    |    |-- value: string (valueContainsNull = true)
     """
     if not isinstance(docPath, str):
         raise TypeError("docPath must be a string")
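The reconstructed xls schema above is complete: path, binary content, and an xls array of structs with elementType, content, and metadata. That makes a flattening sketch straightforward; the "Title" filter value is taken from the example output in this same hunk:

import sparknlp
from pyspark.sql import functions as F

spark = sparknlp.start()
xls_df = sparknlp.read().xls("home/user/excel-directory")

# Explode the element array and project the struct fields named in printSchema().
elements = (
    xls_df
    .select("path", F.explode("xls").alias("el"))
    .select("path", "el.elementType", "el.content", "el.metadata")
)
elements.filter(F.col("elementType") == "Title").show(truncate=False)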
@@ -259,7 +261,7 @@ class SparkNLPReader(ExtendedJavaWrapper):
     Parameters
     ----------
     docPath : str
-        Path to an
+        Path to an power point document file.
 
     Returns
     -------
@@ -272,14 +274,15 @@ class SparkNLPReader(ExtendedJavaWrapper):
     >>> pptDf = SparkNLPReader().ppt(spark, "home/user/powerpoint-directory")
 
     You can use SparkNLP for one line of code
+
     >>> import sparknlp
     >>> pptDf = sparknlp.read().ppt("home/user/powerpoint-directory")
     >>> pptDf.show(truncate=False)
-
-    |ppt
-
-    |[{Title, Adding a Bullet Slide, {}},
-
+    +-------------------------------------+
+    |ppt                                  |
+    +-------------------------------------+
+    |[{Title, Adding a Bullet Slide, {}},]|
+    +-------------------------------------+
     """
     if not isinstance(docPath, str):
         raise TypeError("docPath must be a string")
@@ -306,14 +309,15 @@ class SparkNLPReader(ExtendedJavaWrapper):
     >>> txtDf = SparkNLPReader().txt(spark, "home/user/txt/files")
 
     You can use SparkNLP for one line of code
+
     >>> import sparknlp
     >>> txtDf = sparknlp.read().txt("home/user/txt/files")
     >>> txtDf.show(truncate=False)
-
-    |txt
-
-    |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}
-
+    +-----------------------------------------------+
+    |txt                                            |
+    +-----------------------------------------------+
+    |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}]|
+    +-----------------------------------------------+
     """
     if not isinstance(docPath, str):
         raise TypeError("docPath must be a string")
|
@@ -21,13 +21,13 @@ class SpacyToAnnotation(ExtendedJavaWrapper):
|
|
|
21
21
|
"""Helper class to load a list of tokens/sentences as JSON to Annotation.
|
|
22
22
|
|
|
23
23
|
The JSON will be in this format:
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
24
|
+
[
|
|
25
|
+
{
|
|
26
|
+
"tokens": ["Hello", "world", "!", "How", "are", "you", "today", "?", "I", "'m", "fine", "thanks", "."],
|
|
27
|
+
"token_spaces": [true, false, true, true, true, true, false, true, false, true, true, false, false],
|
|
28
|
+
"sentence_ends": [2, 7, 12]
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
31
|
|
|
32
32
|
Examples
|
|
33
33
|
--------
|
|
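With the JSON layout restored above, a round-trip example is easy to state. This sketch assumes SpacyToAnnotation's readJsonFile(spark, path) helper from Spark NLP's training API, which this hunk does not itself show:

import json
import sparknlp
from sparknlp.training import SpacyToAnnotation

spark = sparknlp.start()

# Write the exact structure from the docstring to disk first.
sample = [{
    "tokens": ["Hello", "world", "!", "How", "are", "you", "today", "?",
               "I", "'m", "fine", "thanks", "."],
    "token_spaces": [True, False, True, True, True, True, False, True,
                     False, True, True, False, False],
    "sentence_ends": [2, 7, 12],
}]
with open("/tmp/spacy_tokens.json", "w") as f:
    json.dump(sample, f)

# Returns a DataFrame carrying document, sentence, and token annotations.
result = SpacyToAnnotation().readJsonFile(spark, "/tmp/spacy_tokens.json")
result.show(truncate=False)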