spark_nlp-6.0.1-py2.py3-none-any.whl → spark_nlp-6.0.2-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {spark_nlp-6.0.1.dist-info → spark_nlp-6.0.2.dist-info}/METADATA +13 -6
- {spark_nlp-6.0.1.dist-info → spark_nlp-6.0.2.dist-info}/RECORD +36 -30
- {spark_nlp-6.0.1.dist-info → spark_nlp-6.0.2.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +4 -2
- sparknlp/annotator/cv/__init__.py +2 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +5 -10
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +8 -13
- sparknlp/annotator/cv/llava_for_multimodal.py +1 -1
- sparknlp/annotator/cv/paligemma_for_multimodal.py +7 -7
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +1 -1
- sparknlp/annotator/cv/qwen2vl_transformer.py +1 -1
- sparknlp/annotator/cv/smolvlm_transformer.py +7 -13
- sparknlp/annotator/date2_chunk.py +1 -1
- sparknlp/annotator/document_character_text_splitter.py +8 -8
- sparknlp/annotator/document_token_splitter.py +7 -7
- sparknlp/annotator/embeddings/bge_embeddings.py +21 -19
- sparknlp/annotator/embeddings/snowflake_embeddings.py +15 -15
- sparknlp/annotator/openai/openai_completion.py +3 -4
- sparknlp/annotator/seq2seq/m2m100_transformer.py +1 -1
- sparknlp/annotator/seq2seq/mistral_transformer.py +2 -3
- sparknlp/annotator/seq2seq/nllb_transformer.py +1 -1
- sparknlp/annotator/seq2seq/qwen_transformer.py +26 -25
- sparknlp/annotator/spell_check/context_spell_checker.py +1 -1
- sparknlp/base/prompt_assembler.py +1 -1
- sparknlp/common/properties.py +7 -7
- sparknlp/internal/__init__.py +19 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +257 -0
- sparknlp/partition/partition_transformer.py +196 -0
- sparknlp/reader/pdf_to_text.py +50 -4
- sparknlp/reader/sparknlp_reader.py +56 -52
- sparknlp/training/spacy_to_annotation.py +7 -7
- {spark_nlp-6.0.1.dist-info → spark_nlp-6.0.2.dist-info}/top_level.txt +0 -0
sparknlp/annotator/seq2seq/mistral_transformer.py CHANGED

@@ -91,8 +91,7 @@ class MistralTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
 
     References
     ----------
-    - `Mistral 7B
-      <https://mistral.ai/news/announcing-mistral_7b/>`__
+    - `Mistral 7B <https://mistral.ai/news/announcing-mistral_7b/>`__
     - https://github.com/mistralai/mistral-src
 
     **Paper Abstract:**

@@ -126,7 +125,7 @@ class MistralTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
     |result |
     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-    |[Leonardo Da Vinci invented the microscope
+    |[Leonardo Da Vinci invented the microscope?\\n Question: Leonardo Da Vinci invented the microscope?\\n Answer: No, Leonardo Da Vinci did not invent the microscope. The first microscope was invented |
     | in the late 16th century, long after Leonardo'] |
     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
     """
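The repaired docstring above matches the usual generation pipeline for this annotator. As a minimal sketch (the "mistral_7b" model name, column names, and output length are illustrative, taken from the docstring excerpt rather than verified against the release):

    # Minimal sketch of a MistralTransformer pipeline; names are illustrative.
    import sparknlp
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import MistralTransformer
    from pyspark.ml import Pipeline

    spark = sparknlp.start()

    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("documents")

    mistral = MistralTransformer.pretrained("mistral_7b") \
        .setInputCols(["documents"]) \
        .setMaxOutputLength(50) \
        .setOutputCol("generation")

    pipeline = Pipeline().setStages([document_assembler, mistral])
    data = spark.createDataFrame([["Leonardo Da Vinci invented the microscope?"]]).toDF("text")
    pipeline.fit(data).transform(data).select("generation.result").show(truncate=False)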
sparknlp/annotator/seq2seq/nllb_transformer.py CHANGED

@@ -77,7 +77,7 @@ class NLLBTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
         Target Language (Default: `fr`)
 
     Languages Covered
-
+    -----------------
     Acehnese (Arabic script) (ace_Arab), Acehnese (Latin script) (ace_Latn), Mesopotamian Arabic
     (acm_Arab), Ta’izzi-Adeni Arabic (acq_Arab), Tunisian Arabic (aeb_Arab), Afrikaans (afr_Latn),
     South Levantine Arabic (ajp_Arab), Akan (aka_Latn), Amharic (amh_Ethi), North Levantine Arabic
sparknlp/annotator/seq2seq/qwen_transformer.py CHANGED

@@ -52,6 +52,32 @@ class QwenTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
     ``DOCUMENT``           ``DOCUMENT``
     ====================== ======================
 
+    **References**
+
+    - `Qwen Technical Report
+      <https://arxiv.org/pdf/2309.16609.pdf>`__
+    - https://qwenlm.github.io/blog/qwen1.5/
+    - https://github.com/QwenLM/Qwen1.5
+
+    **Paper Abstract:**
+
+    *Large language models (LLMs) have revolutionized the field of artificial intelligence,
+    enabling natural language processing tasks that were previously thought to be exclusive to
+    humans. In this work, we introduce Qwen, the first installment of our large language model
+    series. Qwen is a comprehensive language model series that encompasses distinct models with
+    varying parameter counts. It includes Qwen, the base pretrained language models, and
+    Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models
+    consistently demonstrate superior performance across a multitude of downstream tasks, and the
+    chat models, particularly those trained using Reinforcement Learning from Human Feedback
+    (RLHF), are highly competitive. The chat models possess advanced tool-use and planning
+    capabilities for creating agent applications, showcasing impressive performance even when
+    compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we
+    have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as
+    mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These
+    models demonstrate significantly improved performance in comparison with open-source models,
+    and slightly fall behind the proprietary models.*
+
+
     Parameters
     ----------
     configProtoBytes

@@ -87,31 +113,6 @@ class QwenTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
     This is a very computationally expensive module especially on larger
     sequence. The use of an accelerator such as GPU is recommended.
 
-    References
-    ----------
-    - `Qwen Technical Report
-      <https://arxiv.org/pdf/2309.16609.pdf>`__
-    - https://qwenlm.github.io/blog/qwen1.5/
-    - https://github.com/QwenLM/Qwen1.5
-
-    **Paper Abstract:**
-
-    *Large language models (LLMs) have revolutionized the field of artificial intelligence,
-    enabling natural language processing tasks that were previously thought to be exclusive to
-    humans. In this work, we introduce Qwen, the first installment of our large language model
-    series. Qwen is a comprehensive language model series that encompasses distinct models with
-    varying parameter counts. It includes Qwen, the base pretrained language models, and
-    Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models
-    consistently demonstrate superior performance across a multitude of downstream tasks, and the
-    chat models, particularly those trained using Reinforcement Learning from Human Feedback
-    (RLHF), are highly competitive. The chat models possess advanced tool-use and planning
-    capabilities for creating agent applications, showcasing impressive performance even when
-    compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we
-    have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as
-    mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These
-    models demonstrate significantly improved performance in comparison with open-source models,
-    and slightly fall behind the proprietary models.*
-
     Examples
     --------
     >>> import sparknlp
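The References block was moved, not changed, so usage is unaffected. A minimal sketch, assuming the default pretrained model (column names illustrative):

    # Minimal sketch of a QwenTransformer pipeline; column names are illustrative.
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import QwenTransformer
    from pyspark.ml import Pipeline

    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("documents")

    qwen = QwenTransformer.pretrained() \
        .setInputCols(["documents"]) \
        .setMaxOutputLength(50) \
        .setOutputCol("generation")

    pipeline = Pipeline().setStages([document_assembler, qwen])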
sparknlp/annotator/spell_check/context_spell_checker.py CHANGED

@@ -565,7 +565,7 @@ class ContextSpellCheckerModel(AnnotatorModel, HasEngine):
 
 
     References
-
+    ----------
     For an in-depth explanation of the module see the article `Applying Context
     Aware Spell Checking in Spark NLP
     <https://medium.com/spark-nlp/applying-context-aware-spell-checking-in-spark-nlp-3c29c46963bc>`__.
sparknlp/base/prompt_assembler.py CHANGED

@@ -122,7 +122,7 @@ class PromptAssembler(AnnotatorTransformer):
     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
     |result |
     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-    |[<|start_header_id|>system<|end_header_id
+    |[<|start_header_id|>system<|end_header_id|>\\n\\nYou are a helpful assistant.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\nHello there, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\nI need help with organizing my room.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n]|
     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
     """
 
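The completed line above shows a llama-3-style chat template applied to a messages column. A minimal sketch of how such a DataFrame might be assembled; the (role, content) struct schema and the setChatTemplate call are assumptions based on this docstring, and the template string itself must come from the model's tokenizer config:

    # Minimal sketch for PromptAssembler; the schema and setChatTemplate call
    # are assumptions based on the docstring excerpt above.
    from sparknlp.base import PromptAssembler

    messages = [
        ("system", "You are a helpful assistant."),
        ("assistant", "Hello there, how can I help you?"),
        ("user", "I need help with organizing my room."),
    ]
    df = spark.createDataFrame(
        [(messages,)],
        "messages: array<struct<role: string, content: string>>",
    )

    template = "..."  # paste the chat template from the model's tokenizer_config.json

    prompt_assembler = PromptAssembler() \
        .setInputCol("messages") \
        .setOutputCol("prompt") \
        .setChatTemplate(template)

    prompt_assembler.transform(df).select("prompt.result").show(truncate=False)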
sparknlp/common/properties.py CHANGED

@@ -38,7 +38,7 @@ class HasBatchedAnnotate:
         int
             Current batch size
         """
-        return self.getOrDefault(
+        return self.getOrDefault(self.batchSize)
 
 
 class HasCaseSensitiveProperties:

@@ -245,7 +245,7 @@ class HasBatchedAnnotateImage:
         int
             Current batch size
         """
-        return self.getOrDefault(
+        return self.getOrDefault(self.batchSize)
 
 
 class HasImageFeatureProperties:

@@ -402,7 +402,7 @@ class HasBatchedAnnotateAudio:
         int
             Current batch size
         """
-        return self.getOrDefault(
+        return self.getOrDefault(self.batchSize)
 
 
 class HasAudioFeatureProperties:

@@ -1099,7 +1099,7 @@ class HasLlamaCppProperties:
         return self._set(flashAttention=flashAttention)
 
     def setInputPrefixBos(self, inputPrefixBos: bool):
-        """Whether to add prefix BOS to user inputs, preceding the `--in-prefix`
+        """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
         return self._set(inputPrefixBos=inputPrefixBos)
 
     def setUseMmap(self, useMmap: bool):

@@ -1114,7 +1114,7 @@ class HasLlamaCppProperties:
         """Whether to disable KV offload"""
         return self._set(noKvOffload=noKvOffload)
 
-    def setSystemPrompt(self, systemPrompt:
+    def setSystemPrompt(self, systemPrompt: str):
         """Set a system prompt to use"""
         return self._set(systemPrompt=systemPrompt)
 

@@ -1219,7 +1219,7 @@ class HasLlamaCppProperties:
         """Set the amount of tokens the samplers should return at least (0 = disabled)"""
         return self._set(minKeep=minKeep)
 
-    def setGrammar(self, grammar:
+    def setGrammar(self, grammar: str):
         """Set BNF-like grammar to constrain generations"""
         return self._set(grammar=grammar)
 

@@ -1261,7 +1261,7 @@ class HasLlamaCppProperties:
         return self._call_java("setTokenBias", tokenBias)
 
     def setLoraAdapters(self, loraAdapters: Dict[str, float]):
-        """Set
+        """Set LoRA adapters with their scaling factors"""
         return self._call_java("setLoraAdapters", loraAdapters)
 
     def getMetadata(self):
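All of the repaired methods live in mixins that GGUF-backed annotators inherit. A minimal sketch of the fixed setters and getter in use; AutoGGUFModel as the host annotator, the grammar, and the prompt are illustrative assumptions:

    # Minimal sketch using the setters repaired above; values are illustrative.
    from sparknlp.annotator import AutoGGUFModel

    llm = AutoGGUFModel.pretrained() \
        .setInputCols(["document"]) \
        .setOutputCol("completions") \
        .setBatchSize(4) \
        .setSystemPrompt("You are a concise assistant.") \
        .setGrammar('root ::= "yes" | "no"')  # BNF-like grammar constraining generation

    print(llm.getBatchSize())  # the repaired getter returns the current batch size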
sparknlp/internal/__init__.py CHANGED

@@ -281,6 +281,16 @@ class _Gemma3ForMultiModalLoader(ExtendedJavaWrapper):
             use_openvino
         )
 
+class _InternVLForMultiModalLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark, use_openvino=False):
+        super(_InternVLForMultiModalLoader, self).__init__(
+            "com.johnsnowlabs.nlp.annotators.cv.InternVLForMultiModal.loadSavedModel",
+            path,
+            jspark,
+            use_openvino
+        )
+
+
 class _JanusForMultiModalLoader(ExtendedJavaWrapper):
     def __init__(self, path, jspark, use_openvino=False):
         super(_JanusForMultiModalLoader, self).__init__(

@@ -1146,3 +1156,12 @@ class _SmolVLMTransformerLoader(ExtendedJavaWrapper):
             jspark,
             use_openvino
         )
+
+class _Florence2TransformerLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark, use_openvino=False):
+        super(_Florence2TransformerLoader, self).__init__(
+            "com.johnsnowlabs.nlp.annotators.cv.Florence2Transformer.loadSavedModel",
+            path,
+            jspark,
+            use_openvino,
+        )
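These private wrappers back the loadSavedModel entry points of the new annotators. A minimal sketch of the import flow they enable (the local path and column names are illustrative; the public signature is assumed to mirror the other cv annotators in this file):

    # Minimal sketch of importing an externally exported model; paths illustrative.
    import sparknlp
    from sparknlp.annotator import Florence2Transformer

    spark = sparknlp.start()

    florence2 = Florence2Transformer.loadSavedModel("/models/florence2_export", spark) \
        .setInputCols(["image_assembler"]) \
        .setOutputCol("answer")

    # Persist as a Spark NLP model for later reuse with .load()
    florence2.write().overwrite().save("/models/florence2_spark_nlp")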
sparknlp/partition/__init__.py ADDED

@@ -0,0 +1,16 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module to read various types of documents into chunks"""
+from sparknlp.partition.partition import *
+from sparknlp.partition.partition_transformer import *
sparknlp/partition/partition.py ADDED

@@ -0,0 +1,244 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the Partition annotator for reading and processing various document types."""
+import sparknlp
+from sparknlp.internal import ExtendedJavaWrapper
+
+
+class Partition(ExtendedJavaWrapper):
+    """
+    A unified interface for extracting structured content from various document types
+    using Spark NLP readers.
+
+    This class supports reading from files, URLs, in-memory strings, or byte arrays,
+    and returns parsed output as a structured Spark DataFrame.
+
+    Supported formats include:
+    - Plain text
+    - HTML
+    - Word (.doc/.docx)
+    - Excel (.xls/.xlsx)
+    - PowerPoint (.ppt/.pptx)
+    - Email files (.eml, .msg)
+    - PDFs
+
+    Parameters
+    ----------
+    params : dict, optional
+        Configuration parameters, including:
+
+        - content_type : str
+            Override automatic file type detection.
+        - store_content : bool
+            Include raw file content in the output DataFrame.
+        - timeout : int
+            Timeout for fetching HTML content.
+        - title_font_size : int
+            Font size used to identify titles.
+        - include_page_breaks : bool
+            Tag content with page break metadata.
+        - group_broken_paragraphs : bool
+            Merge broken lines into full paragraphs.
+        - title_length_size : int
+            Max character length to qualify as title.
+        - paragraph_split : str
+            Regex to detect paragraph boundaries.
+        - short_line_word_threshold : int
+            Max words in a line to be considered short.
+        - threshold : float
+            Ratio of empty lines for switching grouping.
+        - max_line_count : int
+            Max lines evaluated in paragraph analysis.
+        - include_slide_notes : bool
+            Include speaker notes in output.
+        - infer_table_structure : bool
+            Generate HTML table structure.
+        - append_cells : bool
+            Merge Excel rows into one block.
+        - cell_separator : str
+            Join cell values in a row.
+        - add_attachment_content : bool
+            Include text of plain-text attachments.
+        - headers : dict
+            Request headers when using URLs.
+
+    Examples
+    --------
+
+    Reading Text Files
+
+    >>> txt_directory = "/content/txtfiles/reader/txt"
+    >>> partition_df = Partition(content_type="text/plain").partition(txt_directory)
+    >>> partition_df.show()
+    >>> partition_df = Partition().partition("./email-files/test-several-attachments.eml")
+    >>> partition_df.show()
+    >>> partition_df = Partition().partition(
+    ...     "https://www.wikipedia.com",
+    ...     headers={"Accept-Language": "es-ES"}
+    ... )
+    >>> partition_df.show()
+    +--------------------+--------------------+
+    |                path|                 txt|
+    +--------------------+--------------------+
+    |file:/content/txt...|[{Title, BIG DATA...|
+    +--------------------+--------------------+
+
+    Reading Email Files
+
+    >>> partition_df = Partition().partition("./email-files/test-several-attachments.eml")
+    >>> partition_df.show()
+    +--------------------+--------------------+
+    |                path|               email|
+    +--------------------+--------------------+
+    |file:/content/ema...|[{Title, Test Sev...|
+    +--------------------+--------------------+
+
+    Reading Webpages
+
+    >>> partition_df = Partition().partition("https://www.wikipedia.com", headers = {"Accept-Language": "es-ES"})
+    >>> partition_df.show()
+    +--------------------+--------------------+
+    |                 url|                html|
+    +--------------------+--------------------+
+    |https://www.wikip...|[{Title, Wikipedi...|
+    +--------------------+--------------------+
+
+    For more examples, refer to:
+    `examples/python/data-preprocessing/SparkNLP_Partition_Reader_Demo.ipynb`
+    """
+    def __init__(self, **kwargs):
+        self.spark = sparknlp.start()
+        params = {}
+        for key, value in kwargs.items():
+            try:
+                params[key] = str(value)
+            except Exception as e:
+                raise ValueError(f"Invalid value for key '{key}': Cannot cast {type(value)} to string. Original error: {e}")
+
+        super(Partition, self).__init__("com.johnsnowlabs.partition.Partition", params)
+
+
+    def partition(self, path, headers=None):
+        """
+        Reads and parses content from a URL, file, or directory path.
+
+        Parameters
+        ----------
+        path : str
+            Path to file or directory. URLs and DFS are supported.
+        headers : dict, optional
+            Headers for URL requests.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            DataFrame with parsed content.
+        """
+        if headers is None:
+            headers = {}
+        jdf = self._java_obj.partition(path, headers)
+        dataframe = self.getDataFrame(self.spark, jdf)
+        return dataframe
+
+
+    def partition_urls(self, path, headers=None):
+        """
+        Reads and parses content from multiple URLs.
+
+        Parameters
+        ----------
+        path : list[str]
+            List of URLs.
+        headers : dict, optional
+            Request headers for URLs.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            DataFrame with parsed URL content.
+
+        Examples
+        --------
+        >>> urls_df = Partition().partition_urls([
+        ...     "https://www.wikipedia.org", "https://example.com/"
+        ... ])
+        >>> urls_df.show()
+        +--------------------+--------------------+
+        |                 url|                html|
+        +--------------------+--------------------+
+        |https://www.wikip...|[{Title, Wikipedi...|
+        |https://example.com/|[{Title, Example ...|
+        +--------------------+--------------------+
+
+        >>> urls_df.printSchema()
+        root
+         |-- url: string (nullable = true)
+         |-- html: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        if headers is None:
+            headers = {}
+        jdf = self._java_obj.partitionUrlsJava(path, headers)
+        dataframe = self.getDataFrame(self.spark, jdf)
+        return dataframe
+
+
+    def partition_text(self, text):
+        """
+        Parses content from a raw text string.
+
+        Parameters
+        ----------
+        text : str
+            Raw text input.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            DataFrame with parsed text.
+
+        Examples
+        --------
+        >>> raw_text = (
+        ...     "The big brown fox\\n"
+        ...     "was walking down the lane.\\n"
+        ...     "\\n"
+        ...     "At the end of the lane,\\n"
+        ...     "the fox met a bear."
+        ... )
+        >>> text_df = Partition(group_broken_paragraphs=True).partition_text(text=raw_text)
+        >>> text_df.show()
+        +--------------------------------------+
+        |txt                                   |
+        +--------------------------------------+
+        |[{NarrativeText, The big brown fox was|
+        +--------------------------------------+
+        >>> text_df.printSchema()
+        root
+         |-- txt: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        jdf = self._java_obj.partitionText(text)
+        dataframe = self.getDataFrame(self.spark, jdf)
+        return dataframe