spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +81 -28
- sparknlp/annotation.py +3 -2
- sparknlp/annotator/__init__.py +6 -0
- sparknlp/annotator/audio/__init__.py +2 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/{base → annotator}/chunk2_doc.py +4 -7
- sparknlp/annotator/chunker.py +1 -2
- sparknlp/annotator/classifier_dl/__init__.py +17 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/spanbert_coref.py +4 -18
- sparknlp/annotator/cv/__init__.py +15 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/dependency_parser.py +2 -3
- sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +37 -1
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +11 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
- sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
- sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
- sparknlp/annotator/embeddings/doc2vec.py +7 -1
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
- sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
- sparknlp/annotator/embeddings/word2vec.py +7 -1
- sparknlp/annotator/embeddings/word_embeddings.py +4 -5
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
- sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
- sparknlp/annotator/er/entity_ruler.py +37 -23
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
- sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
- sparknlp/annotator/lemmatizer.py +3 -4
- sparknlp/annotator/matcher/date_matcher.py +35 -3
- sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
- sparknlp/annotator/matcher/regex_matcher.py +3 -3
- sparknlp/annotator/matcher/text_matcher.py +2 -3
- sparknlp/annotator/n_gram_generator.py +1 -2
- sparknlp/annotator/ner/__init__.py +3 -1
- sparknlp/annotator/ner/ner_converter.py +18 -0
- sparknlp/annotator/ner/ner_crf.py +4 -5
- sparknlp/annotator/ner/ner_dl.py +10 -5
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +2 -2
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +2 -2
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/pos/perceptron.py +6 -7
- sparknlp/annotator/sentence/sentence_detector.py +2 -2
- sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
- sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
- sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
- sparknlp/annotator/seq2seq/__init__.py +17 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
- sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
- sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
- sparknlp/annotator/stemmer.py +2 -3
- sparknlp/annotator/stop_words_cleaner.py +3 -4
- sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
- sparknlp/annotator/token/__init__.py +0 -1
- sparknlp/annotator/token/recursive_tokenizer.py +2 -3
- sparknlp/annotator/token/tokenizer.py +2 -3
- sparknlp/annotator/ws/word_segmenter.py +35 -10
- sparknlp/base/__init__.py +2 -3
- sparknlp/base/doc2_chunk.py +0 -3
- sparknlp/base/document_assembler.py +5 -5
- sparknlp/base/embeddings_finisher.py +14 -2
- sparknlp/base/finisher.py +15 -4
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/image_assembler.py +69 -0
- sparknlp/base/light_pipeline.py +53 -21
- sparknlp/base/multi_document_assembler.py +9 -13
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/token_assembler.py +1 -2
- sparknlp/common/__init__.py +2 -0
- sparknlp/common/annotator_type.py +1 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +914 -9
- sparknlp/internal/__init__.py +841 -116
- sparknlp/internal/annotator_java_ml.py +1 -1
- sparknlp/internal/annotator_transformer.py +3 -0
- sparknlp/logging/comet.py +2 -2
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/pretrained_pipeline.py +1 -1
- sparknlp/pretrained/resource_downloader.py +126 -2
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +1 -0
- sparknlp/training/conll.py +8 -2
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/util.py +26 -0
- spark_nlp-4.2.6.dist-info/METADATA +0 -1256
- spark_nlp-4.2.6.dist-info/RECORD +0 -196
- {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
- /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
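Most of the growth between 4.2.6 and 6.2.1 comes from new annotator modules (audio, computer vision, LLM-style seq2seq wrappers, and many new embeddings), all of which follow Spark NLP's established pipeline pattern. As a rough usage sketch for one of the newly added files (e5_embeddings.py); the column names are illustrative and pretrained() is assumed to resolve a default model:

    # Hedged sketch: assumes the standard Spark NLP annotator API applies
    # to the new E5Embeddings annotator listed above.
    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import E5Embeddings

    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    embeddings = E5Embeddings.pretrained() \
        .setInputCols(["document"]) \
        .setOutputCol("embeddings")

    pipeline = Pipeline(stages=[document_assembler, embeddings])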
sparknlp/internal/annotator_java_ml.py
CHANGED
@@ -22,6 +22,7 @@ class AnnotatorJavaMLReadable(JavaMLReadable):
         """Returns an MLReader instance for this class."""
         return AnnotatorJavaMLReader(cls())
 
+
 class AnnotatorJavaMLReader(JavaMLReader):
     @classmethod
     def _java_loader_class(cls, clazz):
@@ -29,4 +30,3 @@ class AnnotatorJavaMLReader(JavaMLReader):
             return clazz._java_class_name
         else:
             return JavaMLReader._java_loader_class(clazz)
-
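For context, the read() classmethod shown in this hunk is what routes standard Spark ML persistence through Spark NLP's custom reader. A hedged sketch of the round trip (the save path is illustrative):

    # DocumentAssembler extends AnnotatorTransformer, which mixes in
    # AnnotatorJavaMLReadable, so load() resolves through the read()
    # override above and its AnnotatorJavaMLReader.
    from sparknlp.base import DocumentAssembler

    assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    assembler.write().overwrite().save("/tmp/document_assembler")

    restored = DocumentAssembler.load("/tmp/document_assembler")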
sparknlp/internal/annotator_transformer.py
CHANGED
@@ -22,6 +22,9 @@ from sparknlp.internal.annotator_java_ml import AnnotatorJavaMLReadable
 
 
 class AnnotatorTransformer(JavaTransformer, AnnotatorJavaMLReadable, JavaMLWritable, ParamsGettersSetters):
+
+    outputAnnotatorType = None
+
     @keyword_only
     def __init__(self, classname):
         super(AnnotatorTransformer, self).__init__()
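The new outputAnnotatorType class attribute gives every AnnotatorTransformer a declared output annotation type, with None as the base-class default that concrete transformers override. A minimal sketch of the pattern (MyAssembler is invented for illustration; real subclasses such as DocumentAssembler declare theirs the same way):

    from sparknlp.common import AnnotatorType
    from sparknlp.internal.annotator_transformer import AnnotatorTransformer

    # Hypothetical transformer declaring the annotation type it emits
    class MyAssembler(AnnotatorTransformer):
        outputAnnotatorType = AnnotatorType.DOCUMENT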
sparknlp/logging/comet.py
CHANGED
@@ -40,8 +40,8 @@ class CometLogger:
     To log a Spark NLP annotator, it will need an "outputLogPath" parameter, as the
     CometLogger reads the log file generated during the training process.
 
-    For more examples see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For more examples see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/logging/Comet_SparkNLP_Integration.ipynb>`__.
 
     Parameters
     ----------
sparknlp/partition/__init__.py
ADDED
@@ -0,0 +1,16 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module to read various types of documents into chunks"""
+from sparknlp.partition.partition import *
+from sparknlp.partition.partition_transformer import *
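With these wildcard re-exports in place, the new partition API is importable directly from the subpackage. Assuming neither module restricts its public names with __all__, both forms below resolve to the same class:

    from sparknlp.partition import Partition
    # equivalent to:
    from sparknlp.partition.partition import Partition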
sparknlp/partition/partition.py
ADDED
@@ -0,0 +1,244 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the Partition annotator for reading and processing various document types."""
+import sparknlp
+from sparknlp.internal import ExtendedJavaWrapper
+
+
+class Partition(ExtendedJavaWrapper):
+    """
+    A unified interface for extracting structured content from various document types
+    using Spark NLP readers.
+
+    This class supports reading from files, URLs, in-memory strings, or byte arrays,
+    and returns parsed output as a structured Spark DataFrame.
+
+    Supported formats include:
+    - Plain text
+    - HTML
+    - Word (.doc/.docx)
+    - Excel (.xls/.xlsx)
+    - PowerPoint (.ppt/.pptx)
+    - Email files (.eml, .msg)
+    - PDFs
+
+    Parameters
+    ----------
+    params : dict, optional
+        Configuration parameters, including:
+
+        - content_type : str
+            Override automatic file type detection.
+        - store_content : bool
+            Include raw file content in the output DataFrame.
+        - timeout : int
+            Timeout for fetching HTML content.
+        - title_font_size : int
+            Font size used to identify titles.
+        - include_page_breaks : bool
+            Tag content with page break metadata.
+        - group_broken_paragraphs : bool
+            Merge broken lines into full paragraphs.
+        - title_length_size : int
+            Max character length to qualify as title.
+        - paragraph_split : str
+            Regex to detect paragraph boundaries.
+        - short_line_word_threshold : int
+            Max words in a line to be considered short.
+        - threshold : float
+            Ratio of empty lines for switching grouping.
+        - max_line_count : int
+            Max lines evaluated in paragraph analysis.
+        - include_slide_notes : bool
+            Include speaker notes in output.
+        - infer_table_structure : bool
+            Generate HTML table structure.
+        - append_cells : bool
+            Merge Excel rows into one block.
+        - cell_separator : str
+            Join cell values in a row.
+        - add_attachment_content : bool
+            Include text of plain-text attachments.
+        - headers : dict
+            Request headers when using URLs.
+
+    Examples
+    --------
+
+    Reading Text Files
+
+    >>> txt_directory = "/content/txtfiles/reader/txt"
+    >>> partition_df = Partition(content_type="text/plain").partition(txt_directory)
+    >>> partition_df.show()
+    >>> partition_df = Partition().partition("./email-files/test-several-attachments.eml")
+    >>> partition_df.show()
+    >>> partition_df = Partition().partition(
+    ...     "https://www.wikipedia.com",
+    ...     headers={"Accept-Language": "es-ES"}
+    ... )
+    >>> partition_df.show()
+    +--------------------+--------------------+
+    |                path|                 txt|
+    +--------------------+--------------------+
+    |file:/content/txt...|[{Title, BIG DATA...|
+    +--------------------+--------------------+
+
+    Reading Email Files
+
+    >>> partition_df = Partition().partition("./email-files/test-several-attachments.eml")
+    >>> partition_df.show()
+    +--------------------+--------------------+
+    |                path|               email|
+    +--------------------+--------------------+
+    |file:/content/ema...|[{Title, Test Sev...|
+    +--------------------+--------------------+
+
+    Reading Webpages
+
+    >>> partition_df = Partition().partition("https://www.wikipedia.com", headers = {"Accept-Language": "es-ES"})
+    >>> partition_df.show()
+    +--------------------+--------------------+
+    |                 url|                html|
+    +--------------------+--------------------+
+    |https://www.wikip...|[{Title, Wikipedi...|
+    +--------------------+--------------------+
+
+    For more examples, refer to:
+    `examples/python/data-preprocessing/SparkNLP_Partition_Reader_Demo.ipynb`
+    """
+    def __init__(self, **kwargs):
+        self.spark = sparknlp.start()
+        params = {}
+        for key, value in kwargs.items():
+            try:
+                params[key] = str(value)
+            except Exception as e:
+                raise ValueError(f"Invalid value for key '{key}': Cannot cast {type(value)} to string. Original error: {e}")
+
+        super(Partition, self).__init__("com.johnsnowlabs.partition.Partition", params)
+
+
+    def partition(self, path, headers=None):
+        """
+        Reads and parses content from a URL, file, or directory path.
+
+        Parameters
+        ----------
+        path : str
+            Path to file or directory. URLs and DFS are supported.
+        headers : dict, optional
+            Headers for URL requests.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            DataFrame with parsed content.
+        """
+        if headers is None:
+            headers = {}
+        jdf = self._java_obj.partition(path, headers)
+        dataframe = self.getDataFrame(self.spark, jdf)
+        return dataframe
+
+
+    def partition_urls(self, path, headers=None):
+        """
+        Reads and parses content from multiple URLs.
+
+        Parameters
+        ----------
+        path : list[str]
+            List of URLs.
+        headers : dict, optional
+            Request headers for URLs.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            DataFrame with parsed URL content.
+
+        Examples
+        --------
+        >>> urls_df = Partition().partition_urls([
+        ...     "https://www.wikipedia.org", "https://example.com/"
+        ... ])
+        >>> urls_df.show()
+        +--------------------+--------------------+
+        |                 url|                html|
+        +--------------------+--------------------+
+        |https://www.wikip...|[{Title, Wikipedi...|
+        |https://example.com/|[{Title, Example ...|
+        +--------------------+--------------------+
+
+        >>> urls_df.printSchema()
+        root
+         |-- url: string (nullable = true)
+         |-- html: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        if headers is None:
+            headers = {}
+        jdf = self._java_obj.partitionUrlsJava(path, headers)
+        dataframe = self.getDataFrame(self.spark, jdf)
+        return dataframe
+
+
+    def partition_text(self, text):
+        """
+        Parses content from a raw text string.
+
+        Parameters
+        ----------
+        text : str
+            Raw text input.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            DataFrame with parsed text.
+
+        Examples
+        --------
+        >>> raw_text = (
+        ...     "The big brown fox\\n"
+        ...     "was walking down the lane.\\n"
+        ...     "\\n"
+        ...     "At the end of the lane,\\n"
+        ...     "the fox met a bear."
+        ... )
+        >>> text_df = Partition(group_broken_paragraphs=True).partition_text(text=raw_text)
+        >>> text_df.show()
+        +--------------------------------------+
+        |txt                                   |
+        +--------------------------------------+
+        |[{NarrativeText, The big brown fox was|
+        +--------------------------------------+
+        >>> text_df.printSchema()
+        root
+         |-- txt: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        jdf = self._java_obj.partitionText(text)
+        dataframe = self.getDataFrame(self.spark, jdf)
+        return dataframe
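One detail worth noting in the constructor above: every keyword argument is coerced with str() before being passed to the JVM wrapper, so typed options arrive on the Scala side as strings. A small sketch of the consequence (whether the Scala side parses "True" case-insensitively is not shown in this diff):

    # str(True) == "True", so both calls send the same string payload to
    # com.johnsnowlabs.partition.Partition
    p1 = Partition(group_broken_paragraphs=True)
    p2 = Partition(group_broken_paragraphs="True")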