spark-nlp 5.5.2__py2.py3-none-any.whl → 6.0.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of spark-nlp might be problematic. Click here for more details.
- {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/METADATA +20 -11
- {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/RECORD +33 -18
- sparknlp/__init__.py +2 -2
- sparknlp/annotator/classifier_dl/__init__.py +4 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +2 -2
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/cv/__init__.py +6 -1
- sparknlp/annotator/cv/janus_for_multimodal.py +356 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +10 -6
- sparknlp/annotator/embeddings/bge_embeddings.py +7 -3
- sparknlp/annotator/seq2seq/__init__.py +3 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +8 -503
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +333 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +4 -4
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/base/image_assembler.py +58 -0
- sparknlp/common/properties.py +632 -96
- sparknlp/internal/__init__.py +100 -2
- sparknlp/reader/pdf_to_text.py +65 -0
- sparknlp/reader/sparknlp_reader.py +260 -60
- spark_nlp-5.5.2.dist-info/.uuid +0 -1
- {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/WHEEL +0 -0
- {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/top_level.txt +0 -0
|
@@ -15,14 +15,7 @@ from sparknlp.internal import ExtendedJavaWrapper
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class SparkNLPReader(ExtendedJavaWrapper):
|
|
18
|
-
"""Instantiates class to read
|
|
19
|
-
|
|
20
|
-
Two types of input paths are supported,
|
|
21
|
-
|
|
22
|
-
htmlPath: this is a path to a directory of HTML files or a path to an HTML file
|
|
23
|
-
E.g. "path/html/files"
|
|
24
|
-
|
|
25
|
-
url: this is the URL or set of URLs of a website . E.g., "https://www.wikipedia.org"
|
|
18
|
+
"""Instantiates class to read documents in various formats.
|
|
26
19
|
|
|
27
20
|
Parameters
|
|
28
21
|
----------
|
|
@@ -31,66 +24,30 @@ class SparkNLPReader(ExtendedJavaWrapper):
|
|
|
31
24
|
params : dict, optional
|
|
32
25
|
Parameter with custom configuration
|
|
33
26
|
|
|
27
|
+
Notes
|
|
28
|
+
-----
|
|
29
|
+
This class can read HTML, email, PDF, MS Word, Excel, PowerPoint, and text files.
|
|
30
|
+
|
|
34
31
|
Examples
|
|
35
32
|
--------
|
|
36
33
|
>>> from sparknlp.reader import SparkNLPReader
|
|
37
|
-
>>>
|
|
34
|
+
>>> reader = SparkNLPReader(spark)
|
|
38
35
|
|
|
39
|
-
|
|
36
|
+
# Reading HTML
|
|
37
|
+
>>> html_df = reader.html("https://www.wikipedia.org")
|
|
38
|
+
>>> # Or with shorthand
|
|
40
39
|
>>> import sparknlp
|
|
41
40
|
>>> html_df = sparknlp.read().html("https://www.wikipedia.org")
|
|
42
|
-
>>> html_df.show(truncate=False)
|
|
43
|
-
|
|
44
|
-
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
45
|
-
|url |html |
|
|
46
|
-
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
47
|
-
|https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
|
|
48
|
-
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
49
|
-
>>> html_df.printSchema()
|
|
50
|
-
|
|
51
|
-
root
|
|
52
|
-
|-- url: string (nullable = true)
|
|
53
|
-
|-- html: array (nullable = true)
|
|
54
|
-
| |-- element: struct (containsNull = true)
|
|
55
|
-
| | |-- elementType: string (nullable = true)
|
|
56
|
-
| | |-- content: string (nullable = true)
|
|
57
|
-
| | |-- metadata: map (nullable = true)
|
|
58
|
-
| | | |-- key: string
|
|
59
|
-
| | | |-- value: string (valueContainsNull = true)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
Instantiates class to read email files.
|
|
64
41
|
|
|
65
|
-
|
|
66
|
-
"
|
|
42
|
+
# Reading PDF
|
|
43
|
+
>>> pdf_df = reader.pdf("home/user/pdfs-directory")
|
|
44
|
+
>>> # Or with shorthand
|
|
45
|
+
>>> pdf_df = sparknlp.read().pdf("home/user/pdfs-directory")
|
|
67
46
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
>>>
|
|
71
|
-
>>> email_df = SparkNLPReader().email(spark, "home/user/emails-directory")
|
|
72
|
-
|
|
73
|
-
You can use SparkNLP for one line of code
|
|
74
|
-
>>> import sparknlp
|
|
47
|
+
# Reading Email
|
|
48
|
+
>>> email_df = reader.email("home/user/emails-directory")
|
|
49
|
+
>>> # Or with shorthand
|
|
75
50
|
>>> email_df = sparknlp.read().email("home/user/emails-directory")
|
|
76
|
-
>>> email_df.show(truncate=False)
|
|
77
|
-
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
78
|
-
|email |
|
|
79
|
-
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
80
|
-
|[{Title, Email Text Attachments, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>}}, {NarrativeText, Email test with two text attachments\r\n\r\nCheers,\r\n\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {NarrativeText, <html>\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\r\n<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>\r\n</head>\r\n<body dir="ltr">\r\n<span style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">Email test with two text attachments</span>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\nCheers,</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n</body>\r\n</html>\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/html}}, {Attachment, filename.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename.txt"}}, {NarrativeText, This is the content of the file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {Attachment, filename2.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano 
<danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename2.txt"}}, {NarrativeText, This is an additional content file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}]|
|
|
81
|
-
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
82
|
-
email_df.printSchema()
|
|
83
|
-
root
|
|
84
|
-
|-- path: string (nullable = true)
|
|
85
|
-
|-- content: array (nullable = true)
|
|
86
|
-
|-- email: array (nullable = true)
|
|
87
|
-
| |-- element: struct (containsNull = true)
|
|
88
|
-
| | |-- elementType: string (nullable = true)
|
|
89
|
-
| | |-- content: string (nullable = true)
|
|
90
|
-
| | |-- metadata: map (nullable = true)
|
|
91
|
-
| | | |-- key: string
|
|
92
|
-
| | | |-- value: string (valueContainsNull = true)
|
|
93
|
-
|
|
94
51
|
"""
|
|
95
52
|
|
|
96
53
|
def __init__(self, spark, params=None):
|
|
@@ -100,6 +57,46 @@ class SparkNLPReader(ExtendedJavaWrapper):
|
|
|
100
57
|
self.spark = spark
|
|
101
58
|
|
|
102
59
|
def html(self, htmlPath):
|
|
60
|
+
"""Reads HTML files or URLs and returns a Spark DataFrame.
|
|
61
|
+
|
|
62
|
+
Parameters
|
|
63
|
+
----------
|
|
64
|
+
htmlPath : str or list of str
|
|
65
|
+
Path(s) to HTML file(s) or a list of URLs.
|
|
66
|
+
|
|
67
|
+
Returns
|
|
68
|
+
-------
|
|
69
|
+
pyspark.sql.DataFrame
|
|
70
|
+
A DataFrame containing the parsed HTML content.
|
|
71
|
+
|
|
72
|
+
Examples
|
|
73
|
+
--------
|
|
74
|
+
>>> from sparknlp.reader import SparkNLPReader
|
|
75
|
+
>>> html_df = SparkNLPReader(spark).html("https://www.wikipedia.org")
|
|
76
|
+
|
|
77
|
+
You can also use SparkNLP to simplify the process:
|
|
78
|
+
|
|
79
|
+
>>> import sparknlp
|
|
80
|
+
>>> html_df = sparknlp.read().html("https://www.wikipedia.org")
|
|
81
|
+
>>> html_df.show(truncate=False)
|
|
82
|
+
|
|
83
|
+
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
84
|
+
|url |html |
|
|
85
|
+
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
86
|
+
|https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
|
|
87
|
+
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
88
|
+
>>> html_df.printSchema()
|
|
89
|
+
|
|
90
|
+
root
|
|
91
|
+
|-- url: string (nullable = true)
|
|
92
|
+
|-- html: array (nullable = true)
|
|
93
|
+
| |-- element: struct (containsNull = true)
|
|
94
|
+
| | |-- elementType: string (nullable = true)
|
|
95
|
+
| | |-- content: string (nullable = true)
|
|
96
|
+
| | |-- metadata: map (nullable = true)
|
|
97
|
+
| | | |-- key: string
|
|
98
|
+
| | | |-- value: string (valueContainsNull = true)
|
|
99
|
+
"""
|
|
103
100
|
if not isinstance(htmlPath, (str, list)) or (isinstance(htmlPath, list) and not all(isinstance(item, str) for item in htmlPath)):
|
|
104
101
|
raise TypeError("htmlPath must be a string or a list of strings")
|
|
105
102
|
jdf = self._java_obj.html(htmlPath)
|
|
@@ -107,6 +104,47 @@ class SparkNLPReader(ExtendedJavaWrapper):
|
|
|
107
104
|
return dataframe
|
|
108
105
|
|
|
109
106
|
def email(self, filePath):
|
|
107
|
+
"""Reads email files and returns a Spark DataFrame.
|
|
108
|
+
|
|
109
|
+
Parameters
|
|
110
|
+
----------
|
|
111
|
+
filePath : str
|
|
112
|
+
Path to an email file or a directory containing emails.
|
|
113
|
+
|
|
114
|
+
Returns
|
|
115
|
+
-------
|
|
116
|
+
pyspark.sql.DataFrame
|
|
117
|
+
A DataFrame containing parsed email data.
|
|
118
|
+
|
|
119
|
+
Examples
|
|
120
|
+
--------
|
|
121
|
+
>>> from sparknlp.reader import SparkNLPReader
|
|
122
|
+
>>> email_df = SparkNLPReader(spark).email("home/user/emails-directory")
|
|
123
|
+
|
|
124
|
+
You can also use SparkNLP to simplify the process:
|
|
125
|
+
|
|
126
|
+
>>> import sparknlp
|
|
127
|
+
>>> email_df = sparknlp.read().email("home/user/emails-directory")
|
|
128
|
+
>>> email_df.show(truncate=False)
|
|
129
|
+
|
|
130
|
+
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
131
|
+
|email |
|
|
132
|
+
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
133
|
+
|[{Title, Email Text Attachments, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>}}, {NarrativeText, Email test with two text attachments\r\n\r\nCheers,\r\n\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {NarrativeText, <html>\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\r\n<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>\r\n</head>\r\n<body dir="ltr">\r\n<span style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">Email test with two text attachments</span>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\nCheers,</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n</body>\r\n</html>\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/html}}, {Attachment, filename.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename.txt"}}, {NarrativeText, This is the content of the file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {Attachment, filename2.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano 
<danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename2.txt"}}, {NarrativeText, This is an additional content file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}]|
|
|
134
|
+
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
135
|
+
>>> email_df.printSchema()
|
|
136
|
+
root
|
|
137
|
+
|-- path: string (nullable = true)
|
|
138
|
+
|-- content: array (nullable = true)
|
|
139
|
+
|-- email: array (nullable = true)
|
|
140
|
+
| |-- element: struct (containsNull = true)
|
|
141
|
+
| | |-- elementType: string (nullable = true)
|
|
142
|
+
| | |-- content: string (nullable = true)
|
|
143
|
+
| | |-- metadata: map (nullable = true)
|
|
144
|
+
| | | |-- key: string
|
|
145
|
+
| | | |-- value: string (valueContainsNull = true)
|
|
146
|
+
|
|
147
|
+
"""
|
|
110
148
|
if not isinstance(filePath, str):
|
|
111
149
|
raise TypeError("filePath must be a string")
|
|
112
150
|
jdf = self._java_obj.email(filePath)
|
|
@@ -114,8 +152,170 @@ class SparkNLPReader(ExtendedJavaWrapper):
|
|
|
114
152
|
return dataframe
|
|
115
153
|
|
|
116
154
|
def doc(self, docPath):
|
|
155
|
+
"""Reads word document files and returns a Spark DataFrame.
|
|
156
|
+
|
|
157
|
+
Parameters
|
|
158
|
+
----------
|
|
159
|
+
docPath : str
|
|
160
|
+
Path to a Word document file or a directory containing Word documents.
|
|
161
|
+
|
|
162
|
+
Returns
|
|
163
|
+
-------
|
|
164
|
+
pyspark.sql.DataFrame
|
|
165
|
+
A DataFrame containing parsed document content.
|
|
166
|
+
|
|
167
|
+
Examples
|
|
168
|
+
--------
|
|
169
|
+
>>> from sparknlp.reader import SparkNLPReader
|
|
170
|
+
>>> doc_df = SparkNLPReader(spark).doc("home/user/word-directory")
|
|
171
|
+
|
|
172
|
+
You can also use SparkNLP to simplify the process:
|
|
173
|
+
>>> import sparknlp
|
|
174
|
+
>>> doc_df = sparknlp.read().doc("home/user/word-directory")
|
|
175
|
+
>>> doc_df.show(truncate=False)
|
|
176
|
+
|
|
177
|
+
+----------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
178
|
+
|doc | |
|
|
179
|
+
+----------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
180
|
+
|[{Table, Header Col 1, {}}, {Table, Header Col 2, {}}, {Table, Lorem ipsum, {}}, {Table, A Link example, {}}, {NarrativeText, Dolor sit amet, {}}] |
|
|
181
|
+
+----------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
182
|
+
>>> doc_df.printSchema()
|
|
183
|
+
root
|
|
184
|
+
|-- path: string (nullable = true)
|
|
185
|
+
|-- content: array (nullable = true)
|
|
186
|
+
|-- doc: array (nullable = true)
|
|
187
|
+
| |-- element: struct (containsNull = true)
|
|
188
|
+
| | |-- elementType: string (nullable = true)
|
|
189
|
+
| | |-- content: string (nullable = true)
|
|
190
|
+
| | |-- metadata: map (nullable = true)
|
|
191
|
+
| | | |-- key: string
|
|
192
|
+
| | | |-- value: string (valueContainsNull = true)
|
|
193
|
+
|
|
194
|
+
"""
|
|
117
195
|
if not isinstance(docPath, str):
|
|
118
196
|
raise TypeError("docPath must be a string")
|
|
119
197
|
jdf = self._java_obj.doc(docPath)
|
|
120
198
|
dataframe = self.getDataFrame(self.spark, jdf)
|
|
121
|
-
return dataframe
|
|
199
|
+
return dataframe
|
|
200
|
+
|
|
201
|
+
def pdf(self, pdfPath):
    """Reads PDF files and returns a Spark DataFrame.

    The path is handed to the ``pdf`` method of the wrapped Java reader
    (``self._java_obj``) and the Java result is converted with
    ``self.getDataFrame`` using ``self.spark``.

    Parameters
    ----------
    pdfPath : str
        Path forwarded to the Java reader.
        NOTE(review): presumably a PDF file or a directory of PDF files,
        as with the sibling readers -- confirm against the Scala side.

    Returns
    -------
    pyspark.sql.DataFrame
        Whatever ``self.getDataFrame`` produces for the Java result.
        NOTE(review): the column layout is not visible from this method.

    Raises
    ------
    TypeError
        If ``pdfPath`` is not a string.
    """
    if not isinstance(pdfPath, str):
        # Name the parameter this method actually takes; the previous
        # message ("docPath ...") was copied from the sibling readers.
        raise TypeError("pdfPath must be a string")
    jdf = self._java_obj.pdf(pdfPath)
    return self.getDataFrame(self.spark, jdf)
|
|
207
|
+
|
|
208
|
+
def xls(self, docPath):
    r"""Reads excel document files and returns a Spark DataFrame.

    Parameters
    ----------
    docPath : str
        Path to an excel document file (the examples below pass a
        directory path).

    Returns
    -------
    pyspark.sql.DataFrame
        A DataFrame containing parsed document content.

    Raises
    ------
    TypeError
        If ``docPath`` is not a string.

    Examples
    --------
    >>> from sparknlp.reader import SparkNLPReader
    >>> xlsDf = SparkNLPReader(spark).xls("home/user/excel-directory")

    The same read as a one-liner through the ``sparknlp`` entry point:

    >>> import sparknlp
    >>> xlsDf = sparknlp.read().xls("home/user/excel-directory")
    >>> xlsDf.show(truncate=False)

    ``show`` prints a single ``xls`` column whose value is one array of
    parsed elements. The elements of the sample output are listed here
    one per line for readability::

        {Title, Financial performance, {SheetName -> Index}}
        {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}
        {NarrativeText, Quarterly revenue\tNine quarters to 30 June 2023\t\t\t1.0, {SheetName -> Index}}
        {NarrativeText, Group financial performance\tFY 22\tFY 23\t\t2.0, {SheetName -> Index}}
        {NarrativeText, Segmental results\tFY 22\tFY 23\t\t3.0, {SheetName -> Index}}
        {NarrativeText, Segmental analysis\tFY 22\tFY 23\t\t4.0, {SheetName -> Index}}
        {NarrativeText, Cash flow\tFY 22\tFY 23\t\t5.0, {SheetName -> Index}}
        {Title, Operational metrics, {SheetName -> Index}}
        {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}
        {NarrativeText, Mobile customers\tNine quarters to 30 June 2023\t\t\t6.0, {SheetName -> Index}}
        {NarrativeText, Fixed broadband customers\tNine quarters to 30 June 2023\t\t\t7.0, {SheetName -> Index}}
        {NarrativeText, Marketable homes passed\tNine quarters to 30 June 2023\t\t\t8.0, {SheetName -> Index}}
        {NarrativeText, TV customers\tNine quarters to 30 June 2023\t\t\t9.0, {SheetName -> Index}}
        {NarrativeText, Converged customers\tNine quarters to 30 June 2023\t\t\t10.0, {SheetName -> Index}}
        {NarrativeText, Mobile churn\tNine quarters to 30 June 2023\t\t\t11.0, {SheetName -> Index}}
        {NarrativeText, Mobile data usage\tNine quarters to 30 June 2023\t\t\t12.0, {SheetName -> Index}}
        {NarrativeText, Mobile ARPU\tNine quarters to 30 June 2023\t\t\t13.0, {SheetName -> Index}}
        {Title, Other, {SheetName -> Index}}
        {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}
        {NarrativeText, Average foreign exchange rates\tNine quarters to 30 June 2023\t\t\t14.0, {SheetName -> Index}}
        {NarrativeText, Guidance rates\tFY 23/24\t\t\t14.0, {SheetName -> Index}}

    >>> xlsDf.printSchema()
    root
     |-- path: string (nullable = true)
     |-- content: binary (nullable = true)
     |-- xls: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- elementType: string (nullable = true)
     |    |    |-- content: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
    """
    if isinstance(docPath, str):
        # Delegate parsing to the Java reader, then wrap its result.
        java_frame = self._java_obj.xls(docPath)
        return self.getDataFrame(self.spark, java_frame)
    raise TypeError("docPath must be a string")
|
|
254
|
+
|
|
255
|
+
def ppt(self, docPath):
    """Reads power point document files and returns a Spark DataFrame.

    Parameters
    ----------
    docPath : str
        Path to a power point document file (the examples below pass a
        directory path).

    Returns
    -------
    pyspark.sql.DataFrame
        A DataFrame containing parsed document content.

    Raises
    ------
    TypeError
        If ``docPath`` is not a string.

    Examples
    --------
    >>> from sparknlp.reader import SparkNLPReader
    >>> pptDf = SparkNLPReader(spark).ppt("home/user/powerpoint-directory")

    The same read as a one-liner through the ``sparknlp`` entry point:

    >>> import sparknlp
    >>> pptDf = sparknlp.read().ppt("home/user/powerpoint-directory")
    >>> pptDf.show(truncate=False)

    ``show`` prints a single ``ppt`` column whose value is one array of
    parsed elements. The elements of the sample output are listed here
    one per line for readability::

        {Title, Adding a Bullet Slide, {}}
        {ListItem, • Find the bullet slide layout, {}}
        {ListItem, – Use _TextFrame.text for first bullet, {}}
        {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}
        {NarrativeText, Here is a lot of text!, {}}
        {NarrativeText, Here is some text in a text box!, {}}
    """
    path_is_text = isinstance(docPath, str)
    if not path_is_text:
        raise TypeError("docPath must be a string")
    # Delegate parsing to the Java reader, then wrap its result.
    parsed = self._java_obj.ppt(docPath)
    return self.getDataFrame(self.spark, parsed)
|
|
289
|
+
|
|
290
|
+
def txt(self, docPath):
    r"""Reads TXT files and returns a Spark DataFrame.

    Parameters
    ----------
    docPath : str
        Path to a TXT file (the examples below pass a directory path).

    Returns
    -------
    pyspark.sql.DataFrame
        A DataFrame containing parsed document content.

    Raises
    ------
    TypeError
        If ``docPath`` is not a string.

    Examples
    --------
    >>> from sparknlp.reader import SparkNLPReader
    >>> txtDf = SparkNLPReader(spark).txt("home/user/txt/files")

    The same read as a one-liner through the ``sparknlp`` entry point:

    >>> import sparknlp
    >>> txtDf = sparknlp.read().txt("home/user/txt/files")
    >>> txtDf.show(truncate=False)

    ``show`` prints a single ``txt`` column whose value is one array of
    parsed elements. The elements of the sample output are listed here
    one per line for readability::

        {Title, BIG DATA ANALYTICS, {paragraph -> 0}}
        {NarrativeText, Apache Spark is a fast and general-purpose cluster computing system.\nIt provides high-level APIs in Java, Scala, Python, and R., {paragraph -> 0}}
        {Title, MACHINE LEARNING, {paragraph -> 1}}
        {NarrativeText, Spark's MLlib provides scalable machine learning algorithms.\nIt includes tools for classification, regression, clustering, and more., {paragraph -> 1}}
    """
    if isinstance(docPath, str):
        # Delegate parsing to the Java reader, then wrap its result.
        java_df = self._java_obj.txt(docPath)
        dataframe = self.getDataFrame(self.spark, java_df)
        return dataframe
    raise TypeError("docPath must be a string")
|
spark_nlp-5.5.2.dist-info/.uuid
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
90f78083-0ee0-43e9-8240-7263731b6707
|
|
File without changes
|
|
File without changes
|