spark-nlp 5.5.2__py2.py3-none-any.whl → 6.0.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of spark-nlp might be problematic. Click here for more details.

Files changed (34)
  1. {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/METADATA +20 -11
  2. {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/RECORD +33 -18
  3. sparknlp/__init__.py +2 -2
  4. sparknlp/annotator/classifier_dl/__init__.py +4 -0
  5. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  6. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +2 -2
  7. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  8. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  9. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  10. sparknlp/annotator/cleaners/__init__.py +15 -0
  11. sparknlp/annotator/cleaners/cleaner.py +202 -0
  12. sparknlp/annotator/cleaners/extractor.py +191 -0
  13. sparknlp/annotator/cv/__init__.py +6 -1
  14. sparknlp/annotator/cv/janus_for_multimodal.py +356 -0
  15. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  16. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  17. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  18. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  19. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +10 -6
  20. sparknlp/annotator/embeddings/bge_embeddings.py +7 -3
  21. sparknlp/annotator/seq2seq/__init__.py +3 -0
  22. sparknlp/annotator/seq2seq/auto_gguf_model.py +8 -503
  23. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +333 -0
  24. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  25. sparknlp/annotator/seq2seq/llama3_transformer.py +4 -4
  26. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  27. sparknlp/base/image_assembler.py +58 -0
  28. sparknlp/common/properties.py +632 -96
  29. sparknlp/internal/__init__.py +100 -2
  30. sparknlp/reader/pdf_to_text.py +65 -0
  31. sparknlp/reader/sparknlp_reader.py +260 -60
  32. spark_nlp-5.5.2.dist-info/.uuid +0 -1
  33. {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/WHEEL +0 -0
  34. {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/top_level.txt +0 -0
@@ -15,14 +15,7 @@ from sparknlp.internal import ExtendedJavaWrapper
15
15
 
16
16
 
17
17
  class SparkNLPReader(ExtendedJavaWrapper):
18
- """Instantiates class to read HTML files.
19
-
20
- Two types of input paths are supported,
21
-
22
- htmlPath: this is a path to a directory of HTML files or a path to an HTML file
23
- E.g. "path/html/files"
24
-
25
- url: this is the URL or set of URLs of a website . E.g., "https://www.wikipedia.org"
18
+ """Instantiates class to read documents in various formats.
26
19
 
27
20
  Parameters
28
21
  ----------
@@ -31,66 +24,30 @@ class SparkNLPReader(ExtendedJavaWrapper):
31
24
  params : dict, optional
32
25
  Parameter with custom configuration
33
26
 
27
+ Notes
28
+ -----
29
+ This class can read HTML, email, PDF, MS Word, Excel, PowerPoint, and text files.
30
+
34
31
  Examples
35
32
  --------
36
33
  >>> from sparknlp.reader import SparkNLPReader
37
- >>> html_df = SparkNLPReader().html(spark, "https://www.wikipedia.org")
34
+ >>> reader = SparkNLPReader(spark)
38
35
 
39
- You can use SparkNLP for one line of code
36
+ # Reading HTML
37
+ >>> html_df = reader.html("https://www.wikipedia.org")
38
+ >>> # Or with shorthand
40
39
  >>> import sparknlp
41
40
  >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
42
- >>> html_df.show(truncate=False)
43
-
44
- +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
45
- |url |html |
46
- +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
47
- |https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
48
- +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
49
- >>> html_df.printSchema()
50
-
51
- root
52
- |-- url: string (nullable = true)
53
- |-- html: array (nullable = true)
54
- | |-- element: struct (containsNull = true)
55
- | | |-- elementType: string (nullable = true)
56
- | | |-- content: string (nullable = true)
57
- | | |-- metadata: map (nullable = true)
58
- | | | |-- key: string
59
- | | | |-- value: string (valueContainsNull = true)
60
-
61
-
62
-
63
- Instantiates class to read email files.
64
41
 
65
- emailPath: this is a path to a directory of HTML files or a path to an HTML file E.g.
66
- "path/html/emails"
42
+ # Reading PDF
43
+ >>> pdf_df = reader.pdf("home/user/pdfs-directory")
44
+ >>> # Or with shorthand
45
+ >>> pdf_df = sparknlp.read().pdf("home/user/pdfs-directory")
67
46
 
68
- Examples
69
- --------
70
- >>> from sparknlp.reader import SparkNLPReader
71
- >>> email_df = SparkNLPReader().email(spark, "home/user/emails-directory")
72
-
73
- You can use SparkNLP for one line of code
74
- >>> import sparknlp
47
+ # Reading Email
48
+ >>> email_df = reader.email("home/user/emails-directory")
49
+ >>> # Or with shorthand
75
50
  >>> email_df = sparknlp.read().email("home/user/emails-directory")
76
- >>> email_df.show(truncate=False)
77
- +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
78
- |email |
79
- +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
80
- |[{Title, Email Text Attachments, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>}}, {NarrativeText, Email test with two text attachments\r\n\r\nCheers,\r\n\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {NarrativeText, <html>\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\r\n<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>\r\n</head>\r\n<body dir="ltr">\r\n<span style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">Email&nbsp; test with two text attachments</span>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\nCheers,</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n</body>\r\n</html>\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/html}}, {Attachment, filename.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename.txt"}}, {NarrativeText, This is the content of the file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {Attachment, filename2.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano 
<danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename2.txt"}}, {NarrativeText, This is an additional content file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}]|
81
- +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
82
- email_df.printSchema()
83
- root
84
- |-- path: string (nullable = true)
85
- |-- content: array (nullable = true)
86
- |-- email: array (nullable = true)
87
- | |-- element: struct (containsNull = true)
88
- | | |-- elementType: string (nullable = true)
89
- | | |-- content: string (nullable = true)
90
- | | |-- metadata: map (nullable = true)
91
- | | | |-- key: string
92
- | | | |-- value: string (valueContainsNull = true)
93
-
94
51
  """
95
52
 
96
53
  def __init__(self, spark, params=None):
@@ -100,6 +57,46 @@ class SparkNLPReader(ExtendedJavaWrapper):
100
57
  self.spark = spark
101
58
 
102
59
  def html(self, htmlPath):
60
+ """Reads HTML files or URLs and returns a Spark DataFrame.
61
+
62
+ Parameters
63
+ ----------
64
+ htmlPath : str or list of str
65
+ Path(s) to HTML file(s) or a list of URLs.
66
+
67
+ Returns
68
+ -------
69
+ pyspark.sql.DataFrame
70
+ A DataFrame containing the parsed HTML content.
71
+
72
+ Examples
73
+ --------
74
+ >>> from sparknlp.reader import SparkNLPReader
75
+ >>> html_df = SparkNLPReader(spark).html("https://www.wikipedia.org")
76
+
77
+ You can also use SparkNLP to simplify the process:
78
+
79
+ >>> import sparknlp
80
+ >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
81
+ >>> html_df.show(truncate=False)
82
+
83
+ +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
84
+ |url |html |
85
+ +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
86
+ |https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
87
+ +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
88
+ >>> html_df.printSchema()
89
+
90
+ root
91
+ |-- url: string (nullable = true)
92
+ |-- html: array (nullable = true)
93
+ | |-- element: struct (containsNull = true)
94
+ | | |-- elementType: string (nullable = true)
95
+ | | |-- content: string (nullable = true)
96
+ | | |-- metadata: map (nullable = true)
97
+ | | | |-- key: string
98
+ | | | |-- value: string (valueContainsNull = true)
99
+ """
103
100
  if not isinstance(htmlPath, (str, list)) or (isinstance(htmlPath, list) and not all(isinstance(item, str) for item in htmlPath)):
104
101
  raise TypeError("htmlPath must be a string or a list of strings")
105
102
  jdf = self._java_obj.html(htmlPath)
@@ -107,6 +104,47 @@ class SparkNLPReader(ExtendedJavaWrapper):
107
104
  return dataframe
108
105
 
109
106
  def email(self, filePath):
107
+ """Reads email files and returns a Spark DataFrame.
108
+
109
+ Parameters
110
+ ----------
111
+ filePath : str
112
+ Path to an email file or a directory containing emails.
113
+
114
+ Returns
115
+ -------
116
+ pyspark.sql.DataFrame
117
+ A DataFrame containing parsed email data.
118
+
119
+ Examples
120
+ --------
121
+ >>> from sparknlp.reader import SparkNLPReader
122
+ >>> email_df = SparkNLPReader(spark).email("home/user/emails-directory")
123
+
124
+ You can also use SparkNLP to simplify the process:
125
+
126
+ >>> import sparknlp
127
+ >>> email_df = sparknlp.read().email("home/user/emails-directory")
128
+ >>> email_df.show(truncate=False)
129
+
130
+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
131
+ |email |
132
+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
133
+ |[{Title, Email Text Attachments, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>}}, {NarrativeText, Email test with two text attachments\r\n\r\nCheers,\r\n\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {NarrativeText, <html>\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\r\n<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>\r\n</head>\r\n<body dir="ltr">\r\n<span style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">Email&nbsp; test with two text attachments</span>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\nCheers,</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n</body>\r\n</html>\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/html}}, {Attachment, filename.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename.txt"}}, {NarrativeText, This is the content of the file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {Attachment, filename2.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano 
<danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename2.txt"}}, {NarrativeText, This is an additional content file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}]|
134
+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
135
+ >>> email_df.printSchema()
136
+ root
137
+ |-- path: string (nullable = true)
138
+ |-- content: array (nullable = true)
139
+ |-- email: array (nullable = true)
140
+ | |-- element: struct (containsNull = true)
141
+ | | |-- elementType: string (nullable = true)
142
+ | | |-- content: string (nullable = true)
143
+ | | |-- metadata: map (nullable = true)
144
+ | | | |-- key: string
145
+ | | | |-- value: string (valueContainsNull = true)
146
+
147
+ """
110
148
  if not isinstance(filePath, str):
111
149
  raise TypeError("filePath must be a string")
112
150
  jdf = self._java_obj.email(filePath)
@@ -114,8 +152,170 @@ class SparkNLPReader(ExtendedJavaWrapper):
114
152
  return dataframe
115
153
 
116
154
  def doc(self, docPath):
155
+ """Reads word document files and returns a Spark DataFrame.
156
+
157
+ Parameters
158
+ ----------
159
+ docPath : str
160
+ Path to a word document file.
161
+
162
+ Returns
163
+ -------
164
+ pyspark.sql.DataFrame
165
+ A DataFrame containing parsed document content.
166
+
167
+ Examples
168
+ --------
169
+ >>> from sparknlp.reader import SparkNLPReader
170
+ >>> doc_df = SparkNLPReader(spark).doc("home/user/word-directory")
171
+
172
+ You can also use SparkNLP to do this in a single line of code:
173
+ >>> import sparknlp
174
+ >>> doc_df = sparknlp.read().doc("home/user/word-directory")
175
+ >>> doc_df.show(truncate=False)
176
+
177
+ +----------------------------------------------------------------------------------------------------------------------------------------------------+
178
+ |doc |
179
+ +----------------------------------------------------------------------------------------------------------------------------------------------------+
180
+ |[{Table, Header Col 1, {}}, {Table, Header Col 2, {}}, {Table, Lorem ipsum, {}}, {Table, A Link example, {}}, {NarrativeText, Dolor sit amet, {}}] |
181
+ +----------------------------------------------------------------------------------------------------------------------------------------------------+
182
+ >>> doc_df.printSchema()
183
+ root
184
+ |-- path: string (nullable = true)
185
+ |-- content: array (nullable = true)
186
+ |-- doc: array (nullable = true)
187
+ | |-- element: struct (containsNull = true)
188
+ | | |-- elementType: string (nullable = true)
189
+ | | |-- content: string (nullable = true)
190
+ | | |-- metadata: map (nullable = true)
191
+ | | | |-- key: string
192
+ | | | |-- value: string (valueContainsNull = true)
193
+
194
+ """
117
195
  if not isinstance(docPath, str):
118
196
  raise TypeError("docPath must be a string")
119
197
  jdf = self._java_obj.doc(docPath)
120
198
  dataframe = self.getDataFrame(self.spark, jdf)
121
- return dataframe
199
+ return dataframe
200
+
201
+ def pdf(self, pdfPath):
202
+ if not isinstance(pdfPath, str):
203
+ raise TypeError("docPath must be a string")
204
+ jdf = self._java_obj.pdf(pdfPath)
205
+ dataframe = self.getDataFrame(self.spark, jdf)
206
+ return dataframe
207
+
208
+ def xls(self, docPath):
209
+ """Reads excel document files and returns a Spark DataFrame.
210
+
211
+ Parameters
212
+ ----------
213
+ docPath : str
214
+ Path to an excel document file.
215
+
216
+ Returns
217
+ -------
218
+ pyspark.sql.DataFrame
219
+ A DataFrame containing parsed document content.
220
+
221
+ Examples
222
+ --------
223
+ >>> from sparknlp.reader import SparkNLPReader
224
+ >>> xlsDf = SparkNLPReader(spark).xls("home/user/excel-directory")
225
+
226
+ You can use SparkNLP for one line of code
227
+ >>> import sparknlp
228
+ >>> xlsDf = sparknlp.read().xls("home/user/excel-directory")
229
+ >>> xlsDf.show(truncate=False)
230
+
231
+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
232
+ |xls |
233
+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
234
+ |[{Title, Financial performance, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Quarterly revenue\tNine quarters to 30 June 2023\t\t\t1.0, {SheetName -> Index}}, {NarrativeText, Group financial performance\tFY 22\tFY 23\t\t2.0, {SheetName -> Index}}, {NarrativeText, Segmental results\tFY 22\tFY 23\t\t3.0, {SheetName -> Index}}, {NarrativeText, Segmental analysis\tFY 22\tFY 23\t\t4.0, {SheetName -> Index}}, {NarrativeText, Cash flow\tFY 22\tFY 23\t\t5.0, {SheetName -> Index}}, {Title, Operational metrics, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Mobile customers\tNine quarters to 30 June 2023\t\t\t6.0, {SheetName -> Index}}, {NarrativeText, Fixed broadband customers\tNine quarters to 30 June 2023\t\t\t7.0, {SheetName -> Index}}, {NarrativeText, Marketable homes passed\tNine quarters to 30 June 2023\t\t\t8.0, {SheetName -> Index}}, {NarrativeText, TV customers\tNine quarters to 30 June 2023\t\t\t9.0, {SheetName -> Index}}, {NarrativeText, Converged customers\tNine quarters to 30 June 2023\t\t\t10.0, {SheetName -> Index}}, {NarrativeText, Mobile churn\tNine quarters to 30 June 2023\t\t\t11.0, {SheetName -> Index}}, {NarrativeText, Mobile data usage\tNine quarters to 30 June 2023\t\t\t12.0, {SheetName -> Index}}, {NarrativeText, Mobile ARPU\tNine quarters to 30 June 2023\t\t\t13.0, {SheetName -> Index}}, {Title, Other, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Average foreign exchange rates\tNine quarters to 30 June 2023\t\t\t14.0, {SheetName -> Index}}, {NarrativeText, Guidance rates\tFY 23/24\t\t\t14.0, {SheetName -> Index}}]|
235
+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
236
+
237
+ >>> xlsDf.printSchema()
238
+ root
239
+ |-- path: string (nullable = true)
240
+ |-- content: binary (nullable = true)
241
+ |-- xls: array (nullable = true)
242
+ | |-- element: struct (containsNull = true)
243
+ | | |-- elementType: string (nullable = true)
244
+ | | |-- content: string (nullable = true)
245
+ | | |-- metadata: map (nullable = true)
246
+ | | | |-- key: string
247
+ | | | |-- value: string (valueContainsNull = true)
248
+ """
249
+ if not isinstance(docPath, str):
250
+ raise TypeError("docPath must be a string")
251
+ jdf = self._java_obj.xls(docPath)
252
+ dataframe = self.getDataFrame(self.spark, jdf)
253
+ return dataframe
254
+
255
+ def ppt(self, docPath):
256
+ """
257
+ Reads power point document files and returns a Spark DataFrame.
258
+
259
+ Parameters
260
+ ----------
261
+ docPath : str
262
+ Path to a PowerPoint document file.
263
+
264
+ Returns
265
+ -------
266
+ pyspark.sql.DataFrame
267
+ A DataFrame containing parsed document content.
268
+
269
+ Examples
270
+ --------
271
+ >>> from sparknlp.reader import SparkNLPReader
272
+ >>> pptDf = SparkNLPReader(spark).ppt("home/user/powerpoint-directory")
273
+
274
+ You can use SparkNLP for one line of code
275
+ >>> import sparknlp
276
+ >>> pptDf = sparknlp.read().ppt("home/user/powerpoint-directory")
277
+ >>> pptDf.show(truncate=False)
278
+ +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
279
+ |ppt |
280
+ +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
281
+ |[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|
282
+ +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
283
+ """
284
+ if not isinstance(docPath, str):
285
+ raise TypeError("docPath must be a string")
286
+ jdf = self._java_obj.ppt(docPath)
287
+ dataframe = self.getDataFrame(self.spark, jdf)
288
+ return dataframe
289
+
290
+ def txt(self, docPath):
291
+ """Reads TXT files and returns a Spark DataFrame.
292
+
293
+ Parameters
294
+ ----------
295
+ docPath : str
296
+ Path to a TXT file.
297
+
298
+ Returns
299
+ -------
300
+ pyspark.sql.DataFrame
301
+ A DataFrame containing parsed document content.
302
+
303
+ Examples
304
+ --------
305
+ >>> from sparknlp.reader import SparkNLPReader
306
+ >>> txtDf = SparkNLPReader(spark).txt("home/user/txt/files")
307
+
308
+ You can use SparkNLP for one line of code
309
+ >>> import sparknlp
310
+ >>> txtDf = sparknlp.read().txt("home/user/txt/files")
311
+ >>> txtDf.show(truncate=False)
312
+ +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
313
+ |txt |
314
+ +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
315
+ |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}, {NarrativeText, Apache Spark is a fast and general-purpose cluster computing system.\nIt provides high-level APIs in Java, Scala, Python, and R., {paragraph -> 0}}, {Title, MACHINE LEARNING, {paragraph -> 1}}, {NarrativeText, Spark's MLlib provides scalable machine learning algorithms.\nIt includes tools for classification, regression, clustering, and more., {paragraph -> 1}}]|
316
+ +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
317
+ """
318
+ if not isinstance(docPath, str):
319
+ raise TypeError("docPath must be a string")
320
+ jdf = self._java_obj.txt(docPath)
321
+ return self.getDataFrame(self.spark, jdf)
@@ -1 +0,0 @@
1
- 90f78083-0ee0-43e9-8240-7263731b6707