spark-nlp: spark_nlp-5.5.3-py2.py3-none-any.whl → spark_nlp-6.0.1-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of spark-nlp might be problematic. Click here for more details.

Files changed (37)
  1. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/METADATA +20 -11
  2. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/RECORD +36 -17
  3. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/WHEEL +1 -1
  4. sparknlp/__init__.py +2 -2
  5. sparknlp/annotator/classifier_dl/__init__.py +4 -0
  6. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  7. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +2 -2
  8. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  9. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  10. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  11. sparknlp/annotator/cleaners/__init__.py +15 -0
  12. sparknlp/annotator/cleaners/cleaner.py +202 -0
  13. sparknlp/annotator/cleaners/extractor.py +191 -0
  14. sparknlp/annotator/cv/__init__.py +9 -1
  15. sparknlp/annotator/cv/gemma3_for_multimodal.py +351 -0
  16. sparknlp/annotator/cv/janus_for_multimodal.py +356 -0
  17. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  18. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  19. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  20. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  21. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  22. sparknlp/annotator/cv/smolvlm_transformer.py +432 -0
  23. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +10 -6
  24. sparknlp/annotator/seq2seq/__init__.py +3 -0
  25. sparknlp/annotator/seq2seq/auto_gguf_model.py +8 -503
  26. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +333 -0
  27. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  28. sparknlp/annotator/seq2seq/llama3_transformer.py +4 -4
  29. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  30. sparknlp/base/image_assembler.py +58 -0
  31. sparknlp/common/properties.py +605 -96
  32. sparknlp/internal/__init__.py +127 -2
  33. sparknlp/reader/enums.py +19 -0
  34. sparknlp/reader/pdf_to_text.py +111 -0
  35. sparknlp/reader/sparknlp_reader.py +222 -14
  36. spark_nlp-5.5.3.dist-info/.uuid +0 -1
  37. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/top_level.txt +0 -0
sparknlp/internal/__init__.py +127 -2
@@ -67,6 +67,15 @@ class _AlbertForZeroShotClassificationLoader(ExtendedJavaWrapper):
67
67
  )
68
68
 
69
69
 
70
+ class _AlbertMultipleChoiceLoader(ExtendedJavaWrapper):
71
+ def __init__(self, path, jspark):
72
+ super(_AlbertMultipleChoiceLoader, self).__init__(
73
+ "com.johnsnowlabs.nlp.annotators.classifier.dl.AlbertForMultipleChoice.loadSavedModel",
74
+ path,
75
+ jspark,
76
+ )
77
+
78
+
70
79
  class _BertLoader(ExtendedJavaWrapper):
71
80
  def __init__(self, path, jspark, use_openvino=False):
72
81
  super(_BertLoader, self).__init__(
@@ -121,6 +130,15 @@ class _BertMultipleChoiceLoader(ExtendedJavaWrapper):
121
130
  jspark,
122
131
  )
123
132
 
133
+ class _CoHereLoader(ExtendedJavaWrapper):
134
+ def __init__(self, path, jspark, use_openvino=False):
135
+ super(_CoHereLoader, self).__init__(
136
+ "com.johnsnowlabs.nlp.annotators.seq2seq.CoHereTransformer.loadSavedModel",
137
+ path,
138
+ jspark,
139
+ use_openvino,
140
+ )
141
+
124
142
  class _DeBERTaLoader(ExtendedJavaWrapper):
125
143
  def __init__(self, path, jspark):
126
144
  super(_DeBERTaLoader, self).__init__(
@@ -211,6 +229,15 @@ class _DistilBertQuestionAnsweringLoader(ExtendedJavaWrapper):
211
229
  )
212
230
 
213
231
 
232
+ class _DistilBertMultipleChoiceLoader(ExtendedJavaWrapper):
233
+ def __init__(self, path, jspark):
234
+ super(_DistilBertMultipleChoiceLoader, self).__init__(
235
+ "com.johnsnowlabs.nlp.annotators.classifier.dl.DistilBertForMultipleChoice.loadSavedModel",
236
+ path,
237
+ jspark,
238
+ )
239
+
240
+
214
241
  class _ElmoLoader(ExtendedJavaWrapper):
215
242
  def __init__(self, path, jspark):
216
243
  super(_ElmoLoader, self).__init__(
@@ -245,6 +272,23 @@ class _GPT2Loader(ExtendedJavaWrapper):
245
272
  jspark,
246
273
  )
247
274
 
275
+ class _Gemma3ForMultiModalLoader(ExtendedJavaWrapper):
276
+ def __init__(self, path, jspark, use_openvino=False):
277
+ super(_Gemma3ForMultiModalLoader, self).__init__(
278
+ "com.johnsnowlabs.nlp.annotators.cv.Gemma3ForMultiModal.loadSavedModel",
279
+ path,
280
+ jspark,
281
+ use_openvino
282
+ )
283
+
284
+ class _JanusForMultiModalLoader(ExtendedJavaWrapper):
285
+ def __init__(self, path, jspark, use_openvino=False):
286
+ super(_JanusForMultiModalLoader, self).__init__(
287
+ "com.johnsnowlabs.nlp.annotators.cv.JanusForMultiModal.loadSavedModel",
288
+ path,
289
+ jspark,
290
+ use_openvino
291
+ )
248
292
 
249
293
  class _LLAMA2Loader(ExtendedJavaWrapper):
250
294
  def __init__(self, path, jspark, use_openvino=False):
@@ -299,6 +343,14 @@ class _LongformerQuestionAnsweringLoader(ExtendedJavaWrapper):
299
343
  jspark,
300
344
  )
301
345
 
346
+ class _LLAVAForMultiModalLoader(ExtendedJavaWrapper):
347
+ def __init__(self, path, jspark, use_openvino=False):
348
+ super(_LLAVAForMultiModalLoader, self).__init__(
349
+ "com.johnsnowlabs.nlp.annotators.cv.LLAVAForMultiModal.loadSavedModel",
350
+ path,
351
+ jspark,
352
+ use_openvino
353
+ )
302
354
 
303
355
  class _M2M100Loader(ExtendedJavaWrapper):
304
356
  def __init__(self, path, jspark, use_openvino=False):
@@ -318,6 +370,14 @@ class _MistralLoader(ExtendedJavaWrapper):
318
370
  use_openvino,
319
371
  )
320
372
 
373
+ class _MLLamaForMultimodalLoader(ExtendedJavaWrapper):
374
+ def __init__(self, path, jspark, use_openvino=False):
375
+ super(_MLLamaForMultimodalLoader, self).__init__(
376
+ "com.johnsnowlabs.nlp.annotators.cv.MLLamaForMultimodal.loadSavedModel",
377
+ path,
378
+ jspark,
379
+ use_openvino
380
+ )
321
381
 
322
382
  class _NLLBLoader(ExtendedJavaWrapper):
323
383
  def __init__(self, path, jspark, use_openvino=False):
@@ -345,6 +405,10 @@ class _MPNetLoader(ExtendedJavaWrapper):
345
405
  )
346
406
 
347
407
 
408
+ class _OLMoLoader(ExtendedJavaWrapper):
409
+ def __init__(self, path, jspark):
410
+ super(_OLMoLoader, self).__init__(
411
+ "com.johnsnowlabs.nlp.annotators.seq2seq.OLMoTransformer.loadSavedModel", path, jspark)
348
412
  class _Phi2Loader(ExtendedJavaWrapper):
349
413
  def __init__(self, path, jspark, use_openvino=False):
350
414
  super(_Phi2Loader, self).__init__(
@@ -363,6 +427,15 @@ class _Phi3Loader(ExtendedJavaWrapper):
363
427
  use_openvino,
364
428
  )
365
429
 
430
+ class _Phi3VisionLoader(ExtendedJavaWrapper):
431
+ def __init__(self, path, jspark, use_openvino=False):
432
+ super(_Phi3VisionLoader, self).__init__(
433
+ "com.johnsnowlabs.nlp.annotators.cv.Phi3Vision.loadSavedModel",
434
+ path,
435
+ jspark,
436
+ use_openvino
437
+ )
438
+
366
439
  class _RoBertaLoader(ExtendedJavaWrapper):
367
440
  def __init__(self, path, jspark, use_openvino=False):
368
441
  super(_RoBertaLoader, self).__init__(
@@ -409,6 +482,15 @@ class _RoBertaQuestionAnsweringLoader(ExtendedJavaWrapper):
409
482
  )
410
483
 
411
484
 
485
+ class _RoBertaMultipleChoiceLoader(ExtendedJavaWrapper):
486
+ def __init__(self, path, jspark):
487
+ super(_RoBertaMultipleChoiceLoader, self).__init__(
488
+ "com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForMultipleChoice.loadSavedModel",
489
+ path,
490
+ jspark,
491
+ )
492
+
493
+
412
494
  class _StarCoderLoader(ExtendedJavaWrapper):
413
495
  def __init__(self, path, jspark, use_openvino=False):
414
496
  super(_StarCoderLoader, self).__init__(
@@ -504,6 +586,15 @@ class _XlmRoBertaQuestionAnsweringLoader(ExtendedJavaWrapper):
504
586
  )
505
587
 
506
588
 
589
+ class _XlmRoBertaMultipleChoiceLoader(ExtendedJavaWrapper):
590
+ def __init__(self, path, jspark):
591
+ super(_XlmRoBertaMultipleChoiceLoader, self).__init__(
592
+ "com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForMultipleChoice.loadSavedModel",
593
+ path,
594
+ jspark,
595
+ )
596
+
597
+
507
598
  class _XlnetLoader(ExtendedJavaWrapper):
508
599
  def __init__(self, path, jspark):
509
600
  super(_XlnetLoader, self).__init__(
@@ -992,8 +1083,8 @@ class _AutoGGUFLoader(ExtendedJavaWrapper):
992
1083
  def __init__(self, path, jspark):
993
1084
  super(_AutoGGUFLoader, self).__init__(
994
1085
  "com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel.loadSavedModel", path, jspark)
995
-
996
-
1086
+
1087
+
997
1088
  class _MxbaiEmbeddingsLoader(ExtendedJavaWrapper):
998
1089
  def __init__(self, path, jspark):
999
1090
  super(_MxbaiEmbeddingsLoader, self).__init__(
@@ -1021,3 +1112,37 @@ class _BLIPForQuestionAnswering(ExtendedJavaWrapper):
1021
1112
  path,
1022
1113
  jspark,
1023
1114
  )
1115
+
1116
+
1117
+ class _AutoGGUFVisionLoader(ExtendedJavaWrapper):
1118
+ def __init__(self, modelPath, mmprojPath, jspark):
1119
+ super(_AutoGGUFVisionLoader, self).__init__(
1120
+ "com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFVisionModel.loadSavedModel", modelPath, mmprojPath, jspark)
1121
+
1122
+
1123
+ class _Qwen2VLTransformerLoader(ExtendedJavaWrapper):
1124
+ def __init__(self, path, jspark, use_openvino=False):
1125
+ super(_Qwen2VLTransformerLoader, self).__init__(
1126
+ "com.johnsnowlabs.nlp.annotators.cv.Qwen2VLTransformer.loadSavedModel",
1127
+ path,
1128
+ jspark,
1129
+ use_openvino,
1130
+ )
1131
+
1132
+ class _PaliGemmaForMultiModalLoader(ExtendedJavaWrapper):
1133
+ def __init__(self, path, jspark, use_openvino=False):
1134
+ super(_PaliGemmaForMultiModalLoader, self).__init__(
1135
+ "com.johnsnowlabs.nlp.annotators.cv.PaliGemmaForMultiModal.loadSavedModel",
1136
+ path,
1137
+ jspark,
1138
+ use_openvino,
1139
+ )
1140
+
1141
+ class _SmolVLMTransformerLoader(ExtendedJavaWrapper):
1142
+ def __init__(self, path, jspark, use_openvino=False):
1143
+ super(_SmolVLMTransformerLoader, self).__init__(
1144
+ "com.johnsnowlabs.nlp.annotators.cv.SmolVLMTransformer.loadSavedModel",
1145
+ path,
1146
+ jspark,
1147
+ use_openvino
1148
+ )
sparknlp/reader/enums.py +19 -0
@@ -0,0 +1,19 @@
1
+ # Copyright 2017-2025 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from enum import Enum
15
+
16
+ class TextStripperType(Enum):
17
+ """Text Stripper Type"""
18
+ PDF_TEXT_STRIPPER = "PDFTextStripper"
19
+ PDF_LAYOUT_TEXT_STRIPPER = "PDFLayoutTextStripper"
sparknlp/reader/pdf_to_text.py +111 -0
@@ -0,0 +1,111 @@
1
+ from pyspark import keyword_only
2
+ from pyspark.ml.param import Param, Params, TypeConverters
3
+ from pyspark.ml.param.shared import HasInputCol, HasOutputCol
4
+ from pyspark.ml.util import JavaMLReadable, JavaMLWritable
5
+ from pyspark.ml.wrapper import JavaTransformer
6
+
7
+ from sparknlp.reader.enums import TextStripperType
8
+
9
+
10
+ class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
11
+ JavaMLReadable, JavaMLWritable):
12
+ """
13
+ Extract text from Pdf document to single string or to several strings per each page.
14
+ Input is a column with binary representation of PDF document.
15
+ As output generate column with text and page number.
16
+ Explode each page as separate row if split to page enabled.
17
+ """
18
+ pageNumCol = Param(Params._dummy(), "pageNumCol",
19
+ "Page number output column name.",
20
+ typeConverter=TypeConverters.toString)
21
+
22
+ partitionNum = Param(Params._dummy(), "partitionNum",
23
+ "Number of partitions.",
24
+ typeConverter=TypeConverters.toInt)
25
+
26
+ storeSplittedPdf = Param(Params._dummy(), "storeSplittedPdf",
27
+ "Force to store splitted pdf.",
28
+ typeConverter=TypeConverters.toBoolean)
29
+
30
+ splitPage = Param(Params._dummy(), "splitPage",
31
+ "Param for enable/disable splitting document per page",
32
+ typeConverter=TypeConverters.toBoolean)
33
+
34
+ textStripper = Param(Params._dummy(), "textStripper",
35
+ "Text stripper type used for output layout and formatting",
36
+ typeConverter=TypeConverters.toString)
37
+
38
+ sort = Param(Params._dummy(), "sort",
39
+ "Param for enable/disable sort lines",
40
+ typeConverter=TypeConverters.toBoolean)
41
+
42
+ onlyPageNum = Param(Params._dummy(), "onlyPageNum",
43
+ "Force to extract only number of pages",
44
+ typeConverter=TypeConverters.toBoolean)
45
+
46
+ @keyword_only
47
+ def __init__(self):
48
+ """
49
+ __init__(self)
50
+ """
51
+ super(PdfToText, self).__init__()
52
+ self._java_obj = self._new_java_obj("com.johnsnowlabs.reader.PdfToText", self.uid)
53
+
54
+ def setInputCol(self, value):
55
+ """
56
+ Sets the value of :py:attr:`inputCol`.
57
+ """
58
+ return self._set(inputCol=value)
59
+
60
+ def setOutputCol(self, value):
61
+ """
62
+ Sets the value of :py:attr:`outputCol`.
63
+ """
64
+ return self._set(outputCol=value)
65
+
66
+ def setPageNumCol(self, value):
67
+ """
68
+ Sets the value of :py:attr:`pageNumCol`.
69
+ """
70
+ return self._set(pageNumCol=value)
71
+
72
+ def setPartitionNum(self, value):
73
+ """
74
+ Sets the value of :py:attr:`partitionNum`.
75
+ """
76
+ return self._set(partitionNum=value)
77
+
78
+ def setStoreSplittedPdf(self, value):
79
+ """
80
+ Sets the value of :py:attr:`storeSplittedPdf`.
81
+ """
82
+ return self._set(storeSplittedPdf=value)
83
+
84
+ def setSplitPage(self, value):
85
+ """
86
+ Sets the value of :py:attr:`splitPage`.
87
+ """
88
+ return self._set(splitPage=value)
89
+
90
+ def setOnlyPageNum(self, value):
91
+ """
92
+ Sets the value of :py:attr:`onlyPageNum`.
93
+ """
94
+ return self._set(onlyPageNum=value)
95
+
96
+ def setTextStripper(self, value):
97
+ """
98
+ Sets the value of :py:attr:`textStripper`.
99
+ """
100
+ if isinstance(value, TextStripperType):
101
+ value = value.value
102
+ if value not in [i.value for i in TextStripperType]:
103
+ type_value = type(value)
104
+ raise ValueError(f"Param textStripper must be a 'TextStripperType' enum but got {type_value}.")
105
+ return self._set(textStripper=str(value))
106
+
107
+ def setSort(self, value):
108
+ """
109
+ Sets the value of :py:attr:`sort`.
110
+ """
111
+ return self._set(sort=value)
sparknlp/reader/sparknlp_reader.py +222 -14
@@ -15,19 +15,39 @@ from sparknlp.internal import ExtendedJavaWrapper
15
15
 
16
16
 
17
17
  class SparkNLPReader(ExtendedJavaWrapper):
18
- """Instantiates class to read HTML, email, and document files.
19
-
20
- Two types of input paths are supported:
21
-
22
- - `htmlPath`: A path to a directory of HTML files or a single HTML file (e.g., `"path/html/files"`).
23
- - `url`: A single URL or a set of URLs (e.g., `"https://www.wikipedia.org"`).
18
+ """Instantiates class to read documents in various formats.
24
19
 
25
20
  Parameters
26
21
  ----------
27
- spark : SparkSession
28
- The active Spark session.
22
+ params : spark
23
+ Spark session
29
24
  params : dict, optional
30
- A dictionary with custom configurations.
25
+ Parameter with custom configuration
26
+
27
+ Notes
28
+ -----
29
+ This class can read HTML, email, PDF, MS Word, Excel, PowerPoint, and text files.
30
+
31
+ Examples
32
+ --------
33
+ >>> from sparknlp.reader import SparkNLPReader
34
+ >>> reader = SparkNLPReader(spark)
35
+
36
+ # Reading HTML
37
+ >>> html_df = reader.html("https://www.wikipedia.org")
38
+ >>> # Or with shorthand
39
+ >>> import sparknlp
40
+ >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
41
+
42
+ # Reading PDF
43
+ >>> pdf_df = reader.pdf("home/user/pdfs-directory")
44
+ >>> # Or with shorthand
45
+ >>> pdf_df = sparknlp.read().pdf("home/user/pdfs-directory")
46
+
47
+ # Reading Email
48
+ >>> email_df = reader.email("home/user/emails-directory")
49
+ >>> # Or with shorthand
50
+ >>> email_df = sparknlp.read().email("home/user/emails-directory")
31
51
  """
32
52
 
33
53
  def __init__(self, spark, params=None):
@@ -59,11 +79,29 @@ class SparkNLPReader(ExtendedJavaWrapper):
59
79
  >>> import sparknlp
60
80
  >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
61
81
  >>> html_df.show(truncate=False)
82
+
83
+ +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
84
+ |url |html |
85
+ +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
86
+ |https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
87
+ +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
88
+ >>> html_df.printSchema()
89
+
90
+ root
91
+ |-- url: string (nullable = true)
92
+ |-- html: array (nullable = true)
93
+ | |-- element: struct (containsNull = true)
94
+ | | |-- elementType: string (nullable = true)
95
+ | | |-- content: string (nullable = true)
96
+ | | |-- metadata: map (nullable = true)
97
+ | | | |-- key: string
98
+ | | | |-- value: string (valueContainsNull = true)
62
99
  """
63
100
  if not isinstance(htmlPath, (str, list)) or (isinstance(htmlPath, list) and not all(isinstance(item, str) for item in htmlPath)):
64
101
  raise TypeError("htmlPath must be a string or a list of strings")
65
102
  jdf = self._java_obj.html(htmlPath)
66
- return self.getDataFrame(self.spark, jdf)
103
+ dataframe = self.getDataFrame(self.spark, jdf)
104
+ return dataframe
67
105
 
68
106
  def email(self, filePath):
69
107
  """Reads email files and returns a Spark DataFrame.
@@ -83,31 +121,201 @@ class SparkNLPReader(ExtendedJavaWrapper):
83
121
  >>> from sparknlp.reader import SparkNLPReader
84
122
  >>> email_df = SparkNLPReader(spark).email("home/user/emails-directory")
85
123
 
86
- Using SparkNLP:
124
+ You can also use SparkNLP to simplify the process:
87
125
 
88
126
  >>> import sparknlp
89
127
  >>> email_df = sparknlp.read().email("home/user/emails-directory")
90
128
  >>> email_df.show(truncate=False)
129
+
130
+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
131
+ |email |
132
+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
133
+ |[{Title, Email Text Attachments, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>}}, {NarrativeText, Email test with two text attachments\r\n\r\nCheers,\r\n\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {NarrativeText, <html>\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\r\n<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>\r\n</head>\r\n<body dir="ltr">\r\n<span style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">Email&nbsp; test with two text attachments</span>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\nCheers,</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n</body>\r\n</html>\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/html}}, {Attachment, filename.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename.txt"}}, {NarrativeText, This is the content of the file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {Attachment, filename2.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano 
<danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename2.txt"}}, {NarrativeText, This is an additional content file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}]|
134
+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
135
+ >>> email_df.printSchema()
136
+ root
137
+ |-- path: string (nullable = true)
138
+ |-- content: array (nullable = true)
139
+ |-- email: array (nullable = true)
140
+ | |-- element: struct (containsNull = true)
141
+ | | |-- elementType: string (nullable = true)
142
+ | | |-- content: string (nullable = true)
143
+ | | |-- metadata: map (nullable = true)
144
+ | | | |-- key: string
145
+ | | | |-- value: string (valueContainsNull = true)
146
+
91
147
  """
92
148
  if not isinstance(filePath, str):
93
149
  raise TypeError("filePath must be a string")
94
150
  jdf = self._java_obj.email(filePath)
95
- return self.getDataFrame(self.spark, jdf)
151
+ dataframe = self.getDataFrame(self.spark, jdf)
152
+ return dataframe
96
153
 
97
154
  def doc(self, docPath):
98
- """Reads document files and returns a Spark DataFrame.
155
+ """Reads word document files and returns a Spark DataFrame.
99
156
 
100
157
  Parameters
101
158
  ----------
102
159
  docPath : str
103
- Path to a document file.
160
+ Path to a word document file.
104
161
 
105
162
  Returns
106
163
  -------
107
164
  pyspark.sql.DataFrame
108
165
  A DataFrame containing parsed document content.
166
+
167
+ Examples
168
+ --------
169
+ >>> from sparknlp.reader import SparkNLPReader
170
+ >>> doc_df = SparkNLPReader().doc(spark, "home/user/word-directory")
171
+
172
+ You can use SparkNLP for one line of code
173
+ >>> import sparknlp
174
+ >>> doc_df = sparknlp.read().doc("home/user/word-directory")
175
+ >>> doc_df.show(truncate=False)
176
+
177
+ +----------------------------------------------------------------------------------------------------------------------------------------------------+
178
+ |doc | |
179
+ +----------------------------------------------------------------------------------------------------------------------------------------------------+
180
+ |[{Table, Header Col 1, {}}, {Table, Header Col 2, {}}, {Table, Lorem ipsum, {}}, {Table, A Link example, {}}, {NarrativeText, Dolor sit amet, {}}] |
181
+ +----------------------------------------------------------------------------------------------------------------------------------------------------+
182
+ >>> docsDf.printSchema()
183
+ root
184
+ |-- path: string (nullable = true)
185
+ |-- content: array (nullable = true)
186
+ |-- doc: array (nullable = true)
187
+ | |-- element: struct (containsNull = true)
188
+ | | |-- elementType: string (nullable = true)
189
+ | | |-- content: string (nullable = true)
190
+ | | |-- metadata: map (nullable = true)
191
+ | | | |-- key: string
192
+ | | | |-- value: string (valueContainsNull = true)
193
+
109
194
  """
110
195
  if not isinstance(docPath, str):
111
196
  raise TypeError("docPath must be a string")
112
197
  jdf = self._java_obj.doc(docPath)
198
+ dataframe = self.getDataFrame(self.spark, jdf)
199
+ return dataframe
200
+
201
+ def pdf(self, pdfPath):
202
+ if not isinstance(pdfPath, str):
203
+ raise TypeError("docPath must be a string")
204
+ jdf = self._java_obj.pdf(pdfPath)
205
+ dataframe = self.getDataFrame(self.spark, jdf)
206
+ return dataframe
207
+
208
def xls(self, docPath):
    """Reads Excel document files and returns a Spark DataFrame.

    Parameters
    ----------
    docPath : str
        Path to an Excel document file.

    Returns
    -------
    pyspark.sql.DataFrame
        A DataFrame containing parsed document content.

    Raises
    ------
    TypeError
        If ``docPath`` is not a string.

    Examples
    --------
    >>> from sparknlp.reader import SparkNLPReader
    >>> xlsDf = SparkNLPReader(spark).xls("home/user/excel-directory")

    You can use SparkNLP for one line of code

    >>> import sparknlp
    >>> xlsDf = sparknlp.read().xls("home/user/excel-directory")
    >>> xlsDf.printSchema()
    root
     |-- path: string (nullable = true)
     |-- content: binary (nullable = true)
     |-- xls: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- elementType: string (nullable = true)
     |    |    |-- content: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)

    Each entry of the ``xls`` column is an
    ``{elementType, content, metadata}`` struct, for example
    ``{Title, Financial performance, {SheetName -> Index}}`` or
    ``{NarrativeText, Cash flow ..., {SheetName -> Index}}``; cell values
    of a row are tab-separated inside ``content``.
    """
    # NOTE(review): the first example assumes the SparkNLPReader constructor
    # accepts the SparkSession (this method reads self.spark) -- confirm.
    if not isinstance(docPath, str):
        raise TypeError("docPath must be a string")
    jdf = self._java_obj.xls(docPath)
    return self.getDataFrame(self.spark, jdf)
254
+
255
def ppt(self, docPath):
    """Reads PowerPoint document files and returns a Spark DataFrame.

    Parameters
    ----------
    docPath : str
        Path to a PowerPoint document file.

    Returns
    -------
    pyspark.sql.DataFrame
        A DataFrame containing parsed document content.

    Raises
    ------
    TypeError
        If ``docPath`` is not a string.

    Examples
    --------
    >>> from sparknlp.reader import SparkNLPReader
    >>> pptDf = SparkNLPReader(spark).ppt("home/user/powerpoint-directory")

    You can use SparkNLP for one line of code

    >>> import sparknlp
    >>> pptDf = sparknlp.read().ppt("home/user/powerpoint-directory")
    >>> pptDf.show(truncate=False)

    The ``ppt`` column holds one ``{elementType, content, metadata}``
    struct per parsed element, for example::

        {Title, Adding a Bullet Slide, {}}
        {ListItem, • Find the bullet slide layout, {}}
        {ListItem, – Use _TextFrame.text for first bullet, {}}
        {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}
        {NarrativeText, Here is a lot of text!, {}}
        {NarrativeText, Here is some text in a text box!, {}}
    """
    # NOTE(review): the first example assumes the SparkNLPReader constructor
    # accepts the SparkSession (this method reads self.spark) -- confirm.
    if not isinstance(docPath, str):
        raise TypeError("docPath must be a string")
    jdf = self._java_obj.ppt(docPath)
    return self.getDataFrame(self.spark, jdf)
289
+
290
def txt(self, docPath):
    """Reads TXT files and returns a Spark DataFrame.

    Parameters
    ----------
    docPath : str
        Path to a TXT file.

    Returns
    -------
    pyspark.sql.DataFrame
        A DataFrame containing parsed document content.

    Raises
    ------
    TypeError
        If ``docPath`` is not a string.

    Examples
    --------
    >>> from sparknlp.reader import SparkNLPReader
    >>> txtDf = SparkNLPReader(spark).txt("home/user/txt/files")

    You can use SparkNLP for one line of code

    >>> import sparknlp
    >>> txtDf = sparknlp.read().txt("home/user/txt/files")
    >>> txtDf.show(truncate=False)

    The ``txt`` column holds one ``{elementType, content, metadata}``
    struct per parsed element, with the paragraph index in the metadata,
    for example::

        {Title, BIG DATA ANALYTICS, {paragraph -> 0}}
        {NarrativeText, Apache Spark is a fast and general-purpose cluster computing system. ..., {paragraph -> 0}}
        {Title, MACHINE LEARNING, {paragraph -> 1}}
        {NarrativeText, Spark's MLlib provides scalable machine learning algorithms. ..., {paragraph -> 1}}
    """
    # NOTE(review): the first example assumes the SparkNLPReader constructor
    # accepts the SparkSession (this method reads self.spark) -- confirm.
    if not isinstance(docPath, str):
        raise TypeError("docPath must be a string")
    jdf = self._java_obj.txt(docPath)
    return self.getDataFrame(self.spark, jdf)
@@ -1 +0,0 @@
1
- 90f78083-0ee0-43e9-8240-7263731b6707