spark-nlp 6.0.1rc1__py2.py3-none-any.whl → 6.0.3__py2.py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.


Files changed (39)
  1. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/METADATA +13 -6
  2. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/RECORD +39 -32
  3. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/WHEEL +1 -1
  4. sparknlp/__init__.py +4 -2
  5. sparknlp/annotator/cv/__init__.py +2 -0
  6. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  7. sparknlp/annotator/cv/gemma3_for_multimodal.py +5 -10
  8. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  9. sparknlp/annotator/cv/janus_for_multimodal.py +8 -13
  10. sparknlp/annotator/cv/llava_for_multimodal.py +1 -1
  11. sparknlp/annotator/cv/paligemma_for_multimodal.py +7 -7
  12. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +1 -1
  13. sparknlp/annotator/cv/qwen2vl_transformer.py +1 -1
  14. sparknlp/annotator/cv/smolvlm_transformer.py +7 -13
  15. sparknlp/annotator/date2_chunk.py +1 -1
  16. sparknlp/annotator/document_character_text_splitter.py +8 -8
  17. sparknlp/annotator/document_token_splitter.py +7 -7
  18. sparknlp/annotator/embeddings/__init__.py +1 -0
  19. sparknlp/annotator/embeddings/bge_embeddings.py +21 -19
  20. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  21. sparknlp/annotator/embeddings/snowflake_embeddings.py +15 -15
  22. sparknlp/annotator/openai/openai_completion.py +3 -4
  23. sparknlp/annotator/seq2seq/m2m100_transformer.py +1 -1
  24. sparknlp/annotator/seq2seq/mistral_transformer.py +2 -3
  25. sparknlp/annotator/seq2seq/nllb_transformer.py +1 -1
  26. sparknlp/annotator/seq2seq/qwen_transformer.py +26 -25
  27. sparknlp/annotator/spell_check/context_spell_checker.py +1 -1
  28. sparknlp/base/prompt_assembler.py +1 -1
  29. sparknlp/common/properties.py +7 -7
  30. sparknlp/internal/__init__.py +27 -0
  31. sparknlp/partition/__init__.py +16 -0
  32. sparknlp/partition/partition.py +244 -0
  33. sparknlp/partition/partition_properties.py +319 -0
  34. sparknlp/partition/partition_transformer.py +200 -0
  35. sparknlp/reader/pdf_to_text.py +50 -4
  36. sparknlp/reader/sparknlp_reader.py +101 -52
  37. sparknlp/training/spacy_to_annotation.py +7 -7
  38. sparknlp/util.py +26 -0
  39. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/top_level.txt +0 -0
sparknlp/partition/partition_transformer.py ADDED
@@ -0,0 +1,200 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains the PartitionTransformer class for reading various types of documents into chunks."""
+ from sparknlp.common import *
+ from sparknlp.partition.partition_properties import *
+
+
+ class PartitionTransformer(
+     AnnotatorModel,
+     HasEmailReaderProperties,
+     HasExcelReaderProperties,
+     HasHTMLReaderProperties,
+     HasPowerPointProperties,
+     HasTextReaderProperties,
+     HasChunkerProperties
+ ):
+     """
+     The PartitionTransformer annotator allows you to use the Partition feature more smoothly
+     within existing Spark NLP workflows, enabling seamless reuse of your pipelines.
+
+     It supports reading from files, URLs, in-memory strings, or byte arrays, and works
+     within a Spark NLP pipeline.
+
+     Supported formats include:
+     - Plain text
+     - HTML
+     - Word (.doc/.docx)
+     - Excel (.xls/.xlsx)
+     - PowerPoint (.ppt/.pptx)
+     - Email files (.eml, .msg)
+     - PDFs
+
+     Parameters
+     ----------
+     inputCols : list of str
+         Names of input columns (typically from DocumentAssembler).
+     outputCol : str
+         Name of the column to store the output.
+     contentType : str
+         The type of content: e.g., "text", "url", "file", etc.
+     headers : dict, optional
+         Headers to be used if content type is a URL.
+
+     Examples
+     --------
+     >>> dataset = spark.createDataFrame([
+     ...     ("https://www.blizzard.com",),
+     ...     ("https://www.google.com",),
+     ... ], ["text"])
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+
+     >>> partition = PartitionTransformer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("partition") \\
+     ...     .setContentType("url") \\
+     ...     .setHeaders({"Accept-Language": "es-ES"})
+
+     >>> pipeline = Pipeline(stages=[documentAssembler, partition])
+     >>> pipelineModel = pipeline.fit(dataset)
+     >>> resultDf = pipelineModel.transform(dataset)
+     >>> resultDf.show()
+     +--------------------+--------------------+--------------------+
+     |                text|            document|           partition|
+     +--------------------+--------------------+--------------------+
+     |https://www.blizz...|[{Title, Juegos d...|[{document, 0, 16...|
+     |https://www.googl...|[{Title, Gmail Im...|[{document, 0, 28...|
+     +--------------------+--------------------+--------------------+
+     """
+
+     name = "PartitionTransformer"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     contentPath = Param(
+         Params._dummy(),
+         "contentPath",
+         "Path to the content source",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentPath(self, value):
+         return self._set(contentPath=value)
+
+     def getContentPath(self):
+         return self.getOrDefault(self.contentPath)
+
+     contentType = Param(
+         Params._dummy(),
+         "contentType",
+         "Set the content type to load following MIME specification",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentType(self, value):
+         return self._set(contentType=value)
+
+     def getContentType(self):
+         return self.getOrDefault(self.contentType)
+
+     storeContent = Param(
+         Params._dummy(),
+         "storeContent",
+         "Whether to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setStoreContent(self, value):
+         return self._set(storeContent=value)
+
+     def getStoreContent(self):
+         return self.getOrDefault(self.storeContent)
+
+     titleFontSize = Param(
+         Params._dummy(),
+         "titleFontSize",
+         "Minimum font size threshold used as part of heuristic rules to detect title elements based on formatting (e.g., bold, centered, capitalized).",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setTitleFontSize(self, value):
+         return self._set(titleFontSize=value)
+
+     def getTitleFontSize(self):
+         return self.getOrDefault(self.titleFontSize)
+
+     inferTableStructure = Param(
+         Params._dummy(),
+         "inferTableStructure",
+         "Whether to generate an HTML table representation from structured table content. When enabled, a full <table> element is added alongside cell-level elements, based on row and column layout.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setInferTableStructure(self, value):
+         return self._set(inferTableStructure=value)
+
+     def getInferTableStructure(self):
+         return self.getOrDefault(self.inferTableStructure)
+
+     includePageBreaks = Param(
+         Params._dummy(),
+         "includePageBreaks",
+         "Whether to detect and tag content with page break metadata. In Word documents, this includes manual and section breaks. In Excel files, this includes page breaks based on column boundaries.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setIncludePageBreaks(self, value):
+         return self._set(includePageBreaks=value)
+
+     def getIncludePageBreaks(self):
+         return self.getOrDefault(self.includePageBreaks)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
+                  java_model=None):
+         super(PartitionTransformer, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         DOUBLE_PARAGRAPH_PATTERN = r"(?:\s*\n\s*){2,}"
+
+         self._setDefault(
+             contentPath="",
+             contentType="text/plain",
+             storeContent=False,
+             titleFontSize=9,
+             inferTableStructure=False,
+             includePageBreaks=False,
+             addAttachmentContent=False,
+             cellSeparator="\t",
+             appendCells=False,
+             timeout=0,
+             includeSlideNotes=False,
+             titleLengthSize=50,
+             groupBrokenParagraphs=False,
+             paragraphSplit=DOUBLE_PARAGRAPH_PATTERN,
+             shortLineWordThreshold=5,
+             maxLineCount=2000,
+             threshold=0.1,
+             chunkingStrategy="",
+             maxCharacters=100,
+             newAfterNChars=-1,
+             overlap=0,
+             combineTextUnderNChars=0,
+             overlapAll=False
+         )
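
The chunking defaults above come from HasChunkerProperties. A minimal sketch of a file-based, chunking-oriented pipeline follows; hedged: the set* names are inferred from the chunkingStrategy/maxCharacters/overlap params in the defaults, and "basic" is a hypothetical strategy value, neither confirmed by this diff.

    # Hypothetical usage sketch for PartitionTransformer chunking.
    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.partition.partition_transformer import PartitionTransformer

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    partition = PartitionTransformer() \
        .setInputCols(["document"]) \
        .setOutputCol("chunks") \
        .setContentType("text/plain") \
        .setChunkingStrategy("basic") \
        .setMaxCharacters(512) \
        .setOverlap(32)

    pipeline = Pipeline(stages=[documentAssembler, partition])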
sparknlp/reader/pdf_to_text.py CHANGED
@@ -10,10 +10,56 @@ from sparknlp.reader.enums import TextStripperType
  class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
                  JavaMLReadable, JavaMLWritable):
      """
-     Extract text from Pdf document to single string or to several strings per each page.
-     Input is a column with binary representation of PDF document.
-     As output generate column with text and page number.
-     Explode each page as separate row if split to page enabled.
+     Extract text from PDF documents as either a single string or multiple strings per page.
+     Input is a column with binary content of PDF files. Output is a column with extracted text,
+     with options to include page numbers or split pages.
+
+     Parameters
+     ----------
+     pageNumCol : str, optional
+         Page number output column name.
+     partitionNum : int, optional
+         Number of partitions (default is 0).
+     storeSplittedPdf : bool, optional
+         Whether to store content of split PDFs (default is False).
+     splitPage : bool, optional
+         Enable/disable splitting per page (default is True).
+     onlyPageNum : bool, optional
+         Whether to extract only page numbers (default is False).
+     textStripper : str or TextStripperType, optional
+         Defines layout and formatting type.
+     sort : bool, optional
+         Enable/disable sorting content per page (default is False).
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.reader import *
+     >>> from pyspark.ml import Pipeline
+     >>> pdf_path = "Documents/files/pdf"
+     >>> data_frame = spark.read.format("binaryFile").load(pdf_path)
+     >>> pdf_to_text = PdfToText().setStoreSplittedPdf(True)
+     >>> pipeline = Pipeline(stages=[pdf_to_text])
+     >>> pipeline_model = pipeline.fit(data_frame)
+     >>> pdf_df = pipeline_model.transform(data_frame)
+     >>> pdf_df.show()
+     +--------------------+--------------------+
+     |                path|    modificationTime|
+     +--------------------+--------------------+
+     |file:/Users/paula...|2025-05-15 11:33:...|
+     |file:/Users/paula...|2025-05-15 11:33:...|
+     +--------------------+--------------------+
+     >>> pdf_df.printSchema()
+     root
+      |-- path: string (nullable = true)
+      |-- modificationTime: timestamp (nullable = true)
+      |-- length: long (nullable = true)
+      |-- text: string (nullable = true)
+      |-- height_dimension: integer (nullable = true)
+      |-- width_dimension: integer (nullable = true)
+      |-- content: binary (nullable = true)
+      |-- exception: string (nullable = true)
+      |-- pagenum: integer (nullable = true)
      """
      pageNumCol = Param(Params._dummy(), "pageNumCol",
                         "Page number output column name.",
sparknlp/reader/sparknlp_reader.py CHANGED
@@ -33,27 +33,30 @@ class SparkNLPReader(ExtendedJavaWrapper):
      >>> from sparknlp.reader import SparkNLPReader
      >>> reader = SparkNLPReader(spark)
 
-     # Reading HTML
+     Reading HTML
+
      >>> html_df = reader.html("https://www.wikipedia.org")
      >>> # Or with shorthand
      >>> import sparknlp
      >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
 
-     # Reading PDF
+     Reading PDF
+
      >>> pdf_df = reader.pdf("home/user/pdfs-directory")
      >>> # Or with shorthand
      >>> pdf_df = sparknlp.read().pdf("home/user/pdfs-directory")
 
-     # Reading Email
+     Reading Email
+
      >>> email_df = reader.email("home/user/emails-directory")
      >>> # Or with shorthand
      >>> email_df = sparknlp.read().email("home/user/emails-directory")
      """
 
-     def __init__(self, spark, params=None):
+     def __init__(self, spark, params=None, headers=None):
          if params is None:
              params = {}
-         super(SparkNLPReader, self).__init__("com.johnsnowlabs.reader.SparkNLPReader", params)
+         super(SparkNLPReader, self).__init__("com.johnsnowlabs.reader.SparkNLPReader", params, headers)
          self.spark = spark
 
      def html(self, htmlPath):
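
The new headers argument mirrors PartitionTransformer.setHeaders and applies when reading URL-based sources. A minimal sketch, with illustrative header values:

    from sparknlp.reader import SparkNLPReader

    # Request headers are passed through to the underlying Java reader
    # for URL sources such as html().
    reader = SparkNLPReader(spark, headers={"Accept-Language": "es-ES"})
    html_df = reader.html("https://www.wikipedia.org")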
@@ -72,7 +75,7 @@ class SparkNLPReader(ExtendedJavaWrapper):
          Examples
          --------
          >>> from sparknlp.reader import SparkNLPReader
-         >>> html_df = SparkNLPReader(spark).html("https://www.wikipedia.org")
+         >>> html_df = SparkNLPReader().html("https://www.wikipedia.org")
 
          You can also use SparkNLP to simplify the process:
 
@@ -86,7 +89,6 @@ class SparkNLPReader(ExtendedJavaWrapper):
          |https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
          +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
          >>> html_df.printSchema()
-
          root
           |-- url: string (nullable = true)
           |-- html: array (nullable = true)
@@ -125,13 +127,12 @@ class SparkNLPReader(ExtendedJavaWrapper):
 
          >>> import sparknlp
          >>> email_df = sparknlp.read().email("home/user/emails-directory")
-         >>> email_df.show(truncate=False)
-
-         +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-         |email |
-         +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-         |[{Title, Email Text Attachments, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>}}, {NarrativeText, Email test with two text attachments\r\n\r\nCheers,\r\n\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {NarrativeText, <html>\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\r\n<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>\r\n</head>\r\n<body dir="ltr">\r\n<span style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">Email&nbsp; test with two text attachments</span>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\nCheers,</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n</body>\r\n</html>\r\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/html}}, {Attachment, filename.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename.txt"}}, {NarrativeText, This is the content of the file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}, {Attachment, filename2.txt, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, contentType -> text/plain; name="filename2.txt"}}, {NarrativeText, This is an additional content file.\n, {sent_to -> Danilo Burbano <danilo@johnsnowlabs.com>, sent_from -> Danilo Burbano <danilo@johnsnowlabs.com>, mimeType -> text/plain}}]|
-         +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+         >>> email_df.show()
+         +---------------------------------------------------+
+         |email                                              |
+         +---------------------------------------------------+
+         |[{Title, Email Text Attachments, {sent_to -> Danilo|
+         +---------------------------------------------------+
          >>> email_df.printSchema()
          root
          |-- path: string (nullable = true)
@@ -170,16 +171,17 @@ class SparkNLPReader(ExtendedJavaWrapper):
          >>> doc_df = SparkNLPReader().doc(spark, "home/user/word-directory")
 
          You can use SparkNLP for one line of code
+
          >>> import sparknlp
          >>> doc_df = sparknlp.read().doc("home/user/word-directory")
-         >>> doc_df.show(truncate=False)
-
-         +----------------------------------------------------------------------------------------------------------------------------------------------------+
-         |doc |
-         +----------------------------------------------------------------------------------------------------------------------------------------------------+
-         |[{Table, Header Col 1, {}}, {Table, Header Col 2, {}}, {Table, Lorem ipsum, {}}, {Table, A Link example, {}}, {NarrativeText, Dolor sit amet, {}}] |
-         +----------------------------------------------------------------------------------------------------------------------------------------------------+
-         >>> docsDf.printSchema()
+         >>> doc_df.show()
+         +-------------------------------------------------+
+         |doc                                              |
+         +-------------------------------------------------+
+         |[{Table, Header Col 1, {}}, {Table, Header Col 2,|
+         +-------------------------------------------------+
+
+         >>> doc_df.printSchema()
          root
          |-- path: string (nullable = true)
          |-- content: array (nullable = true)
@@ -224,27 +226,27 @@ class SparkNLPReader(ExtendedJavaWrapper):
          >>> xlsDf = SparkNLPReader().xls(spark, "home/user/excel-directory")
 
          You can use SparkNLP for one line of code
+
          >>> import sparknlp
          >>> xlsDf = sparknlp.read().xls("home/user/excel-directory")
-         >>> xlsDf.show(truncate=False)
-
-         +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-         |xls |
-         +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-         |[{Title, Financial performance, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Quarterly revenue\tNine quarters to 30 June 2023\t\t\t1.0, {SheetName -> Index}}, {NarrativeText, Group financial performance\tFY 22\tFY 23\t\t2.0, {SheetName -> Index}}, {NarrativeText, Segmental results\tFY 22\tFY 23\t\t3.0, {SheetName -> Index}}, {NarrativeText, Segmental analysis\tFY 22\tFY 23\t\t4.0, {SheetName -> Index}}, {NarrativeText, Cash flow\tFY 22\tFY 23\t\t5.0, {SheetName -> Index}}, {Title, Operational metrics, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Mobile customers\tNine quarters to 30 June 2023\t\t\t6.0, {SheetName -> Index}}, {NarrativeText, Fixed broadband customers\tNine quarters to 30 June 2023\t\t\t7.0, {SheetName -> Index}}, {NarrativeText, Marketable homes passed\tNine quarters to 30 June 2023\t\t\t8.0, {SheetName -> Index}}, {NarrativeText, TV customers\tNine quarters to 30 June 2023\t\t\t9.0, {SheetName -> Index}}, {NarrativeText, Converged customers\tNine quarters to 30 June 2023\t\t\t10.0, {SheetName -> Index}}, {NarrativeText, Mobile churn\tNine quarters to 30 June 2023\t\t\t11.0, {SheetName -> Index}}, {NarrativeText, Mobile data usage\tNine quarters to 30 June 2023\t\t\t12.0, {SheetName -> Index}}, {NarrativeText, Mobile ARPU\tNine quarters to 30 June 2023\t\t\t13.0, {SheetName -> Index}}, {Title, Other, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Average foreign exchange rates\tNine quarters to 30 June 2023\t\t\t14.0, {SheetName -> Index}}, {NarrativeText, Guidance rates\tFY 23/24\t\t\t14.0, {SheetName -> Index}}]|
-         +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-
-         >>> xlsDf.printSchema()
-         root
-         |-- path: string (nullable = true)
-         |-- content: binary (nullable = true)
-         |-- xls: array (nullable = true)
-         | |-- element: struct (containsNull = true)
-         | | |-- elementType: string (nullable = true)
-         | | |-- content: string (nullable = true)
-         | | |-- metadata: map (nullable = true)
-         | | | |-- key: string
-         | | | |-- value: string (valueContainsNull = true)
+         >>> xlsDf.show()
+         +--------------------------------------------+
+         |xls                                         |
+         +--------------------------------------------+
+         |[{Title, Financial performance, {SheetNam}}]|
+         +--------------------------------------------+
+
+         >>> xlsDf.printSchema()
+         root
+          |-- path: string (nullable = true)
+          |-- content: binary (nullable = true)
+          |-- xls: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
      """
      if not isinstance(docPath, str):
          raise TypeError("docPath must be a string")
@@ -259,7 +261,7 @@ class SparkNLPReader(ExtendedJavaWrapper):
          Parameters
          ----------
          docPath : str
-             Path to an excel document file.
+             Path to a PowerPoint document file.
 
          Returns
          -------
@@ -272,14 +274,15 @@ class SparkNLPReader(ExtendedJavaWrapper):
          >>> pptDf = SparkNLPReader().ppt(spark, "home/user/powerpoint-directory")
 
          You can use SparkNLP for one line of code
+
          >>> import sparknlp
          >>> pptDf = sparknlp.read().ppt("home/user/powerpoint-directory")
          >>> pptDf.show(truncate=False)
-         +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-         |ppt |
-         +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-         |[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|
-         +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+         +-------------------------------------+
+         |ppt                                  |
+         +-------------------------------------+
+         |[{Title, Adding a Bullet Slide, {}},]|
+         +-------------------------------------+
      """
      if not isinstance(docPath, str):
          raise TypeError("docPath must be a string")
@@ -306,16 +309,62 @@ class SparkNLPReader(ExtendedJavaWrapper):
          >>> txtDf = SparkNLPReader().txt(spark, "home/user/txt/files")
 
          You can use SparkNLP for one line of code
+
          >>> import sparknlp
          >>> txtDf = sparknlp.read().txt("home/user/txt/files")
          >>> txtDf.show(truncate=False)
-         +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-         |txt |
-         +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-         |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}, {NarrativeText, Apache Spark is a fast and general-purpose cluster computing system.\nIt provides high-level APIs in Java, Scala, Python, and R., {paragraph -> 0}}, {Title, MACHINE LEARNING, {paragraph -> 1}}, {NarrativeText, Spark's MLlib provides scalable machine learning algorithms.\nIt includes tools for classification, regression, clustering, and more., {paragraph -> 1}}]|
-         +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+         +-----------------------------------------------+
+         |txt                                            |
+         +-----------------------------------------------+
+         |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}]|
+         +-----------------------------------------------+
      """
      if not isinstance(docPath, str):
          raise TypeError("docPath must be a string")
      jdf = self._java_obj.txt(docPath)
+     return self.getDataFrame(self.spark, jdf)
+
+     def xml(self, docPath):
+         """Reads XML files and returns a Spark DataFrame.
+
+         Parameters
+         ----------
+         docPath : str
+             Path to an XML file or a directory containing XML files.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing parsed XML content.
+
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> xml_df = SparkNLPReader(spark).xml("home/user/xml-directory")
+
+         You can use SparkNLP for one line of code
+
+         >>> import sparknlp
+         >>> xml_df = sparknlp.read().xml("home/user/xml-directory")
+         >>> xml_df.show(truncate=False)
+         +-----------------------------------------------------------+
+         |xml                                                        |
+         +-----------------------------------------------------------+
+         |[{Title, John Smith, {elementId -> ..., tag -> title}}]    |
+         +-----------------------------------------------------------+
+
+         >>> xml_df.printSchema()
+         root
+          |-- path: string (nullable = true)
+          |-- xml: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+         """
+         if not isinstance(docPath, str):
+             raise TypeError("docPath must be a string")
+         jdf = self._java_obj.xml(docPath)
          return self.getDataFrame(self.spark, jdf)
sparknlp/training/spacy_to_annotation.py CHANGED
@@ -21,13 +21,13 @@ class SpacyToAnnotation(ExtendedJavaWrapper):
      """Helper class to load a list of tokens/sentences as JSON to Annotation.
 
      The JSON will be in this format:
-     [
-       {
-         "tokens": ["Hello", "world", "!", "How", "are", "you", "today", "?", "I", "'m", "fine", "thanks", "."],
-         "token_spaces": [true, false, true, true, true, true, false, true, false, true, true, false, false],
-         "sentence_ends": [2, 7, 12]
-       }
-     ]
+         [
+           {
+             "tokens": ["Hello", "world", "!", "How", "are", "you", "today", "?", "I", "'m", "fine", "thanks", "."],
+             "token_spaces": [true, false, true, true, true, true, false, true, false, true, true, false, false],
+             "sentence_ends": [2, 7, 12]
+           }
+         ]
 
      Examples
      --------
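
A minimal sketch of loading that JSON format, assuming the readJsonFile entry point documented for this helper (the file path is illustrative):

    from sparknlp.training import SpacyToAnnotation

    # Reads exported spaCy tokens/sentences (JSON in the format above)
    # and returns them as Spark NLP annotations.
    nlp_reader = SpacyToAnnotation()
    result = nlp_reader.readJsonFile(spark, "./multi_doc_tokens.json")
    result.show(truncate=False)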
sparknlp/util.py CHANGED
@@ -15,6 +15,9 @@
 
 
  import sparknlp.internal as _internal
+ import numpy as np
+ from pyspark.sql import Row
+ from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BinaryType
 
 
  def get_config_path():
@@ -33,3 +36,26 @@ class CoNLLGenerator:
              _internal._CoNLLGeneratorExportFromTargetAndPipeline(*args).apply()
          else:
              raise NotImplementedError(f"No exportConllFiles alternative takes {num_args} parameters")
+
+
+ class EmbeddingsDataFrameUtils:
+     """
+     Utility for creating DataFrames compatible with multimodal embedding models (e.g., E5VEmbeddings) for text-only scenarios.
+     Provides:
+     - imageSchema: the expected schema for Spark image DataFrames
+     - emptyImageRow: a dummy image row for text-only embedding
+     """
+     imageSchema = StructType([
+         StructField(
+             "image",
+             StructType([
+                 StructField("origin", StringType(), True),
+                 StructField("height", IntegerType(), True),
+                 StructField("width", IntegerType(), True),
+                 StructField("nChannels", IntegerType(), True),
+                 StructField("mode", IntegerType(), True),
+                 StructField("data", BinaryType(), True),
+             ]),
+         )
+     ])
+     emptyImageRow = Row(Row("", 0, 0, 0, 0, bytes()))
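
A minimal sketch of the text-only flow this utility targets: pair the dummy image row with a literal text column so a multimodal embedder such as E5VEmbeddings can run without real images (the text column name and prompt wording are illustrative assumptions):

    from pyspark.sql.functions import lit
    from sparknlp.util import EmbeddingsDataFrameUtils

    # One dummy image row, plus the text to embed as a literal column.
    text_df = spark.createDataFrame(
        [EmbeddingsDataFrameUtils.emptyImageRow],
        EmbeddingsDataFrameUtils.imageSchema,
    ).withColumn("text", lit("A cat sitting in a box"))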