spark-nlp 6.0.1rc1__py2.py3-none-any.whl → 6.0.3__py2.py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Note: this release of spark-nlp has been marked as potentially problematic.
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/METADATA +13 -6
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/RECORD +39 -32
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +4 -2
- sparknlp/annotator/cv/__init__.py +2 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +5 -10
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +8 -13
- sparknlp/annotator/cv/llava_for_multimodal.py +1 -1
- sparknlp/annotator/cv/paligemma_for_multimodal.py +7 -7
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +1 -1
- sparknlp/annotator/cv/qwen2vl_transformer.py +1 -1
- sparknlp/annotator/cv/smolvlm_transformer.py +7 -13
- sparknlp/annotator/date2_chunk.py +1 -1
- sparknlp/annotator/document_character_text_splitter.py +8 -8
- sparknlp/annotator/document_token_splitter.py +7 -7
- sparknlp/annotator/embeddings/__init__.py +1 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +21 -19
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +15 -15
- sparknlp/annotator/openai/openai_completion.py +3 -4
- sparknlp/annotator/seq2seq/m2m100_transformer.py +1 -1
- sparknlp/annotator/seq2seq/mistral_transformer.py +2 -3
- sparknlp/annotator/seq2seq/nllb_transformer.py +1 -1
- sparknlp/annotator/seq2seq/qwen_transformer.py +26 -25
- sparknlp/annotator/spell_check/context_spell_checker.py +1 -1
- sparknlp/base/prompt_assembler.py +1 -1
- sparknlp/common/properties.py +7 -7
- sparknlp/internal/__init__.py +27 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +319 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/reader/pdf_to_text.py +50 -4
- sparknlp/reader/sparknlp_reader.py +101 -52
- sparknlp/training/spacy_to_annotation.py +7 -7
- sparknlp/util.py +26 -0
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.3.dist-info}/top_level.txt +0 -0
sparknlp/partition/partition_transformer.py
ADDED

@@ -0,0 +1,200 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the PartitionTransformer class for reading various types of documents into chunks."""
+from sparknlp.common import *
+from sparknlp.partition.partition_properties import *
+
+
+class PartitionTransformer(
+    AnnotatorModel,
+    HasEmailReaderProperties,
+    HasExcelReaderProperties,
+    HasHTMLReaderProperties,
+    HasPowerPointProperties,
+    HasTextReaderProperties,
+    HasChunkerProperties
+):
+    """
+    The PartitionTransformer annotator allows you to use the Partition feature more smoothly
+    within existing Spark NLP workflows, enabling seamless reuse of your pipelines.
+
+    It supports reading from files, URLs, in-memory strings, or byte arrays, and works
+    within a Spark NLP pipeline.
+
+    Supported formats include:
+    - Plain text
+    - HTML
+    - Word (.doc/.docx)
+    - Excel (.xls/.xlsx)
+    - PowerPoint (.ppt/.pptx)
+    - Email files (.eml, .msg)
+    - PDFs
+
+    Parameters
+    ----------
+    inputCols : list of str
+        Names of input columns (typically from DocumentAssembler).
+    outputCol : str
+        Name of the column to store the output.
+    contentType : str
+        The type of content: e.g., "text", "url", "file", etc.
+    headers : dict, optional
+        Headers to be used if content type is a URL.
+
+    Examples
+    --------
+    >>> dataset = spark.createDataFrame([
+    ...     ("https://www.blizzard.com",),
+    ... ], ["text"])
+
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+
+    >>> partition = PartitionTransformer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("partition") \\
+    ...     .setContentType("url") \\
+    ...     .setHeaders({"Accept-Language": "es-ES"})
+
+    >>> pipeline = Pipeline(stages=[documentAssembler, partition])
+    >>> pipelineModel = pipeline.fit(dataset)
+    >>> resultDf = pipelineModel.transform(dataset)
+    >>> resultDf.show()
+    +--------------------+--------------------+--------------------+
+    |                text|            document|           partition|
+    +--------------------+--------------------+--------------------+
+    |https://www.blizz...|[{Title, Juegos d...|[{document, 0, 16...|
+    |https://www.googl...|[{Title, Gmail Im...|[{document, 0, 28...|
+    +--------------------+--------------------+--------------------+
+    """
+
+    name = "PartitionTransformer"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    contentPath = Param(
+        Params._dummy(),
+        "contentPath",
+        "Path to the content source",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentPath(self, value):
+        return self._set(contentPath=value)
+
+    def getContentPath(self):
+        return self.getOrDefault(self.contentPath)
+
+    contentType = Param(
+        Params._dummy(),
+        "contentType",
+        "Set the content type to load following MIME specification",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentType(self, value):
+        return self._set(contentType=value)
+
+    def getContentType(self):
+        return self.getOrDefault(self.contentType)
+
+    storeContent = Param(
+        Params._dummy(),
+        "storeContent",
+        "Whether to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreContent(self, value):
+        return self._set(storeContent=value)
+
+    def getStoreContent(self):
+        return self.getOrDefault(self.storeContent)
+
+    titleFontSize = Param(
+        Params._dummy(),
+        "titleFontSize",
+        "Minimum font size threshold used as part of heuristic rules to detect title elements based on formatting (e.g., bold, centered, capitalized).",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTitleFontSize(self, value):
+        return self._set(titleFontSize=value)
+
+    def getTitleFontSize(self):
+        return self.getOrDefault(self.titleFontSize)
+
+    inferTableStructure = Param(
+        Params._dummy(),
+        "inferTableStructure",
+        "Whether to generate an HTML table representation from structured table content. When enabled, a full <table> element is added alongside cell-level elements, based on row and column layout.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setInferTableStructure(self, value):
+        return self._set(inferTableStructure=value)
+
+    def getInferTableStructure(self):
+        return self.getOrDefault(self.inferTableStructure)
+
+    includePageBreaks = Param(
+        Params._dummy(),
+        "includePageBreaks",
+        "Whether to detect and tag content with page break metadata. In Word documents, this includes manual and section breaks. In Excel files, this includes page breaks based on column boundaries.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIncludePageBreaks(self, value):
+        return self._set(includePageBreaks=value)
+
+    def getIncludePageBreaks(self):
+        return self.getOrDefault(self.includePageBreaks)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
+                 java_model=None):
+        super(PartitionTransformer, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        DOUBLE_PARAGRAPH_PATTERN = r"(?:\s*\n\s*){2,}"
+
+        self._setDefault(
+            contentPath="",
+            contentType="text/plain",
+            storeContent=False,
+            titleFontSize=9,
+            inferTableStructure=False,
+            includePageBreaks=False,
+            addAttachmentContent=False,
+            cellSeparator="\t",
+            appendCells=False,
+            timeout=0,
+            includeSlideNotes=False,
+            titleLengthSize=50,
+            groupBrokenParagraphs=False,
+            paragraphSplit=DOUBLE_PARAGRAPH_PATTERN,
+            shortLineWordThreshold=5,
+            maxLineCount=2000,
+            threshold=0.1,
+            chunkingStrategy="",
+            maxCharacters=100,
+            newAfterNChars=-1,
+            overlap=0,
+            combineTextUnderNChars=0,
+            overlapAll=False
+        )
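
As a quick illustration of the new annotator, here is a hedged variant of the class docstring example that also enables two of the optional setters defined in this file (storeContent and titleFontSize); the URL is illustrative and `spark` is assumed to be an active Spark NLP session:

    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.partition.partition_transformer import PartitionTransformer

    # Illustrative input: one URL per row, as in the class docstring example.
    dataset = spark.createDataFrame([("https://www.wikipedia.org",)], ["text"])

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    # Same pipeline shape as the docstring example, plus two of the new options.
    partition = PartitionTransformer() \
        .setInputCols(["document"]) \
        .setOutputCol("partition") \
        .setContentType("url") \
        .setStoreContent(True) \
        .setTitleFontSize(12)

    result = Pipeline(stages=[documentAssembler, partition]).fit(dataset).transform(dataset)
    result.select("partition").show(truncate=False)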
sparknlp/reader/pdf_to_text.py
CHANGED

@@ -10,10 +10,56 @@ from sparknlp.reader.enums import TextStripperType
 class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
                 JavaMLReadable, JavaMLWritable):
     """
-    Extract text from
-    Input is a column with binary
-
-
+    Extract text from PDF documents as either a single string or multiple strings per page.
+    Input is a column with binary content of PDF files. Output is a column with extracted text,
+    with options to include page numbers or split pages.
+
+    Parameters
+    ----------
+    pageNumCol : str, optional
+        Page number output column name.
+    partitionNum : int, optional
+        Number of partitions (default is 0).
+    storeSplittedPdf : bool, optional
+        Whether to store content of split PDFs (default is False).
+    splitPage : bool, optional
+        Enable/disable splitting per page (default is True).
+    onlyPageNum : bool, optional
+        Whether to extract only page numbers (default is False).
+    textStripper : str or TextStripperType, optional
+        Defines layout and formatting type.
+    sort : bool, optional
+        Enable/disable sorting content per page (default is False).
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.reader import *
+    >>> from pyspark.ml import Pipeline
+    >>> pdf_path = "Documents/files/pdf"
+    >>> data_frame = spark.read.format("binaryFile").load(pdf_path)
+    >>> pdf_to_text = PdfToText().setStoreSplittedPdf(True)
+    >>> pipeline = Pipeline(stages=[pdf_to_text])
+    >>> pipeline_model = pipeline.fit(data_frame)
+    >>> pdf_df = pipeline_model.transform(data_frame)
+    >>> pdf_df.show()
+    +--------------------+--------------------+
+    |                path|    modificationTime|
+    +--------------------+--------------------+
+    |file:/Users/paula...|2025-05-15 11:33:...|
+    |file:/Users/paula...|2025-05-15 11:33:...|
+    +--------------------+--------------------+
+    >>> pdf_df.printSchema()
+    root
+     |-- path: string (nullable = true)
+     |-- modificationTime: timestamp (nullable = true)
+     |-- length: long (nullable = true)
+     |-- text: string (nullable = true)
+     |-- height_dimension: integer (nullable = true)
+     |-- width_dimension: integer (nullable = true)
+     |-- content: binary (nullable = true)
+     |-- exception: string (nullable = true)
+     |-- pagenum: integer (nullable = true)
     """
     pageNumCol = Param(Params._dummy(), "pageNumCol",
                        "Page number output column name.",
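
To check the new per-page output quickly, a minimal sketch that uses only the setter shown in the docstring above; the PDF directory is a placeholder and `spark` is an active session:

    from pyspark.ml import Pipeline
    from sparknlp.reader.pdf_to_text import PdfToText

    # Placeholder directory of PDFs loaded as binary files.
    pdf_df = spark.read.format("binaryFile").load("/tmp/pdfs")

    pdf_to_text = PdfToText().setStoreSplittedPdf(True)
    pages = Pipeline(stages=[pdf_to_text]).fit(pdf_df).transform(pdf_df)

    # With per-page splitting (the documented default), each output row carries its page number.
    pages.select("path", "pagenum", "text").show(truncate=50)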
sparknlp/reader/sparknlp_reader.py
CHANGED

@@ -33,27 +33,30 @@ class SparkNLPReader(ExtendedJavaWrapper):
     >>> from sparknlp.reader import SparkNLPReader
     >>> reader = SparkNLPReader(spark)

-
+    Reading HTML
+
     >>> html_df = reader.html("https://www.wikipedia.org")
     >>> # Or with shorthand
     >>> import sparknlp
     >>> html_df = sparknlp.read().html("https://www.wikipedia.org")

-
+    Reading PDF
+
     >>> pdf_df = reader.pdf("home/user/pdfs-directory")
     >>> # Or with shorthand
     >>> pdf_df = sparknlp.read().pdf("home/user/pdfs-directory")

-
+    Reading Email
+
     >>> email_df = reader.email("home/user/emails-directory")
     >>> # Or with shorthand
     >>> email_df = sparknlp.read().email("home/user/emails-directory")
     """

-    def __init__(self, spark, params=None):
+    def __init__(self, spark, params=None, headers=None):
         if params is None:
             params = {}
-        super(SparkNLPReader, self).__init__("com.johnsnowlabs.reader.SparkNLPReader", params)
+        super(SparkNLPReader, self).__init__("com.johnsnowlabs.reader.SparkNLPReader", params, headers)
         self.spark = spark

     def html(self, htmlPath):
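
Because __init__ now accepts a headers argument, remote reads can carry custom HTTP headers; a minimal sketch with illustrative header values:

    from sparknlp.reader import SparkNLPReader

    # Custom request headers are forwarded when content is fetched from URLs.
    reader = SparkNLPReader(spark, headers={"Accept-Language": "es-ES"})
    html_df = reader.html("https://www.wikipedia.org")
    html_df.select("url").show(truncate=False)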
@@ -72,7 +75,7 @@ class SparkNLPReader(ExtendedJavaWrapper):
         Examples
         --------
         >>> from sparknlp.reader import SparkNLPReader
-        >>> html_df = SparkNLPReader(
+        >>> html_df = SparkNLPReader().html("https://www.wikipedia.org")

         You can also use SparkNLP to simplify the process:


@@ -86,7 +89,6 @@ class SparkNLPReader(ExtendedJavaWrapper):
         |https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
         +--------------------+--------------------------------------------------------------------------------------------------------------------------------+
         >>> html_df.printSchema()
-
         root
          |-- url: string (nullable = true)
          |-- html: array (nullable = true)

@@ -125,13 +127,12 @@ class SparkNLPReader(ExtendedJavaWrapper):

         >>> import sparknlp
         >>> email_df = sparknlp.read().email("home/user/emails-directory")
-        >>> email_df.show(
-
-
-
-
-
-        +---------------------------------------------------+
+        >>> email_df.show()
+        +---------------------------------------------------+
+        |email                                              |
+        +---------------------------------------------------+
+        |[{Title, Email Text Attachments, {sent_to -> Danilo|
+        +---------------------------------------------------+
         >>> email_df.printSchema()
         root
          |-- path: string (nullable = true)

@@ -170,16 +171,17 @@ class SparkNLPReader(ExtendedJavaWrapper):
         >>> doc_df = SparkNLPReader().doc(spark, "home/user/word-directory")

         You can use SparkNLP for one line of code
+
         >>> import sparknlp
         >>> doc_df = sparknlp.read().doc("home/user/word-directory")
-        >>> doc_df.show(
-
-
-
-
-
-
-        >>>
+        >>> doc_df.show()
+        +-------------------------------------------------+
+        |doc                                              |
+        +-------------------------------------------------+
+        |[{Table, Header Col 1, {}}, {Table, Header Col 2,|
+        +-------------------------------------------------+
+
+        >>> doc_df.printSchema()
         root
          |-- path: string (nullable = true)
          |-- content: array (nullable = true)

@@ -224,27 +226,27 @@ class SparkNLPReader(ExtendedJavaWrapper):
         >>> xlsDf = SparkNLPReader().xls(spark, "home/user/excel-directory")

         You can use SparkNLP for one line of code
+
         >>> import sparknlp
         >>> xlsDf = sparknlp.read().xls("home/user/excel-directory")
-        >>> xlsDf.show(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        | | | |-- value: string (valueContainsNull = true)
+        >>> xlsDf.show()
+        +--------------------------------------------+
+        |xls                                         |
+        +--------------------------------------------+
+        |[{Title, Financial performance, {SheetNam}}]|
+        +--------------------------------------------+
+
+        >>> xlsDf.printSchema()
+        root
+        |-- path: string (nullable = true)
+        |-- content: binary (nullable = true)
+        |-- xls: array (nullable = true)
+        | |-- element: struct (containsNull = true)
+        | | |-- elementType: string (nullable = true)
+        | | |-- content: string (nullable = true)
+        | | |-- metadata: map (nullable = true)
+        | | | |-- key: string
+        | | | |-- value: string (valueContainsNull = true)
         """
         if not isinstance(docPath, str):
             raise TypeError("docPath must be a string")

@@ -259,7 +261,7 @@ class SparkNLPReader(ExtendedJavaWrapper):
         Parameters
         ----------
         docPath : str
-            Path to an
+            Path to an power point document file.

         Returns
         -------

@@ -272,14 +274,15 @@ class SparkNLPReader(ExtendedJavaWrapper):
         >>> pptDf = SparkNLPReader().ppt(spark, "home/user/powerpoint-directory")

         You can use SparkNLP for one line of code
+
         >>> import sparknlp
         >>> pptDf = sparknlp.read().ppt("home/user/powerpoint-directory")
         >>> pptDf.show(truncate=False)
-
-        |ppt
-
-        |[{Title, Adding a Bullet Slide, {}},
-
+        +-------------------------------------+
+        |ppt                                  |
+        +-------------------------------------+
+        |[{Title, Adding a Bullet Slide, {}},]|
+        +-------------------------------------+
         """
         if not isinstance(docPath, str):
             raise TypeError("docPath must be a string")

@@ -306,16 +309,62 @@ class SparkNLPReader(ExtendedJavaWrapper):
         >>> txtDf = SparkNLPReader().txt(spark, "home/user/txt/files")

         You can use SparkNLP for one line of code
+
         >>> import sparknlp
         >>> txtDf = sparknlp.read().txt("home/user/txt/files")
         >>> txtDf.show(truncate=False)
-
-        |txt
-
-        |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}
-
+        +-----------------------------------------------+
+        |txt                                            |
+        +-----------------------------------------------+
+        |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}]|
+        +-----------------------------------------------+
         """
         if not isinstance(docPath, str):
             raise TypeError("docPath must be a string")
         jdf = self._java_obj.txt(docPath)
+        return self.getDataFrame(self.spark, jdf)
+
+    def xml(self, docPath):
+        """Reads XML files and returns a Spark DataFrame.
+
+        Parameters
+        ----------
+        docPath : str
+            Path to an XML file or a directory containing XML files.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing parsed XML content.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> xml_df = SparkNLPReader(spark).xml("home/user/xml-directory")
+
+        You can use SparkNLP for one line of code
+
+        >>> import sparknlp
+        >>> xml_df = sparknlp.read().xml("home/user/xml-directory")
+        >>> xml_df.show(truncate=False)
+        +--------------------------------------------------------+
+        |xml                                                     |
+        +--------------------------------------------------------+
+        |[{Title, John Smith, {elementId -> ..., tag -> title}}] |
+        +--------------------------------------------------------+
+
+        >>> xml_df.printSchema()
+        root
+        |-- path: string (nullable = true)
+        |-- xml: array (nullable = true)
+        | |-- element: struct (containsNull = true)
+        | | |-- elementType: string (nullable = true)
+        | | |-- content: string (nullable = true)
+        | | |-- metadata: map (nullable = true)
+        | | | |-- key: string
+        | | | |-- value: string (valueContainsNull = true)
+        """
+        if not isinstance(docPath, str):
+            raise TypeError("docPath must be a string")
+        jdf = self._java_obj.xml(docPath)
         return self.getDataFrame(self.spark, jdf)
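
Downstream of the new xml() reader, the parsed elements can be flattened with plain DataFrame operations; a hedged sketch that reuses the docstring's directory path and assumes the metadata keys shown above (tag, elementId):

    import sparknlp
    from pyspark.sql import functions as F

    xml_df = sparknlp.read().xml("home/user/xml-directory")

    # One row per parsed element; keep only elements tagged as titles.
    elements = xml_df.select(F.explode("xml").alias("element"))
    titles = elements.filter(F.col("element.metadata")["tag"] == "title")
    titles.select("element.content").show(truncate=False)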
sparknlp/training/spacy_to_annotation.py
CHANGED

@@ -21,13 +21,13 @@ class SpacyToAnnotation(ExtendedJavaWrapper):
     """Helper class to load a list of tokens/sentences as JSON to Annotation.

     The JSON will be in this format:
-
-
-
-
-
-
-
+    [
+      {
+        "tokens": ["Hello", "world", "!", "How", "are", "you", "today", "?", "I", "'m", "fine", "thanks", "."],
+        "token_spaces": [true, false, true, true, true, true, false, true, false, true, true, false, false],
+        "sentence_ends": [2, 7, 12]
+      }
+    ]

     Examples
     --------
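
For context, a short sketch of loading such a JSON file with this helper class; the file name is a placeholder and readJsonFile is assumed to behave as in earlier releases:

    import sparknlp
    from sparknlp.training import SpacyToAnnotation

    spark = sparknlp.start()

    # multi_doc_tokens.json is assumed to contain entries in the format shown above.
    nlp_reader = SpacyToAnnotation()
    annotations_df = nlp_reader.readJsonFile(spark, "multi_doc_tokens.json")
    annotations_df.show(truncate=False)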
sparknlp/util.py
CHANGED

@@ -15,6 +15,9 @@


 import sparknlp.internal as _internal
+import numpy as np
+from pyspark.sql import Row
+from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BinaryType


 def get_config_path():

@@ -33,3 +36,26 @@ class CoNLLGenerator:
             _internal._CoNLLGeneratorExportFromTargetAndPipeline(*args).apply()
         else:
             raise NotImplementedError(f"No exportConllFiles alternative takes {num_args} parameters")
+
+
+class EmbeddingsDataFrameUtils:
+    """
+    Utility for creating DataFrames compatible with multimodal embedding models (e.g., E5VEmbeddings) for text-only scenarios.
+    Provides:
+      - imageSchema: the expected schema for Spark image DataFrames
+      - emptyImageRow: a dummy image row for text-only embedding
+    """
+    imageSchema = StructType([
+        StructField(
+            "image",
+            StructType([
+                StructField("origin", StringType(), True),
+                StructField("height", IntegerType(), True),
+                StructField("width", IntegerType(), True),
+                StructField("nChannels", IntegerType(), True),
+                StructField("mode", IntegerType(), True),
+                StructField("data", BinaryType(), True),
+            ]),
+        )
+    ])
+    emptyImageRow = Row(Row("", 0, 0, 0, 0, bytes()))
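
To show what these helpers are for, a minimal, hedged sketch that builds the text-only input DataFrame expected by multimodal embedders such as E5VEmbeddings; the prompt is illustrative and the embedding stage itself is omitted:

    from pyspark.sql.functions import lit
    from sparknlp.util import EmbeddingsDataFrameUtils

    # One dummy image row satisfying the Spark image schema, plus the text to embed.
    null_image_df = spark.createDataFrame(
        [EmbeddingsDataFrameUtils.emptyImageRow],
        EmbeddingsDataFrameUtils.imageSchema,
    )
    text_df = null_image_df.withColumn("text", lit("A cat sitting in a box."))
    text_df.printSchema()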
File without changes