spark-nlp 6.0.5__py2.py3-none-any.whl → 6.1.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of spark-nlp might be problematic. Click here for more details.
- {spark_nlp-6.0.5.dist-info → spark_nlp-6.1.1.dist-info}/METADATA +12 -11
- {spark_nlp-6.0.5.dist-info → spark_nlp-6.1.1.dist-info}/RECORD +15 -12
- sparknlp/__init__.py +1 -1
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +4 -12
- sparknlp/annotator/seq2seq/__init__.py +1 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +10 -7
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +3 -3
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/common/properties.py +114 -85
- sparknlp/internal/__init__.py +9 -0
- sparknlp/reader/reader2doc.py +210 -0
- sparknlp/reader/reader2table.py +163 -0
- sparknlp/reader/sparknlp_reader.py +45 -0
- {spark_nlp-6.0.5.dist-info → spark_nlp-6.1.1.dist-info}/WHEEL +0 -0
- {spark_nlp-6.0.5.dist-info → spark_nlp-6.1.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# Copyright 2017-2025 John Snow Labs
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from pyspark import keyword_only
|
|
16
|
+
from pyspark.ml.param import TypeConverters, Params, Param
|
|
17
|
+
|
|
18
|
+
from sparknlp.common import AnnotatorType
|
|
19
|
+
from sparknlp.internal import AnnotatorTransformer
|
|
20
|
+
from sparknlp.partition.partition_properties import *
|
|
21
|
+
|
|
22
|
+
class Reader2Table(
    AnnotatorTransformer,
    HasEmailReaderProperties,
    HasExcelReaderProperties,
    HasHTMLReaderProperties,
    HasPowerPointProperties,
    HasTextReaderProperties
):
    """Reads files from a path and emits their table content as DOCUMENT annotations.

    Configuration is exposed via Spark ML ``Param``s; the actual parsing is
    delegated to the JVM-side ``com.johnsnowlabs.reader.Reader2Table``.
    Reader-specific options come from the ``Has*ReaderProperties`` mixins.
    """

    name = 'Reader2Table'

    # Rows produced by this transformer carry DOCUMENT-typed annotations.
    outputAnnotatorType = AnnotatorType.DOCUMENT

    contentPath = Param(Params._dummy(), "contentPath",
                        "contentPath path to files to read",
                        typeConverter=TypeConverters.toString)

    outputCol = Param(Params._dummy(), "outputCol",
                      "output column name",
                      typeConverter=TypeConverters.toString)

    contentType = Param(Params._dummy(), "contentType",
                        "Set the content type to load following MIME specification",
                        typeConverter=TypeConverters.toString)

    explodeDocs = Param(Params._dummy(), "explodeDocs",
                        "whether to explode the documents into separate rows",
                        typeConverter=TypeConverters.toBoolean)

    flattenOutput = Param(Params._dummy(), "flattenOutput",
                          "If true, output is flattened to plain text with minimal metadata",
                          typeConverter=TypeConverters.toBoolean)

    titleThreshold = Param(Params._dummy(), "titleThreshold",
                           "Minimum font size threshold for title detection in PDF docs",
                           typeConverter=TypeConverters.toFloat)

    outputFormat = Param(Params._dummy(), "outputFormat",
                         "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
                         typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self):
        super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
        self._setDefault(outputCol="document")

    @keyword_only
    def setParams(self):
        """Sets the params captured by ``@keyword_only`` on this transformer."""
        return self._set(**self._input_kwargs)

    def setContentPath(self, value):
        """Sets the path to the files to read.

        Parameters
        ----------
        value : str
            Path to the input files
        """
        return self._set(contentPath=value)

    def setContentType(self, value):
        """Sets the content type to load, following the MIME specification.

        Parameters
        ----------
        value : str
            MIME content type of the input files
        """
        return self._set(contentType=value)

    def setExplodeDocs(self, value):
        """Sets whether documents are exploded into separate rows.

        Parameters
        ----------
        value : bool
            True to emit one row per document
        """
        return self._set(explodeDocs=value)

    def setOutputCol(self, value):
        """Sets the name of the output column.

        Parameters
        ----------
        value : str
            Output column name
        """
        return self._set(outputCol=value)

    def setFlattenOutput(self, value):
        """Sets whether the output is flattened to plain text.

        Parameters
        ----------
        value : bool
            True to flatten the output to plain text with minimal metadata
        """
        return self._set(flattenOutput=value)

    def setTitleThreshold(self, value):
        """Sets the minimum font size used for title detection in PDF documents.

        Parameters
        ----------
        value : float
            Font size threshold
        """
        return self._set(titleThreshold=value)

    def setOutputFormat(self, value):
        """Sets the output format for the table content.

        Parameters
        ----------
        value : str
            One of 'plain-text' or 'html-table'; the default is 'json-table'
        """
        return self._set(outputFormat=value)
|
|
@@ -413,4 +413,49 @@ class SparkNLPReader(ExtendedJavaWrapper):
|
|
|
413
413
|
if not isinstance(filePath, str):
|
|
414
414
|
raise TypeError("filePath must be a string")
|
|
415
415
|
jdf = self._java_obj.md(filePath)
|
|
416
|
+
return self.getDataFrame(self.spark, jdf)
|
|
417
|
+
|
|
418
|
+
def csv(self, csvPath):
    """Reads CSV files and returns a Spark DataFrame.

    Parameters
    ----------
    csvPath : str
        Path to a CSV file or a directory containing CSV files.

    Returns
    -------
    pyspark.sql.DataFrame
        A DataFrame containing parsed CSV content.

    Raises
    ------
    TypeError
        If ``csvPath`` is not a string.

    Examples
    --------
    >>> from sparknlp.reader import SparkNLPReader
    >>> csv_df = SparkNLPReader(spark).csv("home/user/csv-directory")

    You can use SparkNLP for one line of code

    >>> import sparknlp
    >>> csv_df = sparknlp.read().csv("home/user/csv-directory")
    >>> csv_df.show(truncate=False)
    +-----------------------------------------------------------------------------------------------------------------------------------------+
    |csv                                                                                                                                      |
    +-----------------------------------------------------------------------------------------------------------------------------------------+
    |[{NarrativeText, Alice 100 Bob 95, {}}, {Table, <table><tr><td>Alice</td><td>100</td></tr><tr><td>Bob</td><td>95</td></tr></table>, {}}] |
    +-----------------------------------------------------------------------------------------------------------------------------------------+

    >>> csv_df.printSchema()
    root
     |-- path: string (nullable = true)
     |-- csv: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- elementType: string (nullable = true)
     |    |    |-- content: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
    """
    if not isinstance(csvPath, str):
        # Fixed: the message previously said "docPath", which is not this
        # method's parameter name.
        raise TypeError("csvPath must be a string")
    # Delegate parsing to the JVM-side reader and wrap the returned Java
    # DataFrame into a Python one.
    jdf = self._java_obj.csv(csvPath)
    return self.getDataFrame(self.spark, jdf)
|
|
File without changes
|
|
File without changes
|