spark-nlp 6.0.5__py2.py3-none-any.whl → 6.1.1__py2.py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.

Potentially problematic release: this version of spark-nlp might be problematic.

@@ -0,0 +1,163 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+
+ from sparknlp.common import AnnotatorType
+ from sparknlp.internal import AnnotatorTransformer
+ from sparknlp.partition.partition_properties import *
+
+ class Reader2Table(
+     AnnotatorTransformer,
+     HasEmailReaderProperties,
+     HasExcelReaderProperties,
+     HasHTMLReaderProperties,
+     HasPowerPointProperties,
+     HasTextReaderProperties
+ ):
+     name = 'Reader2Table'
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     contentPath = Param(
+         Params._dummy(),
+         "contentPath",
+         "Path to the files to read",
+         typeConverter=TypeConverters.toString
+     )
+
+     outputCol = Param(
+         Params._dummy(),
+         "outputCol",
+         "Output column name",
+         typeConverter=TypeConverters.toString
+     )
+
+     contentType = Param(
+         Params._dummy(),
+         "contentType",
+         "Content type to load, following the MIME specification",
+         typeConverter=TypeConverters.toString
+     )
+
+     explodeDocs = Param(
+         Params._dummy(),
+         "explodeDocs",
+         "Whether to explode the documents into separate rows",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     flattenOutput = Param(
+         Params._dummy(),
+         "flattenOutput",
+         "If true, output is flattened to plain text with minimal metadata",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     titleThreshold = Param(
+         Params._dummy(),
+         "titleThreshold",
+         "Minimum font size threshold for title detection in PDF docs",
+         typeConverter=TypeConverters.toFloat
+     )
+
+     outputFormat = Param(
+         Params._dummy(),
+         "outputFormat",
+         "Output format for the table content. Options are 'plain-text', 'html-table', or 'json-table'. Default is 'json-table'.",
+         typeConverter=TypeConverters.toString
+     )
+
+     @keyword_only
+     def __init__(self):
+         super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
+         self._setDefault(outputCol="document")
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setContentPath(self, value):
+         """Sets the path to the files to read.
+
+         Parameters
+         ----------
+         value : str
+             Path to the files to read
+         """
+         return self._set(contentPath=value)
+
+     def setContentType(self, value):
+         """Sets the content type to load, following the MIME specification.
+
+         Parameters
+         ----------
+         value : str
+             Content type to load, following the MIME specification
+         """
+         return self._set(contentType=value)
+
+     def setExplodeDocs(self, value):
+         """Sets whether to explode the documents into separate rows.
+
+
+         Parameters
+         ----------
+         value : bool
+             Whether to explode the documents into separate rows
+         """
+         return self._set(explodeDocs=value)
+
+     def setOutputCol(self, value):
+         """Sets the output column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the output column
+         """
+         return self._set(outputCol=value)
+
+     def setFlattenOutput(self, value):
+         """Sets whether to flatten the output to plain text with minimal metadata.
+
+         Parameters
+         ----------
+         value : bool
+             If true, output is flattened to plain text with minimal metadata
+         """
+         return self._set(flattenOutput=value)
+
+     def setTitleThreshold(self, value):
+         """Sets the minimum font size threshold for title detection in PDF documents.
+
+         Parameters
+         ----------
+         value : float
+             Minimum font size threshold for title detection in PDF docs
+         """
+         return self._set(titleThreshold=value)
+
+     def setOutputFormat(self, value):
+         """Sets the output format for the table content.
+
+         Parameters
+         ----------
+         value : str
+             Output format for the table content. Options are 'plain-text', 'html-table', or 'json-table'. Default is 'json-table'.
+         """
+         return self._set(outputFormat=value)
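
The new Reader2Table class above is a standard Spark ML transformer, so it is configured through the setters shown and driven by transform (or a Pipeline). Below is a minimal usage sketch, not taken from the package: the import path, file paths, and parameter values are assumptions for illustration, following the pattern used by the existing Reader2Doc transformer.

import sparknlp
from pyspark.ml import Pipeline
from sparknlp.reader.reader2table import Reader2Table  # module path assumed

spark = sparknlp.start()

# Configure the transformer with the setters defined above; values are illustrative.
reader2table = (
    Reader2Table()
    .setContentType("text/html")       # MIME type of the inputs
    .setContentPath("./html-files")    # hypothetical input directory
    .setExplodeDocs(True)              # one output row per document
    .setOutputFormat("html-table")     # or 'plain-text' / 'json-table'
)

# The reader takes its input from contentPath, so an otherwise empty
# DataFrame is enough to drive the pipeline.
empty_df = spark.createDataFrame([("",)], ["text"])
model = Pipeline(stages=[reader2table]).fit(empty_df)
model.transform(empty_df).select("document").show(truncate=False)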
@@ -413,4 +413,49 @@ class SparkNLPReader(ExtendedJavaWrapper):
          if not isinstance(filePath, str):
              raise TypeError("filePath must be a string")
          jdf = self._java_obj.md(filePath)
+         return self.getDataFrame(self.spark, jdf)
+
+     def csv(self, csvPath):
+         """Reads CSV files and returns a Spark DataFrame.
+
+         Parameters
+         ----------
+         csvPath : str
+             Path to a CSV file or a directory containing CSV files.
+
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing parsed CSV content.
+
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> csv_df = SparkNLPReader(spark).csv("home/user/csv-directory")
+
+         You can also use Spark NLP with a single line of code:
+
+         >>> import sparknlp
+         >>> csv_df = sparknlp.read().csv("home/user/csv-directory")
+         >>> csv_df.show(truncate=False)
+         +-----------------------------------------------------------------------------------------------------------------------------------------+
+         |csv                                                                                                                                      |
+         +-----------------------------------------------------------------------------------------------------------------------------------------+
+         |[{NarrativeText, Alice 100 Bob 95, {}}, {Table, <table><tr><td>Alice</td><td>100</td></tr><tr><td>Bob</td><td>95</td></tr></table>, {}}] |
+         +-----------------------------------------------------------------------------------------------------------------------------------------+
+
+         >>> csv_df.printSchema()
+         root
+          |-- path: string (nullable = true)
+          |-- csv: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+         """
+         if not isinstance(csvPath, str):
+             raise TypeError("csvPath must be a string")
+         jdf = self._java_obj.csv(csvPath)
          return self.getDataFrame(self.spark, jdf)
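
Given the nested schema shown in the csv docstring, downstream code will typically explode the csv array column and filter on elementType. A short sketch under that assumption (the directory path is the docstring's illustrative one):

import sparknlp
from pyspark.sql import functions as F

# Read a directory of CSV files with the new reader method.
csv_df = sparknlp.read().csv("home/user/csv-directory")

# One row per parsed element, keeping the source file path.
elements = csv_df.select("path", F.explode("csv").alias("element"))

# Keep only the HTML table representations and their source files.
tables = (
    elements
    .filter(F.col("element.elementType") == "Table")
    .select("path", F.col("element.content").alias("html_table"))
)
tables.show(truncate=False)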