spark-nlp 6.1.3__py2.py3-none-any.whl → 6.1.4__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of spark-nlp has been flagged as potentially problematic.
- {spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/METADATA +5 -5
- {spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/RECORD +9 -8
- sparknlp/__init__.py +1 -1
- sparknlp/partition/partition_properties.py +377 -1
- sparknlp/reader/reader2doc.py +2 -86
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +2 -88
- {spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/WHEEL +0 -0
- {spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spark-nlp
-Version: 6.1.3
+Version: 6.1.4
 Summary: John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment.
 Home-page: https://github.com/JohnSnowLabs/spark-nlp
 Author: John Snow Labs
@@ -102,7 +102,7 @@ $ java -version
 $ conda create -n sparknlp python=3.7 -y
 $ conda activate sparknlp
 # spark-nlp by default is based on pyspark 3.x
-$ pip install spark-nlp==6.1.3 pyspark==3.3.1
+$ pip install spark-nlp==6.1.4 pyspark==3.3.1
 ```
 
 In Python console or Jupyter `Python3` kernel:
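For reference, a quick post-upgrade smoke test (a minimal sketch; `sparknlp.start()` and `sparknlp.version()` are the library's standard entry points):

```python
import sparknlp

# Start a Spark session with the Spark NLP jars attached.
spark = sparknlp.start()

print(sparknlp.version())  # expected: 6.1.4
print(spark.version)       # the PySpark version, e.g. 3.3.1
```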
@@ -168,7 +168,7 @@ For a quick example of using pipelines and models take a look at our official [d
 
 ### Apache Spark Support
 
-Spark NLP *6.1.3* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
+Spark NLP *6.1.4* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
 
 | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
 |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
@@ -198,7 +198,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
 
 ### Databricks Support
 
-Spark NLP 6.1.3 has been tested and is compatible with the following runtimes:
+Spark NLP 6.1.4 has been tested and is compatible with the following runtimes:
 
 | **CPU** | **GPU** |
 |--------------------|--------------------|
@@ -216,7 +216,7 @@ We are compatible with older runtimes. For a full list check databricks support
 
 ### EMR Support
 
-Spark NLP 6.1.3 has been tested and is compatible with the following EMR releases:
+Spark NLP 6.1.4 has been tested and is compatible with the following EMR releases:
 
 | **EMR Release** |
 |--------------------|
@@ -3,7 +3,7 @@ com/johnsnowlabs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 com/johnsnowlabs/ml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 com/johnsnowlabs/ml/ai/__init__.py,sha256=YQiK2M7U4d8y5irPy_HB8ae0mSpqS9583MH44pnKJXc,295
 com/johnsnowlabs/nlp/__init__.py,sha256=DPIVXtONO5xXyOk-HB0-sNiHAcco17NN13zPS_6Uw8c,294
-sparknlp/__init__.py,sha256=
+sparknlp/__init__.py,sha256=LcfC7bWeae5XgjWbNbWH94LlJkBon5dA8fYnb_2NyGc,13814
 sparknlp/annotation.py,sha256=I5zOxG5vV2RfPZfqN9enT1i4mo6oBcn3Lrzs37QiOiA,5635
 sparknlp/annotation_audio.py,sha256=iRV_InSVhgvAwSRe9NTbUH9v6OGvTM-FPCpSAKVu0mE,1917
 sparknlp/annotation_image.py,sha256=xhCe8Ko-77XqWVuuYHFrjKqF6zPd8Z-RY_rmZXNwCXU,2547
@@ -241,7 +241,7 @@ sparknlp/logging/__init__.py,sha256=DoROFF5KLZe4t4Q-OHxqk1nhqbw9NQ-wb64y8icNwgw,
 sparknlp/logging/comet.py,sha256=_ZBi9-hlilCAnd4lvdYMWiq4Vqsppv8kow3k0cf-NG4,15958
 sparknlp/partition/__init__.py,sha256=L0w-yv_HnnvoKlSX5MzI2GKHW3RLLfGyq8bgWYVeKjU,749
 sparknlp/partition/partition.py,sha256=GXEAUvOea04Vc_JK0z112cAKFrJ4AEpjLJ8xlzZt6Kw,8551
-sparknlp/partition/partition_properties.py,sha256=
+sparknlp/partition/partition_properties.py,sha256=2tGdIv1NaJNaux_TTskKQHnARAwBkFctaqCcNw21Wr8,19920
 sparknlp/partition/partition_transformer.py,sha256=lRR1h-IMlHR8M0VeB50SbU39GHHF5PgMaJ42qOriS6A,6855
 sparknlp/pretrained/__init__.py,sha256=GV-x9UBK8F2_IR6zYatrzFcVJtkSUIMbxqWsxRUePmQ,793
 sparknlp/pretrained/pretrained_pipeline.py,sha256=lquxiaABuA68Rmu7csamJPqBoRJqMUO0oNHsmEZDAIs,5740
@@ -250,8 +250,9 @@ sparknlp/pretrained/utils.py,sha256=T1MrvW_DaWk_jcOjVLOea0NMFE9w8fe0ZT_5urZ_nEY,
 sparknlp/reader/__init__.py,sha256=-Toj3AIBki-zXPpV8ezFTI2LX1yP_rK2bhpoa8nBkTw,685
 sparknlp/reader/enums.py,sha256=MNGug9oJ1BBLM1Pbske13kAabalDzHa2kucF5xzFpHs,770
 sparknlp/reader/pdf_to_text.py,sha256=eWw-cwjosmcSZ9eHso0F5QQoeGBBnwsOhzhCXXvMjZA,7169
-sparknlp/reader/reader2doc.py,sha256=
-sparknlp/reader/reader2table.py,sha256=
+sparknlp/reader/reader2doc.py,sha256=87aMk8-_1NHd3bB1rxw56BQMJc6mGgtnYGXwKw2uCmU,5916
+sparknlp/reader/reader2image.py,sha256=k3gb4LEiqDV-pnD-HEaA1KHoAxXmoYys2Y817i1yvP0,4557
+sparknlp/reader/reader2table.py,sha256=pIR9r6NapUV4xdsFecadWlKTSJmRMAm36eqM9aXf13k,2416
 sparknlp/reader/sparknlp_reader.py,sha256=MJs8v_ECYaV1SOabI1L_2MkVYEDVImtwgbYypO7DJSY,20623
 sparknlp/training/__init__.py,sha256=qREi9u-5Vc2VjpL6-XZsyvu5jSEIdIhowW7_kKaqMqo,852
 sparknlp/training/conll.py,sha256=wKBiSTrjc6mjsl7Nyt6B8f4yXsDJkZb-sn8iOjix9cE,6961
@@ -283,7 +284,7 @@ sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py,sha256=R4yHFN3
 sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py,sha256=EoCSdcIjqQ3wv13MAuuWrKV8wyVBP0SbOEW41omHlR0,23189
 sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py,sha256=k5CQ7gKV6HZbZMB8cKLUJuZxoZWlP_DFWdZ--aIDwsc,2356
 sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py,sha256=pAxjWhjazSX8Vg0MFqJiuRVw1IbnQNSs-8Xp26L4nko,870
-spark_nlp-6.1.3.dist-info/METADATA,sha256=
-spark_nlp-6.1.3.dist-info/WHEEL,sha256=
-spark_nlp-6.1.3.dist-info/top_level.txt,sha256=
-spark_nlp-6.1.3.dist-info/RECORD,,
+spark_nlp-6.1.4.dist-info/METADATA,sha256=CqRyNEZCA_8F_J5vHG4GUZXRiavXyfb3tPMTStidr4c,19774
+spark_nlp-6.1.4.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+spark_nlp-6.1.4.dist-info/top_level.txt,sha256=uuytur4pyMRw2H_txNY2ZkaucZHUs22QF8-R03ch_-E,13
+spark_nlp-6.1.4.dist-info/RECORD,,
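The `sha256=` values in RECORD follow the wheel convention (PEP 427): urlsafe base64 of the raw SHA-256 digest with the `=` padding stripped. A small sketch to recompute one locally (the path assumes an unpacked 6.1.4 wheel in the current directory):

```python
import base64
import hashlib


def record_hash(path: str) -> str:
    """Recompute a RECORD-style hash for a file."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


# Expected for 6.1.4: sha256=LcfC7bWeae5XgjWbNbWH94LlJkBon5dA8fYnb_2NyGc
print(record_hash("sparknlp/__init__.py"))
```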
sparknlp/partition/partition_properties.py
CHANGED
@@ -13,8 +13,159 @@
 # limitations under the License.
 """Contains classes for partition properties used in reading various document types."""
 from typing import Dict
+from pyspark.ml.param import Param, Params, TypeConverters
 
-
+
+class HasReaderProperties(Params):
+
+    outputCol = Param(
+        Params._dummy(),
+        "outputCol",
+        "output column name",
+        typeConverter=TypeConverters.toString
+    )
+
+    contentPath = Param(
+        Params._dummy(),
+        "contentPath",
+        "Path to the content source.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentPath(self, value: str):
+        """Sets content path.
+
+        Parameters
+        ----------
+        value : str
+            Path to the content source.
+        """
+        return self._set(contentPath=value)
+
+    contentType = Param(
+        Params._dummy(),
+        "contentType",
+        "Set the content type to load following MIME specification.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentType(self, value: str):
+        """Sets content type following MIME specification.
+
+        Parameters
+        ----------
+        value : str
+            Content type string (MIME format).
+        """
+        return self._set(contentType=value)
+
+    storeContent = Param(
+        Params._dummy(),
+        "storeContent",
+        "Whether to include the raw file content in the output DataFrame "
+        "as a separate 'content' column, alongside the structured output.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreContent(self, value: bool):
+        """Sets whether to store raw file content.
+
+        Parameters
+        ----------
+        value : bool
+            True to include raw file content, False otherwise.
+        """
+        return self._set(storeContent=value)
+
+    titleFontSize = Param(
+        Params._dummy(),
+        "titleFontSize",
+        "Minimum font size threshold used as part of heuristic rules to detect "
+        "title elements based on formatting (e.g., bold, centered, capitalized).",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTitleFontSize(self, value: int):
+        """Sets minimum font size for detecting titles.
+
+        Parameters
+        ----------
+        value : int
+            Minimum font size threshold for title detection.
+        """
+        return self._set(titleFontSize=value)
+
+    inferTableStructure = Param(
+        Params._dummy(),
+        "inferTableStructure",
+        "Whether to generate an HTML table representation from structured table content. "
+        "When enabled, a full <table> element is added alongside cell-level elements, "
+        "based on row and column layout.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setInferTableStructure(self, value: bool):
+        """Sets whether to infer table structure.
+
+        Parameters
+        ----------
+        value : bool
+            True to generate HTML table representation, False otherwise.
+        """
+        return self._set(inferTableStructure=value)
+
+    includePageBreaks = Param(
+        Params._dummy(),
+        "includePageBreaks",
+        "Whether to detect and tag content with page break metadata. "
+        "In Word documents, this includes manual and section breaks. "
+        "In Excel files, this includes page breaks based on column boundaries.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIncludePageBreaks(self, value: bool):
+        """Sets whether to include page break metadata.
+
+        Parameters
+        ----------
+        value : bool
+            True to detect and tag page breaks, False otherwise.
+        """
+        return self._set(includePageBreaks=value)
+
+    ignoreExceptions = Param(
+        Params._dummy(),
+        "ignoreExceptions",
+        "Whether to ignore exceptions during processing.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIgnoreExceptions(self, value: bool):
+        """Sets whether to ignore exceptions during processing.
+
+        Parameters
+        ----------
+        value : bool
+            True to ignore exceptions, False otherwise.
+        """
+        return self._set(ignoreExceptions=value)
+
+    explodeDocs = Param(
+        Params._dummy(),
+        "explodeDocs",
+        "Whether to explode the documents into separate rows.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setExplodeDocs(self, value: bool):
+        """Sets whether to explode the documents into separate rows.
+
+        Parameters
+        ----------
+        value : bool
+            True to split documents into multiple rows, False to keep them in one row.
+        """
+        return self._set(explodeDocs=value)
 
 
 class HasEmailReaderProperties(Params):
@@ -144,6 +295,28 @@ class HasHTMLReaderProperties(Params):
         self._call_java("setHeadersPython", headers)
         return self
 
+    outputFormat = Param(
+        Params._dummy(),
+        "outputFormat",
+        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setOutputFormat(self, value: str):
+        """Sets output format for the table content.
+
+        Options
+        -------
+        - 'plain-text'
+        - 'html-table'
+        - 'json-table' (default)
+
+        Parameters
+        ----------
+        value : str
+            Output format for the table content.
+        """
+        return self._set(outputFormat=value)
 
 class HasPowerPointProperties(Params):
 
@@ -317,3 +490,206 @@ class HasChunkerProperties(Params):
 
     def setOverlapAll(self, value):
         return self._set(overlapAll=value)
+
+
+from pyspark.ml.param import Param, Params, TypeConverters
+
+
+class HasPdfProperties(Params):
+
+    pageNumCol = Param(
+        Params._dummy(),
+        "pageNumCol",
+        "Page number output column name.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setPageNumCol(self, value: str):
+        """Sets page number output column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the column for page numbers.
+        """
+        return self._set(pageNumCol=value)
+
+    originCol = Param(
+        Params._dummy(),
+        "originCol",
+        "Input column name with original path of file.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setOriginCol(self, value: str):
+        """Sets input column with original file path.
+
+        Parameters
+        ----------
+        value : str
+            Column name that stores the file path.
+        """
+        return self._set(originCol=value)
+
+    partitionNum = Param(
+        Params._dummy(),
+        "partitionNum",
+        "Number of partitions.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setPartitionNum(self, value: int):
+        """Sets number of partitions.
+
+        Parameters
+        ----------
+        value : int
+            Number of partitions to use.
+        """
+        return self._set(partitionNum=value)
+
+    storeSplittedPdf = Param(
+        Params._dummy(),
+        "storeSplittedPdf",
+        "Force to store bytes content of splitted pdf.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreSplittedPdf(self, value: bool):
+        """Sets whether to store byte content of split PDF pages.
+
+        Parameters
+        ----------
+        value : bool
+            True to store PDF page bytes, False otherwise.
+        """
+        return self._set(storeSplittedPdf=value)
+
+    splitPage = Param(
+        Params._dummy(),
+        "splitPage",
+        "Enable/disable splitting per page to identify page numbers and improve performance.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setSplitPage(self, value: bool):
+        """Sets whether to split PDF into pages.
+
+        Parameters
+        ----------
+        value : bool
+            True to split per page, False otherwise.
+        """
+        return self._set(splitPage=value)
+
+    onlyPageNum = Param(
+        Params._dummy(),
+        "onlyPageNum",
+        "Extract only page numbers.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setOnlyPageNum(self, value: bool):
+        """Sets whether to extract only page numbers.
+
+        Parameters
+        ----------
+        value : bool
+            True to extract only page numbers, False otherwise.
+        """
+        return self._set(onlyPageNum=value)
+
+    textStripper = Param(
+        Params._dummy(),
+        "textStripper",
+        "Text stripper type used for output layout and formatting.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setTextStripper(self, value: str):
+        """Sets text stripper type.
+
+        Parameters
+        ----------
+        value : str
+            Text stripper type for layout and formatting.
+        """
+        return self._set(textStripper=value)
+
+    sort = Param(
+        Params._dummy(),
+        "sort",
+        "Enable/disable sorting content on the page.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setSort(self, value: bool):
+        """Sets whether to sort content on the page.
+
+        Parameters
+        ----------
+        value : bool
+            True to sort content, False otherwise.
+        """
+        return self._set(sort=value)
+
+    extractCoordinates = Param(
+        Params._dummy(),
+        "extractCoordinates",
+        "Force extract coordinates of text.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setExtractCoordinates(self, value: bool):
+        """Sets whether to extract coordinates of text.
+
+        Parameters
+        ----------
+        value : bool
+            True to extract coordinates, False otherwise.
+        """
+        return self._set(extractCoordinates=value)
+
+    normalizeLigatures = Param(
+        Params._dummy(),
+        "normalizeLigatures",
+        "Whether to convert ligature chars such as 'fl' into its corresponding chars (e.g., {'f', 'l'}).",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setNormalizeLigatures(self, value: bool):
+        """Sets whether to normalize ligatures (e.g., fl → f + l).
+
+        Parameters
+        ----------
+        value : bool
+            True to normalize ligatures, False otherwise.
+        """
+        return self._set(normalizeLigatures=value)
+
+    readAsImage = Param(
+        Params._dummy(),
+        "readAsImage",
+        "Read PDF pages as images.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setReadAsImage(self, value: bool):
+        """Sets whether to read PDF pages as images.
+
+        Parameters
+        ----------
+        value : bool
+            True to read as images, False otherwise.
+        """
+        return self._set(readAsImage=value)
+
+    def setOutputCol(self, value):
+        """Sets output column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the Output Column
+        """
+        return self._set(outputCol=value)
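After this change, the params shared by every reader (`contentPath`, `contentType`, `outputCol`, `storeContent`, `explodeDocs`, and friends) live once in `HasReaderProperties`, and the PDF-specific ones in `HasPdfProperties`, instead of being redefined in each reader class. A toy sketch of the composition pattern (the `MyReader` class is hypothetical; this runs with plain PySpark and needs no Spark session):

```python
from sparknlp.partition.partition_properties import (
    HasPdfProperties,
    HasReaderProperties,
)


class MyReader(HasReaderProperties, HasPdfProperties):
    """Hypothetical reader composing the 6.1.4 mixins, as Reader2Image does."""


reader = MyReader()
reader.setContentPath("/tmp/docs")  # from HasReaderProperties
reader.setReadAsImage(True)         # from HasPdfProperties
reader.setOutputCol("image")        # setOutputCol is defined on HasPdfProperties
print(reader.getOrDefault(reader.contentPath))  # /tmp/docs
```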
sparknlp/reader/reader2doc.py
CHANGED
@@ -21,9 +21,10 @@ from sparknlp.partition.partition_properties import *
 
 class Reader2Doc(
     AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
-    HasHTMLReaderProperties,
     HasPowerPointProperties,
     HasTextReaderProperties
 ):
@@ -73,33 +74,6 @@ class Reader2Doc(
     name = "Reader2Doc"
     outputAnnotatorType = AnnotatorType.DOCUMENT
 
-    contentPath = Param(
-        Params._dummy(),
-        "contentPath",
-        "contentPath path to files to read",
-        typeConverter=TypeConverters.toString
-    )
-
-    outputCol = Param(
-        Params._dummy(),
-        "outputCol",
-        "output column name",
-        typeConverter=TypeConverters.toString
-    )
-
-    contentType = Param(
-        Params._dummy(),
-        "contentType",
-        "Set the content type to load following MIME specification",
-        typeConverter=TypeConverters.toString
-    )
-
-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
 
     flattenOutput = Param(
         Params._dummy(),
@@ -115,13 +89,6 @@ class Reader2Doc(
         typeConverter=TypeConverters.toFloat
     )
 
-    outputFormat = Param(
-        Params._dummy(),
-        "outputFormat",
-        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
-        typeConverter=TypeConverters.toString
-    )
-
     outputAsDocument = Param(
         Params._dummy(),
         "outputAsDocument",
@@ -151,47 +118,6 @@ class Reader2Doc(
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
-    def setContentPath(self, value):
-        """Sets content path.
-
-        Parameters
-        ----------
-        value : str
-            contentPath path to files to read
-        """
-        return self._set(contentPath=value)
-
-    def setContentType(self, value):
-        """
-        Set the content type to load following MIME specification
-
-        Parameters
-        ----------
-        value : str
-            content type to load following MIME specification
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
-        """
-        return self._set(explodeDocs=value)
-
-    def setOutputCol(self, value):
-        """Sets output column name.
-
-        Parameters
-        ----------
-        value : str
-            Name of the Output Column
-        """
-        return self._set(outputCol=value)
 
     def setFlattenOutput(self, value):
         """Sets whether to flatten the output to plain text with minimal metadata.
@@ -213,16 +139,6 @@ class Reader2Doc(
         """
         return self._set(titleThreshold=value)
 
-    def setOutputFormat(self, value):
-        """Sets the output format for the table content.
-
-        Parameters
-        ----------
-        value : str
-            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
-        """
-        return self._set(outputFormat=value)
-
     def setOutputAsDocument(self, value):
         """Sets whether to return all sentences joined into a single document.
 
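Downstream usage is unchanged by the refactor; the same setters are simply inherited now. A minimal sketch of `Reader2Doc` as a pipeline stage, following the pattern from the Spark NLP documentation (the input path is hypothetical):

```python
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.reader.reader2doc import Reader2Doc

spark = sparknlp.start()

reader2doc = (
    Reader2Doc()
    .setContentType("text/html")
    .setContentPath("/data/html/example.html")  # hypothetical path
    .setOutputCol("document")
)

# The reader pulls files from contentPath, so an empty DataFrame suffices.
empty_df = spark.createDataFrame([], "string").toDF("text")
result = Pipeline(stages=[reader2doc]).fit(empty_df).transform(empty_df)
result.selectExpr("explode(document) as doc").show(truncate=False)
```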
sparknlp/reader/reader2image.py
ADDED
@@ -0,0 +1,136 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark import keyword_only
+from pyspark.ml.param import TypeConverters, Params, Param
+
+from sparknlp.common import AnnotatorType
+from sparknlp.internal import AnnotatorTransformer
+from sparknlp.partition.partition_properties import *
+
+class Reader2Image(
+    AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
+    HasPdfProperties
+):
+    """
+    The Reader2Image annotator allows you to use the reading files with images more smoothly within existing
+    Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Image can be used for
+    extracting structured image content from various document types using Spark NLP readers. It supports
+    reading from many file types and returns parsed output as a structured Spark DataFrame.
+
+    Supported formats include HTML and Markdown.
+
+    == Example ==
+    This example demonstrates how to load HTML files with images and process them into a structured
+    Spark DataFrame using Reader2Image.
+
+    Expected output:
+    +-------------------+--------------------+
+    |           fileName|               image|
+    +-------------------+--------------------+
+    |example-images.html|[{image, example-...|
+    |example-images.html|[{image, example-...|
+    +-------------------+--------------------+
+
+    Schema:
+    root
+     |-- fileName: string (nullable = true)
+     |-- image: array (nullable = false)
+     |    |-- element: struct (containsNull = true)
+     |    |    |-- annotatorType: string (nullable = true)
+     |    |    |-- origin: string (nullable = true)
+     |    |    |-- height: integer (nullable = false)
+     |    |    |-- width: integer (nullable = false)
+     |    |    |-- nChannels: integer (nullable = false)
+     |    |    |-- mode: integer (nullable = false)
+     |    |    |-- result: binary (nullable = true)
+     |    |    |-- metadata: map (nullable = true)
+     |    |    |    |-- key: string
+     |    |    |    |-- value: string (valueContainsNull = true)
+     |    |    |-- text: string (nullable = true)
+    """
+
+    name = "Reader2Image"
+    outputAnnotatorType = AnnotatorType.IMAGE
+
+    userMessage = Param(
+        Params._dummy(),
+        "userMessage",
+        "Custom user message.",
+        typeConverter=TypeConverters.toString
+    )
+
+    promptTemplate = Param(
+        Params._dummy(),
+        "promptTemplate",
+        "Format of the output prompt.",
+        typeConverter=TypeConverters.toString
+    )
+
+    customPromptTemplate = Param(
+        Params._dummy(),
+        "customPromptTemplate",
+        "Custom prompt template for image models.",
+        typeConverter=TypeConverters.toString
+    )
+
+    @keyword_only
+    def __init__(self):
+        super(Reader2Image, self).__init__(classname="com.johnsnowlabs.reader.Reader2Image")
+        self._setDefault(
+            contentType="",
+            outputFormat="image",
+            explodeDocs=True,
+            userMessage="Describe this image",
+            promptTemplate="qwen2vl-chat",
+            readAsImage=True,
+            customPromptTemplate="",
+            ignoreExceptions=True
+        )
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setUserMessage(self, value: str):
+        """Sets custom user message.
+
+        Parameters
+        ----------
+        value : str
+            Custom user message to include.
+        """
+        return self._set(userMessage=value)
+
+    def setPromptTemplate(self, value: str):
+        """Sets format of the output prompt.
+
+        Parameters
+        ----------
+        value : str
+            Prompt template format.
+        """
+        return self._set(promptTemplate=value)
+
+    def setCustomPromptTemplate(self, value: str):
+        """Sets custom prompt template for image models.
+
+        Parameters
+        ----------
+        value : str
+            Custom prompt template string.
+        """
+        return self._set(customPromptTemplate=value)
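A minimal usage sketch for the new annotator (the HTML file is hypothetical; `userMessage`, `promptTemplate`, and `readAsImage` keep the defaults set in `__init__` unless overridden):

```python
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.reader.reader2image import Reader2Image

spark = sparknlp.start()

reader2image = (
    Reader2Image()
    .setContentType("text/html")
    .setContentPath("./example-images.html")  # hypothetical file
    .setOutputCol("image")
    .setUserMessage("Describe this image")
)

empty_df = spark.createDataFrame([], "string").toDF("text")
result = Pipeline(stages=[reader2image]).fit(empty_df).transform(empty_df)
result.printSchema()  # matches the schema shown in the docstring above
```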
sparknlp/reader/reader2table.py
CHANGED
@@ -13,14 +13,15 @@
 # limitations under the License.
 
 from pyspark import keyword_only
-from pyspark.ml.param import TypeConverters, Params, Param
 
 from sparknlp.common import AnnotatorType
 from sparknlp.internal import AnnotatorTransformer
 from sparknlp.partition.partition_properties import *
 
+
 class Reader2Table(
     AnnotatorTransformer,
+    HasReaderProperties,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
     HasHTMLReaderProperties,
@@ -31,34 +32,6 @@ class Reader2Table(
 
     outputAnnotatorType = AnnotatorType.DOCUMENT
 
-    contentPath = Param(
-        Params._dummy(),
-        "contentPath",
-        "contentPath path to files to read",
-        typeConverter=TypeConverters.toString
-    )
-
-    outputCol = Param(
-        Params._dummy(),
-        "outputCol",
-        "output column name",
-        typeConverter=TypeConverters.toString
-    )
-
-    contentType = Param(
-        Params._dummy(),
-        "contentType",
-        "Set the content type to load following MIME specification",
-        typeConverter=TypeConverters.toString
-    )
-
-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
-
     flattenOutput = Param(
         Params._dummy(),
         "flattenOutput",
@@ -73,13 +46,6 @@ class Reader2Table(
         typeConverter=TypeConverters.toFloat
     )
 
-    outputFormat = Param(
-        Params._dummy(),
-        "outputFormat",
-        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
-        typeConverter=TypeConverters.toString
-    )
-
     @keyword_only
     def __init__(self):
         super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
@@ -90,48 +56,6 @@ class Reader2Table(
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
-    def setContentPath(self, value):
-        """Sets content path.
-
-        Parameters
-        ----------
-        value : str
-            contentPath path to files to read
-        """
-        return self._set(contentPath=value)
-
-    def setContentType(self, value):
-        """
-        Set the content type to load following MIME specification
-
-        Parameters
-        ----------
-        value : str
-            content type to load following MIME specification
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
-        """
-        return self._set(explodeDocs=value)
-
-    def setOutputCol(self, value):
-        """Sets output column name.
-
-        Parameters
-        ----------
-        value : str
-            Name of the Output Column
-        """
-        return self._set(outputCol=value)
-
     def setFlattenOutput(self, value):
         """Sets whether to flatten the output to plain text with minimal metadata.
 
@@ -151,13 +75,3 @@ class Reader2Table(
             Minimum font size threshold for title detection in PDF docs
         """
         return self._set(titleThreshold=value)
-
-    def setOutputFormat(self, value):
-        """Sets the output format for the table content.
-
-        Parameters
-        ----------
-        value : str
-            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
-        """
-        return self._set(outputFormat=value)
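`Reader2Table` keeps its surface API, but `outputFormat`/`setOutputFormat` now come from the shared `HasHTMLReaderProperties` mixin, and `setOutputCol` is no longer defined on the class itself. A minimal sketch (the spreadsheet path is hypothetical):

```python
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.reader.reader2table import Reader2Table

spark = sparknlp.start()

reader2table = (
    Reader2Table()
    .setContentPath("/data/tables/example.xlsx")  # hypothetical path
    .setContentType("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
    .setOutputFormat("html-table")  # default is 'json-table'
)
# setOutputCol was removed from this class; the generic Params.set still
# assigns the outputCol param if no mixin provides a dedicated setter.
reader2table.set(reader2table.outputCol, "table")

empty_df = spark.createDataFrame([], "string").toDF("text")
result = Pipeline(stages=[reader2table]).fit(empty_df).transform(empty_df)
```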
{spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/WHEEL
File without changes
{spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/top_level.txt
File without changes