spark-nlp 6.1.3__py2.py3-none-any.whl → 6.1.4__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


{spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spark-nlp
-Version: 6.1.3
+Version: 6.1.4
 Summary: John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment.
 Home-page: https://github.com/JohnSnowLabs/spark-nlp
 Author: John Snow Labs
@@ -102,7 +102,7 @@ $ java -version
 $ conda create -n sparknlp python=3.7 -y
 $ conda activate sparknlp
 # spark-nlp by default is based on pyspark 3.x
-$ pip install spark-nlp==6.1.3 pyspark==3.3.1
+$ pip install spark-nlp==6.1.4 pyspark==3.3.1
 ```
 
 In Python console or Jupyter `Python3` kernel:
@@ -168,7 +168,7 @@ For a quick example of using pipelines and models take a look at our official [d
 
 ### Apache Spark Support
 
-Spark NLP *6.1.3* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
+Spark NLP *6.1.4* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
 
 | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
 |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
@@ -198,7 +198,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
 
 ### Databricks Support
 
-Spark NLP 6.1.3 has been tested and is compatible with the following runtimes:
+Spark NLP 6.1.4 has been tested and is compatible with the following runtimes:
 
 | **CPU** | **GPU** |
 |--------------------|--------------------|
@@ -216,7 +216,7 @@ We are compatible with older runtimes. For a full list check databricks support
 
 ### EMR Support
 
-Spark NLP 6.1.3 has been tested and is compatible with the following EMR releases:
+Spark NLP 6.1.4 has been tested and is compatible with the following EMR releases:
 
 | **EMR Release** |
 |--------------------|
{spark_nlp-6.1.3.dist-info → spark_nlp-6.1.4.dist-info}/RECORD RENAMED
@@ -3,7 +3,7 @@ com/johnsnowlabs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 com/johnsnowlabs/ml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 com/johnsnowlabs/ml/ai/__init__.py,sha256=YQiK2M7U4d8y5irPy_HB8ae0mSpqS9583MH44pnKJXc,295
 com/johnsnowlabs/nlp/__init__.py,sha256=DPIVXtONO5xXyOk-HB0-sNiHAcco17NN13zPS_6Uw8c,294
-sparknlp/__init__.py,sha256=UR0dRykX67j-Ksuzk5Xe-Mod5qCK24iBjHHa0omOp2w,13814
+sparknlp/__init__.py,sha256=LcfC7bWeae5XgjWbNbWH94LlJkBon5dA8fYnb_2NyGc,13814
 sparknlp/annotation.py,sha256=I5zOxG5vV2RfPZfqN9enT1i4mo6oBcn3Lrzs37QiOiA,5635
 sparknlp/annotation_audio.py,sha256=iRV_InSVhgvAwSRe9NTbUH9v6OGvTM-FPCpSAKVu0mE,1917
 sparknlp/annotation_image.py,sha256=xhCe8Ko-77XqWVuuYHFrjKqF6zPd8Z-RY_rmZXNwCXU,2547
@@ -241,7 +241,7 @@ sparknlp/logging/__init__.py,sha256=DoROFF5KLZe4t4Q-OHxqk1nhqbw9NQ-wb64y8icNwgw,
 sparknlp/logging/comet.py,sha256=_ZBi9-hlilCAnd4lvdYMWiq4Vqsppv8kow3k0cf-NG4,15958
 sparknlp/partition/__init__.py,sha256=L0w-yv_HnnvoKlSX5MzI2GKHW3RLLfGyq8bgWYVeKjU,749
 sparknlp/partition/partition.py,sha256=GXEAUvOea04Vc_JK0z112cAKFrJ4AEpjLJ8xlzZt6Kw,8551
-sparknlp/partition/partition_properties.py,sha256=xhAMhlsTBg-WS6KWDyVbRPwO7IzpowVVhJNR-ZGhvdo,9520
+sparknlp/partition/partition_properties.py,sha256=2tGdIv1NaJNaux_TTskKQHnARAwBkFctaqCcNw21Wr8,19920
 sparknlp/partition/partition_transformer.py,sha256=lRR1h-IMlHR8M0VeB50SbU39GHHF5PgMaJ42qOriS6A,6855
 sparknlp/pretrained/__init__.py,sha256=GV-x9UBK8F2_IR6zYatrzFcVJtkSUIMbxqWsxRUePmQ,793
 sparknlp/pretrained/pretrained_pipeline.py,sha256=lquxiaABuA68Rmu7csamJPqBoRJqMUO0oNHsmEZDAIs,5740
@@ -250,8 +250,9 @@ sparknlp/pretrained/utils.py,sha256=T1MrvW_DaWk_jcOjVLOea0NMFE9w8fe0ZT_5urZ_nEY,
 sparknlp/reader/__init__.py,sha256=-Toj3AIBki-zXPpV8ezFTI2LX1yP_rK2bhpoa8nBkTw,685
 sparknlp/reader/enums.py,sha256=MNGug9oJ1BBLM1Pbske13kAabalDzHa2kucF5xzFpHs,770
 sparknlp/reader/pdf_to_text.py,sha256=eWw-cwjosmcSZ9eHso0F5QQoeGBBnwsOhzhCXXvMjZA,7169
-sparknlp/reader/reader2doc.py,sha256=8x1tvx7Hj2J4xpyRiCUvrG-kmOPBvIE8K1tJZY-e0Xw,8200
-sparknlp/reader/reader2table.py,sha256=GC6Yz0gQ83S6XKOi329TUNQuAvLrBxysqDkDRZPvcYA,4759
+sparknlp/reader/reader2doc.py,sha256=87aMk8-_1NHd3bB1rxw56BQMJc6mGgtnYGXwKw2uCmU,5916
+sparknlp/reader/reader2image.py,sha256=k3gb4LEiqDV-pnD-HEaA1KHoAxXmoYys2Y817i1yvP0,4557
+sparknlp/reader/reader2table.py,sha256=pIR9r6NapUV4xdsFecadWlKTSJmRMAm36eqM9aXf13k,2416
 sparknlp/reader/sparknlp_reader.py,sha256=MJs8v_ECYaV1SOabI1L_2MkVYEDVImtwgbYypO7DJSY,20623
 sparknlp/training/__init__.py,sha256=qREi9u-5Vc2VjpL6-XZsyvu5jSEIdIhowW7_kKaqMqo,852
 sparknlp/training/conll.py,sha256=wKBiSTrjc6mjsl7Nyt6B8f4yXsDJkZb-sn8iOjix9cE,6961
@@ -283,7 +284,7 @@ sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py,sha256=R4yHFN3
 sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py,sha256=EoCSdcIjqQ3wv13MAuuWrKV8wyVBP0SbOEW41omHlR0,23189
 sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py,sha256=k5CQ7gKV6HZbZMB8cKLUJuZxoZWlP_DFWdZ--aIDwsc,2356
 sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py,sha256=pAxjWhjazSX8Vg0MFqJiuRVw1IbnQNSs-8Xp26L4nko,870
-spark_nlp-6.1.3.dist-info/METADATA,sha256=U4Fb5wRd8Ql6BULfRwQSE6Pa77wsLwOGwTk-s038YuI,19774
-spark_nlp-6.1.3.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
-spark_nlp-6.1.3.dist-info/top_level.txt,sha256=uuytur4pyMRw2H_txNY2ZkaucZHUs22QF8-R03ch_-E,13
-spark_nlp-6.1.3.dist-info/RECORD,,
+spark_nlp-6.1.4.dist-info/METADATA,sha256=CqRyNEZCA_8F_J5vHG4GUZXRiavXyfb3tPMTStidr4c,19774
+spark_nlp-6.1.4.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+spark_nlp-6.1.4.dist-info/top_level.txt,sha256=uuytur4pyMRw2H_txNY2ZkaucZHUs22QF8-R03ch_-E,13
+spark_nlp-6.1.4.dist-info/RECORD,,

sparknlp/__init__.py CHANGED
@@ -66,7 +66,7 @@ sys.modules['com.johnsnowlabs.ml.ai'] = annotator
 annotators = annotator
 embeddings = annotator
 
-__version__ = "6.1.3"
+__version__ = "6.1.4"
 
 
 def start(gpu=False,

sparknlp/partition/partition_properties.py CHANGED
@@ -13,8 +13,159 @@
 # limitations under the License.
 """Contains classes for partition properties used in reading various document types."""
 from typing import Dict
+from pyspark.ml.param import Param, Params, TypeConverters
 
-from pyspark.ml.param import TypeConverters, Params, Param
+
+class HasReaderProperties(Params):
+
+    outputCol = Param(
+        Params._dummy(),
+        "outputCol",
+        "output column name",
+        typeConverter=TypeConverters.toString
+    )
+
+    contentPath = Param(
+        Params._dummy(),
+        "contentPath",
+        "Path to the content source.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentPath(self, value: str):
+        """Sets content path.
+
+        Parameters
+        ----------
+        value : str
+            Path to the content source.
+        """
+        return self._set(contentPath=value)
+
+    contentType = Param(
+        Params._dummy(),
+        "contentType",
+        "Set the content type to load following MIME specification.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setContentType(self, value: str):
+        """Sets content type following MIME specification.
+
+        Parameters
+        ----------
+        value : str
+            Content type string (MIME format).
+        """
+        return self._set(contentType=value)
+
+    storeContent = Param(
+        Params._dummy(),
+        "storeContent",
+        "Whether to include the raw file content in the output DataFrame "
+        "as a separate 'content' column, alongside the structured output.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreContent(self, value: bool):
+        """Sets whether to store raw file content.
+
+        Parameters
+        ----------
+        value : bool
+            True to include raw file content, False otherwise.
+        """
+        return self._set(storeContent=value)
+
+    titleFontSize = Param(
+        Params._dummy(),
+        "titleFontSize",
+        "Minimum font size threshold used as part of heuristic rules to detect "
+        "title elements based on formatting (e.g., bold, centered, capitalized).",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setTitleFontSize(self, value: int):
+        """Sets minimum font size for detecting titles.
+
+        Parameters
+        ----------
+        value : int
+            Minimum font size threshold for title detection.
+        """
+        return self._set(titleFontSize=value)
+
+    inferTableStructure = Param(
+        Params._dummy(),
+        "inferTableStructure",
+        "Whether to generate an HTML table representation from structured table content. "
+        "When enabled, a full <table> element is added alongside cell-level elements, "
+        "based on row and column layout.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setInferTableStructure(self, value: bool):
+        """Sets whether to infer table structure.
+
+        Parameters
+        ----------
+        value : bool
+            True to generate HTML table representation, False otherwise.
+        """
+        return self._set(inferTableStructure=value)
+
+    includePageBreaks = Param(
+        Params._dummy(),
+        "includePageBreaks",
+        "Whether to detect and tag content with page break metadata. "
+        "In Word documents, this includes manual and section breaks. "
+        "In Excel files, this includes page breaks based on column boundaries.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIncludePageBreaks(self, value: bool):
+        """Sets whether to include page break metadata.
+
+        Parameters
+        ----------
+        value : bool
+            True to detect and tag page breaks, False otherwise.
+        """
+        return self._set(includePageBreaks=value)
+
+    ignoreExceptions = Param(
+        Params._dummy(),
+        "ignoreExceptions",
+        "Whether to ignore exceptions during processing.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setIgnoreExceptions(self, value: bool):
+        """Sets whether to ignore exceptions during processing.
+
+        Parameters
+        ----------
+        value : bool
+            True to ignore exceptions, False otherwise.
+        """
+        return self._set(ignoreExceptions=value)
+
+    explodeDocs = Param(
+        Params._dummy(),
+        "explodeDocs",
+        "Whether to explode the documents into separate rows.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setExplodeDocs(self, value: bool):
+        """Sets whether to explode the documents into separate rows.
+
+        Parameters
+        ----------
+        value : bool
+            True to split documents into multiple rows, False to keep them in one row.
+        """
+        return self._set(explodeDocs=value)
 
 
 class HasEmailReaderProperties(Params):
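
The new `HasReaderProperties` mixin consolidates params that were previously duplicated across `Reader2Doc` and `Reader2Table` (see their diffs below). It follows the standard pyspark `Params` mixin pattern. A minimal, self-contained sketch of that pattern, not code from the package — `HasDemoProperties`, `demoPath`, and `DemoReader` are hypothetical names:

```python
from pyspark.ml.param import Param, Params, TypeConverters


class HasDemoProperties(Params):
    # Hypothetical param, declared the same way as contentPath above.
    demoPath = Param(
        Params._dummy(),
        "demoPath",
        "Path to demo inputs.",
        typeConverter=TypeConverters.toString,
    )

    def setDemoPath(self, value: str):
        # _set runs the type converter, then stores the value in the
        # instance's param map.
        return self._set(demoPath=value)


class DemoReader(HasDemoProperties):
    """Toy stand-in; the real readers also extend AnnotatorTransformer."""


reader = DemoReader().setDemoPath("./inputs")
print(reader.getOrDefault(reader.demoPath))  # ./inputs
```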
@@ -144,6 +295,28 @@ class HasHTMLReaderProperties(Params):
         self._call_java("setHeadersPython", headers)
         return self
 
+    outputFormat = Param(
+        Params._dummy(),
+        "outputFormat",
+        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setOutputFormat(self, value: str):
+        """Sets output format for the table content.
+
+        Options
+        -------
+        - 'plain-text'
+        - 'html-table'
+        - 'json-table' (default)
+
+        Parameters
+        ----------
+        value : str
+            Output format for the table content.
+        """
+        return self._set(outputFormat=value)
 
 class HasPowerPointProperties(Params):
 
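
`setOutputFormat` has moved onto `HasHTMLReaderProperties`, so every reader that mixes this class in keeps the setter on its public API. A hedged snippet, assuming a Spark session has already been started via `sparknlp.start()`:

```python
# setOutputFormat now comes from the mixin, but callers use it unchanged.
from sparknlp.reader.reader2doc import Reader2Doc  # mixes in HasHTMLReaderProperties (see below)

reader = (
    Reader2Doc()
    .setContentType("text/html")
    .setOutputFormat("html-table")  # or "plain-text"; "json-table" is the documented default
)
```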
@@ -317,3 +490,206 @@ class HasChunkerProperties(Params):
 
     def setOverlapAll(self, value):
         return self._set(overlapAll=value)
+
+
+from pyspark.ml.param import Param, Params, TypeConverters
+
+
+class HasPdfProperties(Params):
+
+    pageNumCol = Param(
+        Params._dummy(),
+        "pageNumCol",
+        "Page number output column name.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setPageNumCol(self, value: str):
+        """Sets page number output column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the column for page numbers.
+        """
+        return self._set(pageNumCol=value)
+
+    originCol = Param(
+        Params._dummy(),
+        "originCol",
+        "Input column name with original path of file.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setOriginCol(self, value: str):
+        """Sets input column with original file path.
+
+        Parameters
+        ----------
+        value : str
+            Column name that stores the file path.
+        """
+        return self._set(originCol=value)
+
+    partitionNum = Param(
+        Params._dummy(),
+        "partitionNum",
+        "Number of partitions.",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setPartitionNum(self, value: int):
+        """Sets number of partitions.
+
+        Parameters
+        ----------
+        value : int
+            Number of partitions to use.
+        """
+        return self._set(partitionNum=value)
+
+    storeSplittedPdf = Param(
+        Params._dummy(),
+        "storeSplittedPdf",
+        "Force to store bytes content of splitted pdf.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setStoreSplittedPdf(self, value: bool):
+        """Sets whether to store byte content of split PDF pages.
+
+        Parameters
+        ----------
+        value : bool
+            True to store PDF page bytes, False otherwise.
+        """
+        return self._set(storeSplittedPdf=value)
+
+    splitPage = Param(
+        Params._dummy(),
+        "splitPage",
+        "Enable/disable splitting per page to identify page numbers and improve performance.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setSplitPage(self, value: bool):
+        """Sets whether to split PDF into pages.
+
+        Parameters
+        ----------
+        value : bool
+            True to split per page, False otherwise.
+        """
+        return self._set(splitPage=value)
+
+    onlyPageNum = Param(
+        Params._dummy(),
+        "onlyPageNum",
+        "Extract only page numbers.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setOnlyPageNum(self, value: bool):
+        """Sets whether to extract only page numbers.
+
+        Parameters
+        ----------
+        value : bool
+            True to extract only page numbers, False otherwise.
+        """
+        return self._set(onlyPageNum=value)
+
+    textStripper = Param(
+        Params._dummy(),
+        "textStripper",
+        "Text stripper type used for output layout and formatting.",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setTextStripper(self, value: str):
+        """Sets text stripper type.
+
+        Parameters
+        ----------
+        value : str
+            Text stripper type for layout and formatting.
+        """
+        return self._set(textStripper=value)
+
+    sort = Param(
+        Params._dummy(),
+        "sort",
+        "Enable/disable sorting content on the page.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setSort(self, value: bool):
+        """Sets whether to sort content on the page.
+
+        Parameters
+        ----------
+        value : bool
+            True to sort content, False otherwise.
+        """
+        return self._set(sort=value)
+
+    extractCoordinates = Param(
+        Params._dummy(),
+        "extractCoordinates",
+        "Force extract coordinates of text.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setExtractCoordinates(self, value: bool):
+        """Sets whether to extract coordinates of text.
+
+        Parameters
+        ----------
+        value : bool
+            True to extract coordinates, False otherwise.
+        """
+        return self._set(extractCoordinates=value)
+
+    normalizeLigatures = Param(
+        Params._dummy(),
+        "normalizeLigatures",
+        "Whether to convert ligature chars such as 'fl' into its corresponding chars (e.g., {'f', 'l'}).",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setNormalizeLigatures(self, value: bool):
+        """Sets whether to normalize ligatures (e.g., fl → f + l).
+
+        Parameters
+        ----------
+        value : bool
+            True to normalize ligatures, False otherwise.
+        """
+        return self._set(normalizeLigatures=value)
+
+    readAsImage = Param(
+        Params._dummy(),
+        "readAsImage",
+        "Read PDF pages as images.",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setReadAsImage(self, value: bool):
+        """Sets whether to read PDF pages as images.
+
+        Parameters
+        ----------
+        value : bool
+            True to read as images, False otherwise.
+        """
+        return self._set(readAsImage=value)
+
+    def setOutputCol(self, value):
+        """Sets output column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the Output Column
+        """
+        return self._set(outputCol=value)
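
`HasPdfProperties` is consumed by the new `Reader2Image` transformer added later in this diff. A hedged sketch of chaining its setters, assuming a running Spark session; the input path is illustrative:

```python
# Sketch: HasPdfProperties setters chained on Reader2Image,
# which mixes this class in (see reader2image.py below).
from sparknlp.reader.reader2image import Reader2Image

reader = (
    Reader2Image()
    .setContentPath("./pdfs")     # inherited from HasReaderProperties; path is illustrative
    .setReadAsImage(True)         # render PDF pages as images (the Reader2Image default)
    .setSplitPage(True)           # one output row per page
    .setNormalizeLigatures(True)  # fold ligatures such as 'fl' into 'f' + 'l'
)
```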

sparknlp/reader/reader2doc.py CHANGED
@@ -21,9 +21,10 @@ from sparknlp.partition.partition_properties import *
 
 class Reader2Doc(
     AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
-    HasHTMLReaderProperties,
     HasPowerPointProperties,
     HasTextReaderProperties
 ):
@@ -73,33 +74,6 @@ class Reader2Doc(
     name = "Reader2Doc"
     outputAnnotatorType = AnnotatorType.DOCUMENT
 
-    contentPath = Param(
-        Params._dummy(),
-        "contentPath",
-        "contentPath path to files to read",
-        typeConverter=TypeConverters.toString
-    )
-
-    outputCol = Param(
-        Params._dummy(),
-        "outputCol",
-        "output column name",
-        typeConverter=TypeConverters.toString
-    )
-
-    contentType = Param(
-        Params._dummy(),
-        "contentType",
-        "Set the content type to load following MIME specification",
-        typeConverter=TypeConverters.toString
-    )
-
-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
 
     flattenOutput = Param(
         Params._dummy(),
@@ -115,13 +89,6 @@ class Reader2Doc(
         typeConverter=TypeConverters.toFloat
     )
 
-    outputFormat = Param(
-        Params._dummy(),
-        "outputFormat",
-        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
-        typeConverter=TypeConverters.toString
-    )
-
     outputAsDocument = Param(
         Params._dummy(),
         "outputAsDocument",
@@ -151,47 +118,6 @@ class Reader2Doc(
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
-    def setContentPath(self, value):
-        """Sets content path.
-
-        Parameters
-        ----------
-        value : str
-            contentPath path to files to read
-        """
-        return self._set(contentPath=value)
-
-    def setContentType(self, value):
-        """
-        Set the content type to load following MIME specification
-
-        Parameters
-        ----------
-        value : str
-            content type to load following MIME specification
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
-        """
-        return self._set(explodeDocs=value)
-
-    def setOutputCol(self, value):
-        """Sets output column name.
-
-        Parameters
-        ----------
-        value : str
-            Name of the Output Column
-        """
-        return self._set(outputCol=value)
 
     def setFlattenOutput(self, value):
         """Sets whether to flatten the output to plain text with minimal metadata.
@@ -213,16 +139,6 @@ class Reader2Doc(
         """
         return self._set(titleThreshold=value)
 
-    def setOutputFormat(self, value):
-        """Sets the output format for the table content.
-
-        Parameters
-        ----------
-        value : str
-            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
-        """
-        return self._set(outputFormat=value)
-
     def setOutputAsDocument(self, value):
         """Sets whether to return all sentences joined into a single document.
 
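
The net effect of this refactor is that most of `Reader2Doc`'s public API is unchanged: the removed params and setters now arrive via `HasReaderProperties` and `HasHTMLReaderProperties`. (One caveat visible in this diff: `HasReaderProperties` declares the `outputCol` param but no `setOutputCol` setter; in these changes that setter appears only on `HasPdfProperties`.) A usage sketch, assuming a local session and an `./html-files` directory of inputs:

```python
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.reader.reader2doc import Reader2Doc

spark = sparknlp.start()

reader2doc = (
    Reader2Doc()
    .setContentType("text/html")
    .setContentPath("./html-files")  # directory name is illustrative
)

# Reader transformers take their input from contentPath, so an empty
# DataFrame is enough to drive the pipeline.
empty_df = spark.createDataFrame([], "string")
model = Pipeline(stages=[reader2doc]).fit(empty_df)
model.transform(empty_df).show(truncate=False)
```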
sparknlp/reader/reader2image.py ADDED
@@ -0,0 +1,136 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark import keyword_only
+from pyspark.ml.param import TypeConverters, Params, Param
+
+from sparknlp.common import AnnotatorType
+from sparknlp.internal import AnnotatorTransformer
+from sparknlp.partition.partition_properties import *
+
+class Reader2Image(
+    AnnotatorTransformer,
+    HasReaderProperties,
+    HasHTMLReaderProperties,
+    HasPdfProperties
+):
+    """
+    The Reader2Image annotator allows you to use the reading files with images more smoothly within existing
+    Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Image can be used for
+    extracting structured image content from various document types using Spark NLP readers. It supports
+    reading from many file types and returns parsed output as a structured Spark DataFrame.
+
+    Supported formats include HTML and Markdown.
+
+    == Example ==
+    This example demonstrates how to load HTML files with images and process them into a structured
+    Spark DataFrame using Reader2Image.
+
+    Expected output:
+    +-------------------+--------------------+
+    |           fileName|               image|
+    +-------------------+--------------------+
+    |example-images.html|[{image, example-...|
+    |example-images.html|[{image, example-...|
+    +-------------------+--------------------+
+
+    Schema:
+    root
+     |-- fileName: string (nullable = true)
+     |-- image: array (nullable = false)
+     |    |-- element: struct (containsNull = true)
+     |    |    |-- annotatorType: string (nullable = true)
+     |    |    |-- origin: string (nullable = true)
+     |    |    |-- height: integer (nullable = false)
+     |    |    |-- width: integer (nullable = false)
+     |    |    |-- nChannels: integer (nullable = false)
+     |    |    |-- mode: integer (nullable = false)
+     |    |    |-- result: binary (nullable = true)
+     |    |    |-- metadata: map (nullable = true)
+     |    |    |    |-- key: string
+     |    |    |    |-- value: string (valueContainsNull = true)
+     |    |    |-- text: string (nullable = true)
+    """
+
+    name = "Reader2Image"
+    outputAnnotatorType = AnnotatorType.IMAGE
+
+    userMessage = Param(
+        Params._dummy(),
+        "userMessage",
+        "Custom user message.",
+        typeConverter=TypeConverters.toString
+    )
+
+    promptTemplate = Param(
+        Params._dummy(),
+        "promptTemplate",
+        "Format of the output prompt.",
+        typeConverter=TypeConverters.toString
+    )
+
+    customPromptTemplate = Param(
+        Params._dummy(),
+        "customPromptTemplate",
+        "Custom prompt template for image models.",
+        typeConverter=TypeConverters.toString
+    )
+
+    @keyword_only
+    def __init__(self):
+        super(Reader2Image, self).__init__(classname="com.johnsnowlabs.reader.Reader2Image")
+        self._setDefault(
+            contentType="",
+            outputFormat="image",
+            explodeDocs=True,
+            userMessage="Describe this image",
+            promptTemplate="qwen2vl-chat",
+            readAsImage=True,
+            customPromptTemplate="",
+            ignoreExceptions=True
+        )
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setUserMessage(self, value: str):
+        """Sets custom user message.
+
+        Parameters
+        ----------
+        value : str
+            Custom user message to include.
+        """
+        return self._set(userMessage=value)
+
+    def setPromptTemplate(self, value: str):
+        """Sets format of the output prompt.
+
+        Parameters
+        ----------
+        value : str
+            Prompt template format.
+        """
+        return self._set(promptTemplate=value)
+
+    def setCustomPromptTemplate(self, value: str):
+        """Sets custom prompt template for image models.
+
+        Parameters
+        ----------
+        value : str
+            Custom prompt template string.
+        """
+        return self._set(customPromptTemplate=value)
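
A usage sketch for the new annotator, following the docstring's HTML example above; the directory name is illustrative and a local session is assumed:

```python
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.reader.reader2image import Reader2Image

spark = sparknlp.start()

reader2image = (
    Reader2Image()
    .setContentType("text/html")
    .setContentPath("./html-with-images")   # illustrative directory of HTML files
    .setOutputCol("image")                  # setter inherited via HasPdfProperties
    .setUserMessage("Describe this image")  # matches the default set in __init__
)

empty_df = spark.createDataFrame([], "string")
result = Pipeline(stages=[reader2image]).fit(empty_df).transform(empty_df)
result.printSchema()  # should match the schema shown in the docstring
```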

sparknlp/reader/reader2table.py CHANGED
@@ -13,14 +13,15 @@
 # limitations under the License.
 
 from pyspark import keyword_only
-from pyspark.ml.param import TypeConverters, Params, Param
 
 from sparknlp.common import AnnotatorType
 from sparknlp.internal import AnnotatorTransformer
 from sparknlp.partition.partition_properties import *
 
+
 class Reader2Table(
     AnnotatorTransformer,
+    HasReaderProperties,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
     HasHTMLReaderProperties,
@@ -31,34 +32,6 @@ class Reader2Table(
 
     outputAnnotatorType = AnnotatorType.DOCUMENT
 
-    contentPath = Param(
-        Params._dummy(),
-        "contentPath",
-        "contentPath path to files to read",
-        typeConverter=TypeConverters.toString
-    )
-
-    outputCol = Param(
-        Params._dummy(),
-        "outputCol",
-        "output column name",
-        typeConverter=TypeConverters.toString
-    )
-
-    contentType = Param(
-        Params._dummy(),
-        "contentType",
-        "Set the content type to load following MIME specification",
-        typeConverter=TypeConverters.toString
-    )
-
-    explodeDocs = Param(
-        Params._dummy(),
-        "explodeDocs",
-        "whether to explode the documents into separate rows",
-        typeConverter=TypeConverters.toBoolean
-    )
-
    flattenOutput = Param(
        Params._dummy(),
        "flattenOutput",
@@ -73,13 +46,6 @@ class Reader2Table(
         typeConverter=TypeConverters.toFloat
     )
 
-    outputFormat = Param(
-        Params._dummy(),
-        "outputFormat",
-        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
-        typeConverter=TypeConverters.toString
-    )
-
     @keyword_only
     def __init__(self):
         super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
@@ -90,48 +56,6 @@ class Reader2Table(
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
-    def setContentPath(self, value):
-        """Sets content path.
-
-        Parameters
-        ----------
-        value : str
-            contentPath path to files to read
-        """
-        return self._set(contentPath=value)
-
-    def setContentType(self, value):
-        """
-        Set the content type to load following MIME specification
-
-        Parameters
-        ----------
-        value : str
-            content type to load following MIME specification
-        """
-        return self._set(contentType=value)
-
-    def setExplodeDocs(self, value):
-        """Sets whether to explode the documents into separate rows.
-
-
-        Parameters
-        ----------
-        value : boolean
-            Whether to explode the documents into separate rows
-        """
-        return self._set(explodeDocs=value)
-
-    def setOutputCol(self, value):
-        """Sets output column name.
-
-        Parameters
-        ----------
-        value : str
-            Name of the Output Column
-        """
-        return self._set(outputCol=value)
-
     def setFlattenOutput(self, value):
         """Sets whether to flatten the output to plain text with minimal metadata.
 
@@ -151,13 +75,3 @@
             Minimum font size threshold for title detection in PDF docs
         """
         return self._set(titleThreshold=value)
-
-    def setOutputFormat(self, value):
-        """Sets the output format for the table content.
-
-        Parameters
-        ----------
-        value : str
-            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
-        """
-        return self._set(outputFormat=value)
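
As with `Reader2Doc`, `Reader2Table` keeps most of its setter surface after this cleanup: `setContentPath`, `setContentType`, and `setExplodeDocs` come from `HasReaderProperties`, and `setOutputFormat` from `HasHTMLReaderProperties`. (As noted above, the `setOutputCol` setter is defined only on `HasPdfProperties` in this diff.) A hedged snippet, Spark session assumed and paths illustrative:

```python
from sparknlp.reader.reader2table import Reader2Table

reader2table = (
    Reader2Table()
    .setContentType("text/html")    # illustrative MIME type
    .setContentPath("./tables")
    .setOutputFormat("json-table")  # the documented default
)
```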