spark-nlp 6.1.3rc1__py2.py3-none-any.whl → 6.1.4__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,234 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the GGUFRankingFinisher."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+ from sparknlp.internal import AnnotatorTransformer
+
+
+ class GGUFRankingFinisher(AnnotatorTransformer):
+     """Finisher for AutoGGUFReranker outputs that provides ranking capabilities,
+     including top-k selection, sorting by relevance score, and score normalization.
+
+     This finisher processes the output of AutoGGUFReranker, which contains documents with
+     relevance scores in their metadata. It provides several options for post-processing:
+
+     - Top-k selection: Select only the top k documents by relevance score
+     - Score thresholding: Filter documents by minimum relevance score
+     - Min-max scaling: Normalize relevance scores to the 0-1 range
+     - Sorting: Sort documents by relevance score in descending order
+     - Ranking: Add rank information to document metadata
+
+     The finisher preserves the document annotation structure while adding ranking information
+     to the metadata and optionally filtering/sorting the documents.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/finisher/gguf_ranking_finisher_example.py>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     inputCols
+         Name of input annotation columns containing reranked documents
+     outputCol
+         Name of output annotation column containing ranked documents, by default "ranked_documents"
+     topK
+         Maximum number of top documents to return based on relevance score (-1 for no limit), by default -1
+     minRelevanceScore
+         Minimum relevance score threshold for filtering documents, by default negative infinity (no filtering)
+     minMaxScaling
+         Whether to apply min-max scaling to normalize relevance scores to the 0-1 range, by default False
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> reranker = AutoGGUFReranker.pretrained() \\
+     ...     .setInputCols("document") \\
+     ...     .setOutputCol("reranked_documents") \\
+     ...     .setQuery("A man is eating pasta.")
+     >>> finisher = GGUFRankingFinisher() \\
+     ...     .setInputCols("reranked_documents") \\
+     ...     .setOutputCol("ranked_documents") \\
+     ...     .setTopK(3) \\
+     ...     .setMinMaxScaling(True)
+     >>> pipeline = Pipeline().setStages([documentAssembler, reranker, finisher])
+     >>> data = spark.createDataFrame([
+     ...     ("A man is eating food.",),
+     ...     ("A man is eating a piece of bread.",),
+     ...     ("The girl is carrying a baby.",),
+     ...     ("A man is riding a horse.",)
+     ... ], ["text"])
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("ranked_documents").show(truncate=False)
+     # Documents will be sorted by relevance, with rank information in the metadata
+     """
+
+     name = "GGUFRankingFinisher"
+
+     inputCols = Param(Params._dummy(),
+                       "inputCols",
+                       "Name of input annotation columns containing reranked documents",
+                       typeConverter=TypeConverters.toListString)
+
+     outputCol = Param(Params._dummy(),
+                       "outputCol",
+                       "Name of output annotation column containing ranked documents",
+                       typeConverter=TypeConverters.toListString)
+
+     topK = Param(Params._dummy(),
+                  "topK",
+                  "Maximum number of top documents to return based on relevance score (-1 for no limit)",
+                  typeConverter=TypeConverters.toInt)
+
+     minRelevanceScore = Param(Params._dummy(),
+                               "minRelevanceScore",
+                               "Minimum relevance score threshold for filtering documents",
+                               typeConverter=TypeConverters.toFloat)
+
+     minMaxScaling = Param(Params._dummy(),
+                           "minMaxScaling",
+                           "Whether to apply min-max scaling to normalize relevance scores to the 0-1 range",
+                           typeConverter=TypeConverters.toBoolean)
+
+     @keyword_only
+     def __init__(self):
+         super(GGUFRankingFinisher, self).__init__(
+             classname="com.johnsnowlabs.nlp.finisher.GGUFRankingFinisher")
+         self._setDefault(
+             topK=-1,
+             minRelevanceScore=float('-inf'),  # no lower bound (the Scala side uses Double.MinValue)
+             minMaxScaling=False,
+             outputCol=["ranked_documents"]
+         )
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setInputCols(self, *value):
+         """Sets input annotation column names.
+
+         Parameters
+         ----------
+         value : List[str]
+             Input annotation column names containing reranked documents
+         """
+         if len(value) == 1 and isinstance(value[0], list):
+             return self._set(inputCols=value[0])
+         else:
+             return self._set(inputCols=list(value))
+
+     def getInputCols(self):
+         """Gets input annotation column names.
+
+         Returns
+         -------
+         List[str]
+             Input annotation column names
+         """
+         return self.getOrDefault(self.inputCols)
+
+     def setOutputCol(self, value):
+         """Sets output annotation column name.
+
+         Parameters
+         ----------
+         value : str
+             Output annotation column name
+         """
+         return self._set(outputCol=[value])
+
+     def getOutputCol(self):
+         """Gets output annotation column name.
+
+         Returns
+         -------
+         str
+             Output annotation column name
+         """
+         output_cols = self.getOrDefault(self.outputCol)
+         return output_cols[0] if output_cols else "ranked_documents"
+
+     def setTopK(self, value):
+         """Sets maximum number of top documents to return.
+
+         Parameters
+         ----------
+         value : int
+             Maximum number of top documents to return (-1 for no limit)
+         """
+         return self._set(topK=value)
+
+     def getTopK(self):
+         """Gets maximum number of top documents to return.
+
+         Returns
+         -------
+         int
+             Maximum number of top documents to return
+         """
+         return self.getOrDefault(self.topK)
+
+     def setMinRelevanceScore(self, value):
+         """Sets minimum relevance score threshold.
+
+         Parameters
+         ----------
+         value : float
+             Minimum relevance score threshold
+         """
+         return self._set(minRelevanceScore=value)
+
+     def getMinRelevanceScore(self):
+         """Gets minimum relevance score threshold.
+
+         Returns
+         -------
+         float
+             Minimum relevance score threshold
+         """
+         return self.getOrDefault(self.minRelevanceScore)
+
+     def setMinMaxScaling(self, value):
+         """Sets whether to apply min-max scaling.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to apply min-max scaling to normalize scores
+         """
+         return self._set(minMaxScaling=value)
+
+     def getMinMaxScaling(self):
+         """Gets whether to apply min-max scaling.
+
+         Returns
+         -------
+         bool
+             Whether min-max scaling is enabled
+         """
+         return self.getOrDefault(self.minMaxScaling)
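
Below is a minimal sketch of how the ranking metadata added by this finisher can be inspected, continuing the docstring example above (`result` is the pipeline output). The metadata key names "rank" and "relevance_score" are assumptions inferred from the class documentation, not confirmed by this diff:

    # Hedged sketch: flatten the ranked annotations and read their metadata.
    # The keys "rank" and "relevance_score" are assumed, not verified here.
    from pyspark.sql import functions as F

    ranked = result.select(F.explode("ranked_documents").alias("doc")).select(
        F.col("doc.metadata").getItem("rank").cast("int").alias("rank"),
        F.col("doc.metadata").getItem("relevance_score").cast("double").alias("score"),
        F.col("doc.result").alias("text"),
    )
    ranked.orderBy("rank").show(truncate=False)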
@@ -13,8 +13,159 @@
  # limitations under the License.
  """Contains classes for partition properties used in reading various document types."""
  from typing import Dict
+ from pyspark.ml.param import Param, Params, TypeConverters

- from pyspark.ml.param import TypeConverters, Params, Param
+
+ class HasReaderProperties(Params):
+
+     outputCol = Param(
+         Params._dummy(),
+         "outputCol",
+         "output column name",
+         typeConverter=TypeConverters.toString
+     )
+
+     contentPath = Param(
+         Params._dummy(),
+         "contentPath",
+         "Path to the content source.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentPath(self, value: str):
+         """Sets content path.
+
+         Parameters
+         ----------
+         value : str
+             Path to the content source.
+         """
+         return self._set(contentPath=value)
+
+     contentType = Param(
+         Params._dummy(),
+         "contentType",
+         "Content type to load, following the MIME specification.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentType(self, value: str):
+         """Sets the content type, following the MIME specification.
+
+         Parameters
+         ----------
+         value : str
+             Content type string (MIME format).
+         """
+         return self._set(contentType=value)
+
+     storeContent = Param(
+         Params._dummy(),
+         "storeContent",
+         "Whether to include the raw file content in the output DataFrame "
+         "as a separate 'content' column, alongside the structured output.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setStoreContent(self, value: bool):
+         """Sets whether to store raw file content.
+
+         Parameters
+         ----------
+         value : bool
+             True to include raw file content, False otherwise.
+         """
+         return self._set(storeContent=value)
+
+     titleFontSize = Param(
+         Params._dummy(),
+         "titleFontSize",
+         "Minimum font size threshold used as part of heuristic rules to detect "
+         "title elements based on formatting (e.g., bold, centered, capitalized).",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setTitleFontSize(self, value: int):
+         """Sets minimum font size for detecting titles.
+
+         Parameters
+         ----------
+         value : int
+             Minimum font size threshold for title detection.
+         """
+         return self._set(titleFontSize=value)
+
+     inferTableStructure = Param(
+         Params._dummy(),
+         "inferTableStructure",
+         "Whether to generate an HTML table representation from structured table content. "
+         "When enabled, a full <table> element is added alongside cell-level elements, "
+         "based on row and column layout.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setInferTableStructure(self, value: bool):
+         """Sets whether to infer table structure.
+
+         Parameters
+         ----------
+         value : bool
+             True to generate HTML table representation, False otherwise.
+         """
+         return self._set(inferTableStructure=value)
+
+     includePageBreaks = Param(
+         Params._dummy(),
+         "includePageBreaks",
+         "Whether to detect and tag content with page break metadata. "
+         "In Word documents, this includes manual and section breaks. "
+         "In Excel files, this includes page breaks based on column boundaries.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setIncludePageBreaks(self, value: bool):
+         """Sets whether to include page break metadata.
+
+         Parameters
+         ----------
+         value : bool
+             True to detect and tag page breaks, False otherwise.
+         """
+         return self._set(includePageBreaks=value)
+
+     ignoreExceptions = Param(
+         Params._dummy(),
+         "ignoreExceptions",
+         "Whether to ignore exceptions during processing.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setIgnoreExceptions(self, value: bool):
+         """Sets whether to ignore exceptions during processing.
+
+         Parameters
+         ----------
+         value : bool
+             True to ignore exceptions, False otherwise.
+         """
+         return self._set(ignoreExceptions=value)
+
+     explodeDocs = Param(
+         Params._dummy(),
+         "explodeDocs",
+         "Whether to explode the documents into separate rows.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setExplodeDocs(self, value: bool):
+         """Sets whether to explode the documents into separate rows.
+
+         Parameters
+         ----------
+         value : bool
+             True to split documents into multiple rows, False to keep them in one row.
+         """
+         return self._set(explodeDocs=value)


  class HasEmailReaderProperties(Params):
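
Since HasReaderProperties is a plain Param mixin, any transformer can compose it to gain these typed setters. A self-contained sketch of the pattern (the class name ExampleReader is illustrative, not part of this release):

    # Hedged sketch: composing the mixin; ExampleReader is hypothetical.
    class ExampleReader(HasReaderProperties):
        pass

    reader = (ExampleReader()
              .setContentType("application/pdf")
              .setStoreContent(True)
              .setIncludePageBreaks(True))
    print(reader.getOrDefault(reader.contentType))  # application/pdf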
@@ -144,6 +295,28 @@ class HasHTMLReaderProperties(Params):
          self._call_java("setHeadersPython", headers)
          return self

+     outputFormat = Param(
+         Params._dummy(),
+         "outputFormat",
+         "Output format for the table content. Options are 'plain-text', 'html-table', or 'json-table'. Default is 'json-table'.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setOutputFormat(self, value: str):
+         """Sets the output format for the table content.
+
+         Options
+         -------
+         - 'plain-text'
+         - 'html-table'
+         - 'json-table' (default)
+
+         Parameters
+         ----------
+         value : str
+             Output format for the table content.
+         """
+         return self._set(outputFormat=value)

  class HasPowerPointProperties(Params):

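
A hedged usage sketch for the new outputFormat parameter (HtmlExampleReader is illustrative; in practice the mixin is inherited by an HTML reader class):

    # Valid values per the docstring: 'plain-text', 'html-table',
    # and 'json-table' (the default).
    class HtmlExampleReader(HasHTMLReaderProperties):
        pass

    html_reader = HtmlExampleReader().setOutputFormat("html-table")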
@@ -317,3 +490,203 @@ class HasChunkerProperties(Params):

      def setOverlapAll(self, value):
          return self._set(overlapAll=value)
+
+
+ class HasPdfProperties(Params):
+
+     pageNumCol = Param(
+         Params._dummy(),
+         "pageNumCol",
+         "Page number output column name.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setPageNumCol(self, value: str):
+         """Sets page number output column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the column for page numbers.
+         """
+         return self._set(pageNumCol=value)
+
+     originCol = Param(
+         Params._dummy(),
+         "originCol",
+         "Input column name with original path of file.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setOriginCol(self, value: str):
+         """Sets input column with original file path.
+
+         Parameters
+         ----------
+         value : str
+             Column name that stores the file path.
+         """
+         return self._set(originCol=value)
+
+     partitionNum = Param(
+         Params._dummy(),
+         "partitionNum",
+         "Number of partitions.",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setPartitionNum(self, value: int):
+         """Sets number of partitions.
+
+         Parameters
+         ----------
+         value : int
+             Number of partitions to use.
+         """
+         return self._set(partitionNum=value)
+
+     storeSplittedPdf = Param(
+         Params._dummy(),
+         "storeSplittedPdf",
+         "Whether to store the byte content of each split PDF page.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setStoreSplittedPdf(self, value: bool):
+         """Sets whether to store the byte content of split PDF pages.
+
+         Parameters
+         ----------
+         value : bool
+             True to store PDF page bytes, False otherwise.
+         """
+         return self._set(storeSplittedPdf=value)
+
+     splitPage = Param(
+         Params._dummy(),
+         "splitPage",
+         "Enable/disable splitting per page to identify page numbers and improve performance.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setSplitPage(self, value: bool):
+         """Sets whether to split PDF into pages.
+
+         Parameters
+         ----------
+         value : bool
+             True to split per page, False otherwise.
+         """
+         return self._set(splitPage=value)
+
+     onlyPageNum = Param(
+         Params._dummy(),
+         "onlyPageNum",
+         "Extract only page numbers.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setOnlyPageNum(self, value: bool):
+         """Sets whether to extract only page numbers.
+
+         Parameters
+         ----------
+         value : bool
+             True to extract only page numbers, False otherwise.
+         """
+         return self._set(onlyPageNum=value)
+
+     textStripper = Param(
+         Params._dummy(),
+         "textStripper",
+         "Text stripper type used for output layout and formatting.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setTextStripper(self, value: str):
+         """Sets text stripper type.
+
+         Parameters
+         ----------
+         value : str
+             Text stripper type for layout and formatting.
+         """
+         return self._set(textStripper=value)
+
+     sort = Param(
+         Params._dummy(),
+         "sort",
+         "Enable/disable sorting content on the page.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setSort(self, value: bool):
+         """Sets whether to sort content on the page.
+
+         Parameters
+         ----------
+         value : bool
+             True to sort content, False otherwise.
+         """
+         return self._set(sort=value)
+
+     extractCoordinates = Param(
+         Params._dummy(),
+         "extractCoordinates",
+         "Whether to extract coordinates of text.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setExtractCoordinates(self, value: bool):
+         """Sets whether to extract coordinates of text.
+
+         Parameters
+         ----------
+         value : bool
+             True to extract coordinates, False otherwise.
+         """
+         return self._set(extractCoordinates=value)
+
+     normalizeLigatures = Param(
+         Params._dummy(),
+         "normalizeLigatures",
+         "Whether to convert ligature characters such as 'fl' into their constituent characters (e.g., 'f', 'l').",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setNormalizeLigatures(self, value: bool):
+         """Sets whether to normalize ligatures (e.g., fl → f + l).
+
+         Parameters
+         ----------
+         value : bool
+             True to normalize ligatures, False otherwise.
+         """
+         return self._set(normalizeLigatures=value)
+
+     readAsImage = Param(
+         Params._dummy(),
+         "readAsImage",
+         "Read PDF pages as images.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setReadAsImage(self, value: bool):
+         """Sets whether to read PDF pages as images.
+
+         Parameters
+         ----------
+         value : bool
+             True to read as images, False otherwise.
+         """
+         return self._set(readAsImage=value)
+
+     def setOutputCol(self, value):
+         """Sets output column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the output column.
+         """
+         return self._set(outputCol=value)
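
Note that HasPdfProperties defines setOutputCol but does not declare the outputCol Param itself; presumably it is meant to be mixed in alongside HasReaderProperties, which does declare it. A sketch of that composition (PdfExampleReader is illustrative, not part of this release):

    # Hedged sketch: HasReaderProperties supplies the outputCol Param that
    # HasPdfProperties.setOutputCol writes to; PdfExampleReader is hypothetical.
    class PdfExampleReader(HasReaderProperties, HasPdfProperties):
        pass

    pdf_reader = (PdfExampleReader()
                  .setSplitPage(True)
                  .setExtractCoordinates(True)
                  .setNormalizeLigatures(True)
                  .setOutputCol("pdf_text"))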