spark-nlp 6.1.3rc1__py2.py3-none-any.whl → 6.1.5__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,234 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the GGUFRankingFinisher."""
+
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+ from sparknlp.internal import AnnotatorTransformer
+
+
+ class GGUFRankingFinisher(AnnotatorTransformer):
+     """Finisher for AutoGGUFReranker outputs that provides ranking capabilities
+     including top-k selection, sorting by relevance score, and score normalization.
+
+     This finisher processes the output of AutoGGUFReranker, which contains documents with
+     relevance scores in their metadata. It provides several options for post-processing:
+
+     - Top-k selection: Select only the top k documents by relevance score
+     - Score thresholding: Filter documents by minimum relevance score
+     - Min-max scaling: Normalize relevance scores to 0-1 range
+     - Sorting: Sort documents by relevance score in descending order
+     - Ranking: Add rank information to document metadata
+
+     The finisher preserves the document annotation structure while adding ranking information
+     to the metadata and optionally filtering/sorting the documents.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/finisher/gguf_ranking_finisher_example.py>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     inputCols
+         Name of input annotation columns containing reranked documents
+     outputCol
+         Name of output annotation column containing ranked documents, by default "ranked_documents"
+     topK
+         Maximum number of top documents to return based on relevance score (-1 for no limit), by default -1
+     minRelevanceScore
+         Minimum relevance score threshold for filtering documents, by default ``float('-inf')`` (no filtering)
+     minMaxScaling
+         Whether to apply min-max scaling to normalize relevance scores to 0-1 range, by default False
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> reranker = AutoGGUFReranker.pretrained() \\
+     ...     .setInputCols("document") \\
+     ...     .setOutputCol("reranked_documents") \\
+     ...     .setQuery("A man is eating pasta.")
+     >>> finisher = GGUFRankingFinisher() \\
+     ...     .setInputCols("reranked_documents") \\
+     ...     .setOutputCol("ranked_documents") \\
+     ...     .setTopK(3) \\
+     ...     .setMinMaxScaling(True)
+     >>> pipeline = Pipeline().setStages([documentAssembler, reranker, finisher])
+     >>> data = spark.createDataFrame([
+     ...     ("A man is eating food.",),
+     ...     ("A man is eating a piece of bread.",),
+     ...     ("The girl is carrying a baby.",),
+     ...     ("A man is riding a horse.",)
+     ... ], ["text"])
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("ranked_documents").show(truncate=False)
+     # Documents will be sorted by relevance with rank information in metadata
+     """
+
+     name = "GGUFRankingFinisher"
+
+     inputCols = Param(Params._dummy(),
+                       "inputCols",
+                       "Name of input annotation columns containing reranked documents",
+                       typeConverter=TypeConverters.toListString)
+
+     outputCol = Param(Params._dummy(),
+                       "outputCol",
+                       "Name of output annotation column containing ranked documents",
+                       typeConverter=TypeConverters.toListString)
+
+     topK = Param(Params._dummy(),
+                  "topK",
+                  "Maximum number of top documents to return based on relevance score (-1 for no limit)",
+                  typeConverter=TypeConverters.toInt)
+
+     minRelevanceScore = Param(Params._dummy(),
+                               "minRelevanceScore",
+                               "Minimum relevance score threshold for filtering documents",
+                               typeConverter=TypeConverters.toFloat)
+
+     minMaxScaling = Param(Params._dummy(),
+                           "minMaxScaling",
+                           "Whether to apply min-max scaling to normalize relevance scores to 0-1 range",
+                           typeConverter=TypeConverters.toBoolean)
+
+     @keyword_only
+     def __init__(self):
+         super(GGUFRankingFinisher, self).__init__(
+             classname="com.johnsnowlabs.nlp.finisher.GGUFRankingFinisher")
+         self._setDefault(
+             topK=-1,
+             minRelevanceScore=float('-inf'),  # Equivalent to Double.MinValue
+             minMaxScaling=False,
+             outputCol=["ranked_documents"]
+         )
+
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+
+     def setInputCols(self, *value):
+         """Sets input annotation column names.
+
+         Parameters
+         ----------
+         value : List[str]
+             Input annotation column names containing reranked documents
+         """
+         if len(value) == 1 and isinstance(value[0], list):
+             return self._set(inputCols=value[0])
+         else:
+             return self._set(inputCols=list(value))
+
+     def getInputCols(self):
+         """Gets input annotation column names.
+
+         Returns
+         -------
+         List[str]
+             Input annotation column names
+         """
+         return self.getOrDefault(self.inputCols)
+
+     def setOutputCol(self, value):
+         """Sets output annotation column name.
+
+         Parameters
+         ----------
+         value : str
+             Output annotation column name
+         """
+         return self._set(outputCol=[value])
+
+     def getOutputCol(self):
+         """Gets output annotation column name.
+
+         Returns
+         -------
+         str
+             Output annotation column name
+         """
+         output_cols = self.getOrDefault(self.outputCol)
+         return output_cols[0] if output_cols else "ranked_documents"
+
+     def setTopK(self, value):
+         """Sets maximum number of top documents to return.
+
+         Parameters
+         ----------
+         value : int
+             Maximum number of top documents to return (-1 for no limit)
+         """
+         return self._set(topK=value)
+
+     def getTopK(self):
+         """Gets maximum number of top documents to return.
+
+         Returns
+         -------
+         int
+             Maximum number of top documents to return
+         """
+         return self.getOrDefault(self.topK)
+
+     def setMinRelevanceScore(self, value):
+         """Sets minimum relevance score threshold.
+
+         Parameters
+         ----------
+         value : float
+             Minimum relevance score threshold
+         """
+         return self._set(minRelevanceScore=value)
+
+     def getMinRelevanceScore(self):
+         """Gets minimum relevance score threshold.
+
+         Returns
+         -------
+         float
+             Minimum relevance score threshold
+         """
+         return self.getOrDefault(self.minRelevanceScore)
+
+     def setMinMaxScaling(self, value):
+         """Sets whether to apply min-max scaling.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to apply min-max scaling to normalize scores
+         """
+         return self._set(minMaxScaling=value)
+
+     def getMinMaxScaling(self):
+         """Gets whether to apply min-max scaling.
+
+         Returns
+         -------
+         bool
+             Whether min-max scaling is enabled
+         """
+         return self.getOrDefault(self.minMaxScaling)
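
The docstring above names five post-processing steps, while the logic itself lives in the Scala class com.johnsnowlabs.nlp.finisher.GGUFRankingFinisher. As a rough pure-Python sketch of how thresholding, sorting, top-k selection, and min-max scaling compose (an illustration of the documented behavior, not the library's implementation):

    # Illustrative only: mirrors the documented steps on bare floats.
    def rank_scores(scores, top_k=-1, min_score=float("-inf"), min_max=False):
        kept = [s for s in scores if s >= min_score]  # score thresholding
        kept.sort(reverse=True)                       # sort by relevance, descending
        if top_k >= 0:
            kept = kept[:top_k]                       # top-k selection
        if min_max and kept:
            lo, hi = min(kept), max(kept)
            span = (hi - lo) or 1.0                   # guard: all scores identical
            kept = [(s - lo) / span for s in kept]    # normalize to the 0-1 range
        return list(enumerate(kept, start=1))         # attach 1-based ranks

    print(rank_scores([0.2, 0.9, 0.5, 0.1], top_k=3, min_max=True))
    # [(1, 1.0), (2, 0.428...), (3, 0.0)], up to float rounding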
@@ -13,9 +13,237 @@
  # limitations under the License.
  """Contains classes for partition properties used in reading various document types."""
  from typing import Dict
+ from pyspark.ml.param import Param, Params, TypeConverters

- from pyspark.ml.param import TypeConverters, Params, Param

+ class HasReaderProperties(Params):
+
+     inputCol = Param(
+         Params._dummy(),
+         "inputCol",
+         "input column name",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setInputCol(self, value):
+         """Sets input column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the Input Column
+         """
+         return self._set(inputCol=value)
+
+     outputCol = Param(
+         Params._dummy(),
+         "outputCol",
+         "output column name",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setOutputCol(self, value):
+         """Sets output column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the Output Column
+         """
+         return self._set(outputCol=value)
+
+     contentPath = Param(
+         Params._dummy(),
+         "contentPath",
+         "Path to the content source.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentPath(self, value: str):
+         """Sets content path.
+
+         Parameters
+         ----------
+         value : str
+             Path to the content source.
+         """
+         return self._set(contentPath=value)
+
+     contentType = Param(
+         Params._dummy(),
+         "contentType",
+         "Set the content type to load following MIME specification.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setContentType(self, value: str):
+         """Sets content type following MIME specification.
+
+         Parameters
+         ----------
+         value : str
+             Content type string (MIME format).
+         """
+         return self._set(contentType=value)
+
+     storeContent = Param(
+         Params._dummy(),
+         "storeContent",
+         "Whether to include the raw file content in the output DataFrame "
+         "as a separate 'content' column, alongside the structured output.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setStoreContent(self, value: bool):
+         """Sets whether to store raw file content.
+
+         Parameters
+         ----------
+         value : bool
+             True to include raw file content, False otherwise.
+         """
+         return self._set(storeContent=value)
+
+     titleFontSize = Param(
+         Params._dummy(),
+         "titleFontSize",
+         "Minimum font size threshold used as part of heuristic rules to detect "
+         "title elements based on formatting (e.g., bold, centered, capitalized).",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setTitleFontSize(self, value: int):
+         """Sets minimum font size for detecting titles.
+
+         Parameters
+         ----------
+         value : int
+             Minimum font size threshold for title detection.
+         """
+         return self._set(titleFontSize=value)
+
+     inferTableStructure = Param(
+         Params._dummy(),
+         "inferTableStructure",
+         "Whether to generate an HTML table representation from structured table content. "
+         "When enabled, a full <table> element is added alongside cell-level elements, "
+         "based on row and column layout.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setInferTableStructure(self, value: bool):
+         """Sets whether to infer table structure.
+
+         Parameters
+         ----------
+         value : bool
+             True to generate HTML table representation, False otherwise.
+         """
+         return self._set(inferTableStructure=value)
+
+     includePageBreaks = Param(
+         Params._dummy(),
+         "includePageBreaks",
+         "Whether to detect and tag content with page break metadata. "
+         "In Word documents, this includes manual and section breaks. "
+         "In Excel files, this includes page breaks based on column boundaries.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setIncludePageBreaks(self, value: bool):
+         """Sets whether to include page break metadata.
+
+         Parameters
+         ----------
+         value : bool
+             True to detect and tag page breaks, False otherwise.
+         """
+         return self._set(includePageBreaks=value)
+
+     ignoreExceptions = Param(
+         Params._dummy(),
+         "ignoreExceptions",
+         "Whether to ignore exceptions during processing.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setIgnoreExceptions(self, value: bool):
+         """Sets whether to ignore exceptions during processing.
+
+         Parameters
+         ----------
+         value : bool
+             True to ignore exceptions, False otherwise.
+         """
+         return self._set(ignoreExceptions=value)
+
+     explodeDocs = Param(
+         Params._dummy(),
+         "explodeDocs",
+         "Whether to explode the documents into separate rows.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setExplodeDocs(self, value: bool):
+         """Sets whether to explode the documents into separate rows.
+
+         Parameters
+         ----------
+         value : bool
+             True to split documents into multiple rows, False to keep them in one row.
+         """
+         return self._set(explodeDocs=value)
+
+     flattenOutput = Param(
+         Params._dummy(),
+         "flattenOutput",
+         "If true, output is flattened to plain text with minimal metadata",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setFlattenOutput(self, value):
+         """Sets whether to flatten the output to plain text with minimal metadata.
+
+         Parameters
+         ----------
+         value : bool
+             If true, output is flattened to plain text with minimal metadata
+         """
+         return self._set(flattenOutput=value)
+
+     titleThreshold = Param(
+         Params._dummy(),
+         "titleThreshold",
+         "Minimum font size threshold for title detection in PDF docs",
+         typeConverter=TypeConverters.toFloat
+     )
+
+     def setTitleThreshold(self, value):
+         """Sets the minimum font size threshold for title detection in PDF documents.
+
+         Parameters
+         ----------
+         value : float
+             Minimum font size threshold for title detection in PDF docs
+         """
+         return self._set(titleThreshold=value)
+
+     outputAsDocument = Param(
+         Params._dummy(),
+         "outputAsDocument",
+         "Whether to return all sentences joined into a single document",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setOutputAsDocument(self, value):
+         """Sets whether to return all sentences joined into a single document.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to return all sentences joined into a single document
+         """
+         return self._set(outputAsDocument=value)

  class HasEmailReaderProperties(Params):

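HasReaderProperties follows the standard pyspark.ml mixin pattern: Param objects are declared at class level with Params._dummy() as a placeholder parent, and each setter delegates to self._set, which returns the instance so calls chain. A minimal sketch of the same pattern on a toy class (ToyReader is hypothetical, not a spark-nlp class):

    from pyspark.ml.param import Param, Params, TypeConverters

    class ToyReader(Params):
        # Same declaration pattern as HasReaderProperties above.
        contentPath = Param(Params._dummy(), "contentPath",
                            "Path to the content source.",
                            typeConverter=TypeConverters.toString)

        def setContentPath(self, value: str):
            return self._set(contentPath=value)  # _set returns self, so setters chain

    reader = ToyReader().setContentPath("/data/docs")
    print(reader.getOrDefault(reader.contentPath))  # /data/docs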
@@ -144,6 +372,28 @@ class HasHTMLReaderProperties(Params):
          self._call_java("setHeadersPython", headers)
          return self

+     outputFormat = Param(
+         Params._dummy(),
+         "outputFormat",
+         "Output format for the table content. Options are 'plain-text', 'html-table', or 'json-table'. Default is 'json-table'.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setOutputFormat(self, value: str):
+         """Sets output format for the table content.
+
+         Options
+         -------
+         - 'plain-text'
+         - 'html-table'
+         - 'json-table' (default)
+
+         Parameters
+         ----------
+         value : str
+             Output format for the table content.
+         """
+         return self._set(outputFormat=value)

  class HasPowerPointProperties(Params):

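The outputFormat options are 'plain-text', 'html-table', and 'json-table' (the default). The Python typeConverter only coerces the value to str; validation of the option itself, if any, happens on the JVM side. A hypothetical caller-side guard, not part of spark-nlp:

    # Hypothetical helper: reject values outside the three documented options.
    ALLOWED_TABLE_FORMATS = {"plain-text", "html-table", "json-table"}

    def checked_output_format(value: str) -> str:
        if value not in ALLOWED_TABLE_FORMATS:
            raise ValueError(f"outputFormat must be one of "
                             f"{sorted(ALLOWED_TABLE_FORMATS)}, got {value!r}")
        return value

    # e.g. reader.setOutputFormat(checked_output_format("html-table"))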
@@ -317,3 +567,196 @@ class HasChunkerProperties(Params):

      def setOverlapAll(self, value):
          return self._set(overlapAll=value)
+
+
+ class HasPdfProperties(Params):
+
+     pageNumCol = Param(
+         Params._dummy(),
+         "pageNumCol",
+         "Page number output column name.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setPageNumCol(self, value: str):
+         """Sets page number output column name.
+
+         Parameters
+         ----------
+         value : str
+             Name of the column for page numbers.
+         """
+         return self._set(pageNumCol=value)
+
+     originCol = Param(
+         Params._dummy(),
+         "originCol",
+         "Input column name with original path of file.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setOriginCol(self, value: str):
+         """Sets input column with original file path.
+
+         Parameters
+         ----------
+         value : str
+             Column name that stores the file path.
+         """
+         return self._set(originCol=value)
+
+     partitionNum = Param(
+         Params._dummy(),
+         "partitionNum",
+         "Number of partitions.",
+         typeConverter=TypeConverters.toInt
+     )
+
+     def setPartitionNum(self, value: int):
+         """Sets number of partitions.
+
+         Parameters
+         ----------
+         value : int
+             Number of partitions to use.
+         """
+         return self._set(partitionNum=value)
+
+     storeSplittedPdf = Param(
+         Params._dummy(),
+         "storeSplittedPdf",
+         "Whether to store the byte content of the split PDF.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setStoreSplittedPdf(self, value: bool):
+         """Sets whether to store byte content of split PDF pages.
+
+         Parameters
+         ----------
+         value : bool
+             True to store PDF page bytes, False otherwise.
+         """
+         return self._set(storeSplittedPdf=value)
+
+     splitPage = Param(
+         Params._dummy(),
+         "splitPage",
+         "Enable/disable splitting per page to identify page numbers and improve performance.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setSplitPage(self, value: bool):
+         """Sets whether to split PDF into pages.
+
+         Parameters
+         ----------
+         value : bool
+             True to split per page, False otherwise.
+         """
+         return self._set(splitPage=value)
+
+     onlyPageNum = Param(
+         Params._dummy(),
+         "onlyPageNum",
+         "Extract only page numbers.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setOnlyPageNum(self, value: bool):
+         """Sets whether to extract only page numbers.
+
+         Parameters
+         ----------
+         value : bool
+             True to extract only page numbers, False otherwise.
+         """
+         return self._set(onlyPageNum=value)
+
+     textStripper = Param(
+         Params._dummy(),
+         "textStripper",
+         "Text stripper type used for output layout and formatting.",
+         typeConverter=TypeConverters.toString
+     )
+
+     def setTextStripper(self, value: str):
+         """Sets text stripper type.
+
+         Parameters
+         ----------
+         value : str
+             Text stripper type for layout and formatting.
+         """
+         return self._set(textStripper=value)
+
+     sort = Param(
+         Params._dummy(),
+         "sort",
+         "Enable/disable sorting content on the page.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setSort(self, value: bool):
+         """Sets whether to sort content on the page.
+
+         Parameters
+         ----------
+         value : bool
+             True to sort content, False otherwise.
+         """
+         return self._set(sort=value)
+
+     extractCoordinates = Param(
+         Params._dummy(),
+         "extractCoordinates",
+         "Force extract coordinates of text.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setExtractCoordinates(self, value: bool):
+         """Sets whether to extract coordinates of text.
+
+         Parameters
+         ----------
+         value : bool
+             True to extract coordinates, False otherwise.
+         """
+         return self._set(extractCoordinates=value)
+
+     normalizeLigatures = Param(
+         Params._dummy(),
+         "normalizeLigatures",
+         "Whether to convert ligature chars such as 'fl' into their corresponding chars (e.g., {'f', 'l'}).",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setNormalizeLigatures(self, value: bool):
+         """Sets whether to normalize ligatures (e.g., fl → f + l).
+
+         Parameters
+         ----------
+         value : bool
+             True to normalize ligatures, False otherwise.
+         """
+         return self._set(normalizeLigatures=value)
+
+     readAsImage = Param(
+         Params._dummy(),
+         "readAsImage",
+         "Read PDF pages as images.",
+         typeConverter=TypeConverters.toBoolean
+     )
+
+     def setReadAsImage(self, value: bool):
+         """Sets whether to read PDF pages as images.
+
+         Parameters
+         ----------
+         value : bool
+             True to read as images, False otherwise.
+         """
+         return self._set(readAsImage=value)
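
Each HasPdfProperties setter returns self._set(...), which returns the instance, so configuration chains fluently. A minimal sketch, assuming HasPdfProperties is importable from sparknlp.partition.partition_properties (the module this diff patches); ToyPdfConfig is a hypothetical stand-in for a reader that mixes the trait in:

    # Assumed import path; ToyPdfConfig is hypothetical.
    from sparknlp.partition.partition_properties import HasPdfProperties

    class ToyPdfConfig(HasPdfProperties):
        """Hypothetical holder; real readers mix HasPdfProperties in the same way."""

    cfg = (
        ToyPdfConfig()
        .setSplitPage(True)           # one entry per page, with page numbers
        .setStoreSplittedPdf(False)   # skip keeping raw page bytes
        .setNormalizeLigatures(True)  # e.g. the 'fl' ligature becomes 'f' + 'l'
    )
    print(cfg.getOrDefault(cfg.splitPage))  # True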