spark-nlp 6.0.5__py2.py3-none-any.whl → 6.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of spark-nlp has been flagged as a potentially problematic release.

spark_nlp-6.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: spark-nlp
- Version: 6.0.5
+ Version: 6.1.0
  Summary: John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines that scale easily in a distributed environment.
  Home-page: https://github.com/JohnSnowLabs/spark-nlp
  Author: John Snow Labs
@@ -102,7 +102,7 @@ $ java -version
  $ conda create -n sparknlp python=3.7 -y
  $ conda activate sparknlp
  # spark-nlp by default is based on pyspark 3.x
- $ pip install spark-nlp==6.0.5 pyspark==3.3.1
+ $ pip install spark-nlp==6.1.0 pyspark==3.3.1
  ```

  In Python console or Jupyter `Python3` kernel:
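For orientation, a minimal sketch of the quick-start this README section leads into, assuming the versions installed above (`sparknlp.start()` and `PretrainedPipeline` are part of the library's public API; the pipeline name is one of its published pretrained pipelines):

```python
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

# Start a Spark session preconfigured for Spark NLP
spark = sparknlp.start()

# Load a small pretrained pipeline and annotate a sample sentence
pipeline = PretrainedPipeline("explain_document_dl", lang="en")
annotations = pipeline.annotate("Spark NLP ships a new Phi4Transformer in 6.1.0.")
print(annotations["token"])
```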
@@ -168,7 +168,7 @@ For a quick example of using pipelines and models take a look at our official [d

  ### Apache Spark Support

- Spark NLP *6.0.5* has been built on top of Apache Spark 3.4 while fully supporting Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
+ Spark NLP *6.1.0* has been built on top of Apache Spark 3.4 while fully supporting Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x

  | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
  |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
@@ -198,7 +198,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http

  ### Databricks Support

- Spark NLP 6.0.5 has been tested and is compatible with the following runtimes:
+ Spark NLP 6.1.0 has been tested and is compatible with the following runtimes:

  | **CPU** | **GPU** |
  |--------------------|--------------------|
@@ -215,7 +215,7 @@ We are compatible with older runtimes. For a full list check databricks support

  ### EMR Support

- Spark NLP 6.0.5 has been tested and is compatible with the following EMR releases:
+ Spark NLP 6.1.0 has been tested and is compatible with the following EMR releases:

  | **EMR Release** |
  |--------------------|
spark_nlp-6.1.0.dist-info/RECORD CHANGED
@@ -3,7 +3,7 @@ com/johnsnowlabs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
  com/johnsnowlabs/ml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  com/johnsnowlabs/ml/ai/__init__.py,sha256=YQiK2M7U4d8y5irPy_HB8ae0mSpqS9583MH44pnKJXc,295
  com/johnsnowlabs/nlp/__init__.py,sha256=DPIVXtONO5xXyOk-HB0-sNiHAcco17NN13zPS_6Uw8c,294
- sparknlp/__init__.py,sha256=peVwWLyO7M5yWlQdRGzOBql6cvaWcdQyy5pNhGR8-sg,13814
+ sparknlp/__init__.py,sha256=wxPbTrab8A3tELe8XRaGCfuZ-T8Dc8szbOHXH9ZgLIU,13814
  sparknlp/annotation.py,sha256=I5zOxG5vV2RfPZfqN9enT1i4mo6oBcn3Lrzs37QiOiA,5635
  sparknlp/annotation_audio.py,sha256=iRV_InSVhgvAwSRe9NTbUH9v6OGvTM-FPCpSAKVu0mE,1917
  sparknlp/annotation_image.py,sha256=xhCe8Ko-77XqWVuuYHFrjKqF6zPd8Z-RY_rmZXNwCXU,2547
@@ -167,8 +167,8 @@ sparknlp/annotator/sentence/sentence_detector_dl.py,sha256=-Osj9Bm9KyZRTAWkOsK9c
  sparknlp/annotator/sentiment/__init__.py,sha256=Lq3vKaZS1YATLMg0VNXSVtkWL5q5G9taGBvdrvSwnfg,766
  sparknlp/annotator/sentiment/sentiment_detector.py,sha256=m545NGU0Xzg_PO6_qIfpli1uZj7JQcyFgqe9R6wAPFI,8154
  sparknlp/annotator/sentiment/vivekn_sentiment.py,sha256=4rpXWDgzU6ddnbrSCp9VdLb2epCc9oZ3c6XcqxEw8nk,9655
- sparknlp/annotator/seq2seq/__init__.py,sha256=4h6taLL4pgzs_pR2uFx9AcLrCqtu8bx3hXNeK1_u_EE,1723
- sparknlp/annotator/seq2seq/auto_gguf_model.py,sha256=PYHfljxtSSTL0lBm1-6ZBEHxkBLZvID63HSuxWv7XS4,11554
+ sparknlp/annotator/seq2seq/__init__.py,sha256=Aj43G1MuQE0mW7LakCWPjiTkIGl7iHPAnKIwT_DfdIM,1781
+ sparknlp/annotator/seq2seq/auto_gguf_model.py,sha256=Oah_RvOy9YrvfnnMRMKOGJHnAMYxo0SeczBZsndM3kY,11638
  sparknlp/annotator/seq2seq/auto_gguf_vision_model.py,sha256=EYrm8EW7AMq3AoIKPe7Gp6ayBlFpWeg76AsAr4nanqU,15346
  sparknlp/annotator/seq2seq/bart_transformer.py,sha256=I1flM4yeCzEAKOdQllBC30XuedxVJ7ferkFhZ6gwEbE,18481
  sparknlp/annotator/seq2seq/cohere_transformer.py,sha256=43LZBVazZMgJRCsN7HaYjVYfJ5hRMV95QZyxMtXq-m4,13496
@@ -183,6 +183,7 @@ sparknlp/annotator/seq2seq/nllb_transformer.py,sha256=1ys01yaC0nVzXETy8oD2wZHyom
  sparknlp/annotator/seq2seq/olmo_transformer.py,sha256=B_zhYkAfYycw5uBq1tVNPmaKuYtpJOxRC6PArit7XiE,13634
  sparknlp/annotator/seq2seq/phi2_transformer.py,sha256=WwKCUOH8qGFv62YF63HjuT7bMVldh06gHvaZH3tbSDk,13787
  sparknlp/annotator/seq2seq/phi3_transformer.py,sha256=arIcw5NDMv3ubBwWz3KYRdLMsspTiEI8vk4s00lyq1c,14293
+ sparknlp/annotator/seq2seq/phi4_transformer.py,sha256=iVbsqIzKS2MG-LmA3tljjsjeCUzBqATw1btDBOnFviM,14324
  sparknlp/annotator/seq2seq/qwen_transformer.py,sha256=IYxveoHGWWoiwzJ_VMLMgUBe6jr1JSHKSY0PApnTCOI,14640
  sparknlp/annotator/seq2seq/starcoder_transformer.py,sha256=BTXbSMRpXnDvrfh-6iFS5k6g6EcPV9zBl4U-SSC19wA,14293
  sparknlp/annotator/seq2seq/t5_transformer.py,sha256=wDVxNLluIU1HGZFqaKKc4YTt4l-elPlAtQ7EEa0f5tg,17308
@@ -222,12 +223,12 @@ sparknlp/common/annotator_properties.py,sha256=7B1os7pBUfHo6b7IPQAXQ-nir0u3tQLzD
  sparknlp/common/annotator_type.py,sha256=ash2Ip1IOOiJamPVyy_XQj8Ja_DRHm0b9Vj4Ni75oKM,1225
  sparknlp/common/coverage_result.py,sha256=No4PSh1HSs3PyRI1zC47x65tWgfirqPI290icHQoXEI,823
  sparknlp/common/match_strategy.py,sha256=kt1MUPqU1wCwk5qCdYk6jubHbU-5yfAYxb9jjAOrdnY,1678
- sparknlp/common/properties.py,sha256=v8PUB0YqeiZRzP8mX3kXSFoQVMZOg_ips0Y5M54hUIc,51493
+ sparknlp/common/properties.py,sha256=4jDyxr2IGWEuNlGtOoPzqdCF7oLAKGy1z6MtqxUVMug,52704
  sparknlp/common/read_as.py,sha256=imxPGwV7jr4Li_acbo0OAHHRGCBbYv-akzEGaBWEfcY,1226
  sparknlp/common/recursive_annotator_approach.py,sha256=vqugBw22cE3Ff7PIpRlnYFuOlchgL0nM26D8j-NdpqU,1449
  sparknlp/common/storage.py,sha256=D91H3p8EIjNspjqAYu6ephRpCUtdcAir4_PrAbkIQWE,4842
  sparknlp/common/utils.py,sha256=Yne6yYcwKxhOZC-U4qfYoDhWUP_6BIaAjI5X_P_df1E,1306
- sparknlp/internal/__init__.py,sha256=wvC7ovDfII5GiYSwNpA1HHttnlXjbFgpYAGV68NsiQo,40446
+ sparknlp/internal/__init__.py,sha256=f5kxLz6bEX8FCMo6lek30MXbn_lDC-dBTI9s_fARblI,40748
  sparknlp/internal/annotator_java_ml.py,sha256=UGPoThG0rGXUOXGSQnDzEDW81Mu1s5RPF29v7DFyE3c,1187
  sparknlp/internal/annotator_transformer.py,sha256=fXmc2IWXGybqZpbEU9obmbdBYPc798y42zvSB4tqV9U,1448
  sparknlp/internal/extended_java_wrapper.py,sha256=hwP0133-hDiDf5sBF-P3MtUsuuDj1PpQbtGZQIRwzfk,2240
@@ -246,7 +247,8 @@ sparknlp/pretrained/utils.py,sha256=T1MrvW_DaWk_jcOjVLOea0NMFE9w8fe0ZT_5urZ_nEY,
  sparknlp/reader/__init__.py,sha256=-Toj3AIBki-zXPpV8ezFTI2LX1yP_rK2bhpoa8nBkTw,685
  sparknlp/reader/enums.py,sha256=MNGug9oJ1BBLM1Pbske13kAabalDzHa2kucF5xzFpHs,770
  sparknlp/reader/pdf_to_text.py,sha256=eWw-cwjosmcSZ9eHso0F5QQoeGBBnwsOhzhCXXvMjZA,7169
- sparknlp/reader/sparknlp_reader.py,sha256=IG0_wYKT1cIIU3EibzOVBZ-GhvX50mC5meXYv0WsYKs,18524
+ sparknlp/reader/reader2doc.py,sha256=xahxkEuNM21mb0-MHQoYLtDF1cbAYrMTRpN1-u5K3ec,6587
+ sparknlp/reader/sparknlp_reader.py,sha256=MJs8v_ECYaV1SOabI1L_2MkVYEDVImtwgbYypO7DJSY,20623
  sparknlp/training/__init__.py,sha256=qREi9u-5Vc2VjpL6-XZsyvu5jSEIdIhowW7_kKaqMqo,852
  sparknlp/training/conll.py,sha256=wKBiSTrjc6mjsl7Nyt6B8f4yXsDJkZb-sn8iOjix9cE,6961
  sparknlp/training/conllu.py,sha256=8r3i-tmyrLsyk1DtZ9uo2mMDCWb1yw2Y5W6UsV13MkY,4953
@@ -277,7 +279,7 @@ sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py,sha256=R4yHFN3
  sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py,sha256=EoCSdcIjqQ3wv13MAuuWrKV8wyVBP0SbOEW41omHlR0,23189
  sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py,sha256=k5CQ7gKV6HZbZMB8cKLUJuZxoZWlP_DFWdZ--aIDwsc,2356
  sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py,sha256=pAxjWhjazSX8Vg0MFqJiuRVw1IbnQNSs-8Xp26L4nko,870
- spark_nlp-6.0.5.dist-info/METADATA,sha256=BL1PeMYps-L3LAkmNpwxMkrGUw_KwO164VZ5AoqDZLg,19722
- spark_nlp-6.0.5.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
- spark_nlp-6.0.5.dist-info/top_level.txt,sha256=uuytur4pyMRw2H_txNY2ZkaucZHUs22QF8-R03ch_-E,13
- spark_nlp-6.0.5.dist-info/RECORD,,
+ spark_nlp-6.1.0.dist-info/METADATA,sha256=MDLwobOveRxQL45CWF-NY26iHa3a7PijF9wntBXpeZE,19722
+ spark_nlp-6.1.0.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+ spark_nlp-6.1.0.dist-info/top_level.txt,sha256=uuytur4pyMRw2H_txNY2ZkaucZHUs22QF8-R03ch_-E,13
+ spark_nlp-6.1.0.dist-info/RECORD,,
sparknlp/__init__.py CHANGED
@@ -66,7 +66,7 @@ sys.modules['com.johnsnowlabs.ml.ai'] = annotator
  annotators = annotator
  embeddings = annotator

- __version__ = "6.0.5"
+ __version__ = "6.1.0"


  def start(gpu=False,
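A quick way to confirm the bump after upgrading (a sketch; `sparknlp.version()` simply returns the `__version__` string changed above):

```python
import sparknlp

spark = sparknlp.start()
print(sparknlp.version())  # expected to print 6.1.0 for this release
```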
sparknlp/annotator/seq2seq/__init__.py CHANGED
@@ -31,3 +31,4 @@ from sparknlp.annotator.seq2seq.starcoder_transformer import *
  from sparknlp.annotator.seq2seq.llama3_transformer import *
  from sparknlp.annotator.seq2seq.cohere_transformer import *
  from sparknlp.annotator.seq2seq.olmo_transformer import *
+ from sparknlp.annotator.seq2seq.phi4_transformer import *
sparknlp/annotator/seq2seq/auto_gguf_model.py CHANGED
@@ -253,7 +253,9 @@ class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate, HasLlamaCppProperties):
              nCtx=4096,
              nBatch=512,
              embedding=False,
-             nPredict=100
+             nPredict=100,
+             nGpuLayers=99,
+             systemPrompt="You are a helpful assistant."
          )

      @staticmethod
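The two new defaults mirror what a typical GPU setup would set by hand. A hedged usage sketch, assuming a running `spark` session from `sparknlp.start()` and whatever model `AutoGGUFModel.pretrained()` resolves to by default (`setNGpuLayers` and `setSystemPrompt` come from `HasLlamaCppProperties`):

```python
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import AutoGGUFModel
from pyspark.ml import Pipeline

document = DocumentAssembler().setInputCol("text").setOutputCol("document")

# With the 6.1.0 defaults, nGpuLayers=99 (offload essentially all layers to
# the GPU) and a generic system prompt are already set; shown explicitly here.
gguf = (
    AutoGGUFModel.pretrained()
    .setInputCols(["document"])
    .setOutputCol("completions")
    .setNGpuLayers(99)
    .setSystemPrompt("You are a helpful assistant.")
)

data = spark.createDataFrame([["What is Spark NLP?"]]).toDF("text")
Pipeline(stages=[document, gguf]).fit(data).transform(data) \
    .select("completions.result").show(truncate=False)
```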
sparknlp/annotator/seq2seq/phi4_transformer.py ADDED
@@ -0,0 +1,387 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the Phi4Transformer."""
+ 
+ from sparknlp.common import *
+ 
+ class Phi4Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
+     """Phi-4: State-of-the-art open model by Microsoft Research
+ 
+     phi-4 is a 14B parameter, dense decoder-only Transformer model trained on 9.8T tokens, designed for advanced reasoning, code, and general NLP tasks.
+     For more details, see: https://huggingface.co/microsoft/phi-4
+ 
+     Model Overview
+     --------------
+     - 14B parameters, dense decoder-only Transformer
+     - 16K context length
+     - Trained on 9.8T tokens (synthetic, public domain, academic, Q&A, code)
+     - Focus on high-quality, advanced reasoning, math, code, and general NLP
+     - Multilingual data: ~8% (primarily English)
+     - Released under MIT License
+ 
+     Intended Use
+     ------------
+     - General-purpose AI, research, and generative features
+     - Memory/compute constrained and latency-bound environments
+     - Reasoning, logic, and code generation
+ 
+     Benchmarks
+     ----------
+     - MMLU: 84.8 | HumanEval: 82.6 | GPQA: 56.1 | DROP: 75.5 | MATH: 80.6
+     - Outperforms or matches other 14B/70B models on many tasks
+ 
+     Safety & Limitations
+     --------------------
+     - Safety alignment via SFT and DPO, red-teamed by Microsoft AIRT
+     - Not intended for high-risk or consequential domains without further assessment
+     - Primarily English; other languages may have reduced performance
+     - May generate inaccurate, offensive, or biased content; use with care
+ 
+     Usage
+     -----
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
+ 
+     >>> phi4 = Phi4Transformer.pretrained() \
+     ...     .setInputCols(["document"]) \
+     ...     .setOutputCol("generation")
+ 
+     The default model is ``"phi-4"``, if no name is provided. For available pretrained models please see the `Models Hub <https://huggingface.co/microsoft/phi-4>`__.
+ 
+     Note
+     ----
+     This is a resource-intensive module, especially with larger models and sequences. Use of accelerators such as GPUs is strongly recommended.
+ 
+     References
+     ----------
+     - https://huggingface.co/microsoft/phi-4
+     - arXiv:2412.08905
+ 
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+ 
+     Parameters
+     ----------
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     minOutputLength
+         Minimum length of the sequence to be generated, by default 0
+     maxOutputLength
+         Maximum length of output text, by default 60
+     doSample
+         Whether or not to use sampling; use greedy decoding otherwise, by default False
+     temperature
+         The value used to modulate the next token probabilities, by default 1.0
+     topK
+         The number of highest probability vocabulary tokens to keep for
+         top-k-filtering, by default 40
+     topP
+         Top cumulative probability for vocabulary tokens, by default 1.0
+ 
+         If set to float < 1, only the most probable tokens with probabilities
+         that add up to ``topP`` or higher are kept for generation.
+     repetitionPenalty
+         The parameter for repetition penalty, 1.0 means no penalty, by default
+         1.0
+     noRepeatNgramSize
+         If set to int > 0, all ngrams of that size can only occur once, by
+         default 0
+     ignoreTokenIds
+         A list of token ids which are ignored in the decoder's output, by
+         default []
+ 
+     Notes
+     -----
+     This is a very computationally expensive module, especially on larger
+     sequences. The use of an accelerator such as GPU is recommended.
+ 
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \
+     ...     .setInputCol("text") \
+     ...     .setOutputCol("documents")
+     >>> phi4 = Phi4Transformer.pretrained("phi-4") \
+     ...     .setInputCols(["documents"]) \
+     ...     .setMaxOutputLength(60) \
+     ...     .setOutputCol("generation")
+     >>> pipeline = Pipeline().setStages([documentAssembler, phi4])
+     >>> data = spark.createDataFrame([
+     ...     (
+     ...         1,
+     ...         "<|start_header_id|>system<|end_header_id|> \\n" + \
+     ...         "You are a helpful assistant! \\n" + \
+     ...         "<|start_header_id|>user<|end_header_id|> \\n" + \
+     ...         "What is Phi-4? \\n" + \
+     ...         "<|start_header_id|>assistant<|end_header_id|> \\n"
+     ...     )
+     ... ]).toDF("id", "text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("generation.result").show(truncate=False)
+     +------------------------------------------------------------------------------------------------------------------------------------------------------+
+     |result                                                                                                                                                |
+     +------------------------------------------------------------------------------------------------------------------------------------------------------+
+     |[Phi-4 is a 14B parameter, dense decoder-only Transformer model developed by Microsoft Research for advanced reasoning, code, and general NLP tasks.]|
+     +------------------------------------------------------------------------------------------------------------------------------------------------------+
+     """
+ 
+     name = "Phi4Transformer"
+ 
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+ 
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+ 
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+ 
+     minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+                             typeConverter=TypeConverters.toInt)
+ 
+     maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+                             typeConverter=TypeConverters.toInt)
+ 
+     doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+                      typeConverter=TypeConverters.toBoolean)
+ 
+     temperature = Param(Params._dummy(), "temperature", "The value used to modulate the next token probabilities",
+                         typeConverter=TypeConverters.toFloat)
+ 
+     topK = Param(Params._dummy(), "topK",
+                  "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+                  typeConverter=TypeConverters.toInt)
+ 
+     topP = Param(Params._dummy(), "topP",
+                  "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+                  typeConverter=TypeConverters.toFloat)
+ 
+     repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+                               "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+                               typeConverter=TypeConverters.toFloat)
+ 
+     noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+                               "If set to int > 0, all ngrams of that size can only occur once",
+                               typeConverter=TypeConverters.toInt)
+ 
+     ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+                            "A list of token ids which are ignored in the decoder's output",
+                            typeConverter=TypeConverters.toListInt)
+ 
+     beamSize = Param(Params._dummy(), "beamSize",
+                      "The number of beams to use for beam search",
+                      typeConverter=TypeConverters.toInt)
+ 
+     stopTokenIds = Param(Params._dummy(), "stopTokenIds",
+                          "A list of token ids which are considered as stop tokens in the decoder's output",
+                          typeConverter=TypeConverters.toListInt)
+ 
+     def setIgnoreTokenIds(self, value):
+         """A list of token ids which are ignored in the decoder's output.
+ 
+         Parameters
+         ----------
+         value : List[int]
+             The words to be filtered out
+         """
+         return self._set(ignoreTokenIds=value)
+ 
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+ 
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+ 
+     def setMinOutputLength(self, value):
+         """Sets minimum length of the sequence to be generated.
+ 
+         Parameters
+         ----------
+         value : int
+             Minimum length of the sequence to be generated
+         """
+         return self._set(minOutputLength=value)
+ 
+     def setMaxOutputLength(self, value):
+         """Sets maximum length of output text.
+ 
+         Parameters
+         ----------
+         value : int
+             Maximum length of output text
+         """
+         return self._set(maxOutputLength=value)
+ 
+     def setDoSample(self, value):
+         """Sets whether or not to use sampling, use greedy decoding otherwise.
+ 
+         Parameters
+         ----------
+         value : bool
+             Whether or not to use sampling; use greedy decoding otherwise
+         """
+         return self._set(doSample=value)
+ 
+     def setTemperature(self, value):
+         """Sets the value used to modulate the next token probabilities.
+ 
+         Parameters
+         ----------
+         value : float
+             The value used to modulate the next token probabilities
+         """
+         return self._set(temperature=value)
+ 
+     def setTopK(self, value):
+         """Sets the number of highest probability vocabulary tokens to keep for
+         top-k-filtering.
+ 
+         Parameters
+         ----------
+         value : int
+             Number of highest probability vocabulary tokens to keep
+         """
+         return self._set(topK=value)
+ 
+     def setTopP(self, value):
+         """Sets the top cumulative probability for vocabulary tokens.
+ 
+         If set to float < 1, only the most probable tokens with probabilities
+         that add up to ``topP`` or higher are kept for generation.
+ 
+         Parameters
+         ----------
+         value : float
+             Cumulative probability for vocabulary tokens
+         """
+         return self._set(topP=value)
+ 
+     def setRepetitionPenalty(self, value):
+         """Sets the parameter for repetition penalty. 1.0 means no penalty.
+ 
+         Parameters
+         ----------
+         value : float
+             The repetition penalty
+ 
+         References
+         ----------
+         See `Ctrl: A Conditional Transformer Language Model For Controllable
+         Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+         """
+         return self._set(repetitionPenalty=value)
+ 
+     def setNoRepeatNgramSize(self, value):
+         """Sets size of n-grams that can only occur once.
+ 
+         If set to int > 0, all ngrams of that size can only occur once.
+ 
+         Parameters
+         ----------
+         value : int
+             N-gram size can only occur once
+         """
+         return self._set(noRepeatNgramSize=value)
+ 
+     def setBeamSize(self, value):
+         """Sets the number of beams to use for beam search.
+ 
+         Parameters
+         ----------
+         value : int
+             The number of beams to use for beam search
+         """
+         return self._set(beamSize=value)
+ 
+     def setStopTokenIds(self, value):
+         """Sets a list of token ids which are considered as stop tokens in the decoder's output.
+ 
+         Parameters
+         ----------
+         value : List[int]
+             The words to be considered as stop tokens
+         """
+         return self._set(stopTokenIds=value)
+ 
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.Phi4Transformer", java_model=None):
+         super(Phi4Transformer, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             minOutputLength=0,
+             maxOutputLength=20,
+             doSample=False,
+             temperature=0.6,
+             topK=-1,
+             topP=0.9,
+             repetitionPenalty=1.0,
+             noRepeatNgramSize=3,
+             ignoreTokenIds=[],
+             batchSize=1,
+             beamSize=1,
+             stopTokenIds=[128001]
+         )
+ 
+     @staticmethod
+     def loadSavedModel(folder, spark_session, use_openvino=False):
+         """Loads a locally saved model.
+ 
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+ 
+         Returns
+         -------
+         Phi4Transformer
+             The restored model
+         """
+         from sparknlp.internal import _Phi4Loader
+         jModel = _Phi4Loader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+         return Phi4Transformer(java_model=jModel)
+ 
+     @staticmethod
+     def pretrained(name="phi-4", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+ 
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "phi-4"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+ 
+         Returns
+         -------
+         Phi4Transformer
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(Phi4Transformer, name, lang, remote_loc)
sparknlp/common/properties.py CHANGED
@@ -765,14 +765,14 @@ class HasLlamaCppProperties:
      # -------- MODEL PARAMETERS --------
      nThreads = Param(Params._dummy(), "nThreads", "Set the number of threads to use during generation",
                       typeConverter=TypeConverters.toInt)
-     nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation",
-                           typeConverter=TypeConverters.toInt)
+     # nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation",
+     #                       typeConverter=TypeConverters.toInt)
      nThreadsBatch = Param(Params._dummy(), "nThreadsBatch",
                            "Set the number of threads to use during batch and prompt processing",
                            typeConverter=TypeConverters.toInt)
-     nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft",
-                                "Set the number of threads to use during batch and prompt processing",
-                                typeConverter=TypeConverters.toInt)
+     # nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft",
+     #                            "Set the number of threads to use during batch and prompt processing",
+     #                            typeConverter=TypeConverters.toInt)
      nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context", typeConverter=TypeConverters.toInt)
      nBatch = Param(Params._dummy(), "nBatch",
                     "Set the logical batch size for prompt processing (must be >=32 to use BLAS)",
@@ -782,12 +782,12 @@ class HasLlamaCppProperties:
                     typeConverter=TypeConverters.toInt)
      nDraft = Param(Params._dummy(), "nDraft", "Set the number of tokens to draft for speculative decoding",
                     typeConverter=TypeConverters.toInt)
-     nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process",
-                     typeConverter=TypeConverters.toInt)
-     nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode",
-                        typeConverter=TypeConverters.toInt)
-     pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability",
-                    typeConverter=TypeConverters.toFloat)
+     # nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process",
+     #                 typeConverter=TypeConverters.toInt)
+     # nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode",
+     #                    typeConverter=TypeConverters.toInt)
+     # pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability",
+     #                typeConverter=TypeConverters.toFloat)
      nGpuLayers = Param(Params._dummy(), "nGpuLayers", "Set the number of layers to store in VRAM (-1 - use default)",
                         typeConverter=TypeConverters.toInt)
      nGpuLayersDraft = Param(Params._dummy(), "nGpuLayersDraft",
@@ -802,10 +802,10 @@ class HasLlamaCppProperties:
                               typeConverter=TypeConverters.toString)
      mainGpu = Param(Params._dummy(), "mainGpu", "Set the main GPU that is used for scratch and small tensors.",
                      typeConverter=TypeConverters.toInt)
-     tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs",
-                         typeConverter=TypeConverters.toListFloat)
-     grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt)
-     grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt)
+     # tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs",
+     #                     typeConverter=TypeConverters.toListFloat)
+     # grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt)
+     # grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt)
      ropeFreqBase = Param(Params._dummy(), "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling",
                           typeConverter=TypeConverters.toFloat)
      ropeFreqScale = Param(Params._dummy(), "ropeFreqScale",
@@ -837,7 +837,7 @@ class HasLlamaCppProperties:
                            typeConverter=TypeConverters.toString)
      # Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
      #
-     # - UNSPECIFIED: Don't use any scaling
+     # - NONE: Don't use any scaling
      # - LINEAR: Linear scaling
      # - YARN: YaRN RoPE scaling
      ropeScalingType = Param(Params._dummy(), "ropeScalingType",
@@ -848,26 +848,28 @@ class HasLlamaCppProperties:
      # - 0 NONE: Don't use any pooling
      # - 1 MEAN: Mean Pooling
      # - 2 CLS: CLS Pooling
+     # - 3 LAST: Last token pooling
+     # - 4 RANK: For reranked models
      poolingType = Param(Params._dummy(), "poolingType",
                          "Set the pooling type for embeddings, use model default if unspecified",
                          typeConverter=TypeConverters.toString)
      modelDraft = Param(Params._dummy(), "modelDraft", "Set the draft model for speculative decoding",
                         typeConverter=TypeConverters.toString)
      modelAlias = Param(Params._dummy(), "modelAlias", "Set a model alias", typeConverter=TypeConverters.toString)
-     lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath",
-                                       "Set path to static lookup cache to use for lookup decoding (not updated by generation)",
-                                       typeConverter=TypeConverters.toString)
-     lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath",
-                                        "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)",
-                                        typeConverter=TypeConverters.toString)
+     # lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath",
+     #                                   "Set path to static lookup cache to use for lookup decoding (not updated by generation)",
+     #                                   typeConverter=TypeConverters.toString)
+     # lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath",
+     #                                    "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)",
+     #                                    typeConverter=TypeConverters.toString)
      # loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters")
      embedding = Param(Params._dummy(), "embedding", "Whether to load model with embedding support",
                        typeConverter=TypeConverters.toBoolean)
      flashAttention = Param(Params._dummy(), "flashAttention", "Whether to enable Flash Attention",
                             typeConverter=TypeConverters.toBoolean)
-     inputPrefixBos = Param(Params._dummy(), "inputPrefixBos",
-                            "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string",
-                            typeConverter=TypeConverters.toBoolean)
+     # inputPrefixBos = Param(Params._dummy(), "inputPrefixBos",
+     #                        "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string",
+     #                        typeConverter=TypeConverters.toBoolean)
      useMmap = Param(Params._dummy(), "useMmap",
                      "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)",
                      typeConverter=TypeConverters.toBoolean)
@@ -948,17 +950,17 @@ class HasLlamaCppProperties:
          """Set the number of threads to use during generation"""
          return self._set(nThreads=nThreads)

-     def setNThreadsDraft(self, nThreadsDraft: int):
-         """Set the number of threads to use during draft generation"""
-         return self._set(nThreadsDraft=nThreadsDraft)
+     # def setNThreadsDraft(self, nThreadsDraft: int):
+     #     """Set the number of threads to use during draft generation"""
+     #     return self._set(nThreadsDraft=nThreadsDraft)

      def setNThreadsBatch(self, nThreadsBatch: int):
          """Set the number of threads to use during batch and prompt processing"""
          return self._set(nThreadsBatch=nThreadsBatch)

-     def setNThreadsBatchDraft(self, nThreadsBatchDraft: int):
-         """Set the number of threads to use during batch and prompt processing"""
-         return self._set(nThreadsBatchDraft=nThreadsBatchDraft)
+     # def setNThreadsBatchDraft(self, nThreadsBatchDraft: int):
+     #     """Set the number of threads to use during batch and prompt processing"""
+     #     return self._set(nThreadsBatchDraft=nThreadsBatchDraft)

      def setNCtx(self, nCtx: int):
          """Set the size of the prompt context"""
@@ -976,17 +978,17 @@ class HasLlamaCppProperties:
          """Set the number of tokens to draft for speculative decoding"""
          return self._set(nDraft=nDraft)

-     def setNChunks(self, nChunks: int):
-         """Set the maximal number of chunks to process"""
-         return self._set(nChunks=nChunks)
+     # def setNChunks(self, nChunks: int):
+     #     """Set the maximal number of chunks to process"""
+     #     return self._set(nChunks=nChunks)

-     def setNSequences(self, nSequences: int):
-         """Set the number of sequences to decode"""
-         return self._set(nSequences=nSequences)
+     # def setNSequences(self, nSequences: int):
+     #     """Set the number of sequences to decode"""
+     #     return self._set(nSequences=nSequences)

-     def setPSplit(self, pSplit: float):
-         """Set the speculative decoding split probability"""
-         return self._set(pSplit=pSplit)
+     # def setPSplit(self, pSplit: float):
+     #     """Set the speculative decoding split probability"""
+     #     return self._set(pSplit=pSplit)

      def setNGpuLayers(self, nGpuLayers: int):
          """Set the number of layers to store in VRAM (-1 - use default)"""
@@ -1004,17 +1006,17 @@ class HasLlamaCppProperties:
          """Set the main GPU that is used for scratch and small tensors."""
          return self._set(mainGpu=mainGpu)

-     def setTensorSplit(self, tensorSplit: List[float]):
-         """Set how split tensors should be distributed across GPUs"""
-         return self._set(tensorSplit=tensorSplit)
+     # def setTensorSplit(self, tensorSplit: List[float]):
+     #     """Set how split tensors should be distributed across GPUs"""
+     #     return self._set(tensorSplit=tensorSplit)

-     def setGrpAttnN(self, grpAttnN: int):
-         """Set the group-attention factor"""
-         return self._set(grpAttnN=grpAttnN)
+     # def setGrpAttnN(self, grpAttnN: int):
+     #     """Set the group-attention factor"""
+     #     return self._set(grpAttnN=grpAttnN)

-     def setGrpAttnW(self, grpAttnW: int):
-         """Set the group-attention width"""
-         return self._set(grpAttnW=grpAttnW)
+     # def setGrpAttnW(self, grpAttnW: int):
+     #     """Set the group-attention width"""
+     #     return self._set(grpAttnW=grpAttnW)

      def setRopeFreqBase(self, ropeFreqBase: float):
          """Set the RoPE base frequency, used by NTK-aware scaling"""
@@ -1049,7 +1051,16 @@ class HasLlamaCppProperties:
          return self._set(defragmentationThreshold=defragmentationThreshold)

      def setNumaStrategy(self, numaStrategy: str):
-         """Set optimization strategies that help on some NUMA systems (if available)"""
+         """Set optimization strategies that help on some NUMA systems (if available)
+ 
+         Possible values:
+ 
+         - DISABLED: No NUMA optimizations
+         - DISTRIBUTE: spread execution evenly over all
+         - ISOLATE: only spawn threads on CPUs on the node that execution started on
+         - NUMA_CTL: use the CPU map provided by numactl
+         - MIRROR: Mirrors the model across NUMA nodes
+         """
          numaUpper = numaStrategy.upper()
          numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"]
          if numaUpper not in numaStrategies:
@@ -1060,13 +1071,36 @@ class HasLlamaCppProperties:
          return self._set(numaStrategy=numaStrategy)

      def setRopeScalingType(self, ropeScalingType: str):
-         """Set the RoPE frequency scaling method, defaults to linear unless specified by the model"""
-         return self._set(ropeScalingType=ropeScalingType)
+         """Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
+ 
+         Possible values:
+ 
+         - NONE: Don't use any scaling
+         - LINEAR: Linear scaling
+         - YARN: YaRN RoPE scaling
+         """
+         ropeScalingTypeUpper = ropeScalingType.upper()
+         ropeScalingTypes = ["NONE", "LINEAR", "YARN"]
+         if ropeScalingTypeUpper not in ropeScalingTypes:
+             raise ValueError(
+                 f"Invalid RoPE scaling type: {ropeScalingType}. "
+                 + f"Valid values are: {ropeScalingTypes}"
+             )
+         return self._set(ropeScalingType=ropeScalingTypeUpper)

      def setPoolingType(self, poolingType: str):
-         """Set the pooling type for embeddings, use model default if unspecified"""
+         """Set the pooling type for embeddings, use model default if unspecified
+ 
+         Possible values:
+ 
+         - 0 NONE: Don't use any pooling
+         - 1 MEAN: Mean Pooling
+         - 2 CLS: CLS Pooling
+         - 3 LAST: Last token pooling
+         - 4 RANK: For reranked models
+         """
          poolingTypeUpper = poolingType.upper()
-         poolingTypes = ["NONE", "MEAN", "CLS", "LAST"]
+         poolingTypes = ["NONE", "MEAN", "CLS", "LAST", "RANK"]
          if poolingTypeUpper not in poolingTypes:
              raise ValueError(
                  f"Invalid pooling type: {poolingType}. "
@@ -1082,13 +1116,13 @@ class HasLlamaCppProperties:
          """Set a model alias"""
          return self._set(modelAlias=modelAlias)

-     def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
-         """Set path to static lookup cache to use for lookup decoding (not updated by generation)"""
-         return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath)
+     # def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
+     #     """Set path to static lookup cache to use for lookup decoding (not updated by generation)"""
+     #     return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath)

-     def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
-         """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)"""
-         return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath)
+     # def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
+     #     """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)"""
+     #     return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath)

      def setEmbedding(self, embedding: bool):
          """Whether to load model with embedding support"""
@@ -1098,9 +1132,9 @@ class HasLlamaCppProperties:
          """Whether to enable Flash Attention"""
          return self._set(flashAttention=flashAttention)

-     def setInputPrefixBos(self, inputPrefixBos: bool):
-         """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
-         return self._set(inputPrefixBos=inputPrefixBos)
+     # def setInputPrefixBos(self, inputPrefixBos: bool):
+     #     """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
+     #     return self._set(inputPrefixBos=inputPrefixBos)

      def setUseMmap(self, useMmap: bool):
          """Whether to use memory-map model (faster load but may increase pageouts if not using mlock)"""
@@ -1260,9 +1294,9 @@ class HasLlamaCppProperties:
          """Set token id bias"""
          return self._call_java("setTokenBias", tokenBias)

-     def setLoraAdapters(self, loraAdapters: Dict[str, float]):
-         """Set LoRA adapters with their scaling factors"""
-         return self._call_java("setLoraAdapters", loraAdapters)
+     # def setLoraAdapters(self, loraAdapters: Dict[str, float]):
+     #     """Set LoRA adapters with their scaling factors"""
+     #     return self._call_java("setLoraAdapters", loraAdapters)

      def getMetadata(self):
          """Gets the metadata of the model"""
sparknlp/internal/__init__.py CHANGED
@@ -1182,4 +1182,13 @@ class _E5VEmbeddingsLoader(ExtendedJavaWrapper):
              path,
              jspark,
              use_openvino
+         )
+ 
+ class _Phi4Loader(ExtendedJavaWrapper):
+     def __init__(self, path, jspark, use_openvino=False):
+         super(_Phi4Loader, self).__init__(
+             "com.johnsnowlabs.nlp.annotators.seq2seq.Phi4Transformer.loadSavedModel",
+             path,
+             jspark,
+             use_openvino,
          )
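`_Phi4Loader` is the JVM bridge behind `Phi4Transformer.loadSavedModel` shown earlier. A hedged sketch of the intended call path, assuming a running `spark` session and a hypothetical export folder:

```python
from sparknlp.annotator import Phi4Transformer

# Load a locally exported Phi-4 model; delegates to the _Phi4Loader added above
phi4 = (
    Phi4Transformer.loadSavedModel("/models/phi-4-export", spark)
    .setInputCols(["document"])
    .setOutputCol("generation")
)
```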
sparknlp/reader/reader2doc.py ADDED
@@ -0,0 +1,194 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from pyspark import keyword_only
+ from pyspark.ml.param import TypeConverters, Params, Param
+ 
+ from sparknlp.common import AnnotatorType
+ from sparknlp.internal import AnnotatorTransformer
+ from sparknlp.partition.partition_properties import *
+ 
+ 
+ class Reader2Doc(
+     AnnotatorTransformer,
+     HasEmailReaderProperties,
+     HasExcelReaderProperties,
+     HasHTMLReaderProperties,
+     HasPowerPointProperties,
+     HasTextReaderProperties,
+ ):
+     """
+     The Reader2Doc annotator allows you to read files more smoothly within existing
+     Spark NLP workflows, enabling seamless reuse of your pipelines.
+ 
+     Reader2Doc can be used for extracting structured content from various document types
+     using Spark NLP readers. It supports reading from many file types and returns parsed
+     output as a structured Spark DataFrame.
+ 
+     Supported formats include:
+ 
+     - Plain text
+     - HTML
+     - Word (.doc/.docx)
+     - Excel (.xls/.xlsx)
+     - PowerPoint (.ppt/.pptx)
+     - Email files (.eml, .msg)
+     - PDFs
+ 
+     Examples
+     --------
+     >>> from johnsnowlabs.reader import Reader2Doc
+     >>> from johnsnowlabs.nlp.base import DocumentAssembler
+     >>> from pyspark.ml import Pipeline
+     >>> # Initialize Reader2Doc for PDF files
+     >>> reader2doc = Reader2Doc() \\
+     ...     .setContentType("application/pdf") \\
+     ...     .setContentPath(f"{pdf_directory}/")
+     >>> # Build the pipeline with the Reader2Doc stage
+     >>> pipeline = Pipeline(stages=[reader2doc])
+     >>> # Fit the pipeline to an empty DataFrame
+     >>> pipeline_model = pipeline.fit(empty_data_set)
+     >>> result_df = pipeline_model.transform(empty_data_set)
+     >>> # Show the resulting DataFrame
+     >>> result_df.show()
+     +--------------------------------------------------------------------------------------------------------------------------------------------+
+     |document                                                                                                                                    |
+     +--------------------------------------------------------------------------------------------------------------------------------------------+
+     |[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}]                        |
+     |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]      |
+     |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
+     +--------------------------------------------------------------------------------------------------------------------------------------------+
+     """
+ 
+     name = "Reader2Doc"
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+ 
+     contentPath = Param(
+         Params._dummy(),
+         "contentPath",
+         "contentPath path to files to read",
+         typeConverter=TypeConverters.toString,
+     )
+ 
+     outputCol = Param(
+         Params._dummy(),
+         "outputCol",
+         "output column name",
+         typeConverter=TypeConverters.toString,
+     )
+ 
+     contentType = Param(
+         Params._dummy(),
+         "contentType",
+         "Set the content type to load following MIME specification",
+         typeConverter=TypeConverters.toString,
+     )
+ 
+     explodeDocs = Param(
+         Params._dummy(),
+         "explodeDocs",
+         "whether to explode the documents into separate rows",
+         typeConverter=TypeConverters.toBoolean,
+     )
+ 
+     flattenOutput = Param(
+         Params._dummy(),
+         "flattenOutput",
+         "If true, output is flattened to plain text with minimal metadata",
+         typeConverter=TypeConverters.toBoolean,
+     )
+ 
+     titleThreshold = Param(
+         Params._dummy(),
+         "titleThreshold",
+         "Minimum font size threshold for title detection in PDF docs",
+         typeConverter=TypeConverters.toFloat,
+     )
+ 
+     @keyword_only
+     def __init__(self):
+         super(Reader2Doc, self).__init__(classname="com.johnsnowlabs.reader.Reader2Doc")
+         self._setDefault(
+             outputCol="document",
+             explodeDocs=False,
+             contentType="",
+             flattenOutput=False,
+             titleThreshold=18
+         )
+ 
+     @keyword_only
+     def setParams(self):
+         kwargs = self._input_kwargs
+         return self._set(**kwargs)
+ 
+     def setContentPath(self, value):
+         """Sets content path.
+ 
+         Parameters
+         ----------
+         value : str
+             Path to the files to read
+         """
+         return self._set(contentPath=value)
+ 
+     def setContentType(self, value):
+         """
+         Set the content type to load following MIME specification
+ 
+         Parameters
+         ----------
+         value : str
+             Content type to load following MIME specification
+         """
+         return self._set(contentType=value)
+ 
+     def setExplodeDocs(self, value):
+         """Sets whether to explode the documents into separate rows.
+ 
+         Parameters
+         ----------
+         value : boolean
+             Whether to explode the documents into separate rows
+         """
+         return self._set(explodeDocs=value)
+ 
+     def setOutputCol(self, value):
+         """Sets output column name.
+ 
+         Parameters
+         ----------
+         value : str
+             Name of the Output Column
+         """
+         return self._set(outputCol=value)
+ 
+     def setFlattenOutput(self, value):
+         """Sets whether to flatten the output to plain text with minimal metadata.
+ 
+         Parameters
+         ----------
+         value : bool
+             If true, output is flattened to plain text with minimal metadata
+         """
+         return self._set(flattenOutput=value)
+ 
+     def setTitleThreshold(self, value):
+         """Sets the minimum font size threshold for title detection in PDF documents.
+ 
+         Parameters
+         ----------
+         value : float
+             Minimum font size threshold for title detection in PDF docs
+         """
+         return self._set(titleThreshold=value)
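The docstring example above imports from the `johnsnowlabs` wrapper package; per the RECORD entry, the module in this wheel lives at `sparknlp/reader/reader2doc.py`, so a plain Spark NLP pipeline would look roughly like the sketch below (the input directory and the running `spark` session are assumptions):

```python
from sparknlp.reader import Reader2Doc
from pyspark.ml import Pipeline

reader2doc = (
    Reader2Doc()
    .setContentType("text/html")          # MIME type of the inputs
    .setContentPath("/data/html-files/")  # hypothetical input directory
    .setOutputCol("document")
)

# Reader2Doc reads from contentPath, so the input DataFrame can be empty
empty_df = spark.createDataFrame([], "text STRING")
docs = Pipeline(stages=[reader2doc]).fit(empty_df).transform(empty_df)
docs.show(truncate=False)
```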
sparknlp/reader/sparknlp_reader.py CHANGED
@@ -413,4 +413,49 @@ class SparkNLPReader(ExtendedJavaWrapper):
          if not isinstance(filePath, str):
              raise TypeError("filePath must be a string")
          jdf = self._java_obj.md(filePath)
+         return self.getDataFrame(self.spark, jdf)
+ 
+     def csv(self, csvPath):
+         """Reads CSV files and returns a Spark DataFrame.
+ 
+         Parameters
+         ----------
+         csvPath : str
+             Path to a CSV file or a directory containing CSV files.
+ 
+         Returns
+         -------
+         pyspark.sql.DataFrame
+             A DataFrame containing parsed CSV content.
+ 
+         Examples
+         --------
+         >>> from sparknlp.reader import SparkNLPReader
+         >>> csv_df = SparkNLPReader(spark).csv("home/user/csv-directory")
+ 
+         You can use Spark NLP in one line of code
+ 
+         >>> import sparknlp
+         >>> csv_df = sparknlp.read().csv("home/user/csv-directory")
+         >>> csv_df.show(truncate=False)
+         +-----------------------------------------------------------------------------------------------------------------------------------------+
+         |csv                                                                                                                                      |
+         +-----------------------------------------------------------------------------------------------------------------------------------------+
+         |[{NarrativeText, Alice 100 Bob 95, {}}, {Table, <table><tr><td>Alice</td><td>100</td></tr><tr><td>Bob</td><td>95</td></tr></table>, {}}] |
+         +-----------------------------------------------------------------------------------------------------------------------------------------+
+ 
+         >>> csv_df.printSchema()
+         root
+          |-- path: string (nullable = true)
+          |-- csv: array (nullable = true)
+          |    |-- element: struct (containsNull = true)
+          |    |    |-- elementType: string (nullable = true)
+          |    |    |-- content: string (nullable = true)
+          |    |    |-- metadata: map (nullable = true)
+          |    |    |    |-- key: string
+          |    |    |    |-- value: string (valueContainsNull = true)
+         """
+         if not isinstance(csvPath, str):
+             raise TypeError("csvPath must be a string")
+         jdf = self._java_obj.csv(csvPath)
          return self.getDataFrame(self.spark, jdf)