spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
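The single hunk shown below is the largest addition in the list: entry 164, sparknlp/annotator/seq2seq/phi4_transformer.py (+387 lines). When reviewing an upgrade like this one it can be worth confirming which side of the diff an environment is actually on; a minimal check, using the version() helper that exists in both releases:

import sparknlp

# Prints the installed Spark NLP version string:
# "4.2.6" before the upgrade, "6.2.1" after.
print(sparknlp.version())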
sparknlp/annotator/seq2seq/phi4_transformer.py (new file)
@@ -0,0 +1,387 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the Phi4Transformer."""
+
+ from sparknlp.common import *
+
+ class Phi4Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
+     """Phi-4: State-of-the-art open model by Microsoft Research
+
+     Phi-4 is a 14B parameter, dense decoder-only Transformer model trained on 9.8T tokens, designed for advanced reasoning, code, and general NLP tasks.
+     For more details, see: https://huggingface.co/microsoft/phi-4
+
+     Model Overview
+     --------------
+     - 14B parameters, dense decoder-only Transformer
+     - 16K context length
+     - Trained on 9.8T tokens (synthetic, public domain, academic, Q&A, code)
+     - Focus on high-quality, advanced reasoning, math, code, and general NLP
+     - Multilingual data: ~8% (primarily English)
+     - Released under MIT License
+
+     Intended Use
+     ------------
+     - General-purpose AI, research, and generative features
+     - Memory/compute constrained and latency-bound environments
+     - Reasoning, logic, and code generation
+
+     Benchmarks
+     ----------
+     - MMLU: 84.8 | HumanEval: 82.6 | GPQA: 56.1 | DROP: 75.5 | MATH: 80.6
+     - Outperforms or matches other 14B/70B models on many tasks
+
+     Safety & Limitations
+     --------------------
+     - Safety alignment via SFT and DPO, red-teamed by Microsoft AIRT
+     - Not intended for high-risk or consequential domains without further assessment
+     - Primarily English; other languages may have reduced performance
+     - May generate inaccurate, offensive, or biased content; use with care
+
+     Usage
+     -----
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
+
+     >>> phi4 = Phi4Transformer.pretrained() \
+     ...     .setInputCols(["document"]) \
+     ...     .setOutputCol("generation")
+
+     If no name is provided, the default model is ``"phi-4"``. For available pretrained models, please see the `Models Hub <https://huggingface.co/microsoft/phi-4>`__.
+
+     Note
+     ----
+     This is a resource-intensive module, especially with larger models and sequences. Use of accelerators such as GPUs is strongly recommended.
+
+     References
+     ----------
+     - https://huggingface.co/microsoft/phi-4
+     - arXiv:2412.08905
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     minOutputLength
+         Minimum length of the sequence to be generated, by default 0
+     maxOutputLength
+         Maximum length of output text, by default 20
+     doSample
+         Whether or not to use sampling; use greedy decoding otherwise, by default False
+     temperature
+         The value used to modulate the next token probabilities, by default 0.6
+     topK
+         The number of highest probability vocabulary tokens to keep for
+         top-k-filtering, by default -1 (no top-k filtering)
+     topP
+         Top cumulative probability for vocabulary tokens, by default 0.9
+
+         If set to float < 1, only the most probable tokens with probabilities
+         that add up to ``topP`` or higher are kept for generation.
+     repetitionPenalty
+         The parameter for repetition penalty, where 1.0 means no penalty, by
+         default 1.0
+     noRepeatNgramSize
+         If set to int > 0, all ngrams of that size can only occur once, by
+         default 3
+     ignoreTokenIds
+         A list of token ids which are ignored in the decoder's output, by
+         default []
+     beamSize
+         The number of beams to use for beam search, by default 1
+     stopTokenIds
+         A list of token ids which are considered as stop tokens in the
+         decoder's output, by default [128001]
+
+     Notes
+     -----
+     This is a very computationally expensive module, especially on larger
+     sequences. The use of an accelerator such as GPU is recommended.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \
+     ...     .setInputCol("text") \
+     ...     .setOutputCol("documents")
+     >>> phi4 = Phi4Transformer.pretrained("phi-4") \
+     ...     .setInputCols(["documents"]) \
+     ...     .setMaxOutputLength(60) \
+     ...     .setOutputCol("generation")
+     >>> pipeline = Pipeline().setStages([documentAssembler, phi4])
+     >>> data = spark.createDataFrame([
+     ...     (
+     ...         1,
+     ...         "<|start_header_id|>system<|end_header_id|> \\n" + \
+     ...         "You are a helpful assistant! \\n" + \
+     ...         "<|start_header_id|>user<|end_header_id|> \\n" + \
+     ...         "What is Phi-4? \\n" + \
+     ...         "<|start_header_id|>assistant<|end_header_id|> \\n"
+     ...     )
+     ... ]).toDF("id", "text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("generation.result").show(truncate=False)
+     +-----------------------------------------------------------------------------------------------------------------------------------------------------+
+     |result                                                                                                                                               |
+     +-----------------------------------------------------------------------------------------------------------------------------------------------------+
+     |[Phi-4 is a 14B parameter, dense decoder-only Transformer model developed by Microsoft Research for advanced reasoning, code, and general NLP tasks.]|
+     +-----------------------------------------------------------------------------------------------------------------------------------------------------+
+     """
+
+     name = "Phi4Transformer"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+                             typeConverter=TypeConverters.toInt)
+
+     maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+                             typeConverter=TypeConverters.toInt)
+
+     doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+                      typeConverter=TypeConverters.toBoolean)
+
+     temperature = Param(Params._dummy(), "temperature", "The value used to modulate the next token probabilities",
+                         typeConverter=TypeConverters.toFloat)
+
+     topK = Param(Params._dummy(), "topK",
+                  "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+                  typeConverter=TypeConverters.toInt)
+
+     topP = Param(Params._dummy(), "topP",
+                  "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+                  typeConverter=TypeConverters.toFloat)
+
+     repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+                               "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+                               typeConverter=TypeConverters.toFloat)
+
+     noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+                               "If set to int > 0, all ngrams of that size can only occur once",
+                               typeConverter=TypeConverters.toInt)
+
+     ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+                            "A list of token ids which are ignored in the decoder's output",
+                            typeConverter=TypeConverters.toListInt)
+
+     beamSize = Param(Params._dummy(), "beamSize",
+                      "The number of beams to use for beam search",
+                      typeConverter=TypeConverters.toInt)
+
+     stopTokenIds = Param(Params._dummy(), "stopTokenIds",
+                          "A list of token ids which are considered as stop tokens in the decoder's output",
+                          typeConverter=TypeConverters.toListInt)
+
+     def setIgnoreTokenIds(self, value):
+         """A list of token ids which are ignored in the decoder's output.
+
+         Parameters
+         ----------
+         value : List[int]
+             The words to be filtered out
+         """
+         return self._set(ignoreTokenIds=value)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setMinOutputLength(self, value):
+         """Sets minimum length of the sequence to be generated.
+
+         Parameters
+         ----------
+         value : int
+             Minimum length of the sequence to be generated
+         """
+         return self._set(minOutputLength=value)
+
+     def setMaxOutputLength(self, value):
+         """Sets maximum length of output text.
+
+         Parameters
+         ----------
+         value : int
+             Maximum length of output text
+         """
+         return self._set(maxOutputLength=value)
+
+     def setDoSample(self, value):
+         """Sets whether or not to use sampling; use greedy decoding otherwise.
+
+         Parameters
+         ----------
+         value : bool
+             Whether or not to use sampling; use greedy decoding otherwise
+         """
+         return self._set(doSample=value)
+
+     def setTemperature(self, value):
+         """Sets the value used to modulate the next token probabilities.
+
+         Parameters
+         ----------
+         value : float
+             The value used to modulate the next token probabilities
+         """
+         return self._set(temperature=value)
+
+     def setTopK(self, value):
+         """Sets the number of highest probability vocabulary tokens to keep for
+         top-k-filtering.
+
+         Parameters
+         ----------
+         value : int
+             Number of highest probability vocabulary tokens to keep
+         """
+         return self._set(topK=value)
+
+     def setTopP(self, value):
+         """Sets the top cumulative probability for vocabulary tokens.
+
+         If set to float < 1, only the most probable tokens with probabilities
+         that add up to ``topP`` or higher are kept for generation.
+
+         Parameters
+         ----------
+         value : float
+             Cumulative probability for vocabulary tokens
+         """
+         return self._set(topP=value)
+
+     def setRepetitionPenalty(self, value):
+         """Sets the parameter for repetition penalty. 1.0 means no penalty.
+
+         Parameters
+         ----------
+         value : float
+             The repetition penalty
+
+         References
+         ----------
+         See `Ctrl: A Conditional Transformer Language Model For Controllable
+         Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+         """
+         return self._set(repetitionPenalty=value)
+
+     def setNoRepeatNgramSize(self, value):
+         """Sets size of n-grams that can only occur once.
+
+         If set to int > 0, all ngrams of that size can only occur once.
+
+         Parameters
+         ----------
+         value : int
+             N-gram size that can only occur once
+         """
+         return self._set(noRepeatNgramSize=value)
+
+     def setBeamSize(self, value):
+         """Sets the number of beams to use for beam search.
+
+         Parameters
+         ----------
+         value : int
+             The number of beams to use for beam search
+         """
+         return self._set(beamSize=value)
+
+     def setStopTokenIds(self, value):
+         """Sets a list of token ids which are considered as stop tokens in the decoder's output.
+
+         Parameters
+         ----------
+         value : List[int]
+             The tokens to be considered as stop tokens
+         """
+         return self._set(stopTokenIds=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.Phi4Transformer", java_model=None):
+         super(Phi4Transformer, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             minOutputLength=0,
+             maxOutputLength=20,
+             doSample=False,
+             temperature=0.6,
+             topK=-1,
+             topP=0.9,
+             repetitionPenalty=1.0,
+             noRepeatNgramSize=3,
+             ignoreTokenIds=[],
+             batchSize=1,
+             beamSize=1,
+             stopTokenIds=[128001]
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session, use_openvino=False):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+         use_openvino : bool, optional
+             Whether to load the model with the OpenVINO engine, by default False
+
+         Returns
+         -------
+         Phi4Transformer
+             The restored model
+         """
+         from sparknlp.internal import _Phi4Loader
+         jModel = _Phi4Loader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+         return Phi4Transformer(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="phi-4", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "phi-4"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         Phi4Transformer
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(Phi4Transformer, name, lang, remote_loc)
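
Taken together, the hunk above gives everything needed for an end-to-end smoke test of the new annotator. The following sketch is assembled only from the API surface shown in this file plus the standard sparknlp.start() session helper; the prompt text and parameter values are illustrative, not an official example from the package:

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Phi4Transformer
from pyspark.ml import Pipeline

# Start a Spark session with Spark NLP on the classpath.
spark = sparknlp.start()

# DOCUMENT in, DOCUMENT out, per the annotation-type table in the docstring.
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("documents")

# Defaults from _setDefault apply (temperature=0.6, topP=0.9, topK=-1, ...);
# only the output length cap is overridden here.
phi4 = Phi4Transformer.pretrained("phi-4") \
    .setInputCols(["documents"]) \
    .setOutputCol("generation") \
    .setMaxOutputLength(120)

pipeline = Pipeline().setStages([documentAssembler, phi4])

data = spark.createDataFrame([(1, "Explain beam search in one short paragraph.")]) \
    .toDF("id", "text")

# Fit is a no-op for pretrained annotators; transform runs generation.
pipeline.fit(data).transform(data) \
    .selectExpr("explode(generation.result) as generated") \
    .show(truncate=False)

A model exported from elsewhere would be wired in the same way, with loadSavedModel(folder, spark) replacing the pretrained() call.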