spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
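The listing shows the old single-module layout (`sparknlp/annotator.py`, `sparknlp/base.py`, `sparknlp/embeddings.py`) replaced by per-annotator subpackages. Below is a minimal usage sketch against the reorganized 6.x layout; it assumes a local Spark installation and only uses classes that appear in the file list above (`sparknlp/base/document_assembler.py`, `sparknlp/annotator/token/tokenizer.py`).

# Minimal sketch, assuming Spark NLP 6.x is installed alongside PySpark.
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer
from pyspark.ml import Pipeline

spark = sparknlp.start()  # starts a SparkSession with the Spark NLP jar attached

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

pipeline = Pipeline(stages=[documentAssembler, tokenizer])
data = spark.createDataFrame([["Spark NLP 6.2.1 splits annotators into subpackages."]]).toDF("text")
pipeline.fit(data).transform(data).select("token.result").show(truncate=False)

The top-level imports (`from sparknlp.annotator import ...`) keep working because the new `sparknlp/annotator/__init__.py` re-exports the annotator classes from their subpackages.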
sparknlp/annotator/seq2seq/olmo_transformer.py
@@ -0,0 +1,326 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the OLMoTransformer."""
+
+ from sparknlp.common import *
+
+
+ class OLMoTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
+     """OLMo: Open Language Models
+
+     OLMo is a series of Open Language Models designed to enable the science of
+     language models. The OLMo models are trained on the Dolma dataset. All code,
+     checkpoints, logs, and details involved in training these models are released.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> olmo = OLMoTransformer.pretrained() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("generation")
+
+     The default model is ``"olmo_1b_int4"``, if no name is provided. For available
+     pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?q=olmo>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     minOutputLength
+         Minimum length of the sequence to be generated, by default 0
+     maxOutputLength
+         Maximum length of output text, by default 20
+     doSample
+         Whether or not to use sampling; use greedy decoding otherwise, by default False
+     temperature
+         The value used to modulate the next token probabilities, by default 0.6
+     topK
+         The number of highest probability vocabulary tokens to keep for
+         top-k-filtering, by default 50
+     topP
+         Top cumulative probability for vocabulary tokens, by default 0.9
+
+         If set to float < 1, only the most probable tokens with probabilities
+         that add up to ``topP`` or higher are kept for generation.
+     repetitionPenalty
+         The parameter for repetition penalty; 1.0 means no penalty, by default 1.0
+     noRepeatNgramSize
+         If set to int > 0, all ngrams of that size can only occur once, by default 0
+     ignoreTokenIds
+         A list of token ids which are ignored in the decoder's output, by default []
+
+     Notes
+     -----
+     This is a very computationally expensive module, especially on longer
+     sequences. The use of an accelerator such as a GPU is recommended.
+
+     References
+     ----------
+     - `OLMo Project Page <https://allenai.org/olmo>`__
+     - `OLMo GitHub Repository <https://github.com/allenai/OLMo>`__
+     - `OLMo: Accelerating the Science of Language Models
+       <https://arxiv.org/pdf/2402.00838.pdf>`__
+
+     **Paper Abstract:**
+
+     *Language models (LMs) have become ubiquitous in both NLP research and in commercial product offerings.
+     As their commercial importance has surged, the most powerful models have become closed off, gated behind
+     proprietary interfaces, with important details of their training data, architectures, and development
+     undisclosed. Given the importance of these details in scientifically studying these models, including
+     their biases and potential risks, we believe it is essential for the research community to have access
+     to powerful, truly open LMs. To this end, this technical report details the first release of OLMo,
+     a state-of-the-art, truly Open Language Model and its framework to build and study the science of
+     language modeling. Unlike most prior efforts that have only released model weights and inference code,
+     we release OLMo and the whole framework, including training data and training and evaluation code.
+     We hope this release will empower and strengthen the open research community and inspire a new wave
+     of innovation.*
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("documents")
+     >>> olmo = OLMoTransformer.pretrained() \\
+     ...     .setInputCols(["documents"]) \\
+     ...     .setMaxOutputLength(50) \\
+     ...     .setOutputCol("generation")
+     >>> pipeline = Pipeline().setStages([documentAssembler, olmo])
+     >>> data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("generation.result").show(truncate=False)
+     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+     |result                                                                                                                                                                          |
+     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+     |[My name is Leonardo. I am a student of the University of California, Berkeley. I am interested in the field of Artificial Intelligence and its applications in the real world. I have a strong |
+     | passion for learning and am always looking for ways to improve my knowledge and skills]                                                                                        |
+     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+     """
+
+     name = "OLMoTransformer"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+                             typeConverter=TypeConverters.toInt)
+
+     maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+                             typeConverter=TypeConverters.toInt)
+
+     doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+                      typeConverter=TypeConverters.toBoolean)
+
+     temperature = Param(Params._dummy(), "temperature", "The value used to modulate the next token probabilities",
+                         typeConverter=TypeConverters.toFloat)
+
+     topK = Param(Params._dummy(), "topK",
+                  "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+                  typeConverter=TypeConverters.toInt)
+
+     topP = Param(Params._dummy(), "topP",
+                  "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+                  typeConverter=TypeConverters.toFloat)
+
+     repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+                               "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+                               typeConverter=TypeConverters.toFloat)
+
+     noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+                               "If set to int > 0, all ngrams of that size can only occur once",
+                               typeConverter=TypeConverters.toInt)
+
+     ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+                            "A list of token ids which are ignored in the decoder's output",
+                            typeConverter=TypeConverters.toListInt)
+
+     def setIgnoreTokenIds(self, value):
+         """Sets a list of token ids which are ignored in the decoder's output.
+
+         Parameters
+         ----------
+         value : List[int]
+             The token ids to be filtered out
+         """
+         return self._set(ignoreTokenIds=value)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setMinOutputLength(self, value):
+         """Sets minimum length of the sequence to be generated.
+
+         Parameters
+         ----------
+         value : int
+             Minimum length of the sequence to be generated
+         """
+         return self._set(minOutputLength=value)
+
+     def setMaxOutputLength(self, value):
+         """Sets maximum length of output text.
+
+         Parameters
+         ----------
+         value : int
+             Maximum length of output text
+         """
+         return self._set(maxOutputLength=value)
+
+     def setDoSample(self, value):
+         """Sets whether or not to use sampling; use greedy decoding otherwise.
+
+         Parameters
+         ----------
+         value : bool
+             Whether or not to use sampling; use greedy decoding otherwise
+         """
+         return self._set(doSample=value)
+
+     def setTemperature(self, value):
+         """Sets the value used to modulate the next token probabilities.
+
+         Parameters
+         ----------
+         value : float
+             The value used to modulate the next token probabilities
+         """
+         return self._set(temperature=value)
+
+     def setTopK(self, value):
+         """Sets the number of highest probability vocabulary tokens to keep for
+         top-k-filtering.
+
+         Parameters
+         ----------
+         value : int
+             Number of highest probability vocabulary tokens to keep
+         """
+         return self._set(topK=value)
+
+     def setTopP(self, value):
+         """Sets the top cumulative probability for vocabulary tokens.
+
+         If set to float < 1, only the most probable tokens with probabilities
+         that add up to ``topP`` or higher are kept for generation.
+
+         Parameters
+         ----------
+         value : float
+             Cumulative probability for vocabulary tokens
+         """
+         return self._set(topP=value)
+
+     def setRepetitionPenalty(self, value):
+         """Sets the parameter for repetition penalty. 1.0 means no penalty.
+
+         Parameters
+         ----------
+         value : float
+             The repetition penalty
+
+         References
+         ----------
+         See `Ctrl: A Conditional Transformer Language Model For Controllable
+         Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+         """
+         return self._set(repetitionPenalty=value)
+
+     def setNoRepeatNgramSize(self, value):
+         """Sets size of n-grams that can only occur once.
+
+         If set to int > 0, all ngrams of that size can only occur once.
+
+         Parameters
+         ----------
+         value : int
+             N-gram size that can only occur once
+         """
+         return self._set(noRepeatNgramSize=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.OLMoTransformer", java_model=None):
+         super(OLMoTransformer, self).__init__(classname=classname, java_model=java_model)
+         self._setDefault(minOutputLength=0, maxOutputLength=20, doSample=False, temperature=0.6, topK=50, topP=0.9,
+                          repetitionPenalty=1.0, noRepeatNgramSize=0, ignoreTokenIds=[], batchSize=1)
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         OLMoTransformer
+             The restored model
+         """
+         from sparknlp.internal import _OLMoLoader
+         jModel = _OLMoLoader(folder, spark_session._jsparkSession)._java_obj
+         return OLMoTransformer(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="olmo_1b_int4", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "olmo_1b_int4"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         OLMoTransformer
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(OLMoTransformer, name, lang, remote_loc)
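The docstring above already shows the DataFrame-based pipeline. For quick, single-string inference the same fitted pipeline can be wrapped in `LightPipeline` (see `sparknlp/base/light_pipeline.py` in the file list). A hedged, self-contained sketch; the pretrained name is left to the documented default and actual availability depends on the Models Hub:

# Sketch: ad-hoc text generation with the OLMo annotator via LightPipeline.
import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import OLMoTransformer
from pyspark.ml import Pipeline

spark = sparknlp.start()
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
olmo = OLMoTransformer.pretrained().setInputCols(["document"]).setOutputCol("generation")

empty = spark.createDataFrame([[""]]).toDF("text")
model = Pipeline(stages=[documentAssembler, olmo]).fit(empty)

# LightPipeline skips the DataFrame round trip for single strings.
light = LightPipeline(model)
print(light.annotate("My name is Leonardo.")["generation"])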
sparknlp/annotator/seq2seq/phi2_transformer.py
@@ -0,0 +1,326 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the Phi2Transformer."""
+
+ from sparknlp.common import *
+
+
+ class Phi2Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
+     """Phi-2: a small language model from Microsoft Research.
+
+     Phi-2 is a Transformer with 2.7 billion parameters. It was trained using the same data sources as Phi-1.5,
+     augmented with a new data source that consists of various NLP synthetic texts and filtered websites
+     (for safety and educational value). When assessed against benchmarks testing common sense, language
+     understanding, and logical reasoning, Phi-2 showed nearly state-of-the-art performance among models with
+     fewer than 13 billion parameters.
+
+     Phi-2 has not been fine-tuned through reinforcement learning from human feedback. The intention behind
+     crafting this open-source model is to provide the research community with a non-restricted small model to
+     explore vital safety challenges, such as reducing toxicity, understanding societal biases, enhancing
+     controllability, and more.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> phi2 = Phi2Transformer.pretrained() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("generation")
+
+     The default model is ``"phi2"``, if no name is provided. For available
+     pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?q=phi2>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     minOutputLength
+         Minimum length of the sequence to be generated, by default 0
+     maxOutputLength
+         Maximum length of output text, by default 20
+     doSample
+         Whether or not to use sampling; use greedy decoding otherwise, by default False
+     temperature
+         The value used to modulate the next token probabilities, by default 0.6
+     topK
+         The number of highest probability vocabulary tokens to keep for
+         top-k-filtering, by default 50
+     topP
+         Top cumulative probability for vocabulary tokens, by default 0.9
+
+         If set to float < 1, only the most probable tokens with probabilities
+         that add up to ``topP`` or higher are kept for generation.
+     repetitionPenalty
+         The parameter for repetition penalty; 1.0 means no penalty, by default 1.0
+     noRepeatNgramSize
+         If set to int > 0, all ngrams of that size can only occur once, by default 0
+     ignoreTokenIds
+         A list of token ids which are ignored in the decoder's output, by default []
+
+     Notes
+     -----
+     This is a very computationally expensive module, especially on longer
+     sequences. The use of an accelerator such as a GPU is recommended.
+
+     References
+     ----------
+     - `Phi-2: The surprising power of small language models
+       <https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/>`__
+     - https://huggingface.co/microsoft/phi-2
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("documents")
+     >>> phi2 = Phi2Transformer.pretrained("phi2") \\
+     ...     .setInputCols(["documents"]) \\
+     ...     .setMaxOutputLength(50) \\
+     ...     .setOutputCol("generation")
+     >>> pipeline = Pipeline().setStages([documentAssembler, phi2])
+     >>> data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("generation.result").show(truncate=False)
+     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+     |result                                                                                                                                                                          |
+     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+     |[My name is Leonardo. I am a student of the University of California, Berkeley. I am interested in the field of Artificial Intelligence and its applications in the real world. I have a strong |
+     | passion for learning and am always looking for ways to improve my knowledge and skills]                                                                                        |
+     +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+     """
+
+     name = "Phi2Transformer"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+                             typeConverter=TypeConverters.toInt)
+
+     maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+                             typeConverter=TypeConverters.toInt)
+
+     doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+                      typeConverter=TypeConverters.toBoolean)
+
+     temperature = Param(Params._dummy(), "temperature", "The value used to modulate the next token probabilities",
+                         typeConverter=TypeConverters.toFloat)
+
+     topK = Param(Params._dummy(), "topK",
+                  "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+                  typeConverter=TypeConverters.toInt)
+
+     topP = Param(Params._dummy(), "topP",
+                  "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+                  typeConverter=TypeConverters.toFloat)
+
+     repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+                               "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+                               typeConverter=TypeConverters.toFloat)
+
+     noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+                               "If set to int > 0, all ngrams of that size can only occur once",
+                               typeConverter=TypeConverters.toInt)
+
+     ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+                            "A list of token ids which are ignored in the decoder's output",
+                            typeConverter=TypeConverters.toListInt)
+
+     def setIgnoreTokenIds(self, value):
+         """Sets a list of token ids which are ignored in the decoder's output.
+
+         Parameters
+         ----------
+         value : List[int]
+             The token ids to be filtered out
+         """
+         return self._set(ignoreTokenIds=value)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setMinOutputLength(self, value):
+         """Sets minimum length of the sequence to be generated.
+
+         Parameters
+         ----------
+         value : int
+             Minimum length of the sequence to be generated
+         """
+         return self._set(minOutputLength=value)
+
+     def setMaxOutputLength(self, value):
+         """Sets maximum length of output text.
+
+         Parameters
+         ----------
+         value : int
+             Maximum length of output text
+         """
+         return self._set(maxOutputLength=value)
+
+     def setDoSample(self, value):
+         """Sets whether or not to use sampling; use greedy decoding otherwise.
+
+         Parameters
+         ----------
+         value : bool
+             Whether or not to use sampling; use greedy decoding otherwise
+         """
+         return self._set(doSample=value)
+
+     def setTemperature(self, value):
+         """Sets the value used to modulate the next token probabilities.
+
+         Parameters
+         ----------
+         value : float
+             The value used to modulate the next token probabilities
+         """
+         return self._set(temperature=value)
+
+     def setTopK(self, value):
+         """Sets the number of highest probability vocabulary tokens to keep for
+         top-k-filtering.
+
+         Parameters
+         ----------
+         value : int
+             Number of highest probability vocabulary tokens to keep
+         """
+         return self._set(topK=value)
+
+     def setTopP(self, value):
+         """Sets the top cumulative probability for vocabulary tokens.
+
+         If set to float < 1, only the most probable tokens with probabilities
+         that add up to ``topP`` or higher are kept for generation.
+
+         Parameters
+         ----------
+         value : float
+             Cumulative probability for vocabulary tokens
+         """
+         return self._set(topP=value)
+
+     def setRepetitionPenalty(self, value):
+         """Sets the parameter for repetition penalty. 1.0 means no penalty.
+
+         Parameters
+         ----------
+         value : float
+             The repetition penalty
+
+         References
+         ----------
+         See `Ctrl: A Conditional Transformer Language Model For Controllable
+         Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+         """
+         return self._set(repetitionPenalty=value)
+
+     def setNoRepeatNgramSize(self, value):
+         """Sets size of n-grams that can only occur once.
+
+         If set to int > 0, all ngrams of that size can only occur once.
+
+         Parameters
+         ----------
+         value : int
+             N-gram size that can only occur once
+         """
+         return self._set(noRepeatNgramSize=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.Phi2Transformer", java_model=None):
+         super(Phi2Transformer, self).__init__(classname=classname, java_model=java_model)
+         self._setDefault(minOutputLength=0, maxOutputLength=20, doSample=False, temperature=0.6, topK=50, topP=0.9,
+                          repetitionPenalty=1.0, noRepeatNgramSize=0, ignoreTokenIds=[], batchSize=1)
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session, use_openvino=False):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+         use_openvino : bool, optional
+             Whether to use the OpenVINO engine, by default False
+
+         Returns
+         -------
+         Phi2Transformer
+             The restored model
+         """
+         from sparknlp.internal import _Phi2Loader
+         jModel = _Phi2Loader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+         return Phi2Transformer(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="phi2", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "phi2"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         Phi2Transformer
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(Phi2Transformer, name, lang, remote_loc)
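Besides `pretrained`, the `loadSavedModel` method above supports importing a locally exported model, with the `use_openvino` flag selecting the OpenVINO backend. A hedged sketch of that path; the folder names are placeholders and the export folder is assumed to already contain a model prepared for Spark NLP:

# Sketch: import a locally exported Phi-2 model, then persist and reload it
# as a regular Spark NLP annotator.
import sparknlp
from sparknlp.annotator import Phi2Transformer

spark = sparknlp.start()

phi2 = Phi2Transformer.loadSavedModel("/models/phi2_export", spark, use_openvino=False) \
    .setInputCols(["documents"]) \
    .setOutputCol("generation")

phi2.write().overwrite().save("/models/phi2_spark_nlp")
phi2_loaded = Phi2Transformer.load("/models/phi2_spark_nlp")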