spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator/seq2seq/llama3_transformer.py
@@ -0,0 +1,381 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the LLAMA3Transformer."""
+
+ from sparknlp.common import *
+
+
+ class LLAMA3Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
+     """Llama 3: Cutting-Edge Foundation and Fine-Tuned Chat Models
+
+     The Llama 3 release introduces a new family of pretrained and fine-tuned LLMs,
+     ranging in scale from 8B to 70B parameters. Llama 3 models are designed with
+     enhanced efficiency, performance, and safety, making them more capable than
+     previous versions. These models are trained on a more diverse and expansive
+     dataset, offering improved contextual understanding and generation quality.
+
+     The fine-tuned models, known as Llama 3-instruct, are optimized for dialogue
+     applications using an advanced version of Reinforcement Learning from Human
+     Feedback (RLHF). Llama 3-instruct models demonstrate superior performance across
+     multiple benchmarks, outperforming Llama 2 and even matching or exceeding the
+     capabilities of some closed-source models.
33
+ Pretrained models can be loaded with :meth:`.pretrained` of the companion
34
+ object:
35
+
36
+ >>> llama3 = LLAMA3Transformer.pretrained() \\
37
+ ... .setInputCols(["document"]) \\
38
+ ... .setOutputCol("generation")
39
+
40
+
41
+ The default model is ``"llama_3_7b_instruct_hf_int4"``, if no name is provided. For available
42
+ pretrained models please see the `Models Hub
43
+ <https://sparknlp.org/models?q=llama3>`__.
44
+
45
+ ====================== ======================
46
+ Input Annotation types Output Annotation type
47
+ ====================== ======================
48
+ ``DOCUMENT`` ``DOCUMENT``
49
+ ====================== ======================
50
+
+     Parameters
+     ----------
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     minOutputLength
+         Minimum length of the sequence to be generated, by default 0
+     maxOutputLength
+         Maximum length of output text, by default 20
+     doSample
+         Whether or not to use sampling; use greedy decoding otherwise, by
+         default False
+     temperature
+         The value used to modulate the next token probabilities, by default 0.6
+     topK
+         The number of highest probability vocabulary tokens to keep for
+         top-k-filtering, by default -1
+     topP
+         Top cumulative probability for vocabulary tokens, by default 0.9
+
+         If set to float < 1, only the most probable tokens with probabilities
+         that add up to ``topP`` or higher are kept for generation.
+     repetitionPenalty
+         The parameter for repetition penalty, 1.0 means no penalty, by default
+         1.0
+     noRepeatNgramSize
+         If set to int > 0, all ngrams of that size can only occur once, by
+         default 3
+     ignoreTokenIds
+         A list of token ids which are ignored in the decoder's output, by
+         default []
+     beamSize
+         The number of beams to use for beam search, by default 1
+     stopTokenIds
+         A list of token ids which are considered stop tokens in the decoder's
+         output, by default [128001]
81
+ Notes
82
+ -----
83
+ This is a very computationally expensive module, especially on larger
84
+ sequences. The use of an accelerator such as GPU is recommended.
85
+
86
+ References
87
+ ----------
88
+ - `Llama 3: Cutting-Edge Foundation and Fine-Tuned Chat Models
89
+ <https://ai.meta.com/blog/meta-llama-3/>`__
90
+ - https://github.com/facebookresearch/llama
91
+
92
+ **Paper Abstract:**
93
+
94
+ *Llama 3 is the latest iteration of large language models from Meta, offering a range of models
95
+ from 1 billion to 70 billion parameters. The fine-tuned versions, known as Llama 3-Chat, are
96
+ specifically designed for dialogue applications and have been optimized using advanced techniques
97
+ such as RLHF. Llama 3 models show remarkable improvements in both safety and performance, making
98
+ them a leading choice in both open-source and commercial settings. Our comprehensive approach to
99
+ training and fine-tuning these models is aimed at encouraging responsible AI development and fostering
100
+ community collaboration.*
101
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("documents")
+     >>> llama3 = LLAMA3Transformer.pretrained("llama_3_7b_instruct_hf_int4") \\
+     ...     .setInputCols(["documents"]) \\
+     ...     .setMaxOutputLength(60) \\
+     ...     .setOutputCol("generation")
+     >>> pipeline = Pipeline().setStages([documentAssembler, llama3])
+     >>> data = spark.createDataFrame([
+     ...     (
+     ...         1,
+     ...         "<|start_header_id|>system<|end_header_id|> \\n" + \
+     ...         "You are a minion chatbot who always responds in minion speak! \\n" + \
+     ...         "<|start_header_id|>user<|end_header_id|> \\n" + \
+     ...         "Who are you? \\n" + \
+     ...         "<|start_header_id|>assistant<|end_header_id|> \\n"
+     ...     )
+     ... ]).toDF("id", "text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("generation.result").show(truncate=False)
+     +------------------------------------------------------------------------------------------+
+     |result                                                                                    |
+     +------------------------------------------------------------------------------------------+
+     |[Oooh, me am Minion! Me help you with things! Me speak Minion language, yeah! Bana-na-na!]|
+     +------------------------------------------------------------------------------------------+
+     """
+
+
+     name = "LLAMA3Transformer"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+                             typeConverter=TypeConverters.toInt)
+
+     maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+                             typeConverter=TypeConverters.toInt)
+
+     doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+                      typeConverter=TypeConverters.toBoolean)
+
+     temperature = Param(Params._dummy(), "temperature", "The value used to modulate the next token probabilities",
+                         typeConverter=TypeConverters.toFloat)
+
+     topK = Param(Params._dummy(), "topK",
+                  "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+                  typeConverter=TypeConverters.toInt)
+
+     topP = Param(Params._dummy(), "topP",
+                  "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+                  typeConverter=TypeConverters.toFloat)
+
+     repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+                               "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+                               typeConverter=TypeConverters.toFloat)
+
+     noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+                               "If set to int > 0, all ngrams of that size can only occur once",
+                               typeConverter=TypeConverters.toInt)
+
+     ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+                            "A list of token ids which are ignored in the decoder's output",
+                            typeConverter=TypeConverters.toListInt)
+
+     beamSize = Param(Params._dummy(), "beamSize",
+                      "The number of beams to use for beam search",
+                      typeConverter=TypeConverters.toInt)
+
+     stopTokenIds = Param(Params._dummy(), "stopTokenIds",
+                          "A list of token ids which are considered as stop tokens in the decoder's output",
+                          typeConverter=TypeConverters.toListInt)
+
+
+     def setIgnoreTokenIds(self, value):
+         """Sets a list of token ids which are ignored in the decoder's output.
+
+         Parameters
+         ----------
+         value : List[int]
+             The token ids to be filtered out
+         """
+         return self._set(ignoreTokenIds=value)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setMinOutputLength(self, value):
+         """Sets minimum length of the sequence to be generated.
+
+         Parameters
+         ----------
+         value : int
+             Minimum length of the sequence to be generated
+         """
+         return self._set(minOutputLength=value)
+
+     def setMaxOutputLength(self, value):
+         """Sets maximum length of output text.
+
+         Parameters
+         ----------
+         value : int
+             Maximum length of output text
+         """
+         return self._set(maxOutputLength=value)
+
+     def setDoSample(self, value):
+         """Sets whether or not to use sampling, use greedy decoding otherwise.
+
+         Parameters
+         ----------
+         value : bool
+             Whether or not to use sampling; use greedy decoding otherwise
+         """
+         return self._set(doSample=value)
+
+     def setTemperature(self, value):
+         """Sets the value used to modulate the next token probabilities.
+
+         Parameters
+         ----------
+         value : float
+             The value used to modulate the next token probabilities
+         """
+         return self._set(temperature=value)
+
+     def setTopK(self, value):
+         """Sets the number of highest probability vocabulary tokens to keep for
+         top-k-filtering.
+
+         Parameters
+         ----------
+         value : int
+             Number of highest probability vocabulary tokens to keep
+         """
+         return self._set(topK=value)
+
+     def setTopP(self, value):
+         """Sets the top cumulative probability for vocabulary tokens.
+
+         If set to float < 1, only the most probable tokens with probabilities
+         that add up to ``topP`` or higher are kept for generation.
+
+         Parameters
+         ----------
+         value : float
+             Cumulative probability for vocabulary tokens
+         """
+         return self._set(topP=value)
+
+     def setRepetitionPenalty(self, value):
+         """Sets the parameter for repetition penalty. 1.0 means no penalty.
+
+         Parameters
+         ----------
+         value : float
+             The repetition penalty
+
+         References
+         ----------
+         See `Ctrl: A Conditional Transformer Language Model For Controllable
+         Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+         """
+         return self._set(repetitionPenalty=value)
+
+     def setNoRepeatNgramSize(self, value):
+         """Sets size of n-grams that can only occur once.
+
+         If set to int > 0, all ngrams of that size can only occur once.
+
+         Parameters
+         ----------
+         value : int
+             The n-gram size that can only occur once
+         """
+         return self._set(noRepeatNgramSize=value)
+
+     def setBeamSize(self, value):
+         """Sets the number of beams to use for beam search.
+
+         Parameters
+         ----------
+         value : int
+             The number of beams to use for beam search
+         """
+         return self._set(beamSize=value)
+
+     def setStopTokenIds(self, value):
+         """Sets a list of token ids which are considered as stop tokens in the
+         decoder's output.
+
+         Parameters
+         ----------
+         value : List[int]
+             The token ids to be treated as stop tokens
+         """
+         return self._set(stopTokenIds=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.LLAMA3Transformer", java_model=None):
+         super(LLAMA3Transformer, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             minOutputLength=0,
+             maxOutputLength=20,
+             doSample=False,
+             temperature=0.6,
+             topK=-1,
+             topP=0.9,
+             repetitionPenalty=1.0,
+             noRepeatNgramSize=3,
+             ignoreTokenIds=[],
+             batchSize=1,
+             beamSize=1,
+             stopTokenIds=[128001]
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session, use_openvino=False):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+         use_openvino : bool, optional
+             Whether to load the model with the OpenVINO engine, by default False
+
+         Returns
+         -------
+         LLAMA3Transformer
+             The restored model
+         """
+         from sparknlp.internal import _LLAMA3Loader
+         jModel = _LLAMA3Loader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+         return LLAMA3Transformer(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="llama_3_7b_instruct_hf_int4", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "llama_3_7b_instruct_hf_int4"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         LLAMA3Transformer
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(LLAMA3Transformer, name, lang, remote_loc)
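
The `loadSavedModel` entry point added in this file has no usage example in its docstring, so here is a minimal sketch of the load-then-persist flow. It uses only the API shown in the diff above plus standard Spark ML persistence; the export folder `./llama3_int4_local` and the save path `./llama3_spark_nlp_model` are hypothetical:

import sparknlp
from sparknlp.annotator import LLAMA3Transformer

spark = sparknlp.start()

# Wrap a locally exported model instead of downloading one from the Models Hub.
# "./llama3_int4_local" is a hypothetical path to a previously exported model;
# use_openvino=False keeps the default engine, mirroring the signature above.
llama3 = LLAMA3Transformer.loadSavedModel("./llama3_int4_local", spark, use_openvino=False) \
    .setInputCols(["documents"]) \
    .setOutputCol("generation") \
    .setMaxOutputLength(128)

# Persist the wrapped annotator with standard Spark ML persistence so later
# runs can restore it directly rather than re-importing the export folder.
llama3.write().overwrite().save("./llama3_spark_nlp_model")

Once saved, the annotator can be dropped into a pipeline exactly like the `pretrained()` example in the class docstring above.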