spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329) hide show
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,195 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for E5Embeddings."""
15
+
16
+ from sparknlp.common import *
17
+
18
+
19
class E5Embeddings(AnnotatorModel,
                   HasEmbeddingsProperties,
                   HasCaseSensitiveProperties,
                   HasStorageRef,
                   HasBatchedAnnotate,
                   HasMaxSentenceLengthLimit):
    """Sentence embeddings using E5.

    E5, a weakly supervised text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.)
    Note that this annotator is only supported for Spark Versions 3.4 and up.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> embeddings = E5Embeddings.pretrained() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("e5_embeddings")


    The default model is ``"e5_small"``, if no name is provided.

    For available pretrained models please see the
    `Models Hub <https://sparknlp.org/models?q=E5>`__.


    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Size of every batch, by default 8
    dimension
        Number of embedding dimensions, by default 768
    caseSensitive
        Whether to ignore case in tokens for embeddings matching, by default False
    maxSentenceLength
        Max sentence length to process, by default 512
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.

    References
    ----------
    `Text Embeddings by Weakly-Supervised Contrastive Pre-training <https://arxiv.org/pdf/2212.03533>`__

    https://github.com/microsoft/unilm/tree/master/e5

    **Paper abstract**

    *This paper presents E5, a family of state-of-the-art text embeddings that transfer
    well to a wide range of tasks. The model is trained in a contrastive manner with
    weak supervision signals from our curated large-scale text pair dataset (called
    CCPairs). E5 can be readily used as a general-purpose embedding model for any
    tasks requiring a single-vector representation of texts such as retrieval, clustering,
    and classification, achieving strong performance in both zero-shot and fine-tuned
    settings. We conduct extensive evaluations on 56 datasets from the BEIR and
    MTEB benchmarks. For zero-shot settings, E5 is the first model that outperforms
    the strong BM25 baseline on the BEIR retrieval benchmark without using any
    labeled data. When fine-tuned, E5 obtains the best results on the MTEB benchmark,
    beating existing embedding models with 40× more parameters.*

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> embeddings = E5Embeddings.pretrained() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("e5_embeddings")
    >>> embeddingsFinisher = EmbeddingsFinisher() \\
    ...     .setInputCols(["e5_embeddings"]) \\
    ...     .setOutputCols("finished_embeddings") \\
    ...     .setOutputAsVector(True)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     embeddings,
    ...     embeddingsFinisher
    ... ])
    >>> data = spark.createDataFrame([
    ...     ["query: how much protein should a female eat"],
    ...     ["passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. " +
    ...      "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a " +
    ...      "marathon. Check out the chart below to see how much protein you should be eating each day."],
    ... ]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
    +--------------------------------------------------------------------------------+
    |                                                                          result|
    +--------------------------------------------------------------------------------+
    |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...|
    |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...|
    +--------------------------------------------------------------------------------+
    """

    # Annotator name used by this Python wrapper.
    name = "E5Embeddings"

    # Annotation types consumed and produced by this annotator.
    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS

    # Serialized TensorFlow ConfigProto, passed as a list of ints.
    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListInt)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.E5Embeddings", java_model=None):
        """Initializes the E5Embeddings annotator and its default parameters.

        Parameters
        ----------
        classname : str, optional
            The Java class name of the annotator, by default
            "com.johnsnowlabs.nlp.embeddings.E5Embeddings"
        java_model : optional
            A pre-initialized Java model, by default None
        """
        super(E5Embeddings, self).__init__(
            classname=classname,
            java_model=java_model
        )
        # Defaults mirror the values documented in the class docstring.
        self._setDefault(
            dimension=768,
            batchSize=8,
            maxSentenceLength=512,
            caseSensitive=False,
        )

    @staticmethod
    def loadSavedModel(folder, spark_session, use_openvino=False):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession
        use_openvino : bool
            Use OpenVINO backend

        Returns
        -------
        E5Embeddings
            The restored model
        """
        # Imported lazily, inside the method, as in the original code.
        from sparknlp.internal import _E5Loader
        jModel = _E5Loader(folder, spark_session._jsparkSession, use_openvino)._java_obj
        return E5Embeddings(java_model=jModel)

    @staticmethod
    def pretrained(name="e5_small", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "e5_small"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        E5Embeddings
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(E5Embeddings, name, lang, remote_loc)
@@ -0,0 +1,138 @@
1
+ # Copyright 2017-2024 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from sparknlp.common import *
16
+
17
class E5VEmbeddings(AnnotatorModel,
                    HasBatchedAnnotateImage,
                    HasImageFeatureProperties,
                    HasEngine,
                    HasRescaleFactor):
    r"""Universal multimodal embeddings using the E5-V model (see https://huggingface.co/royokong/e5-v).

    E5-V bridges the modality gap between different input types (text, image) and demonstrates strong performance in multimodal embeddings, even without fine-tuning. It also supports a single-modality training approach, where the model is trained exclusively on text pairs, often yielding better performance than multimodal training.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion object:

    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
    ...     .setInputCols(["image_assembler"]) \
    ...     .setOutputCol("e5v")

    The default model is ``"e5v_int4"``, if no name is provided.

    For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Question+Answering>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``IMAGE``              ``SENTENCE_EMBEDDINGS``
    ====================== ======================

    Examples
    --------
    Image + Text Embedding:

    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> from pyspark.sql.functions import lit
    >>> image_df = spark.read.format("image").option("dropInvalid", value = True).load(imageFolder)
    >>> imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
    >>> test_df = image_df.withColumn("text", lit(imagePrompt))
    >>> imageAssembler = ImageAssembler() \
    ...     .setInputCol("image") \
    ...     .setOutputCol("image_assembler")
    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
    ...     .setInputCols(["image_assembler"]) \
    ...     .setOutputCol("e5v")
    >>> pipeline = Pipeline().setStages([
    ...     imageAssembler,
    ...     e5vEmbeddings
    ... ])
    >>> result = pipeline.fit(test_df).transform(test_df)
    >>> result.select("e5v.embeddings").show(truncate = False)

    Text-Only Embedding:

    >>> from sparknlp.util import EmbeddingsDataFrameUtils
    >>> textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
    >>> textDesc = "A cat sitting in a box."
    >>> nullImageDF = spark.createDataFrame(spark.sparkContext.parallelize([EmbeddingsDataFrameUtils.emptyImageRow]), EmbeddingsDataFrameUtils.imageSchema)
    >>> textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))
    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
    ...     .setInputCols(["image"]) \
    ...     .setOutputCol("e5v")
    >>> result = e5vEmbeddings.transform(textDF)
    >>> result.select("e5v.embeddings").show(truncate = False)
    """

    # Annotator name used by this Python wrapper.
    name = "E5VEmbeddings"

    # Annotation types consumed and produced by this annotator.
    inputAnnotatorTypes = [AnnotatorType.IMAGE]
    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.E5VEmbeddings", java_model=None):
        """Initializes the E5VEmbeddings annotator.

        Parameters
        ----------
        classname : str, optional
            The Java class name of the annotator, by default "com.johnsnowlabs.nlp.embeddings.E5VEmbeddings"
        java_model : Optional[java.lang.Object], optional
            A pre-initialized Java model, by default None
        """
        super(E5VEmbeddings, self).__init__(classname=classname, java_model=java_model)
        # No Python-side defaults are set for this annotator.
        self._setDefault()

    @staticmethod
    def loadSavedModel(folder, spark_session, use_openvino=False):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession
        use_openvino : bool, optional
            Whether to use OpenVINO engine, by default False

        Returns
        -------
        E5VEmbeddings
            The restored model
        """
        # Imported lazily, inside the method, as in the original code.
        from sparknlp.internal import _E5VEmbeddingsLoader
        jModel = _E5VEmbeddingsLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
        return E5VEmbeddings(java_model=jModel)

    @staticmethod
    def pretrained(name="e5v_int4", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "e5v_int4"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use Spark NLP's repositories otherwise.

        Returns
        -------
        E5VEmbeddings
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(E5VEmbeddings, name, lang, remote_loc)
@@ -0,0 +1,251 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for ElmoEmbeddings."""
15
+
16
+ from sparknlp.common import *
17
+
18
+
19
class ElmoEmbeddings(AnnotatorModel,
                     HasEmbeddingsProperties,
                     HasCaseSensitiveProperties,
                     HasStorageRef,
                     HasEngine):
    """Word embeddings from ELMo (Embeddings from Language Models), a language
    model trained on the 1 Billion Word Benchmark.

    Note that this is a very computationally expensive module compared to word
    embedding modules that only perform embedding lookups. The use of an
    accelerator is recommended.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> embeddings = ElmoEmbeddings.pretrained() \\
    ...     .setInputCols(["sentence", "token"]) \\
    ...     .setOutputCol("elmo_embeddings")


    The default model is ``"elmo"``, if no name is provided.

    For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.

    The pooling layer can be set with :meth:`.setPoolingLayer` to the following
    values:

    - ``"word_emb"``: the character-based word representations with shape
      ``[batch_size, max_length, 512]``.
    - ``"lstm_outputs1"``: the first LSTM hidden state with shape
      ``[batch_size, max_length, 1024]``.
    - ``"lstm_outputs2"``: the second LSTM hidden state with shape
      ``[batch_size, max_length, 1024]``.
    - ``"elmo"``: the weighted sum of the 3 layers, where the weights are
      trainable. This tensor has shape ``[batch_size, max_length, 1024]``.

    For extended examples of usage, see the
    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_elmo.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, TOKEN``    ``WORD_EMBEDDINGS``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Batch size. Large values allow faster processing but require more
        memory, by default 32
    dimension
        Number of embedding dimensions
    caseSensitive
        Whether to ignore case in tokens for embeddings matching
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.
    poolingLayer
        Set ELMO pooling layer to: word_emb, lstm_outputs1, lstm_outputs2, or
        elmo, by default word_emb

    References
    ----------
    https://tfhub.dev/google/elmo/3

    `Deep contextualized word representations <https://arxiv.org/abs/1802.05365>`__

    **Paper abstract:**

    *We introduce a new type of deep contextualized word representation that
    models both (1) complex characteristics of word use (e.g., syntax and
    semantics), and (2) how these uses vary across linguistic contexts (i.e.,
    to model polysemy). Our word vectors are learned functions of the internal
    states of a deep bidirectional language model (biLM), which is pre-trained
    on a large text corpus. We show that these representations can be easily
    added to existing models and significantly improve the state of the art
    across six challenging NLP problems, including question answering, textual
    entailment and sentiment analysis. We also present an analysis showing that
    exposing the deep internals of the pre-trained network is crucial, allowing
    downstream models to mix different types of semi-supervision signals.*

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = ElmoEmbeddings.pretrained() \\
    ...     .setPoolingLayer("word_emb") \\
    ...     .setInputCols(["token", "document"]) \\
    ...     .setOutputCol("embeddings")
    >>> embeddingsFinisher = EmbeddingsFinisher() \\
    ...     .setInputCols(["embeddings"]) \\
    ...     .setOutputCols("finished_embeddings") \\
    ...     .setOutputAsVector(True) \\
    ...     .setCleanAnnotations(False)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     embeddings,
    ...     embeddingsFinisher
    ... ])
    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
    +--------------------------------------------------------------------------------+
    |                                                                          result|
    +--------------------------------------------------------------------------------+
    |[6.662458181381226E-4,-0.2541114091873169,-0.6275503039360046,0.5787073969841...|
    |[0.19154725968837738,0.22998669743537903,-0.2894386649131775,0.21524395048618...|
    |[0.10400570929050446,0.12288510054349899,-0.07056470215320587,-0.246389418840...|
    |[0.49932169914245605,-0.12706467509269714,0.30969417095184326,0.2643227577209...|
    |[-0.8871506452560425,-0.20039963722229004,-1.0601330995559692,0.0348707810044...|
    +--------------------------------------------------------------------------------+
    """

    name = "ElmoEmbeddings"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS

    # Param declarations: each one names the parameter, gives its help text
    # and the converter applied to assigned values.
    batchSize = Param(
        Params._dummy(),
        "batchSize",
        "Batch size. Large values allows faster processing but requires more memory.",
        typeConverter=TypeConverters.toInt,
    )

    configProtoBytes = Param(
        Params._dummy(),
        "configProtoBytes",
        "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
        typeConverter=TypeConverters.toListInt,
    )

    poolingLayer = Param(
        Params._dummy(),
        "poolingLayer",
        "Set ELMO pooling layer to: word_emb, lstm_outputs1, lstm_outputs2, or elmo",
        typeConverter=TypeConverters.toString,
    )

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    def setBatchSize(self, value):
        """Sets batch size, by default 32.

        Parameters
        ----------
        value : int
            Batch size
        """
        return self._set(batchSize=value)

    def setPoolingLayer(self, layer):
        """Sets ELMO pooling layer to: word_emb, lstm_outputs1, lstm_outputs2, or
        elmo, by default word_emb

        Any value other than the four names above is not rejected; the
        parameter is set to ``"word_emb"`` instead.

        Parameters
        ----------
        layer : str
            ELMO pooling layer
        """
        supported = ("word_emb", "lstm_outputs1", "lstm_outputs2", "elmo")
        chosen = layer if layer in supported else "word_emb"
        return self._set(poolingLayer=chosen)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings", java_model=None):
        """Initializes the ElmoEmbeddings annotator.

        Parameters
        ----------
        classname : str, optional
            The Java class name of the annotator, by default
            "com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings"
        java_model : optional
            A pre-initialized Java model, by default None
        """
        super(ElmoEmbeddings, self).__init__(classname=classname, java_model=java_model)
        self._setDefault(batchSize=32, poolingLayer="word_emb")

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession

        Returns
        -------
        ElmoEmbeddings
            The restored model
        """
        from sparknlp.internal import _ElmoLoader

        # The loader receives the JVM-side session handle, not the Python wrapper.
        loader = _ElmoLoader(folder, spark_session._jsparkSession)
        return ElmoEmbeddings(java_model=loader._java_obj)

    @staticmethod
    def pretrained(name="elmo", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "elmo"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        ElmoEmbeddings
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader

        model = ResourceDownloader.downloadModel(ElmoEmbeddings, name, lang, remote_loc)
        return model