spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator/embeddings/snowflake_embeddings.py
@@ -0,0 +1,202 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for SnowFlakeEmbeddings."""
+
+ from sparknlp.common import *
+
+
+ class SnowFlakeEmbeddings(AnnotatorModel,
+                           HasEmbeddingsProperties,
+                           HasCaseSensitiveProperties,
+                           HasStorageRef,
+                           HasBatchedAnnotate,
+                           HasMaxSentenceLengthLimit):
+     """Sentence embeddings using SnowFlake.
+
+     snowflake-arctic-embed is a suite of text embedding models that focuses on creating
+     high-quality retrieval models optimized for performance.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = SnowFlakeEmbeddings.pretrained() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("SnowFlake_embeddings")
+
+
+     The default model is ``"snowflake_artic_m"``, if no name is provided.
+
+     For available pretrained models please see the
+     `Models Hub <https://sparknlp.org/models?q=SnowFlake>`__.
+
+
+     ====================== =======================
+     Input Annotation types Output Annotation type
+     ====================== =======================
+     ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+     ====================== =======================
+
+     **References**
+
+     `Arctic-Embed: Scalable, Efficient, and Accurate Text Embedding Models <https://arxiv.org/abs/2405.05374>`__
+     `Snowflake Arctic-Embed Models <https://github.com/Snowflake-Labs/arctic-embed>`__
+
+     **Paper abstract**
+
+     *The models are trained by leveraging existing open-source text representation models, such
+     as bert-base-uncased, and are trained in a multi-stage pipeline to optimize their retrieval
+     performance. First, the models are trained with large batches of query-document pairs where
+     negatives are derived in-batch—pretraining leverages about 400m samples of a mix of public
+     datasets and proprietary web search data. Following pretraining models are further optimized
+     with long training on a smaller dataset (about 1m samples) of triplets of query, positive
+     document, and negative document derived from hard negative mining. Mining of the negatives and
+     data curation is crucial to retrieval accuracy. A detailed technical report will be available
+     shortly.*
+
+     Parameters
+     ----------
+     batchSize
+         Size of every batch, by default 8
+     dimension
+         Number of embedding dimensions, by default 1024
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default False
+     maxSentenceLength
+         Max sentence length to process, by default 512
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> embeddings = SnowFlakeEmbeddings.pretrained() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols("embeddings") \\
+     ...     .setOutputCols("finished_embeddings") \\
+     ...     .setOutputAsVector(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     embeddings,
+     ...     embeddingsFinisher
+     ... ])
+     >>> data = spark.createDataFrame([["hello world"], ["hello moon"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                          result|
+     +--------------------------------------------------------------------------------+
+     |[0.50387806, 0.5861606, 0.35129607, -0.76046336, -0.32446072, -0.117674336, 0...|
+     |[0.6660665, 0.961762, 0.24854276, -0.1018044, -0.6569202, 0.027635604, 0.1915...|
+     +--------------------------------------------------------------------------------+
+     """
+
+     name = "SnowFlakeEmbeddings"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+     poolingStrategy = Param(Params._dummy(),
+                             "poolingStrategy",
+                             "Pooling strategy to use for sentence embeddings",
+                             TypeConverters.toString)
+
+     def setPoolingStrategy(self, value):
+         """Pooling strategy to use for sentence embeddings.
+
+         Available pooling strategies for sentence embeddings are:
+         - `"cls"`: leading `[CLS]` token
+         - `"cls_avg"`: leading `[CLS]` token + mean of all other tokens
+         - `"last"`: embeddings of the last token in the sequence
+         - `"avg"`: mean of all tokens
+         - `"max"`: max of all embedding features of the entire token sequence
+         - `"int"`: An integer number, which represents the index of the token to use as the
+           embedding
+
+         Parameters
+         ----------
+         value : str
+             Pooling strategy to use for sentence embeddings
+         """
+
+         valid_strategies = {"cls", "cls_avg", "last", "avg", "max"}
+         if value in valid_strategies or value.isdigit():
+             return self._set(poolingStrategy=value)
+         else:
+             raise ValueError(f"Invalid pooling strategy: {value}. "
+                              f"Valid strategies are: {', '.join(valid_strategies)} or an integer.")
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.SnowFlakeEmbeddings", java_model=None):
+         super(SnowFlakeEmbeddings, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             dimension=1024,
+             batchSize=8,
+             maxSentenceLength=512,
+             caseSensitive=False,
+             poolingStrategy="cls"
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         SnowFlakeEmbeddings
+             The restored model
+         """
+         from sparknlp.internal import _SnowFlakeEmbeddingsLoader
+         jModel = _SnowFlakeEmbeddingsLoader(folder, spark_session._jsparkSession)._java_obj
+         return SnowFlakeEmbeddings(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="snowflake_artic_m", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "snowflake_artic_m"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         SnowFlakeEmbeddings
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(SnowFlakeEmbeddings, name, lang, remote_loc)
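
The pooling strategies documented for `setPoolingStrategy` above are simple reductions over the matrix of token embeddings. The following NumPy sketch shows one plausible reading of each option; it is illustrative only, not Spark NLP's implementation, and the exact `cls_avg` combination is an assumption:

import numpy as np

# Illustrative pooling reductions over a (num_tokens, dim) token-embedding
# matrix, mirroring the setPoolingStrategy docstring above.
def pool(token_embeddings, strategy):
    if strategy == "cls":        # leading [CLS] token
        return token_embeddings[0]
    if strategy == "cls_avg":    # [CLS] combined with the mean of the others
        return (token_embeddings[0] + token_embeddings[1:].mean(axis=0)) / 2  # assumption: averaged
    if strategy == "last":       # embedding of the last token in the sequence
        return token_embeddings[-1]
    if strategy == "avg":        # mean of all tokens
        return token_embeddings.mean(axis=0)
    if strategy == "max":        # feature-wise max over the whole sequence
        return token_embeddings.max(axis=0)
    if strategy.isdigit():       # "int": index of the token to use
        return token_embeddings[int(strategy)]
    raise ValueError(f"Invalid pooling strategy: {strategy}")

tokens = np.random.rand(6, 1024)          # 6 tokens, 1024 dims (the annotator's default)
assert pool(tokens, "cls_avg").shape == (1024,)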
sparknlp/annotator/embeddings/uae_embeddings.py
@@ -0,0 +1,211 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for UAEEmbeddings."""
+
+ from sparknlp.common import *
+
+
+ class UAEEmbeddings(AnnotatorModel,
+                     HasEmbeddingsProperties,
+                     HasCaseSensitiveProperties,
+                     HasStorageRef,
+                     HasBatchedAnnotate,
+                     HasMaxSentenceLengthLimit):
+     """Sentence embeddings using Universal AnglE Embedding (UAE).
+
+     UAE is a novel angle-optimized text embedding model, designed to improve semantic textual
+     similarity tasks, which are crucial for Large Language Model (LLM) applications. By
+     introducing angle optimization in a complex space, AnglE effectively mitigates saturation of
+     the cosine similarity function.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = UAEEmbeddings.pretrained() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("UAE_embeddings")
+
+
+     The default model is ``"uae_large_v1"``, if no name is provided.
+
+     For available pretrained models please see the
+     `Models Hub <https://sparknlp.org/models?q=UAE>`__.
+
+
+     ====================== =======================
+     Input Annotation types Output Annotation type
+     ====================== =======================
+     ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+     ====================== =======================
+
+     Parameters
+     ----------
+     batchSize
+         Size of every batch, by default 8
+     dimension
+         Number of embedding dimensions, by default 1024
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default False
+     maxSentenceLength
+         Max sentence length to process, by default 512
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+
+     References
+     ----------
+
+     `AnglE-optimized Text Embeddings <https://arxiv.org/abs/2309.12871>`__
+     `UAE Github Repository <https://github.com/baochi0212/uae-embedding>`__
+
+     **Paper abstract**
+
+     *High-quality text embedding is pivotal in improving semantic textual similarity (STS) tasks,
+     which are crucial components in Large Language Model (LLM) applications. However, a common
+     challenge existing text embedding models face is the problem of vanishing gradients, primarily
+     due to their reliance on the cosine function in the optimization objective, which has
+     saturation zones. To address this issue, this paper proposes a novel angle-optimized text
+     embedding model called AnglE. The core idea of AnglE is to introduce angle optimization in a
+     complex space. This novel approach effectively mitigates the adverse effects of the saturation
+     zone in the cosine function, which can impede gradient and hinder optimization processes. To
+     set up a comprehensive STS evaluation, we experimented on existing short-text STS datasets and
+     a newly collected long-text STS dataset from GitHub Issues. Furthermore, we examine
+     domain-specific STS scenarios with limited labeled data and explore how AnglE works with
+     LLM-annotated data. Extensive experiments were conducted on various tasks including short-text
+     STS, long-text STS, and domain-specific STS tasks. The results show that AnglE outperforms the
+     state-of-the-art (SOTA) STS models that ignore the cosine saturation zone. These findings
+     demonstrate the ability of AnglE to generate high-quality text embeddings and the usefulness
+     of angle optimization in STS.*
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> embeddings = UAEEmbeddings.pretrained() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols("embeddings") \\
+     ...     .setOutputCols("finished_embeddings") \\
+     ...     .setOutputAsVector(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     embeddings,
+     ...     embeddingsFinisher
+     ... ])
+     >>> data = spark.createDataFrame([["hello world"], ["hello moon"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                          result|
+     +--------------------------------------------------------------------------------+
+     |[0.50387806, 0.5861606, 0.35129607, -0.76046336, -0.32446072, -0.117674336, 0...|
+     |[0.6660665, 0.961762, 0.24854276, -0.1018044, -0.6569202, 0.027635604, 0.1915...|
+     +--------------------------------------------------------------------------------+
+     """
+
+     name = "UAEEmbeddings"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+     poolingStrategy = Param(Params._dummy(),
+                             "poolingStrategy",
+                             "Pooling strategy to use for sentence embeddings",
+                             TypeConverters.toString)
+
+     def setPoolingStrategy(self, value):
+         """Pooling strategy to use for sentence embeddings.
+
+         Available pooling strategies for sentence embeddings are:
+         - `"cls"`: leading `[CLS]` token
+         - `"cls_avg"`: leading `[CLS]` token + mean of all other tokens
+         - `"last"`: embeddings of the last token in the sequence
+         - `"avg"`: mean of all tokens
+         - `"max"`: max of all embedding features of the entire token sequence
+         - `"int"`: An integer number, which represents the index of the token to use as the
+           embedding
+
+         Parameters
+         ----------
+         value : str
+             Pooling strategy to use for sentence embeddings
+         """
+
+         valid_strategies = {"cls", "cls_avg", "last", "avg", "max"}
+         if value in valid_strategies or value.isdigit():
+             return self._set(poolingStrategy=value)
+         else:
+             raise ValueError(f"Invalid pooling strategy: {value}. "
+                              f"Valid strategies are: {', '.join(valid_strategies)} or an integer.")
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.UAEEmbeddings", java_model=None):
+         super(UAEEmbeddings, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             dimension=1024,
+             batchSize=8,
+             maxSentenceLength=512,
+             caseSensitive=False,
+             poolingStrategy="cls"
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         UAEEmbeddings
+             The restored model
+         """
+         from sparknlp.internal import _UAEEmbeddingsLoader
+         jModel = _UAEEmbeddingsLoader(folder, spark_session._jsparkSession)._java_obj
+         return UAEEmbeddings(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="uae_large_v1", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "uae_large_v1"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         UAEEmbeddings
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(UAEEmbeddings, name, lang, remote_loc)
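
Since UAE targets semantic textual similarity, the finished embeddings from the docstring pipeline are typically compared with cosine similarity. A minimal sketch, assuming the `result` DataFrame from the example above with one `finished_embeddings` vector per input row:

import numpy as np

# Collect the two exploded embedding vectors produced by the example pipeline
# and score their similarity; the column name matches EmbeddingsFinisher above.
rows = result.selectExpr("explode(finished_embeddings) as v").collect()
a, b = np.array(rows[0].v), np.array(rows[1].v)
cosine = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"cosine similarity of 'hello world' vs 'hello moon': {cosine:.4f}")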
sparknlp/annotator/embeddings/universal_sentence_encoder.py
@@ -0,0 +1,211 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the UniversalSentenceEncoder."""
+
+ from sparknlp.common import *
+
+
+ class UniversalSentenceEncoder(AnnotatorModel,
+                                HasEmbeddingsProperties,
+                                HasStorageRef,
+                                HasBatchedAnnotate,
+                                HasEngine):
+     """The Universal Sentence Encoder encodes text into high dimensional vectors
+     that can be used for text classification, semantic similarity, clustering
+     and other natural language tasks.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> useEmbeddings = UniversalSentenceEncoder.pretrained() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("sentence_embeddings")
+
+
+     The default model is ``"tfhub_use"``, if no name is provided. For available
+     pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Embeddings>`__.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/ClassifierDL_Train_multi_class_news_category_classifier.ipynb>`__.
+
+     ====================== =======================
+     Input Annotation types Output Annotation type
+     ====================== =======================
+     ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+     ====================== =======================
+
+     Parameters
+     ----------
+     dimension
+         Number of embedding dimensions
+     loadSP
+         Whether to load SentencePiece ops file which is required only by
+         multi-lingual models, by default False
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+
+     References
+     ----------
+     `Universal Sentence Encoder <https://arxiv.org/abs/1803.11175>`__
+
+     https://tfhub.dev/google/universal-sentence-encoder/2
+
+     **Paper abstract:**
+
+     *We present models for encoding sentences into embedding vectors that
+     specifically target transfer learning to other NLP tasks. The models are
+     efficient and result in accurate performance on diverse transfer tasks. Two
+     variants of the encoding models allow for trade-offs between accuracy and
+     compute resources. For both variants, we investigate and report the
+     relationship between model complexity, resource consumption, the
+     availability of transfer task training data, and task performance.
+     Comparisons are made with baselines that use word level transfer learning
+     via pretrained word embeddings as well as baselines that do not use any
+     transfer learning. We find that transfer learning using sentence embeddings
+     tends to outperform word level transfer. With transfer learning via sentence
+     embeddings, we observe surprisingly good performance with minimal amounts of
+     supervised training data for a transfer task. We obtain encouraging results
+     on Word Embedding Association Tests (WEAT) targeted at detecting model bias.
+     Our pre-trained sentence encoding models are made freely available for
+     download and on TF Hub.*
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> embeddings = UniversalSentenceEncoder.pretrained() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("sentence_embeddings")
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols(["sentence_embeddings"]) \\
+     ...     .setOutputCols("finished_embeddings") \\
+     ...     .setOutputAsVector(True) \\
+     ...     .setCleanAnnotations(False)
+     >>> pipeline = Pipeline() \\
+     ...     .setStages([
+     ...         documentAssembler,
+     ...         sentence,
+     ...         embeddings,
+     ...         embeddingsFinisher
+     ...     ])
+     >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                          result|
+     +--------------------------------------------------------------------------------+
+     |[0.04616805538535118,0.022307956591248512,-0.044395286589860916,-0.0016493503...|
+     +--------------------------------------------------------------------------------+
+     """
+
+     name = "UniversalSentenceEncoder"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+     loadSP = Param(Params._dummy(), "loadSP",
+                    "Whether to load SentencePiece ops file which is required only by multi-lingual models. "
+                    "This is not changeable after it's set with a pretrained model nor it is compatible with Windows.",
+                    typeConverter=TypeConverters.toBoolean)
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def setLoadSP(self, value):
+         """Sets whether to load SentencePiece ops file which is required only by
+         multi-lingual models, by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to load SentencePiece ops file which is required only by
+             multi-lingual models
+         """
+         return self._set(loadSP=value)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.UniversalSentenceEncoder", java_model=None):
+         super(UniversalSentenceEncoder, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             loadSP=False,
+             dimension=512,
+             batchSize=2
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session, loadsp=False):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         UniversalSentenceEncoder
+             The restored model
+         """
+         from sparknlp.internal import _USELoader
+         jModel = _USELoader(folder, spark_session._jsparkSession, loadsp)._java_obj
+         return UniversalSentenceEncoder(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="tfhub_use", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "tfhub_use"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         UniversalSentenceEncoder
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(UniversalSentenceEncoder, name, lang, remote_loc)
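
For models imported from disk rather than downloaded, `loadSavedModel` restores a locally exported encoder, and the result can be persisted like any Spark ML stage. A minimal sketch with placeholder paths; per the `loadSP` documentation above, `loadsp=True` would apply only to the multilingual SentencePiece variants:

from sparknlp.annotator import UniversalSentenceEncoder

# Restore an exported USE model from a local folder (path is a placeholder),
# wire it into the usual DOCUMENT -> SENTENCE_EMBEDDINGS columns, and save it
# with standard Spark ML persistence for later reuse via .load().
use = UniversalSentenceEncoder.loadSavedModel("/models/use_exported", spark) \
    .setInputCols(["sentence"]) \
    .setOutputCol("sentence_embeddings")
use.write().overwrite().save("/models/spark_nlp_use")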