spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
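
The most significant structural change visible in this list is that the monolithic modules of 2.6.3rc1 (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, etc.) were removed and replaced by the sparknlp/annotator, sparknlp/base, and sparknlp/common packages. As a minimal sketch (assuming the package __init__ files re-export their contents, which the +93-line sparknlp/annotator/__init__.py suggests), both the old flat imports and the new explicit submodule paths should resolve to the same classes:

    from sparknlp.base import DocumentAssembler               # re-exported, as in 2.x
    from sparknlp.annotator import Tokenizer                  # re-exported, as in 2.x
    from sparknlp.annotator.token.tokenizer import Tokenizer  # new explicit path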
@@ -0,0 +1,204 @@ sparknlp/annotator/embeddings/instructor_embeddings.py
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for InstructorEmbeddings."""
+
+ from sparknlp.common import *
+
+
+ class InstructorEmbeddings(AnnotatorModel,
+                            HasEmbeddingsProperties,
+                            HasCaseSensitiveProperties,
+                            HasStorageRef,
+                            HasBatchedAnnotate,
+                            HasMaxSentenceLengthLimit):
+     """Sentence embeddings using INSTRUCTOR.
+
+     INSTRUCTOR is an instruction-finetuned text embedding model that can
+     generate text embeddings tailored to any task (e.g., classification,
+     retrieval, clustering, text evaluation) and domain (e.g., science,
+     finance) simply by providing the task instruction, without any further
+     finetuning. INSTRUCTOR achieves state-of-the-art results on 70 diverse
+     embedding tasks.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = InstructorEmbeddings.pretrained() \\
+     ... .setInputCols(["document"]) \\
+     ... .setInstruction("Represent the Medicine sentence for clustering: ") \\
+     ... .setOutputCol("instructor_embeddings")
+
+     The default model is ``"instructor_base"``, if no name is provided.
+
+     For available pretrained models please see the
+     `Models Hub <https://sparknlp.org/models?q=Instructor>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Size of every batch, by default 8
+     dimension
+         Number of embedding dimensions, by default 768
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default False
+     instruction
+         Set transformer instruction, e.g. 'summarize:'
+     maxSentenceLength
+         Max sentence length to process, by default 128
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+
+     References
+     ----------
+     `One Embedder, Any Task: Instruction-Finetuned Text Embeddings <https://arxiv.org/abs/2212.09741>`__
+
+     https://github.com/HKUNLP/instructor-embedding/
+
+     **Paper abstract**
+
+     *We introduce INSTRUCTOR, a new method for computing text embeddings given task instructions:
+     every text input is embedded together with instructions explaining the use case (e.g., task and
+     domain descriptions). Unlike encoders from prior work that are more specialized, INSTRUCTOR is a
+     single embedder that can generate text embeddings tailored to different downstream tasks and domains,
+     without any further training. We first annotate instructions for 330 diverse tasks and train INSTRUCTOR
+     on this multitask mixture with a contrastive loss. We evaluate INSTRUCTOR on 70 embedding evaluation tasks
+     (66 of which are unseen during training), ranging from classification and information retrieval to semantic
+     textual similarity and text generation evaluation. INSTRUCTOR, while having an order of magnitude fewer
+     parameters than the previous best model, achieves state-of-the-art performance, with an average improvement
+     of 3.4% compared to the previous best results on the 70 diverse datasets. Our analysis suggests that
+     INSTRUCTOR is robust to changes in instructions, and that instruction finetuning mitigates the challenge of
+     training a single model on diverse datasets. Our model, code, and data are available at
+     https://instructor-embedding.github.io/.*
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ... .setInputCol("text") \\
+     ... .setOutputCol("document")
+     >>> embeddings = InstructorEmbeddings.pretrained() \\
+     ... .setInputCols(["document"]) \\
+     ... .setInstruction("Represent the Medicine sentence for clustering: ") \\
+     ... .setOutputCol("instructor_embeddings")
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ... .setInputCols(["instructor_embeddings"]) \\
+     ... .setOutputCols("finished_embeddings") \\
+     ... .setOutputAsVector(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     embeddings,
+     ...     embeddingsFinisher
+     ... ])
+     >>> data = spark.createDataFrame([["Dynamical Scalar Degree of Freedom in Horava-Lifshitz Gravity"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                          result|
+     +--------------------------------------------------------------------------------+
+     |[-2.3497989177703857,0.480538547039032,-0.3238905668258667,-1.612930893898010...|
+     +--------------------------------------------------------------------------------+
+     """
+
+     name = "InstructorEmbeddings"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+     instruction = Param(Params._dummy(), "instruction",
+                         "Set transformer instruction, e.g. 'summarize:'",
+                         typeConverter=TypeConverters.toString)
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def setInstruction(self, value):
+         """Sets transformer instruction, e.g. 'summarize:'.
+
+         Parameters
+         ----------
+         value : str
+         """
+         return self._set(instruction=value)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.InstructorEmbeddings", java_model=None):
+         super(InstructorEmbeddings, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             dimension=768,
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=False,
+             instruction="",
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         InstructorEmbeddings
+             The restored model
+         """
+         from sparknlp.internal import _InstructorLoader
+         jModel = _InstructorLoader(folder, spark_session._jsparkSession)._java_obj
+         return InstructorEmbeddings(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="instructor_base", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "instructor_base"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         InstructorEmbeddings
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(InstructorEmbeddings, name, lang, remote_loc)
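
The loadSavedModel helper above is documented but not demonstrated. A hedged sketch of how it would be wired up follows (the export folder path is hypothetical, standing in for a model exported from the HuggingFace checkpoint):

    import sparknlp
    from sparknlp.annotator import InstructorEmbeddings

    spark = sparknlp.start()
    # "/models/instructor_base" is a placeholder for a locally exported model folder
    embeddings = InstructorEmbeddings.loadSavedModel("/models/instructor_base", spark) \
        .setInputCols(["document"]) \
        .setInstruction("Represent the Medicine sentence for clustering: ") \
        .setOutputCol("instructor_embeddings")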
@@ -0,0 +1,211 @@ sparknlp/annotator/embeddings/longformer_embeddings.py
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for LongformerEmbeddings."""
+
+ from sparknlp.common import *
+
+
+ class LongformerEmbeddings(AnnotatorModel,
+                            HasEmbeddingsProperties,
+                            HasCaseSensitiveProperties,
+                            HasStorageRef,
+                            HasBatchedAnnotate,
+                            HasEngine,
+                            HasLongMaxSentenceLengthLimit):
+     """Longformer is a transformer model for long documents. The Longformer
+     model was presented in `Longformer: The Long-Document Transformer` by Iz
+     Beltagy, Matthew E. Peters, and Arman Cohan. longformer-base-4096 is a
+     BERT-like model started from the RoBERTa checkpoint and pretrained for
+     MLM on long documents. It supports sequences of length up to 4,096.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = LongformerEmbeddings.pretrained() \\
+     ... .setInputCols(["document", "token"]) \\
+     ... .setOutputCol("embeddings")
+
+     The default model is ``"longformer_base_4096"``, if no name is provided. For
+     available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Embeddings>`__.
+
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``WORD_EMBEDDINGS``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Size of every batch, by default 8
+     dimension
+         Number of embedding dimensions, by default 768
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     maxSentenceLength
+         Max sentence length to process, by default 1024
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+
+     References
+     ----------
+     `Longformer: The Long-Document Transformer
+     <https://arxiv.org/pdf/2004.05150.pdf>`__
+
+     **Paper Abstract:**
+
+     *Transformer-based models are unable to process long sequences due to their
+     self-attention operation, which scales quadratically with the sequence
+     length. To address this limitation, we introduce the Longformer with an
+     attention mechanism that scales linearly with sequence length, making it
+     easy to process documents of thousands of tokens or longer. Longformer's
+     attention mechanism is a drop-in replacement for the standard self-attention
+     and combines a local windowed attention with a task motivated global
+     attention. Following prior work on long-sequence transformers, we evaluate
+     Longformer on character-level language modeling and achieve state-of-the-art
+     results on text8 and enwik8. In contrast to most prior work, we also
+     pretrain Longformer and finetune it on a variety of downstream tasks. Our
+     pretrained Longformer consistently outperforms RoBERTa on long document
+     tasks and sets new state-of-the-art results on WikiHop and TriviaQA. We
+     finally introduce the Longformer-Encoder-Decoder (LED), a Longformer variant
+     for supporting long document generative sequence-to-sequence tasks, and
+     demonstrate its effectiveness on the arXiv summarization dataset.*
+
+     The original code can be found at `Longformer: The Long-Document Transformer
+     <https://github.com/allenai/longformer>`__.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ... .setInputCol("text") \\
+     ... .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ... .setInputCols(["document"]) \\
+     ... .setOutputCol("token")
+     >>> embeddings = LongformerEmbeddings.pretrained() \\
+     ... .setInputCols(["document", "token"]) \\
+     ... .setOutputCol("embeddings") \\
+     ... .setCaseSensitive(True)
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ... .setInputCols(["embeddings"]) \\
+     ... .setOutputCols("finished_embeddings") \\
+     ... .setOutputAsVector(True) \\
+     ... .setCleanAnnotations(False)
+     >>> pipeline = Pipeline() \\
+     ... .setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     embeddings,
+     ...     embeddingsFinisher
+     ... ])
+     >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                          result|
+     +--------------------------------------------------------------------------------+
+     |[0.18792399764060974,-0.14591649174690247,0.20547787845134735,0.1468472778797...|
+     |[0.22845706343650818,0.18073144555091858,0.09725798666477203,-0.0417917296290...|
+     |[0.07037967443466187,-0.14801117777824402,-0.03603338822722435,-0.17893412709...|
+     |[-0.08734266459941864,0.2486150562763214,-0.009067727252840996,-0.24408400058...|
+     |[0.22409197688102722,-0.4312366545200348,0.1401449590921402,0.356410235166549...|
+     +--------------------------------------------------------------------------------+
+     """
+     name = "LongformerEmbeddings"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.LongformerEmbeddings", java_model=None):
+         super(LongformerEmbeddings, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             dimension=768,
+             batchSize=8,
+             maxSentenceLength=1024,
+             caseSensitive=True
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         LongformerEmbeddings
+             The restored model
+         """
+         from sparknlp.internal import _LongformerLoader
+         jModel = _LongformerLoader(folder, spark_session._jsparkSession)._java_obj
+         return LongformerEmbeddings(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="longformer_base_4096", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "longformer_base_4096"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         LongformerEmbeddings
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(LongformerEmbeddings, name, lang, remote_loc)
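
The docstring notes that longformer_base_4096 supports sequences of up to 4,096 tokens, while the annotator defaults to maxSentenceLength=1024. A minimal sketch of opting into the full context window, assuming setMaxSentenceLength is exposed by the HasLongMaxSentenceLengthLimit mixin shown in the class signature:

    embeddings = LongformerEmbeddings.pretrained("longformer_base_4096") \
        .setInputCols(["document", "token"]) \
        .setOutputCol("embeddings") \
        .setMaxSentenceLength(4096)  # trades memory and runtime for the full window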
@@ -0,0 +1,189 @@ sparknlp/annotator/embeddings/minilm_embeddings.py
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for MiniLMEmbeddings."""
+
+ from sparknlp.common import *
+
+
+ class MiniLMEmbeddings(AnnotatorModel,
+                        HasEmbeddingsProperties,
+                        HasCaseSensitiveProperties,
+                        HasStorageRef,
+                        HasBatchedAnnotate,
+                        HasMaxSentenceLengthLimit):
+     """Sentence embeddings using MiniLM.
+
+     MiniLM is a lightweight and efficient sentence embedding model that can
+     generate text embeddings for various NLP tasks (e.g., classification,
+     retrieval, clustering, text evaluation).
+
+     Note that this annotator is only supported for Spark versions 3.4 and up.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = MiniLMEmbeddings.pretrained() \\
+     ... .setInputCols(["document"]) \\
+     ... .setOutputCol("minilm_embeddings")
+
+     The default model is ``"minilm_l6_v2"``, if no name is provided.
+
+     For available pretrained models please see the
+     `Models Hub <https://sparknlp.org/models?q=MiniLM>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Size of every batch, by default 8
+     dimension
+         Number of embedding dimensions, by default 384
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default False
+     maxSentenceLength
+         Max sentence length to process, by default 512
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+
+     References
+     ----------
+     `MiniLM: Deep Self-Attention Distillation for Task-Agnostic Compression of Pre-Trained Transformers <https://arxiv.org/abs/2002.10957>`__
+
+     `MiniLM Github Repository <https://github.com/microsoft/unilm/tree/master/minilm>`__
+
+     **Paper abstract**
+
+     *We present a simple and effective approach to compress large pre-trained Transformer models
+     by distilling the self-attention module of the last Transformer layer. The compressed model
+     (called MiniLM) can be trained with task-agnostic distillation and then fine-tuned on various
+     downstream tasks. We evaluate MiniLM on the GLUE benchmark and show that it achieves comparable
+     results with BERT-base while being 4.3x smaller and 5.5x faster. We also show that MiniLM can
+     be further compressed to 22x smaller and 12x faster than BERT-base while maintaining comparable
+     performance.*
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ... .setInputCol("text") \\
+     ... .setOutputCol("document")
+     >>> embeddings = MiniLMEmbeddings.pretrained() \\
+     ... .setInputCols(["document"]) \\
+     ... .setOutputCol("minilm_embeddings")
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ... .setInputCols(["minilm_embeddings"]) \\
+     ... .setOutputCols("finished_embeddings") \\
+     ... .setOutputAsVector(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     embeddings,
+     ...     embeddingsFinisher
+     ... ])
+     >>> data = spark.createDataFrame([
+     ...     ["This is a sample sentence for embedding generation."],
+     ...     ["Another example sentence to demonstrate MiniLM embeddings."]
+     ... ]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                          result|
+     +--------------------------------------------------------------------------------+
+     |[0.1234567, -0.2345678, 0.3456789, -0.4567890, 0.5678901, -0.6789012...         |
+     |[0.2345678, -0.3456789, 0.4567890, -0.5678901, 0.6789012, -0.7890123...         |
+     +--------------------------------------------------------------------------------+
+     """
+
+     name = "MiniLMEmbeddings"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.MiniLMEmbeddings", java_model=None):
+         super(MiniLMEmbeddings, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             dimension=384,
+             batchSize=8,
+             maxSentenceLength=512,
+             caseSensitive=False,
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session, use_openvino=False):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+         use_openvino : bool
+             Whether to use the OpenVINO backend
+
+         Returns
+         -------
+         MiniLMEmbeddings
+             The restored model
+         """
+         from sparknlp.internal import _MiniLMLoader
+         jModel = _MiniLMLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+         return MiniLMEmbeddings(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="minilm_l6_v2", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "minilm_l6_v2"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         MiniLMEmbeddings
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(MiniLMEmbeddings, name, lang, remote_loc)
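
Unlike the other two loaders in this diff, MiniLMEmbeddings.loadSavedModel accepts a use_openvino flag. A hedged sketch of the OpenVINO path documented above (the export folder path is hypothetical):

    import sparknlp
    from sparknlp.annotator import MiniLMEmbeddings

    spark = sparknlp.start()
    # "/models/minilm_l6_v2" is a placeholder for a locally exported model folder
    embeddings = MiniLMEmbeddings.loadSavedModel("/models/minilm_l6_v2", spark, use_openvino=True) \
        .setInputCols(["document"]) \
        .setOutputCol("minilm_embeddings")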