spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329) hide show
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,208 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for BertEmbeddings."""
15
+
16
+ from sparknlp.common import *
17
+
18
+
19
class BertEmbeddings(AnnotatorModel,
                     HasEmbeddingsProperties,
                     HasCaseSensitiveProperties,
                     HasStorageRef,
                     HasBatchedAnnotate,
                     # HasEngine keeps the mixin set in line with
                     # BertSentenceEmbeddings; this class can also be loaded
                     # with a non-default backend (see ``loadSavedModel``).
                     HasEngine,
                     HasMaxSentenceLengthLimit):
    """Token-level embeddings using BERT.

    BERT (Bidirectional Encoder Representations from Transformers) provides
    dense vector representations for natural language by using a deep,
    pre-trained neural network with the Transformer architecture.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> embeddings = BertEmbeddings.pretrained() \\
    ...     .setInputCols(["token", "document"]) \\
    ...     .setOutputCol("bert_embeddings")


    The default model is ``"small_bert_L2_768"``, if no name is provided.

    For available pretrained models please see the
    `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_bert.ipynb>`__.
    To see which models are compatible and how to import them see
    `Import Transformers into Spark NLP 🚀
    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, TOKEN``    ``WORD_EMBEDDINGS``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Size of every batch, by default 8
    dimension
        Number of embedding dimensions, by default 768
    caseSensitive
        Whether to ignore case in tokens for embeddings matching, by default
        False
    maxSentenceLength
        Max sentence length to process, by default 128
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.

    References
    ----------
    `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__

    https://github.com/google-research/bert

    **Paper abstract**

    *We introduce a new language representation model called BERT, which stands
    for Bidirectional Encoder Representations from Transformers. Unlike recent
    language representation models, BERT is designed to pre-train deep
    bidirectional representations from unlabeled text by jointly conditioning on
    both left and right context in all layers. As a result, the pre-trained BERT
    model can be fine-tuned with just one additional output layer to create
    state-of-the-art models for a wide range of tasks, such as question
    answering and language inference, without substantial task-specific
    architecture modifications. BERT is conceptually simple and empirically
    powerful. It obtains new state-of-the-art results on eleven natural language
    processing tasks, including pushing the GLUE score to 80.5% (7.7% point
    absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute
    improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point
    absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute
    improvement).*

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = BertEmbeddings.pretrained("small_bert_L2_128", "en") \\
    ...     .setInputCols(["token", "document"]) \\
    ...     .setOutputCol("bert_embeddings")
    >>> embeddingsFinisher = EmbeddingsFinisher() \\
    ...     .setInputCols(["bert_embeddings"]) \\
    ...     .setOutputCols("finished_embeddings") \\
    ...     .setOutputAsVector(True)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     embeddings,
    ...     embeddingsFinisher
    ... ])
    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
    +--------------------------------------------------------------------------------+
    |                                                                          result|
    +--------------------------------------------------------------------------------+
    |[-2.3497989177703857,0.480538547039032,-0.3238905668258667,-1.612930893898010...|
    |[-2.1357314586639404,0.32984697818756104,-0.6032363176345825,-1.6791689395904...|
    |[-1.8244884014129639,-0.27088963985443115,-1.059438943862915,-0.9817547798156...|
    |[-1.1648050546646118,-0.4725411534309387,-0.5938255786895752,-1.5780693292617...|
    |[-0.9125322699546814,0.4563939869403839,-0.3975459933280945,-1.81611204147338...|
    +--------------------------------------------------------------------------------+
    """

    name = "BertEmbeddings"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS

    # The serialized ConfigProto is carried as a list of ints (one per byte),
    # hence the toListInt converter rather than a bytes/string converter.
    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListInt)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array

        Returns
        -------
        The result of ``self._set(...)``, returned unchanged so the call can
        be chained with other setters.
        """
        return self._set(configProtoBytes=b)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.BertEmbeddings", java_model=None):
        """Creates the annotator and registers its default parameter values.

        Parameters
        ----------
        classname : str, optional
            Fully qualified name of the backing JVM class, by default
            "com.johnsnowlabs.nlp.embeddings.BertEmbeddings"
        java_model : optional
            Existing JVM model object to wrap, by default None
        """
        super(BertEmbeddings, self).__init__(
            classname=classname,
            java_model=java_model
        )
        # Defaults mirror the values documented in the class docstring.
        self._setDefault(
            dimension=768,
            batchSize=8,
            maxSentenceLength=128,
            caseSensitive=False
        )

    @staticmethod
    def loadSavedModel(folder, spark_session, use_openvino=False):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession
        use_openvino : bool, optional
            Whether to use the OpenVINO backend, by default False

        Returns
        -------
        BertEmbeddings
            The restored model
        """
        # Imported inside the function rather than at module level
        # (presumably to avoid a circular import -- confirm before moving).
        from sparknlp.internal import _BertLoader
        jModel = _BertLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
        return BertEmbeddings(java_model=jModel)

    @staticmethod
    def pretrained(name="small_bert_L2_768", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "small_bert_L2_768"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        BertEmbeddings
            The restored model
        """
        # Imported inside the function rather than at module level
        # (presumably to avoid a circular import -- confirm before moving).
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(BertEmbeddings, name, lang, remote_loc)
@@ -0,0 +1,224 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for BertSentenceEmbeddings."""
15
+
16
+ from sparknlp.common import *
17
+
18
+
19
+ class BertSentenceEmbeddings(AnnotatorModel,
20
+ HasEmbeddingsProperties,
21
+ HasCaseSensitiveProperties,
22
+ HasStorageRef,
23
+ HasBatchedAnnotate,
24
+ HasEngine,
25
+ HasMaxSentenceLengthLimit):
26
+ """Sentence-level embeddings using BERT. BERT (Bidirectional Encoder
27
+ Representations from Transformers) provides dense vector representations for
28
+ natural language by using a deep, pre-trained neural network with the
29
+ Transformer architecture.
30
+
31
+ Pretrained models can be loaded with :meth:`.pretrained` of the companion
32
+ object:
33
+
34
+ >>> embeddings = BertSentenceEmbeddings.pretrained() \\
35
+ ... .setInputCols(["sentence"]) \\
36
+ ... .setOutputCol("sentence_bert_embeddings")
37
+
38
+
39
+ The default model is ``"sent_small_bert_L2_768"``, if no name is provided.
40
+
41
+ For available pretrained models please see the
42
+ `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
43
+
44
+ For extended examples of usage, see the
45
+ `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BERT%20Sentence.ipynb>`__.
46
+
47
+ ====================== =======================
48
+ Input Annotation types Output Annotation type
49
+ ====================== =======================
50
+ ``DOCUMENT`` ``SENTENCE_EMBEDDINGS``
51
+ ====================== =======================
52
+
53
+ Parameters
54
+ ----------
55
+ batchSize
56
+ Size of every batch, by default 8
57
+ caseSensitive
58
+ Whether to ignore case in tokens for embeddings matching, by default
59
+ False
60
+ dimension
61
+ Number of embedding dimensions, by default 768
62
+ maxSentenceLength
63
+ Max sentence length to process, by default 128
64
+ isLong
65
+ Use Long type instead of Int type for inputs buffer - Some Bert models
66
+ require Long instead of Int.
67
+ configProtoBytes
68
+ ConfigProto from tensorflow, serialized into byte array.
69
+
70
+ References
71
+ ----------
72
+ `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__
73
+
74
+ https://github.com/google-research/bert
75
+
76
+ **Paper abstract**
77
+
78
+ *We introduce a new language representation model called BERT, which stands
79
+ for Bidirectional Encoder Representations from Transformers. Unlike recent
80
+ language representation models, BERT is designed to pre-train deep
81
+ bidirectional representations from unlabeled text by jointly conditioning on
82
+ both left and right context in all layers. As a result, the pre-trained BERT
83
+ model can be fine-tuned with just one additional output layer to create
84
+ state-of-the-art models for a wide range of tasks, such as question
85
+ answering and language inference, without substantial task-specific
86
+ architecture modifications. BERT is conceptually simple and empirically
87
+ powerful. It obtains new state-of-the-art results on eleven natural language
88
+ processing tasks, including pushing the GLUE score to 80.5% (7.7% point
89
+ absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute
90
+ improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point
91
+ absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute
92
+ improvement).*
93
+
94
+ Examples
95
+ --------
96
+ >>> import sparknlp
97
+ >>> from sparknlp.base import *
98
+ >>> from sparknlp.annotator import *
99
+ >>> from pyspark.ml import Pipeline
100
+ >>> documentAssembler = DocumentAssembler() \\
101
+ ... .setInputCol("text") \\
102
+ ... .setOutputCol("document")
103
+ >>> sentence = SentenceDetector() \\
104
+ ... .setInputCols(["document"]) \\
105
+ ... .setOutputCol("sentence")
106
+ >>> embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_128") \\
107
+ ... .setInputCols(["sentence"]) \\
108
+ ... .setOutputCol("sentence_bert_embeddings")
109
+ >>> embeddingsFinisher = EmbeddingsFinisher() \\
110
+ ... .setInputCols(["sentence_bert_embeddings"]) \\
111
+ ... .setOutputCols("finished_embeddings") \\
112
+ ... .setOutputAsVector(True)
113
+ >>> pipeline = Pipeline().setStages([
114
+ ... documentAssembler,
115
+ ... sentence,
116
+ ... embeddings,
117
+ ... embeddingsFinisher
118
+ ... ])
119
+ >>> data = spark.createDataFrame([["John loves apples. Mary loves oranges. John loves Mary."]]).toDF("text")
120
+ >>> result = pipeline.fit(data).transform(data)
121
+ >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
122
+ +--------------------------------------------------------------------------------+
123
+ | result|
124
+ +--------------------------------------------------------------------------------+
125
+ |[-0.8951074481010437,0.13753940165042877,0.3108254075050354,-1.65693199634552...|
126
+ |[-0.6180210709571838,-0.12179657071828842,-0.191165953874588,-1.4497021436691...|
127
+ |[-0.822715163230896,0.7568016648292542,-0.1165061742067337,-1.59048593044281,...|
128
+ +--------------------------------------------------------------------------------+
129
+ """
130
+
131
+ name = "BertSentenceEmbeddings"
132
+
133
+ inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
134
+
135
+ outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
136
+
137
+ isLong = Param(Params._dummy(),
138
+ "isLong",
139
+ "Use Long type instead of Int type for inputs buffer - Some Bert models require Long instead of Int.",
140
+ typeConverter=TypeConverters.toBoolean)
141
+
142
+ configProtoBytes = Param(Params._dummy(),
143
+ "configProtoBytes",
144
+ "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
145
+ TypeConverters.toListInt)
146
+
147
+ def setConfigProtoBytes(self, b):
148
+ """Sets configProto from tensorflow, serialized into byte array.
149
+
150
+ Parameters
151
+ ----------
152
+ b : List[int]
153
+ ConfigProto from tensorflow, serialized into byte array
154
+ """
155
+ return self._set(configProtoBytes=b)
156
+
157
+ def setIsLong(self, value):
158
+ """Sets whether to use Long type instead of Int type for inputs buffer.
159
+
160
+ Some Bert models require Long instead of Int.
161
+
162
+ Parameters
163
+ ----------
164
+ value : bool
165
+ Whether to use Long type instead of Int type for inputs buffer
166
+ """
167
+ return self._set(isLong=value)
168
+
169
+ @keyword_only
170
+ def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.BertSentenceEmbeddings", java_model=None):
171
+ super(BertSentenceEmbeddings, self).__init__(
172
+ classname=classname,
173
+ java_model=java_model
174
+ )
175
+ self._setDefault(
176
+ dimension=768,
177
+ batchSize=8,
178
+ maxSentenceLength=128,
179
+ caseSensitive=False
180
+ )
181
+
182
+ @staticmethod
183
+ def loadSavedModel(folder, spark_session, use_openvino=False):
184
+ """Loads a locally saved model.
185
+
186
+ Parameters
187
+ ----------
188
+ folder : str
189
+ Folder of the saved model
190
+ spark_session : pyspark.sql.SparkSession
191
+ The current SparkSession
192
+ use_openvino : bool, optional
193
+ Whether to use the OpenVINO backend, by default False
194
+
195
+ Returns
196
+ -------
197
+ BertSentenceEmbeddings
198
+ The restored model
199
+ """
200
+ from sparknlp.internal import _BertSentenceLoader
201
+ jModel = _BertSentenceLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
202
+ return BertSentenceEmbeddings(java_model=jModel)
203
+
204
+ @staticmethod
205
+ def pretrained(name="sent_small_bert_L2_768", lang="en", remote_loc=None):
206
+ """Downloads and loads a pretrained model.
207
+
208
+ Parameters
209
+ ----------
210
+ name : str, optional
211
+ Name of the pretrained model, by default "sent_small_bert_L2_768"
212
+ lang : str, optional
213
+ Language of the pretrained model, by default "en"
214
+ remote_loc : str, optional
215
+ Optional remote address of the resource, by default None. Will use
216
+ Spark NLPs repositories otherwise.
217
+
218
+ Returns
219
+ -------
220
+ BertSentenceEmbeddings
221
+ The restored model
222
+ """
223
+ from sparknlp.pretrained import ResourceDownloader
224
+ return ResourceDownloader.downloadModel(BertSentenceEmbeddings, name, lang, remote_loc)
@@ -0,0 +1,199 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for BGEEmbeddings."""
15
+
16
+ from sparknlp.common import *
17
+
18
+
19
+ class BGEEmbeddings(AnnotatorModel,
20
+ HasEmbeddingsProperties,
21
+ HasCaseSensitiveProperties,
22
+ HasStorageRef,
23
+ HasBatchedAnnotate,
24
+ HasMaxSentenceLengthLimit,
25
+ HasClsTokenProperties):
26
+ """Sentence embeddings using BGE.
27
+
28
+ BGE, or BAAI General Embeddings, is a model that can map any text to a low-dimensional dense
29
+ vector which can be used for tasks like retrieval, classification, clustering, or semantic search.
30
+
31
+ Note that this annotator is only supported for Spark Versions 3.4 and up.
32
+
33
+ Pretrained models can be loaded with `pretrained` of the companion object:
34
+
35
+ >>> embeddings = BGEEmbeddings.pretrained() \\
36
+ ... .setInputCols(["document"]) \\
37
+ ... .setOutputCol("bge_embeddings")
38
+
39
+
40
+ The default model is ``"bge_small_en_v1.5"``, if no name is provided.
41
+
42
+ For available pretrained models please see the
43
+ `Models Hub <https://sparknlp.org/models?q=BGE>`__.
44
+
45
+
46
+ ====================== =======================
47
+ Input Annotation types Output Annotation type
48
+ ====================== =======================
49
+ ``DOCUMENT`` ``SENTENCE_EMBEDDINGS``
50
+ ====================== =======================
51
+
52
+
53
+ **References**
54
+
55
+ `C-Pack: Packaged Resources To Advance General Chinese Embedding <https://arxiv.org/pdf/2309.07597>`__
56
+ `BGE Github Repository <https://github.com/FlagOpen/FlagEmbedding>`__
57
+
58
+ **Paper abstract**
59
+
60
+ *We introduce C-Pack, a package of resources that significantly advance the field of general
61
+ Chinese embeddings. C-Pack includes three critical resources.
62
+ 1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.
63
+ 2) C-MTP is a massive text embedding dataset curated from labeled and unlabeled Chinese corpora
64
+ for training embedding models.
65
+ 3) C-TEM is a family of embedding models covering multiple sizes.
66
+ Our models outperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the
67
+ time of the release. We also integrate and optimize the entire suite of training methods for
68
+ C-TEM. Along with our resources on general Chinese embedding, we release our data and models for
69
+ English text embeddings. The English models achieve state-of-the-art performance on the MTEB
70
+ benchmark; meanwhile, our released English data is 2 times larger than the Chinese data. All
71
+ these resources are made publicly available at https://github.com/FlagOpen/FlagEmbedding.*
72
+
73
+
74
+ Parameters
75
+ ----------
76
+ batchSize
77
+ Size of every batch, by default 8
78
+ dimension
79
+ Number of embedding dimensions, by default 768
80
+ caseSensitive
81
+ Whether to ignore case in tokens for embeddings matching, by default False
82
+ maxSentenceLength
83
+ Max sentence length to process, by default 512
84
+ configProtoBytes
85
+ ConfigProto from tensorflow, serialized into byte array.
86
+ useCLSToken
87
+ Whether to use the CLS token for sentence embeddings, by default True
88
+
89
+ Examples
90
+ --------
91
+ >>> import sparknlp
92
+ >>> from sparknlp.base import *
93
+ >>> from sparknlp.annotator import *
94
+ >>> from pyspark.ml import Pipeline
95
+ >>> documentAssembler = DocumentAssembler() \\
96
+ ... .setInputCol("text") \\
97
+ ... .setOutputCol("document")
98
+ >>> embeddings = BGEEmbeddings.pretrained() \\
99
+ ... .setInputCols(["document"]) \\
100
+ ... .setOutputCol("bge_embeddings")
101
+ >>> embeddingsFinisher = EmbeddingsFinisher() \\
102
+ ... .setInputCols(["bge_embeddings"]) \\
103
+ ... .setOutputCols("finished_embeddings") \\
104
+ ... .setOutputAsVector(True)
105
+ >>> pipeline = Pipeline().setStages([
106
+ ... documentAssembler,
107
+ ... embeddings,
108
+ ... embeddingsFinisher
109
+ ... ])
110
+ >>> data = spark.createDataFrame([["query: how much protein should a female eat"],
111
+ ... ["passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. " + \\
112
+ ... "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a " + \\
113
+ ... "marathon. Check out the chart below to see how much protein you should be eating each day."],
114
+ ... ]).toDF("text")
115
+ >>> result = pipeline.fit(data).transform(data)
116
+ >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
117
+ +--------------------------------------------------------------------------------+
118
+ | result|
119
+ +--------------------------------------------------------------------------------+
120
+ |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...|
121
+ |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...|
122
+ +--------------------------------------------------------------------------------+
123
+ """
124
+
125
+ name = "BGEEmbeddings"
126
+
127
+ inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
128
+
129
+ outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
130
+ configProtoBytes = Param(Params._dummy(),
131
+ "configProtoBytes",
132
+ "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
133
+ TypeConverters.toListInt)
134
+
135
+ def setConfigProtoBytes(self, b):
136
+ """Sets configProto from tensorflow, serialized into byte array.
137
+
138
+ Parameters
139
+ ----------
140
+ b : List[int]
141
+ ConfigProto from tensorflow, serialized into byte array
142
+ """
143
+ return self._set(configProtoBytes=b)
144
+
145
+ @keyword_only
146
+ def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.BGEEmbeddings", java_model=None):
147
+ super(BGEEmbeddings, self).__init__(
148
+ classname=classname,
149
+ java_model=java_model
150
+ )
151
+ self._setDefault(
152
+ dimension=768,
153
+ batchSize=8,
154
+ maxSentenceLength=512,
155
+ caseSensitive=False,
156
+ useCLSToken=True
157
+ )
158
+
159
+ @staticmethod
160
+ def loadSavedModel(folder, spark_session):
161
+ """Loads a locally saved model.
162
+
163
+ Parameters
164
+ ----------
165
+ folder : str
166
+ Folder of the saved model
167
+ spark_session : pyspark.sql.SparkSession
168
+ The current SparkSession
169
+
170
+ Returns
171
+ -------
172
+ BGEEmbeddings
173
+ The restored model
174
+ """
175
+ from sparknlp.internal import _BGELoader
176
+ jModel = _BGELoader(folder, spark_session._jsparkSession)._java_obj
177
+ return BGEEmbeddings(java_model=jModel)
178
+
179
+ @staticmethod
180
+ def pretrained(name="bge_small_en_v1.5", lang="en", remote_loc=None):
181
+ """Downloads and loads a pretrained model.
182
+
183
+ Parameters
184
+ ----------
185
+ name : str, optional
186
+ Name of the pretrained model, by default "bge_small_en_v1.5"
187
+ lang : str, optional
188
+ Language of the pretrained model, by default "en"
189
+ remote_loc : str, optional
190
+ Optional remote address of the resource, by default None. Will use
191
+ Spark NLPs repositories otherwise.
192
+
193
+ Returns
194
+ -------
195
+ BGEEmbeddings
196
+ The restored model
197
+ """
198
+ from sparknlp.pretrained import ResourceDownloader
199
+ return ResourceDownloader.downloadModel(BGEEmbeddings, name, lang, remote_loc)