spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,385 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for WordEmbeddings."""
+
+
+ from sparknlp.common import *
+
+
+ class WordEmbeddings(AnnotatorApproach, HasEmbeddingsProperties, HasStorage):
+     """Word Embeddings lookup annotator that maps tokens to vectors.
+
+     For instantiated/pretrained models, see :class:`.WordEmbeddingsModel`.
+
+     A custom token lookup dictionary for embeddings can be set with
+     :meth:`.setStoragePath`. Each line of the provided file needs to have a
+     token, followed by its vector representation, delimited by spaces::
+
+         ...
+         are 0.39658191506190343 0.630968081620067 0.5393722253731201 0.8428180123359783
+         were 0.7535235923631415 0.9699218875629833 0.10397182122983872 0.11833962569383116
+         stress 0.0492683418305907 0.9415954572751959 0.47624463167525755 0.16790967216778263
+         induced 0.1535748762292387 0.33498936903209897 0.9235178224122094 0.1158772920395934
+         ...
+
+
+     If a token is not found in the dictionary, then the result will be a zero
+     vector of the same dimension. Statistics about the rate of converted tokens
+     can be retrieved with :meth:`WordEmbeddingsModel.withCoverageColumn()
+     <sparknlp.annotator.WordEmbeddingsModel.withCoverageColumn>` and
+     :meth:`WordEmbeddingsModel.overallCoverage()
+     <sparknlp.annotator.WordEmbeddingsModel.overallCoverage>`.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/scala/training/NerDL/win/customNerDlPipeline/CustomForNerDLPipeline.java>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN`` ``WORD_EMBEDDINGS``
+     ====================== ======================
+
+     Parameters
+     ----------
+     writeBufferSize
+         Buffer size limit before dumping to disk storage while writing, by
+         default 10000
+     readCacheSize
+         Cache size for items retrieved from storage. Increase for performance
+         but higher memory consumption
+
+     Examples
+     --------
+     In this example, the file ``random_embeddings_dim4.txt`` has the form of the
+     content above.
+
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> embeddings = WordEmbeddings() \\
+     ...     .setStoragePath("src/test/resources/random_embeddings_dim4.txt", ReadAs.TEXT) \\
+     ...     .setStorageRef("glove_4d") \\
+     ...     .setDimension(4) \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols(["embeddings"]) \\
+     ...     .setOutputCols("finished_embeddings") \\
+     ...     .setOutputAsVector(True) \\
+     ...     .setCleanAnnotations(False)
+     >>> pipeline = Pipeline() \\
+     ...     .setStages([
+     ...       documentAssembler,
+     ...       tokenizer,
+     ...       embeddings,
+     ...       embeddingsFinisher
+     ...     ])
+     >>> data = spark.createDataFrame([["The patient was diagnosed with diabetes."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(truncate=False)
+     +----------------------------------------------------------------------------------+
+     |result |
+     +----------------------------------------------------------------------------------+
+     |[0.9439099431037903,0.4707513153553009,0.806300163269043,0.16176554560661316] |
+     |[0.7966810464859009,0.5551124811172485,0.8861005902290344,0.28284206986427307] |
+     |[0.025029370561242104,0.35177749395370483,0.052506182342767715,0.1887107789516449]|
+     |[0.08617766946554184,0.8399239182472229,0.5395117998123169,0.7864698767662048] |
+     |[0.6599600911140442,0.16109347343444824,0.6041093468666077,0.8913561105728149] |
+     |[0.5955275893211365,0.01899011991918087,0.4397728443145752,0.8911281824111938] |
+     |[0.9840458631515503,0.7599489092826843,0.9417727589607239,0.8624503016471863] |
+     +----------------------------------------------------------------------------------+
+
+     See Also
+     --------
+     SentenceEmbeddings : to combine embeddings into a sentence-level representation
+     """
+
+     name = "WordEmbeddings"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
+
+     writeBufferSize = Param(Params._dummy(),
+                             "writeBufferSize",
+                             "buffer size limit before dumping to disk storage while writing",
+                             typeConverter=TypeConverters.toInt)
+
+     readCacheSize = Param(Params._dummy(),
+                           "readCacheSize",
+                           "cache size for items retrieved from storage. Increase for performance but higher memory consumption",
+                           typeConverter=TypeConverters.toInt)
+
+     def setWriteBufferSize(self, v):
+         """Sets buffer size limit before dumping to disk storage while writing,
+         by default 10000.
+
+         Parameters
+         ----------
+         v : int
+             Buffer size limit
+         """
+         return self._set(writeBufferSize=v)
+
+     def setReadCacheSize(self, v):
+         """Sets cache size for items retrieved from storage. Increase for
+         performance but higher memory consumption.
+
+         Parameters
+         ----------
+         v : int
+             Cache size for items retrieved from storage
+         """
+         return self._set(readCacheSize=v)
+
+     @keyword_only
+     def __init__(self):
+         super(WordEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddings")
+         self._setDefault(
+             caseSensitive=False,
+             writeBufferSize=10000,
+             storageRef=self.uid
+         )
+
+     def _create_model(self, java_model):
+         return WordEmbeddingsModel(java_model=java_model)
+
+
+ class WordEmbeddingsModel(AnnotatorModel, HasEmbeddingsProperties, HasStorageModel):
+     """Word Embeddings lookup annotator that maps tokens to vectors
+
+     This is the instantiated model of :class:`.WordEmbeddings`.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = WordEmbeddingsModel.pretrained() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("embeddings")
+
+     The default model is ``"glove_100d"``, if no name is provided. For available
+     pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Embeddings>`__.
+
+     For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_offline.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN`` ``WORD_EMBEDDINGS``
+     ====================== ======================
+
+     Parameters
+     ----------
+     dimension
+         Number of embedding dimensions
+     readCacheSize
+         Cache size for items retrieved from storage. Increase for performance
+         but higher memory consumption
+
+     Notes
+     -----
+     There are also two convenient functions to retrieve the embeddings coverage
+     with respect to the transformed dataset:
+
+     - :meth:`.withCoverageColumn`: Adds a custom
+       column with word coverage stats for the embedded field. This creates
+       a new column with statistics for each row.
+     - :meth:`.overallCoverage`: Calculates overall word
+       coverage for the whole data in the embedded field. This returns a single
+       coverage object considering all rows in the field.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> embeddings = WordEmbeddingsModel.pretrained() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols(["embeddings"]) \\
+     ...     .setOutputCols("finished_embeddings") \\
+     ...     .setOutputAsVector(True) \\
+     ...     .setCleanAnnotations(False)
+     >>> pipeline = Pipeline() \\
+     ...     .setStages([
+     ...       documentAssembler,
+     ...       tokenizer,
+     ...       embeddings,
+     ...       embeddingsFinisher
+     ...     ])
+     >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+     +--------------------------------------------------------------------------------+
+     | result|
+     +--------------------------------------------------------------------------------+
+     |[-0.570580005645752,0.44183000922203064,0.7010200023651123,-0.417129993438720...|
+     |[-0.542639970779419,0.4147599935531616,1.0321999788284302,-0.4024400115013122...|
+     |[-0.2708599865436554,0.04400600120425224,-0.020260000601410866,-0.17395000159...|
+     |[0.6191999912261963,0.14650000631809235,-0.08592499792575836,-0.2629800140857...|
+     |[-0.3397899866104126,0.20940999686717987,0.46347999572753906,-0.6479200124740...|
+     +--------------------------------------------------------------------------------+
+
+     See Also
+     --------
+     SentenceEmbeddings : to combine embeddings into a sentence-level representation
+     """
+
+     name = "WordEmbeddingsModel"
+
+     databases = ['EMBEDDINGS']
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
+
+     readCacheSize = Param(Params._dummy(),
+                           "readCacheSize",
+                           "cache size for items retrieved from storage. Increase for performance but higher memory consumption",
+                           typeConverter=TypeConverters.toInt)
+
+     def setReadCacheSize(self, v):
+         """Sets cache size for items retrieved from storage. Increase for
+         performance but higher memory consumption.
+
+         Parameters
+         ----------
+         v : int
+             Cache size for items retrieved from storage
+         """
+         return self._set(readCacheSize=v)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel", java_model=None):
+         super(WordEmbeddingsModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+
+     @staticmethod
+     def overallCoverage(dataset, embeddings_col):
+         """Calculates overall word coverage for the whole data in the embedded
+         field.
+
+         This returns a single coverage object considering all rows in the
+         field.
+
+         Parameters
+         ----------
+         dataset : :class:`pyspark.sql.DataFrame`
+             The dataset with embeddings column
+         embeddings_col : str
+             Name of the embeddings column
+
+         Returns
+         -------
+         :class:`.CoverageResult`
+             CoverageResult object with extracted information
+
+         Examples
+         --------
+         >>> wordsOverallCoverage = WordEmbeddingsModel.overallCoverage(
+         ...     resultDF,"embeddings"
+         ... ).percentage
+         1.0
+         """
+         from sparknlp.internal import _EmbeddingsOverallCoverage
+         from sparknlp.common import CoverageResult
+         return CoverageResult(_EmbeddingsOverallCoverage(dataset, embeddings_col).apply())
+
+     @staticmethod
+     def withCoverageColumn(dataset, embeddings_col, output_col='coverage'):
+         """Adds a custom column with word coverage stats for the embedded field.
+         This creates a new column with statistics for each row.
+
+         Parameters
+         ----------
+         dataset : :class:`pyspark.sql.DataFrame`
+             The dataset with embeddings column
+         embeddings_col : str
+             Name of the embeddings column
+         output_col : str, optional
+             Name for the resulting column, by default 'coverage'
+
+         Returns
+         -------
+         :class:`pyspark.sql.DataFrame`
+             Dataframe with calculated coverage
+
+         Examples
+         --------
+         >>> wordsCoverage = WordEmbeddingsModel.withCoverageColumn(resultDF, "embeddings", "cov_embeddings")
+         >>> wordsCoverage.select("text","cov_embeddings").show(truncate=False)
+         +-------------------+--------------+
+         |text |cov_embeddings|
+         +-------------------+--------------+
+         |This is a sentence.|[5, 5, 1.0] |
+         +-------------------+--------------+
+         """
+         from sparknlp.internal import _EmbeddingsCoverageColumn
+         from pyspark.sql import DataFrame
+         return DataFrame(_EmbeddingsCoverageColumn(dataset, embeddings_col, output_col).apply(), dataset.sql_ctx)
+
+     @staticmethod
+     def pretrained(name="glove_100d", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "glove_100d"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         WordEmbeddingsModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(WordEmbeddingsModel, name, lang, remote_loc)
+
+     @staticmethod
+     def loadStorage(path, spark, storage_ref):
+         """Loads the model from storage.
+
+         Parameters
+         ----------
+         path : str
+             Path to the model
+         spark : :class:`pyspark.sql.SparkSession`
+             The current SparkSession
+         storage_ref : str
+             Identifiers for the model parameters
+         """
+         HasStorageModel.loadStorages(path, spark, storage_ref, WordEmbeddingsModel.databases)
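Note: the new `loadStorage` helper above ships without a usage example in its docstring. A minimal sketch of how it might be called (the storage path and storage ref below are hypothetical, and `spark` is assumed to be an active Spark NLP SparkSession):

>>> from sparknlp.annotator import WordEmbeddingsModel
>>> # Pre-load a previously written embeddings storage (hypothetical path) so that
>>> # a WordEmbeddingsModel whose storageRef matches "glove_4d" can reuse it.
>>> WordEmbeddingsModel.loadStorage("/tmp/glove_4d_storage", spark, "glove_4d")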
@@ -0,0 +1,225 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for XlmRoBertaEmbeddings."""
+
+ from sparknlp.common import *
+
+
+ class XlmRoBertaEmbeddings(AnnotatorModel,
+                            HasEmbeddingsProperties,
+                            HasCaseSensitiveProperties,
+                            HasStorageRef,
+                            HasBatchedAnnotate,
+                            HasEngine,
+                            HasMaxSentenceLengthLimit):
+     """The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual
+     Representation Learning at Scale` by Alexis Conneau, Kartikay Khandelwal,
+     Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzman, Edouard
+     Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+
+     It is based on Facebook's RoBERTa model released in 2019. It is a large
+     multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = XlmRoBertaEmbeddings.pretrained() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("embeddings")
+
+     The default model is ``"xlm_roberta_base"``, default language is ``"xx"``
+     (meaning multi-lingual), if no values are provided. For available pretrained
+     models please see the `Models Hub
+     <https://sparknlp.org/models?task=Embeddings>`__.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XLM-RoBERTa.ipynb>`__.
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN`` ``WORD_EMBEDDINGS``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Size of every batch, by default 8
+     dimension
+         Number of embedding dimensions, by default 768
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     maxSentenceLength
+         Max sentence length to process, by default 128
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+
+     Notes
+     -----
+     - XLM-RoBERTa is a multilingual model trained on 100 different languages.
+       Unlike some XLM multilingual models, it does not require a **lang**
+       parameter to understand which language is used, and should be able to
+       determine the correct language from the input ids.
+     - This implementation is the same as RoBERTa. Refer to
+       :class:`.RoBertaEmbeddings` for usage examples as well as the information
+       relative to the inputs and outputs.
+
+     References
+     ----------
+     `Unsupervised Cross-lingual
+     Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__
+
+     **Paper Abstract:**
+
+     *This paper shows that pretraining multilingual language models at scale
+     leads to significant performance gains for a wide range of cross-lingual
+     transfer tasks. We train a Transformer-based masked language model on one
+     hundred languages, using more than two terabytes of filtered CommonCrawl
+     data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT
+     (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average
+     accuracy on XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1
+     score on NER. XLM-R performs particularly well on low-resource languages,
+     improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the
+     previous XLM model. We also present a detailed empirical evaluation of the
+     key factors that are required to achieve these gains, including the
+     trade-offs between (1) positive transfer and capacity dilution and (2) the
+     performance of high and low resource languages at scale. Finally, we show,
+     for the first time, the possibility of multilingual modeling without
+     sacrificing per-language performance; XLM-R is very competitive with strong
+     monolingual models on the GLUE and XNLI benchmarks. We will make XLM-R code,
+     data, and models publicly available.*
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> embeddings = XlmRoBertaEmbeddings.pretrained() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("embeddings") \\
+     ...     .setCaseSensitive(True)
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols(["embeddings"]) \\
+     ...     .setOutputCols("finished_embeddings") \\
+     ...     .setOutputAsVector(True) \\
+     ...     .setCleanAnnotations(False)
+     >>> pipeline = Pipeline() \\
+     ...     .setStages([
+     ...       documentAssembler,
+     ...       tokenizer,
+     ...       embeddings,
+     ...       embeddingsFinisher
+     ...     ])
+     >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+     +--------------------------------------------------------------------------------+
+     | result|
+     +--------------------------------------------------------------------------------+
+     |[-0.05969233065843582,-0.030789051204919815,0.04443822056055069,0.09564960747...|
+     |[-0.038839809596538544,0.011712731793522835,0.019954433664679527,0.0667808502...|
+     |[-0.03952755779027939,-0.03455188870429993,0.019103847444057465,0.04311436787...|
+     |[-0.09579929709434509,0.02494969218969345,-0.014753809198737144,0.10259044915...|
+     |[0.004710011184215546,-0.022148698568344116,0.011723337695002556,-0.013356896...|
+     +--------------------------------------------------------------------------------+
+     """
+
+     name = "XlmRoBertaEmbeddings"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.XlmRoBertaEmbeddings", java_model=None):
+         super(XlmRoBertaEmbeddings, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             dimension=768,
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session, use_openvino=False):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+         use_openvino : bool
+             Use OpenVINO backend
+
+         Returns
+         -------
+         XlmRoBertaEmbeddings
+             The restored model
+         """
+         from sparknlp.internal import _XlmRoBertaLoader
+         jModel = _XlmRoBertaLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+         return XlmRoBertaEmbeddings(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="xlm_roberta_base", lang="xx", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "xlm_roberta_base"
+         lang : str, optional
+             Language of the pretrained model, by default "xx"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         XlmRoBertaEmbeddings
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(XlmRoBertaEmbeddings, name, lang, remote_loc)
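Likewise, `loadSavedModel` on the new `XlmRoBertaEmbeddings` has no docstring example. A minimal sketch of the import flow it supports, assuming a compatible XLM-RoBERTa model has already been exported to a local folder (the paths below are hypothetical) and `spark` is an active Spark NLP SparkSession:

>>> from sparknlp.annotator import XlmRoBertaEmbeddings
>>> embeddings = XlmRoBertaEmbeddings.loadSavedModel("/tmp/exported_xlm_roberta", spark) \
...     .setInputCols(["document", "token"]) \
...     .setOutputCol("embeddings")
>>> # Optionally persist the imported annotator so it can be reloaded later with
>>> # XlmRoBertaEmbeddings.load(...) instead of re-importing the export folder.
>>> embeddings.write().overwrite().save("/tmp/xlm_roberta_base_spark_nlp")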