spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329) hide show
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotation.py CHANGED
@@ -1,32 +1,120 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Contains the Annotation data format
16
+ """
17
+
1
18
  from pyspark.sql.types import *
2
19
 
3
20
 
4
21
  class Annotation:
22
+ """Represents the output of Spark NLP Annotators and their details.
23
+
24
+ Parameters
25
+ ----------
26
+ annotator_type : str
27
+ The type of the output of the annotator. Possible values are ``DOCUMENT,
28
+ TOKEN, WORDPIECE, WORD_EMBEDDINGS, SENTENCE_EMBEDDINGS, CATEGORY, DATE,
29
+ ENTITY, SENTIMENT, POS, CHUNK, NAMED_ENTITY, NEGEX, DEPENDENCY,
30
+ LABELED_DEPENDENCY, LANGUAGE, KEYWORD, DUMMY``.
31
+ begin : int
32
+ The index of the first character under this annotation.
33
+ end : int
34
+ The index of the last character under this annotation.
35
+ result : str
36
+ The resulting string of the annotation.
37
+ metadata : dict
38
+ Associated metadata for this annotation
39
+ embeddings : list
40
+ Embeddings vector where applicable
41
+ """
5
42
 
6
- def __init__(self, annotator_type, begin, end, result, metadata, embeddings):
7
- self.annotator_type = annotator_type
43
+ def __init__(self, annotatorType, begin, end, result, metadata, embeddings):
44
+ self.annotatorType = annotatorType
8
45
  self.begin = begin
9
46
  self.end = end
10
47
  self.result = result
11
48
  self.metadata = metadata
12
49
  self.embeddings = embeddings
13
50
 
51
+ def copy(self, result):
52
+ """Creates new Annotation with a different result, containing all
53
+ settings of this Annotation.
54
+
55
+ Parameters
56
+ ----------
57
+ result : str
58
+ The result of the annotation that should be copied.
59
+
60
+ Returns
61
+ -------
62
+ Annotation
63
+ Newly created Annotation
64
+ """
65
+ return Annotation(self.annotatorType, self.begin, self.end, result, self.metadata, self.embeddings)
66
+
14
67
  def __str__(self):
15
- return "Annotation(%s, %i, %i, %s, %s)" % (
16
- self.annotator_type,
68
+ return "Annotation(%s, %i, %i, %s, %s, %s)" % (
69
+ self.annotatorType,
17
70
  self.begin,
18
71
  self.end,
19
72
  self.result,
20
- str(self.metadata)
73
+ str(self.metadata),
74
+ str(self.embeddings)
21
75
  )
22
76
 
23
77
  def __repr__(self):
24
78
  return self.__str__()
25
79
 
80
+ def __eq__(self, other):
81
+ same_annotator_type = self.annotatorType == other.annotatorType
82
+ same_result = self.result == other.result
83
+ same_begin = self.begin == other.begin
84
+ same_end = self.end == other.end
85
+ same_metadata = dict(self.metadata) == other.metadata
86
+ same_embeddings = self.embeddings == other.embeddings
87
+
88
+ same_annotation = \
89
+ same_annotator_type and same_result and same_begin and same_end and same_metadata and same_embeddings
90
+
91
+ return same_annotation
92
+
26
93
  @staticmethod
27
94
  def dataType():
95
+ """Returns a Spark `StructType`, that represents the schema of the
96
+ Annotation.
97
+
98
+ The Schema looks like::
99
+
100
+ struct (containsNull = True)
101
+ |-- annotatorType: string (nullable = False)
102
+ |-- begin: integer (nullable = False)
103
+ |-- end: integer (nullable = False)
104
+ |-- result: string (nullable = False)
105
+ |-- metadata: map (nullable = False)
106
+ | |-- key: string
107
+ | |-- value: string (valueContainsNull = True)
108
+ |-- embeddings: array (nullable = False)
109
+ | |-- element: float (containsNull = False)
110
+
111
+ Returns
112
+ -------
113
+ :class:`pyspark.sql.types.StructType`
114
+ Spark Schema of the Annotation
115
+ """
28
116
  return StructType([
29
- StructField('annotator_type', StringType(), False),
117
+ StructField('annotatorType', StringType(), False),
30
118
  StructField('begin', IntegerType(), False),
31
119
  StructField('end', IntegerType(), False),
32
120
  StructField('result', StringType(), False),
@@ -36,4 +124,47 @@ class Annotation:
36
124
 
37
125
  @staticmethod
38
126
  def arrayType():
127
+ """Returns a Spark `ArrayType`, that contains the `dataType` of the
128
+ annotation.
129
+
130
+ Returns
131
+ -------
132
+ :class:`pyspark.sql.types.ArrayType`
133
+ ArrayType with the Annotation data type embedded.
134
+ """
39
135
  return ArrayType(Annotation.dataType())
136
+
137
+ @staticmethod
138
+ def fromRow(row):
139
+ """Creates a Annotation from a Spark `Row`.
140
+
141
+ Parameters
142
+ ----------
143
+ row : :class:`pyspark.sql.Row`
144
+ Spark row containing columns for ``annotatorType, begin, end,
145
+ result, metadata, embeddings``.
146
+
147
+ Returns
148
+ -------
149
+ Annotation
150
+ The new Annotation.
151
+ """
152
+ return Annotation(row.annotatorType, row.begin, row.end, row.result, row.metadata, row.embeddings)
153
+
154
+ @staticmethod
155
+ def toRow(annotation):
156
+ """Transforms an Annotation to a Spark `Row`.
157
+
158
+ Parameters
159
+ ----------
160
+ annotation : Annotation
161
+ The Annotation to be transformed.
162
+
163
+ Returns
164
+ -------
165
+ :class:`pyspark.sql.Row`
166
+ The new Row.
167
+ """
168
+ from pyspark.sql import Row
169
+ return Row(annotation.annotatorType, annotation.begin, annotation.end, annotation.result, annotation.metadata,
170
+ annotation.embeddings)
@@ -0,0 +1,61 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Contains the AnnotationAudio data format
16
+ """
17
+
18
+
19
+ class AnnotationAudio:
20
+ """Represents the output of Spark NLP Annotators for audio output and their details.
21
+
22
+ Parameters
23
+ ----------
24
+ annotator_type : str
25
+ The type of the output of the annotator. Possible values are ``AUDIO``.
26
+ result : list(floats)
27
+ Audio data in floats - already loaded/processed audio files
28
+ metadata : dict
29
+ Associated metadata for this annotation
30
+ """
31
+
32
+ def __init__(self, annotatorType, result, metadata):
33
+ self.annotatorType = annotatorType
34
+ self.result = result
35
+ self.metadata = metadata
36
+
37
+ def copy(self, result):
38
+ """Creates new AnnotationAudio with a different result, containing all
39
+ settings of this Annotation.
40
+
41
+ Parameters
42
+ ----------
43
+ result : list(bytes)
44
+ The result of the annotation that should be copied.
45
+
46
+ Returns
47
+ -------
48
+ AnnotationAudio
49
+ Newly created AnnotationAudio
50
+ """
51
+ return AnnotationAudio(self.annotatorType, result, self.metadata)
52
+
53
+ def __str__(self):
54
+ return "AnnotationAudio(%s, %s, %s)" % (
55
+ self.annotatorType,
56
+ str(self.result),
57
+ str(self.metadata)
58
+ )
59
+
60
+ def __repr__(self):
61
+ return self.__str__()
@@ -0,0 +1,82 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Contains the AnnotationImage data format
16
+ """
17
+
18
+
19
+ class AnnotationImage:
20
+ """Represents the output of Spark NLP Annotators for image output and their details.
21
+
22
+ Parameters
23
+ ----------
24
+ annotator_type : str
25
+ The type of the output of the annotator. Possible values are ``IMAGE``.
26
+ origin: str
27
+ * Represents the source URI of the image
28
+ height : int
29
+ Image height in pixels
30
+ width : int
31
+ Image width in pixels
32
+ nChannels: int
33
+ Number of color channels
34
+ mode: int
35
+ OpenCV type
36
+ result : list(bytes)
37
+ Image data in bytes
38
+ metadata : dict
39
+ Associated metadata for this annotation
40
+ """
41
+
42
+ def __init__(self, annotatorType, origin, height, width, nChannels, mode, result, metadata):
43
+ self.annotatorType = annotatorType
44
+ self.origin = origin
45
+ self.height = height
46
+ self.width = width
47
+ self.nChannels = nChannels
48
+ self.mode = mode
49
+ self.result = result
50
+ self.metadata = metadata
51
+
52
+ def copy(self, result):
53
+ """Creates new AnnotationImage with a different result, containing all
54
+ settings of this Annotation.
55
+
56
+ Parameters
57
+ ----------
58
+ result : list(bytes)
59
+ The result of the annotation that should be copied.
60
+
61
+ Returns
62
+ -------
63
+ AnnotationImage
64
+ Newly created AnnotationImage
65
+ """
66
+ return AnnotationImage(self.annotatorType, self.origin, self.height, self.width,
67
+ self.nChannels, self.mode, result, self.metadata)
68
+
69
+ def __str__(self):
70
+ return "AnnotationImage(%s, %s, %i, %i, %i, %i, %s, %s)" % (
71
+ self.annotatorType,
72
+ self.origin,
73
+ self.height,
74
+ self.width,
75
+ self.nChannels,
76
+ self.mode,
77
+ str(self.result),
78
+ str(self.metadata)
79
+ )
80
+
81
+ def __repr__(self):
82
+ return self.__str__()
@@ -0,0 +1,93 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Module containing all available Annotators of Spark NLP and their base
16
+ classes.
17
+ """
18
import sys

# New Annotators need to be imported here
from sparknlp.annotator.classifier_dl import *
from sparknlp.annotator.embeddings import *
from sparknlp.annotator.er import *
from sparknlp.annotator.keyword_extraction import *
from sparknlp.annotator.ld_dl import *
from sparknlp.annotator.matcher import *
from sparknlp.annotator.ner import *
from sparknlp.annotator.dependency import *
from sparknlp.annotator.pos import *
from sparknlp.annotator.sentence import *
from sparknlp.annotator.sentiment import *
from sparknlp.annotator.seq2seq import *
from sparknlp.annotator.spell_check import *
from sparknlp.annotator.token import *
from sparknlp.annotator.ws import *
from sparknlp.annotator.chunker import *
from sparknlp.annotator.document_normalizer import *
from sparknlp.annotator.graph_extraction import *
from sparknlp.annotator.lemmatizer import *
from sparknlp.annotator.n_gram_generator import *
from sparknlp.annotator.normalizer import *
from sparknlp.annotator.stemmer import *
from sparknlp.annotator.stop_words_cleaner import *
from sparknlp.annotator.coref import *
from sparknlp.annotator.tf_ner_dl_graph_builder import *
from sparknlp.annotator.cv import *
from sparknlp.annotator.audio import *
from sparknlp.annotator.chunk2_doc import *
from sparknlp.annotator.date2_chunk import *
from sparknlp.annotator.openai import *
from sparknlp.annotator.token2_chunk import *
from sparknlp.annotator.document_character_text_splitter import *
from sparknlp.annotator.document_token_splitter import *
52
+
53
# Fail fast on Python 2: the package is Python 3 only.
if sys.version_info[0] == 2:
    raise ImportError(
        "Spark NLP only supports Python 3.6 and above. "
        "Please use Python 3.6 or above that is compatible with both Spark NLP and PySpark"
    )
else:
    # Eagerly import the JVM-bridge package so its module hierarchy is
    # registered in sys.modules before any annotator is used.
    __import__("com.johnsnowlabs.nlp")

# NOTE(review): each name below is bound to THIS module object, and the
# dotted assignments (e.g. ``pos.perceptron = ...``) set attributes on the
# module itself (since ``pos`` IS the module). Presumably this keeps historic
# import paths such as ``sparknlp.annotator.pos.perceptron`` resolving to the
# flat annotator namespace — confirm against older package layouts before
# changing.
annotators = sys.modules[__name__]
pos = sys.modules[__name__]
pos.perceptron = sys.modules[__name__]
ner = sys.modules[__name__]
ner.crf = sys.modules[__name__]
ner.dl = sys.modules[__name__]
regex = sys.modules[__name__]
sbd = sys.modules[__name__]
sbd.pragmatic = sys.modules[__name__]
sda = sys.modules[__name__]
sda.pragmatic = sys.modules[__name__]
sda.vivekn = sys.modules[__name__]
spell = sys.modules[__name__]
spell.norvig = sys.modules[__name__]
spell.symmetric = sys.modules[__name__]
spell.context = sys.modules[__name__]
parser = sys.modules[__name__]
parser.dep = sys.modules[__name__]
parser.typdep = sys.modules[__name__]
embeddings = sys.modules[__name__]
classifier = sys.modules[__name__]
classifier.dl = sys.modules[__name__]
ld = sys.modules[__name__]
ld.dl = sys.modules[__name__]
keyword = sys.modules[__name__]
keyword.yake = sys.modules[__name__]
sentence_detector_dl = sys.modules[__name__]
seq2seq = sys.modules[__name__]
ws = sys.modules[__name__]
er = sys.modules[__name__]
coref = sys.modules[__name__]
cv = sys.modules[__name__]
audio = sys.modules[__name__]
@@ -0,0 +1,16 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from sparknlp.annotator.audio.wav2vec2_for_ctc import *
15
+ from sparknlp.annotator.audio.hubert_for_ctc import *
16
+ from sparknlp.annotator.audio.whisper_for_ctc import *
@@ -0,0 +1,188 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Contains classes concerning HubertForCTC."""
16
+
17
+ from sparknlp.common import *
18
+
19
+
20
class HubertForCTC(AnnotatorModel,
                   HasBatchedAnnotateAudio,
                   HasAudioFeatureProperties,
                   HasEngine):
    """Speech-to-text annotator based on the HuBERT architecture with a
    language-modeling head for Connectionist Temporal Classification (CTC).

    HuBERT was proposed in *HuBERT: Self-Supervised Speech Representation
    Learning by Masked Prediction of Hidden Units* by Wei-Ning Hsu, Benjamin
    Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov and
    Abdelrahman Mohamed. It learns speech representations by applying a
    BERT-like masked-prediction loss over hidden units obtained from an
    offline clustering step.

    The annotator transcribes audio to text. Audio must be supplied already
    pre-processed as an array of floats.

    Note that this annotator is currently not supported on Apple Silicon
    processors such as the M1, because the processor does not support the
    instructions required by XLA.

    Pretrained models can be loaded with ``pretrained`` of the companion
    object:

    >>> speechToText = HubertForCTC.pretrained() \\
    ...     .setInputCols(["audio_assembler"]) \\
    ...     .setOutputCol("text")

    The default model is ``"asr_hubert_large_ls960"``, if no name is provided.

    For available pretrained models please see the
    `Models Hub <https://sparknlp.org/models>`__.

    To see which models are compatible and how to import them see
    https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and to see more
    extended examples, see
    `HubertForCTCTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/HubertForCTCTestSpec.scala>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``AUDIO``              ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Size of each batch, by default 4

    References
    ----------
    `HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units
    <https://arxiv.org/abs/2106.07447>`__

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> audioAssembler = AudioAssembler() \\
    ...     .setInputCol("audio_content") \\
    ...     .setOutputCol("audio_assembler")
    >>> speechToText = HubertForCTC \\
    ...     .pretrained() \\
    ...     .setInputCols(["audio_assembler"]) \\
    ...     .setOutputCol("text")
    >>> pipeline = Pipeline().setStages([audioAssembler, speechToText])
    >>> processedAudioFloats = spark.createDataFrame([[rawFloats]]).toDF("audio_content")
    >>> result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats)
    >>> result.select("text.result").show(truncate = False)
    +------------------------------------------------------------------------------------------+
    |result                                                                                    |
    +------------------------------------------------------------------------------------------+
    |[MISTER QUILTER IS THE APOSTLE OF THE MIDLE CLASES AND WE ARE GLAD TO WELCOME HIS GOSPEL ]|
    +------------------------------------------------------------------------------------------+
    """

    name = "HubertForCTC"

    # Consumes AUDIO annotations and emits the transcription as DOCUMENT.
    inputAnnotatorTypes = [AnnotatorType.AUDIO]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    configProtoBytes = Param(
        Params._dummy(),
        "configProtoBytes",
        "ConfigProto from tensorflow, serialized into byte array. Get with "
        "config_proto.SerializeToString()",
        TypeConverters.toListInt,
    )

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.audio.HubertForCTC",
                 java_model=None):
        super(HubertForCTC, self).__init__(classname=classname, java_model=java_model)
        # Batching default mirrors the Scala side.
        self._setDefault(batchSize=4)

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession

        Returns
        -------
        HubertForCTC
            The restored model
        """
        from sparknlp.internal import _HubertForCTC
        # Deferred import avoids a circular dependency at module load time.
        loader = _HubertForCTC(folder, spark_session._jsparkSession)
        return HubertForCTC(java_model=loader._java_obj)

    @staticmethod
    def pretrained(name="asr_hubert_large_ls960", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default
            "asr_hubert_large_ls960"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        HubertForCTC
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(HubertForCTC, name, lang, remote_loc)