spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in that registry.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator/spell_check/norvig_sweeting.py
@@ -0,0 +1,358 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the NorvigSweeting spell checker."""
+
+ from sparknlp.common import *
+
+
+ class NorvigSweetingApproach(AnnotatorApproach):
+     """Trains an annotator that retrieves tokens and makes corrections automatically if
+     not found in an English dictionary, based on the algorithm by Peter Norvig.
+
+     The algorithm is based on a Bayesian approach to spell checking: Given the word we
+     look in the provided dictionary to choose the word with the highest probability
+     to be the correct one.
+
+     A dictionary of correct spellings must be provided with :meth:`.setDictionary` in
+     the form of a text file, where each word is parsed by a regex pattern.
+
+     For instantiated/pretrained models, see :class:`.NorvigSweetingModel`.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN``              ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     dictionary
+         Dictionary needs 'tokenPattern' regex in dictionary for separating words
+     caseSensitive
+         Whether to ignore case sensitivity, by default False
+     doubleVariants
+         Whether to use more expensive spell checker, by default False
+
+         Increase search at cost of performance. Enables extra check for word
+         combinations.
+     shortCircuit
+         Whether to use faster mode, by default False
+
+         Increase performance at cost of accuracy. Faster but less accurate.
+     frequencyPriority
+         Applies frequency over hamming in intersections, when false hamming
+         takes priority, by default True
+     wordSizeIgnore
+         Minimum size of word before ignoring, by default 3
+     dupsLimit
+         Maximum duplicate of characters in a word to consider, by default 2
+     reductLimit
+         Word reductions limit, by default 3
+     intersections
+         Hamming intersections to attempt, by default 10
+     vowelSwapLimit
+         Vowel swap attempts, by default 6
+
+     References
+     ----------
+
+     Inspired by the spell checker by Peter Norvig:
+     `How to Write a Spelling Corrector <https://norvig.com/spell-correct.html>`__
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+
+     In this example, the dictionary ``"words.txt"`` has the form of::
+
+         ...
+         gummy
+         gummic
+         gummier
+         gummiest
+         gummiferous
+         ...
+
+     This dictionary is then set to be the basis of the spell checker.
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> spellChecker = NorvigSweetingApproach() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("spell") \\
+     ...     .setDictionary("src/test/resources/spell/words.txt")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     spellChecker
+     ... ])
+     >>> pipelineModel = pipeline.fit(trainingData)
+
+     See Also
+     --------
+     SymmetricDeleteApproach : for an alternative approach to spell checking
+     ContextSpellCheckerApproach : for a DL based approach
+     """
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     dictionary = Param(Params._dummy(),
+                        "dictionary",
+                        "dictionary needs 'tokenPattern' regex in dictionary for separating words",
+                        typeConverter=TypeConverters.identity)
+
+     caseSensitive = Param(Params._dummy(),
+                           "caseSensitive",
+                           "whether to ignore case sensitivity",
+                           typeConverter=TypeConverters.toBoolean)
+
+     doubleVariants = Param(Params._dummy(),
+                            "doubleVariants",
+                            "whether to use more expensive spell checker",
+                            typeConverter=TypeConverters.toBoolean)
+
+     shortCircuit = Param(Params._dummy(),
+                          "shortCircuit",
+                          "whether to use faster mode",
+                          typeConverter=TypeConverters.toBoolean)
+
+     frequencyPriority = Param(Params._dummy(),
+                               "frequencyPriority",
+                               "applies frequency over hamming in intersections. When false hamming takes priority",
+                               typeConverter=TypeConverters.toBoolean)
+
+     wordSizeIgnore = Param(Params._dummy(),
+                            "wordSizeIgnore",
+                            "minimum size of word before ignoring. Defaults to 3",
+                            typeConverter=TypeConverters.toInt)
+
+     dupsLimit = Param(Params._dummy(),
+                       "dupsLimit",
+                       "maximum duplicate of characters in a word to consider. Defaults to 2",
+                       typeConverter=TypeConverters.toInt)
+
+     reductLimit = Param(Params._dummy(),
+                         "reductLimit",
+                         "word reductions limit. Defaults to 3",
+                         typeConverter=TypeConverters.toInt)
+
+     intersections = Param(Params._dummy(),
+                           "intersections",
+                           "hamming intersections to attempt. Defaults to 10",
+                           typeConverter=TypeConverters.toInt)
+
+     vowelSwapLimit = Param(Params._dummy(),
+                            "vowelSwapLimit",
+                            "vowel swap attempts. Defaults to 6",
+                            typeConverter=TypeConverters.toInt)
+
+     @keyword_only
+     def __init__(self):
+         super(NorvigSweetingApproach, self).__init__(
+             classname="com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach")
+         self._setDefault(caseSensitive=False, doubleVariants=False, shortCircuit=False, wordSizeIgnore=3, dupsLimit=2,
+                          reductLimit=3, intersections=10, vowelSwapLimit=6, frequencyPriority=True)
+         self.dictionary_path = ""
+
+     def setDictionary(self, path, token_pattern="\\S+", read_as=ReadAs.TEXT, options={"format": "text"}):
+         """Sets dictionary which needs 'tokenPattern' regex for separating
+         words.
+
+         Parameters
+         ----------
+         path : str
+             Path to the source file
+         token_pattern : str, optional
+             Pattern for token separation, by default ``\\S+``
+         read_as : str, optional
+             How to read the file, by default ReadAs.TEXT
+         options : dict, optional
+             Options to read the resource, by default {"format": "text"}
+         """
+         self.dictionary_path = path
+         opts = options.copy()
+         if "tokenPattern" not in opts:
+             opts["tokenPattern"] = token_pattern
+         return self._set(dictionary=ExternalResource(path, read_as, opts))
+
+     def setCaseSensitive(self, value):
+         """Sets whether to ignore case sensitivity, by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to ignore case sensitivity
+         """
+         return self._set(caseSensitive=value)
+
+     def setDoubleVariants(self, value):
+         """Sets whether to use more expensive spell checker, by default False.
+
+         Increase search at cost of performance. Enables extra check for word
+         combinations.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to use the more expensive spell checker
+         """
+         return self._set(doubleVariants=value)
+
+     def setShortCircuit(self, value):
+         """Sets whether to use faster mode, by default False.
+
+         Increase performance at cost of accuracy. Faster but less accurate.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to use faster mode
+         """
+         return self._set(shortCircuit=value)
+
+     def setFrequencyPriority(self, value):
+         """Sets whether to consider frequency over hamming in intersections,
+         when false hamming takes priority, by default True.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to consider frequency over hamming in intersections
+         """
+         return self._set(frequencyPriority=value)
+
+     def _create_model(self, java_model):
+         return NorvigSweetingModel(java_model=java_model)
+
+
+ class NorvigSweetingModel(AnnotatorModel):
+     """This annotator retrieves tokens and makes corrections automatically if
+     not found in an English dictionary.
+
+     The spell checker is based on Peter Norvig's algorithm: for a word that is
+     not found in the dictionary, candidate corrections are generated and the
+     provided dictionary is used to choose the candidate with the highest
+     probability of being the intended word, following a Bayesian approach to
+     spell checking.
+
+     This is the instantiated model of the :class:`.NorvigSweetingApproach`. For
+     training your own model, please see the documentation of that class.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> spellChecker = NorvigSweetingModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("spell")
+
+
+     The default model is ``"spellcheck_norvig"``, if no name is provided. For
+     available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Spell+Check>`__.
+
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN``              ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     None
+
+     References
+     ----------
+     Inspired by Norvig model and `SymSpell
+     <https://github.com/wolfgarbe/SymSpell>`__.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> spellChecker = NorvigSweetingModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("spell")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     spellChecker
+     ... ])
+     >>> data = spark.createDataFrame([["somtimes i wrrite wordz erong."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("spell.result").show(truncate=False)
+     +--------------------------------------+
+     |result                                |
+     +--------------------------------------+
+     |[sometimes, i, write, words, wrong, .]|
+     +--------------------------------------+
+
+     See Also
+     --------
+     SymmetricDeleteModel : for an alternative approach to spell checking
+     ContextSpellCheckerModel : for a DL based approach
+     """
+     name = "NorvigSweetingModel"
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel", java_model=None):
+         super(NorvigSweetingModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+
+     @staticmethod
+     def pretrained(name="spellcheck_norvig", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "spellcheck_norvig"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         NorvigSweetingModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(NorvigSweetingModel, name, lang, remote_loc)
+
sparknlp/annotator/spell_check/symmetric_delete.py
@@ -0,0 +1,299 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for SymmetricDelete."""
+
+ from sparknlp.common import *
+
+
+ class SymmetricDeleteApproach(AnnotatorApproach):
+     """Trains a Symmetric Delete spelling correction algorithm. Retrieves tokens
+     and utilizes distance metrics to compute possible derived words.
+
+     The Symmetric Delete spelling correction algorithm reduces the complexity of edit
+     candidate generation and dictionary lookup for a given Damerau-Levenshtein distance.
+     It is six orders of magnitude faster (than the standard approach with deletes +
+     transposes + replaces + inserts) and language independent.
+
+     A dictionary of correct spellings must be provided with :meth:`.setDictionary` in
+     the form of a text file, where each word is parsed by a regex pattern.
+
+     For instantiated/pretrained models, see :class:`.SymmetricDeleteModel`.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN``              ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     dictionary
+         folder or file with text that teaches about the language
+     maxEditDistance
+         max edit distance characters to derive strings from a word, by default 3
+     frequencyThreshold
+         minimum frequency of words to be considered from training, by default 0
+     deletesThreshold
+         minimum frequency of corrections a word needs to have to be considered
+         from training, by default 0
+
+     References
+     ----------
+     Inspired by `SymSpell <https://github.com/wolfgarbe/SymSpell>`__.
+
+     Examples
+     --------
+     In this example, the dictionary ``"words.txt"`` has the form of::
+
+         ...
+         gummy
+         gummic
+         gummier
+         gummiest
+         gummiferous
+         ...
+
+     This dictionary is then set to be the basis of the spell checker.
+
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> spellChecker = SymmetricDeleteApproach() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("spell") \\
+     ...     .setDictionary("src/test/resources/spell/words.txt")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     spellChecker
+     ... ])
+     >>> pipelineModel = pipeline.fit(trainingData)
+
+     See Also
+     --------
+     NorvigSweetingApproach : for an alternative approach to spell checking
+     ContextSpellCheckerApproach : for a DL based approach
+     """
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     corpus = Param(Params._dummy(),
+                    "corpus",
+                    "folder or file with text that teaches about the language",
+                    typeConverter=TypeConverters.identity)
+
+     dictionary = Param(Params._dummy(),
+                        "dictionary",
+                        "folder or file with text that teaches about the language",
+                        typeConverter=TypeConverters.identity)
+
+     maxEditDistance = Param(Params._dummy(),
+                             "maxEditDistance",
+                             "max edit distance characters to derive strings from a word",
+                             typeConverter=TypeConverters.toInt)
+
+     frequencyThreshold = Param(Params._dummy(),
+                                "frequencyThreshold",
+                                "minimum frequency of words to be considered from training. " +
+                                "Increase if training set is LARGE. Defaults to 0",
+                                typeConverter=TypeConverters.toInt)
+
+     deletesThreshold = Param(Params._dummy(),
+                              "deletesThreshold",
+                              "minimum frequency of corrections a word needs to have to be considered from training. " +
+                              "Increase if training set is LARGE. Defaults to 0",
+                              typeConverter=TypeConverters.toInt)
+
+     dupsLimit = Param(Params._dummy(),
+                       "dupsLimit",
+                       "maximum duplicate of characters in a word to consider. Defaults to 2",
+                       typeConverter=TypeConverters.toInt)
+
+     @keyword_only
+     def __init__(self):
+         super(SymmetricDeleteApproach, self).__init__(
+             classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteApproach")
+         self._setDefault(maxEditDistance=3, frequencyThreshold=0, deletesThreshold=0, dupsLimit=2)
+         self.dictionary_path = ""
+
+     def setDictionary(self, path, token_pattern="\\S+", read_as=ReadAs.TEXT, options={"format": "text"}):
+         """Sets folder or file with text that teaches about the language.
+
+         Parameters
+         ----------
+         path : str
+             Path to the resource
+         token_pattern : str, optional
+             Regex pattern to extract tokens, by default "\\S+"
+         read_as : str, optional
+             How to read the resource, by default ReadAs.TEXT
+         options : dict, optional
+             Options for reading the resource, by default {"format": "text"}
+         """
+         self.dictionary_path = path
+         opts = options.copy()
+         if "tokenPattern" not in opts:
+             opts["tokenPattern"] = token_pattern
+         return self._set(dictionary=ExternalResource(path, read_as, opts))
+
+     def setMaxEditDistance(self, v):
+         """Sets max edit distance characters to derive strings from a word, by
+         default 3.
+
+         Parameters
+         ----------
+         v : int
+             Max edit distance characters to derive strings from a word
+         """
+         return self._set(maxEditDistance=v)
+
+     def setFrequencyThreshold(self, v):
+         """Sets minimum frequency of words to be considered from training, by
+         default 0.
+
+         Parameters
+         ----------
+         v : int
+             Minimum frequency of words to be considered from training
+         """
+         return self._set(frequencyThreshold=v)
+
+     def setDeletesThreshold(self, v):
+         """Sets minimum frequency of corrections a word needs to have to be
+         considered from training, by default 0.
+
+         Parameters
+         ----------
+         v : int
+             Minimum frequency of corrections a word needs to have to be
+             considered from training
+         """
+         return self._set(deletesThreshold=v)
+
+     def _create_model(self, java_model):
+         return SymmetricDeleteModel(java_model=java_model)
+
+
+ class SymmetricDeleteModel(AnnotatorModel):
+     """Symmetric Delete spelling correction algorithm.
+
+     The Symmetric Delete spelling correction algorithm reduces the complexity of
+     edit candidate generation and dictionary lookup for a given
+     Damerau-Levenshtein distance. It is six orders of magnitude faster (than the
+     standard approach with deletes + transposes + replaces + inserts) and
+     language independent.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> spell = SymmetricDeleteModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("spell")
+
+
+     The default model is ``"spellcheck_sd"``, if no name is provided. For
+     available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Spell+Check>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``TOKEN``              ``TOKEN``
+     ====================== ======================
+
+     Parameters
+     ----------
+     None
+
+     References
+     ----------
+     Inspired by `SymSpell <https://github.com/wolfgarbe/SymSpell>`__.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> spellChecker = SymmetricDeleteModel.pretrained() \\
+     ...     .setInputCols(["token"]) \\
+     ...     .setOutputCol("spell")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     spellChecker
+     ... ])
+     >>> data = spark.createDataFrame([["spmetimes i wrrite wordz erong."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("spell.result").show(truncate=False)
+     +--------------------------------------+
+     |result                                |
+     +--------------------------------------+
+     |[sometimes, i, write, words, wrong, .]|
+     +--------------------------------------+
+
+     See Also
+     --------
+     NorvigSweetingModel : for an alternative approach to spell checking
+     ContextSpellCheckerModel : for a DL based approach
+     """
+     name = "SymmetricDeleteModel"
+
+     inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.TOKEN
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel",
+                  java_model=None):
+         super(SymmetricDeleteModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+
+     @staticmethod
+     def pretrained(name="spellcheck_sd", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "spellcheck_sd"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         SymmetricDeleteModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(SymmetricDeleteModel, name, lang, remote_loc)
+
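
For reference, a minimal end-to-end usage sketch of the pretrained Norvig spell checker shipped in this release. This snippet is not part of the diff above; it assumes Spark NLP 6.x and pyspark are installed, that sparknlp.start() can create a Spark session, and that the pretrained "spellcheck_norvig" model can be downloaded.

import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import Tokenizer, NorvigSweetingModel
from pyspark.ml import Pipeline

# Start a Spark session with Spark NLP on the classpath
spark = sparknlp.start()

# Assemble raw text, tokenize, then correct tokens with the pretrained Norvig model
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
spellChecker = NorvigSweetingModel.pretrained().setInputCols(["token"]).setOutputCol("spell")

pipeline = Pipeline(stages=[documentAssembler, tokenizer, spellChecker])
empty_df = spark.createDataFrame([[""]]).toDF("text")
model = pipeline.fit(empty_df)  # no training happens here; fit only wires the stages

# LightPipeline annotates plain strings without building a DataFrame
light = LightPipeline(model)
print(light.annotate("somtimes i wrrite wordz erong.")["spell"])

The same pipeline works with SymmetricDeleteModel.pretrained() in place of the Norvig stage, since both annotators consume and produce TOKEN annotations.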