spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
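Between these two releases the wheel's layout changes substantially: the monolithic sparknlp/annotator.py, sparknlp/base.py, and sparknlp/common.py modules are removed in favor of the sparknlp.annotator, sparknlp.base, and sparknlp.common subpackages listed above, and the stale .pyc/__pycache__ artifacts are dropped. A minimal sketch of verifying such an upgrade, assuming a local environment with Java and PySpark available (sparknlp.version() and sparknlp.start() are part of the public sparknlp package; the pip command is illustrative):

# Illustrative upgrade check, not part of the diff:
#   pip install --upgrade spark-nlp==6.2.1
import sparknlp

print(sparknlp.version())  # should report 6.2.1 after the upgrade
spark = sparknlp.start()   # SparkSession configured with the matching Spark NLP jars

The hunk below reproduces one of the new annotator modules in full.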
sparknlp/annotator/spell_check/context_spell_checker.py (new file)
@@ -0,0 +1,911 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the ContextSpellChecker."""
+
+ from sparknlp.common import *
+
+
+ class ContextSpellCheckerApproach(AnnotatorApproach):
+ """Trains a deep-learning based Noisy Channel Model Spell Algorithm.
+
+ Correction candidates are extracted combining context information and word
+ information.
+
+ For instantiated/pretrained models, see :class:`.ContextSpellCheckerModel`.
+
+ Spell Checking is a sequence to sequence mapping problem. Given an input
+ sequence, potentially containing a certain number of errors,
+ ``ContextSpellChecker`` will rank correction sequences according to three
+ things:
+
+ #. Different correction candidates for each word — **word level**.
+ #. The surrounding text of each word, i.e. its context —
+ **sentence level**.
+ #. The relative cost of different correction candidates according to the
+ edit operations at the character level it requires — **subword level**.
+
+ For extended examples of usage, see the article
+ `Training a Contextual Spell Checker for Italian Language <https://towardsdatascience.com/training-a-contextual-spell-checker-for-italian-language-66dda528e4bf>`__,
+ and the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb>`__.
+
+ ====================== ======================
+ Input Annotation types Output Annotation type
+ ====================== ======================
+ ``TOKEN`` ``TOKEN``
+ ====================== ======================
+
+ Parameters
+ ----------
+ languageModelClasses
+ Number of classes to use during factorization of the softmax output in
+ the LM.
+ wordMaxDistance
+ Maximum distance for the generated candidates for every word.
+ maxCandidates
+ Maximum number of candidates for every word.
+ caseStrategy
+ What case combinations to try when generating candidates, by default 2.
+ Possible values are:
+
+ - 0: All uppercase letters
+ - 1: First letter capitalized
+ - 2: All letters
+ errorThreshold
+ Threshold perplexity for a word to be considered as an error.
+ epochs
+ Number of epochs to train the language model.
+ batchSize
+ Batch size for the training in NLM.
+ initialRate
+ Initial learning rate for the LM.
+ finalRate
+ Final learning rate for the LM.
+ validationFraction
+ Percentage of datapoints to use for validation.
+ minCount
+ Min number of times a token should appear to be included in vocab.
+ compoundCount
+ Min number of times a compound word should appear to be included in
+ vocab.
+ classCount
+ Min number of times a word needs to appear in the corpus to not be
+ considered part of a special class.
+ tradeoff
+ Tradeoff between the cost of a word error and a transition in the
+ language model.
+ weightedDistPath
+ The path to the file containing the weights for the Levenshtein
+ distance.
+ maxWindowLen
+ Maximum size for the window used to remember history prior to every
+ correction.
+ configProtoBytes
+ ConfigProto from tensorflow, serialized into byte array.
+ maxSentLen
+ Maximum length for a sentence - internal use during training.
+ graphFolder
+ Folder path that contains external graph files.
+
+ References
+ ----------
+ For an in-depth explanation of the module see the article
+ `Applying Context Aware Spell Checking in Spark NLP <https://medium.com/spark-nlp/applying-context-aware-spell-checking-in-spark-nlp-3c29c46963bc>`__.
+
+ Examples
+ --------
+ >>> import sparknlp
+ >>> from sparknlp.base import *
+ >>> from sparknlp.annotator import *
+ >>> from pyspark.ml import Pipeline
+
+ For this example, we use the first Sherlock Holmes book as the training dataset.
+
+ >>> documentAssembler = DocumentAssembler() \\
+ ... .setInputCol("text") \\
+ ... .setOutputCol("document")
+ >>> tokenizer = Tokenizer() \\
+ ... .setInputCols("document") \\
+ ... .setOutputCol("token")
+ >>> spellChecker = ContextSpellCheckerApproach() \\
+ ... .setInputCols("token") \\
+ ... .setOutputCol("corrected") \\
+ ... .setWordMaxDistance(3) \\
+ ... .setBatchSize(24) \\
+ ... .setEpochs(8) \\
+ ... .setLanguageModelClasses(1650) # dependent on vocabulary size
+ ... # .addVocabClass("_NAME_", names) # Extra classes for correction could be added like this
+ >>> pipeline = Pipeline().setStages([
+ ... documentAssembler,
+ ... tokenizer,
+ ... spellChecker
+ ... ])
+ >>> path = "sherlockholmes.txt"
+ >>> dataset = spark.read.text(path) \\
+ ... .toDF("text")
+ >>> pipelineModel = pipeline.fit(dataset)
+
+ See Also
+ --------
+ NorvigSweetingApproach, SymmetricDeleteApproach : For alternative approaches to spell checking
+ """
+
+ name = "ContextSpellCheckerApproach"
+
+ inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+ outputAnnotatorType = AnnotatorType.TOKEN
+
+ languageModelClasses = Param(Params._dummy(),
+ "languageModelClasses",
+ "Number of classes to use during factorization of the softmax output in the LM.",
+ typeConverter=TypeConverters.toInt)
+
+ wordMaxDistance = Param(Params._dummy(),
+ "wordMaxDistance",
+ "Maximum distance for the generated candidates for every word.",
+ typeConverter=TypeConverters.toInt)
+
+ maxCandidates = Param(Params._dummy(),
+ "maxCandidates",
+ "Maximum number of candidates for every word.",
+ typeConverter=TypeConverters.toInt)
+
+ caseStrategy = Param(Params._dummy(),
+ "caseStrategy",
+ "What case combinations to try when generating candidates.",
+ typeConverter=TypeConverters.toInt)
+
+ errorThreshold = Param(Params._dummy(),
+ "errorThreshold",
+ "Threshold perplexity for a word to be considered as an error.",
+ typeConverter=TypeConverters.toFloat)
+
+ epochs = Param(Params._dummy(),
+ "epochs",
+ "Number of epochs to train the language model.",
+ typeConverter=TypeConverters.toInt)
+
+ batchSize = Param(Params._dummy(),
+ "batchSize",
+ "Batch size for the training in NLM.",
+ typeConverter=TypeConverters.toInt)
+
+ initialRate = Param(Params._dummy(),
+ "initialRate",
+ "Initial learning rate for the LM.",
+ typeConverter=TypeConverters.toFloat)
+
+ finalRate = Param(Params._dummy(),
+ "finalRate",
+ "Final learning rate for the LM.",
+ typeConverter=TypeConverters.toFloat)
+
+ validationFraction = Param(Params._dummy(),
+ "validationFraction",
+ "Percentage of datapoints to use for validation.",
+ typeConverter=TypeConverters.toFloat)
+
+ minCount = Param(Params._dummy(),
+ "minCount",
+ "Min number of times a token should appear to be included in vocab.",
+ typeConverter=TypeConverters.toFloat)
+
+ compoundCount = Param(Params._dummy(),
+ "compoundCount",
+ "Min number of times a compound word should appear to be included in vocab.",
+ typeConverter=TypeConverters.toInt)
+
+ classCount = Param(Params._dummy(),
+ "classCount",
+ "Min number of times a word needs to appear in the corpus to not be considered part of a special class.",
+ typeConverter=TypeConverters.toFloat)
+
+ tradeoff = Param(Params._dummy(),
+ "tradeoff",
+ "Tradeoff between the cost of a word error and a transition in the language model.",
+ typeConverter=TypeConverters.toFloat)
+
+ weightedDistPath = Param(Params._dummy(),
+ "weightedDistPath",
+ "The path to the file containing the weights for the Levenshtein distance.",
+ typeConverter=TypeConverters.toString)
+
+ maxWindowLen = Param(Params._dummy(),
+ "maxWindowLen",
+ "Maximum size for the window used to remember history prior to every correction.",
+ typeConverter=TypeConverters.toInt)
+
+ configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+ "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+ TypeConverters.toListInt)
+
+ maxSentLen = Param(Params._dummy(),
+ "maxSentLen",
+ "Maximum length of a sentence to be considered for training.",
+ typeConverter=TypeConverters.toInt)
+
+ graphFolder = Param(Params._dummy(),
+ "graphFolder",
+ "Folder path that contains external graph files.",
+ typeConverter=TypeConverters.toString)
+
+ def setLanguageModelClasses(self, count):
+ """Sets number of classes to use during factorization of the softmax
+ output in the Language Model.
+
+ Parameters
+ ----------
+ count : int
+ Number of classes
+ """
+ return self._set(languageModelClasses=count)
+
+ def setWordMaxDistance(self, dist):
+ """Sets maximum distance for the generated candidates for every word.
+
+ Parameters
+ ----------
+ dist : int
+ Maximum distance for the generated candidates for every word
+ """
+ return self._set(wordMaxDistance=dist)
+
+ def setMaxCandidates(self, candidates):
+ """Sets maximum number of candidates for every word.
+
+ Parameters
+ ----------
+ candidates : int
+ Maximum number of candidates for every word.
+ """
+ return self._set(maxCandidates=candidates)
+
+ def setCaseStrategy(self, strategy):
+ """Sets what case combinations to try when generating candidates.
+
+ Possible values are:
+
+ - 0: All uppercase letters
+ - 1: First letter capitalized
+ - 2: All letters
+
+ Parameters
+ ----------
+ strategy : int
+ Case combinations to try when generating candidates
+ """
+ return self._set(caseStrategy=strategy)
+
+ def setErrorThreshold(self, threshold):
+ """Sets threshold perplexity for a word to be considered as an error.
+
+ Parameters
+ ----------
+ threshold : float
+ Threshold perplexity for a word to be considered as an error
+ """
+ return self._set(errorThreshold=threshold)
+
+ def setEpochs(self, count):
+ """Sets number of epochs to train the language model.
+
+ Parameters
+ ----------
+ count : int
+ Number of epochs
+ """
+ return self._set(epochs=count)
+
+ def setBatchSize(self, size):
+ """Sets batch size.
+
+ Parameters
+ ----------
+ size : int
+ Batch size
+ """
+ return self._set(batchSize=size)
+
+ def setInitialRate(self, rate):
+ """Sets initial learning rate for the LM.
+
+ Parameters
+ ----------
+ rate : float
+ Initial learning rate for the LM
+ """
+ return self._set(initialRate=rate)
+
+ def setFinalRate(self, rate):
+ """Sets final learning rate for the LM.
+
+ Parameters
+ ----------
+ rate : float
+ Final learning rate for the LM
+ """
+ return self._set(finalRate=rate)
+
+ def setValidationFraction(self, fraction):
+ """Sets percentage of datapoints to use for validation.
+
+ Parameters
+ ----------
+ fraction : float
+ Percentage of datapoints to use for validation
+ """
+ return self._set(validationFraction=fraction)
+
+ def setMinCount(self, count):
+ """Sets min number of times a token should appear to be included in
+ vocab.
+
+ Parameters
+ ----------
+ count : float
+ Min number of times a token should appear to be included in vocab
+ """
+ return self._set(minCount=count)
+
+ def setCompoundCount(self, count):
+ """Sets min number of times a compound word should appear to be included
+ in vocab.
+
+ Parameters
+ ----------
+ count : int
+ Min number of times a compound word should appear to be included in
+ vocab.
+ """
+ return self._set(compoundCount=count)
+
+ def setClassCount(self, count):
+ """Sets min number of times a word needs to appear in the corpus to not
+ be considered part of a special class.
+
+ Parameters
+ ----------
+ count : float
+ Min number of times a word needs to appear in the corpus to not be
+ considered part of a special class.
+ """
+
+ return self._set(classCount=count)
+
+ def setTradeoff(self, alpha):
+ """Sets tradeoff between the cost of a word error and a transition in
+ the language model.
+
+ Parameters
+ ----------
+ alpha : float
+ Tradeoff between the cost of a word error and a transition in the
+ language model
+ """
+ return self._set(tradeoff=alpha)
+
+ def setWeightedDistPath(self, path):
+ """Sets the path to the file containing the weights for the Levenshtein
+ distance.
+
+ Parameters
+ ----------
+ path : str
+ Path to the file containing the weights for the Levenshtein
+ distance.
+ """
+ return self._set(weightedDistPath=path)
+
+ def setMaxWindowLen(self, length):
+ """Sets the maximum size for the window used to remember history prior
+ to every correction.
+
+ Parameters
+ ----------
+ length : int
+ Maximum size for the window used to remember history prior to
+ every correction
+ """
+ return self._set(maxWindowLen=length)
+
+ def setConfigProtoBytes(self, b):
+ """Sets configProto from tensorflow, serialized into byte array.
+
+ Parameters
+ ----------
+ b : List[int]
+ ConfigProto from tensorflow, serialized into byte array
+ """
+ return self._set(configProtoBytes=b)
+
+ def setGraphFolder(self, path):
+ """Sets folder path that contains external graph files.
+
+ Parameters
+ ----------
+ path : str
+ Folder path that contains external graph files.
+ """
+ return self._set(graphFolder=path)
+
+ def setMaxSentLen(self, sentlen):
+ """Sets the maximum length of a sentence.
+
+ Parameters
+ ----------
+ sentlen : int
+ Maximum length of a sentence
+ """
+ return self._set(maxSentLen=sentlen)
+
+ def addVocabClass(self, label, vocab, userdist=3):
+ """Adds a new class of words to correct, based on a vocabulary.
+
+ Parameters
+ ----------
+ label : str
+ Name of the class
+ vocab : List[str]
+ Vocabulary as a list
+ userdist : int, optional
+ Maximal distance to the word, by default 3
+ """
+ self._call_java('addVocabClass', label, vocab, userdist)
+ return self
+
+ def addRegexClass(self, label, regex, userdist=3):
+ """Adds a new class of words to correct, based on regex.
+
+ Parameters
+ ----------
+ label : str
+ Name of the class
+ regex : str
+ Regex to add
+ userdist : int, optional
+ Maximal distance to the word, by default 3
+ """
+ self._call_java('addRegexClass', label, regex, userdist)
+ return self
+
+ @keyword_only
+ def __init__(self):
+ super(ContextSpellCheckerApproach, self). \
+ __init__(classname="com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerApproach")
+
+ def _create_model(self, java_model):
+ return ContextSpellCheckerModel(java_model=java_model)
+
+
+ class ContextSpellCheckerModel(AnnotatorModel, HasEngine):
+ """Implements a deep-learning based Noisy Channel Model Spell Algorithm.
+ Correction candidates are extracted combining context information and word
+ information.
+
+ Spell Checking is a sequence to sequence mapping problem. Given an input
+ sequence, potentially containing a certain number of errors,
+ ``ContextSpellChecker`` will rank correction sequences according to three
+ things:
+
+ #. Different correction candidates for each word — **word level**.
+ #. The surrounding text of each word, i.e. its context —
+ **sentence level**.
+ #. The relative cost of different correction candidates according to the
+ edit operations at the character level it requires — **subword level**.
+
+ This is the instantiated model of the :class:`.ContextSpellCheckerApproach`.
+ For training your own model, please see the documentation of that class.
+
+ Pretrained models can be loaded with :meth:`.pretrained` of the companion
+ object:
+
+ >>> spellChecker = ContextSpellCheckerModel.pretrained() \\
+ ... .setInputCols(["token"]) \\
+ ... .setOutputCol("checked")
+
+
+ The default model is ``"spellcheck_dl"``, if no name is provided.
+ For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Spell+Check>`__.
+
+ For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb>`__.
+
+ ====================== ======================
+ Input Annotation types Output Annotation type
+ ====================== ======================
+ ``TOKEN`` ``TOKEN``
+ ====================== ======================
+
+ Parameters
+ ----------
+ wordMaxDistance
+ Maximum distance for the generated candidates for every word.
+ maxCandidates
+ Maximum number of candidates for every word.
+ caseStrategy
+ What case combinations to try when generating candidates.
+ errorThreshold
+ Threshold perplexity for a word to be considered as an error.
+ tradeoff
+ Tradeoff between the cost of a word error and a transition in the
+ language model.
+ maxWindowLen
+ Maximum size for the window used to remember history prior to every
+ correction.
+ gamma
+ Controls the influence of individual word frequency in the decision.
+ correctSymbols
+ Whether to correct special symbols or skip spell checking for them.
+ compareLowcase
+ If true, tokens will be compared in lower case with the vocabulary.
+ configProtoBytes
+ ConfigProto from tensorflow, serialized into byte array.
+ vocabFreq
+ Frequency of words from the vocabulary.
+ idsVocab
+ Mapping of ids to vocabulary.
+ vocabIds
+ Mapping of vocabulary to ids.
+ classes
+ Classes the spell checker recognizes.
+ weights
+ Levenshtein weights.
+ useNewLines
+ When set to true, new lines will be treated as any other character. When set to false, correction is applied to paragraphs as defined by newline characters.
+
+
+ References
+ ----------
+ For an in-depth explanation of the module see the article `Applying Context
+ Aware Spell Checking in Spark NLP
+ <https://medium.com/spark-nlp/applying-context-aware-spell-checking-in-spark-nlp-3c29c46963bc>`__.
+
+
+ Examples
+ --------
+ >>> import sparknlp
+ >>> from sparknlp.base import *
+ >>> from sparknlp.annotator import *
+ >>> from pyspark.ml import Pipeline
+ >>> documentAssembler = DocumentAssembler() \\
+ ... .setInputCol("text") \\
+ ... .setOutputCol("doc")
+ >>> tokenizer = Tokenizer() \\
+ ... .setInputCols(["doc"]) \\
+ ... .setOutputCol("token")
+ >>> spellChecker = ContextSpellCheckerModel \\
+ ... .pretrained() \\
+ ... .setTradeoff(12.0) \\
+ ... .setInputCols("token") \\
+ ... .setOutputCol("checked")
+ >>> pipeline = Pipeline().setStages([
+ ... documentAssembler,
+ ... tokenizer,
+ ... spellChecker
+ ... ])
+ >>> data = spark.createDataFrame([["It was a cold , dreary day and the country was white with smow ."]]).toDF("text")
+ >>> result = pipeline.fit(data).transform(data)
+ >>> result.select("checked.result").show(truncate=False)
+ +--------------------------------------------------------------------------------+
+ |result |
+ +--------------------------------------------------------------------------------+
+ |[It, was, a, cold, ,, dreary, day, and, the, country, was, white, with, snow, .]|
+ +--------------------------------------------------------------------------------+
+
+ See Also
+ --------
+ NorvigSweetingModel, SymmetricDeleteModel : For alternative approaches to spell checking
+ """
+ name = "ContextSpellCheckerModel"
+
+ inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+ outputAnnotatorType = AnnotatorType.TOKEN
+
+ wordMaxDistance = Param(Params._dummy(),
+ "wordMaxDistance",
+ "Maximum distance for the generated candidates for every word.",
+ typeConverter=TypeConverters.toInt)
+
+ maxCandidates = Param(Params._dummy(),
+ "maxCandidates",
+ "Maximum number of candidates for every word.",
+ typeConverter=TypeConverters.toInt)
+
+ caseStrategy = Param(Params._dummy(),
+ "caseStrategy",
+ "What case combinations to try when generating candidates.",
+ typeConverter=TypeConverters.toInt)
+
+ errorThreshold = Param(Params._dummy(),
+ "errorThreshold",
+ "Threshold perplexity for a word to be considered as an error.",
+ typeConverter=TypeConverters.toFloat)
+
+ tradeoff = Param(Params._dummy(),
+ "tradeoff",
+ "Tradeoff between the cost of a word error and a transition in the language model.",
+ typeConverter=TypeConverters.toFloat)
+
+ maxWindowLen = Param(Params._dummy(),
+ "maxWindowLen",
+ "Maximum size for the window used to remember history prior to every correction.",
+ typeConverter=TypeConverters.toInt)
+
+ gamma = Param(Params._dummy(),
+ "gamma",
+ "Controls the influence of individual word frequency in the decision.",
+ typeConverter=TypeConverters.toFloat)
+
+ correctSymbols = Param(Params._dummy(), "correctSymbols",
+ "Whether to correct special symbols or skip spell checking for them",
+ typeConverter=TypeConverters.toBoolean)
+
+ compareLowcase = Param(Params._dummy(), "compareLowcase", "If true, tokens will be compared in lower case with the vocabulary",
+ typeConverter=TypeConverters.toBoolean)
+
+ configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+ "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+ TypeConverters.toListInt)
+
+ vocabFreq = Param(
+ Params._dummy(),
+ "vocabFreq",
+ "Frequency of words from the vocabulary.",
+ TypeConverters.identity,
+ )
+ idsVocab = Param(
+ Params._dummy(),
+ "idsVocab",
+ "Mapping of ids to vocabulary.",
+ TypeConverters.identity,
+ )
+ vocabIds = Param(
+ Params._dummy(),
+ "vocabIds",
+ "Mapping of vocabulary to ids.",
+ TypeConverters.identity,
+ )
+ classes = Param(
+ Params._dummy(),
+ "classes",
+ "Classes the spell checker recognizes.",
+ TypeConverters.identity,
+ )
+
+ def setWordMaxDistance(self, dist):
+ """Sets maximum distance for the generated candidates for every word.
+
+ Parameters
+ ----------
+ dist : int
+ Maximum distance for the generated candidates for every word.
+ """
+ return self._set(wordMaxDistance=dist)
+
+ def setMaxCandidates(self, candidates):
+ """Sets maximum number of candidates for every word.
+
+ Parameters
+ ----------
+ candidates : int
+ Maximum number of candidates for every word.
+ """
+ return self._set(maxCandidates=candidates)
+
+ def setCaseStrategy(self, strategy):
+ """Sets what case combinations to try when generating candidates.
+
+ Parameters
+ ----------
+ strategy : int
+ Case combinations to try when generating candidates.
+ """
+ return self._set(caseStrategy=strategy)
+
+ def setErrorThreshold(self, threshold):
+ """Sets threshold perplexity for a word to be considered as an error.
+
+ Parameters
+ ----------
+ threshold : float
+ Threshold perplexity for a word to be considered as an error
+ """
+ return self._set(errorThreshold=threshold)
+
+ def setTradeoff(self, alpha):
+ """Sets tradeoff between the cost of a word error and a transition in the
+ language model.
+
+ Parameters
+ ----------
+ alpha : float
+ Tradeoff between the cost of a word error and a transition in the
+ language model
+ """
+ return self._set(tradeoff=alpha)
+
+ def setWeights(self, weights):
+ """Sets weights of each word for Levenshtein distance.
+
+ Parameters
+ ----------
+ weights : Dict[str, float]
+ Weights for Levenshtein distance as a mapping.
+ """
+ self._call_java('setWeights', weights)
+
+ def setMaxWindowLen(self, length):
+ """Sets the maximum size for the window used to remember history prior to
+ every correction.
+
+ Parameters
+ ----------
+ length : int
+ Maximum size for the window used to remember history prior to
+ every correction
+ """
+ return self._set(maxWindowLen=length)
+
+ def setGamma(self, g):
+ """Sets the influence of individual word frequency in the decision.
+
+ Parameters
+ ----------
+ g : float
+ Controls the influence of individual word frequency in the decision.
+ """
+ return self._set(gamma=g)
+
+ def setConfigProtoBytes(self, b):
+ """Sets configProto from tensorflow, serialized into byte array.
+
+ Parameters
+ ----------
+ b : List[int]
+ ConfigProto from tensorflow, serialized into byte array
+ """
+ return self._set(configProtoBytes=b)
+
+ def setVocabFreq(self, value: dict):
+ """Sets the frequency of words from the vocabulary.
+
+ Parameters
+ ----------
+ value : dict
+ Frequency of words from the vocabulary.
+ """
+ return self._set(vocabFreq=value)
+
+ def setIdsVocab(self, idsVocab: dict):
+ """Sets mapping of ids to vocabulary.
+
+ Parameters
+ ----------
+ idsVocab : dict
+ Mapping of ids to vocabulary.
+ """
+ return self._set(idsVocab=idsVocab)
+
+ def setVocabIds(self, vocabIds: dict):
+ """Sets mapping of vocabulary to ids.
+
+ Parameters
+ ----------
+ vocabIds : dict
+ Mapping of vocabulary to ids.
+ """
+ return self._set(vocabIds=vocabIds)
+
+ def setClasses(self, value):
+ """Sets classes the spell checker recognizes.
+
+ Parameters
+ ----------
+ value : list
+ Classes the spell checker recognizes.
+ """
+ return self._set(classes=value)
+
+ def getWordClasses(self):
+ """Gets the classes of words to be corrected.
+
+ Returns
+ -------
+ List[str]
+ Classes of words to be corrected
+ """
+ it = self._call_java('getWordClasses').toIterator()
+ result = []
+ while it.hasNext():
+ result.append(it.next().toString())
+ return result
+
+ def updateRegexClass(self, label, regex):
+ """Updates an existing class to correct, based on regex.
+
+ Parameters
+ ----------
+ label : str
+ Label of the class
+ regex : str
+ Regex to parse the class
+ """
+ self._call_java('updateRegexClass', label, regex)
+ return self
+
+ def updateVocabClass(self, label, vocab, append=True):
+ """Updates an existing class to correct, based on a vocabulary.
+
+ Parameters
+ ----------
+ label : str
+ Label of the class
+ vocab : List[str]
+ Vocabulary as a list
+ append : bool, optional
+ Whether to append to the existing vocabulary, by default True
+ """
+ self._call_java('updateVocabClass', label, vocab, append)
+ return self
+
+ def setCorrectSymbols(self, value):
+ """Sets whether to correct special symbols or skip spell checking for
+ them.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to correct special symbols or skip spell checking for
+ them
+ """
+ return self._set(correctSymbols=value)
+
+ def setCompareLowcase(self, value):
+ """Sets whether to compare tokens in lower case with vocabulary.
+
+ Parameters
+ ----------
+ value : bool
+ Whether to compare tokens in lower case with vocabulary.
+ """
+ return self._set(compareLowcase=value)
+
+ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerModel",
+ java_model=None):
+ super(ContextSpellCheckerModel, self).__init__(
+ classname=classname,
+ java_model=java_model
+ )
+
+ @staticmethod
+ def pretrained(name="spellcheck_dl", lang="en", remote_loc=None):
+ """Downloads and loads a pretrained model.
+
+ Parameters
+ ----------
+ name : str, optional
+ Name of the pretrained model, by default "spellcheck_dl"
+ lang : str, optional
+ Language of the pretrained model, by default "en"
+ remote_loc : str, optional
+ Optional remote address of the resource, by default None. Will use
+ Spark NLP's repositories otherwise.
+
+ Returns
+ -------
+ ContextSpellCheckerModel
+ The restored model
+ """
+ from sparknlp.pretrained import ResourceDownloader
+ return ResourceDownloader.downloadModel(ContextSpellCheckerModel, name, lang, remote_loc)
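Beyond the DataFrame example in the ContextSpellCheckerModel docstring above, a fitted pipeline can also be wrapped in Spark NLP's LightPipeline to correct plain strings without building a DataFrame. A minimal sketch, reusing the pipeline and data names from that docstring example (the exact corrections depend on the pretrained "spellcheck_dl" model):

from sparknlp.base import LightPipeline

# Fit once on any DataFrame that has a "text" column, then annotate raw strings.
light = LightPipeline(pipeline.fit(data))
annotations = light.annotate("It was a cold , dreary day and the country was white with smow .")
print(annotations["checked"])  # corrected tokens from the "checked" output column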