spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
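A note on the layout change visible in the listing above: the single-module layout of 2.6.3rc1 (sparknlp/annotator.py, sparknlp/base.py, sparknlp/internal.py, ...) is replaced in 6.2.1 by subpackages (sparknlp/annotator/, sparknlp/base/, sparknlp/internal/, ...). The short sketch below is illustrative and not part of the diff; it only shows that the top-level import paths used throughout the docstrings in this release (sparknlp.base, sparknlp.annotator) still resolve after the split.

# Illustrative only (not part of the packaged sources): the public import paths
# used in the 6.2.1 docstrings continue to work even though the annotators now
# live in the sparknlp/annotator/ subpackages listed above.
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import Tokenizer, ContextSpellCheckerModel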
sparknlp/annotator/spell_check/context_spell_checker.py

@@ -0,0 +1,911 @@
# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the ContextSpellChecker."""

from sparknlp.common import *


class ContextSpellCheckerApproach(AnnotatorApproach):
    """Trains a deep-learning based Noisy Channel Model Spell Algorithm.

    Correction candidates are extracted combining context information and word
    information.

    For instantiated/pretrained models, see :class:`.ContextSpellCheckerModel`.

    Spell Checking is a sequence to sequence mapping problem. Given an input
    sequence, potentially containing a certain number of errors,
    ``ContextSpellChecker`` will rank correction sequences according to three
    things:

    #. Different correction candidates for each word — **word level**.
    #. The surrounding text of each word, i.e. it's context —
       **sentence level**.
    #. The relative cost of different correction candidates according to the
       edit operations at the character level it requires — **subword level**.

    For extended examples of usage, see the article
    `Training a Contextual Spell Checker for Italian Language <https://towardsdatascience.com/training-a-contextual-spell-checker-for-italian-language-66dda528e4bf>`__,
    the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    languageModelClasses
        Number of classes to use during factorization of the softmax output in
        the LM.
    wordMaxDistance
        Maximum distance for the generated candidates for every word.
    maxCandidates
        Maximum number of candidates for every word.
    caseStrategy
        What case combinations to try when generating candidates, by default 2.
        Possible values are:

        - 0: All uppercase letters
        - 1: First letter capitalized
        - 2: All letters
    errorThreshold
        Threshold perplexity for a word to be considered as an error.
    epochs
        Number of epochs to train the language model.
    batchSize
        Batch size for the training in NLM.
    initialRate
        Initial learning rate for the LM.
    finalRate
        Final learning rate for the LM.
    validationFraction
        Percentage of datapoints to use for validation.
    minCount
        Min number of times a token should appear to be included in vocab.
    compoundCount
        Min number of times a compound word should appear to be included in
        vocab.
    classCount
        Min number of times the word need to appear in corpus to not be
        considered of a special class.
    tradeoff
        Tradeoff between the cost of a word error and a transition in the
        language model.
    weightedDistPath
        The path to the file containing the weights for the levenshtein
        distance.
    maxWindowLen
        Maximum size for the window used to remember history prior to every
        correction.
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.
    maxSentLen
        Maximum length for a sentence - internal use during training.
    graphFolder
        Folder path that contain external graph files.

    References
    ----------
    For an in-depth explanation of the module see the article
    `Applying Context Aware Spell Checking in Spark NLP <https://medium.com/spark-nlp/applying-context-aware-spell-checking-in-spark-nlp-3c29c46963bc>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline

    For this example, we use the first Sherlock Holmes book as the training dataset.

    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols("document") \\
    ...     .setOutputCol("token")
    >>> spellChecker = ContextSpellCheckerApproach() \\
    ...     .setInputCols("token") \\
    ...     .setOutputCol("corrected") \\
    ...     .setWordMaxDistance(3) \\
    ...     .setBatchSize(24) \\
    ...     .setEpochs(8) \\
    ...     .setLanguageModelClasses(1650)  # dependant on vocabulary size
    ...     # .addVocabClass("_NAME_", names) # Extra classes for correction could be added like this
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> path = "sherlockholmes.txt"
    >>> dataset = spark.read.text(path) \\
    ...     .toDF("text")
    >>> pipelineModel = pipeline.fit(dataset)

    See Also
    --------
    NorvigSweetingApproach, SymmetricDeleteApproach : For alternative approaches to spell checking
    """

    name = "ContextSpellCheckerApproach"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    languageModelClasses = Param(Params._dummy(),
                                 "languageModelClasses",
                                 "Number of classes to use during factorization of the softmax output in the LM.",
                                 typeConverter=TypeConverters.toInt)

    wordMaxDistance = Param(Params._dummy(),
                            "wordMaxDistance",
                            "Maximum distance for the generated candidates for every word.",
                            typeConverter=TypeConverters.toInt)

    maxCandidates = Param(Params._dummy(),
                          "maxCandidates",
                          "Maximum number of candidates for every word.",
                          typeConverter=TypeConverters.toInt)

    caseStrategy = Param(Params._dummy(),
                         "caseStrategy",
                         "What case combinations to try when generating candidates.",
                         typeConverter=TypeConverters.toInt)

    errorThreshold = Param(Params._dummy(),
                           "errorThreshold",
                           "Threshold perplexity for a word to be considered as an error.",
                           typeConverter=TypeConverters.toFloat)

    epochs = Param(Params._dummy(),
                   "epochs",
                   "Number of epochs to train the language model.",
                   typeConverter=TypeConverters.toInt)

    batchSize = Param(Params._dummy(),
                      "batchSize",
                      "Batch size for the training in NLM.",
                      typeConverter=TypeConverters.toInt)

    initialRate = Param(Params._dummy(),
                        "initialRate",
                        "Initial learning rate for the LM.",
                        typeConverter=TypeConverters.toFloat)

    finalRate = Param(Params._dummy(),
                      "finalRate",
                      "Final learning rate for the LM.",
                      typeConverter=TypeConverters.toFloat)

    validationFraction = Param(Params._dummy(),
                               "validationFraction",
                               "Percentage of datapoints to use for validation.",
                               typeConverter=TypeConverters.toFloat)

    minCount = Param(Params._dummy(),
                     "minCount",
                     "Min number of times a token should appear to be included in vocab.",
                     typeConverter=TypeConverters.toFloat)

    compoundCount = Param(Params._dummy(),
                          "compoundCount",
                          "Min number of times a compound word should appear to be included in vocab.",
                          typeConverter=TypeConverters.toInt)

    classCount = Param(Params._dummy(),
                       "classCount",
                       "Min number of times the word need to appear in corpus to not be considered of a special class.",
                       typeConverter=TypeConverters.toFloat)

    tradeoff = Param(Params._dummy(),
                     "tradeoff",
                     "Tradeoff between the cost of a word error and a transition in the language model.",
                     typeConverter=TypeConverters.toFloat)

    weightedDistPath = Param(Params._dummy(),
                             "weightedDistPath",
                             "The path to the file containing the weights for the levenshtein distance.",
                             typeConverter=TypeConverters.toString)

    maxWindowLen = Param(Params._dummy(),
                         "maxWindowLen",
                         "Maximum size for the window used to remember history prior to every correction.",
                         typeConverter=TypeConverters.toInt)

    configProtoBytes = Param(Params._dummy(), "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListInt)

    maxSentLen = Param(Params._dummy(),
                       "maxSentLen",
                       "Maximum length of a sentence to be considered for training.",
                       typeConverter=TypeConverters.toInt)

    graphFolder = Param(Params._dummy(),
                        "graphFolder",
                        "Folder path that contain external graph files.",
                        typeConverter=TypeConverters.toString)

    def setLanguageModelClasses(self, count):
        """Sets number of classes to use during factorization of the softmax
        output in the Language Model.

        Parameters
        ----------
        count : int
            Number of classes
        """
        return self._set(languageModelClasses=count)

    def setWordMaxDistance(self, dist):
        """Sets maximum distance for the generated candidates for every word.

        Parameters
        ----------
        dist : int
            Maximum distance for the generated candidates for every word
        """
        return self._set(wordMaxDistance=dist)

    def setMaxCandidates(self, candidates):
        """Sets maximum number of candidates for every word.

        Parameters
        ----------
        candidates : int
            Maximum number of candidates for every word.
        """
        return self._set(maxCandidates=candidates)

    def setCaseStrategy(self, strategy):
        """Sets what case combinations to try when generating candidates.

        Possible values are:

        - 0: All uppercase letters
        - 1: First letter capitalized
        - 2: All letters

        Parameters
        ----------
        strategy : int
            Case combinations to try when generating candidates
        """
        return self._set(caseStrategy=strategy)

    def setErrorThreshold(self, threshold):
        """Sets threshold perplexity for a word to be considered as an error.

        Parameters
        ----------
        threshold : float
            Threshold perplexity for a word to be considered as an error
        """
        return self._set(errorThreshold=threshold)

    def setEpochs(self, count):
        """Sets number of epochs to train the language model.

        Parameters
        ----------
        count : int
            Number of epochs
        """
        return self._set(epochs=count)

    def setBatchSize(self, size):
        """Sets batch size.

        Parameters
        ----------
        size : int
            Batch size
        """
        return self._set(batchSize=size)

    def setInitialRate(self, rate):
        """Sets initial learning rate for the LM.

        Parameters
        ----------
        rate : float
            Initial learning rate for the LM
        """
        return self._set(initialRate=rate)

    def setFinalRate(self, rate):
        """Sets final learning rate for the LM.

        Parameters
        ----------
        rate : float
            Final learning rate for the LM
        """
        return self._set(finalRate=rate)

    def setValidationFraction(self, fraction):
        """Sets percentage of datapoints to use for validation.

        Parameters
        ----------
        fraction : float
            Percentage of datapoints to use for validation
        """
        return self._set(validationFraction=fraction)

    def setMinCount(self, count):
        """Sets min number of times a token should appear to be included in
        vocab.

        Parameters
        ----------
        count : float
            Min number of times a token should appear to be included in vocab
        """
        return self._set(minCount=count)

    def setCompoundCount(self, count):
        """Sets min number of times a compound word should appear to be included
        in vocab.

        Parameters
        ----------
        count : int
            Min number of times a compound word should appear to be included in
            vocab.
        """
        return self._set(compoundCount=count)

    def setClassCount(self, count):
        """Sets min number of times the word need to appear in corpus to not be
        considered of a special class.

        Parameters
        ----------
        count : float
            Min number of times the word need to appear in corpus to not be
            considered of a special class.
        """

        return self._set(classCount=count)

    def setTradeoff(self, alpha):
        """Sets tradeoff between the cost of a word error and a transition in
        the language model.

        Parameters
        ----------
        alpha : float
            Tradeoff between the cost of a word error and a transition in the
            language model
        """
        return self._set(tradeoff=alpha)

    def setWeightedDistPath(self, path):
        """Sets the path to the file containing the weights for the levenshtein
        distance.

        Parameters
        ----------
        path : str
            Path to the file containing the weights for the levenshtein
            distance.
        """
        return self._set(weightedDistPath=path)

    def setMaxWindowLen(self, length):
        """Sets the maximum size for the window used to remember history prior
        to every correction.

        Parameters
        ----------
        length : int
            Maximum size for the window used to remember history prior to
            every correction
        """
        return self._set(maxWindowLen=length)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    def setGraphFolder(self, path):
        """Sets folder path that contain external graph files.

        Parameters
        ----------
        path : str
            Folder path that contain external graph files.
        """
        return self._set(graphFolder=path)

    def setMaxSentLen(self, sentlen):
        """Sets the maximum length of a sentence.

        Parameters
        ----------
        sentlen : int
            Maximum length of a sentence
        """
        return self._set(maxSentLen=sentlen)

    def addVocabClass(self, label, vocab, userdist=3):
        """Adds a new class of words to correct, based on a vocabulary.

        Parameters
        ----------
        label : str
            Name of the class
        vocab : List[str]
            Vocabulary as a list
        userdist : int, optional
            Maximal distance to the word, by default 3
        """
        self._call_java('addVocabClass', label, vocab, userdist)
        return self

    def addRegexClass(self, label, regex, userdist=3):
        """Adds a new class of words to correct, based on regex.

        Parameters
        ----------
        label : str
            Name of the class
        regex : str
            Regex to add
        userdist : int, optional
            Maximal distance to the word, by default 3
        """
        self._call_java('addRegexClass', label, regex, userdist)
        return self

    @keyword_only
    def __init__(self):
        super(ContextSpellCheckerApproach, self). \
            __init__(classname="com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerApproach")

    def _create_model(self, java_model):
        return ContextSpellCheckerModel(java_model=java_model)


class ContextSpellCheckerModel(AnnotatorModel, HasEngine):
    """Implements a deep-learning based Noisy Channel Model Spell Algorithm.
    Correction candidates are extracted combining context information and word
    information.

    Spell Checking is a sequence to sequence mapping problem. Given an input
    sequence, potentially containing a certain number of errors,
    ``ContextSpellChecker`` will rank correction sequences according to three
    things:

    #. Different correction candidates for each word — **word level**.
    #. The surrounding text of each word, i.e. it's context —
       **sentence level**.
    #. The relative cost of different correction candidates according to the
       edit operations at the character level it requires — **subword level**.

    This is the instantiated model of the :class:`.ContextSpellCheckerApproach`.
    For training your own model, please see the documentation of that class.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> spellChecker = ContextSpellCheckerModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("checked")


    The default model is ``"spellcheck_dl"``, if no name is provided.
    For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Spell+Check>`__.

    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    wordMaxDistance
        Maximum distance for the generated candidates for every word.
    maxCandidates
        Maximum number of candidates for every word.
    caseStrategy
        What case combinations to try when generating candidates.
    errorThreshold
        Threshold perplexity for a word to be considered as an error.
    tradeoff
        Tradeoff between the cost of a word error and a transition in the
        language model.
    maxWindowLen
        Maximum size for the window used to remember history prior to every
        correction.
    gamma
        Controls the influence of individual word frequency in the decision.
    correctSymbols
        Whether to correct special symbols or skip spell checking for them
    compareLowcase
        If true will compare tokens in low case with vocabulary.
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.
    vocabFreq
        Frequency words from the vocabulary.
    idsVocab
        Mapping of ids to vocabulary.
    vocabIds
        Mapping of vocabulary to ids.
    classes
        Classes the spell checker recognizes.
    weights
        Levenshtein weights.
    useNewLines
        When set to true new lines will be treated as any other character. When set to false correction is applied on paragraphs as defined by newline characters.


    References
    ----------
    For an in-depth explanation of the module see the article `Applying Context
    Aware Spell Checking in Spark NLP
    <https://medium.com/spark-nlp/applying-context-aware-spell-checking-in-spark-nlp-3c29c46963bc>`__.


    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("doc")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["doc"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = ContextSpellCheckerModel \\
    ...     .pretrained() \\
    ...     .setTradeoff(12.0) \\
    ...     .setInputCols("token") \\
    ...     .setOutputCol("checked")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> data = spark.createDataFrame([["It was a cold , dreary day and the country was white with smow ."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("checked.result").show(truncate=False)
    +--------------------------------------------------------------------------------+
    |result                                                                          |
    +--------------------------------------------------------------------------------+
    |[It, was, a, cold, ,, dreary, day, and, the, country, was, white, with, snow, .]|
    +--------------------------------------------------------------------------------+

    See Also
    --------
    NorvigSweetingModel, SymmetricDeleteModel: For alternative approaches to spell checking
    """
    name = "ContextSpellCheckerModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    wordMaxDistance = Param(Params._dummy(),
                            "wordMaxDistance",
                            "Maximum distance for the generated candidates for every word.",
                            typeConverter=TypeConverters.toInt)

    maxCandidates = Param(Params._dummy(),
                          "maxCandidates",
                          "Maximum number of candidates for every word.",
                          typeConverter=TypeConverters.toInt)

    caseStrategy = Param(Params._dummy(),
                         "caseStrategy",
                         "What case combinations to try when generating candidates.",
                         typeConverter=TypeConverters.toInt)

    errorThreshold = Param(Params._dummy(),
                           "errorThreshold",
                           "Threshold perplexity for a word to be considered as an error.",
                           typeConverter=TypeConverters.toFloat)

    tradeoff = Param(Params._dummy(),
                     "tradeoff",
                     "Tradeoff between the cost of a word error and a transition in the language model.",
                     typeConverter=TypeConverters.toFloat)

    maxWindowLen = Param(Params._dummy(),
                         "maxWindowLen",
                         "Maximum size for the window used to remember history prior to every correction.",
                         typeConverter=TypeConverters.toInt)

    gamma = Param(Params._dummy(),
                  "gamma",
                  "Controls the influence of individual word frequency in the decision.",
                  typeConverter=TypeConverters.toFloat)

    correctSymbols = Param(Params._dummy(), "correctSymbols",
                           "Whether to correct special symbols or skip spell checking for them",
                           typeConverter=TypeConverters.toBoolean)

    compareLowcase = Param(Params._dummy(), "compareLowcase", "If true will compare tokens in low case with vocabulary",
                           typeConverter=TypeConverters.toBoolean)

    configProtoBytes = Param(Params._dummy(), "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListInt)

    vocabFreq = Param(
        Params._dummy(),
        "vocabFreq",
        "Frequency words from the vocabulary.",
        TypeConverters.identity,
    )
    idsVocab = Param(
        Params._dummy(),
        "idsVocab",
        "Mapping of ids to vocabulary.",
        TypeConverters.identity,
    )
    vocabIds = Param(
        Params._dummy(),
        "vocabIds",
        "Mapping of vocabulary to ids.",
        TypeConverters.identity,
    )
    classes = Param(
        Params._dummy(),
        "classes",
        "Classes the spell checker recognizes.",
        TypeConverters.identity,
    )

    def setWordMaxDistance(self, dist):
        """Sets maximum distance for the generated candidates for every word.

        Parameters
        ----------
        dist : int
            Maximum distance for the generated candidates for every word.
        """
        return self._set(wordMaxDistance=dist)

    def setMaxCandidates(self, candidates):
        """Sets maximum number of candidates for every word.

        Parameters
        ----------
        candidates : int
            Maximum number of candidates for every word.
        """
        return self._set(maxCandidates=candidates)

    def setCaseStrategy(self, strategy):
        """Sets what case combinations to try when generating candidates.

        Parameters
        ----------
        strategy : int
            Case combinations to try when generating candidates.
        """
        return self._set(caseStrategy=strategy)

    def setErrorThreshold(self, threshold):
        """Sets threshold perplexity for a word to be considered as an error.

        Parameters
        ----------
        threshold : float
            Threshold perplexity for a word to be considered as an error
        """
        return self._set(errorThreshold=threshold)

    def setTradeoff(self, alpha):
        """Sets tradeoff between the cost of a word error and a transition in the
        language model.

        Parameters
        ----------
        alpha : float
            Tradeoff between the cost of a word error and a transition in the
            language model
        """
        return self._set(tradeoff=alpha)

    def setWeights(self, weights):
        """Sets weights of each word for Levenshtein distance.

        Parameters
        ----------
        weights : Dict[str, float]
            Weights for Levenshtein distance as a mapping.
        """
        self._call_java('setWeights', weights)

    def setMaxWindowLen(self, length):
        """Sets the maximum size for the window used to remember history prior to
        every correction.

        Parameters
        ----------
        length : int
            Maximum size for the window used to remember history prior to
            every correction
        """
        return self._set(maxWindowLen=length)

    def setGamma(self, g):
        """Sets the influence of individual word frequency in the decision.

        Parameters
        ----------
        g : float
            Controls the influence of individual word frequency in the decision.
        """
        return self._set(gamma=g)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    def setVocabFreq(self, value: dict):
        """Sets frequency words from the vocabulary.

        Parameters
        ----------
        value : dict
            Frequency words from the vocabulary.
        """
        return self._set(vocabFreq=value)

    def setIdsVocab(self, idsVocab: dict):
        """Sets mapping of ids to vocabulary.

        Parameters
        ----------
        idsVocab : dict
            Mapping of ids to vocabulary.
        """
        return self._set(idsVocab=idsVocab)

    def setVocabIds(self, vocabIds: dict):
        """Sets mapping of vocabulary to ids.

        Parameters
        ----------
        vocabIds : dict
            Mapping of vocabulary to ids.
        """
        return self._set(vocabIds=vocabIds)

    def setClasses(self, value):
        """Sets classes the spell checker recognizes.

        Parameters
        ----------
        value : list
            Classes the spell checker recognizes.
        """
        return self._set(classes=value)

    def getWordClasses(self):
        """Gets the classes of words to be corrected.

        Returns
        -------
        List[str]
            Classes of words to be corrected
        """
        it = self._call_java('getWordClasses').toIterator()
        result = []
        while (it.hasNext()):
            result.append(it.next().toString())
        return result

    def updateRegexClass(self, label, regex):
        """Update existing class to correct, based on regex

        Parameters
        ----------
        label : str
            Label of the class
        regex : str
            Regex to parse the class
        """
        self._call_java('updateRegexClass', label, regex)
        return self

    def updateVocabClass(self, label, vocab, append=True):
        """Update existing class to correct, based on a vocabulary.

        Parameters
        ----------
        label : str
            Label of the class
        vocab : List[str]
            Vocabulary as a list
        append : bool, optional
            Whether to append to the existing vocabulary, by default True
        """
        self._call_java('updateVocabClass', label, vocab, append)
        return self

    def setCorrectSymbols(self, value):
        """Sets whether to correct special symbols or skip spell checking for
        them.

        Parameters
        ----------
        value : bool
            Whether to correct special symbols or skip spell checking for
            them
        """
        return self._set(correctSymbols=value)

    def setCompareLowcase(self, value):
        """Sets whether to compare tokens in lower case with vocabulary.

        Parameters
        ----------
        value : bool
            Whether to compare tokens in lower case with vocabulary.
        """
        return self._set(compareLowcase=value)

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerModel",
                 java_model=None):
        super(ContextSpellCheckerModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="spellcheck_dl", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "spellcheck_dl"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        ContextSpellCheckerModel
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(ContextSpellCheckerModel, name, lang, remote_loc)
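For orientation, a minimal end-to-end sketch of the annotator defined in the file above, assembled from its own docstring examples. It is not part of the packaged sources and assumes a working PySpark installation plus network access for sparknlp.start() and the default pretrained "spellcheck_dl" model.

# Minimal usage sketch (illustrative, not part of the diff) for ContextSpellCheckerModel,
# based on the docstring examples in sparknlp/annotator/spell_check/context_spell_checker.py.
import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import Tokenizer, ContextSpellCheckerModel
from pyspark.ml import Pipeline

spark = sparknlp.start()  # local SparkSession with the Spark NLP jar attached

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")
spellChecker = ContextSpellCheckerModel.pretrained() \
    .setInputCols(["token"]) \
    .setOutputCol("checked")

pipeline = Pipeline(stages=[documentAssembler, tokenizer, spellChecker])
data = spark.createDataFrame([["The country was white with smow ."]]).toDF("text")
model = pipeline.fit(data)

# DataFrame path: corrected tokens land in the "checked" column.
model.transform(data).selectExpr("checked.result").show(truncate=False)

# LightPipeline (from sparknlp/base/light_pipeline.py above) is convenient for
# ad-hoc strings without building a DataFrame.
light = LightPipeline(model)
print(light.annotate("The country was white with smow .")["checked"])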