spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
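Before the file-by-file listing, a minimal sketch (not part of the diff) of how the upgraded wheel is typically installed and verified; it assumes a working PySpark environment and uses only the public sparknlp.start()/version() helpers:

# pip install --upgrade spark-nlp==6.2.1 pyspark

import sparknlp

spark = sparknlp.start()   # starts (or returns) a SparkSession with the matching Spark NLP jar
print(sparknlp.version())  # expected to report 6.2.1 after the upgrade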
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator/spell_check/norvig_sweeting.py
@@ -0,0 +1,358 @@

# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the NorvigSweeting spell checker."""

from sparknlp.common import *


class NorvigSweetingApproach(AnnotatorApproach):
    """Trains annotator, that retrieves tokens and makes corrections automatically if
    not found in an English dictionary, based on the algorithm by Peter Norvig.

    The algorithm is based on a Bayesian approach to spell checking: Given the word we
    look in the provided dictionary to choose the word with the highest probability
    to be the correct one.

    A dictionary of correct spellings must be provided with :meth:`.setDictionary` in
    the form of a text file, where each word is parsed by a regex pattern.

    For instantiated/pretrained models, see :class:`.NorvigSweetingModel`.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    dictionary
        Dictionary needs 'tokenPattern' regex in dictionary for separating words
    caseSensitive
        Whether to ignore case sensitivity, by default False
    doubleVariants
        Whether to use more expensive spell checker, by default False

        Increase search at cost of performance. Enables extra check for word
        combinations.
    shortCircuit
        Whether to use faster mode, by default False

        Increase performance at cost of accuracy. Faster but less accurate.
    frequencyPriority
        Applies frequency over hamming in intersections, when false hamming
        takes priority, by default True
    wordSizeIgnore
        Minimum size of word before ignoring, by default 3
    dupsLimit
        Maximum duplicate of characters in a word to consider, by default 2
    reductLimit
        Word reductions limit, by default 3
    intersections
        Hamming intersections to attempt, by default 10
    vowelSwapLimit
        Vowel swap attempts, by default 6

    References
    ----------
    Inspired by the spell checker by Peter Norvig:
    `How to Write a Spelling Corrector <https://norvig.com/spell-correct.html>`__

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline

    In this example, the dictionary ``"words.txt"`` has the form of::

        ...
        gummy
        gummic
        gummier
        gummiest
        gummiferous
        ...

    This dictionary is then set to be the basis of the spell checker.

    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = NorvigSweetingApproach() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell") \\
    ...     .setDictionary("src/test/resources/spell/words.txt")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> pipelineModel = pipeline.fit(trainingData)

    See Also
    --------
    SymmetricDeleteApproach : for an alternative approach to spell checking
    ContextSpellCheckerApproach : for a DL based approach
    """
    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    dictionary = Param(Params._dummy(),
                       "dictionary",
                       "dictionary needs 'tokenPattern' regex in dictionary for separating words",
                       typeConverter=TypeConverters.identity)

    caseSensitive = Param(Params._dummy(),
                          "caseSensitive",
                          "whether to ignore case sensitivty",
                          typeConverter=TypeConverters.toBoolean)

    doubleVariants = Param(Params._dummy(),
                           "doubleVariants",
                           "whether to use more expensive spell checker",
                           typeConverter=TypeConverters.toBoolean)

    shortCircuit = Param(Params._dummy(),
                         "shortCircuit",
                         "whether to use faster mode",
                         typeConverter=TypeConverters.toBoolean)

    frequencyPriority = Param(Params._dummy(),
                              "frequencyPriority",
                              "applies frequency over hamming in intersections. When false hamming takes priority",
                              typeConverter=TypeConverters.toBoolean)

    wordSizeIgnore = Param(Params._dummy(),
                           "wordSizeIgnore",
                           "minimum size of word before ignoring. Defaults to 3",
                           typeConverter=TypeConverters.toInt)

    dupsLimit = Param(Params._dummy(),
                      "dupsLimit",
                      "maximum duplicate of characters in a word to consider. Defaults to 2",
                      typeConverter=TypeConverters.toInt)

    reductLimit = Param(Params._dummy(),
                        "reductLimit",
                        "word reductions limit. Defaults to 3",
                        typeConverter=TypeConverters.toInt)

    intersections = Param(Params._dummy(),
                          "intersections",
                          "hamming intersections to attempt. Defaults to 10",
                          typeConverter=TypeConverters.toInt)

    vowelSwapLimit = Param(Params._dummy(),
                           "vowelSwapLimit",
                           "vowel swap attempts. Defaults to 6",
                           typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(NorvigSweetingApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach")
        self._setDefault(caseSensitive=False, doubleVariants=False, shortCircuit=False, wordSizeIgnore=3, dupsLimit=2,
                         reductLimit=3, intersections=10, vowelSwapLimit=6, frequencyPriority=True)
        self.dictionary_path = ""

    def setDictionary(self, path, token_pattern="\S+", read_as=ReadAs.TEXT, options={"format": "text"}):
        """Sets dictionary which needs 'tokenPattern' regex for separating
        words.

        Parameters
        ----------
        path : str
            Path to the source file
        token_pattern : str, optional
            Pattern for token separation, by default ``\\S+``
        read_as : str, optional
            How to read the file, by default ReadAs.TEXT
        options : dict, optional
            Options to read the resource, by default {"format": "text"}
        """
        self.dictionary_path = path
        opts = options.copy()
        if "tokenPattern" not in opts:
            opts["tokenPattern"] = token_pattern
        return self._set(dictionary=ExternalResource(path, read_as, opts))

    def setCaseSensitive(self, value):
        """Sets whether to ignore case sensitivity, by default False.

        Parameters
        ----------
        value : bool
            Whether to ignore case sensitivity
        """
        return self._set(caseSensitive=value)

    def setDoubleVariants(self, value):
        """Sets whether to use more expensive spell checker, by default False.

        Increase search at cost of performance. Enables extra check for word
        combinations.

        Parameters
        ----------
        value : bool
            [description]
        """
        return self._set(doubleVariants=value)

    def setShortCircuit(self, value):
        """Sets whether to use faster mode, by default False.

        Increase performance at cost of accuracy. Faster but less accurate.

        Parameters
        ----------
        value : bool
            Whether to use faster mode
        """
        return self._set(shortCircuit=value)

    def setFrequencyPriority(self, value):
        """Sets whether to consider frequency over hamming in intersections,
        when false hamming takes priority, by default True.

        Parameters
        ----------
        value : bool
            Whether to consider frequency over hamming in intersections
        """
        return self._set(frequencyPriority=value)

    def _create_model(self, java_model):
        return NorvigSweetingModel(java_model=java_model)


class NorvigSweetingModel(AnnotatorModel):
    """This annotator retrieves tokens and makes corrections automatically if
    not found in an English dictionary.

    The Symmetric Delete spelling correction algorithm reduces the complexity of
    edit candidate generation and dictionary lookup for a given
    Damerau-Levenshtein distance. It is six orders of magnitude faster (than the
    standard approach with deletes + transposes + replaces + inserts) and
    language independent.

    This is the instantiated model of the :class:`.NorvigSweetingApproach`. For
    training your own model, please see the documentation of that class.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> spellChecker = NorvigSweetingModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell") \\

    The default model is ``"spellcheck_norvig"``, if no name is provided. For
    available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Spell+Check>`__.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    None

    References
    ----------
    Inspired by Norvig model and `SymSpell
    <https://github.com/wolfgarbe/SymSpell>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = NorvigSweetingModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> data = spark.createDataFrame([["somtimes i wrrite wordz erong."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("spell.result").show(truncate=False)
    +--------------------------------------+
    |result                                |
    +--------------------------------------+
    |[sometimes, i, write, words, wrong, .]|
    +--------------------------------------+

    See Also
    --------
    SymmetricDeleteModel : for an alternative approach to spell checking
    ContextSpellCheckerModel : for a DL based approach
    """
    name = "NorvigSweetingModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel", java_model=None):
        super(NorvigSweetingModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="spellcheck_norvig", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "spellcheck_norvig"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        NorvigSweetingModel
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(NorvigSweetingModel, name, lang, remote_loc)
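A hedged sketch (not part of the package diff) of how the NorvigSweetingModel added above can be exercised end to end. The stages and the default pretrained name "spellcheck_norvig" follow the docstring above; the use of LightPipeline for single-string inference and the assumption of a local environment with internet access for the model download are additions for illustration only.

import sparknlp
from sparknlp.base import DocumentAssembler, LightPipeline
from sparknlp.annotator import Tokenizer, NorvigSweetingModel
from pyspark.ml import Pipeline

spark = sparknlp.start()

documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
spellChecker = NorvigSweetingModel.pretrained().setInputCols(["token"]).setOutputCol("spell")

pipeline = Pipeline(stages=[documentAssembler, tokenizer, spellChecker])
empty = spark.createDataFrame([[""]]).toDF("text")  # pretrained stages need no training data
light = LightPipeline(pipeline.fit(empty))

print(light.annotate("somtimes i wrrite wordz erong.")["spell"])
# expected, per the docstring example: ['sometimes', 'i', 'write', 'words', 'wrong', '.']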
sparknlp/annotator/spell_check/symmetric_delete.py
@@ -0,0 +1,299 @@

# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for SymmetricDelete."""

from sparknlp.common import *


class SymmetricDeleteApproach(AnnotatorApproach):
    """Trains a Symmetric Delete spelling correction algorithm. Retrieves tokens
    and utilizes distance metrics to compute possible derived words.

    The Symmetric Delete spelling correction algorithm reduces the complexity of edit
    candidate generation and dictionary lookup for a given Damerau-Levenshtein distance.
    It is six orders of magnitude faster (than the standard approach with deletes +
    transposes + replaces + inserts) and language independent.

    A dictionary of correct spellings must be provided with :meth:`.setDictionary` in
    the form of a text file, where each word is parsed by a regex pattern.

    For instantiated/pretrained models, see :class:`.SymmetricDeleteModel`.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    dictionary
        folder or file with text that teaches about the language
    maxEditDistance
        max edit distance characters to derive strings from a word, by default 3
    frequencyThreshold
        minimum frequency of words to be considered from training, by default 0
    deletesThreshold
        minimum frequency of corrections a word needs to have to be considered
        from training, by default 0

    References
    ----------
    Inspired by `SymSpell <https://github.com/wolfgarbe/SymSpell>`__.

    Examples
    --------
    In this example, the dictionary ``"words.txt"`` has the form of::

        ...
        gummy
        gummic
        gummier
        gummiest
        gummiferous
        ...

    This dictionary is then set to be the basis of the spell checker.

    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = SymmetricDeleteApproach() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell") \\
    ...     .setDictionary("src/test/resources/spell/words.txt")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> pipelineModel = pipeline.fit(trainingData)

    See Also
    --------
    NorvigSweetingApproach : for an alternative approach to spell checking
    ContextSpellCheckerApproach : for a DL based approach
    """
    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    corpus = Param(Params._dummy(),
                   "corpus",
                   "folder or file with text that teaches about the language",
                   typeConverter=TypeConverters.identity)

    dictionary = Param(Params._dummy(),
                       "dictionary",
                       "folder or file with text that teaches about the language",
                       typeConverter=TypeConverters.identity)

    maxEditDistance = Param(Params._dummy(),
                            "maxEditDistance",
                            "max edit distance characters to derive strings from a word",
                            typeConverter=TypeConverters.toInt)

    frequencyThreshold = Param(Params._dummy(),
                               "frequencyThreshold",
                               "minimum frequency of words to be considered from training. " +
                               "Increase if training set is LARGE. Defaults to 0",
                               typeConverter=TypeConverters.toInt)

    deletesThreshold = Param(Params._dummy(),
                             "deletesThreshold",
                             "minimum frequency of corrections a word needs to have to be considered from training." +
                             "Increase if training set is LARGE. Defaults to 0",
                             typeConverter=TypeConverters.toInt)

    dupsLimit = Param(Params._dummy(),
                      "dupsLimit",
                      "maximum duplicate of characters in a word to consider. Defaults to 2",
                      typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(SymmetricDeleteApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteApproach")
        self._setDefault(maxEditDistance=3, frequencyThreshold=0, deletesThreshold=0, dupsLimit=2)
        self.dictionary_path = ""

    def setDictionary(self, path, token_pattern="\S+", read_as=ReadAs.TEXT, options={"format": "text"}):
        """Sets folder or file with text that teaches about the language.

        Parameters
        ----------
        path : str
            Path to the resource
        token_pattern : str, optional
            Regex patttern to extract tokens, by default "\S+"
        read_as : str, optional
            How to read the resource, by default ReadAs.TEXT
        options : dict, optional
            Options for reading the resource, by default {"format": "text"}
        """
        self.dictionary_path = path
        opts = options.copy()
        if "tokenPattern" not in opts:
            opts["tokenPattern"] = token_pattern
        return self._set(dictionary=ExternalResource(path, read_as, opts))

    def setMaxEditDistance(self, v):
        """Sets max edit distance characters to derive strings from a word, by
        default 3.

        Parameters
        ----------
        v : int
            Max edit distance characters to derive strings from a word
        """
        return self._set(maxEditDistance=v)

    def setFrequencyThreshold(self, v):
        """Sets minimum frequency of words to be considered from training, by
        default 0.

        Parameters
        ----------
        v : int
            Minimum frequency of words to be considered from training
        """
        return self._set(frequencyThreshold=v)

    def setDeletesThreshold(self, v):
        """Sets minimum frequency of corrections a word needs to have to be
        considered from training, by default 0.

        Parameters
        ----------
        v : int
            Minimum frequency of corrections a word needs to have to be
            considered from training
        """
        return self._set(deletesThreshold=v)

    def _create_model(self, java_model):
        return SymmetricDeleteModel(java_model=java_model)


class SymmetricDeleteModel(AnnotatorModel):
    """Symmetric Delete spelling correction algorithm.

    The Symmetric Delete spelling correction algorithm reduces the complexity of
    edit candidate generation and dictionary lookup for a given
    Damerau-Levenshtein distance. It is six orders of magnitude faster (than the
    standard approach with deletes + transposes + replaces + inserts) and
    language independent.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> spell = SymmetricDeleteModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell")

    The default model is ``"spellcheck_sd"``, if no name is provided. For
    available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Spell+Check>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    None

    References
    ----------
    Inspired by `SymSpell <https://github.com/wolfgarbe/SymSpell>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = SymmetricDeleteModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> data = spark.createDataFrame([["spmetimes i wrrite wordz erong."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("spell.result").show(truncate=False)
    +--------------------------------------+
    |result                                |
    +--------------------------------------+
    |[sometimes, i, write, words, wrong, .]|
    +--------------------------------------+

    See Also
    --------
    NorvigSweetingModel : for an alternative approach to spell checking
    ContextSpellCheckerModel : for a DL based approach
    """
    name = "SymmetricDeleteModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel",
                 java_model=None):
        super(SymmetricDeleteModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="spellcheck_sd", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "spellcheck_sd"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        SymmetricDeleteModel
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(SymmetricDeleteModel, name, lang, remote_loc)
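A hedged sketch (not part of the package diff) of training the SymmetricDeleteApproach added above on a plain-text word list, mirroring the docstring example. The path "words.txt" is a placeholder for any newline-separated dictionary; everything else uses only the setters defined in the source above.

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, SymmetricDeleteApproach
from pyspark.ml import Pipeline

spark = sparknlp.start()

documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# "words.txt" is a hypothetical placeholder for a newline-separated word list.
spellChecker = SymmetricDeleteApproach() \
    .setInputCols(["token"]) \
    .setOutputCol("spell") \
    .setDictionary("words.txt") \
    .setMaxEditDistance(3)

pipeline = Pipeline(stages=[documentAssembler, tokenizer, spellChecker])
data = spark.createDataFrame([["spmetimes i wrrite wordz erong."]]).toDF("text")
pipeline.fit(data).transform(data).select("spell.result").show(truncate=False)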