spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
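The listing above shows that the monolithic sparknlp/annotator.py module of 2.6.3rc1 (removed, -3006 lines) has been split into the sparknlp/annotator/ subpackage in 6.2.1. A minimal sketch of what this means for import paths, assuming the new sparknlp/annotator/__init__.py (+93 lines) re-exports the submodule contents, as its line count suggests:

    # Sketch only: assumes sparknlp/annotator/__init__.py re-exports the
    # annotator classes from the new submodules shown in the listing.
    from sparknlp.annotator import SentenceDetectorDLModel  # old-style flat import
    from sparknlp.annotator.sentence.sentence_detector_dl import (
        SentenceDetectorDLModel as Direct,  # new explicit submodule path
    )

    assert SentenceDetectorDLModel is Direct  # both names resolve to the same class

The three hunks below reproduce the new source files whose line counts match the listing: sparknlp/annotator/sentence/sentence_detector_dl.py (+467), sparknlp/annotator/sentiment/__init__.py (+17), and sparknlp/annotator/sentiment/sentiment_detector.py (+208).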
--- /dev/null
+++ sparknlp/annotator/sentence/sentence_detector_dl.py
@@ -0,0 +1,467 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for SentenceDetectorDl."""
+
+from sparknlp.common import *
+
+
+class SentenceDetectorDLApproach(AnnotatorApproach):
+    """Trains an annotator that detects sentence boundaries using a deep
+    learning approach.
+
+    Currently, only the CNN model is supported for training, but in the future
+    the architecture of the model can be set with :meth:`.setModel`.
+
+    For pretrained models see :class:`.SentenceDetectorDLModel`.
+
+    Each extracted sentence can be returned in an Array or exploded to separate
+    rows, if ``explodeSentences`` is set to ``True``.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DOCUMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+    modelArchitecture
+        Model architecture (CNN)
+    impossiblePenultimates
+        Impossible penultimates - list of strings which a sentence can't end
+        with
+    validationSplit
+        Choose the proportion of training dataset to be validated against the
+        model on each Epoch
+    epochsNumber
+        Number of epochs for the optimization process
+    outputLogsPath
+        Path to folder where logs will be saved. If no path is specified, no
+        logs are generated
+    explodeSentences
+        Whether to explode each sentence into a different row, for better
+        parallelization. Defaults to False.
+
+    References
+    ----------
+    The default model ``"cnn"`` is based on the paper `Deep-EOS: General-Purpose
+    Neural Networks for Sentence Boundary Detection (2020, Stefan Schweter,
+    Sajawel Ahmed)
+    <https://konvens.org/proceedings/2019/papers/KONVENS2019_paper_41.pdf>`__
+    using a CNN architecture. We also modified the original implementation a
+    little bit to cover broken sentences and some impossible end of line chars.
+
+    Examples
+    --------
+    The training process needs data, where each data point is a sentence.
+    In this example the ``train.txt`` file has the form of::
+
+        ...
+        Slightly more moderate language would make our present situation – namely the lack of progress – a little easier.
+        His political successors now have great responsibilities to history and to the heritage of values bequeathed to them by Nelson Mandela.
+        ...
+
+    where each line is one sentence.
+
+    Training can then be started like so:
+
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> trainingData = spark.read.text("train.txt").toDF("text")
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentenceDetector = SentenceDetectorDLApproach() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentences") \\
+    ...     .setEpochsNumber(100)
+    >>> pipeline = Pipeline().setStages([documentAssembler, sentenceDetector])
+    >>> model = pipeline.fit(trainingData)
+    """
+
+    name = "SentenceDetectorDLApproach"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    modelArchitecture = Param(Params._dummy(),
+                              "modelArchitecture",
+                              "Model architecture (CNN)",
+                              typeConverter=TypeConverters.toString)
+
+    impossiblePenultimates = Param(Params._dummy(),
+                                   "impossiblePenultimates",
+                                   "Impossible penultimates - list of strings which a sentence can't end with",
+                                   typeConverter=TypeConverters.toListString)
+
+    validationSplit = Param(Params._dummy(),
+                            "validationSplit",
+                            "Choose the proportion of training dataset to be validated against the model on each "
+                            "Epoch. The value should be between 0.0 and 1.0 and by default it is 0.0 and off.",
+                            TypeConverters.toFloat)
+
+    epochsNumber = Param(Params._dummy(),
+                         "epochsNumber",
+                         "Number of epochs for the optimization process",
+                         TypeConverters.toInt)
+
+    outputLogsPath = Param(Params._dummy(),
+                           "outputLogsPath",
+                           "Path to folder where logs will be saved. If no path is specified, no logs are generated",
+                           TypeConverters.toString)
+
+    explodeSentences = Param(Params._dummy(),
+                             "explodeSentences",
+                             "whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
+                             TypeConverters.toBoolean)
+
+    def setModel(self, model_architecture):
+        """Sets the Model architecture. Currently only ``"cnn"`` is available.
+
+        Parameters
+        ----------
+        model_architecture : str
+            Model architecture
+        """
+        return self._set(modelArchitecture=model_architecture)
+
+    def setValidationSplit(self, validation_split):
+        """Sets the proportion of training dataset to be validated against the
+        model on each Epoch, by default it is 0.0 and off. The value should be
+        between 0.0 and 1.0.
+
+        Parameters
+        ----------
+        validation_split : float
+            Proportion of training dataset to be validated
+        """
+        return self._set(validationSplit=validation_split)
+
+    def setEpochsNumber(self, epochs_number):
+        """Sets number of epochs to train.
+
+        Parameters
+        ----------
+        epochs_number : int
+            Number of epochs
+        """
+        return self._set(epochsNumber=epochs_number)
+
+    def setOutputLogsPath(self, output_logs_path):
+        """Sets folder path to save training logs.
+
+        Parameters
+        ----------
+        output_logs_path : str
+            Folder path to save training logs
+        """
+        return self._set(outputLogsPath=output_logs_path)
+
+    def setImpossiblePenultimates(self, impossible_penultimates):
+        """Sets impossible penultimates - list of strings which a sentence can't
+        end with.
+
+        Parameters
+        ----------
+        impossible_penultimates : List[str]
+            List of strings which a sentence can't end with
+
+        """
+        return self._set(impossiblePenultimates=impossible_penultimates)
+
+    def setExplodeSentences(self, value):
+        """Sets whether to explode each sentence into a different row, for
+        better parallelization, by default False.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to explode each sentence into a different row
+        """
+        return self._set(explodeSentences=value)
+
+    def _create_model(self, java_model):
+        return SentenceDetectorDLModel(java_model=java_model)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLApproach"):
+        super(SentenceDetectorDLApproach, self).__init__(classname=classname)
+
+
+class SentenceDetectorDLModel(AnnotatorModel, HasEngine):
+    """Annotator that detects sentence boundaries using a deep learning approach.
+
+    Instantiated Model of the :class:`.SentenceDetectorDLApproach`.
+    Detects sentence boundaries using a deep learning approach.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> sentenceDL = SentenceDetectorDLModel.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentencesDL")
+
+    The default model is ``"sentence_detector_dl"``, if no name is provided. For
+    available pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?task=Sentence+Detection>`__.
+
+    Each extracted sentence can be returned in an Array or exploded to separate
+    rows, if ``explodeSentences`` is set to ``true``.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/multilingual/SentenceDetectorDL.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DOCUMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+    modelArchitecture
+        Model architecture (CNN)
+    explodeSentences
+        whether to explode each sentence into a different row, for better
+        parallelization. Defaults to false.
+    customBounds
+        characters used to explicitly mark sentence bounds, by default []
+    useCustomBoundsOnly
+        Only utilize custom bounds in sentence detection, by default False
+    splitLength
+        length at which sentences will be forcibly split
+    minLength
+        Set the minimum allowed length for each sentence, by default 0
+    maxLength
+        Set the maximum allowed length for each sentence, by default 99999
+    impossiblePenultimates
+        Impossible penultimates - list of strings which a sentence can't end
+        with
+
+    Examples
+    --------
+    In this example, the normal `SentenceDetector` is compared to the
+    `SentenceDetectorDLModel`. In a pipeline, `SentenceDetectorDLModel` can be
+    used as a replacement for the `SentenceDetector`.
+
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentence = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentences")
+    >>> sentenceDL = SentenceDetectorDLModel \\
+    ...     .pretrained("sentence_detector_dl", "en") \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentencesDL")
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     sentence,
+    ...     sentenceDL
+    ... ])
+    >>> data = spark.createDataFrame([[\"\"\"John loves Mary.Mary loves Peter
+    ...     Peter loves Helen .Helen loves John;
+    ...     Total: four people involved.\"\"\"]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(sentences.result) as sentences").show(truncate=False)
+    +----------------------------------------------------------+
+    |sentences                                                 |
+    +----------------------------------------------------------+
+    |John loves Mary.Mary loves Peter\\n    Peter loves Helen .|
+    |Helen loves John;                                         |
+    |Total: four people involved.                              |
+    +----------------------------------------------------------+
+    >>> result.selectExpr("explode(sentencesDL.result) as sentencesDL").show(truncate=False)
+    +----------------------------+
+    |sentencesDL                 |
+    +----------------------------+
+    |John loves Mary.            |
+    |Mary loves Peter            |
+    |Peter loves Helen .         |
+    |Helen loves John;           |
+    |Total: four people involved.|
+    +----------------------------+
+    """
+    name = "SentenceDetectorDLModel"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    modelArchitecture = Param(Params._dummy(), "modelArchitecture", "Model architecture (CNN)",
+                              typeConverter=TypeConverters.toString)
+
+    explodeSentences = Param(Params._dummy(),
+                             "explodeSentences",
+                             "whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
+                             TypeConverters.toBoolean)
+
+    customBounds = Param(Params._dummy(),
+                         "customBounds",
+                         "characters used to explicitly mark sentence bounds",
+                         typeConverter=TypeConverters.toListString)
+
+    useCustomBoundsOnly = Param(Params._dummy(),
+                                "useCustomBoundsOnly",
+                                "Only utilize custom bounds in sentence detection",
+                                typeConverter=TypeConverters.toBoolean)
+
+    splitLength = Param(Params._dummy(),
+                        "splitLength",
+                        "length at which sentences will be forcibly split.",
+                        typeConverter=TypeConverters.toInt)
+
+    minLength = Param(Params._dummy(),
+                      "minLength",
+                      "Set the minimum allowed length for each sentence.",
+                      typeConverter=TypeConverters.toInt)
+
+    maxLength = Param(Params._dummy(),
+                      "maxLength",
+                      "Set the maximum allowed length for each sentence",
+                      typeConverter=TypeConverters.toInt)
+
+    impossiblePenultimates = Param(Params._dummy(),
+                                   "impossiblePenultimates",
+                                   "Impossible penultimates - list of strings which a sentence can't end with",
+                                   typeConverter=TypeConverters.toListString)
+
+    def setModel(self, modelArchitecture):
+        """Sets the Model architecture. Currently only ``"cnn"`` is available.
+
+        Parameters
+        ----------
+        modelArchitecture : str
+            Model architecture
+        """
+        return self._set(modelArchitecture=modelArchitecture)
+
+    def setExplodeSentences(self, value):
+        """Sets whether to explode each sentence into a different row, for
+        better parallelization, by default False.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to explode each sentence into a different row
+        """
+        return self._set(explodeSentences=value)
+
+    def setCustomBounds(self, value):
+        """Sets characters used to explicitly mark sentence bounds, by default
+        [].
+
+        Parameters
+        ----------
+        value : List[str]
+            Characters used to explicitly mark sentence bounds
+        """
+        return self._set(customBounds=value)
+
+    def setUseCustomBoundsOnly(self, value):
+        """Sets whether to only utilize custom bounds in sentence detection, by
+        default False.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to only utilize custom bounds
+        """
+        return self._set(useCustomBoundsOnly=value)
+
+    def setSplitLength(self, value):
+        """Sets length at which sentences will be forcibly split.
+
+        Parameters
+        ----------
+        value : int
+            Length at which sentences will be forcibly split.
+        """
+        return self._set(splitLength=value)
+
+    def setMinLength(self, value):
+        """Sets minimum allowed length for each sentence, by default 0
+
+        Parameters
+        ----------
+        value : int
+            Minimum allowed length for each sentence
+        """
+        return self._set(minLength=value)
+
+    def setMaxLength(self, value):
+        """Sets the maximum allowed length for each sentence, by default
+        99999
+
+        Parameters
+        ----------
+        value : int
+            Maximum allowed length for each sentence
+        """
+        return self._set(maxLength=value)
+
+    def setImpossiblePenultimates(self, impossible_penultimates):
+        """Sets impossible penultimates - list of strings which a sentence can't
+        end with.
+
+        Parameters
+        ----------
+        impossible_penultimates : List[str]
+            List of strings which a sentence can't end with
+
+        """
+        return self._set(impossiblePenultimates=impossible_penultimates)
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLModel",
+                 java_model=None):
+        super(SentenceDetectorDLModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            useCustomBoundsOnly=False,
+            customBounds=[],
+            explodeSentences=False,
+            minLength=0,
+            maxLength=99999
+        )
+
+    @staticmethod
+    def pretrained(name="sentence_detector_dl", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "sentence_detector_dl"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        SentenceDetectorDLModel
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(SentenceDetectorDLModel, name, lang, remote_loc)
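Taken together, the setters defined in the hunk above make up the full training surface of the approach. A minimal training sketch combining them (file names, parameter values, and the impossible-penultimate list are hypothetical, not from the diff):

    import sparknlp
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import SentenceDetectorDLApproach
    from pyspark.ml import Pipeline

    spark = sparknlp.start()
    # Hypothetical training file: one sentence per line, as the docstring describes.
    training_data = spark.read.text("train.txt").toDF("text")

    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    sentence_detector = (
        SentenceDetectorDLApproach()
        .setInputCols(["document"])
        .setOutputCol("sentences")
        .setEpochsNumber(5)                               # epochsNumber Param above
        .setValidationSplit(0.1)                          # validate on 10% of lines each epoch
        .setImpossiblePenultimates(["Mr", "Dr", "Prof"])  # strings a sentence can't end with
        .setOutputLogsPath("training_logs")               # hypothetical log folder
    )

    pipeline = Pipeline().setStages([document_assembler, sentence_detector])
    model = pipeline.fit(training_data)  # the fitted stage is a SentenceDetectorDLModel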
--- /dev/null
+++ sparknlp/annotator/sentiment/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Module of annotators for sentiment analysis."""
+from sparknlp.annotator.sentiment.sentiment_detector import *
+from sparknlp.annotator.sentiment.vivekn_sentiment import *
--- /dev/null
+++ sparknlp/annotator/sentiment/sentiment_detector.py
@@ -0,0 +1,208 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the SentimentDetector."""
+
+from sparknlp.common import *
+
+
+class SentimentDetector(AnnotatorApproach):
+    """Trains a rule based sentiment detector, which calculates a score based on
+    predefined keywords.
+
+    A dictionary of predefined sentiment keywords must be provided with
+    :meth:`.setDictionary`, where each line is a word delimited to its class
+    (either ``positive`` or ``negative``). The dictionary can be set in the form
+    of a delimited text file.
+
+    By default, the sentiment score will be assigned labels ``"positive"`` if
+    the score is ``>= 0``, else ``"negative"``.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dictionary-sentiment/sentiment.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``TOKEN, DOCUMENT``    ``SENTIMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+    dictionary
+        path for dictionary to sentiment analysis
+
+    Examples
+    --------
+    In this example, the dictionary ``default-sentiment-dict.txt`` has the form
+    of::
+
+        ...
+        cool,positive
+        superb,positive
+        bad,negative
+        uninspired,negative
+        ...
+
+    where each sentiment keyword is delimited by ``","``.
+
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> lemmatizer = Lemmatizer() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("lemma") \\
+    ...     .setDictionary("lemmas_small.txt", "->", "\\t")
+    >>> sentimentDetector = SentimentDetector() \\
+    ...     .setInputCols(["lemma", "document"]) \\
+    ...     .setOutputCol("sentimentScore") \\
+    ...     .setDictionary("default-sentiment-dict.txt", ",", ReadAs.TEXT)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     tokenizer,
+    ...     lemmatizer,
+    ...     sentimentDetector,
+    ... ])
+    >>> data = spark.createDataFrame([
+    ...     ["The staff of the restaurant is nice"],
+    ...     ["I recommend others to avoid because it is too expensive"]
+    ... ]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("sentimentScore.result").show(truncate=False)
+    +----------+
+    |result    |
+    +----------+
+    |[positive]|
+    |[negative]|
+    +----------+
+
+    See Also
+    --------
+    ViveknSentimentApproach : for an alternative approach to sentiment extraction
+    """
+
+    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTIMENT
+
+    dictionary = Param(Params._dummy(),
+                       "dictionary",
+                       "path for dictionary to sentiment analysis",
+                       typeConverter=TypeConverters.identity)
+
+    positiveMultiplier = Param(Params._dummy(),
+                               "positiveMultiplier",
+                               "multiplier for positive sentiments. Defaults 1.0",
+                               typeConverter=TypeConverters.toFloat)
+
+    negativeMultiplier = Param(Params._dummy(),
+                               "negativeMultiplier",
+                               "multiplier for negative sentiments. Defaults -1.0",
+                               typeConverter=TypeConverters.toFloat)
+
+    incrementMultiplier = Param(Params._dummy(),
+                                "incrementMultiplier",
+                                "multiplier for increment sentiments. Defaults 2.0",
+                                typeConverter=TypeConverters.toFloat)
+
+    decrementMultiplier = Param(Params._dummy(),
+                                "decrementMultiplier",
+                                "multiplier for decrement sentiments. Defaults -2.0",
+                                typeConverter=TypeConverters.toFloat)
+
+    reverseMultiplier = Param(Params._dummy(),
+                              "reverseMultiplier",
+                              "multiplier for revert sentiments. Defaults -1.0",
+                              typeConverter=TypeConverters.toFloat)
+
+    enableScore = Param(Params._dummy(),
+                        "enableScore",
+                        "if true, score will show as the double value, else will output string \"positive\" or \"negative\". Defaults false",
+                        typeConverter=TypeConverters.toBoolean)
+
+    def __init__(self):
+        super(SentimentDetector, self).__init__(
+            classname="com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetector")
+        self._setDefault(positiveMultiplier=1.0, negativeMultiplier=-1.0, incrementMultiplier=2.0,
+                         decrementMultiplier=-2.0, reverseMultiplier=-1.0, enableScore=False)
+
+    def setDictionary(self, path, delimiter, read_as=ReadAs.TEXT, options={'format': 'text'}):
+        """Sets path for dictionary to sentiment analysis
+
+        Parameters
+        ----------
+        path : str
+            Path to dictionary file
+        delimiter : str
+            Delimiter for entries
+        read_as : str, optional
+            How to read the resource, by default ReadAs.TEXT
+        options : dict, optional
+            Options for reading the resource, by default {'format': 'text'}
+        """
+        opts = options.copy()
+        if "delimiter" not in opts:
+            opts["delimiter"] = delimiter
+        return self._set(dictionary=ExternalResource(path, read_as, opts))
+
+    def _create_model(self, java_model):
+        return SentimentDetectorModel(java_model=java_model)
+
+
+class SentimentDetectorModel(AnnotatorModel):
+    """Rule based sentiment detector, which calculates a score based on
+    predefined keywords.
+
+    This is the instantiated model of the :class:`.SentimentDetector`. For
+    training your own model, please see the documentation of that class.
+
+    By default, the sentiment score will be assigned labels ``"positive"`` if
+    the score is ``>= 0``, else ``"negative"``.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dictionary-sentiment/sentiment.ipynb>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``TOKEN, DOCUMENT``    ``SENTIMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+    None
+    """
+    name = "SentimentDetectorModel"
+
+    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTIMENT
+
+    positiveMultiplier = Param(Params._dummy(),
+                               "positiveMultiplier",
+                               "multiplier for positive sentiments. Defaults 1.0",
+                               typeConverter=TypeConverters.toFloat)
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel",
+                 java_model=None):
+        super(SentimentDetectorModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
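The enableScore Param declared in the hunk above has no explicit setter in this file. A hedged sketch, assuming Spark NLP auto-generates setters for declared Params (the listing includes sparknlp/internal/params_getters_setters.py, which suggests such a mechanism), for returning the raw double score instead of the positive/negative label:

    from sparknlp.annotator import SentimentDetector
    from sparknlp.common import ReadAs

    # Assumption: setEnableScore is a generated setter for the enableScore Param;
    # it is not defined explicitly in the hunk above.
    sentiment_detector = (
        SentimentDetector()
        .setInputCols(["lemma", "document"])
        .setOutputCol("sentimentScore")
        .setDictionary("default-sentiment-dict.txt", ",", ReadAs.TEXT)
        .setEnableScore(True)  # emit the numeric score rather than a string label
    )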