spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
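
Beyond the many new annotators, the listing above shows a structural change: the flat 2.6.x modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, ...) are removed and replaced by subpackages of the same names. A minimal sketch of what this means for imports, assuming (as the new annotator/__init__.py and base/__init__.py files suggest) that each subpackage re-exports its public classes so the old short import paths keep working:

    # Hypothetical upgrade check, not part of the diff itself: the flat module
    # sparknlp/annotator.py is gone in 6.2.1, but the sparknlp.annotator
    # package is assumed to re-export the same public names.
    import sparknlp
    from sparknlp.annotator import Tokenizer                              # short, re-exported path
    from sparknlp.annotator.token.tokenizer import Tokenizer as FullPath  # fully qualified path

    assert Tokenizer is FullPath  # both are assumed to resolve to the same class
    spark = sparknlp.start()      # start a SparkSession with Spark NLP on the classpath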

sparknlp/annotator/pos/perceptron.py
@@ -0,0 +1,263 @@

# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the Perceptron Annotator."""

from sparknlp.common import *


class PerceptronApproach(AnnotatorApproach):
    """Trains an averaged Perceptron model to tag words part-of-speech. Sets a
    POS tag to each word within a sentence.

    For pretrained models please see the :class:`.PerceptronModel`.

    The training data needs to be in a Spark DataFrame, where the column needs
    to consist of Annotations of type ``POS``. The `Annotation` needs to have
    member ``result`` set to the POS tag and have a ``"word"`` mapping to its
    word inside of member ``metadata``. This DataFrame for training can easily
    be created by the helper class :class:`.POS`.


    >>> POS().readDataset(spark, datasetPath) \\
    ...     .selectExpr("explode(tags) as tags").show(truncate=False)
    +---------------------------------------------+
    |tags                                         |
    +---------------------------------------------+
    |[pos, 0, 5, NNP, [word -> Pierre], []]       |
    |[pos, 7, 12, NNP, [word -> Vinken], []]      |
    |[pos, 14, 14, ,, [word -> ,], []]            |
    |[pos, 31, 34, MD, [word -> will], []]        |
    |[pos, 36, 39, VB, [word -> join], []]        |
    |[pos, 41, 43, DT, [word -> the], []]         |
    |[pos, 45, 49, NN, [word -> board], []]       |
    ...


    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/french/Train-Perceptron-French.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN, DOCUMENT``    ``POS``
    ====================== ======================

    Parameters
    ----------
    posCol
        Column name for Array of POS tags that match tokens
    nIterations
        Number of iterations in training, converges to better accuracy, by
        default 5

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from sparknlp.training import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentence = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["sentence"]) \\
    ...     .setOutputCol("token")
    >>> datasetPath = "src/test/resources/anc-pos-corpus-small/test-training.txt"
    >>> trainingPerceptronDF = POS().readDataset(spark, datasetPath)
    >>> trainedPos = PerceptronApproach() \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("pos") \\
    ...     .setPosColumn("tags") \\
    ...     .fit(trainingPerceptronDF)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     sentence,
    ...     tokenizer,
    ...     trainedPos
    ... ])
    >>> data = spark.createDataFrame([["To be or not to be, is this the question?"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("pos.result").show(truncate=False)
    +--------------------------------------------------+
    |result                                            |
    +--------------------------------------------------+
    |[NNP, NNP, CD, JJ, NNP, NNP, ,, MD, VB, DT, CD, .]|
    +--------------------------------------------------+
    """

    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.POS

    posCol = Param(Params._dummy(),
                   "posCol",
                   "column of Array of POS tags that match tokens",
                   typeConverter=TypeConverters.toString)

    nIterations = Param(Params._dummy(),
                        "nIterations",
                        "Number of iterations in training, converges to better accuracy",
                        typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(PerceptronApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach")
        self._setDefault(
            nIterations=5
        )

    def setPosColumn(self, value):
        """Sets column name for Array of POS tags that match tokens.

        Parameters
        ----------
        value : str
            Name of column for Array of POS tags
        """
        return self._set(posCol=value)

    def setIterations(self, value):
        """Sets number of iterations in training, by default 5.

        Parameters
        ----------
        value : int
            Number of iterations in training
        """
        return self._set(nIterations=value)

    def getNIterations(self):
        """Gets number of iterations in training, by default 5.

        Returns
        -------
        int
            Number of iterations in training
        """
        return self.getOrDefault(self.nIterations)

    def _create_model(self, java_model):
        return PerceptronModel(java_model=java_model)


class PerceptronModel(AnnotatorModel):
    """Averaged Perceptron model to tag words part-of-speech. Sets a POS tag to
    each word within a sentence.

    This is the instantiated model of the :class:`.PerceptronApproach`. For
    training your own model, please see the documentation of that class.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> posTagger = PerceptronModel.pretrained() \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("pos")


    The default model is ``"pos_anc"``, if no name is provided.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Part+of+Speech+Tagging>`__.
    Additionally, pretrained pipelines are available for this module, see
    `Pipelines <https://sparknlp.org/docs/en/pipelines>`__.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/french/Train-Perceptron-French.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN, DOCUMENT``    ``POS``
    ====================== ======================

    Parameters
    ----------
    None

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> posTagger = PerceptronModel.pretrained() \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("pos")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     posTagger
    ... ])
    >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(pos) as pos").show(truncate=False)
    +-------------------------------------------+
    |pos                                        |
    +-------------------------------------------+
    |[pos, 0, 4, NNP, [word -> Peter], []]      |
    |[pos, 6, 11, NNP, [word -> Pipers], []]    |
    |[pos, 13, 21, NNS, [word -> employees], []]|
    |[pos, 23, 25, VBP, [word -> are], []]      |
    |[pos, 27, 33, VBG, [word -> picking], []]  |
    |[pos, 35, 39, NNS, [word -> pecks], []]    |
    |[pos, 41, 42, IN, [word -> of], []]        |
    |[pos, 44, 50, JJ, [word -> pickled], []]   |
    |[pos, 52, 58, NNS, [word -> peppers], []]  |
    +-------------------------------------------+
    """
    name = "PerceptronModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.POS

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronModel", java_model=None):
        super(PerceptronModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="pos_anc", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "pos_anc"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        PerceptronModel
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(PerceptronModel, name, lang, remote_loc)
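
An aside on the hunk above: this release also ships LightPipeline (sparknlp/base/light_pipeline.py in the listing), which can wrap the fitted pipeline from the PerceptronModel docstring for single-string inference without a DataFrame round trip. A hedged sketch, assuming the `pipeline` and `data` variables from that docstring example are in scope:

    from sparknlp.base import LightPipeline

    # Fit once on the example DataFrame, then annotate plain strings directly.
    model = pipeline.fit(data)
    light = LightPipeline(model)
    annotations = light.annotate("Peter Pipers employees are picking pecks of pickled peppers")
    print(annotations["pos"])  # list of POS tags, e.g. ['NNP', 'NNP', 'NNS', ...]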

sparknlp/annotator/sentence/__init__.py
@@ -0,0 +1,17 @@

# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Module of annotators for sentence detection."""
from sparknlp.annotator.sentence.sentence_detector import *
from sparknlp.annotator.sentence.sentence_detector_dl import *

sparknlp/annotator/sentence/sentence_detector.py
@@ -0,0 +1,290 @@

# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the SentenceDetector."""

from sparknlp.common import *


class SentenceDetectorParams:
    """Base class for SentenceDetector parameters."""

    useAbbreviations = Param(Params._dummy(),
                             "useAbbreviations",
                             "whether to apply abbreviations at sentence detection",
                             typeConverter=TypeConverters.toBoolean)

    customBounds = Param(Params._dummy(),
                         "customBounds",
                         "characters used to explicitly mark sentence bounds",
                         typeConverter=TypeConverters.toListString)

    useCustomBoundsOnly = Param(Params._dummy(),
                                "useCustomBoundsOnly",
                                "Only utilize custom bounds in sentence detection",
                                typeConverter=TypeConverters.toBoolean)

    customBoundsStrategy = Param(Params._dummy(),
                                 "customBoundsStrategy",
                                 "How to return matched custom bounds",
                                 typeConverter=TypeConverters.toString)

    explodeSentences = Param(Params._dummy(),
                             "explodeSentences",
                             "whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
                             typeConverter=TypeConverters.toBoolean)

    splitLength = Param(Params._dummy(),
                        "splitLength",
                        "length at which sentences will be forcibly split.",
                        typeConverter=TypeConverters.toInt)

    minLength = Param(Params._dummy(),
                      "minLength",
                      "Set the minimum allowed length for each sentence.",
                      typeConverter=TypeConverters.toInt)

    maxLength = Param(Params._dummy(),
                      "maxLength",
                      "Set the maximum allowed length for each sentence",
                      typeConverter=TypeConverters.toInt)


class SentenceDetector(AnnotatorModel, SentenceDetectorParams):
    """Annotator that detects sentence boundaries using regular expressions.

    The following characters are checked as sentence boundaries:

    1. Lists ("(i), (ii)", "(a), (b)", "1., 2.")
    2. Numbers
    3. Abbreviations
    4. Punctuation
    5. Multiple Periods
    6. Geo-Locations/Coordinates ("N°. 1026.253.553.")
    7. Ellipsis ("...")
    8. In-between punctuation
    9. Quotation marks
    10. Exclamation Points
    11. Basic Breakers (".", ";")

    For the explicit regular expressions used for detection, refer to the source of
    `PragmaticContentFormatter <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala>`__.

    To add additional custom bounds, the parameter ``customBounds`` can be set with an array:

    >>> sentence = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence") \\
    ...     .setCustomBounds(["\\n\\n"])

    If only the custom bounds should be used, then the parameter ``useCustomBoundsOnly`` should be set to ``true``.

    Each extracted sentence can be returned in an Array or exploded to separate rows,
    if ``explodeSentences`` is set to ``true``.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/sentence-detection/SentenceDetector_advanced_examples.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    useAbbreviations
        whether to apply abbreviations at sentence detection, by default True
    customBounds
        characters used to explicitly mark sentence bounds, by default []
    useCustomBoundsOnly
        Only utilize custom bounds in sentence detection, by default False
    customBoundsStrategy
        Sets how to return matched custom bounds, by default "none".

        Will have no effect if no custom bounds are used.
        Possible values are:

        - "none" - Will not return the matched bound
        - "prepend" - Prepends a sentence break to the match
        - "append" - Appends a sentence break to the match
    explodeSentences
        whether to explode each sentence into a different row, for better
        parallelization, by default False
    splitLength
        length at which sentences will be forcibly split
    minLength
        Set the minimum allowed length for each sentence, by default 0
    maxLength
        Set the maximum allowed length for each sentence, by default 99999
    detectLists
        whether to detect lists during sentence detection, by default True

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentence = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence") \\
    ...     .setCustomBounds(["\\n\\n"])
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     sentence
    ... ])
    >>> data = spark.createDataFrame([["This is my first sentence. This my second.\\n\\nHow about a third?"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(sentence) as sentences").show(truncate=False)
    +------------------------------------------------------------------+
    |sentences                                                         |
    +------------------------------------------------------------------+
    |[document, 0, 25, This is my first sentence., [sentence -> 0], []]|
    |[document, 27, 41, This my second., [sentence -> 1], []]          |
    |[document, 43, 60, How about a third?, [sentence -> 2], []]       |
    +------------------------------------------------------------------+
    """

    name = 'SentenceDetector'

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    # this one is exclusive to this detector
    detectLists = Param(Params._dummy(),
                        "detectLists",
                        "whether to detect lists during sentence detection",
                        typeConverter=TypeConverters.toBoolean)

    def setCustomBounds(self, value):
        """Sets characters used to explicitly mark sentence bounds, by default
        [].

        Parameters
        ----------
        value : List[str]
            Characters used to explicitly mark sentence bounds
        """
        return self._set(customBounds=value)

    def setCustomBoundsStrategy(self, value):
        """Sets how to return matched custom bounds, by default "none".

        Will have no effect if no custom bounds are used.
        Possible values are:

        - "none" - Will not return the matched bound
        - "prepend" - Prepends a sentence break to the match
        - "append" - Appends a sentence break to the match

        Parameters
        ----------
        value : str
            Strategy to use
        """
        return self._set(customBoundsStrategy=value)

    def setUseAbbreviations(self, value):
        """Sets whether to apply abbreviations at sentence detection, by
        default True.

        Parameters
        ----------
        value : bool
            Whether to apply abbreviations at sentence detection
        """
        return self._set(useAbbreviations=value)

    def setDetectLists(self, value):
        """Sets whether to detect lists during sentence detection, by default
        True.

        Parameters
        ----------
        value : bool
            Whether to detect lists during sentence detection
        """
        return self._set(detectLists=value)

    def setUseCustomBoundsOnly(self, value):
        """Sets whether to only utilize custom bounds in sentence detection,
        by default False.

        Parameters
        ----------
        value : bool
            Whether to only utilize custom bounds
        """
        return self._set(useCustomBoundsOnly=value)

    def setExplodeSentences(self, value):
        """Sets whether to explode each sentence into a different row, for
        better parallelization, by default False.

        Parameters
        ----------
        value : bool
            Whether to explode each sentence into a different row
        """
        return self._set(explodeSentences=value)

    def setSplitLength(self, value):
        """Sets length at which sentences will be forcibly split.

        Parameters
        ----------
        value : int
            Length at which sentences will be forcibly split
        """
        return self._set(splitLength=value)

    def setMinLength(self, value):
        """Sets minimum allowed length for each sentence, by default 0.

        Parameters
        ----------
        value : int
            Minimum allowed length for each sentence
        """
        return self._set(minLength=value)

    def setMaxLength(self, value):
        """Sets the maximum allowed length for each sentence, by default
        99999.

        Parameters
        ----------
        value : int
            Maximum allowed length for each sentence
        """
        return self._set(maxLength=value)

    @keyword_only
    def __init__(self):
        super(SentenceDetector, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector")
        self._setDefault(
            useAbbreviations=True,
            detectLists=True,
            useCustomBoundsOnly=False,
            customBounds=[],
            customBoundsStrategy="none",
            explodeSentences=False,
            minLength=0,
            maxLength=99999
        )
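
To illustrate ``customBoundsStrategy`` from the hunk above: with the default "none" a matched custom bound is dropped from the output, while "prepend"/"append" keep it attached to the neighboring sentence. A short sketch under those documented semantics (the semicolon-splitting setup is illustrative, not taken from the diff; it assumes the documentAssembler/spark setup from the docstring example):

    from sparknlp.annotator import SentenceDetector

    # Split only on ";" and keep the ";" at the end of each resulting sentence.
    sentence = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence") \
        .setCustomBounds([";"]) \
        .setUseCustomBoundsOnly(True) \
        .setCustomBoundsStrategy("append")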