spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
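The listing above shows the monolithic sparknlp/annotator.py being removed and the annotators split into dedicated modules under sparknlp/annotator/, with sparknlp/annotator/__init__.py re-exporting them. The sketch below is illustrative only and is not part of the diff: it assumes those re-exports and the sparknlp.start() session helper, and uses made-up example data.

# Illustrative sketch (assumptions noted above): same Tokenizer API, new module layout.
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer  # assumed re-export from sparknlp/annotator/__init__.py
# The class itself now lives in sparknlp/annotator/token/tokenizer.py per the listing.

spark = sparknlp.start()

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

pipeline = Pipeline(stages=[document_assembler, tokenizer])
data = spark.createDataFrame([["Spark NLP split its annotators into per-class modules."]], ["text"])
pipeline.fit(data).transform(data).select("token.result").show(truncate=False)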
sparknlp/annotator.py
DELETED
|
@@ -1,3006 +0,0 @@
|
|
|
1
|
-
##
|
|
2
|
-
# Prototyping for py4j to pipeline from Python
|
|
3
|
-
##
|
|
4
|
-
|
|
5
|
-
import sys
|
|
6
|
-
from sparknlp.common import *
|
|
7
|
-
|
|
8
|
-
# Do NOT delete. Looks redundant but this is key work around for python 2 support.
|
|
9
|
-
if sys.version_info[0] == 2:
|
|
10
|
-
from sparknlp.base import DocumentAssembler, Finisher, EmbeddingsFinisher, TokenAssembler
|
|
11
|
-
else:
|
|
12
|
-
import com.johnsnowlabs.nlp
|
|
13
|
-
|
|
14
|
-
annotators = sys.modules[__name__]
|
|
15
|
-
pos = sys.modules[__name__]
|
|
16
|
-
pos.perceptron = sys.modules[__name__]
|
|
17
|
-
ner = sys.modules[__name__]
|
|
18
|
-
ner.crf = sys.modules[__name__]
|
|
19
|
-
ner.dl = sys.modules[__name__]
|
|
20
|
-
regex = sys.modules[__name__]
|
|
21
|
-
sbd = sys.modules[__name__]
|
|
22
|
-
sbd.pragmatic = sys.modules[__name__]
|
|
23
|
-
sda = sys.modules[__name__]
|
|
24
|
-
sda.pragmatic = sys.modules[__name__]
|
|
25
|
-
sda.vivekn = sys.modules[__name__]
|
|
26
|
-
spell = sys.modules[__name__]
|
|
27
|
-
spell.norvig = sys.modules[__name__]
|
|
28
|
-
spell.symmetric = sys.modules[__name__]
|
|
29
|
-
spell.context = sys.modules[__name__]
|
|
30
|
-
parser = sys.modules[__name__]
|
|
31
|
-
parser.dep = sys.modules[__name__]
|
|
32
|
-
parser.typdep = sys.modules[__name__]
|
|
33
|
-
embeddings = sys.modules[__name__]
|
|
34
|
-
classifier = sys.modules[__name__]
|
|
35
|
-
classifier.dl = sys.modules[__name__]
|
|
36
|
-
ld = sys.modules[__name__]
|
|
37
|
-
ld.dl = sys.modules[__name__]
|
|
38
|
-
keyword = sys.modules[__name__]
|
|
39
|
-
keyword.yake = sys.modules[__name__]
|
|
40
|
-
sentence_detector_dl = sys.modules[__name__]
|
|
41
|
-
|
|
42
|
-
class RecursiveTokenizer(AnnotatorApproach):
|
|
43
|
-
name = 'RecursiveTokenizer'
|
|
44
|
-
|
|
45
|
-
prefixes = Param(Params._dummy(),
|
|
46
|
-
"prefixes",
|
|
47
|
-
"strings to be considered independent tokens when found at the beginning of a word",
|
|
48
|
-
typeConverter=TypeConverters.toListString)
|
|
49
|
-
|
|
50
|
-
suffixes = Param(Params._dummy(),
|
|
51
|
-
"suffixes",
|
|
52
|
-
"strings to be considered independent tokens when found at the end of a word",
|
|
53
|
-
typeConverter=TypeConverters.toListString)
|
|
54
|
-
|
|
55
|
-
infixes = Param(Params._dummy(),
|
|
56
|
-
"infixes",
|
|
57
|
-
"strings to be considered independent tokens when found in the middle of a word",
|
|
58
|
-
typeConverter=TypeConverters.toListString)
|
|
59
|
-
|
|
60
|
-
whitelist = Param(Params._dummy(),
|
|
61
|
-
"whitelist",
|
|
62
|
-
"strings to be considered as single tokens",
|
|
63
|
-
typeConverter=TypeConverters.toListString)
|
|
64
|
-
|
|
65
|
-
def setPrefixes(self, p):
|
|
66
|
-
return self._set(prefixes=p)
|
|
67
|
-
|
|
68
|
-
def setSuffixes(self, s):
|
|
69
|
-
return self._set(suffixes=s)
|
|
70
|
-
|
|
71
|
-
def setInfixes(self, i):
|
|
72
|
-
return self._set(infixes=i)
|
|
73
|
-
|
|
74
|
-
def setWhitelist(self, w):
|
|
75
|
-
return self._set(whitelist=w)
|
|
76
|
-
|
|
77
|
-
@keyword_only
|
|
78
|
-
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizer"):
|
|
79
|
-
super(RecursiveTokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizer")
|
|
80
|
-
self._setDefault(
|
|
81
|
-
prefixes=["'", "\"", "(", "[", "\n"],
|
|
82
|
-
infixes=["\n", "(", ")"],
|
|
83
|
-
suffixes=[".", ":", "%", ",", ";", "?", "'", "\"", ")", "]", "\n", "!", "'s"],
|
|
84
|
-
whitelist=["it's", "that's", "there's", "he's", "she's", "what's", "let's", "who's", \
|
|
85
|
-
"It's", "That's", "There's", "He's", "She's", "What's", "Let's", "Who's"]
|
|
86
|
-
)
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def _create_model(self, java_model):
|
|
90
|
-
return RecursiveTokenizerModel(java_model=java_model)
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
class RecursiveTokenizerModel(AnnotatorModel):
|
|
94
|
-
name = 'RecursiveTokenizerModel'
|
|
95
|
-
|
|
96
|
-
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizerModel", java_model=None):
|
|
97
|
-
super(RecursiveTokenizerModel, self).__init__(
|
|
98
|
-
classname=classname,
|
|
99
|
-
java_model=java_model
|
|
100
|
-
)
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
class Tokenizer(AnnotatorApproach):
|
|
104
|
-
|
|
105
|
-
targetPattern = Param(Params._dummy(),
|
|
106
|
-
"targetPattern",
|
|
107
|
-
"pattern to grab from text as token candidates. Defaults \S+",
|
|
108
|
-
typeConverter=TypeConverters.toString)
|
|
109
|
-
|
|
110
|
-
prefixPattern = Param(Params._dummy(),
|
|
111
|
-
"prefixPattern",
|
|
112
|
-
"regex with groups and begins with \A to match target prefix. Defaults to \A([^\s\w\$\.]*)",
|
|
113
|
-
typeConverter=TypeConverters.toString)
|
|
114
|
-
|
|
115
|
-
suffixPattern = Param(Params._dummy(),
|
|
116
|
-
"suffixPattern",
|
|
117
|
-
"regex with groups and ends with \z to match target suffix. Defaults to ([^\s\w]?)([^\s\w]*)\z",
|
|
118
|
-
typeConverter=TypeConverters.toString)
|
|
119
|
-
|
|
120
|
-
infixPatterns = Param(Params._dummy(),
|
|
121
|
-
"infixPatterns",
|
|
122
|
-
"regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults",
|
|
123
|
-
typeConverter=TypeConverters.toListString)
|
|
124
|
-
|
|
125
|
-
exceptions = Param(Params._dummy(),
|
|
126
|
-
"exceptions",
|
|
127
|
-
"Words that won't be affected by tokenization rules",
|
|
128
|
-
typeConverter=TypeConverters.toListString)
|
|
129
|
-
|
|
130
|
-
exceptionsPath = Param(Params._dummy(),
|
|
131
|
-
"exceptionsPath",
|
|
132
|
-
"path to file containing list of exceptions",
|
|
133
|
-
typeConverter=TypeConverters.toString)
|
|
134
|
-
|
|
135
|
-
caseSensitiveExceptions = Param(Params._dummy(),
|
|
136
|
-
"caseSensitiveExceptions",
|
|
137
|
-
"Whether to care for case sensitiveness in exceptions",
|
|
138
|
-
typeConverter=TypeConverters.toBoolean)
|
|
139
|
-
|
|
140
|
-
contextChars = Param(Params._dummy(),
|
|
141
|
-
"contextChars",
|
|
142
|
-
"character list used to separate from token boundaries",
|
|
143
|
-
typeConverter=TypeConverters.toListString)
|
|
144
|
-
|
|
145
|
-
splitPattern = Param(Params._dummy(),
|
|
146
|
-
"splitPattern",
|
|
147
|
-
"character list used to separate from the inside of tokens",
|
|
148
|
-
typeConverter=TypeConverters.toString)
|
|
149
|
-
|
|
150
|
-
splitChars = Param(Params._dummy(),
|
|
151
|
-
"splitChars",
|
|
152
|
-
"character list used to separate from the inside of tokens",
|
|
153
|
-
typeConverter=TypeConverters.toListString)
|
|
154
|
-
|
|
155
|
-
minLength = Param(Params._dummy(),
|
|
156
|
-
"minLength",
|
|
157
|
-
"Set the minimum allowed legth for each token",
|
|
158
|
-
typeConverter=TypeConverters.toInt)
|
|
159
|
-
|
|
160
|
-
maxLength = Param(Params._dummy(),
|
|
161
|
-
"maxLength",
|
|
162
|
-
"Set the maximum allowed legth for each token",
|
|
163
|
-
typeConverter=TypeConverters.toInt)
|
|
164
|
-
|
|
165
|
-
name = 'Tokenizer'
|
|
166
|
-
|
|
167
|
-
@keyword_only
|
|
168
|
-
def __init__(self):
|
|
169
|
-
super(Tokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Tokenizer")
|
|
170
|
-
self._setDefault(
|
|
171
|
-
targetPattern="\\S+",
|
|
172
|
-
contextChars=[".", ",", ";", ":", "!", "?", "*", "-", "(", ")", "\"", "'"],
|
|
173
|
-
caseSensitiveExceptions=True,
|
|
174
|
-
minLength=0,
|
|
175
|
-
maxLength=99999
|
|
176
|
-
)
|
|
177
|
-
|
|
178
|
-
def getInfixPatterns(self):
|
|
179
|
-
return self.getOrDefault("infixPatterns")
|
|
180
|
-
|
|
181
|
-
def getSuffixPattern(self):
|
|
182
|
-
return self.getOrDefault("suffixPattern")
|
|
183
|
-
|
|
184
|
-
def getPrefixPattern(self):
|
|
185
|
-
return self.getOrDefault("prefixPattern")
|
|
186
|
-
|
|
187
|
-
def getContextChars(self):
|
|
188
|
-
return self.getOrDefault("contextChars")
|
|
189
|
-
|
|
190
|
-
def getSplitChars(self):
|
|
191
|
-
return self.getOrDefault("splitChars")
|
|
192
|
-
|
|
193
|
-
def setTargetPattern(self, value):
|
|
194
|
-
return self._set(targetPattern=value)
|
|
195
|
-
|
|
196
|
-
def setPrefixPattern(self, value):
|
|
197
|
-
return self._set(prefixPattern=value)
|
|
198
|
-
|
|
199
|
-
def setSuffixPattern(self, value):
|
|
200
|
-
return self._set(suffixPattern=value)
|
|
201
|
-
|
|
202
|
-
def setInfixPatterns(self, value):
|
|
203
|
-
return self._set(infixPatterns=value)
|
|
204
|
-
|
|
205
|
-
def addInfixPattern(self, value):
|
|
206
|
-
try:
|
|
207
|
-
infix_patterns = self.getInfixPatterns()
|
|
208
|
-
except KeyError:
|
|
209
|
-
infix_patterns = []
|
|
210
|
-
infix_patterns.insert(0, value)
|
|
211
|
-
return self._set(infixPatterns=infix_patterns)
|
|
212
|
-
|
|
213
|
-
def setExceptions(self, value):
|
|
214
|
-
return self._set(exceptions=value)
|
|
215
|
-
|
|
216
|
-
def getExceptions(self):
|
|
217
|
-
return self.getOrDefault("exceptions")
|
|
218
|
-
|
|
219
|
-
def addException(self, value):
|
|
220
|
-
try:
|
|
221
|
-
exception_tokens = self.getExceptions()
|
|
222
|
-
except KeyError:
|
|
223
|
-
exception_tokens = []
|
|
224
|
-
exception_tokens.append(value)
|
|
225
|
-
return self._set(exceptions=exception_tokens)
|
|
226
|
-
|
|
227
|
-
def setCaseSensitiveExceptions(self, value):
|
|
228
|
-
return self._set(caseSensitiveExceptions=value)
|
|
229
|
-
|
|
230
|
-
def getCaseSensitiveExceptions(self):
|
|
231
|
-
return self.getOrDefault("caseSensitiveExceptions")
|
|
232
|
-
|
|
233
|
-
def setContextChars(self, value):
|
|
234
|
-
return self._set(contextChars=value)
|
|
235
|
-
|
|
236
|
-
def addContextChars(self, value):
|
|
237
|
-
try:
|
|
238
|
-
context_chars = self.getContextChars()
|
|
239
|
-
except KeyError:
|
|
240
|
-
context_chars = []
|
|
241
|
-
context_chars.append(value)
|
|
242
|
-
return self._set(contextChars=context_chars)
|
|
243
|
-
|
|
244
|
-
def setSplitPattern(self, value):
|
|
245
|
-
return self._set(splitPattern=value)
|
|
246
|
-
|
|
247
|
-
def setSplitChars(self, value):
|
|
248
|
-
return self._set(splitChars=value)
|
|
249
|
-
|
|
250
|
-
def addSplitChars(self, value):
|
|
251
|
-
try:
|
|
252
|
-
split_chars = self.getSplitChars()
|
|
253
|
-
except KeyError:
|
|
254
|
-
split_chars = []
|
|
255
|
-
split_chars.append(value)
|
|
256
|
-
return self._set(splitChars=split_chars)
|
|
257
|
-
|
|
258
|
-
def setMinLength(self, value):
|
|
259
|
-
return self._set(minLength=value)
|
|
260
|
-
|
|
261
|
-
def setMaxLength(self, value):
|
|
262
|
-
return self._set(maxLength=value)
|
|
263
|
-
|
|
264
|
-
def _create_model(self, java_model):
|
|
265
|
-
return TokenizerModel(java_model=java_model)
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
class TokenizerModel(AnnotatorModel):
|
|
269
|
-
name = "TokenizerModel"
|
|
270
|
-
|
|
271
|
-
exceptions = Param(Params._dummy(),
|
|
272
|
-
"exceptions",
|
|
273
|
-
"Words that won't be affected by tokenization rules",
|
|
274
|
-
typeConverter=TypeConverters.toListString)
|
|
275
|
-
|
|
276
|
-
caseSensitiveExceptions = Param(Params._dummy(),
|
|
277
|
-
"caseSensitiveExceptions",
|
|
278
|
-
"Whether to care for case sensitiveness in exceptions",
|
|
279
|
-
typeConverter=TypeConverters.toBoolean)
|
|
280
|
-
|
|
281
|
-
targetPattern = Param(Params._dummy(),
|
|
282
|
-
"targetPattern",
|
|
283
|
-
"pattern to grab from text as token candidates. Defaults \S+",
|
|
284
|
-
typeConverter=TypeConverters.toString)
|
|
285
|
-
|
|
286
|
-
rules = Param(Params._dummy(),
|
|
287
|
-
"rules",
|
|
288
|
-
"Rules structure factory containing pre processed regex rules",
|
|
289
|
-
typeConverter=TypeConverters.identity)
|
|
290
|
-
|
|
291
|
-
splitPattern = Param(Params._dummy(),
|
|
292
|
-
"splitPattern",
|
|
293
|
-
"character list used to separate from the inside of tokens",
|
|
294
|
-
typeConverter=TypeConverters.toString)
|
|
295
|
-
|
|
296
|
-
splitChars = Param(Params._dummy(),
|
|
297
|
-
"splitChars",
|
|
298
|
-
"character list used to separate from the inside of tokens",
|
|
299
|
-
typeConverter=TypeConverters.toListString)
|
|
300
|
-
|
|
301
|
-
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.TokenizerModel", java_model=None):
|
|
302
|
-
super(TokenizerModel, self).__init__(
|
|
303
|
-
classname=classname,
|
|
304
|
-
java_model=java_model
|
|
305
|
-
)
|
|
306
|
-
self._setDefault(
|
|
307
|
-
targetPattern="\\S+",
|
|
308
|
-
caseSensitiveExceptions=True
|
|
309
|
-
)
|
|
310
|
-
|
|
311
|
-
def setSplitPattern(self, value):
|
|
312
|
-
return self._set(splitPattern=value)
|
|
313
|
-
|
|
314
|
-
def setSplitChars(self, value):
|
|
315
|
-
return self._set(splitChars=value)
|
|
316
|
-
|
|
317
|
-
def addSplitChars(self, value):
|
|
318
|
-
try:
|
|
319
|
-
split_chars = self.getSplitChars()
|
|
320
|
-
except KeyError:
|
|
321
|
-
split_chars = []
|
|
322
|
-
split_chars.append(value)
|
|
323
|
-
return self._set(splitChars=split_chars)
|
|
324
|
-
|
|
325
|
-
@staticmethod
|
|
326
|
-
def pretrained(name="token_rules", lang="en", remote_loc=None):
|
|
327
|
-
from sparknlp.pretrained import ResourceDownloader
|
|
328
|
-
return ResourceDownloader.downloadModel(TokenizerModel, name, lang, remote_loc)
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
class RegexTokenizer(AnnotatorModel):
|
|
332
|
-
|
|
333
|
-
name = "RegexTokenizer"
|
|
334
|
-
|
|
335
|
-
@keyword_only
|
|
336
|
-
def __init__(self):
|
|
337
|
-
super(RegexTokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.RegexTokenizer")
|
|
338
|
-
self._setDefault(
|
|
339
|
-
inputCols=["document"],
|
|
340
|
-
outputCol="regexToken",
|
|
341
|
-
toLowercase=False,
|
|
342
|
-
minLength=1,
|
|
343
|
-
pattern="\\s+"
|
|
344
|
-
)
|
|
345
|
-
|
|
346
|
-
minLength = Param(Params._dummy(),
|
|
347
|
-
"minLength",
|
|
348
|
-
"Set the minimum allowed legth for each token",
|
|
349
|
-
typeConverter=TypeConverters.toInt)
|
|
350
|
-
|
|
351
|
-
maxLength = Param(Params._dummy(),
|
|
352
|
-
"maxLength",
|
|
353
|
-
"Set the maximum allowed legth for each token",
|
|
354
|
-
typeConverter=TypeConverters.toInt)
|
|
355
|
-
|
|
356
|
-
toLowercase = Param(Params._dummy(),
|
|
357
|
-
"toLowercase",
|
|
358
|
-
"Indicates whether to convert all characters to lowercase before tokenizing.",
|
|
359
|
-
typeConverter=TypeConverters.toBoolean)
|
|
360
|
-
|
|
361
|
-
pattern = Param(Params._dummy(),
|
|
362
|
-
"pattern",
|
|
363
|
-
"regex pattern used for tokenizing. Defaults \S+",
|
|
364
|
-
typeConverter=TypeConverters.toString)
|
|
365
|
-
|
|
366
|
-
def setMinLength(self, value):
|
|
367
|
-
return self._set(minLength=value)
|
|
368
|
-
|
|
369
|
-
def setMaxLength(self, value):
|
|
370
|
-
return self._set(maxLength=value)
|
|
371
|
-
|
|
372
|
-
def setToLowercase(self, value):
|
|
373
|
-
return self._set(toLowercase=value)
|
|
374
|
-
|
|
375
|
-
def setPattern(self, value):
|
|
376
|
-
return self._set(pattern=value)
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
class ChunkTokenizer(Tokenizer):
|
|
380
|
-
name = 'ChunkTokenizer'
|
|
381
|
-
|
|
382
|
-
@keyword_only
|
|
383
|
-
def __init__(self):
|
|
384
|
-
super(Tokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ChunkTokenizer")
|
|
385
|
-
|
|
386
|
-
def _create_model(self, java_model):
|
|
387
|
-
return ChunkTokenizerModel(java_model=java_model)
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
class ChunkTokenizerModel(TokenizerModel):
|
|
391
|
-
name = 'ChunkTokenizerModel'
|
|
392
|
-
|
|
393
|
-
@keyword_only
|
|
394
|
-
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ChunkTokenizerModel", java_model=None):
|
|
395
|
-
super(TokenizerModel, self).__init__(
|
|
396
|
-
classname=classname,
|
|
397
|
-
java_model=java_model
|
|
398
|
-
)
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
class Token2Chunk(AnnotatorModel):
|
|
402
|
-
name = "Token2Chunk"
|
|
403
|
-
|
|
404
|
-
def __init__(self):
|
|
405
|
-
super(Token2Chunk, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Token2Chunk")
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
class Stemmer(AnnotatorModel):
|
|
409
|
-
|
|
410
|
-
language = Param(Params._dummy(), "language", "stemmer algorithm", typeConverter=TypeConverters.toString)
|
|
411
|
-
|
|
412
|
-
name = "Stemmer"
|
|
413
|
-
|
|
414
|
-
@keyword_only
|
|
415
|
-
def __init__(self):
|
|
416
|
-
super(Stemmer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Stemmer")
|
|
417
|
-
self._setDefault(
|
|
418
|
-
language="english"
|
|
419
|
-
)
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
class Chunker(AnnotatorModel):
|
|
423
|
-
|
|
424
|
-
regexParsers = Param(Params._dummy(),
|
|
425
|
-
"regexParsers",
|
|
426
|
-
"an array of grammar based chunk parsers",
|
|
427
|
-
typeConverter=TypeConverters.toListString)
|
|
428
|
-
|
|
429
|
-
name = "Chunker"
|
|
430
|
-
|
|
431
|
-
@keyword_only
|
|
432
|
-
def __init__(self):
|
|
433
|
-
super(Chunker, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Chunker")
|
|
434
|
-
|
|
435
|
-
def setRegexParsers(self, value):
|
|
436
|
-
return self._set(regexParsers=value)
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
class Normalizer(AnnotatorApproach):
|
|
440
|
-
|
|
441
|
-
cleanupPatterns = Param(Params._dummy(),
|
|
442
|
-
"cleanupPatterns",
|
|
443
|
-
"normalization regex patterns which match will be removed from token",
|
|
444
|
-
typeConverter=TypeConverters.toListString)
|
|
445
|
-
|
|
446
|
-
lowercase = Param(Params._dummy(),
|
|
447
|
-
"lowercase",
|
|
448
|
-
"whether to convert strings to lowercase")
|
|
449
|
-
|
|
450
|
-
slangMatchCase = Param(Params._dummy(),
|
|
451
|
-
"slangMatchCase",
|
|
452
|
-
"whether or not to be case sensitive to match slangs. Defaults to false.")
|
|
453
|
-
|
|
454
|
-
slangDictionary = Param(Params._dummy(),
|
|
455
|
-
"slangDictionary",
|
|
456
|
-
"slang dictionary is a delimited text. needs 'delimiter' in options",
|
|
457
|
-
typeConverter=TypeConverters.identity)
|
|
458
|
-
|
|
459
|
-
@keyword_only
|
|
460
|
-
def __init__(self):
|
|
461
|
-
super(Normalizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Normalizer")
|
|
462
|
-
self._setDefault(
|
|
463
|
-
cleanupPatterns=["[^\\pL+]"],
|
|
464
|
-
lowercase=False,
|
|
465
|
-
slangMatchCase=False
|
|
466
|
-
)
|
|
467
|
-
|
|
468
|
-
def setCleanupPatterns(self, value):
|
|
469
|
-
return self._set(cleanupPatterns=value)
|
|
470
|
-
|
|
471
|
-
def setLowercase(self, value):
|
|
472
|
-
return self._set(lowercase=value)
|
|
473
|
-
|
|
474
|
-
def setSlangDictionary(self, path, delimiter, read_as=ReadAs.TEXT, options={"format": "text"}):
|
|
475
|
-
opts = options.copy()
|
|
476
|
-
if "delimiter" not in opts:
|
|
477
|
-
opts["delimiter"] = delimiter
|
|
478
|
-
return self._set(slangDictionary=ExternalResource(path, read_as, opts))
|
|
479
|
-
|
|
480
|
-
def _create_model(self, java_model):
|
|
481
|
-
return NormalizerModel(java_model=java_model)
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
class NormalizerModel(AnnotatorModel):
|
|
485
|
-
|
|
486
|
-
cleanupPatterns = Param(Params._dummy(),
|
|
487
|
-
"cleanupPatterns",
|
|
488
|
-
"normalization regex patterns which match will be removed from token",
|
|
489
|
-
typeConverter=TypeConverters.toListString)
|
|
490
|
-
|
|
491
|
-
lowercase = Param(Params._dummy(),
|
|
492
|
-
"lowercase",
|
|
493
|
-
"whether to convert strings to lowercase")
|
|
494
|
-
|
|
495
|
-
slangMatchCase = Param(Params._dummy(),
|
|
496
|
-
"slangMatchCase",
|
|
497
|
-
"whether or not to be case sensitive to match slangs. Defaults to false.")
|
|
498
|
-
|
|
499
|
-
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.NormalizerModel", java_model=None):
|
|
500
|
-
super(NormalizerModel, self).__init__(
|
|
501
|
-
classname=classname,
|
|
502
|
-
java_model=java_model
|
|
503
|
-
)
|
|
504
|
-
|
|
505
|
-
name = "NormalizerModel"
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
class RegexMatcher(AnnotatorApproach):
|
|
509
|
-
|
|
510
|
-
strategy = Param(Params._dummy(),
|
|
511
|
-
"strategy",
|
|
512
|
-
"MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE",
|
|
513
|
-
typeConverter=TypeConverters.toString)
|
|
514
|
-
externalRules = Param(Params._dummy(),
|
|
515
|
-
"externalRules",
|
|
516
|
-
"external resource to rules, needs 'delimiter' in options",
|
|
517
|
-
typeConverter=TypeConverters.identity)
|
|
518
|
-
|
|
519
|
-
@keyword_only
|
|
520
|
-
def __init__(self):
|
|
521
|
-
super(RegexMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.RegexMatcher")
|
|
522
|
-
self._setDefault(
|
|
523
|
-
strategy="MATCH_ALL"
|
|
524
|
-
)
|
|
525
|
-
|
|
526
|
-
def setStrategy(self, value):
|
|
527
|
-
return self._set(strategy=value)
|
|
528
|
-
|
|
529
|
-
def setExternalRules(self, path, delimiter, read_as=ReadAs.TEXT, options={"format": "text"}):
|
|
530
|
-
opts = options.copy()
|
|
531
|
-
if "delimiter" not in opts:
|
|
532
|
-
opts["delimiter"] = delimiter
|
|
533
|
-
return self._set(externalRules=ExternalResource(path, read_as, opts))
|
|
534
|
-
|
|
535
|
-
def _create_model(self, java_model):
|
|
536
|
-
return RegexMatcherModel(java_model=java_model)
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
class RegexMatcherModel(AnnotatorModel):
|
|
540
|
-
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RegexMatcherModel", java_model=None):
|
|
541
|
-
super(RegexMatcherModel, self).__init__(
|
|
542
|
-
classname=classname,
|
|
543
|
-
java_model=java_model
|
|
544
|
-
)
|
|
545
|
-
|
|
546
|
-
name = "RegexMatcherModel"
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
class Lemmatizer(AnnotatorApproach):
|
|
550
|
-
dictionary = Param(Params._dummy(),
|
|
551
|
-
"dictionary",
|
|
552
|
-
"lemmatizer external dictionary." +
|
|
553
|
-
" needs 'keyDelimiter' and 'valueDelimiter' in options for parsing target text",
|
|
554
|
-
typeConverter=TypeConverters.identity)
|
|
555
|
-
|
|
556
|
-
@keyword_only
|
|
557
|
-
def __init__(self):
|
|
558
|
-
super(Lemmatizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Lemmatizer")
|
|
559
|
-
|
|
560
|
-
def _create_model(self, java_model):
|
|
561
|
-
return LemmatizerModel(java_model=java_model)
|
|
562
|
-
|
|
563
|
-
def setDictionary(self, path, key_delimiter, value_delimiter, read_as=ReadAs.TEXT,
|
|
564
|
-
options={"format": "text"}):
|
|
565
|
-
opts = options.copy()
|
|
566
|
-
if "keyDelimiter" not in opts:
|
|
567
|
-
opts["keyDelimiter"] = key_delimiter
|
|
568
|
-
if "valueDelimiter" not in opts:
|
|
569
|
-
opts["valueDelimiter"] = value_delimiter
|
|
570
|
-
return self._set(dictionary=ExternalResource(path, read_as, opts))
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
class LemmatizerModel(AnnotatorModel):
|
|
574
|
-
name = "LemmatizerModel"
|
|
575
|
-
|
|
576
|
-
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.LemmatizerModel", java_model=None):
|
|
577
|
-
super(LemmatizerModel, self).__init__(
|
|
578
|
-
classname=classname,
|
|
579
|
-
java_model=java_model
|
|
580
|
-
)
|
|
581
|
-
|
|
582
|
-
@staticmethod
|
|
583
|
-
def pretrained(name="lemma_antbnc", lang="en", remote_loc=None):
|
|
584
|
-
from sparknlp.pretrained import ResourceDownloader
|
|
585
|
-
return ResourceDownloader.downloadModel(LemmatizerModel, name, lang, remote_loc)
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
class DateMatcherUtils(Params):
|
|
589
|
-
dateFormat = Param(Params._dummy(),
|
|
590
|
-
"dateFormat",
|
|
591
|
-
"desired format for dates extracted",
|
|
592
|
-
typeConverter=TypeConverters.toString)
|
|
593
|
-
|
|
594
|
-
readMonthFirst = Param(Params._dummy(),
|
|
595
|
-
"readMonthFirst",
|
|
596
|
-
"Whether to parse july 07/05/2015 or as 05/07/2015",
|
|
597
|
-
typeConverter=TypeConverters.toBoolean
|
|
598
|
-
)
|
|
599
|
-
|
|
600
|
-
defaultDayWhenMissing = Param(Params._dummy(),
|
|
601
|
-
"defaultDayWhenMissing",
|
|
602
|
-
"which day to set when it is missing from parsed input",
|
|
603
|
-
typeConverter=TypeConverters.toInt
|
|
604
|
-
)
|
|
605
|
-
|
|
606
|
-
def setFormat(self, value):
|
|
607
|
-
return self._set(dateFormat=value)
|
|
608
|
-
|
|
609
|
-
def setReadMonthFirst(self, value):
|
|
610
|
-
return self._set(readMonthFirst=value)
|
|
611
|
-
|
|
612
|
-
def setDefaultDayWhenMissing(self, value):
|
|
613
|
-
return self._set(defaultDayWhenMissing=value)
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
class DateMatcher(AnnotatorModel, DateMatcherUtils):
|
|
617
|
-
|
|
618
|
-
name = "DateMatcher"
|
|
619
|
-
|
|
620
|
-
@keyword_only
|
|
621
|
-
def __init__(self):
|
|
622
|
-
super(DateMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.DateMatcher")
|
|
623
|
-
self._setDefault(
|
|
624
|
-
dateFormat="yyyy/MM/dd",
|
|
625
|
-
readMonthFirst=True,
|
|
626
|
-
defaultDayWhenMissing=1
|
|
627
|
-
)
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
class MultiDateMatcher(AnnotatorModel, DateMatcherUtils):
|
|
631
|
-
|
|
632
|
-
name = "MultiDateMatcher"
|
|
633
|
-
|
|
634
|
-
@keyword_only
|
|
635
|
-
def __init__(self):
|
|
636
|
-
super(MultiDateMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.MultiDateMatcher")
|
|
637
|
-
self._setDefault(
|
|
638
|
-
dateFormat="yyyy/MM/dd",
|
|
639
|
-
readMonthFirst=True,
|
|
640
|
-
defaultDayWhenMissing=1
|
|
641
|
-
)
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
class TextMatcher(AnnotatorApproach):
|
|
645
|
-
|
|
646
|
-
entities = Param(Params._dummy(),
|
|
647
|
-
"entities",
|
|
648
|
-
"ExternalResource for entities",
|
|
649
|
-
typeConverter=TypeConverters.identity)
|
|
650
|
-
|
|
651
|
-
caseSensitive = Param(Params._dummy(),
|
|
652
|
-
"caseSensitive",
|
|
653
|
-
"whether to match regardless of case. Defaults true",
|
|
654
|
-
typeConverter=TypeConverters.toBoolean)
|
|
655
|
-
|
|
656
|
-
mergeOverlapping = Param(Params._dummy(),
|
|
657
|
-
"mergeOverlapping",
|
|
658
|
-
"whether to merge overlapping matched chunks. Defaults false",
|
|
659
|
-
typeConverter=TypeConverters.toBoolean)
|
|
660
|
-
|
|
661
|
-
entityValue = Param(Params._dummy(),
|
|
662
|
-
"entityValue",
|
|
663
|
-
"value for the entity metadata field",
|
|
664
|
-
typeConverter=TypeConverters.toString)
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
buildFromTokens = Param(Params._dummy(),
|
|
668
|
-
"buildFromTokens",
|
|
669
|
-
"whether the TextMatcher should take the CHUNK from TOKEN or not",
|
|
670
|
-
typeConverter=TypeConverters.toBoolean)
|
|
671
|
-
|
|
672
|
-
@keyword_only
|
|
673
|
-
def __init__(self):
|
|
674
|
-
super(TextMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.TextMatcher")
|
|
675
|
-
self._setDefault(caseSensitive=True)
|
|
676
|
-
self._setDefault(mergeOverlapping=False)
|
|
677
|
-
|
|
678
|
-
def _create_model(self, java_model):
|
|
679
|
-
return TextMatcherModel(java_model=java_model)
|
|
680
|
-
|
|
681
|
-
def setEntities(self, path, read_as=ReadAs.TEXT, options={"format": "text"}):
|
|
682
|
-
return self._set(entities=ExternalResource(path, read_as, options.copy()))
|
|
683
|
-
|
|
684
|
-
def setCaseSensitive(self, b):
|
|
685
|
-
return self._set(caseSensitive=b)
|
|
686
|
-
|
|
687
|
-
def setMergeOverlapping(self, b):
|
|
688
|
-
return self._set(mergeOverlapping=b)
|
|
689
|
-
|
|
690
|
-
def setEntityValue(self, b):
|
|
691
|
-
return self._set(entityValue=b)
|
|
692
|
-
|
|
693
|
-
def setBuildFromTokens(self, b):
|
|
694
|
-
return self._set(buildFromTokens=b)
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
class TextMatcherModel(AnnotatorModel):
|
|
698
|
-
name = "TextMatcherModel"
|
|
699
|
-
|
|
700
|
-
mergeOverlapping = Param(Params._dummy(),
|
|
701
|
-
"mergeOverlapping",
|
|
702
|
-
"whether to merge overlapping matched chunks. Defaults false",
|
|
703
|
-
typeConverter=TypeConverters.toBoolean)
|
|
704
|
-
|
|
705
|
-
searchTrie = Param(Params._dummy(),
|
|
706
|
-
"searchTrie",
|
|
707
|
-
"searchTrie",
|
|
708
|
-
typeConverter=TypeConverters.identity)
|
|
709
|
-
|
|
710
|
-
entityValue = Param(Params._dummy(),
|
|
711
|
-
"entityValue",
|
|
712
|
-
"value for the entity metadata field",
|
|
713
|
-
typeConverter=TypeConverters.toString)
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
buildFromTokens = Param(Params._dummy(),
|
|
717
|
-
"buildFromTokens",
|
|
718
|
-
"whether the TextMatcher should take the CHUNK from TOKEN or not",
|
|
719
|
-
typeConverter=TypeConverters.toBoolean)
|
|
720
|
-
|
|
721
|
-
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.TextMatcherModel", java_model=None):
|
|
722
|
-
super(TextMatcherModel, self).__init__(
|
|
723
|
-
classname=classname,
|
|
724
|
-
java_model=java_model
|
|
725
|
-
)
|
|
726
|
-
|
|
727
|
-
def setMergeOverlapping(self, b):
|
|
728
|
-
return self._set(mergeOverlapping=b)
|
|
729
|
-
|
|
730
|
-
def setEntityValue(self, b):
|
|
731
|
-
return self._set(entityValue=b)
|
|
732
|
-
|
|
733
|
-
def setBuildFromTokens(self, b):
|
|
734
|
-
return self._set(buildFromTokens=b)
|
|
735
|
-
|
|
736
|
-
@staticmethod
|
|
737
|
-
def pretrained(name, lang="en", remote_loc=None):
|
|
738
|
-
from sparknlp.pretrained import ResourceDownloader
|
|
739
|
-
return ResourceDownloader.downloadModel(TextMatcherModel, name, lang, remote_loc)
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
class BigTextMatcher(AnnotatorApproach, HasStorage):
|
|
743
|
-
|
|
744
|
-
entities = Param(Params._dummy(),
|
|
745
|
-
"entities",
|
|
746
|
-
"ExternalResource for entities",
|
|
747
|
-
typeConverter=TypeConverters.identity)
|
|
748
|
-
|
|
749
|
-
caseSensitive = Param(Params._dummy(),
|
|
750
|
-
"caseSensitive",
|
|
751
|
-
"whether to ignore case in index lookups",
|
|
752
|
-
typeConverter=TypeConverters.toBoolean)
|
|
753
|
-
|
|
754
|
-
mergeOverlapping = Param(Params._dummy(),
|
|
755
|
-
"mergeOverlapping",
|
|
756
|
-
"whether to merge overlapping matched chunks. Defaults false",
|
|
757
|
-
typeConverter=TypeConverters.toBoolean)
|
|
758
|
-
|
|
759
|
-
tokenizer = Param(Params._dummy(),
|
|
760
|
-
"tokenizer",
|
|
761
|
-
"TokenizerModel to use to tokenize input file for building a Trie",
|
|
762
|
-
typeConverter=TypeConverters.identity)
|
|
763
|
-
|
|
764
|
-
@keyword_only
|
|
765
|
-
def __init__(self):
|
|
766
|
-
super(BigTextMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.btm.BigTextMatcher")
|
|
767
|
-
self._setDefault(caseSensitive=True)
|
|
768
|
-
self._setDefault(mergeOverlapping=False)
|
|
769
|
-
|
|
770
|
-
def _create_model(self, java_model):
|
|
771
|
-
return TextMatcherModel(java_model=java_model)
|
|
772
|
-
|
|
773
|
-
def setEntities(self, path, read_as=ReadAs.TEXT, options={"format": "text"}):
|
|
774
|
-
return self._set(entities=ExternalResource(path, read_as, options.copy()))
|
|
775
|
-
|
|
776
|
-
def setCaseSensitive(self, b):
|
|
777
|
-
return self._set(caseSensitive=b)
|
|
778
|
-
|
|
779
|
-
def setMergeOverlapping(self, b):
|
|
780
|
-
return self._set(mergeOverlapping=b)
|
|
781
|
-
|
|
782
|
-
def setTokenizer(self, tokenizer_model):
|
|
783
|
-
tokenizer_model._transfer_params_to_java()
|
|
784
|
-
return self._set(tokenizer_model._java_obj)
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
class BigTextMatcherModel(AnnotatorModel, HasStorageModel):
|
|
788
|
-
name = "BigTextMatcherModel"
|
|
789
|
-
databases = ['TMVOCAB', 'TMEDGES', 'TMNODES']
|
|
790
|
-
|
|
791
|
-
caseSensitive = Param(Params._dummy(),
|
|
792
|
-
"caseSensitive",
|
|
793
|
-
"whether to ignore case in index lookups",
|
|
794
|
-
typeConverter=TypeConverters.toBoolean)
|
|
795
|
-
|
|
796
|
-
mergeOverlapping = Param(Params._dummy(),
|
|
797
|
-
"mergeOverlapping",
|
|
798
|
-
"whether to merge overlapping matched chunks. Defaults false",
|
|
799
|
-
typeConverter=TypeConverters.toBoolean)
|
|
800
|
-
|
|
801
|
-
searchTrie = Param(Params._dummy(),
|
|
802
|
-
"searchTrie",
|
|
803
|
-
"searchTrie",
|
|
804
|
-
typeConverter=TypeConverters.identity)
|
|
805
|
-
|
|
806
|
-
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.btm.TextMatcherModel", java_model=None):
|
|
807
|
-
super(BigTextMatcherModel, self).__init__(
|
|
808
|
-
classname=classname,
|
|
809
|
-
java_model=java_model
|
|
810
|
-
)
|
|
811
|
-
|
|
812
|
-
def setMergeOverlapping(self, b):
|
|
813
|
-
return self._set(mergeOverlapping=b)
|
|
814
|
-
|
|
815
|
-
def setCaseSensitive(self, v):
|
|
816
|
-
return self._set(caseSensitive=v)
|
|
817
|
-
|
|
818
|
-
@staticmethod
|
|
819
|
-
def pretrained(name, lang="en", remote_loc=None):
|
|
820
|
-
from sparknlp.pretrained import ResourceDownloader
|
|
821
|
-
return ResourceDownloader.downloadModel(TextMatcherModel, name, lang, remote_loc)
|
|
822
|
-
|
|
823
|
-
@staticmethod
|
|
824
|
-
def loadStorage(path, spark, storage_ref):
|
|
825
|
-
HasStorageModel.loadStorages(path, spark, storage_ref, BigTextMatcherModel.databases)
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
class PerceptronApproach(AnnotatorApproach):
|
|
829
|
-
posCol = Param(Params._dummy(),
|
|
830
|
-
"posCol",
|
|
831
|
-
"column of Array of POS tags that match tokens",
|
|
832
|
-
typeConverter=TypeConverters.toString)
|
|
833
|
-
|
|
834
|
-
nIterations = Param(Params._dummy(),
|
|
835
|
-
"nIterations",
|
|
836
|
-
"Number of iterations in training, converges to better accuracy",
|
|
837
|
-
typeConverter=TypeConverters.toInt)
|
|
838
|
-
|
|
839
|
-
@keyword_only
|
|
840
|
-
def __init__(self):
|
|
841
|
-
super(PerceptronApproach, self).__init__(
|
|
842
|
-
classname="com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach")
|
|
843
|
-
self._setDefault(
|
|
844
|
-
nIterations=5
|
|
845
|
-
)
|
|
846
|
-
|
|
847
|
-
def setPosCol(self, value):
|
|
848
|
-
return self._set(posCol=value)
|
|
849
|
-
|
|
850
|
-
def setIterations(self, value):
|
|
851
|
-
return self._set(nIterations=value)
|
|
852
|
-
|
|
853
|
-
def getNIterations(self):
|
|
854
|
-
return self.getOrDefault(self.nIterations)
|
|
855
|
-
|
|
856
|
-
def _create_model(self, java_model):
|
|
857
|
-
return PerceptronModel(java_model=java_model)
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
class PerceptronModel(AnnotatorModel):
|
|
861
|
-
name = "PerceptronModel"
|
|
862
|
-
|
|
863
|
-
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronModel", java_model=None):
|
|
864
|
-
super(PerceptronModel, self).__init__(
|
|
865
|
-
classname=classname,
|
|
866
|
-
java_model=java_model
|
|
867
|
-
)
|
|
868
|
-
|
|
869
|
-
@staticmethod
|
|
870
|
-
def pretrained(name="pos_anc", lang="en", remote_loc=None):
|
|
871
|
-
from sparknlp.pretrained import ResourceDownloader
|
|
872
|
-
return ResourceDownloader.downloadModel(PerceptronModel, name, lang, remote_loc)
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
class SentenceDetectorParams:
|
|
876
|
-
useAbbreviations = Param(Params._dummy(),
|
|
877
|
-
"useAbbreviations",
|
|
878
|
-
"whether to apply abbreviations at sentence detection",
|
|
879
|
-
typeConverter=TypeConverters.toBoolean)
|
|
880
|
-
|
|
881
|
-
customBounds = Param(Params._dummy(),
|
|
882
|
-
"customBounds",
|
|
883
|
-
"characters used to explicitly mark sentence bounds",
|
|
884
|
-
typeConverter=TypeConverters.toListString)
|
|
885
|
-
|
|
886
|
-
useCustomBoundsOnly = Param(Params._dummy(),
|
|
887
|
-
"useCustomBoundsOnly",
|
|
888
|
-
"Only utilize custom bounds in sentence detection",
|
|
889
|
-
typeConverter=TypeConverters.toBoolean)
|
|
890
|
-
|
|
891
|
-
explodeSentences = Param(Params._dummy(),
|
|
892
|
-
"explodeSentences",
|
|
893
|
-
"whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
|
|
894
|
-
typeConverter=TypeConverters.toBoolean)
|
|
895
|
-
|
|
896
|
-
splitLength = Param(Params._dummy(),
|
|
897
|
-
"splitLength",
|
|
898
|
-
"length at which sentences will be forcibly split.",
|
|
899
|
-
typeConverter=TypeConverters.toInt)
|
|
900
|
-
|
|
901
|
-
minLength = Param(Params._dummy(),
|
|
902
|
-
"minLength",
|
|
903
|
-
"Set the minimum allowed length for each sentence.",
|
|
904
|
-
typeConverter=TypeConverters.toInt)
|
|
905
|
-
|
|
906
|
-
maxLength = Param(Params._dummy(),
|
|
907
|
-
"maxLength",
|
|
908
|
-
"Set the maximum allowed length for each sentence",
|
|
909
|
-
typeConverter=TypeConverters.toInt)
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
class SentenceDetector(AnnotatorModel, SentenceDetectorParams):
|
|
913
|
-
|
|
914
|
-
name = 'SentenceDetector'
|
|
915
|
-
|
|
916
|
-
# this one is exclusive to this detector
|
|
917
|
-
detectLists = Param(Params._dummy(),
|
|
918
|
-
"detectLists",
|
|
919
|
-
"whether detect lists during sentence detection",
|
|
920
|
-
typeConverter=TypeConverters.toBoolean)
|
|
921
|
-
|
|
922
|
-
def setCustomBounds(self, value):
|
|
923
|
-
return self._set(customBounds=value)
|
|
924
|
-
|
|
925
|
-
def setUseAbbreviations(self, value):
|
|
926
|
-
return self._set(useAbbreviations=value)
|
|
927
|
-
|
|
928
|
-
def setDetectLists(self, value):
|
|
929
|
-
return self._set(detectLists=value)
|
|
930
|
-
|
|
931
|
-
def setUseCustomBoundsOnly(self, value):
|
|
932
|
-
return self._set(useCustomBoundsOnly=value)
|
|
933
|
-
|
|
934
|
-
def setExplodeSentences(self, value):
|
|
935
|
-
return self._set(explodeSentences=value)
|
|
936
|
-
|
|
937
|
-
def setSplitLength(self, value):
|
|
938
|
-
return self._set(splitLength=value)
|
|
939
|
-
|
|
940
|
-
def setMinLength(self, value):
|
|
941
|
-
return self._set(minLength=value)
|
|
942
|
-
|
|
943
|
-
def setMaxLength(self, value):
|
|
944
|
-
return self._set(maxLength=value)
|
|
945
|
-
|
|
946
|
-
@keyword_only
|
|
947
|
-
def __init__(self):
|
|
948
|
-
super(SentenceDetector, self).__init__(
|
|
949
|
-
classname="com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector")
|
|
950
|
-
self._setDefault(
|
|
951
|
-
useAbbreviations=True,
|
|
952
|
-
detectLists=True,
|
|
953
|
-
useCustomBoundsOnly=False,
|
|
954
|
-
customBounds=[],
|
|
955
|
-
explodeSentences=False,
|
|
956
|
-
minLength=0,
|
|
957
|
-
maxLength=99999
|
|
958
|
-
)
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
class SentimentDetector(AnnotatorApproach):
    dictionary = Param(Params._dummy(),
                       "dictionary",
                       "path for dictionary to sentiment analysis",
                       typeConverter=TypeConverters.identity)

    positiveMultiplier = Param(Params._dummy(),
                               "positiveMultiplier",
                               "multiplier for positive sentiments. Defaults 1.0",
                               typeConverter=TypeConverters.toFloat)

    negativeMultiplier = Param(Params._dummy(),
                               "negativeMultiplier",
                               "multiplier for negative sentiments. Defaults -1.0",
                               typeConverter=TypeConverters.toFloat)

    incrementMultiplier = Param(Params._dummy(),
                                "incrementMultiplier",
                                "multiplier for increment sentiments. Defaults 2.0",
                                typeConverter=TypeConverters.toFloat)

    decrementMultiplier = Param(Params._dummy(),
                                "decrementMultiplier",
                                "multiplier for decrement sentiments. Defaults -2.0",
                                typeConverter=TypeConverters.toFloat)

    reverseMultiplier = Param(Params._dummy(),
                              "reverseMultiplier",
                              "multiplier for revert sentiments. Defaults -1.0",
                              typeConverter=TypeConverters.toFloat)

    enableScore = Param(Params._dummy(),
                        "enableScore",
                        "if true, score will show as the double value, else will output string \"positive\" or \"negative\". Defaults false",
                        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(SentimentDetector, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetector")
        self._setDefault(positiveMultiplier=1.0, negativeMultiplier=-1.0, incrementMultiplier=2.0,
                         decrementMultiplier=-2.0, reverseMultiplier=-1.0, enableScore=False)

    def setDictionary(self, path, delimiter, read_as=ReadAs.TEXT, options={'format': 'text'}):
        opts = options.copy()
        if "delimiter" not in opts:
            opts["delimiter"] = delimiter
        return self._set(dictionary=ExternalResource(path, read_as, opts))

    def _create_model(self, java_model):
        return SentimentDetectorModel(java_model=java_model)

class SentimentDetectorModel(AnnotatorModel):
    name = "SentimentDetectorModel"

    positiveMultiplier = Param(Params._dummy(),
                               "positiveMultiplier",
                               "multiplier for positive sentiments. Defaults 1.0",
                               typeConverter=TypeConverters.toFloat)

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel",
                 java_model=None):
        super(SentimentDetectorModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

class ViveknSentimentApproach(AnnotatorApproach):
    sentimentCol = Param(Params._dummy(),
                         "sentimentCol",
                         "column with the sentiment result of every row. Must be 'positive' or 'negative'",
                         typeConverter=TypeConverters.toString)

    pruneCorpus = Param(Params._dummy(),
                        "pruneCorpus",
                        "Removes unfrequent scenarios from scope. The higher the better performance. Defaults 1",
                        typeConverter=TypeConverters.toInt)

    importantFeatureRatio = Param(Params._dummy(),
                                  "importantFeatureRatio",
                                  "proportion of feature content to be considered relevant. Defaults to 0.5",
                                  typeConverter=TypeConverters.toFloat)

    unimportantFeatureStep = Param(Params._dummy(),
                                   "unimportantFeatureStep",
                                   "proportion to lookahead in unimportant features. Defaults to 0.025",
                                   typeConverter=TypeConverters.toFloat)

    featureLimit = Param(Params._dummy(),
                         "featureLimit",
                         "content feature limit, to boost performance in very dirt text. Default disabled with -1",
                         typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(ViveknSentimentApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentApproach")
        self._setDefault(pruneCorpus=1, importantFeatureRatio=0.5, unimportantFeatureStep=0.025, featureLimit=-1)

    def setSentimentCol(self, value):
        return self._set(sentimentCol=value)

    def setPruneCorpus(self, value):
        return self._set(pruneCorpus=value)

    def _create_model(self, java_model):
        return ViveknSentimentModel(java_model=java_model)

class ViveknSentimentModel(AnnotatorModel):
    name = "ViveknSentimentModel"

    importantFeatureRatio = Param(Params._dummy(),
                                  "importantFeatureRatio",
                                  "proportion of feature content to be considered relevant. Defaults to 0.5",
                                  typeConverter=TypeConverters.toFloat)

    unimportantFeatureStep = Param(Params._dummy(),
                                   "unimportantFeatureStep",
                                   "proportion to lookahead in unimportant features. Defaults to 0.025",
                                   typeConverter=TypeConverters.toFloat)

    featureLimit = Param(Params._dummy(),
                         "featureLimit",
                         "content feature limit, to boost performance in very dirt text. Default disabled with -1",
                         typeConverter=TypeConverters.toInt)

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentModel", java_model=None):
        super(ViveknSentimentModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="sentiment_vivekn", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(ViveknSentimentModel, name, lang, remote_loc)

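A hedged training sketch for ViveknSentimentApproach (column names and upstream stages are assumptions; only the setters shown above are relied on):

from sparknlp.annotator import ViveknSentimentApproach

# Assumes training_df already carries "document" and "token" annotation columns
# (e.g. from DocumentAssembler + Tokenizer) plus a hypothetical string column
# "sentiment_label" holding 'positive' or 'negative' per row.
vivekn = ViveknSentimentApproach() \
    .setInputCols(["document", "token"]) \
    .setOutputCol("sentiment") \
    .setSentimentCol("sentiment_label") \
    .setPruneCorpus(1)

# vivekn_model = vivekn.fit(training_df)  # yields a ViveknSentimentModel
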
class NorvigSweetingApproach(AnnotatorApproach):
    dictionary = Param(Params._dummy(),
                       "dictionary",
                       "dictionary needs 'tokenPattern' regex in dictionary for separating words",
                       typeConverter=TypeConverters.identity)

    caseSensitive = Param(Params._dummy(),
                          "caseSensitive",
                          "whether to ignore case sensitivty",
                          typeConverter=TypeConverters.toBoolean)

    doubleVariants = Param(Params._dummy(),
                           "doubleVariants",
                           "whether to use more expensive spell checker",
                           typeConverter=TypeConverters.toBoolean)

    shortCircuit = Param(Params._dummy(),
                         "shortCircuit",
                         "whether to use faster mode",
                         typeConverter=TypeConverters.toBoolean)

    frequencyPriority = Param(Params._dummy(),
                              "frequencyPriority",
                              "applies frequency over hamming in intersections. When false hamming takes priority",
                              typeConverter=TypeConverters.toBoolean)

    wordSizeIgnore = Param(Params._dummy(),
                           "wordSizeIgnore",
                           "minimum size of word before ignoring. Defaults to 3",
                           typeConverter=TypeConverters.toInt)

    dupsLimit = Param(Params._dummy(),
                      "dupsLimit",
                      "maximum duplicate of characters in a word to consider. Defaults to 2",
                      typeConverter=TypeConverters.toInt)

    reductLimit = Param(Params._dummy(),
                        "reductLimit",
                        "word reductions limit. Defaults to 3",
                        typeConverter=TypeConverters.toInt)

    intersections = Param(Params._dummy(),
                          "intersections",
                          "hamming intersections to attempt. Defaults to 10",
                          typeConverter=TypeConverters.toInt)

    vowelSwapLimit = Param(Params._dummy(),
                           "vowelSwapLimit",
                           "vowel swap attempts. Defaults to 6",
                           typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(NorvigSweetingApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach")
        self._setDefault(caseSensitive=False, doubleVariants=False, shortCircuit=False, wordSizeIgnore=3, dupsLimit=2,
                         reductLimit=3, intersections=10, vowelSwapLimit=6, frequencyPriority=True)
        self.dictionary_path = ""

    def setDictionary(self, path, token_pattern="\S+", read_as=ReadAs.TEXT, options={"format": "text"}):
        self.dictionary_path = path
        opts = options.copy()
        if "tokenPattern" not in opts:
            opts["tokenPattern"] = token_pattern
        return self._set(dictionary=ExternalResource(path, read_as, opts))

    def setCaseSensitive(self, value):
        return self._set(caseSensitive=value)

    def setDoubleVariants(self, value):
        return self._set(doubleVariants=value)

    def setShortCircuit(self, value):
        return self._set(shortCircuit=value)

    def setFrequencyPriority(self, value):
        return self._set(frequencyPriority=value)

    def _create_model(self, java_model):
        return NorvigSweetingModel(java_model=java_model)

class NorvigSweetingModel(AnnotatorModel):
    name = "NorvigSweetingModel"

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel", java_model=None):
        super(NorvigSweetingModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="spellcheck_norvig", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(NorvigSweetingModel, name, lang, remote_loc)

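The pretrained Norvig spell checker is typically placed after a Tokenizer; a sketch (illustrative only, assuming the default "spellcheck_norvig" model is available for download in your environment):

from sparknlp.annotator import NorvigSweetingModel

spell_checker = NorvigSweetingModel.pretrained("spellcheck_norvig") \
    .setInputCols(["token"]) \
    .setOutputCol("checked")
# corrected tokens end up in the "checked" annotation column
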
class SymmetricDeleteApproach(AnnotatorApproach):
    corpus = Param(Params._dummy(),
                   "corpus",
                   "folder or file with text that teaches about the language",
                   typeConverter=TypeConverters.identity)

    dictionary = Param(Params._dummy(),
                       "dictionary",
                       "folder or file with text that teaches about the language",
                       typeConverter=TypeConverters.identity)

    maxEditDistance = Param(Params._dummy(),
                            "maxEditDistance",
                            "max edit distance characters to derive strings from a word",
                            typeConverter=TypeConverters.toInt)

    frequencyThreshold = Param(Params._dummy(),
                               "frequencyThreshold",
                               "minimum frequency of words to be considered from training. " +
                               "Increase if training set is LARGE. Defaults to 0",
                               typeConverter=TypeConverters.toInt)

    deletesThreshold = Param(Params._dummy(),
                             "deletesThreshold",
                             "minimum frequency of corrections a word needs to have to be considered from training." +
                             "Increase if training set is LARGE. Defaults to 0",
                             typeConverter=TypeConverters.toInt)

    dupsLimit = Param(Params._dummy(),
                      "dupsLimit",
                      "maximum duplicate of characters in a word to consider. Defaults to 2",
                      typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(SymmetricDeleteApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteApproach")
        self._setDefault(maxEditDistance=3, frequencyThreshold=0, deletesThreshold=0, dupsLimit=2)
        self.dictionary_path = ""

    def setDictionary(self, path, token_pattern="\S+", read_as=ReadAs.TEXT, options={"format": "text"}):
        self.dictionary_path = path
        opts = options.copy()
        if "tokenPattern" not in opts:
            opts["tokenPattern"] = token_pattern
        return self._set(dictionary=ExternalResource(path, read_as, opts))

    def setMaxEditDistance(self, v):
        return self._set(maxEditDistance=v)

    def setFrequencyThreshold(self, v):
        return self._set(frequencyThreshold=v)

    def setDeletesThreshold(self, v):
        return self._set(deletesThreshold=v)

    def _create_model(self, java_model):
        return SymmetricDeleteModel(java_model=java_model)

class SymmetricDeleteModel(AnnotatorModel):
    name = "SymmetricDeleteModel"

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel",
                 java_model=None):
        super(SymmetricDeleteModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="spellcheck_sd", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(SymmetricDeleteModel, name, lang, remote_loc)

class NerApproach(Params):
    labelColumn = Param(Params._dummy(),
                        "labelColumn",
                        "Column with label per each token",
                        typeConverter=TypeConverters.toString)

    entities = Param(Params._dummy(), "entities", "Entities to recognize", TypeConverters.toListString)

    minEpochs = Param(Params._dummy(), "minEpochs", "Minimum number of epochs to train", TypeConverters.toInt)
    maxEpochs = Param(Params._dummy(), "maxEpochs", "Maximum number of epochs to train", TypeConverters.toInt)

    verbose = Param(Params._dummy(), "verbose", "Level of verbosity during training", TypeConverters.toInt)
    randomSeed = Param(Params._dummy(), "randomSeed", "Random seed", TypeConverters.toInt)

    def setLabelColumn(self, value):
        return self._set(labelColumn=value)

    def setEntities(self, tags):
        return self._set(entities=tags)

    def setMinEpochs(self, epochs):
        return self._set(minEpochs=epochs)

    def setMaxEpochs(self, epochs):
        return self._set(maxEpochs=epochs)

    def setVerbose(self, verboseValue):
        return self._set(verbose=verboseValue)

    def setRandomSeed(self, seed):
        return self._set(randomSeed=seed)

    def getLabelColumn(self):
        return self.getOrDefault(self.labelColumn)

class NerCrfApproach(AnnotatorApproach, NerApproach):

    l2 = Param(Params._dummy(), "l2", "L2 regularization coefficient", TypeConverters.toFloat)

    c0 = Param(Params._dummy(), "c0", "c0 params defining decay speed for gradient", TypeConverters.toInt)

    lossEps = Param(Params._dummy(), "lossEps", "If Epoch relative improvement less than eps then training is stopped",
                    TypeConverters.toFloat)

    minW = Param(Params._dummy(), "minW", "Features with less weights then this param value will be filtered",
                 TypeConverters.toFloat)

    includeConfidence = Param(Params._dummy(), "includeConfidence", "external features is a delimited text. needs 'delimiter' in options",
                              TypeConverters.toBoolean)

    externalFeatures = Param(Params._dummy(), "externalFeatures", "Additional dictionaries paths to use as a features",
                             TypeConverters.identity)

    def setL2(self, l2value):
        return self._set(l2=l2value)

    def setC0(self, c0value):
        return self._set(c0=c0value)

    def setLossEps(self, eps):
        return self._set(lossEps=eps)

    def setMinW(self, w):
        return self._set(minW=w)

    def setExternalFeatures(self, path, delimiter, read_as=ReadAs.TEXT, options={"format": "text"}):
        opts = options.copy()
        if "delimiter" not in opts:
            opts["delimiter"] = delimiter
        return self._set(externalFeatures=ExternalResource(path, read_as, opts))

    def setIncludeConfidence(self, b):
        return self._set(includeConfidence=b)

    def _create_model(self, java_model):
        return NerCrfModel(java_model=java_model)

    @keyword_only
    def __init__(self):
        super(NerCrfApproach, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach")
        self._setDefault(
            minEpochs=0,
            maxEpochs=1000,
            l2=float(1),
            c0=2250000,
            lossEps=float(1e-3),
            verbose=4,
            includeConfidence=False
        )

class NerCrfModel(AnnotatorModel):
    name = "NerCrfModel"

    includeConfidence = Param(Params._dummy(), "includeConfidence", "external features is a delimited text. needs 'delimiter' in options",
                              TypeConverters.toBoolean)

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfModel", java_model=None):
        super(NerCrfModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    def setIncludeConfidence(self, b):
        return self._set(includeConfidence=b)

    @staticmethod
    def pretrained(name="ner_crf", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(NerCrfModel, name, lang, remote_loc)

class NerDLApproach(AnnotatorApproach, NerApproach):

    lr = Param(Params._dummy(), "lr", "Learning Rate", TypeConverters.toFloat)

    po = Param(Params._dummy(), "po", "Learning rate decay coefficient. Real Learning Rage = lr / (1 + po * epoch)",
               TypeConverters.toFloat)

    batchSize = Param(Params._dummy(), "batchSize", "Batch size", TypeConverters.toInt)

    dropout = Param(Params._dummy(), "dropout", "Dropout coefficient", TypeConverters.toFloat)

    graphFolder = Param(Params._dummy(), "graphFolder", "Folder path that contain external graph files", TypeConverters.toString)

    configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListString)

    useContrib = Param(Params._dummy(), "useContrib", "whether to use contrib LSTM Cells. Not compatible with Windows. Might slightly improve accuracy.", TypeConverters.toBoolean)

    validationSplit = Param(Params._dummy(), "validationSplit", "Choose the proportion of training dataset to be validated against the model on each Epoch. The value should be between 0.0 and 1.0 and by default it is 0.0 and off.",
                            TypeConverters.toFloat)

    evaluationLogExtended = Param(Params._dummy(), "evaluationLogExtended", "Choose the proportion of training dataset to be validated against the model on each Epoch. The value should be between 0.0 and 1.0 and by default it is 0.0 and off.",
                                  TypeConverters.toBoolean)

    testDataset = Param(Params._dummy(), "testDataset",
                        "Path to test dataset. If set used to calculate statistic on it during training.",
                        TypeConverters.identity)

    includeConfidence = Param(Params._dummy(), "includeConfidence",
                              "whether to include confidence scores in annotation metadata",
                              TypeConverters.toBoolean)

    enableOutputLogs = Param(Params._dummy(), "enableOutputLogs",
                             "Whether to use stdout in addition to Spark logs.",
                             TypeConverters.toBoolean)

    outputLogsPath = Param(Params._dummy(), "outputLogsPath", "Folder path to save training logs", TypeConverters.toString)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def setGraphFolder(self, p):
        return self._set(graphFolder=p)

    def setUseContrib(self, v):
        # checks the platform, not the interpreter version string
        if v and sys.platform == 'win32':
            raise Exception("Windows not supported to use contrib")
        return self._set(useContrib=v)

    def setLr(self, v):
        self._set(lr=v)
        return self

    def setPo(self, v):
        self._set(po=v)
        return self

    def setBatchSize(self, v):
        self._set(batchSize=v)
        return self

    def setDropout(self, v):
        self._set(dropout=v)
        return self

    def _create_model(self, java_model):
        return NerDLModel(java_model=java_model)

    def setValidationSplit(self, v):
        self._set(validationSplit=v)
        return self

    def setEvaluationLogExtended(self, v):
        self._set(evaluationLogExtended=v)
        return self

    def setTestDataset(self, path, read_as=ReadAs.SPARK, options={"format": "parquet"}):
        return self._set(testDataset=ExternalResource(path, read_as, options.copy()))

    def setIncludeConfidence(self, value):
        return self._set(includeConfidence=value)

    def setEnableOutputLogs(self, value):
        return self._set(enableOutputLogs=value)

    def setOutputLogsPath(self, p):
        return self._set(outputLogsPath=p)

    @keyword_only
    def __init__(self):
        super(NerDLApproach, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ner.dl.NerDLApproach")
        uc = False if sys.platform == 'win32' else True
        self._setDefault(
            minEpochs=0,
            maxEpochs=50,
            lr=float(0.001),
            po=float(0.005),
            batchSize=8,
            dropout=float(0.5),
            verbose=2,
            useContrib=uc,
            validationSplit=float(0.0),
            evaluationLogExtended=False,
            includeConfidence=False,
            enableOutputLogs=False
        )

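A training-configuration sketch for NerDLApproach (hedged: the CoNLL file path and embeddings choice are assumptions; the setters mirror the definitions above, and a SparkSession named spark is assumed to exist):

from sparknlp.training import CoNLL
from sparknlp.annotator import NerDLApproach, WordEmbeddingsModel

training_data = CoNLL().readDataset(spark, "eng.train")  # hypothetical local path

glove = WordEmbeddingsModel.pretrained("glove_100d") \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("embeddings")

ner_tagger = NerDLApproach() \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner") \
    .setLabelColumn("label") \
    .setMaxEpochs(10) \
    .setLr(0.001) \
    .setPo(0.005) \
    .setBatchSize(8) \
    .setDropout(0.5) \
    .setValidationSplit(0.1) \
    .setEnableOutputLogs(True)

ner_model = ner_tagger.fit(glove.transform(training_data))
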
class NerDLModel(AnnotatorModel, HasStorageRef):
    name = "NerDLModel"

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel", java_model=None):
        super(NerDLModel, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(includeConfidence=False)

    configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListString)
    includeConfidence = Param(Params._dummy(), "includeConfidence",
                              "whether to include confidence scores in annotation metadata",
                              TypeConverters.toBoolean)
    classes = Param(Params._dummy(), "classes",
                    "get the tags used to trained this NerDLModel",
                    TypeConverters.toListString)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def setIncludeConfidence(self, value):
        return self._set(includeConfidence=value)

    @staticmethod
    def pretrained(name="ner_dl", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(NerDLModel, name, lang, remote_loc)

class NerConverter(AnnotatorModel):
    name = 'NerConverter'

    whiteList = Param(
        Params._dummy(),
        "whiteList",
        "If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix on labels",
        typeConverter=TypeConverters.toListString
    )

    def setWhiteList(self, entities):
        return self._set(whiteList=entities)

    @keyword_only
    def __init__(self):
        super(NerConverter, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ner.NerConverter")

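Putting NerDLModel and NerConverter together for inference (a sketch only; it assumes document, sentence, token and embeddings columns produced by earlier pipeline stages):

from sparknlp.annotator import NerDLModel, NerConverter

ner = NerDLModel.pretrained("ner_dl") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner")

# keep only PER and LOC chunks; labels are given without the IOB prefix
ner_converter = NerConverter() \
    .setInputCols(["sentence", "token", "ner"]) \
    .setOutputCol("ner_chunk") \
    .setWhiteList(["PER", "LOC"])
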
class DependencyParserApproach(AnnotatorApproach):
    dependencyTreeBank = Param(Params._dummy(),
                               "dependencyTreeBank",
                               "Dependency treebank source files",
                               typeConverter=TypeConverters.identity)

    conllU = Param(Params._dummy(),
                   "conllU",
                   "Universal Dependencies source files",
                   typeConverter=TypeConverters.identity)

    numberOfIterations = Param(Params._dummy(),
                               "numberOfIterations",
                               "Number of iterations in training, converges to better accuracy",
                               typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(DependencyParserApproach,
              self).__init__(classname="com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParserApproach")
        self._setDefault(numberOfIterations=10)

    def setNumberOfIterations(self, value):
        return self._set(numberOfIterations=value)

    def setDependencyTreeBank(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
        opts = options.copy()
        return self._set(dependencyTreeBank=ExternalResource(path, read_as, opts))

    def setConllU(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
        opts = options.copy()
        return self._set(conllU=ExternalResource(path, read_as, opts))

    def _create_model(self, java_model):
        return DependencyParserModel(java_model=java_model)

class DependencyParserModel(AnnotatorModel):
    name = "DependencyParserModel"

    perceptron = Param(Params._dummy(),
                       "perceptron",
                       "Dependency parsing perceptron features",
                       typeConverter=TypeConverters.identity)

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParserModel", java_model=None):
        super(DependencyParserModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="dependency_conllu", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(DependencyParserModel, name, lang, remote_loc)

class TypedDependencyParserApproach(AnnotatorApproach):
    conll2009 = Param(Params._dummy(),
                      "conll2009",
                      "Path to file with CoNLL 2009 format",
                      typeConverter=TypeConverters.identity)

    conllU = Param(Params._dummy(),
                   "conllU",
                   "Universal Dependencies source files",
                   typeConverter=TypeConverters.identity)

    numberOfIterations = Param(Params._dummy(),
                               "numberOfIterations",
                               "Number of iterations in training, converges to better accuracy",
                               typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(TypedDependencyParserApproach,
              self).__init__(classname="com.johnsnowlabs.nlp.annotators.parser.typdep.TypedDependencyParserApproach")

    def setConll2009(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
        opts = options.copy()
        return self._set(conll2009=ExternalResource(path, read_as, opts))

    def setConllU(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
        opts = options.copy()
        return self._set(conllU=ExternalResource(path, read_as, opts))

    def setNumberOfIterations(self, value):
        return self._set(numberOfIterations=value)

    def _create_model(self, java_model):
        return TypedDependencyParserModel(java_model=java_model)

class TypedDependencyParserModel(AnnotatorModel):

    name = "TypedDependencyParserModel"

    trainOptions = Param(Params._dummy(),
                         "trainOptions",
                         "Training Options",
                         typeConverter=TypeConverters.identity)

    trainParameters = Param(Params._dummy(),
                            "trainParameters",
                            "Training Parameters",
                            typeConverter=TypeConverters.identity)

    trainDependencyPipe = Param(Params._dummy(),
                                "trainDependencyPipe",
                                "Training dependency pipe",
                                typeConverter=TypeConverters.identity)

    conllFormat = Param(Params._dummy(),
                        "conllFormat",
                        "CoNLL Format",
                        typeConverter=TypeConverters.toString)

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.parser.typdep.TypedDependencyParserModel",
                 java_model=None):
        super(TypedDependencyParserModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="dependency_typed_conllu", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(TypedDependencyParserModel, name, lang, remote_loc)

class WordEmbeddings(AnnotatorApproach, HasEmbeddingsProperties, HasStorage):

    name = "WordEmbeddings"

    writeBufferSize = Param(Params._dummy(),
                            "writeBufferSize",
                            "buffer size limit before dumping to disk storage while writing",
                            typeConverter=TypeConverters.toInt)

    readCacheSize = Param(Params._dummy(),
                          "readCacheSize",
                          "cache size for items retrieved from storage. Increase for performance but higher memory consumption",
                          typeConverter=TypeConverters.toInt)

    def setWriteBufferSize(self, v):
        return self._set(writeBufferSize=v)

    def setReadCacheSize(self, v):
        return self._set(readCacheSize=v)

    @keyword_only
    def __init__(self):
        super(WordEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddings")
        self._setDefault(
            caseSensitive=False,
            writeBufferSize=10000,
            storageRef=self.uid
        )

    def _create_model(self, java_model):
        return WordEmbeddingsModel(java_model=java_model)

class WordEmbeddingsModel(AnnotatorModel, HasEmbeddingsProperties, HasStorageModel):

    name = "WordEmbeddingsModel"
    databases = ['EMBEDDINGS']

    readCacheSize = Param(Params._dummy(),
                          "readCacheSize",
                          "cache size for items retrieved from storage. Increase for performance but higher memory consumption",
                          typeConverter=TypeConverters.toInt)

    def setReadCacheSize(self, v):
        return self._set(readCacheSize=v)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel", java_model=None):
        super(WordEmbeddingsModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def overallCoverage(dataset, embeddings_col):
        from sparknlp.internal import _EmbeddingsOverallCoverage
        from sparknlp.common import CoverageResult
        return CoverageResult(_EmbeddingsOverallCoverage(dataset, embeddings_col).apply())

    @staticmethod
    def withCoverageColumn(dataset, embeddings_col, output_col='coverage'):
        from sparknlp.internal import _EmbeddingsCoverageColumn
        from pyspark.sql import DataFrame
        return DataFrame(_EmbeddingsCoverageColumn(dataset, embeddings_col, output_col).apply(), dataset.sql_ctx)

    @staticmethod
    def pretrained(name="glove_100d", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(WordEmbeddingsModel, name, lang, remote_loc)

    @staticmethod
    def loadStorage(path, spark, storage_ref):
        HasStorageModel.loadStorages(path, spark, storage_ref, WordEmbeddingsModel.databases)

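The two static coverage helpers above report how much of a corpus is matched by a static embeddings model; a sketch (assumes result_df already holds an "embeddings" column produced by a WordEmbeddingsModel stage):

from sparknlp.annotator import WordEmbeddingsModel

# coverage statistics (covered vs. total tokens) for the whole DataFrame
coverage = WordEmbeddingsModel.overallCoverage(result_df, "embeddings")

# the same information appended per row as a "coverage" column
with_coverage = WordEmbeddingsModel.withCoverageColumn(result_df, "embeddings", "coverage")
with_coverage.select("coverage").show(truncate=False)
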
class BertEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasCaseSensitiveProperties, HasStorageRef):

    name = "BertEmbeddings"

    maxSentenceLength = Param(Params._dummy(),
                              "maxSentenceLength",
                              "Max sentence length to process",
                              typeConverter=TypeConverters.toInt)

    batchSize = Param(Params._dummy(),
                      "batchSize",
                      "Batch size. Large values allows faster processing but requires more memory.",
                      typeConverter=TypeConverters.toInt)

    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListString)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def setMaxSentenceLength(self, value):
        return self._set(maxSentenceLength=value)

    def setBatchSize(self, value):
        return self._set(batchSize=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.BertEmbeddings", java_model=None):
        super(BertEmbeddings, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            dimension=768,
            batchSize=32,
            maxSentenceLength=128,
            caseSensitive=False
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        from sparknlp.internal import _BertLoader
        jModel = _BertLoader(folder, spark_session._jsparkSession)._java_obj
        return BertEmbeddings(java_model=jModel)

    @staticmethod
    def pretrained(name="small_bert_L2_768", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(BertEmbeddings, name, lang, remote_loc)

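Typical placement of BertEmbeddings in a 2.6.x pipeline (a sketch, not the library's documentation; the model name is simply the default from the pretrained() signature above, and sentence/token columns are assumed to exist):

from sparknlp.annotator import BertEmbeddings

bert = BertEmbeddings.pretrained("small_bert_L2_768") \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("embeddings") \
    .setMaxSentenceLength(128) \
    .setBatchSize(8) \
    .setCaseSensitive(False)
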
class BertSentenceEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasCaseSensitiveProperties, HasStorageRef):

    name = "BertSentenceEmbeddings"

    maxSentenceLength = Param(Params._dummy(),
                              "maxSentenceLength",
                              "Max sentence length to process",
                              typeConverter=TypeConverters.toInt)

    batchSize = Param(Params._dummy(),
                      "batchSize",
                      "Batch size. Large values allows faster processing but requires more memory.",
                      typeConverter=TypeConverters.toInt)

    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListString)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def setMaxSentenceLength(self, value):
        return self._set(maxSentenceLength=value)

    def setBatchSize(self, value):
        return self._set(batchSize=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.BertSentenceEmbeddings", java_model=None):
        super(BertSentenceEmbeddings, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            dimension=768,
            batchSize=32,
            maxSentenceLength=128,
            caseSensitive=False
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        from sparknlp.internal import _BertSentenceLoader
        jModel = _BertSentenceLoader(folder, spark_session._jsparkSession)._java_obj
        return BertSentenceEmbeddings(java_model=jModel)

    @staticmethod
    def pretrained(name="sent_small_bert_L2_768", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(BertSentenceEmbeddings, name, lang, remote_loc)

class SentenceEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasStorageRef):

    name = "SentenceEmbeddings"

    @keyword_only
    def __init__(self):
        super(SentenceEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.SentenceEmbeddings")
        self._setDefault(
            poolingStrategy="AVERAGE"
        )

    poolingStrategy = Param(Params._dummy(),
                            "poolingStrategy",
                            "Choose how you would like to aggregate Word Embeddings to Sentence Embeddings: AVERAGE or SUM",
                            typeConverter=TypeConverters.toString)

    def setPoolingStrategy(self, strategy):
        if strategy == "AVERAGE":
            return self._set(poolingStrategy=strategy)
        elif strategy == "SUM":
            return self._set(poolingStrategy=strategy)
        else:
            return self._set(poolingStrategy="AVERAGE")

class StopWordsCleaner(AnnotatorModel):

    name = "StopWordsCleaner"

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.StopWordsCleaner", java_model=None):
        super(StopWordsCleaner, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            stopWords=StopWordsCleaner.loadDefaultStopWords("english"),
            caseSensitive=False,
            locale=self._java_obj.getLocale()
        )

    stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out",
                      typeConverter=TypeConverters.toListString)
    caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " +
                          "comparison over the stop words", typeConverter=TypeConverters.toBoolean)
    locale = Param(Params._dummy(), "locale", "locale of the input. ignored when case sensitive " +
                   "is true", typeConverter=TypeConverters.toString)

    def setStopWords(self, value):
        return self._set(stopWords=value)

    def setCaseSensitive(self, value):
        return self._set(caseSensitive=value)

    def setLocale(self, value):
        return self._set(locale=value)

    def loadDefaultStopWords(language="english"):
        from pyspark.ml.wrapper import _jvm

        """
        Loads the default stop words for the given language.
        Supported languages: danish, dutch, english, finnish, french, german, hungarian,
        italian, norwegian, portuguese, russian, spanish, swedish, turkish
        """
        stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover
        return list(stopWordsObj.loadDefaultStopWords(language))

    @staticmethod
    def pretrained(name="stopwords_en", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(StopWordsCleaner, name, lang, remote_loc)

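Usage sketch for StopWordsCleaner, combining the Spark default stop-word list exposed by loadDefaultStopWords with a custom addition (illustrative only; requires an active SparkSession because the helper goes through the JVM):

from sparknlp.annotator import StopWordsCleaner

custom_stop_words = StopWordsCleaner.loadDefaultStopWords("english") + ["etc"]

cleaner = StopWordsCleaner() \
    .setInputCols(["token"]) \
    .setOutputCol("cleanTokens") \
    .setStopWords(custom_stop_words) \
    .setCaseSensitive(False)
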
class NGramGenerator(AnnotatorModel):

    name = "NGramGenerator"

    @keyword_only
    def __init__(self):
        super(NGramGenerator, self).__init__(classname="com.johnsnowlabs.nlp.annotators.NGramGenerator")
        self._setDefault(
            n=2,
            enableCumulative=False
        )

    n = Param(Params._dummy(), "n", "number elements per n-gram (>=1)", typeConverter=TypeConverters.toInt)
    enableCumulative = Param(Params._dummy(), "enableCumulative", "whether to calculate just the actual n-grams " +
                             "or all n-grams from 1 through n", typeConverter=TypeConverters.toBoolean)

    delimiter = Param(Params._dummy(), "delimiter", "String to use to join the tokens ", typeConverter=TypeConverters.toString)

    def setN(self, value):
        """
        Sets the value of :py:attr:`n`.
        """
        return self._set(n=value)

    def setEnableCumulative(self, value):
        """
        Sets the value of :py:attr:`enableCumulative`.
        """
        return self._set(enableCumulative=value)

    def setDelimiter(self, value):
        """
        Sets the value of :py:attr:`delimiter`.
        """
        if len(value) > 1:
            raise Exception("Delimiter should have length == 1")
        return self._set(delimiter=value)

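NGramGenerator sketch: producing cumulative unigrams and bigrams joined with an underscore (values are illustrative; the setters match the code above, and note that setDelimiter only accepts a single character):

from sparknlp.annotator import NGramGenerator

ngrams = NGramGenerator() \
    .setInputCols(["token"]) \
    .setOutputCol("ngrams") \
    .setN(2) \
    .setEnableCumulative(True) \
    .setDelimiter("_")
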
class ChunkEmbeddings(AnnotatorModel):

    name = "ChunkEmbeddings"

    @keyword_only
    def __init__(self):
        super(ChunkEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.ChunkEmbeddings")
        self._setDefault(
            poolingStrategy="AVERAGE"
        )

    poolingStrategy = Param(Params._dummy(),
                            "poolingStrategy",
                            "Choose how you would like to aggregate Word Embeddings to Chunk Embeddings:" +
                            "AVERAGE or SUM",
                            typeConverter=TypeConverters.toString)
    skipOOV = Param(Params._dummy(), "skipOOV", "Whether to discard default vectors for OOV words from the aggregation / pooling ", typeConverter=TypeConverters.toBoolean)

    def setPoolingStrategy(self, strategy):
        """
        Sets the value of :py:attr:`poolingStrategy`.
        """
        if strategy == "AVERAGE":
            return self._set(poolingStrategy=strategy)
        elif strategy == "SUM":
            return self._set(poolingStrategy=strategy)
        else:
            return self._set(poolingStrategy="AVERAGE")

    def setSkipOOV(self, value):
        """
        Sets the value of :py:attr:`skipOOV`.
        """
        return self._set(skipOOV=value)

class NerOverwriter(AnnotatorModel):

    name = "NerOverwriter"

    @keyword_only
    def __init__(self):
        super(NerOverwriter, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ner.NerOverwriter")
        self._setDefault(
            newResult="I-OVERWRITE"
        )

    stopWords = Param(Params._dummy(), "stopWords", "The words to be overwritten",
                      typeConverter=TypeConverters.toListString)
    newResult = Param(Params._dummy(), "newResult", "new NER class to apply to those stopwords",
                      typeConverter=TypeConverters.toString)

    def setStopWords(self, value):
        return self._set(stopWords=value)

    def setNewResult(self, value):
        return self._set(newResult=value)

class UniversalSentenceEncoder(AnnotatorModel, HasEmbeddingsProperties, HasStorageRef):

    name = "UniversalSentenceEncoder"

    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListString)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.UniversalSentenceEncoder", java_model=None):
        super(UniversalSentenceEncoder, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        from sparknlp.internal import _USELoader
        jModel = _USELoader(folder, spark_session._jsparkSession)._java_obj
        return UniversalSentenceEncoder(java_model=jModel)

    @staticmethod
    def pretrained(name="tfhub_use", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(UniversalSentenceEncoder, name, lang, remote_loc)

class ElmoEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasCaseSensitiveProperties, HasStorageRef):

    name = "ElmoEmbeddings"

    batchSize = Param(Params._dummy(),
                      "batchSize",
                      "Batch size. Large values allows faster processing but requires more memory.",
                      typeConverter=TypeConverters.toInt)

    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListString)

    poolingLayer = Param(Params._dummy(),
                         "poolingLayer", "Set ELMO pooling layer to: word_emb, lstm_outputs1, lstm_outputs2, or elmo",
                         typeConverter=TypeConverters.toString)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def setBatchSize(self, value):
        return self._set(batchSize=value)

    def setPoolingLayer(self, layer):
        if layer == "word_emb":
            return self._set(poolingLayer=layer)
        elif layer == "lstm_outputs1":
            return self._set(poolingLayer=layer)
        elif layer == "lstm_outputs2":
            return self._set(poolingLayer=layer)
        elif layer == "elmo":
            return self._set(poolingLayer=layer)
        else:
            return self._set(poolingLayer="word_emb")

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings", java_model=None):
        super(ElmoEmbeddings, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            batchSize=32,
            poolingLayer="word_emb"
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        from sparknlp.internal import _ElmoLoader
        jModel = _ElmoLoader(folder, spark_session._jsparkSession)._java_obj
        return ElmoEmbeddings(java_model=jModel)

    @staticmethod
    def pretrained(name="elmo", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(ElmoEmbeddings, name, lang, remote_loc)

class ClassifierDLApproach(AnnotatorApproach):

    lr = Param(Params._dummy(), "lr", "Learning Rate", TypeConverters.toFloat)

    batchSize = Param(Params._dummy(), "batchSize", "Batch size", TypeConverters.toInt)

    dropout = Param(Params._dummy(), "dropout", "Dropout coefficient", TypeConverters.toFloat)

    maxEpochs = Param(Params._dummy(), "maxEpochs", "Maximum number of epochs to train", TypeConverters.toInt)

    configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListString)

    validationSplit = Param(Params._dummy(), "validationSplit", "Choose the proportion of training dataset to be validated against the model on each Epoch. The value should be between 0.0 and 1.0 and by default it is 0.0 and off.",
                            TypeConverters.toFloat)

    enableOutputLogs = Param(Params._dummy(), "enableOutputLogs",
                             "Whether to use stdout in addition to Spark logs.",
                             TypeConverters.toBoolean)

    outputLogsPath = Param(Params._dummy(), "outputLogsPath", "Folder path to save training logs", TypeConverters.toString)

    labelColumn = Param(Params._dummy(),
                        "labelColumn",
                        "Column with label per each token",
                        typeConverter=TypeConverters.toString)

    verbose = Param(Params._dummy(), "verbose", "Level of verbosity during training", TypeConverters.toInt)
    randomSeed = Param(Params._dummy(), "randomSeed", "Random seed", TypeConverters.toInt)

    def setVerbose(self, value):
        return self._set(verbose=value)

    def setRandomSeed(self, seed):
        return self._set(randomSeed=seed)

    def setLabelColumn(self, value):
        return self._set(labelColumn=value)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def setLr(self, v):
        self._set(lr=v)
        return self

    def setBatchSize(self, v):
        self._set(batchSize=v)
        return self

    def setDropout(self, v):
        self._set(dropout=v)
        return self

    def setMaxEpochs(self, epochs):
        return self._set(maxEpochs=epochs)

    def _create_model(self, java_model):
        return ClassifierDLModel(java_model=java_model)

    def setValidationSplit(self, v):
        self._set(validationSplit=v)
        return self

    def setEnableOutputLogs(self, value):
        return self._set(enableOutputLogs=value)

    def setOutputLogsPath(self, p):
        return self._set(outputLogsPath=p)

    @keyword_only
    def __init__(self):
        super(ClassifierDLApproach, self).__init__(classname="com.johnsnowlabs.nlp.annotators.classifier.dl.ClassifierDLApproach")
        self._setDefault(
            maxEpochs=30,
            lr=float(0.005),
            batchSize=64,
            dropout=float(0.5),
            enableOutputLogs=False
        )

class ClassifierDLModel(AnnotatorModel, HasStorageRef):
    name = "ClassifierDLModel"

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.ClassifierDLModel", java_model=None):
        super(ClassifierDLModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListString)

    classes = Param(Params._dummy(), "classes",
                    "get the tags used to trained this NerDLModel",
                    TypeConverters.toListString)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    @staticmethod
    def pretrained(name="classifierdl_use_trec6", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(ClassifierDLModel, name, lang, remote_loc)

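ClassifierDLApproach is trained on sentence embeddings; a hedged end-to-end sketch using the UniversalSentenceEncoder defined earlier (the training DataFrame columns "text" and "label" are assumptions):

from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import UniversalSentenceEncoder, ClassifierDLApproach

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

use_embeddings = UniversalSentenceEncoder.pretrained("tfhub_use") \
    .setInputCols(["document"]) \
    .setOutputCol("sentence_embeddings")

classifier = ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("label") \
    .setMaxEpochs(20) \
    .setLr(0.005) \
    .setBatchSize(64) \
    .setEnableOutputLogs(True)

pipeline = Pipeline(stages=[document_assembler, use_embeddings, classifier])
# classifier_model = pipeline.fit(training_df)
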
class AlbertEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasCaseSensitiveProperties, HasStorageRef):

    name = "AlbertEmbeddings"

    batchSize = Param(Params._dummy(),
                      "batchSize",
                      "Batch size. Large values allow faster processing but require more memory.",
                      typeConverter=TypeConverters.toInt)

    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListString)

    maxSentenceLength = Param(Params._dummy(),
                              "maxSentenceLength",
                              "Max sentence length to process",
                              typeConverter=TypeConverters.toInt)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def setBatchSize(self, value):
        return self._set(batchSize=value)

    def setMaxSentenceLength(self, value):
        return self._set(maxSentenceLength=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.AlbertEmbeddings", java_model=None):
        super(AlbertEmbeddings, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            batchSize=32,
            dimension=768,
            maxSentenceLength=128
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        from sparknlp.internal import _AlbertLoader
        jModel = _AlbertLoader(folder, spark_session._jsparkSession)._java_obj
        return AlbertEmbeddings(java_model=jModel)

    @staticmethod
    def pretrained(name="albert_base_uncased", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(AlbertEmbeddings, name, lang, remote_loc)

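A minimal sketch of how the pretrained ALBERT embeddings defined above might be configured; the input and output column names are assumptions about the surrounding pipeline, not part of this class:

```python
# Illustrative sketch (not part of the package): using AlbertEmbeddings with
# the defaults set above (batchSize=32, dimension=768, maxSentenceLength=128).
from sparknlp.annotator import AlbertEmbeddings

albert = AlbertEmbeddings.pretrained("albert_base_uncased", "en") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings") \
    .setBatchSize(16) \
    .setMaxSentenceLength(256)
```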
class XlnetEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasCaseSensitiveProperties, HasStorageRef):

    name = "XlnetEmbeddings"

    batchSize = Param(Params._dummy(),
                      "batchSize",
                      "Batch size. Large values allow faster processing but require more memory.",
                      typeConverter=TypeConverters.toInt)

    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListString)

    maxSentenceLength = Param(Params._dummy(),
                              "maxSentenceLength",
                              "Max sentence length to process",
                              typeConverter=TypeConverters.toInt)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def setBatchSize(self, value):
        return self._set(batchSize=value)

    def setMaxSentenceLength(self, value):
        return self._set(maxSentenceLength=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.XlnetEmbeddings", java_model=None):
        super(XlnetEmbeddings, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            batchSize=32,
            dimension=768,
            maxSentenceLength=128
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        from sparknlp.internal import _XlnetLoader
        jModel = _XlnetLoader(folder, spark_session._jsparkSession)._java_obj
        return XlnetEmbeddings(java_model=jModel)

    @staticmethod
    def pretrained(name="xlnet_base_cased", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(XlnetEmbeddings, name, lang, remote_loc)

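Besides `pretrained`, both embedding classes expose `loadSavedModel` for importing a separately exported TensorFlow model. A sketch under that assumption; the folder path is a placeholder and the export step itself happens outside this module:

```python
# Illustrative sketch (not part of the package): importing an exported XLNet
# SavedModel via loadSavedModel. The folder path is a placeholder.
import sparknlp
from sparknlp.annotator import XlnetEmbeddings

spark = sparknlp.start()

xlnet = XlnetEmbeddings.loadSavedModel("/tmp/exported_xlnet_model", spark) \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings")
```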
class ContextSpellCheckerApproach(AnnotatorApproach):

    name = "ContextSpellCheckerApproach"

    languageModelClasses = Param(Params._dummy(),
                                 "languageModelClasses",
                                 "Number of classes to use during factorization of the softmax output in the LM.",
                                 typeConverter=TypeConverters.toInt)

    wordMaxDistance = Param(Params._dummy(),
                            "wordMaxDistance",
                            "Maximum distance for the generated candidates for every word.",
                            typeConverter=TypeConverters.toInt)

    maxCandidates = Param(Params._dummy(),
                          "maxCandidates",
                          "Maximum number of candidates for every word.",
                          typeConverter=TypeConverters.toInt)

    caseStrategy = Param(Params._dummy(),
                         "caseStrategy",
                         "What case combinations to try when generating candidates.",
                         typeConverter=TypeConverters.toInt)

    errorThreshold = Param(Params._dummy(),
                           "errorThreshold",
                           "Threshold perplexity for a word to be considered as an error.",
                           typeConverter=TypeConverters.toFloat)

    epochs = Param(Params._dummy(),
                   "epochs",
                   "Number of epochs to train the language model.",
                   typeConverter=TypeConverters.toInt)

    batchSize = Param(Params._dummy(),
                      "batchSize",
                      "Batch size for the training in NLM.",
                      typeConverter=TypeConverters.toInt)

    initialRate = Param(Params._dummy(),
                        "initialRate",
                        "Initial learning rate for the LM.",
                        typeConverter=TypeConverters.toFloat)

    finalRate = Param(Params._dummy(),
                      "finalRate",
                      "Final learning rate for the LM.",
                      typeConverter=TypeConverters.toFloat)

    validationFraction = Param(Params._dummy(),
                               "validationFraction",
                               "Percentage of datapoints to use for validation.",
                               typeConverter=TypeConverters.toFloat)

    minCount = Param(Params._dummy(),
                     "minCount",
                     "Min number of times a token should appear to be included in vocab.",
                     typeConverter=TypeConverters.toInt)

    compoundCount = Param(Params._dummy(),
                          "compoundCount",
                          "Min number of times a compound word should appear to be included in vocab.",
                          typeConverter=TypeConverters.toInt)

    classCount = Param(Params._dummy(),
                       "classCount",
                       "Min number of times a word needs to appear in the corpus to not be considered part of a special class.",
                       typeConverter=TypeConverters.toInt)

    tradeoff = Param(Params._dummy(),
                     "tradeoff",
                     "Tradeoff between the cost of a word error and a transition in the language model.",
                     typeConverter=TypeConverters.toFloat)

    weightedDistPath = Param(Params._dummy(),
                             "weightedDistPath",
                             "The path to the file containing the weights for the levenshtein distance.",
                             typeConverter=TypeConverters.toString)

    maxWindowLen = Param(Params._dummy(),
                         "maxWindowLen",
                         "Maximum size for the window used to remember history prior to every correction.",
                         typeConverter=TypeConverters.toInt)

    configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListString)

    def setLanguageModelClasses(self, count):
        return self._set(languageModelClasses=count)

    def setWordMaxDistance(self, dist):
        return self._set(wordMaxDistance=dist)

    def setMaxCandidates(self, candidates):
        return self._set(maxCandidates=candidates)

    def setCaseStrategy(self, strategy):
        return self._set(caseStrategy=strategy)

    def setErrorThreshold(self, threshold):
        return self._set(errorThreshold=threshold)

    def setEpochs(self, count):
        return self._set(epochs=count)

    def setInitialBatchSize(self, size):
        return self._set(batchSize=size)

    def setInitialRate(self, rate):
        return self._set(initialRate=rate)

    def setFinalRate(self, rate):
        return self._set(finalRate=rate)

    def setValidationFraction(self, fraction):
        return self._set(validationFraction=fraction)

    def setMinCount(self, count):
        return self._set(minCount=count)

    def setCompoundCount(self, count):
        return self._set(compoundCount=count)

    def setClassCount(self, count):
        return self._set(classCount=count)

    def setTradeoff(self, alpha):
        return self._set(tradeoff=alpha)

    def setWeightedDistPath(self, path):
        return self._set(weightedDistPath=path)

    def setMaxWindowLen(self, length):
        return self._set(maxWindowLen=length)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def addVocabClass(self, label, vocab, userdist=3):
        self._call_java('addVocabClass', label, vocab, userdist)
        return self

    def addRegexClass(self, label, regex, userdist=3):
        self._call_java('addRegexClass', label, regex, userdist)
        return self

    @keyword_only
    def __init__(self):
        super(ContextSpellCheckerApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerApproach")

    def _create_model(self, java_model):
        return ContextSpellCheckerModel(java_model=java_model)

class ContextSpellCheckerModel(AnnotatorModel):
    name = "ContextSpellCheckerModel"

    wordMaxDistance = Param(Params._dummy(),
                            "wordMaxDistance",
                            "Maximum distance for the generated candidates for every word.",
                            typeConverter=TypeConverters.toInt)

    maxCandidates = Param(Params._dummy(),
                          "maxCandidates",
                          "Maximum number of candidates for every word.",
                          typeConverter=TypeConverters.toInt)

    caseStrategy = Param(Params._dummy(),
                         "caseStrategy",
                         "What case combinations to try when generating candidates.",
                         typeConverter=TypeConverters.toInt)

    errorThreshold = Param(Params._dummy(),
                           "errorThreshold",
                           "Threshold perplexity for a word to be considered as an error.",
                           typeConverter=TypeConverters.toFloat)

    tradeoff = Param(Params._dummy(),
                     "tradeoff",
                     "Tradeoff between the cost of a word error and a transition in the language model.",
                     typeConverter=TypeConverters.toFloat)

    weightedDistPath = Param(Params._dummy(),
                             "weightedDistPath",
                             "The path to the file containing the weights for the levenshtein distance.",
                             typeConverter=TypeConverters.toString)

    maxWindowLen = Param(Params._dummy(),
                         "maxWindowLen",
                         "Maximum size for the window used to remember history prior to every correction.",
                         typeConverter=TypeConverters.toInt)

    gamma = Param(Params._dummy(),
                  "gamma",
                  "Controls the influence of individual word frequency in the decision.",
                  typeConverter=TypeConverters.toFloat)

    correctSymbols = Param(Params._dummy(), "correctSymbols", "Whether to correct special symbols or skip spell checking for them", typeConverter=TypeConverters.toBoolean)

    compareLowcase = Param(Params._dummy(), "compareLowcase", "If true will compare tokens in low case with vocabulary", typeConverter=TypeConverters.toBoolean)

    configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListString)

    def setWordMaxDistance(self, dist):
        return self._set(wordMaxDistance=dist)

    def setMaxCandidates(self, candidates):
        return self._set(maxCandidates=candidates)

    def setCaseStrategy(self, strategy):
        return self._set(caseStrategy=strategy)

    def setErrorThreshold(self, threshold):
        return self._set(errorThreshold=threshold)

    def setTradeoff(self, alpha):
        return self._set(tradeoff=alpha)

    def setWeights(self, weights):
        self._call_java('setWeights', weights)

    def setMaxWindowLen(self, length):
        return self._set(maxWindowLen=length)

    def setGamma(self, g):
        return self._set(gamma=g)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def getWordClasses(self):
        it = self._call_java('getWordClasses').toIterator()
        result = []
        while it.hasNext():
            result.append(it.next().toString())
        return result

    def updateRegexClass(self, label, regex):
        self._call_java('updateRegexClass', label, regex)
        return self

    def updateVocabClass(self, label, vocab, append=True):
        self._call_java('updateVocabClass', label, vocab, append)
        return self

    def setCorrectSymbols(self, value):
        return self._set(correctSymbols=value)

    def setCompareLowcase(self, value):
        return self._set(compareLowcase=value)

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerModel", java_model=None):
        super(ContextSpellCheckerModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="spellcheck_dl", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(ContextSpellCheckerModel, name, lang, remote_loc)

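A minimal sketch of loading the pretrained spell checker defined by this pair of classes and tuning a couple of the parameters declared above; the column names and parameter values are illustrative assumptions:

```python
# Illustrative sketch (not part of the package): the pretrained context-aware
# spell checker, applied to token annotations.
from sparknlp.annotator import ContextSpellCheckerModel

spellchecker = ContextSpellCheckerModel.pretrained("spellcheck_dl", "en") \
    .setInputCols(["token"]) \
    .setOutputCol("corrected") \
    .setErrorThreshold(4.0) \
    .setTradeoff(6.0)
```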
class SentimentDLApproach(AnnotatorApproach):

    lr = Param(Params._dummy(), "lr", "Learning Rate", TypeConverters.toFloat)

    batchSize = Param(Params._dummy(), "batchSize", "Batch size", TypeConverters.toInt)

    dropout = Param(Params._dummy(), "dropout", "Dropout coefficient", TypeConverters.toFloat)

    maxEpochs = Param(Params._dummy(), "maxEpochs", "Maximum number of epochs to train", TypeConverters.toInt)

    configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListString)

    validationSplit = Param(Params._dummy(), "validationSplit", "Choose the proportion of training dataset to be validated against the model on each Epoch. The value should be between 0.0 and 1.0 and by default it is 0.0 and off.",
                            TypeConverters.toFloat)

    enableOutputLogs = Param(Params._dummy(), "enableOutputLogs",
                             "Whether to use stdout in addition to Spark logs.",
                             TypeConverters.toBoolean)

    outputLogsPath = Param(Params._dummy(), "outputLogsPath", "Folder path to save training logs", TypeConverters.toString)

    labelColumn = Param(Params._dummy(),
                        "labelColumn",
                        "Column with label per each token",
                        typeConverter=TypeConverters.toString)

    verbose = Param(Params._dummy(), "verbose", "Level of verbosity during training", TypeConverters.toInt)
    randomSeed = Param(Params._dummy(), "randomSeed", "Random seed", TypeConverters.toInt)
    threshold = Param(Params._dummy(), "threshold", "The minimum threshold for the final result, otherwise it will be neutral", TypeConverters.toFloat)
    thresholdLabel = Param(Params._dummy(), "thresholdLabel", "In case the score is less than threshold, what should be the label. Default is neutral.", TypeConverters.toString)

    def setVerbose(self, value):
        return self._set(verbose=value)

    def setRandomSeed(self, seed):
        return self._set(randomSeed=seed)

    def setLabelColumn(self, value):
        return self._set(labelColumn=value)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def setLr(self, v):
        self._set(lr=v)
        return self

    def setBatchSize(self, v):
        self._set(batchSize=v)
        return self

    def setDropout(self, v):
        self._set(dropout=v)
        return self

    def setMaxEpochs(self, epochs):
        return self._set(maxEpochs=epochs)

    def _create_model(self, java_model):
        return SentimentDLModel(java_model=java_model)

    def setValidationSplit(self, v):
        self._set(validationSplit=v)
        return self

    def setEnableOutputLogs(self, value):
        return self._set(enableOutputLogs=value)

    def setOutputLogsPath(self, p):
        return self._set(outputLogsPath=p)

    def setThreshold(self, v):
        self._set(threshold=v)
        return self

    def setThresholdLabel(self, p):
        return self._set(thresholdLabel=p)

    @keyword_only
    def __init__(self):
        super(SentimentDLApproach, self).__init__(classname="com.johnsnowlabs.nlp.annotators.classifier.dl.SentimentDLApproach")
        self._setDefault(
            maxEpochs=30,
            lr=float(0.005),
            batchSize=64,
            dropout=float(0.5),
            enableOutputLogs=False,
            threshold=0.6,
            thresholdLabel="neutral"
        )

class SentimentDLModel(AnnotatorModel, HasStorageRef):
    name = "SentimentDLModel"

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.SentimentDLModel", java_model=None):
        super(SentimentDLModel, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            threshold=0.6,
            thresholdLabel="neutral"
        )

    configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListString)
    threshold = Param(Params._dummy(), "threshold", "The minimum threshold for the final result, otherwise it will be neutral", TypeConverters.toFloat)
    thresholdLabel = Param(Params._dummy(), "thresholdLabel", "In case the score is less than threshold, what should be the label. Default is neutral.", TypeConverters.toString)
    classes = Param(Params._dummy(), "classes",
                    "get the tags used to train this SentimentDLModel",
                    TypeConverters.toListString)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def setThreshold(self, v):
        self._set(threshold=v)
        return self

    def setThresholdLabel(self, p):
        return self._set(thresholdLabel=p)

    @staticmethod
    def pretrained(name="sentimentdl_use_imdb", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(SentimentDLModel, name, lang, remote_loc)

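A minimal sketch of scoring sentiment with the pretrained model, using the threshold/thresholdLabel fallback set in the defaults above; the input column name is an assumption about an upstream sentence-embedding stage:

```python
# Illustrative sketch (not part of the package): predictions whose score falls
# below the threshold are labelled with thresholdLabel ("neutral" by default).
from sparknlp.annotator import SentimentDLModel

sentiment = SentimentDLModel.pretrained("sentimentdl_use_imdb", "en") \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("sentiment") \
    .setThreshold(0.7) \
    .setThresholdLabel("neutral")
```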
class LanguageDetectorDL(AnnotatorModel, HasStorageRef):
    name = "LanguageDetectorDL"

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ld.dl.LanguageDetectorDL", java_model=None):
        super(LanguageDetectorDL, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            threshold=0.5,
            thresholdLabel="Unknown",
            coalesceSentences=True
        )

    configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListString)
    threshold = Param(Params._dummy(), "threshold", "The minimum threshold for the final result, otherwise it will be either neutral or the value set in thresholdLabel.", TypeConverters.toFloat)
    thresholdLabel = Param(Params._dummy(), "thresholdLabel", "In case the score is less than threshold, what should be the label. Default is Unknown.", TypeConverters.toString)
    coalesceSentences = Param(Params._dummy(), "coalesceSentences", "If set to true, the output of all sentences will be averaged to one output instead of one output per sentence. Defaults to true for this annotator.", TypeConverters.toBoolean)

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    def setThreshold(self, v):
        self._set(threshold=v)
        return self

    def setThresholdLabel(self, p):
        return self._set(thresholdLabel=p)

    def setCoalesceSentences(self, value):
        return self._set(coalesceSentences=value)

    @staticmethod
    def pretrained(name="ld_wiki_20", lang="xx", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(LanguageDetectorDL, name, lang, remote_loc)

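A minimal sketch of the pretrained multilingual language detector; the column names are assumptions about the surrounding pipeline:

```python
# Illustrative sketch (not part of the package): document-level language
# detection, averaging sentence-level scores into a single prediction.
from sparknlp.annotator import LanguageDetectorDL

lang_detector = LanguageDetectorDL.pretrained("ld_wiki_20", "xx") \
    .setInputCols(["document"]) \
    .setOutputCol("language") \
    .setThreshold(0.5) \
    .setThresholdLabel("Unknown") \
    .setCoalesceSentences(True)
```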
class MultiClassifierDLApproach(AnnotatorApproach):

    lr = Param(Params._dummy(), "lr", "Learning Rate", TypeConverters.toFloat)

    batchSize = Param(Params._dummy(), "batchSize", "Batch size", TypeConverters.toInt)

    maxEpochs = Param(Params._dummy(), "maxEpochs", "Maximum number of epochs to train", TypeConverters.toInt)

    configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListString)

    validationSplit = Param(Params._dummy(), "validationSplit", "Choose the proportion of training dataset to be validated against the model on each Epoch. The value should be between 0.0 and 1.0 and by default it is 0.0 and off.",
                            TypeConverters.toFloat)

    enableOutputLogs = Param(Params._dummy(), "enableOutputLogs",
                             "Whether to use stdout in addition to Spark logs.",
                             TypeConverters.toBoolean)

    outputLogsPath = Param(Params._dummy(), "outputLogsPath", "Folder path to save training logs", TypeConverters.toString)

    labelColumn = Param(Params._dummy(),
                        "labelColumn",
                        "Column with label per each token",
                        typeConverter=TypeConverters.toString)

    verbose = Param(Params._dummy(), "verbose", "Level of verbosity during training", TypeConverters.toInt)
    randomSeed = Param(Params._dummy(), "randomSeed", "Random seed", TypeConverters.toInt)
    shufflePerEpoch = Param(Params._dummy(), "shufflePerEpoch", "whether to shuffle the training data on each Epoch", TypeConverters.toBoolean)
    threshold = Param(Params._dummy(), "threshold", "The minimum threshold for each label to be accepted. Default is 0.5", TypeConverters.toFloat)

    def setVerbose(self, v):
        return self._set(verbose=v)

    def setRandomSeed(self, seed):
        return self._set(randomSeed=seed)

    def setLabelColumn(self, v):
        return self._set(labelColumn=v)

    def setConfigProtoBytes(self, v):
        return self._set(configProtoBytes=v)

    def setLr(self, v):
        self._set(lr=v)
        return self

    def setBatchSize(self, v):
        self._set(batchSize=v)
        return self

    def setMaxEpochs(self, v):
        return self._set(maxEpochs=v)

    def _create_model(self, java_model):
        return MultiClassifierDLModel(java_model=java_model)

    def setValidationSplit(self, v):
        self._set(validationSplit=v)
        return self

    def setEnableOutputLogs(self, v):
        return self._set(enableOutputLogs=v)

    def setOutputLogsPath(self, v):
        return self._set(outputLogsPath=v)

    def setShufflePerEpoch(self, v):
        return self._set(shufflePerEpoch=v)

    def setThreshold(self, v):
        self._set(threshold=v)
        return self

    @keyword_only
    def __init__(self):
        super(MultiClassifierDLApproach, self).__init__(classname="com.johnsnowlabs.nlp.annotators.classifier.dl.MultiClassifierDLApproach")
        self._setDefault(
            maxEpochs=10,
            lr=float(0.001),
            batchSize=64,
            validationSplit=float(0.0),
            threshold=float(0.5),
            randomSeed=44,
            shufflePerEpoch=False,
            enableOutputLogs=False
        )

class MultiClassifierDLModel(AnnotatorModel, HasStorageRef):
    name = "MultiClassifierDLModel"

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.MultiClassifierDLModel", java_model=None):
        super(MultiClassifierDLModel, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            threshold=float(0.5)
        )

    configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListString)
    threshold = Param(Params._dummy(), "threshold", "The minimum threshold for each label to be accepted. Default is 0.5", TypeConverters.toFloat)
    classes = Param(Params._dummy(), "classes",
                    "get the tags used to train this MultiClassifierDLModel",
                    TypeConverters.toListString)

    def setThreshold(self, v):
        self._set(threshold=v)
        return self

    def setConfigProtoBytes(self, b):
        return self._set(configProtoBytes=b)

    @staticmethod
    def pretrained(name="multiclassifierdl_use_toxic", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(MultiClassifierDLModel, name, lang, remote_loc)

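A minimal sketch of training the multi-label classifier; it assumes a training DataFrame whose label column holds an array of strings per row, and an upstream stage producing sentence embeddings:

```python
# Illustrative sketch (not part of the package): multi-label classification
# where each document can receive several tags at once.
from sparknlp.annotator import MultiClassifierDLApproach

multi_classifier = MultiClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("categories") \
    .setLabelColumn("labels") \
    .setMaxEpochs(10) \
    .setThreshold(0.5) \
    .setShufflePerEpoch(False)
```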
class YakeModel(AnnotatorModel):
    name = "YakeModel"

    @keyword_only
    def __init__(self):
        super(YakeModel, self).__init__(classname="com.johnsnowlabs.nlp.annotators.keyword.yake.YakeModel")
        self._setDefault(
            minNGrams=2,
            maxNGrams=3,
            nKeywords=30,
            windowSize=3,
            threshold=-1,
            stopWords=YakeModel.loadDefaultStopWords("english")
        )

    minNGrams = Param(Params._dummy(), "minNGrams", "Minimum N-grams a keyword should have", typeConverter=TypeConverters.toInt)
    maxNGrams = Param(Params._dummy(), "maxNGrams", "Maximum N-grams a keyword should have", typeConverter=TypeConverters.toInt)
    threshold = Param(Params._dummy(), "threshold", "Keyword score threshold", typeConverter=TypeConverters.toInt)
    windowSize = Param(Params._dummy(), "windowSize", "Window size for co-occurrence", typeConverter=TypeConverters.toInt)
    nKeywords = Param(Params._dummy(), "nKeywords", "Number of keywords to extract", typeConverter=TypeConverters.toInt)
    stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out. By default it's the English stop words from Spark ML.", typeConverter=TypeConverters.toListString)

    def setWindowSize(self, value):
        return self._set(windowSize=value)

    def setMinNGrams(self, value):
        return self._set(minNGrams=value)

    def setMaxNGrams(self, value):
        return self._set(maxNGrams=value)

    def setThreshold(self, value):
        return self._set(threshold=value)

    def setNKeywords(self, value):
        return self._set(nKeywords=value)

    def setStopWords(self, value):
        return self._set(stopWords=value)

    def getStopWords(self):
        return self.getOrDefault(self.stopWords)

    @staticmethod
    def loadDefaultStopWords(language="english"):
        """
        Loads the default stop words for the given language.
        Supported languages: danish, dutch, english, finnish, french, german, hungarian,
        italian, norwegian, portuguese, russian, spanish, swedish, turkish
        """
        from pyspark.ml.wrapper import _jvm
        stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover
        return list(stopWordsObj.loadDefaultStopWords(language))

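A minimal sketch of unsupervised keyword extraction with the parameters defined above; the column names are assumptions about the surrounding pipeline:

```python
# Illustrative sketch (not part of the package): YAKE keyword extraction over
# token annotations, reusing the Spark ML English stop-word list.
from sparknlp.annotator import YakeModel

yake = YakeModel() \
    .setInputCols(["token"]) \
    .setOutputCol("keywords") \
    .setMinNGrams(1) \
    .setMaxNGrams(3) \
    .setNKeywords(20) \
    .setStopWords(YakeModel.loadDefaultStopWords("english"))
```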
class SentenceDetectorDLModel(AnnotatorModel):
    name = "SentenceDetectorDLModel"

    modelArchitecture = Param(Params._dummy(), "modelArchitecture", "Model architecture (CNN)",
                              typeConverter=TypeConverters.toString)

    def setModel(self, modelArchitecture):
        return self._set(modelArchitecture=modelArchitecture)

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLModel",
                 java_model=None):
        super(SentenceDetectorDLModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="sentence_detector_dl", lang="en", remote_loc=None):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(SentenceDetectorDLModel, name, lang, remote_loc)

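A minimal sketch of the pretrained deep-learning sentence detector; the column names are assumptions about the surrounding pipeline:

```python
# Illustrative sketch (not part of the package): splitting documents into
# sentences with the pretrained CNN-based detector.
from sparknlp.annotator import SentenceDetectorDLModel

sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en") \
    .setInputCols(["document"]) \
    .setOutputCol("sentences")
```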
class SentenceDetectorDLApproach(AnnotatorApproach):

    name = "SentenceDetectorDLApproach"

    modelArchitecture = Param(Params._dummy(),
                              "modelArchitecture",
                              "Model architecture (CNN)",
                              typeConverter=TypeConverters.toString)

    impossiblePenultimates = Param(Params._dummy(),
                                   "impossiblePenultimates",
                                   "Impossible penultimates - list of strings which a sentence can't end with",
                                   typeConverter=TypeConverters.toListString)

    validationSplit = Param(Params._dummy(),
                            "validationSplit",
                            "Choose the proportion of training dataset to be validated against the model on each "
                            "Epoch. The value should be between 0.0 and 1.0 and by default it is 0.0 and off.",
                            TypeConverters.toFloat)

    epochsNumber = Param(Params._dummy(),
                         "epochsNumber",
                         "Number of epochs for the optimization process",
                         TypeConverters.toInt)

    outputLogsPath = Param(Params._dummy(),
                           "outputLogsPath",
                           "Path to folder where logs will be saved. If no path is specified, no logs are generated",
                           TypeConverters.toString)

    def setModel(self, model_architecture):
        return self._set(modelArchitecture=model_architecture)

    def setValidationSplit(self, validation_split):
        return self._set(validationSplit=validation_split)

    def setEpochsNumber(self, epochs_number):
        return self._set(epochsNumber=epochs_number)

    def setOutputLogsPath(self, output_logs_path):
        return self._set(outputLogsPath=output_logs_path)

    def setImpossiblePenultimates(self, impossible_penultimates):
        return self._set(impossiblePenultimates=impossible_penultimates)

    def _create_model(self, java_model):
        return SentenceDetectorDLModel(java_model=java_model)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLApproach"):
        super(SentenceDetectorDLApproach, self).__init__(classname=classname)