spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
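The listing shows that the old flat modules (`sparknlp/annotator.py`, `sparknlp/base.py`, `sparknlp/common.py`, ...) are removed and replaced by sub-packages (`sparknlp/annotator/...`, `sparknlp/base/...`, `sparknlp/common/...`). Below is a minimal import sketch, under the assumption that the new package `__init__` files (e.g. `sparknlp/annotator/__init__.py +93 -0`) re-export the same public names the old flat modules did, so 2.x-style imports keep resolving in 6.2.1:

```python
# Minimal import sketch (assumption: the new sub-packages re-export the public
# names of the old flat modules, so 2.x-style imports still work in 6.2.1).
import sparknlp
from sparknlp.base import DocumentAssembler            # was sparknlp/base.py, now sparknlp/base/
from sparknlp.annotator import Tokenizer, NerDLModel   # was sparknlp/annotator.py, now sparknlp/annotator/
from sparknlp.annotator.ner.ner_dl import NerDLApproach  # new fully qualified module path (see diff below)

spark = sparknlp.start()   # starts a Spark session with the Spark NLP jar attached
print(sparknlp.version())
```

The diff that follows is for `sparknlp/annotator/ner/ner_dl.py` (entry 146 above, +591 lines).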
sparknlp/annotator/ner/ner_dl.py
@@ -0,0 +1,591 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for NerDL."""
+
+ import sys
+
+ from sparknlp.annotator.param import EvaluationDLParams
+ from sparknlp.common import *
+ from sparknlp.annotator.ner.ner_approach import NerApproach
+
+
+ class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams):
+     """This Named Entity Recognition annotator allows you to train a generic NER
+     model based on neural networks.
+
+     The architecture of the neural network is a Char CNNs - BiLSTM - CRF that
+     achieves state-of-the-art results on most datasets.
+
+     For instantiated/pretrained models, see :class:`.NerDLModel`.
+
+     The training data should be a labeled Spark Dataset, in the format of
+     :class:`.CoNLL` 2003 IOB with `Annotation` type columns. The data should
+     have columns of type ``DOCUMENT, TOKEN, WORD_EMBEDDINGS`` and an additional
+     label column of annotator type ``NAMED_ENTITY``.
+
+     Excluding the label, this can be done with, for example:
+
+     - a SentenceDetector,
+     - a Tokenizer and
+     - a WordEmbeddingsModel (any embeddings can be chosen, e.g. BertEmbeddings
+       for BERT based embeddings).
+
+     By default, this approach collects all data points into memory for training. For larger datasets, use
+     ``setEnableMemoryOptimizer(True)``. This will optimize memory usage during training at the cost
+     of speed. Note that this annotator will use as much memory as the largest partition of the
+     input dataset, so we recommend repartitioning it into reasonably sized batches.
+
+     Setting a test dataset to monitor model metrics can be done with
+     ``.setTestDataset``. The method expects a path to a parquet file containing a
+     dataframe that has the same required columns as the training dataframe. The
+     pre-processing steps for the training dataframe should also be applied to the test
+     dataframe. The following example will show how to create the test dataset with a
+     CoNLL dataset:
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> embeddings = WordEmbeddingsModel \\
+     ...     .pretrained() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> preProcessingPipeline = Pipeline().setStages([documentAssembler, embeddings])
+     >>> conll = CoNLL()
+     >>> (train, test) = conll \\
+     ...     .readDataset(spark, "src/test/resources/conll2003/eng.train") \\
+     ...     .randomSplit([0.8, 0.2])
+     >>> preProcessingPipeline \\
+     ...     .fit(test) \\
+     ...     .transform(test) \\
+     ...     .write \\
+     ...     .mode("overwrite") \\
+     ...     .parquet("test_data")
+     >>> tagger = NerDLApproach() \\
+     ...     .setInputCols(["document", "token", "embeddings"]) \\
+     ...     .setLabelColumn("label") \\
+     ...     .setOutputCol("ner") \\
+     ...     .setTestDataset("test_data")
+
+     For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner>`__.
+
+     ==================================== ======================
+     Input Annotation types               Output Annotation type
+     ==================================== ======================
+     ``DOCUMENT, TOKEN, WORD_EMBEDDINGS`` ``NAMED_ENTITY``
+     ==================================== ======================
+
+     Parameters
+     ----------
+     labelColumn
+         Column with label per each token
+     entities
+         Entities to recognize
+     minEpochs
+         Minimum number of epochs to train, by default 0
+     maxEpochs
+         Maximum number of epochs to train, by default 50
+     verbose
+         Level of verbosity during training, by default 2
+     randomSeed
+         Random seed
+     lr
+         Learning Rate, by default 0.001
+     po
+         Learning rate decay coefficient. Real Learning Rate = lr / (1 + po *
+         epoch), by default 0.005
+     batchSize
+         Batch size, by default 8
+     dropout
+         Dropout coefficient, by default 0.5
+     graphFolder
+         Folder path that contains external graph files
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     useContrib
+         Whether to use contrib LSTM Cells. Not compatible with Windows. Might
+         slightly improve accuracy
+     validationSplit
+         Choose the proportion of the training dataset to be validated against the
+         model on each epoch. The value should be between 0.0 and 1.0; by default
+         it is 0.0, which disables validation
+     evaluationLogExtended
+         Whether validation logs should be extended, by default False.
+     testDataset
+         Path to a parquet file of a test dataset. If set, it is used to calculate
+         statistics during training.
+     includeConfidence
+         Whether to include confidence scores in annotation metadata, by default
+         False
+     includeAllConfidenceScores
+         Whether to include all confidence scores in annotation metadata or just
+         the score of the predicted tag, by default False
+     enableOutputLogs
+         Whether to use stdout in addition to Spark logs, by default False
+     outputLogsPath
+         Folder path to save training logs
+     enableMemoryOptimizer
+         Whether to optimize for large datasets or not. Enabling this option can
+         slow down training, by default False
+     useBestModel
+         Whether to restore and use the model that has achieved the best performance
+         at the end of the training.
+     bestModelMetric
+         Whether to check F1 Micro-average or F1 Macro-average as a final metric for the best model
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from sparknlp.training import *
+     >>> from pyspark.ml import Pipeline
+
+     This CoNLL dataset already includes a sentence, token and label
+     column with their respective annotator types. If a custom dataset is used,
+     these need to be defined with, for example:
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+
+     Then the training can start
+
+     >>> embeddings = BertEmbeddings.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("embeddings")
+     >>> nerTagger = NerDLApproach() \\
+     ...     .setInputCols(["sentence", "token", "embeddings"]) \\
+     ...     .setLabelColumn("label") \\
+     ...     .setOutputCol("ner") \\
+     ...     .setMaxEpochs(1) \\
+     ...     .setRandomSeed(0) \\
+     ...     .setVerbose(0)
+     >>> pipeline = Pipeline().setStages([
+     ...     embeddings,
+     ...     nerTagger
+     ... ])
+
+     We use the sentences, tokens, and labels from the CoNLL dataset.
+
+     >>> conll = CoNLL()
+     >>> trainingData = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")
+     >>> pipelineModel = pipeline.fit(trainingData)
+
+     See Also
+     --------
+     NerCrfApproach : for a generic CRF approach
+     NerConverter : to further process the results
+     """
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.WORD_EMBEDDINGS]
+
+     outputAnnotatorType = AnnotatorType.NAMED_ENTITY
+
+     lr = Param(Params._dummy(), "lr", "Learning Rate", TypeConverters.toFloat)
+
+     po = Param(Params._dummy(), "po", "Learning rate decay coefficient. Real Learning Rate = lr / (1 + po * epoch)",
+                TypeConverters.toFloat)
+
+     batchSize = Param(Params._dummy(), "batchSize", "Batch size", TypeConverters.toInt)
+
+     dropout = Param(Params._dummy(), "dropout", "Dropout coefficient", TypeConverters.toFloat)
+
+     graphFolder = Param(Params._dummy(), "graphFolder", "Folder path that contain external graph files",
+                         TypeConverters.toString)
+
+     configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     useContrib = Param(Params._dummy(), "useContrib",
+                        "whether to use contrib LSTM Cells. Not compatible with Windows. Might slightly improve accuracy.",
+                        TypeConverters.toBoolean)
+
+     includeConfidence = Param(Params._dummy(), "includeConfidence",
+                               "whether to include confidence scores in annotation metadata",
+                               TypeConverters.toBoolean)
+
+     includeAllConfidenceScores = Param(Params._dummy(), "includeAllConfidenceScores",
+                                        "whether to include all confidence scores in annotation metadata or just the score of the predicted tag",
+                                        TypeConverters.toBoolean)
+
+     enableMemoryOptimizer = Param(Params._dummy(), "enableMemoryOptimizer",
+                                   "Whether to optimize for large datasets or not. Enabling this option can slow down training.",
+                                   TypeConverters.toBoolean)
+
+     useBestModel = Param(Params._dummy(), "useBestModel",
+                          "Whether to restore and use the model that has achieved the best performance at the end of the training.",
+                          TypeConverters.toBoolean)
+
+     bestModelMetric = Param(Params._dummy(), "bestModelMetric",
+                             "Whether to check F1 Micro-average or F1 Macro-average as a final metric for the best model.",
+                             TypeConverters.toString)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setGraphFolder(self, p):
+         """Sets folder path that contains external graph files.
+
+         Parameters
+         ----------
+         p : str
+             Folder path that contains external graph files
+         """
+         return self._set(graphFolder=p)
+
+     def setUseContrib(self, v):
+         """Sets whether to use contrib LSTM Cells. Not compatible with Windows.
+         Might slightly improve accuracy.
+
+         Parameters
+         ----------
+         v : bool
+             Whether to use contrib LSTM Cells
+
+         Raises
+         ------
+         Exception
+             Windows not supported to use contrib
+         """
+         if v and sys.platform == 'win32':
+             raise Exception("Windows not supported to use contrib")
+         return self._set(useContrib=v)
+
+     def setLr(self, v):
+         """Sets Learning Rate, by default 0.001.
+
+         Parameters
+         ----------
+         v : float
+             Learning Rate
+         """
+         self._set(lr=v)
+         return self
+
+     def setPo(self, v):
+         """Sets Learning rate decay coefficient, by default 0.005.
+
+         Real Learning Rate is lr / (1 + po * epoch).
+
+         Parameters
+         ----------
+         v : float
+             Learning rate decay coefficient
+         """
+         self._set(po=v)
+         return self
+
+     def setBatchSize(self, v):
+         """Sets batch size, by default 8.
+
+         Parameters
+         ----------
+         v : int
+             Batch size
+         """
+         self._set(batchSize=v)
+         return self
+
+     def setDropout(self, v):
+         """Sets dropout coefficient, by default 0.5.
+
+         Parameters
+         ----------
+         v : float
+             Dropout coefficient
+         """
+         self._set(dropout=v)
+         return self
+
+     def setIncludeConfidence(self, value):
+         """Sets whether to include confidence scores in annotation metadata, by
+         default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to include the confidence value in the output.
+         """
+         return self._set(includeConfidence=value)
+
+     def setIncludeAllConfidenceScores(self, value):
+         """Sets whether to include all confidence scores in annotation metadata
+         or just the score of the predicted tag, by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to include all confidence scores in annotation metadata or
+             just the score of the predicted tag
+         """
+         return self._set(includeAllConfidenceScores=value)
+
+     def setEnableMemoryOptimizer(self, value):
+         """Sets whether to optimize for large datasets or not, by default False.
+         Enabling this option can slow down training.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to optimize for large datasets
+         """
+         return self._set(enableMemoryOptimizer=value)
+
+     def setUseBestModel(self, value):
+         """Sets whether to restore and use the model that has achieved the best performance at the end of the training.
+         The monitored metric is F1 on the test dataset if ``testDataset`` is set; otherwise the validation split is used, and if neither is set, the training loss is monitored.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to restore and use the model that has achieved the best performance at the end of the training.
+         """
+         return self._set(useBestModel=value)
+
+     def setBestModelMetric(self, value):
+         """Sets whether to check F1 Micro-average or F1 Macro-average as the final metric for the best model, used when ``setUseBestModel`` is True.
+
+         Parameters
+         ----------
+         value : str
+             Whether to check F1 Micro-average or F1 Macro-average as a final metric for the best model
+         """
+         return self._set(bestModelMetric=value)
+
+     def _create_model(self, java_model):
+         return NerDLModel(java_model=java_model)
+
+     @keyword_only
+     def __init__(self):
+         super(NerDLApproach, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ner.dl.NerDLApproach")
+         uc = False if sys.platform == 'win32' else True
+         self._setDefault(
+             minEpochs=0,
+             maxEpochs=50,
+             lr=float(0.001),
+             po=float(0.005),
+             batchSize=8,
+             dropout=float(0.5),
+             verbose=2,
+             useContrib=uc,
+             validationSplit=float(0.0),
+             evaluationLogExtended=False,
+             includeConfidence=False,
+             includeAllConfidenceScores=False,
+             enableOutputLogs=False,
+             enableMemoryOptimizer=False,
+             useBestModel=False,
+             bestModelMetric="f1_micro"
+         )
+
+
+ class NerDLModel(AnnotatorModel, HasStorageRef, HasBatchedAnnotate, HasEngine):
+     """This Named Entity Recognition annotator is a generic NER model based on
+     neural networks.
+
+     The neural network architecture is Char CNNs - BiLSTM - CRF, which achieves
+     state-of-the-art results on most datasets.
+
+     This is the instantiated model of the :class:`.NerDLApproach`. For training
+     your own model, please see the documentation of that class.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> nerModel = NerDLModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token", "embeddings"]) \\
+     ...     .setOutputCol("ner")
+
+
+     The default model is ``"ner_dl"``, if no name is provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
+     Additionally, pretrained pipelines are available for this module, see
+     `Pipelines <https://sparknlp.org/docs/en/pipelines>`__.
+
+     Note that some pretrained models require specific types of embeddings,
+     depending on which they were trained on. For example, the default model
+     ``"ner_dl"`` requires the WordEmbeddings ``"glove_100d"``.
+
+     For extended examples of usage, see the `Examples
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb>`__.
+
+     ==================================== ======================
+     Input Annotation types               Output Annotation type
+     ==================================== ======================
+     ``DOCUMENT, TOKEN, WORD_EMBEDDINGS`` ``NAMED_ENTITY``
+     ==================================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Size of every batch, by default 8
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     includeConfidence
+         Whether to include confidence scores in annotation metadata, by default
+         False
+     includeAllConfidenceScores
+         Whether to include all confidence scores in annotation metadata or just
+         the score of the predicted tag, by default False
+     classes
+         Tags used to train this NerDLModel
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+
+     First extract the prerequisites for the NerDLModel
+
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> sentence = SentenceDetector() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("sentence")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["sentence"]) \\
+     ...     .setOutputCol("token")
+     >>> embeddings = WordEmbeddingsModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token"]) \\
+     ...     .setOutputCol("bert")
+
+     Then NER can be extracted
+
+     >>> nerTagger = NerDLModel.pretrained() \\
+     ...     .setInputCols(["sentence", "token", "bert"]) \\
+     ...     .setOutputCol("ner")
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     sentence,
+     ...     tokenizer,
+     ...     embeddings,
+     ...     nerTagger
+     ... ])
+     >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("ner.result").show(truncate=False)
+     +------------------------------------+
+     |result                              |
+     +------------------------------------+
+     |[B-ORG, O, O, B-PER, O, O, B-LOC, O]|
+     +------------------------------------+
+
+     See Also
+     --------
+     NerCrfModel : for a generic CRF approach
+     NerConverter : to further process the results
+     """
+     name = "NerDLModel"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.WORD_EMBEDDINGS]
+
+     outputAnnotatorType = AnnotatorType.NAMED_ENTITY
+
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel", java_model=None):
+         super(NerDLModel, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             includeConfidence=False,
+             includeAllConfidenceScores=False,
+             batchSize=8
+         )
+
+     configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+     includeConfidence = Param(Params._dummy(), "includeConfidence",
+                               "whether to include confidence scores in annotation metadata",
+                               TypeConverters.toBoolean)
+     includeAllConfidenceScores = Param(Params._dummy(), "includeAllConfidenceScores",
+                                        "whether to include all confidence scores in annotation metadata or just the score of the predicted tag",
+                                        TypeConverters.toBoolean)
+     classes = Param(Params._dummy(), "classes",
+                     "get the tags used to train this NerDLModel",
+                     TypeConverters.toListString)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setIncludeConfidence(self, value):
+         """Sets whether to include confidence scores in annotation metadata, by
+         default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to include the confidence value in the output.
+         """
+         return self._set(includeConfidence=value)
+
+     def setIncludeAllConfidenceScores(self, value):
+         """Sets whether to include all confidence scores in annotation metadata
+         or just the score of the predicted tag, by default False.
+
+         Parameters
+         ----------
+         value : bool
+             Whether to include all confidence scores in annotation metadata or
+             just the score of the predicted tag
+         """
+         return self._set(includeAllConfidenceScores=value)
+
+     @staticmethod
+     def pretrained(name="ner_dl", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "ner_dl"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         NerDLModel
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(NerDLModel, name, lang, remote_loc)
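
The docstrings above describe ``setTestDataset``, ``setEnableMemoryOptimizer``, ``setUseBestModel`` and ``setBestModelMetric``, but the doctest only exercises the basic training flow. The following is a minimal, hedged sketch that combines them, assuming a local CoNLL file at ``eng.train`` (placeholder path) and using only setters defined or referenced in the diff above:

```python
# Hedged sketch: wiring the evaluation-related options documented above into the
# doctest's training pipeline. File paths ("eng.train", "test_data") are placeholders.
import sparknlp
from sparknlp.annotator import WordEmbeddingsModel, NerDLApproach
from sparknlp.training import CoNLL
from pyspark.ml import Pipeline

spark = sparknlp.start()

# CoNLL.readDataset already yields document, sentence, token and label columns.
conll = CoNLL()
train, test = conll.readDataset(spark, "eng.train").randomSplit([0.8, 0.2])

embeddings = WordEmbeddingsModel.pretrained() \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("embeddings")

# The test split must go through the same pre-processing as the training data
# before it is written out for setTestDataset (see the NerDLApproach docstring).
Pipeline(stages=[embeddings]).fit(test).transform(test) \
    .write.mode("overwrite").parquet("test_data")

ner = NerDLApproach() \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setLabelColumn("label") \
    .setOutputCol("ner") \
    .setMaxEpochs(5) \
    .setEnableMemoryOptimizer(True) \
    .setTestDataset("test_data") \
    .setUseBestModel(True) \
    .setBestModelMetric("f1_micro")

ner_model = Pipeline(stages=[embeddings, ner]).fit(train)
```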