spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
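The listing above replaces the flat 2.6.x modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, ...) with per-topic subpackages. As a rough orientation, here is a minimal import sketch for the 6.x layout; it assumes the new package-level __init__.py files (e.g. the 93-line sparknlp/annotator/__init__.py above) re-export the public names:

    # Import sketch for the 6.x layout (assumption: the package __init__
    # files re-export the public annotator names, as the new
    # sparknlp/annotator/__init__.py in the listing suggests).
    from sparknlp.base import DocumentAssembler              # assemblers live in sparknlp.base
    from sparknlp.annotator import NerCrfApproach            # short path via re-export
    from sparknlp.annotator.ner.ner_crf import NerCrfModel   # explicit module path

If the re-exports hold, 2.6.x-style imports such as from sparknlp.annotator import * keep working unchanged. Below is the new sparknlp/annotator/ner/ner_crf.py from the listing.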
sparknlp/annotator/ner/ner_crf.py
@@ -0,0 +1,397 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for NerCrf."""
+
+from sparknlp.common import *
+from sparknlp.annotator.ner.ner_approach import NerApproach
+
+
+class NerCrfApproach(AnnotatorApproach, NerApproach):
+    """Algorithm for training a Named Entity Recognition Model
+
+    For instantiated/pretrained models, see :class:`.NerCrfModel`.
+
+    This Named Entity recognition annotator allows for a generic model to be
+    trained by utilizing a CRF machine learning algorithm. The training data
+    should be a labeled Spark Dataset, e.g. :class:`.CoNLL` 2003 IOB with
+    `Annotation` type columns. The data should have columns of type
+    ``DOCUMENT, TOKEN, POS, WORD_EMBEDDINGS`` and an additional label column of
+    annotator type ``NAMED_ENTITY``.
+
+    Excluding the label, these columns can be produced with, for example:
+
+    - a :class:`.SentenceDetector`,
+    - a :class:`.Tokenizer`,
+    - a :class:`.PerceptronModel` and
+    - a :class:`.WordEmbeddingsModel`.
+
+    Optionally, the user can provide an entity dictionary file with
+    :meth:`.setExternalFeatures` for better accuracy.
+
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb>`__.
+
+    ========================================= ======================
+    Input Annotation types                    Output Annotation type
+    ========================================= ======================
+    ``DOCUMENT, TOKEN, POS, WORD_EMBEDDINGS`` ``NAMED_ENTITY``
+    ========================================= ======================
+
+    Parameters
+    ----------
+    labelColumn
+        Column with one label per token
+    entities
+        Entities to recognize
+    minEpochs
+        Minimum number of epochs to train, by default 0
+    maxEpochs
+        Maximum number of epochs to train, by default 1000
+    verbose
+        Level of verbosity during training, by default 4
+    randomSeed
+        Random seed
+    l2
+        L2 regularization coefficient, by default 1.0
+    c0
+        c0 param defining decay speed for gradient, by default 2250000
+    lossEps
+        Training stops if the relative improvement per epoch is less than
+        this epsilon, by default 0.001
+    minW
+        Features with a weight lower than this param value will be filtered
+    includeConfidence
+        Whether to include confidence scores in annotation metadata, by default
+        False
+    externalFeatures
+        Additional dictionary paths to use as features
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from sparknlp.training import *
+    >>> from pyspark.ml import Pipeline
+
+    The CoNLL dataset already includes sentence, token, POS tag and label
+    columns with their respective annotator types. If a custom dataset is
+    used, these need to be created with, for example:
+
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentence = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> posTagger = PerceptronModel.pretrained() \\
+    ...     .setInputCols(["sentence", "token"]) \\
+    ...     .setOutputCol("pos")
+
+    Then training can start:
+
+    >>> embeddings = WordEmbeddingsModel.pretrained() \\
+    ...     .setInputCols(["sentence", "token"]) \\
+    ...     .setOutputCol("embeddings") \\
+    ...     .setCaseSensitive(False)
+    >>> nerTagger = NerCrfApproach() \\
+    ...     .setInputCols(["sentence", "token", "pos", "embeddings"]) \\
+    ...     .setLabelColumn("label") \\
+    ...     .setMinEpochs(1) \\
+    ...     .setMaxEpochs(3) \\
+    ...     .setOutputCol("ner")
+    >>> pipeline = Pipeline().setStages([
+    ...     embeddings,
+    ...     nerTagger
+    ... ])
+
+    We use the sentences, tokens, POS tags and labels from the CoNLL dataset.
+
+    >>> conll = CoNLL()
+    >>> trainingData = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")
+    >>> pipelineModel = pipeline.fit(trainingData)
+
+    See Also
+    --------
+    NerDLApproach : for a deep learning based approach
+    NerConverter : to further process the results
+    """
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.POS, AnnotatorType.WORD_EMBEDDINGS]
+
+    outputAnnotatorType = AnnotatorType.NAMED_ENTITY
+
+    l2 = Param(Params._dummy(), "l2", "L2 regularization coefficient", TypeConverters.toFloat)
+
+    c0 = Param(Params._dummy(), "c0", "c0 param defining decay speed for gradient", TypeConverters.toInt)
+
+    lossEps = Param(Params._dummy(), "lossEps", "Training stops if the relative improvement per epoch is less than this epsilon",
+                    TypeConverters.toFloat)
+
+    minW = Param(Params._dummy(), "minW", "Features with a weight lower than this param value will be filtered",
+                 TypeConverters.toFloat)
+
+    includeConfidence = Param(Params._dummy(), "includeConfidence",
+                              "Whether to include confidence scores in annotation metadata",
+                              TypeConverters.toBoolean)
+
+    externalFeatures = Param(Params._dummy(), "externalFeatures", "Additional dictionary paths to use as features",
+                             TypeConverters.identity)
+
+    verbose = Param(Params._dummy(), "verbose", "Level of verbosity during training", TypeConverters.toInt)
+
+    def setL2(self, l2value):
+        """Sets L2 regularization coefficient, by default 1.0.
+
+        Parameters
+        ----------
+        l2value : float
+            L2 regularization coefficient
+        """
+        return self._set(l2=l2value)
+
+    def setC0(self, c0value):
+        """Sets the c0 param defining decay speed for gradient, by default 2250000.
+
+        Parameters
+        ----------
+        c0value : int
+            c0 param defining decay speed for gradient
+        """
+        return self._set(c0=c0value)
+
+    def setLossEps(self, eps):
+        """Sets the loss epsilon, by default 0.001. Training stops if the
+        relative improvement per epoch falls below this value.
+
+        Parameters
+        ----------
+        eps : float
+            The threshold
+        """
+        return self._set(lossEps=eps)
+
+    def setMinW(self, w):
+        """Sets the minimum weight value.
+
+        Features with a weight lower than this param value will be filtered.
+
+        Parameters
+        ----------
+        w : float
+            Minimum weight value
+        """
+        return self._set(minW=w)
+
+    def setExternalFeatures(self, path, delimiter, read_as=ReadAs.TEXT, options={"format": "text"}):
+        """Sets additional dictionary paths to use as features.
+
+        Parameters
+        ----------
+        path : str
+            Path to the source files
+        delimiter : str
+            Delimiter for the dictionary file. Can also be set in `options`.
+        read_as : str, optional
+            How to read the file, by default ReadAs.TEXT
+        options : dict, optional
+            Options to read the resource, by default {"format": "text"}
+        """
+        opts = options.copy()
+        if "delimiter" not in opts:
+            opts["delimiter"] = delimiter
+        return self._set(externalFeatures=ExternalResource(path, read_as, opts))
+
+    def setIncludeConfidence(self, b):
+        """Sets whether to include confidence scores in annotation metadata, by
+        default False.
+
+        Parameters
+        ----------
+        b : bool
+            Whether to include the confidence value in the output.
+        """
+        return self._set(includeConfidence=b)
+
+    def setVerbose(self, verboseValue):
+        """Sets the level of verbosity during training.
+
+        Parameters
+        ----------
+        verboseValue : int
+            Level of verbosity
+        """
+        return self._set(verbose=verboseValue)
+
+    def _create_model(self, java_model):
+        return NerCrfModel(java_model=java_model)
+
+    @keyword_only
+    def __init__(self):
+        super(NerCrfApproach, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach")
+        self._setDefault(
+            minEpochs=0,
+            maxEpochs=1000,
+            l2=float(1),
+            c0=2250000,
+            lossEps=float(1e-3),
+            verbose=4,
+            includeConfidence=False
+        )
+
+
+class NerCrfModel(AnnotatorModel):
+    """Extracts Named Entities based on a CRF Model.
+
+    This Named Entity recognition annotator allows for a generic model to be
+    trained by utilizing a CRF machine learning algorithm. The data should have
+    columns of type ``DOCUMENT, TOKEN, POS, WORD_EMBEDDINGS``. These can be
+    extracted with, for example,
+
+    - a SentenceDetector,
+    - a Tokenizer and
+    - a PerceptronModel.
+
+    This is the instantiated model of the :class:`.NerCrfApproach`. For training
+    your own model, please see the documentation of that class.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> nerTagger = NerCrfModel.pretrained() \\
+    ...     .setInputCols(["sentence", "token", "word_embeddings", "pos"]) \\
+    ...     .setOutputCol("ner")
+
+
+    The default model is ``"ner_crf"``, if no name is provided. For available
+    pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb>`__.
+
+    ========================================= ======================
+    Input Annotation types                    Output Annotation type
+    ========================================= ======================
+    ``DOCUMENT, TOKEN, POS, WORD_EMBEDDINGS`` ``NAMED_ENTITY``
+    ========================================= ======================
+
+    Parameters
+    ----------
+    includeConfidence
+        Whether to include confidence scores in annotation metadata, by default
+        False
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+
+    First, extract the prerequisites for the NerCrfModel:
+
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentence = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> embeddings = WordEmbeddingsModel.pretrained() \\
+    ...     .setInputCols(["sentence", "token"]) \\
+    ...     .setOutputCol("word_embeddings")
+    >>> posTagger = PerceptronModel.pretrained() \\
+    ...     .setInputCols(["sentence", "token"]) \\
+    ...     .setOutputCol("pos")
+
+    Then NER can be extracted:
+
+    >>> nerTagger = NerCrfModel.pretrained() \\
+    ...     .setInputCols(["sentence", "token", "word_embeddings", "pos"]) \\
+    ...     .setOutputCol("ner")
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     sentence,
+    ...     tokenizer,
+    ...     embeddings,
+    ...     posTagger,
+    ...     nerTagger
+    ... ])
+    >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("ner.result").show(truncate=False)
+    +------------------------------------+
+    |result                              |
+    +------------------------------------+
+    |[I-ORG, O, O, I-PER, O, O, I-LOC, O]|
+    +------------------------------------+
+
+    See Also
+    --------
+    NerDLModel : for a deep learning based approach
+    NerConverter : to further process the results
+    """
+    name = "NerCrfModel"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.POS, AnnotatorType.WORD_EMBEDDINGS]
+
+    outputAnnotatorType = AnnotatorType.NAMED_ENTITY
+
+    includeConfidence = Param(Params._dummy(), "includeConfidence",
+                              "Whether to include confidence scores in annotation metadata",
+                              TypeConverters.toBoolean)
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfModel", java_model=None):
+        super(NerCrfModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+
+    def setIncludeConfidence(self, b):
+        """Sets whether to include confidence scores in annotation metadata, by
+        default False.
+
+        Parameters
+        ----------
+        b : bool
+            Whether to include the confidence value in the output.
+        """
+        return self._set(includeConfidence=b)
+
+    @staticmethod
+    def pretrained(name="ner_crf", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "ner_crf"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        NerCrfModel
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(NerCrfModel, name, lang, remote_loc)
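One detail of the setExternalFeatures implementation above worth calling out: the method copies the options dict and injects the delimiter argument only when no "delimiter" key is already present. A minimal sketch of both call styles (the dictionary path below is hypothetical):

    # Sketch of the NerCrfApproach.setExternalFeatures behavior shown in
    # the diff above; /data/ner_dict.txt is a made-up path.
    ner = NerCrfApproach() \
        .setInputCols(["sentence", "token", "pos", "embeddings"]) \
        .setLabelColumn("label") \
        .setOutputCol("ner")

    # Delimiter passed positionally: it is copied into the options,
    # yielding {"format": "text", "delimiter": ","}.
    ner.setExternalFeatures("/data/ner_dict.txt", ",")

    # A "delimiter" already present in options takes precedence and the
    # positional argument is ignored.
    ner.setExternalFeatures("/data/ner_dict.txt", ",",
                            options={"format": "text", "delimiter": "|"})

The options.copy() inside the method also keeps the shared mutable default options={"format": "text"} in the signature from being mutated across calls.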