spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
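The listing above shows the headline structural change between 2.6.3rc1 and 6.2.1: the monolithic modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, and so on) were removed and replaced by a package hierarchy with one module per annotator. A minimal sketch of what that means for imports, assuming spark-nlp 6.x is installed; the aggregate import keeps working because the new sparknlp/annotator/__init__.py re-exports the per-annotator modules:

    # Aggregate import, unchanged from the 2.x API surface:
    from sparknlp.annotator import CamemBertEmbeddings

    # Equivalent import from the new per-annotator module added in this diff:
    from sparknlp.annotator.embeddings.camembert_embeddings import CamemBertEmbeddings

    print(CamemBertEmbeddings.name)  # -> "CamemBertEmbeddings"

Three of the added files are reproduced in full below.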
--- /dev/null
+++ b/sparknlp/annotator/embeddings/camembert_embeddings.py
@@ -0,0 +1,210 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for CamemBertEmbeddings."""
+
+from sparknlp.common import *
+
+
+class CamemBertEmbeddings(AnnotatorModel,
+                          HasEmbeddingsProperties,
+                          HasCaseSensitiveProperties,
+                          HasStorageRef,
+                          HasBatchedAnnotate,
+                          HasEngine,
+                          HasMaxSentenceLengthLimit):
+    """The CamemBERT model was proposed in CamemBERT: a Tasty French Language Model by
+    Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent
+    Romary, Éric Villemonte de la Clergerie, Djamé Seddah, and Benoît Sagot.
+
+    It is based on Facebook's RoBERTa model released in 2019 and was trained
+    on 138GB of French text.
+
+    Pretrained models can be loaded with ``pretrained`` of the companion object:
+
+    >>> embeddings = CamemBertEmbeddings.pretrained() \\
+    ...     .setInputCols(["token", "document"]) \\
+    ...     .setOutputCol("camembert_embeddings")
+
+    The default model is ``"camembert_base"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
+
+    For extended examples of usage, see the
+    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_bert.ipynb>`__
+    and the
+    `CamemBertEmbeddingsTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddingsTestSpec.scala>`__.
+
+    To see which models are compatible and how to import them see
+    https://github.com/JohnSnowLabs/spark-nlp/discussions/5669.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, TOKEN``    ``WORD_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default True
+    maxSentenceLength
+        Max sentence length to process, by default 128
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+
+    References
+    ----------
+    `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`__
+
+    https://huggingface.co/camembert
+
+    **Paper abstract**
+
+    *Pretrained language models are now ubiquitous in Natural Language Processing.
+    Despite their success, most available models have either been trained on English
+    data or on the concatenation of data in multiple languages. This makes practical
+    use of such models --in all languages except English-- very limited. In this
+    paper, we investigate the feasibility of training monolingual Transformer-based
+    language models for other languages, taking French as an example and evaluating
+    our language models on part-of-speech tagging, dependency parsing, named entity
+    recognition and natural language inference tasks. We show that the use of web
+    crawled data is preferable to the use of Wikipedia data. More surprisingly, we
+    show that a relatively small web crawled dataset (4GB) leads to results that are
+    as good as those obtained using larger datasets (130+GB). Our best performing
+    model CamemBERT reaches or improves the state of the art in all four downstream
+    tasks.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> embeddings = CamemBertEmbeddings.pretrained() \\
+    ...     .setInputCols(["token", "document"]) \\
+    ...     .setOutputCol("camembert_embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["camembert_embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     tokenizer,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["C'est une phrase."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[0.08442357927560806,-0.12863239645957947,-0.03835778683423996,0.200479581952...|
+    |[0.048462312668561935,0.12637358903884888,-0.27429091930389404,-0.07516729831...|
+    |[0.02690504491329193,0.12104076147079468,0.012526623904705048,-0.031543646007...|
+    |[0.05877285450696945,-0.08773420006036758,-0.06381352990865707,0.122621834278...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "CamemBertEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
+
+    configProtoBytes = Param(
+        Params._dummy(),
+        "configProtoBytes",
+        "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+        TypeConverters.toListInt,
+    )
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.CamemBertEmbeddings", java_model=None):
+        super(CamemBertEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=8,
+            dimension=768,
+            maxSentenceLength=128,
+            caseSensitive=True
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        CamemBertEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _CamemBertLoader
+        jModel = _CamemBertLoader(folder, spark_session._jsparkSession)._java_obj
+        return CamemBertEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="camembert_base", lang="fr", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "camembert_base"
+        lang : str, optional
+            Language of the pretrained model, by default "fr"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        CamemBertEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(CamemBertEmbeddings, name, lang, remote_loc)
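The mixins in the class signature above (HasBatchedAnnotate, HasMaxSentenceLengthLimit, HasCaseSensitiveProperties) supply the setters for the documented parameters. A short sketch of tuning them together; the values are illustrative, not recommendations:

    embeddings = CamemBertEmbeddings.pretrained() \
        .setInputCols(["token", "document"]) \
        .setOutputCol("camembert_embeddings") \
        .setBatchSize(16) \
        .setMaxSentenceLength(256) \
        .setCaseSensitive(True)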
--- /dev/null
+++ b/sparknlp/annotator/embeddings/chunk_embeddings.py
@@ -0,0 +1,149 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for ChunkEmbeddings"""
+
+from sparknlp.common import *
+
+
+class ChunkEmbeddings(AnnotatorModel):
+    """This annotator utilizes WordEmbeddings, BertEmbeddings, etc. to generate
+    chunk embeddings from either Chunker, NGramGenerator, or NerConverter
+    outputs.
+
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/embeddings/ChunkEmbeddings.ipynb>`__.
+
+    ========================== ======================
+    Input Annotation types     Output Annotation type
+    ========================== ======================
+    ``CHUNK, WORD_EMBEDDINGS`` ``WORD_EMBEDDINGS``
+    ========================== ======================
+
+    Parameters
+    ----------
+    poolingStrategy
+        Choose how you would like to aggregate Word Embeddings to Chunk
+        Embeddings, by default AVERAGE.
+        Possible Values: ``AVERAGE, SUM``
+    skipOOV
+        Whether to discard default vectors for OOV words from the
+        aggregation/pooling.
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+
+    Extract the Embeddings from the NGrams
+
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> sentence = SentenceDetector() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("sentence")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["sentence"]) \\
+    ...     .setOutputCol("token")
+    >>> nGrams = NGramGenerator() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("chunk") \\
+    ...     .setN(2)
+    >>> embeddings = WordEmbeddingsModel.pretrained() \\
+    ...     .setInputCols(["sentence", "token"]) \\
+    ...     .setOutputCol("embeddings") \\
+    ...     .setCaseSensitive(False)
+
+    Convert the NGram chunks into Word Embeddings
+
+    >>> chunkEmbeddings = ChunkEmbeddings() \\
+    ...     .setInputCols(["chunk", "embeddings"]) \\
+    ...     .setOutputCol("chunk_embeddings") \\
+    ...     .setPoolingStrategy("AVERAGE")
+    >>> pipeline = Pipeline() \\
+    ...     .setStages([
+    ...         documentAssembler,
+    ...         sentence,
+    ...         tokenizer,
+    ...         nGrams,
+    ...         embeddings,
+    ...         chunkEmbeddings
+    ...     ])
+    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(chunk_embeddings) as result") \\
+    ...     .select("result.annotatorType", "result.result", "result.embeddings") \\
+    ...     .show(5, 80)
+    +---------------+----------+--------------------------------------------------------------------------------+
+    |  annotatorType|    result|                                                                      embeddings|
+    +---------------+----------+--------------------------------------------------------------------------------+
+    |word_embeddings|   This is|[-0.55661, 0.42829502, 0.86661, -0.409785, 0.06316501, 0.120775, -0.0732005, ...|
+    |word_embeddings|      is a|[-0.40674996, 0.22938299, 0.50597, -0.288195, 0.555655, 0.465145, 0.140118, 0...|
+    |word_embeddings|a sentence|[0.17417, 0.095253006, -0.0530925, -0.218465, 0.714395, 0.79860497, 0.0129999...|
+    |word_embeddings|sentence .|[0.139705, 0.177955, 0.1887775, -0.45545, 0.20030999, 0.461557, -0.07891501, ...|
+    +---------------+----------+--------------------------------------------------------------------------------+
+    """
+
+    name = "ChunkEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.CHUNK, AnnotatorType.WORD_EMBEDDINGS]
+
+    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
+
+    @keyword_only
+    def __init__(self):
+        super(ChunkEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.ChunkEmbeddings")
+        self._setDefault(
+            poolingStrategy="AVERAGE"
+        )
+
+    poolingStrategy = Param(Params._dummy(),
+                            "poolingStrategy",
+                            "Choose how you would like to aggregate Word Embeddings to Chunk Embeddings: " +
+                            "AVERAGE or SUM",
+                            typeConverter=TypeConverters.toString)
+    skipOOV = Param(Params._dummy(), "skipOOV",
+                    "Whether to discard default vectors for OOV words from the aggregation/pooling",
+                    typeConverter=TypeConverters.toBoolean)
+
+    def setPoolingStrategy(self, strategy):
+        """Sets how to aggregate Word Embeddings to Chunk Embeddings, by default
+        AVERAGE.
+
+        Possible Values: ``AVERAGE, SUM``
+
+        Parameters
+        ----------
+        strategy : str
+            Aggregation Strategy
+        """
+        if strategy == "AVERAGE":
+            return self._set(poolingStrategy=strategy)
+        elif strategy == "SUM":
+            return self._set(poolingStrategy=strategy)
+        else:
+            return self._set(poolingStrategy="AVERAGE")
+
+    def setSkipOOV(self, value):
+        """Sets whether to discard default vectors for OOV words from the
+        aggregation/pooling.
+
+        Parameters
+        ----------
+        value : bool
+            whether to discard default vectors for OOV words from the
+            aggregation/pooling.
+        """
+        return self._set(skipOOV=value)
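For clarity on what the two poolingStrategy values compute: each chunk embedding is an element-wise aggregate of the word vectors of the tokens the chunk spans. A toy illustration in plain NumPy (not Spark NLP code; the vectors are hypothetical):

    import numpy as np

    # Hypothetical 3-dimensional word vectors for the tokens of the chunk "This is"
    this_vec = np.array([-0.5566, 0.4283, 0.8666])
    is_vec = np.array([-0.2569, 0.0305, 0.1453])

    chunk_average = (this_vec + is_vec) / 2  # what poolingStrategy="AVERAGE" produces
    chunk_sum = this_vec + is_vec            # what poolingStrategy="SUM" produces
    print(chunk_average, chunk_sum)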
--- /dev/null
+++ b/sparknlp/annotator/embeddings/deberta_embeddings.py
@@ -0,0 +1,208 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for DeBertaEmbeddings."""
+from sparknlp.common import *
+
+
+class DeBertaEmbeddings(AnnotatorModel,
+                        HasEmbeddingsProperties,
+                        HasCaseSensitiveProperties,
+                        HasStorageRef,
+                        HasBatchedAnnotate,
+                        HasEngine,
+                        HasMaxSentenceLengthLimit):
+    """The DeBERTa model was proposed in DeBERTa: Decoding-enhanced BERT with
+    Disentangled Attention by Pengcheng He, Xiaodong Liu, Jianfeng Gao, and
+    Weizhu Chen. It is based on Google's BERT model released in 2018 and
+    Facebook's RoBERTa model released in 2019.
+
+    This model requires input tokenization with a SentencePiece model, which is
+    provided by Spark NLP (see the tokenizers package).
+
+    It builds on RoBERTa with disentangled attention and enhanced mask decoder
+    training with half of the data used in RoBERTa.
+
+    Pretrained models can be loaded with ``pretrained`` of the companion object:
+
+    >>> embeddings = DeBertaEmbeddings.pretrained() \\
+    ...     .setInputCols(["sentence", "token"]) \\
+    ...     .setOutputCol("embeddings")
+
+    The default model is ``"deberta_v3_base"``, if no name is provided.
+
+    To see which models are compatible and how to import them see
+    `Import Transformers into Spark NLP 🚀
+    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, TOKEN``    ``WORD_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default
+        True
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+    maxSentenceLength
+        Max sentence length to process, by default 128
+
+    References
+    ----------
+    https://github.com/microsoft/DeBERTa
+
+    https://www.microsoft.com/en-us/research/blog/microsoft-deberta-surpasses-human-performance-on-the-superglue-benchmark/
+
+    **Paper abstract:**
+
+    *Recent progress in pre-trained neural language models has
+    significantly improved the performance of many natural language processing
+    (NLP) tasks. In this paper we propose a new model architecture DeBERTa
+    (Decoding-enhanced BERT with disentangled attention) that improves the BERT
+    and RoBERTa models using two novel techniques. The first is the disentangled
+    attention mechanism, where each word is represented using two vectors that
+    encode its content and position, respectively, and the attention weights
+    among words are computed using disentangled matrices on their contents and
+    relative positions. Second, an enhanced mask decoder is used to replace the
+    output softmax layer to predict the masked tokens for model pretraining. We
+    show that these two techniques significantly improve the efficiency of model
+    pretraining and performance of downstream tasks. Compared to RoBERTa-Large,
+    a DeBERTa model trained on half of the training data performs consistently
+    better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
+    (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by
+    +3.6% (83.2% vs. 86.8%). The DeBERTa code and pre-trained models will be
+    made publicly available at https://github.com/microsoft/DeBERTa.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> embeddings = DeBertaEmbeddings.pretrained() \\
+    ...     .setInputCols(["token", "document"]) \\
+    ...     .setOutputCol("embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True) \\
+    ...     .setCleanAnnotations(False)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     tokenizer,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[1.1342473030090332,-1.3855540752410889,0.9818322062492371,-0.784737348556518...|
+    |[0.847029983997345,-1.047153353691101,-0.1520637571811676,-0.6245765686035156...|
+    |[-0.009860038757324219,-0.13450059294700623,2.707749128341675,1.2916892766952...|
+    |[-0.04192575812339783,-0.5764210224151611,-0.3196685314178467,-0.527840495109...|
+    |[0.15583214163780212,-0.1614152491092682,-0.28423872590065,-0.135491415858268...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "DeBertaEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
+
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.DeBertaEmbeddings", java_model=None):
+        super(DeBertaEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=8,
+            dimension=768,
+            maxSentenceLength=128,
+            caseSensitive=True
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        DeBertaEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _DeBERTaLoader
+        jModel = _DeBERTaLoader(folder, spark_session._jsparkSession)._java_obj
+        return DeBertaEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="deberta_v3_base", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "deberta_v3_base"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        DeBertaEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(DeBertaEmbeddings, name, lang, remote_loc)
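Besides pretrained(), both transformer-embedding annotators in this diff expose loadSavedModel(folder, spark_session) for models exported locally (e.g. from Hugging Face, via the process described in discussion #5669 linked above). A hedged sketch, assuming such an export already exists at a hypothetical path:

    import sparknlp
    from sparknlp.annotator import DeBertaEmbeddings

    spark = sparknlp.start()

    # /tmp/deberta_v3_base_export is a placeholder for a locally exported model
    embeddings = DeBertaEmbeddings.loadSavedModel("/tmp/deberta_v3_base_export", spark) \
        .setInputCols(["document", "token"]) \
        .setOutputCol("embeddings")

    # The loaded annotator can then be persisted and reused like any Spark ML stage:
    embeddings.write().overwrite().save("/tmp/deberta_v3_base_spark_nlp")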