spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
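The listing above shows that the old monolithic modules (sparknlp/annotator.py, sparknlp/base.py, sparknlp/common.py, sparknlp/embeddings.py, ...) are removed and their contents now live in the sparknlp.annotator, sparknlp.base and sparknlp.common packages. As a minimal sketch (not part of the diff), the wildcard imports used throughout the 6.2.1 docstrings still bring the annotators into scope:

import sparknlp
from sparknlp.base import *       # DocumentAssembler, EmbeddingsFinisher, ...
from sparknlp.annotator import *  # Tokenizer, DistilBertEmbeddings, Doc2VecApproach, ...

spark = sparknlp.start()   # starts a Spark session with the Spark NLP jar on the classpath
print(sparknlp.version())  # "6.2.1" after upgrading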
sparknlp/annotator/embeddings/distil_bert_embeddings.py
@@ -0,0 +1,221 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for DistilBertEmbeddings."""
+
+from sparknlp.common import *
+
+
+class DistilBertEmbeddings(AnnotatorModel,
+                           HasEmbeddingsProperties,
+                           HasCaseSensitiveProperties,
+                           HasStorageRef,
+                           HasBatchedAnnotate,
+                           HasEngine,
+                           HasMaxSentenceLengthLimit):
+    """DistilBERT is a small, fast, cheap and light Transformer model trained by
+    distilling BERT base. It has 40% less parameters than ``bert-base-uncased``,
+    runs 60% faster while preserving over 95% of BERT's performances as measured
+    on the GLUE language understanding benchmark.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = DistilBertEmbeddings.pretrained() \\
+    ...     .setInputCols(["document", "token"]) \\
+    ...     .setOutputCol("embeddings")
+
+
+    The default model is ``"distilbert_base_cased"``, if no name is provided.
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
+
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBERT.ipynb>`__.
+    To see which models are compatible and how to import them see
+    `Import Transformers into Spark NLP 🚀
+    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, TOKEN``    ``WORD_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default
+        False
+    maxSentenceLength
+        Max sentence length to process, by default 128
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+
+    Notes
+    -----
+    - DistilBERT doesn't have ``token_type_ids``, you don't need to
+      indicate which token belongs to which segment. Just separate your segments
+      with the separation token ``tokenizer.sep_token`` (or ``[SEP]``).
+    - DistilBERT doesn't have options to select the input positions
+      (``position_ids`` input). This could be added if necessary though,
+      just let us know if you need this option.
+
+    References
+    ----------
+    The DistilBERT model was proposed in the paper
+    `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and
+    lighter <https://arxiv.org/abs/1910.01108>`__.
+
+    **Paper Abstract:**
+
+    *As Transfer Learning from large-scale pre-trained models becomes more
+    prevalent in Natural Language Processing (NLP), operating these
+    large models in on-the-edge and/or under constrained computational
+    training or inference budgets remains challenging. In this work, we
+    propose a method to pre-train a smaller general-purpose language
+    representation model, called DistilBERT, which can then be
+    fine-tuned with good performances on a wide range of tasks like its
+    larger counterparts. While most prior work investigated the use of
+    distillation for building task-specific models, we leverage
+    knowledge distillation during the pretraining phase and show that it
+    is possible to reduce the size of a BERT model by 40%, while
+    retaining 97% of its language understanding capabilities and being
+    60% faster. To leverage the inductive biases learned by larger
+    models during pretraining, we introduce a triple loss combining
+    language modeling, distillation and cosine-distance losses. Our
+    smaller, faster and lighter model is cheaper to pre-train and we
+    demonstrate its capabilities for on-device computations in a
+    proof-of-concept experiment and a comparative on-device study.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> embeddings = DistilBertEmbeddings.pretrained() \\
+    ...     .setInputCols(["document", "token"]) \\
+    ...     .setOutputCol("embeddings") \\
+    ...     .setCaseSensitive(True)
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True) \\
+    ...     .setCleanAnnotations(False)
+    >>> pipeline = Pipeline() \\
+    ...     .setStages([
+    ...         documentAssembler,
+    ...         tokenizer,
+    ...         embeddings,
+    ...         embeddingsFinisher
+    ...     ])
+    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[0.1127224713563919,-0.1982710212469101,0.5360898375511169,-0.272536993026733...|
+    |[0.35534414649009705,0.13215228915214539,0.40981462597846985,0.14036104083061...|
+    |[0.328085333108902,-0.06269335001707077,-0.017595693469047546,-0.024373905733...|
+    |[0.15617232024669647,0.2967822253704071,0.22324979305267334,-0.04568954557180...|
+    |[0.45411425828933716,0.01173491682857275,0.190129816532135,0.1178255230188369...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "DistilBertEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
+
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.DistilBertEmbeddings", java_model=None):
+        super(DistilBertEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            dimension=768,
+            batchSize=8,
+            maxSentenceLength=128,
+            caseSensitive=False
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        DistilBertEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _DistilBertLoader
+        jModel = _DistilBertLoader(folder, spark_session._jsparkSession)._java_obj
+        return DistilBertEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="distilbert_base_cased", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "distilbert_base_cased"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        DistilBertEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(DistilBertEmbeddings, name, lang, remote_loc)
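Besides pretrained(), the new annotator exposes loadSavedModel(folder, spark_session) for importing a locally exported model. A minimal sketch of how that entry point could be wired up; the folder path below is purely illustrative and assumes a model already exported in the Spark NLP format:

import sparknlp
from sparknlp.annotator import DistilBertEmbeddings

spark = sparknlp.start()

# "exported_models/distilbert_base_cased" is a hypothetical folder holding the exported model.
embeddings = DistilBertEmbeddings.loadSavedModel("exported_models/distilbert_base_cased", spark) \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings") \
    .setCaseSensitive(True)

# Persist it once as a regular Spark ML stage so later runs can reload it without re-importing.
embeddings.write().overwrite().save("distilbert_base_cased_spark_nlp")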
sparknlp/annotator/embeddings/doc2vec.py
@@ -0,0 +1,352 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for Doc2Vec."""
+
+from sparknlp.common import *
+
+
+class Doc2VecApproach(AnnotatorApproach, HasStorageRef, HasEnableCachingProperties):
+    """Trains a Word2Vec model that creates vector representations of words in a
+    text corpus.
+
+    The algorithm first constructs a vocabulary from the corpus and then learns
+    vector representation of words in the vocabulary. The vector representation
+    can be used as features in natural language processing and machine learning
+    algorithms.
+
+    We use Word2Vec implemented in Spark ML. It uses skip-gram model in our
+    implementation and a hierarchical softmax method to train the model. The
+    variable names in the implementation match the original C implementation.
+
+    For instantiated/pretrained models, see :class:`.Doc2VecModel`.
+
+    For available pretrained models please see the `Models Hub <https://sparknlp.org/models>`__.
+
+    ====================== =======================
+    Input Annotation types Output Annotation type
+    ====================== =======================
+    ``TOKEN``              ``SENTENCE_EMBEDDINGS``
+    ====================== =======================
+
+    Parameters
+    ----------
+    vectorSize
+        The dimension of codes after transforming from words (> 0), by default
+        100
+    windowSize
+        The window size (context words from [-window, window]) (> 0), by default
+        5
+    numPartitions
+        Number of partitions for sentences of words (> 0), by default 1
+    minCount
+        The minimum number of times a token must appear to be included in the
+        word2vec model's vocabulary (>= 0), by default 1
+    maxSentenceLength
+        The window size (Maximum length (in words) of each sentence in the input
+        data. Any sentence longer than this threshold will be divided into
+        chunks up to the size (> 0), by default 1000
+    stepSize
+        Step size (learning rate) to be used for each iteration of optimization
+        (> 0), by default 0.025
+    maxIter
+        Maximum number of iterations (>= 0), by default 1
+    seed
+        Random seed, by default 44
+
+
+    References
+    ----------
+    For the original C implementation, see https://code.google.com/p/word2vec/
+
+    For the research paper, see `Efficient Estimation of Word Representations in
+    Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
+    Representations of Words and Phrases and their Compositionality
+    <https://arxiv.org/pdf/1310.4546v1.pdf>`__.
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> embeddings = Doc2VecApproach() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("embeddings")
+    >>> pipeline = Pipeline() \\
+    ...     .setStages([
+    ...         documentAssembler,
+    ...         tokenizer,
+    ...         embeddings
+    ...     ])
+    >>> path = "sherlockholmes.txt"
+    >>> dataset = spark.read.text(path).toDF("text")
+    >>> pipelineModel = pipeline.fit(dataset)
+    """
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    vectorSize = Param(Params._dummy(),
+                       "vectorSize",
+                       "the dimension of codes after transforming from words (> 0)",
+                       typeConverter=TypeConverters.toInt)
+
+    windowSize = Param(Params._dummy(),
+                       "windowSize",
+                       "the window size (context words from [-window, window]) (> 0)",
+                       typeConverter=TypeConverters.toInt)
+
+    numPartitions = Param(Params._dummy(),
+                          "numPartitions",
+                          "number of partitions for sentences of words (> 0)",
+                          typeConverter=TypeConverters.toInt)
+
+    minCount = Param(Params._dummy(),
+                     "minCount",
+                     "the minimum number of times a token must " +
+                     "appear to be included in the word2vec model's vocabulary (>= 0)",
+                     typeConverter=TypeConverters.toInt)
+
+    maxSentenceLength = Param(Params._dummy(),
+                              "maxSentenceLength",
+                              "the window size (Maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will " +
+                              "be divided into chunks up to the size (> 0)",
+                              typeConverter=TypeConverters.toInt)
+
+    stepSize = Param(Params._dummy(),
+                     "stepSize",
+                     "Step size (learning rate) to be used for each iteration of optimization (> 0)",
+                     typeConverter=TypeConverters.toFloat)
+
+    maxIter = Param(Params._dummy(),
+                    "maxIter",
+                    "maximum number of iterations (>= 0)",
+                    typeConverter=TypeConverters.toInt)
+
+    seed = Param(Params._dummy(),
+                 "seed",
+                 "Random seed",
+                 typeConverter=TypeConverters.toInt)
+
+    def setVectorSize(self, vectorSize):
+        """
+        Sets vector size (default: 100).
+        """
+        return self._set(vectorSize=vectorSize)
+
+    def setWindowSize(self, windowSize):
+        """
+        Sets window size (default: 5).
+        """
+        return self._set(windowSize=windowSize)
+
+    def setStepSize(self, stepSize):
+        """
+        Sets initial learning rate (default: 0.025).
+        """
+        return self._set(stepSize=stepSize)
+
+    def setNumPartitions(self, numPartitions):
+        """
+        Sets number of partitions (default: 1). Use a small number for
+        accuracy.
+        """
+        return self._set(numPartitions=numPartitions)
+
+    def setMaxIter(self, numIterations):
+        """
+        Sets number of iterations (default: 1), which should be smaller
+        than or equal to number of partitions.
+        """
+        return self._set(maxIter=numIterations)
+
+    def setSeed(self, seed):
+        """
+        Sets random seed.
+        """
+        return self._set(seed=seed)
+
+    def setMinCount(self, minCount):
+        """
+        Sets minCount, the minimum number of times a token must appear
+        to be included in the word2vec model's vocabulary (default: 5).
+        """
+        return self._set(minCount=minCount)
+
+    def setMaxSentenceLength(self, maxSentenceLength):
+        """
+        Maximum length (in words) of each sentence in the input data.
+        Any sentence longer than this threshold will be divided into
+        chunks up to the size (> 0)
+        """
+        return self._set(maxSentenceLength=maxSentenceLength)
+
+    @keyword_only
+    def __init__(self):
+        super(Doc2VecApproach, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.Doc2VecApproach")
+        self._setDefault(
+            vectorSize=100,
+            windowSize=5,
+            numPartitions=1,
+            minCount=1,
+            maxSentenceLength=1000,
+            stepSize=0.025,
+            maxIter=1,
+            seed=44
+        )
+
+    def _create_model(self, java_model):
+        return Doc2VecModel(java_model=java_model)
+
+
+class Doc2VecModel(AnnotatorModel, HasStorageRef, HasEmbeddingsProperties):
+    """Word2Vec model that creates vector representations of words in a text
+    corpus.
+
+    The algorithm first constructs a vocabulary from the corpus and then learns
+    vector representation of words in the vocabulary. The vector representation
+    can be used as features in natural language processing and machine learning
+    algorithms.
+
+    We use Word2Vec implemented in Spark ML. It uses skip-gram model in our
+    implementation and a hierarchical softmax method to train the model. The
+    variable names in the implementation match the original C implementation.
+
+    This is the instantiated model of the :class:`.Doc2VecApproach`. For
+    training your own model, please see the documentation of that class.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = Doc2VecModel.pretrained() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("embeddings")
+
+    The default model is `"doc2vec_gigaword_300"`, if no name is provided.
+
+    ====================== =======================
+    Input Annotation types Output Annotation type
+    ====================== =======================
+    ``TOKEN``              ``SENTENCE_EMBEDDINGS``
+    ====================== =======================
+
+    Parameters
+    ----------
+    vectorSize
+        The dimension of codes after transforming from words (> 0) , by default
+        100
+
+    References
+    ----------
+    For the original C implementation, see https://code.google.com/p/word2vec/
+
+    For the research paper, see `Efficient Estimation of Word Representations in
+    Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
+    Representations of Words and Phrases and their Compositionality
+    <https://arxiv.org/pdf/1310.4546v1.pdf>`__.
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> embeddings = Doc2VecModel.pretrained() \\
+    ...     .setInputCols(["token"]) \\
+    ...     .setOutputCol("embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     tokenizer,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[0.06222493574023247,0.011579325422644615,0.009919632226228714,0.109361454844...|
+    +--------------------------------------------------------------------------------+
+    """
+    name = "Doc2VecModel"
+
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    vectorSize = Param(Params._dummy(),
+                       "vectorSize",
+                       "the dimension of codes after transforming from words (> 0)",
+                       typeConverter=TypeConverters.toInt)
+
+    def setVectorSize(self, vectorSize):
+        """
+        Sets vector size (default: 100).
+        """
+        return self._set(vectorSize=vectorSize)
+
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.Doc2VecModel", java_model=None):
+        super(Doc2VecModel, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            vectorSize=100
+        )
+
+    @staticmethod
+    def pretrained(name="doc2vec_gigaword_300", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "doc2vec_wiki"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        Doc2VecModel
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(Doc2VecModel, name, lang, remote_loc)
+
+    def getVectors(self):
+        """
+        Returns the vector representation of the words as a dataframe
+        with two fields, word and vector.
+        """
+        return self._call_java("getVectors")
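Doc2VecModel.getVectors() above only carries a one-line docstring. A minimal sketch of inspecting the learned vectors after fitting, using a tiny in-memory dataset instead of the sherlockholmes.txt file from the docstring example (the sample text is illustrative only):

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, Doc2VecApproach
from pyspark.ml import Pipeline

spark = sparknlp.start()

documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
doc2vec = Doc2VecApproach() \
    .setInputCols(["token"]) \
    .setOutputCol("embeddings") \
    .setVectorSize(100) \
    .setMinCount(1)

data = spark.createDataFrame([["This is a sentence."], ["This is another sentence."]]).toDF("text")
model = Pipeline(stages=[documentAssembler, tokenizer, doc2vec]).fit(data)

# The fitted Doc2VecModel is the last pipeline stage; getVectors() returns a DataFrame
# with the two fields described above: word and vector.
model.stages[-1].getVectors().show(5, truncate=80)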