spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- com/johnsnowlabs/nlp/__init__.py +4 -2
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +281 -27
- sparknlp/annotation.py +137 -6
- sparknlp/annotation_audio.py +61 -0
- sparknlp/annotation_image.py +82 -0
- sparknlp/annotator/__init__.py +93 -0
- sparknlp/annotator/audio/__init__.py +16 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/annotator/chunk2_doc.py +85 -0
- sparknlp/annotator/chunker.py +137 -0
- sparknlp/annotator/classifier_dl/__init__.py +61 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/__init__.py +1 -0
- sparknlp/annotator/coref/spanbert_coref.py +221 -0
- sparknlp/annotator/cv/__init__.py +29 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/__init__.py +17 -0
- sparknlp/annotator/dependency/dependency_parser.py +294 -0
- sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +235 -0
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +45 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
- sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
- sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
- sparknlp/annotator/embeddings/doc2vec.py +352 -0
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
- sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
- sparknlp/annotator/embeddings/word2vec.py +353 -0
- sparknlp/annotator/embeddings/word_embeddings.py +385 -0
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
- sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
- sparknlp/annotator/er/__init__.py +16 -0
- sparknlp/annotator/er/entity_ruler.py +267 -0
- sparknlp/annotator/graph_extraction.py +368 -0
- sparknlp/annotator/keyword_extraction/__init__.py +16 -0
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
- sparknlp/annotator/ld_dl/__init__.py +16 -0
- sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
- sparknlp/annotator/lemmatizer.py +250 -0
- sparknlp/annotator/matcher/__init__.py +20 -0
- sparknlp/annotator/matcher/big_text_matcher.py +272 -0
- sparknlp/annotator/matcher/date_matcher.py +303 -0
- sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
- sparknlp/annotator/matcher/regex_matcher.py +221 -0
- sparknlp/annotator/matcher/text_matcher.py +290 -0
- sparknlp/annotator/n_gram_generator.py +141 -0
- sparknlp/annotator/ner/__init__.py +21 -0
- sparknlp/annotator/ner/ner_approach.py +94 -0
- sparknlp/annotator/ner/ner_converter.py +148 -0
- sparknlp/annotator/ner/ner_crf.py +397 -0
- sparknlp/annotator/ner/ner_dl.py +591 -0
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +166 -0
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +230 -0
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/param/__init__.py +17 -0
- sparknlp/annotator/param/classifier_encoder.py +98 -0
- sparknlp/annotator/param/evaluation_dl_params.py +130 -0
- sparknlp/annotator/pos/__init__.py +16 -0
- sparknlp/annotator/pos/perceptron.py +263 -0
- sparknlp/annotator/sentence/__init__.py +17 -0
- sparknlp/annotator/sentence/sentence_detector.py +290 -0
- sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
- sparknlp/annotator/sentiment/__init__.py +17 -0
- sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
- sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
- sparknlp/annotator/seq2seq/__init__.py +35 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/__init__.py +18 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
- sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
- sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
- sparknlp/annotator/stemmer.py +79 -0
- sparknlp/annotator/stop_words_cleaner.py +190 -0
- sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
- sparknlp/annotator/token/__init__.py +19 -0
- sparknlp/annotator/token/chunk_tokenizer.py +118 -0
- sparknlp/annotator/token/recursive_tokenizer.py +205 -0
- sparknlp/annotator/token/regex_tokenizer.py +208 -0
- sparknlp/annotator/token/tokenizer.py +561 -0
- sparknlp/annotator/token2_chunk.py +76 -0
- sparknlp/annotator/ws/__init__.py +16 -0
- sparknlp/annotator/ws/word_segmenter.py +429 -0
- sparknlp/base/__init__.py +30 -0
- sparknlp/base/audio_assembler.py +95 -0
- sparknlp/base/doc2_chunk.py +169 -0
- sparknlp/base/document_assembler.py +164 -0
- sparknlp/base/embeddings_finisher.py +201 -0
- sparknlp/base/finisher.py +217 -0
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/graph_finisher.py +125 -0
- sparknlp/base/has_recursive_fit.py +24 -0
- sparknlp/base/has_recursive_transform.py +22 -0
- sparknlp/base/image_assembler.py +172 -0
- sparknlp/base/light_pipeline.py +429 -0
- sparknlp/base/multi_document_assembler.py +164 -0
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/recursive_pipeline.py +107 -0
- sparknlp/base/table_assembler.py +145 -0
- sparknlp/base/token_assembler.py +124 -0
- sparknlp/common/__init__.py +26 -0
- sparknlp/common/annotator_approach.py +41 -0
- sparknlp/common/annotator_model.py +47 -0
- sparknlp/common/annotator_properties.py +114 -0
- sparknlp/common/annotator_type.py +38 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/coverage_result.py +22 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +1298 -0
- sparknlp/common/read_as.py +33 -0
- sparknlp/common/recursive_annotator_approach.py +35 -0
- sparknlp/common/storage.py +149 -0
- sparknlp/common/utils.py +39 -0
- sparknlp/functions.py +315 -5
- sparknlp/internal/__init__.py +1199 -0
- sparknlp/internal/annotator_java_ml.py +32 -0
- sparknlp/internal/annotator_transformer.py +37 -0
- sparknlp/internal/extended_java_wrapper.py +63 -0
- sparknlp/internal/params_getters_setters.py +71 -0
- sparknlp/internal/recursive.py +70 -0
- sparknlp/logging/__init__.py +15 -0
- sparknlp/logging/comet.py +467 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/__init__.py +17 -0
- sparknlp/pretrained/pretrained_pipeline.py +158 -0
- sparknlp/pretrained/resource_downloader.py +216 -0
- sparknlp/pretrained/utils.py +35 -0
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +20 -0
- sparknlp/training/_tf_graph_builders/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
- sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
- sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
- sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
- sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
- sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
- sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
- sparknlp/training/conll.py +150 -0
- sparknlp/training/conllu.py +103 -0
- sparknlp/training/pos.py +103 -0
- sparknlp/training/pub_tator.py +76 -0
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/training/tfgraphs.py +5 -0
- sparknlp/upload_to_hub.py +149 -0
- sparknlp/util.py +51 -5
- com/__init__.pyc +0 -0
- com/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/__init__.pyc +0 -0
- com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
- com/johnsnowlabs/nlp/__init__.pyc +0 -0
- com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
- spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
- spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
- sparknlp/__init__.pyc +0 -0
- sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
- sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
- sparknlp/__pycache__/base.cpython-36.pyc +0 -0
- sparknlp/__pycache__/common.cpython-36.pyc +0 -0
- sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
- sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
- sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
- sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
- sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
- sparknlp/__pycache__/training.cpython-36.pyc +0 -0
- sparknlp/__pycache__/util.cpython-36.pyc +0 -0
- sparknlp/annotation.pyc +0 -0
- sparknlp/annotator.py +0 -3006
- sparknlp/annotator.pyc +0 -0
- sparknlp/base.py +0 -347
- sparknlp/base.pyc +0 -0
- sparknlp/common.py +0 -193
- sparknlp/common.pyc +0 -0
- sparknlp/embeddings.py +0 -40
- sparknlp/embeddings.pyc +0 -0
- sparknlp/internal.py +0 -288
- sparknlp/internal.pyc +0 -0
- sparknlp/pretrained.py +0 -123
- sparknlp/pretrained.pyc +0 -0
- sparknlp/storage.py +0 -32
- sparknlp/storage.pyc +0 -0
- sparknlp/training.py +0 -62
- sparknlp/training.pyc +0 -0
- sparknlp/util.pyc +0 -0
- {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
# Copyright 2017-2022 John Snow Labs
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
"""Contains classes for Word2Vec."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
from sparknlp.common import *
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Word2VecApproach(AnnotatorApproach, HasStorageRef, HasEnableCachingProperties):
    """Trains a Word2Vec model that creates vector representations of words in a
    text corpus.

    The algorithm first constructs a vocabulary from the corpus and then learns
    vector representation of words in the vocabulary. The vector representation
    can be used as features in natural language processing and machine learning
    algorithms.

    We use Word2Vec implemented in Spark ML. It uses skip-gram model in our
    implementation and a hierarchical softmax method to train the model. The
    variable names in the implementation match the original C implementation.

    For instantiated/pretrained models, see :class:`.Word2VecModel`.

    For available pretrained models please see the `Models Hub <https://sparknlp.org/models>`__.

    ====================== =======================
    Input Annotation types Output Annotation type
    ====================== =======================
    ``TOKEN``              ``WORD_EMBEDDINGS``
    ====================== =======================

    Parameters
    ----------
    vectorSize
        The dimension of codes after transforming from words (> 0), by default
        100
    windowSize
        The window size (context words from [-window, window]) (> 0), by default
        5
    numPartitions
        Number of partitions for sentences of words (> 0), by default 1
    minCount
        The minimum number of times a token must appear to be included in the
        word2vec model's vocabulary (>= 0), by default 1
    maxSentenceLength
        The window size (Maximum length (in words) of each sentence in the input
        data. Any sentence longer than this threshold will be divided into
        chunks up to the size (> 0), by default 1000
    stepSize
        Step size (learning rate) to be used for each iteration of optimization
        (> 0), by default 0.025
    maxIter
        Maximum number of iterations (>= 0), by default 1
    seed
        Random seed, by default 44


    References
    ----------
    For the original C implementation, see https://code.google.com/p/word2vec/

    For the research paper, see `Efficient Estimation of Word Representations in
    Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
    Representations of Words and Phrases and their Compositionality
    <https://arxiv.org/pdf/1310.4546v1.pdf>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = Word2VecApproach() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("embeddings")
    >>> pipeline = Pipeline() \\
    ...     .setStages([
    ...       documentAssembler,
    ...       tokenizer,
    ...       embeddings
    ...     ])
    >>> path = "sherlockholmes.txt"
    >>> dataset = spark.read.text(path).toDF("text")
    >>> pipelineModel = pipeline.fit(dataset)
    """
    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS

    vectorSize = Param(Params._dummy(),
                       "vectorSize",
                       "the dimension of codes after transforming from words (> 0)",
                       typeConverter=TypeConverters.toInt)

    windowSize = Param(Params._dummy(),
                       "windowSize",
                       "the window size (context words from [-window, window]) (> 0)",
                       typeConverter=TypeConverters.toInt)

    numPartitions = Param(Params._dummy(),
                          "numPartitions",
                          "number of partitions for sentences of words (> 0)",
                          typeConverter=TypeConverters.toInt)

    minCount = Param(Params._dummy(),
                     "minCount",
                     "the minimum number of times a token must " +
                     "appear to be included in the word2vec model's vocabulary (>= 0)",
                     typeConverter=TypeConverters.toInt)

    maxSentenceLength = Param(Params._dummy(),
                              "maxSentenceLength",
                              "the window size (Maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will " +
                              "be divided into chunks up to the size (> 0)",
                              typeConverter=TypeConverters.toInt)

    stepSize = Param(Params._dummy(),
                     "stepSize",
                     "Step size (learning rate) to be used for each iteration of optimization (> 0)",
                     typeConverter=TypeConverters.toFloat)

    maxIter = Param(Params._dummy(),
                    "maxIter",
                    "maximum number of iterations (>= 0)",
                    typeConverter=TypeConverters.toInt)

    seed = Param(Params._dummy(),
                 "seed",
                 "Random seed",
                 typeConverter=TypeConverters.toInt)

    def setVectorSize(self, vectorSize):
        """
        Sets vector size (default: 100).
        """
        return self._set(vectorSize=vectorSize)

    def setWindowSize(self, windowSize):
        """
        Sets window size (default: 5).
        """
        return self._set(windowSize=windowSize)

    def setStepSize(self, stepSize):
        """
        Sets initial learning rate (default: 0.025).
        """
        return self._set(stepSize=stepSize)

    def setNumPartitions(self, numPartitions):
        """
        Sets number of partitions (default: 1). Use a small number for
        accuracy.
        """
        return self._set(numPartitions=numPartitions)

    def setMaxIter(self, numIterations):
        """
        Sets number of iterations (default: 1), which should be smaller
        than or equal to number of partitions.
        """
        return self._set(maxIter=numIterations)

    def setSeed(self, seed):
        """
        Sets random seed.
        """
        return self._set(seed=seed)

    def setMinCount(self, minCount):
        """
        Sets minCount, the minimum number of times a token must appear
        to be included in the word2vec model's vocabulary (default: 1).
        """
        # NOTE: an earlier docstring claimed a default of 5, but the actual
        # default set in __init__ via _setDefault is minCount=1.
        return self._set(minCount=minCount)

    def setMaxSentenceLength(self, maxSentenceLength):
        """
        Maximum length (in words) of each sentence in the input data.
        Any sentence longer than this threshold will be divided into
        chunks up to the size (> 0)
        """
        return self._set(maxSentenceLength=maxSentenceLength)

    @keyword_only
    def __init__(self):
        super(Word2VecApproach, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.Word2VecApproach")
        # Defaults mirror Spark ML's Word2Vec defaults, except seed which is
        # pinned to 44 for reproducibility.
        self._setDefault(
            vectorSize=100,
            windowSize=5,
            numPartitions=1,
            minCount=1,
            maxSentenceLength=1000,
            stepSize=0.025,
            maxIter=1,
            seed=44
        )

    def _create_model(self, java_model):
        # Wrap the fitted JVM model in its Python companion class.
        return Word2VecModel(java_model=java_model)
class Word2VecModel(AnnotatorModel, HasStorageRef, HasEmbeddingsProperties):
    """Word2Vec model that creates vector representations of words in a text
    corpus.

    The algorithm first constructs a vocabulary from the corpus and then learns
    vector representation of words in the vocabulary. The vector representation
    can be used as features in natural language processing and machine learning
    algorithms.

    We use Word2Vec implemented in Spark ML. It uses skip-gram model in our
    implementation and a hierarchical softmax method to train the model. The
    variable names in the implementation match the original C implementation.

    This is the instantiated model of the :class:`.Word2VecApproach`. For
    training your own model, please see the documentation of that class.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> embeddings = Word2VecModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("embeddings")

    The default model is `"word2vec_gigaword_300"`, if no name is provided.

    ====================== =======================
    Input Annotation types Output Annotation type
    ====================== =======================
    ``TOKEN``              ``WORD_EMBEDDINGS``
    ====================== =======================

    Parameters
    ----------
    vectorSize
        The dimension of codes after transforming from words (> 0), by default
        100

    References
    ----------
    For the original C implementation, see https://code.google.com/p/word2vec/

    For the research paper, see `Efficient Estimation of Word Representations in
    Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
    Representations of Words and Phrases and their Compositionality
    <https://arxiv.org/pdf/1310.4546v1.pdf>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = Word2VecModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("embeddings")
    >>> embeddingsFinisher = EmbeddingsFinisher() \\
    ...     .setInputCols(["embeddings"]) \\
    ...     .setOutputCols("finished_embeddings") \\
    ...     .setOutputAsVector(True)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     embeddings,
    ...     embeddingsFinisher
    ... ])
    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
    +--------------------------------------------------------------------------------+
    |                                                                          result|
    +--------------------------------------------------------------------------------+
    |[0.06222493574023247,0.011579325422644615,0.009919632226228714,0.109361454844...|
    +--------------------------------------------------------------------------------+
    """
    name = "Word2VecModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS

    vectorSize = Param(Params._dummy(),
                       "vectorSize",
                       "the dimension of codes after transforming from words (> 0)",
                       typeConverter=TypeConverters.toInt)

    def setVectorSize(self, vectorSize):
        """
        Sets vector size (default: 100).
        """
        return self._set(vectorSize=vectorSize)

    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.Word2VecModel", java_model=None):
        super(Word2VecModel, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            vectorSize=100
        )

    @staticmethod
    def pretrained(name="word2vec_gigaword_300", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "word2vec_gigaword_300"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        Word2VecModel
            The restored model
        """
        # Imported locally to avoid a circular import at module load time.
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(Word2VecModel, name, lang, remote_loc)

    def getVectors(self):
        """
        Returns the vector representation of the words as a dataframe
        with two fields, word and vector.
        """
        return self._call_java("getVectors")