spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- com/johnsnowlabs/ml/__init__.py +0 -0
- com/johnsnowlabs/ml/ai/__init__.py +10 -0
- spark_nlp-6.2.1.dist-info/METADATA +362 -0
- spark_nlp-6.2.1.dist-info/RECORD +292 -0
- {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +81 -28
- sparknlp/annotation.py +3 -2
- sparknlp/annotator/__init__.py +6 -0
- sparknlp/annotator/audio/__init__.py +2 -0
- sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
- sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
- sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
- sparknlp/{base → annotator}/chunk2_doc.py +4 -7
- sparknlp/annotator/chunker.py +1 -2
- sparknlp/annotator/classifier_dl/__init__.py +17 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
- sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
- sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
- sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
- sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
- sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
- sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
- sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
- sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
- sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
- sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
- sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
- sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
- sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
- sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
- sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
- sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
- sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
- sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
- sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
- sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
- sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/coref/spanbert_coref.py +4 -18
- sparknlp/annotator/cv/__init__.py +15 -0
- sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
- sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
- sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
- sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
- sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
- sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
- sparknlp/annotator/dataframe_optimizer.py +216 -0
- sparknlp/annotator/date2_chunk.py +88 -0
- sparknlp/annotator/dependency/dependency_parser.py +2 -3
- sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
- sparknlp/annotator/document_character_text_splitter.py +228 -0
- sparknlp/annotator/document_normalizer.py +37 -1
- sparknlp/annotator/document_token_splitter.py +175 -0
- sparknlp/annotator/document_token_splitter_test.py +85 -0
- sparknlp/annotator/embeddings/__init__.py +11 -0
- sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
- sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
- sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
- sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
- sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
- sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
- sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
- sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
- sparknlp/annotator/embeddings/doc2vec.py +7 -1
- sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
- sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
- sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
- sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
- sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
- sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
- sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
- sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
- sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
- sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
- sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
- sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
- sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
- sparknlp/annotator/embeddings/word2vec.py +7 -1
- sparknlp/annotator/embeddings/word_embeddings.py +4 -5
- sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
- sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
- sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
- sparknlp/annotator/er/entity_ruler.py +37 -23
- sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
- sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
- sparknlp/annotator/lemmatizer.py +3 -4
- sparknlp/annotator/matcher/date_matcher.py +35 -3
- sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
- sparknlp/annotator/matcher/regex_matcher.py +3 -3
- sparknlp/annotator/matcher/text_matcher.py +2 -3
- sparknlp/annotator/n_gram_generator.py +1 -2
- sparknlp/annotator/ner/__init__.py +3 -1
- sparknlp/annotator/ner/ner_converter.py +18 -0
- sparknlp/annotator/ner/ner_crf.py +4 -5
- sparknlp/annotator/ner/ner_dl.py +10 -5
- sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
- sparknlp/annotator/ner/ner_overwriter.py +2 -2
- sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
- sparknlp/annotator/normalizer.py +2 -2
- sparknlp/annotator/openai/__init__.py +16 -0
- sparknlp/annotator/openai/openai_completion.py +349 -0
- sparknlp/annotator/openai/openai_embeddings.py +106 -0
- sparknlp/annotator/pos/perceptron.py +6 -7
- sparknlp/annotator/sentence/sentence_detector.py +2 -2
- sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
- sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
- sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
- sparknlp/annotator/seq2seq/__init__.py +17 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
- sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
- sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
- sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
- sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
- sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
- sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
- sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
- sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
- sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
- sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
- sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
- sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
- sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
- sparknlp/annotator/similarity/__init__.py +0 -0
- sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
- sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
- sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
- sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
- sparknlp/annotator/stemmer.py +2 -3
- sparknlp/annotator/stop_words_cleaner.py +3 -4
- sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
- sparknlp/annotator/token/__init__.py +0 -1
- sparknlp/annotator/token/recursive_tokenizer.py +2 -3
- sparknlp/annotator/token/tokenizer.py +2 -3
- sparknlp/annotator/ws/word_segmenter.py +35 -10
- sparknlp/base/__init__.py +2 -3
- sparknlp/base/doc2_chunk.py +0 -3
- sparknlp/base/document_assembler.py +5 -5
- sparknlp/base/embeddings_finisher.py +14 -2
- sparknlp/base/finisher.py +15 -4
- sparknlp/base/gguf_ranking_finisher.py +234 -0
- sparknlp/base/image_assembler.py +69 -0
- sparknlp/base/light_pipeline.py +53 -21
- sparknlp/base/multi_document_assembler.py +9 -13
- sparknlp/base/prompt_assembler.py +207 -0
- sparknlp/base/token_assembler.py +1 -2
- sparknlp/common/__init__.py +2 -0
- sparknlp/common/annotator_type.py +1 -0
- sparknlp/common/completion_post_processing.py +37 -0
- sparknlp/common/match_strategy.py +33 -0
- sparknlp/common/properties.py +914 -9
- sparknlp/internal/__init__.py +841 -116
- sparknlp/internal/annotator_java_ml.py +1 -1
- sparknlp/internal/annotator_transformer.py +3 -0
- sparknlp/logging/comet.py +2 -2
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +902 -0
- sparknlp/partition/partition_transformer.py +200 -0
- sparknlp/pretrained/pretrained_pipeline.py +1 -1
- sparknlp/pretrained/resource_downloader.py +126 -2
- sparknlp/reader/__init__.py +15 -0
- sparknlp/reader/enums.py +19 -0
- sparknlp/reader/pdf_to_text.py +190 -0
- sparknlp/reader/reader2doc.py +124 -0
- sparknlp/reader/reader2image.py +136 -0
- sparknlp/reader/reader2table.py +44 -0
- sparknlp/reader/reader_assembler.py +159 -0
- sparknlp/reader/sparknlp_reader.py +461 -0
- sparknlp/training/__init__.py +1 -0
- sparknlp/training/conll.py +8 -2
- sparknlp/training/spacy_to_annotation.py +57 -0
- sparknlp/util.py +26 -0
- spark_nlp-4.2.6.dist-info/METADATA +0 -1256
- spark_nlp-4.2.6.dist-info/RECORD +0 -196
- {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
- /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
sparknlp/annotator/embeddings/mpnet_embeddings.py

@@ -0,0 +1,192 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for MPNetEmbeddings."""
+
+from sparknlp.common import *
+
+
+class MPNetEmbeddings(AnnotatorModel,
+                      HasEmbeddingsProperties,
+                      HasCaseSensitiveProperties,
+                      HasStorageRef,
+                      HasBatchedAnnotate,
+                      HasMaxSentenceLengthLimit):
+    """Sentence embeddings using MPNet.
+
+    MPNet adopts a novel pre-training method, named masked and permuted language modeling,
+    to inherit the advantages of masked language modeling and permuted language modeling for
+    natural language understanding.
+
+    Note that this annotator is only supported for Spark versions 3.4 and up.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = MPNetEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("mpnet_embeddings")
+
+    The default model is ``"all_mpnet_base_v2"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=MPNet>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+
+    References
+    ----------
+    `MPNet: Masked and Permuted Pre-training for Language Understanding <https://arxiv.org/pdf/2004.09297>`__
+
+    https://github.com/microsoft/MPNet
+
+    **Paper abstract**
+
+    *BERT adopts masked language modeling (MLM) for pre-training and is one of the most successful pre-training models.
+    Since BERT neglects dependency among predicted tokens, XLNet introduces permuted language modeling (PLM) for
+    pre-training to address this problem. However, XLNet does not leverage the full position information of a sentence
+    and thus suffers from position discrepancy between pre-training and fine-tuning. In this paper, we propose MPNet,
+    a novel pre-training method that inherits the advantages of BERT and XLNet and avoids their limitations. MPNet
+    leverages the dependency among predicted tokens through permuted language modeling (vs. MLM in BERT), and takes
+    auxiliary position information as input to make the model see a full sentence and thus reducing the position
+    discrepancy (vs. PLM in XLNet). We pre-train MPNet on a large-scale dataset (over 160GB text corpora) and fine-tune
+    on a variety of down-streaming tasks (GLUE, SQuAD, etc). Experimental results show that MPNet outperforms MLM and
+    PLM by a large margin, and achieves better results on these tasks compared with previous state-of-the-art
+    pre-trained methods (e.g., BERT, XLNet, RoBERTa) under the same model setting.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = MPNetEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("mpnet_embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["mpnet_embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["This is an example sentence", "Each sentence is converted"]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[[0.022502584, -0.078291744, -0.023030775, -0.0051000593, -0.080340415, 0.039...|
+    |[[0.041702367, 0.0010974605, -0.015534201, 0.07092203, -0.0017729357, 0.04661...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "MPNetEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.MPNetEmbeddings", java_model=None):
+        super(MPNetEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            dimension=768,
+            batchSize=8,
+            maxSentenceLength=512,
+            caseSensitive=False,
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        MPNetEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _MPNetLoader
+        jModel = _MPNetLoader(folder, spark_session._jsparkSession)._java_obj
+        return MPNetEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="all_mpnet_base_v2", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "all_mpnet_base_v2"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        MPNetEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(MPNetEmbeddings, name, lang, remote_loc)
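Like the other transformer annotators, the new class can also import an externally exported model through ``loadSavedModel`` and persist it in Spark NLP's own format. A minimal sketch, in which the folder paths are placeholder assumptions, not part of this diff:

>>> embeddings = MPNetEmbeddings.loadSavedModel("/tmp/mpnet_export", spark) \\
...     .setInputCols(["document"]) \\
...     .setOutputCol("mpnet_embeddings")
>>> # save once, then reload later without re-importing
>>> embeddings.write().overwrite().save("/tmp/mpnet_spark_nlp")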
sparknlp/annotator/embeddings/mxbai_embeddings.py

@@ -0,0 +1,184 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for MxbaiEmbeddings."""
+
+from sparknlp.common import *
+
+
+class MxbaiEmbeddings(AnnotatorModel,
+                      HasEmbeddingsProperties,
+                      HasCaseSensitiveProperties,
+                      HasStorageRef,
+                      HasBatchedAnnotate,
+                      HasMaxSentenceLengthLimit):
+    """Sentence embeddings using Mxbai Embeddings.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = MxbaiEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("Mxbai_embeddings")
+
+    The default model is ``"mxbai_large_v1"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=Mxbai>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 1024
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    poolingStrategy
+        Pooling strategy to use for sentence embeddings, by default "cls"
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = MxbaiEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols("embeddings") \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["hello world", "hello moon"]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[0.50387806, 0.5861606, 0.35129607, -0.76046336, -0.32446072, -0.117674336, 0...|
+    |[0.6660665, 0.961762, 0.24854276, -0.1018044, -0.6569202, 0.027635604, 0.1915...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "MxbaiEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    poolingStrategy = Param(Params._dummy(),
+                            "poolingStrategy",
+                            "Pooling strategy to use for sentence embeddings",
+                            TypeConverters.toString)
+
+    def setPoolingStrategy(self, value):
+        """Sets the pooling strategy to use for sentence embeddings.
+
+        Available pooling strategies for sentence embeddings are:
+
+        - `"cls"`: leading `[CLS]` token
+        - `"cls_avg"`: leading `[CLS]` token + mean of all other tokens
+        - `"last"`: embeddings of the last token in the sequence
+        - `"avg"`: mean of all tokens
+        - `"max"`: max of all embedding features of the entire token sequence
+        - `"int"`: an integer number, which represents the index of the token to use as
+          the embedding
+
+        Parameters
+        ----------
+        value : str
+            Pooling strategy to use for sentence embeddings
+        """
+        valid_strategies = {"cls", "cls_avg", "last", "avg", "max"}
+        if value in valid_strategies or value.isdigit():
+            return self._set(poolingStrategy=value)
+        else:
+            raise ValueError(f"Invalid pooling strategy: {value}. "
+                             f"Valid strategies are: {', '.join(valid_strategies)} or an integer.")
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.MxbaiEmbeddings", java_model=None):
+        super(MxbaiEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            dimension=1024,
+            batchSize=8,
+            maxSentenceLength=512,
+            caseSensitive=False,
+            poolingStrategy="cls"
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        MxbaiEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _MxbaiEmbeddingsLoader
+        jModel = _MxbaiEmbeddingsLoader(folder, spark_session._jsparkSession)._java_obj
+        return MxbaiEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="mxbai_large_v1", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "mxbai_large_v1"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        MxbaiEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(MxbaiEmbeddings, name, lang, remote_loc)
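Unlike ``MPNetEmbeddings`` above, ``MxbaiEmbeddings`` also exposes a ``poolingStrategy`` param (default ``"cls"``). A short usage sketch; the choice of ``"avg"`` here is purely illustrative:

>>> embeddings = MxbaiEmbeddings.pretrained("mxbai_large_v1", "en") \\
...     .setInputCols(["document"]) \\
...     .setOutputCol("embeddings") \\
...     .setPoolingStrategy("avg")  # "cls", "cls_avg", "last", "avg", "max", or a token index such as "0"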
sparknlp/annotator/embeddings/nomic_embeddings.py

@@ -0,0 +1,181 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for NomicEmbeddings."""
+
+from sparknlp.common import *
+
+
+class NomicEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasCaseSensitiveProperties, HasStorageRef,
+                      HasBatchedAnnotate, HasMaxSentenceLengthLimit):
+    """Sentence embeddings using NomicEmbeddings.
+
+    nomic-embed-text-v1 is an 8192-context-length text encoder that surpasses OpenAI
+    text-embedding-ada-002 and text-embedding-3-small performance on short and long context tasks.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = NomicEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("nomic_embeddings")
+
+    The default model is ``"nomic_embed_v1"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=Nomic>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+
+    References
+    ----------
+    `Nomic Embed: Training a Reproducible Long Context Text Embedder <https://arxiv.org/pdf/2402.01613>`__
+
+    https://github.com/nomic-ai/contrastors
+
+    **Paper abstract**
+
+    *This technical report describes the training of nomic-embed-text-v1, the first fully
+    reproducible, open-source, open-weights, open-data, 8192 context length English text
+    embedding model that outperforms both OpenAI Ada-002 and OpenAI text-embedding-3-small
+    on short and long-context tasks. We release the training code and model weights under
+    an Apache 2 license. In contrast with other open-source models, we release a training
+    data loader with 235 million curated text pairs that allows for the full replication of
+    nomic-embed-text-v1. You can find code and data to replicate the model at
+    https://github.com/nomic-ai/contrastors.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = NomicEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("nomic_embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["nomic_embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["query: how much protein should a female eat",
+    ...     "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. " +
+    ...     "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a " +
+    ...     "marathon. Check out the chart below to see how much protein you should be eating each day.",
+    ... ]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...|
+    |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "NomicEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.NomicEmbeddings", java_model=None):
+        super(NomicEmbeddings, self).__init__(classname=classname, java_model=java_model)
+        self._setDefault(dimension=768, batchSize=8, maxSentenceLength=512, caseSensitive=False)
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session, use_openvino=False):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+        use_openvino : bool
+            Use OpenVINO backend
+
+        Returns
+        -------
+        NomicEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _NomicLoader
+        jModel = _NomicLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+        return NomicEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="nomic_embed_v1", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "nomic_embed_v1"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        NomicEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(NomicEmbeddings, name, lang, remote_loc)
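Note that the docstring example above prefixes its inputs with ``query: `` and ``passage: ``, following the task-prefix convention of the nomic-embed models. Comparing the two finished vectors on the driver might look like the sketch below; the NumPy cosine computation is an illustration, not part of this diff:

>>> import numpy as np
>>> rows = result.selectExpr("explode(finished_embeddings) as result").collect()
>>> q, p = [row.result.toArray() for row in rows[:2]]
>>> float(np.dot(q, p) / (np.linalg.norm(q) * np.linalg.norm(p)))  # cosine similarity of query vs. passage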
sparknlp/annotator/embeddings/roberta_embeddings.py

@@ -21,7 +21,8 @@ class RoBertaEmbeddings(AnnotatorModel,
                         HasCaseSensitiveProperties,
                         HasStorageRef,
                         HasBatchedAnnotate,
-                        HasEngine):
+                        HasEngine,
+                        HasMaxSentenceLengthLimit):
     """Creates word embeddings using RoBERTa.
 
     The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT
@@ -42,10 +43,10 @@ class RoBertaEmbeddings(AnnotatorModel,
 
     The default model is ``"roberta_base"``, if no name is provided. For
     available pretrained models please see the `Models Hub
-    <https://
+    <https://sparknlp.org/models?task=Embeddings>`__.
 
-    For extended examples of usage, see the `
-    <https://github.com/JohnSnowLabs/spark-nlp
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBERTa.ipynb>`__.
     To see which models are compatible and how to import them see
     `Import Transformers into Spark NLP 🚀
     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
@@ -151,11 +152,6 @@ class RoBertaEmbeddings(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -171,16 +167,6 @@ class RoBertaEmbeddings(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.RoBertaEmbeddings", java_model=None):
         super(RoBertaEmbeddings, self).__init__(
@@ -195,7 +181,7 @@ class RoBertaEmbeddings(AnnotatorModel,
         )
 
     @staticmethod
-    def loadSavedModel(folder, spark_session):
+    def loadSavedModel(folder, spark_session, use_openvino=False):
        """Loads a locally saved model.
 
         Parameters
@@ -204,6 +190,8 @@ class RoBertaEmbeddings(AnnotatorModel,
             Folder of the saved model
         spark_session : pyspark.sql.SparkSession
             The current SparkSession
+        use_openvino : bool
+            Use OpenVINO backend
 
         Returns
         -------
@@ -211,7 +199,7 @@ class RoBertaEmbeddings(AnnotatorModel,
             The restored model
         """
         from sparknlp.internal import _RoBertaLoader
-        jModel = _RoBertaLoader(folder, spark_session._jsparkSession)._java_obj
+        jModel = _RoBertaLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
         return RoBertaEmbeddings(java_model=jModel)
 
     @staticmethod
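The signature change above makes the OpenVINO backend opt-in when importing a locally exported model; the default ``use_openvino=False`` keeps the previous behavior. A minimal sketch, with ``/tmp/roberta_export`` as a placeholder path:

>>> embeddings = RoBertaEmbeddings.loadSavedModel("/tmp/roberta_export", spark, use_openvino=True) \\
...     .setInputCols(["document", "token"]) \\
...     .setOutputCol("embeddings")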
sparknlp/annotator/embeddings/roberta_sentence_embeddings.py

@@ -17,11 +17,12 @@ from sparknlp.common import *
 
 
 class RoBertaSentenceEmbeddings(AnnotatorModel,
-                                HasEmbeddingsProperties,
-                                HasCaseSensitiveProperties,
-                                HasStorageRef,
-                                HasBatchedAnnotate,
-                                HasEngine):
+                                HasEmbeddingsProperties,
+                                HasCaseSensitiveProperties,
+                                HasStorageRef,
+                                HasBatchedAnnotate,
+                                HasEngine,
+                                HasMaxSentenceLengthLimit):
     """Sentence-level embeddings using RoBERTa. The RoBERTa model was proposed in RoBERTa: A Robustly Optimized BERT
     Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy,
     Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. It builds on
@@ -39,7 +40,7 @@ class RoBertaSentenceEmbeddings(AnnotatorModel,
     The default model is ``"sent_roberta_base"``, if no name is provided.
 
     For available pretrained models please see the
-    `Models Hub <https://
+    `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
 
     ====================== =======================
     Input Annotation types Output Annotation type
@@ -119,11 +120,6 @@ class RoBertaSentenceEmbeddings(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -139,16 +135,6 @@ class RoBertaSentenceEmbeddings(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.RoBertaSentenceEmbeddings", java_model=None):
         super(RoBertaSentenceEmbeddings, self).__init__(