spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
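
One move worth noting in the list above: entry 13 relocates chunk2_doc.py from sparknlp/base to sparknlp/annotator. A minimal, version-tolerant import sketch (assuming the Chunk2Doc class name itself is unchanged, which the small +4/-7 line delta suggests but does not prove):

# Hedged sketch: Chunk2Doc moved from sparknlp.base to sparknlp.annotator
# (entry 13 above). Old imports can be kept working with a fallback.
try:
    from sparknlp.annotator import Chunk2Doc   # layout in 6.x
except ImportError:
    from sparknlp.base import Chunk2Doc        # layout in 4.2.x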
sparknlp/annotator/embeddings/mpnet_embeddings.py
@@ -0,0 +1,192 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for MPNetEmbeddings."""
+
+from sparknlp.common import *
+
+
+class MPNetEmbeddings(AnnotatorModel,
+                      HasEmbeddingsProperties,
+                      HasCaseSensitiveProperties,
+                      HasStorageRef,
+                      HasBatchedAnnotate,
+                      HasMaxSentenceLengthLimit):
+    """Sentence embeddings using MPNet.
+
+    MPNet adopts a novel pre-training method, named masked and permuted language modeling,
+    to inherit the advantages of masked language modeling and permuted language modeling for
+    natural language understanding.
+
+    Note that this annotator is only supported for Spark versions 3.4 and up.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = MPNetEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("mpnet_embeddings")
+
+    The default model is ``"all_mpnet_base_v2"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=MPNet>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+
+    References
+    ----------
+    `MPNet: Masked and Permuted Pre-training for Language Understanding <https://arxiv.org/pdf/2004.09297>`__
+
+    https://github.com/microsoft/MPNet
+
+    **Paper abstract**
+
+    *BERT adopts masked language modeling (MLM) for pre-training and is one of the most successful pre-training models.
+    Since BERT neglects dependency among predicted tokens, XLNet introduces permuted language modeling (PLM) for
+    pre-training to address this problem. However, XLNet does not leverage the full position information of a sentence
+    and thus suffers from position discrepancy between pre-training and fine-tuning. In this paper, we propose MPNet,
+    a novel pre-training method that inherits the advantages of BERT and XLNet and avoids their limitations. MPNet
+    leverages the dependency among predicted tokens through permuted language modeling (vs. MLM in BERT), and takes
+    auxiliary position information as input to make the model see a full sentence and thus reducing the position
+    discrepancy (vs. PLM in XLNet). We pre-train MPNet on a large-scale dataset (over 160GB text corpora) and fine-tune
+    on a variety of down-streaming tasks (GLUE, SQuAD, etc). Experimental results show that MPNet outperforms MLM and
+    PLM by a large margin, and achieves better results on these tasks compared with previous state-of-the-art
+    pre-trained methods (e.g., BERT, XLNet, RoBERTa) under the same model setting.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = MPNetEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("mpnet_embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["mpnet_embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["This is an example sentence"], ["Each sentence is converted"]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[[0.022502584, -0.078291744, -0.023030775, -0.0051000593, -0.080340415, 0.039...|
+    |[[0.041702367, 0.0010974605, -0.015534201, 0.07092203, -0.0017729357, 0.04661...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "MPNetEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.MPNetEmbeddings", java_model=None):
+        super(MPNetEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            dimension=768,
+            batchSize=8,
+            maxSentenceLength=512,
+            caseSensitive=False,
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        MPNetEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _MPNetLoader
+        jModel = _MPNetLoader(folder, spark_session._jsparkSession)._java_obj
+        return MPNetEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="all_mpnet_base_v2", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "all_mpnet_base_v2"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        MPNetEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(MPNetEmbeddings, name, lang, remote_loc)
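
Besides pretrained(), the new MPNetEmbeddings exposes loadSavedModel for locally exported models. A minimal sketch of that path, with hypothetical folder names, assuming the folder holds a model exported in the format Spark NLP expects:

import sparknlp
from sparknlp.annotator import MPNetEmbeddings

spark = sparknlp.start()

# Load a locally exported model instead of downloading a pretrained one
# ("/models/mpnet_export" is a hypothetical path).
embeddings = MPNetEmbeddings.loadSavedModel("/models/mpnet_export", spark) \
    .setInputCols(["document"]) \
    .setOutputCol("mpnet_embeddings")

# Persist it as a regular Spark NLP model for later reuse.
embeddings.write().overwrite().save("/models/mpnet_spark_nlp")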
sparknlp/annotator/embeddings/mxbai_embeddings.py
@@ -0,0 +1,184 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for MxbaiEmbeddings."""
+
+from sparknlp.common import *
+
+
+class MxbaiEmbeddings(AnnotatorModel,
+                      HasEmbeddingsProperties,
+                      HasCaseSensitiveProperties,
+                      HasStorageRef,
+                      HasBatchedAnnotate,
+                      HasMaxSentenceLengthLimit):
+    """Sentence embeddings using Mxbai Embeddings.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = MxbaiEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("Mxbai_embeddings")
+
+    The default model is ``"mxbai_large_v1"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=Mxbai>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 1024
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    poolingStrategy
+        Pooling strategy to use for sentence embeddings, by default "cls"
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = MxbaiEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols("embeddings") \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([["hello world"], ["hello moon"]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[0.50387806, 0.5861606, 0.35129607, -0.76046336, -0.32446072, -0.117674336, 0...|
+    |[0.6660665, 0.961762, 0.24854276, -0.1018044, -0.6569202, 0.027635604, 0.1915...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "MxbaiEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    poolingStrategy = Param(Params._dummy(),
+                            "poolingStrategy",
+                            "Pooling strategy to use for sentence embeddings",
+                            TypeConverters.toString)
+
+    def setPoolingStrategy(self, value):
+        """Sets the pooling strategy to use for sentence embeddings.
+
+        Available pooling strategies for sentence embeddings are:
+
+        - ``"cls"``: leading ``[CLS]`` token
+        - ``"cls_avg"``: leading ``[CLS]`` token + mean of all other tokens
+        - ``"last"``: embeddings of the last token in the sequence
+        - ``"avg"``: mean of all tokens
+        - ``"max"``: max of all embedding features of the entire token sequence
+        - an integer number (passed as a string), which represents the index of
+          the token to use as the embedding
+
+        Parameters
+        ----------
+        value : str
+            Pooling strategy to use for sentence embeddings
+        """
+        valid_strategies = {"cls", "cls_avg", "last", "avg", "max"}
+        if value in valid_strategies or value.isdigit():
+            return self._set(poolingStrategy=value)
+        else:
+            raise ValueError(f"Invalid pooling strategy: {value}. "
+                             f"Valid strategies are: {', '.join(valid_strategies)} or an integer.")
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.MxbaiEmbeddings", java_model=None):
+        super(MxbaiEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            dimension=1024,
+            batchSize=8,
+            maxSentenceLength=512,
+            caseSensitive=False,
+            poolingStrategy="cls"
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        MxbaiEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _MxbaiEmbeddingsLoader
+        jModel = _MxbaiEmbeddingsLoader(folder, spark_session._jsparkSession)._java_obj
+        return MxbaiEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="mxbai_large_v1", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "mxbai_large_v1"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        MxbaiEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(MxbaiEmbeddings, name, lang, remote_loc)
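
MxbaiEmbeddings adds a configurable pooling strategy. A short sketch of how the validation in setPoolingStrategy above behaves: the accepted values are the five named strategies or a digit string giving a token index.

from sparknlp.annotator import MxbaiEmbeddings

emb = MxbaiEmbeddings.pretrained() \
    .setInputCols(["document"]) \
    .setOutputCol("embeddings")

emb.setPoolingStrategy("avg")    # mean over all token embeddings
emb.setPoolingStrategy("0")      # digit string: pool from the token at index 0
# emb.setPoolingStrategy("mean") # raises ValueError: not a valid strategy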
sparknlp/annotator/embeddings/nomic_embeddings.py
@@ -0,0 +1,181 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for NomicEmbeddings."""
+
+from sparknlp.common import *
+
+
+class NomicEmbeddings(AnnotatorModel,
+                      HasEmbeddingsProperties,
+                      HasCaseSensitiveProperties,
+                      HasStorageRef,
+                      HasBatchedAnnotate,
+                      HasMaxSentenceLengthLimit):
+    """Sentence embeddings using NomicEmbeddings.
+
+    nomic-embed-text-v1 is an 8192-context-length text encoder that surpasses the
+    performance of OpenAI text-embedding-ada-002 and text-embedding-3-small on
+    short- and long-context tasks.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = NomicEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("nomic_embeddings")
+
+    The default model is ``"nomic_embed_v1"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=Nomic>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+
+    References
+    ----------
+    `Nomic Embed: Training a Reproducible Long Context Text Embedder <https://arxiv.org/pdf/2402.01613>`__
+
+    https://github.com/nomic-ai/contrastors
+
+    **Paper abstract**
+
+    *This technical report describes the training of nomic-embed-text-v1, the
+    first fully reproducible, open-source, open-weights, open-data, 8192 context
+    length English text embedding model that outperforms both OpenAI Ada-002 and
+    OpenAI text-embedding-3-small on short and long-context tasks. We release
+    the training code and model weights under an Apache 2 license. In contrast
+    with other open-source models, we release a training data loader with 235
+    million curated text pairs that allows for the full replication of
+    nomic-embed-text-v1. You can find code and data to replicate the model at
+    https://github.com/nomic-ai/contrastors.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = NomicEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("nomic_embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["nomic_embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([
+    ...     ["query: how much protein should a female eat"],
+    ...     ["passage: As a general guideline, the CDC's average requirement of protein "
+    ...      "for women ages 19 to 70 is 46 grams per day. But, as you can see from this "
+    ...      "chart, you'll need to increase that if you're expecting or training for a "
+    ...      "marathon. Check out the chart below to see how much protein you should be "
+    ...      "eating each day."],
+    ... ]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...|
+    |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "NomicEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.NomicEmbeddings", java_model=None):
+        super(NomicEmbeddings, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            dimension=768,
+            batchSize=8,
+            maxSentenceLength=512,
+            caseSensitive=False,
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session, use_openvino=False):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+        use_openvino : bool, optional
+            Whether to use the OpenVINO backend, by default False
+
+        Returns
+        -------
+        NomicEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _NomicLoader
+        jModel = _NomicLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+        return NomicEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="nomic_embed_v1", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "nomic_embed_v1"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        NomicEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(NomicEmbeddings, name, lang, remote_loc)
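
NomicEmbeddings.loadSavedModel carries the new use_openvino flag, and the RoBerta hunks below add the same parameter. A minimal sketch with a hypothetical export path:

import sparknlp
from sparknlp.annotator import NomicEmbeddings

spark = sparknlp.start()

# use_openvino=True requests the OpenVINO backend for inference;
# "/models/nomic_export" is a hypothetical path to a locally exported model.
nomic = NomicEmbeddings.loadSavedModel("/models/nomic_export", spark, use_openvino=True) \
    .setInputCols(["document"]) \
    .setOutputCol("nomic_embeddings")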
sparknlp/annotator/embeddings/roberta_embeddings.py
@@ -21,7 +21,8 @@ class RoBertaEmbeddings(AnnotatorModel,
                         HasCaseSensitiveProperties,
                         HasStorageRef,
                         HasBatchedAnnotate,
-                        HasEngine):
+                        HasEngine,
+                        HasMaxSentenceLengthLimit):
     """Creates word embeddings using RoBERTa.
 
     The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT
@@ -42,10 +43,10 @@ class RoBertaEmbeddings(AnnotatorModel,
 
     The default model is ``"roberta_base"``, if no name is provided. For
     available pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?task=Embeddings>`__.
+    <https://sparknlp.org/models?task=Embeddings>`__.
 
-    For extended examples of usage, see the `Spark NLP Workshop
-    <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBERTa.ipynb>`__.
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBERTa.ipynb>`__.
     To see which models are compatible and how to import them see
     `Import Transformers into Spark NLP 🚀
     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
@@ -151,11 +152,6 @@ class RoBertaEmbeddings(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -171,16 +167,6 @@ class RoBertaEmbeddings(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.RoBertaEmbeddings", java_model=None):
         super(RoBertaEmbeddings, self).__init__(
@@ -195,7 +181,7 @@ class RoBertaEmbeddings(AnnotatorModel,
         )
 
     @staticmethod
-    def loadSavedModel(folder, spark_session):
+    def loadSavedModel(folder, spark_session, use_openvino=False):
         """Loads a locally saved model.
 
         Parameters
@@ -204,6 +190,8 @@ class RoBertaEmbeddings(AnnotatorModel,
             Folder of the saved model
         spark_session : pyspark.sql.SparkSession
            The current SparkSession
+        use_openvino : bool
+            Use OpenVINO backend
 
         Returns
         -------
@@ -211,7 +199,7 @@ class RoBertaEmbeddings(AnnotatorModel,
            The restored model
        """
        from sparknlp.internal import _RoBertaLoader
-       jModel = _RoBertaLoader(folder, spark_session._jsparkSession)._java_obj
+       jModel = _RoBertaLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
        return RoBertaEmbeddings(java_model=jModel)
 
     @staticmethod
sparknlp/annotator/embeddings/roberta_sentence_embeddings.py
@@ -17,11 +17,12 @@ from sparknlp.common import *
 
 
 class RoBertaSentenceEmbeddings(AnnotatorModel,
-                                 HasEmbeddingsProperties,
-                                 HasCaseSensitiveProperties,
-                                 HasStorageRef,
-                                 HasBatchedAnnotate,
-                                 HasEngine):
+                                HasEmbeddingsProperties,
+                                HasCaseSensitiveProperties,
+                                HasStorageRef,
+                                HasBatchedAnnotate,
+                                HasEngine,
+                                HasMaxSentenceLengthLimit):
     """Sentence-level embeddings using RoBERTa. The RoBERTa model was proposed in RoBERTa: A Robustly Optimized BERT
     Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy,
     Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. It builds on
@@ -39,7 +40,7 @@ class RoBertaSentenceEmbeddings(AnnotatorModel,
     The default model is ``"sent_roberta_base"``, if no name is provided.
 
     For available pretrained models please see the
-    `Models Hub <https://nlp.johnsnowlabs.com/models?task=Embeddings>`__.
+    `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
 
     ====================== =======================
     Input Annotation types Output Annotation type
@@ -119,11 +120,6 @@ class RoBertaSentenceEmbeddings(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -139,16 +135,6 @@ class RoBertaSentenceEmbeddings(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.RoBertaSentenceEmbeddings", java_model=None):
         super(RoBertaSentenceEmbeddings, self).__init__(
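
The net effect of the two RoBerta hunks above: the hand-rolled maxSentenceLength Param and setter are replaced by the shared HasMaxSentenceLengthLimit mixin, so caller code should be unchanged. A sketch under that assumption (the mixin is expected to supply the same setMaxSentenceLength signature, now with an upper-bound check):

from sparknlp.annotator import RoBertaEmbeddings

roberta = RoBertaEmbeddings.pretrained() \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings") \
    .setMaxSentenceLength(512)  # same call as in 4.2.6, now provided by the mixin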