spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
@@ -0,0 +1,138 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from sparknlp.common import *
+
+ class E5VEmbeddings(AnnotatorModel,
+                     HasBatchedAnnotateImage,
+                     HasImageFeatureProperties,
+                     HasEngine,
+                     HasRescaleFactor):
+     """Universal multimodal embeddings using the E5-V model (see https://huggingface.co/royokong/e5-v).
+
+     E5-V bridges the modality gap between input types (text, image) and performs
+     strongly on multimodal embeddings, even without fine-tuning. It also supports a
+     single-modality training approach, where the model is trained exclusively on
+     text pairs, which often outperforms multimodal training.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
+
+     >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
+     ...     .setInputCols(["image_assembler"]) \
+     ...     .setOutputCol("e5v")
+
+     The default model is ``"e5v_int4"``, if no name is provided.
+
+     For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Question+Answering>`__.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``IMAGE``              ``SENTENCE_EMBEDDINGS``
+     ====================== ======================
+
+     Examples
+     --------
+     Image + Text Embedding:
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> image_df = spark.read.format("image").option("dropInvalid", value = True).load(imageFolder)
+     >>> imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+     >>> test_df = image_df.withColumn("text", lit(imagePrompt))
+     >>> imageAssembler = ImageAssembler() \
+     ...     .setInputCol("image") \
+     ...     .setOutputCol("image_assembler")
+     >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
+     ...     .setInputCols(["image_assembler"]) \
+     ...     .setOutputCol("e5v")
+     >>> pipeline = Pipeline().setStages([
+     ...     imageAssembler,
+     ...     e5vEmbeddings
+     ... ])
+     >>> result = pipeline.fit(test_df).transform(test_df)
+     >>> result.select("e5v.embeddings").show(truncate = False)
+
+     Text-Only Embedding:
+     >>> from sparknlp.util import EmbeddingsDataFrameUtils
+     >>> textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+     >>> textDesc = "A cat sitting in a box."
+     >>> nullImageDF = spark.createDataFrame(spark.sparkContext.parallelize([EmbeddingsDataFrameUtils.emptyImageRow]), EmbeddingsDataFrameUtils.imageSchema)
+     >>> textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))
+     >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
+     ...     .setInputCols(["image"]) \
+     ...     .setOutputCol("e5v")
+     >>> result = e5vEmbeddings.transform(textDF)
+     >>> result.select("e5v.embeddings").show(truncate = False)
+     """
+
+     name = "E5VEmbeddings"
+
+     inputAnnotatorTypes = [AnnotatorType.IMAGE]
+     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.E5VEmbeddings", java_model=None):
+         """Initializes the E5VEmbeddings annotator.
+
+         Parameters
+         ----------
+         classname : str, optional
+             The Java class name of the annotator, by default "com.johnsnowlabs.nlp.embeddings.E5VEmbeddings"
+         java_model : Optional[java.lang.Object], optional
+             A pre-initialized Java model, by default None
+         """
+         super(E5VEmbeddings, self).__init__(classname=classname, java_model=java_model)
+         self._setDefault()
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session, use_openvino=False):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+         use_openvino : bool, optional
+             Whether to use the OpenVINO engine, by default False
+
+         Returns
+         -------
+         E5VEmbeddings
+             The restored model
+         """
+         from sparknlp.internal import _E5VEmbeddingsLoader
+         jModel = _E5VEmbeddingsLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+         return E5VEmbeddings(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="e5v_int4", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "e5v_int4"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         E5VEmbeddings
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(E5VEmbeddings, name, lang, remote_loc)
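Besides ``pretrained``, the ``loadSavedModel`` path above supports locally exported checkpoints. A minimal sketch, assuming a hypothetical local folder ``/models/e5v`` containing an exported E5-V model and an ``image_assembler`` column produced by an earlier ``ImageAssembler`` stage:

import sparknlp
from sparknlp.annotator import E5VEmbeddings

spark = sparknlp.start()

# "/models/e5v" is a hypothetical path to a locally exported model.
e5v = E5VEmbeddings.loadSavedModel("/models/e5v", spark, use_openvino=False) \
    .setInputCols(["image_assembler"]) \
    .setOutputCol("e5v")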
@@ -38,7 +38,7 @@ class ElmoEmbeddings(AnnotatorModel,
 
      The default model is ``"elmo"``, if no name is provided.
 
-     For available pretrained models please see the `Models Hub <https://nlp.johnsnowlabs.com/models?task=Embeddings>`__.
+     For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
 
      The pooling layer can be set with :meth:`.setPoolingLayer` to the following
      values:
@@ -53,7 +53,7 @@ class ElmoEmbeddings(AnnotatorModel,
      trainable. This tensor has shape ``[batch_size, max_length, 1024]``.
 
      For extended examples of usage, see the
-     `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/dl-ner/ner_elmo.ipynb>`__.
+     `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_elmo.ipynb>`__.
 
      ====================== ======================
      Input Annotation types Output Annotation type
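The pooling-layer setting mentioned in this hunk is easiest to see in code. A minimal sketch, assuming the default ``"elmo"`` pretrained model and selecting the ``"elmo"`` pooling layer (the weighted, trainable sum of the three layers described above):

from sparknlp.annotator import ElmoEmbeddings

# Pool the three ELMo layers into the trainable weighted sum, which per the
# docstring has shape [batch_size, max_length, 1024].
elmo = ElmoEmbeddings.pretrained("elmo") \
    .setInputCols(["token", "document"]) \
    .setOutputCol("embeddings") \
    .setPoolingLayer("elmo")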
@@ -0,0 +1,204 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for InstructorEmbeddings."""
+
+ from sparknlp.common import *
+
+
+ class InstructorEmbeddings(AnnotatorModel,
+                            HasEmbeddingsProperties,
+                            HasCaseSensitiveProperties,
+                            HasStorageRef,
+                            HasBatchedAnnotate,
+                            HasMaxSentenceLengthLimit):
+     """Sentence embeddings using INSTRUCTOR.
+
+     INSTRUCTOR 👨‍🏫 is an instruction-finetuned text embedding model that can generate
+     text embeddings tailored to any task (e.g., classification, retrieval, clustering,
+     text evaluation) and domain (e.g., science, finance) simply by providing the task
+     instruction, without any finetuning. INSTRUCTOR achieves state-of-the-art results
+     on 70 diverse embedding tasks.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = InstructorEmbeddings.pretrained() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setInstruction("Represent the Medicine sentence for clustering: ") \\
+     ...     .setOutputCol("instructor_embeddings")
+
+
+     The default model is ``"instructor_base"``, if no name is provided.
+
+     For available pretrained models please see the
+     `Models Hub <https://sparknlp.org/models?q=Instructor>`__.
+
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Size of every batch, by default 8
+     dimension
+         Number of embedding dimensions, by default 768
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default False
+     instruction
+         Set transformer instruction, e.g. 'summarize:'
+     maxSentenceLength
+         Max sentence length to process, by default 128
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+
+     References
+     ----------
+     `One Embedder, Any Task: Instruction-Finetuned Text Embeddings <https://arxiv.org/abs/2212.09741>`__
+
+     https://github.com/HKUNLP/instructor-embedding/
+
+     **Paper abstract**
+
+     *We introduce INSTRUCTOR, a new method for computing text embeddings given task instructions:
+     every text input is embedded together with instructions explaining the use case (e.g., task and
+     domain descriptions). Unlike encoders from prior work that are more specialized, INSTRUCTOR is a
+     single embedder that can generate text embeddings tailored to different downstream tasks and domains,
+     without any further training. We first annotate instructions for 330 diverse tasks and train INSTRUCTOR
+     on this multitask mixture with a contrastive loss. We evaluate INSTRUCTOR on 70 embedding evaluation tasks
+     (66 of which are unseen during training), ranging from classification and information retrieval to semantic
+     textual similarity and text generation evaluation. INSTRUCTOR, while having an order of magnitude fewer
+     parameters than the previous best model, achieves state-of-the-art performance, with an average improvement
+     of 3.4% compared to the previous best results on the 70 diverse datasets. Our analysis suggests that
+     INSTRUCTOR is robust to changes in instructions, and that instruction finetuning mitigates the challenge of
+     training a single model on diverse datasets. Our model, code, and data are available at this https
+     URL <https://instructor-embedding.github.io/>.*
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> embeddings = InstructorEmbeddings.pretrained() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setInstruction("Represent the Medicine sentence for clustering: ") \\
+     ...     .setOutputCol("instructor_embeddings")
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols(["instructor_embeddings"]) \\
+     ...     .setOutputCols("finished_embeddings") \\
+     ...     .setOutputAsVector(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     embeddings,
+     ...     embeddingsFinisher
+     ... ])
+     >>> data = spark.createDataFrame([["Dynamical Scalar Degree of Freedom in Horava-Lifshitz Gravity"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                          result|
+     +--------------------------------------------------------------------------------+
+     |[-2.3497989177703857,0.480538547039032,-0.3238905668258667,-1.612930893898010...|
+     +--------------------------------------------------------------------------------+
+     """
+
+     name = "InstructorEmbeddings"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+     instruction = Param(Params._dummy(), "instruction", "Set transformer instruction, e.g. 'summarize:'",
+                         typeConverter=TypeConverters.toString)
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def setInstruction(self, value):
+         """Sets transformer instruction, e.g. 'summarize:'.
+
+         Parameters
+         ----------
+         value : str
+         """
+         return self._set(instruction=value)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.InstructorEmbeddings", java_model=None):
+         super(InstructorEmbeddings, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             dimension=768,
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=False,
+             instruction="",
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         InstructorEmbeddings
+             The restored model
+         """
+         from sparknlp.internal import _InstructorLoader
+         jModel = _InstructorLoader(folder, spark_session._jsparkSession)._java_obj
+         return InstructorEmbeddings(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="instructor_base", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "instructor_base"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         InstructorEmbeddings
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(InstructorEmbeddings, name, lang, remote_loc)
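Since the instruction prefix is what tailors the embedding space, the same pretrained weights can serve different tasks. A minimal sketch with illustrative instruction strings (only the Medicine clustering one appears in the docstring above):

from sparknlp.annotator import InstructorEmbeddings

# One set of weights, two task-specific embedding columns.
clustering = InstructorEmbeddings.pretrained("instructor_base") \
    .setInputCols(["document"]) \
    .setInstruction("Represent the Medicine sentence for clustering: ") \
    .setOutputCol("clustering_embeddings")

retrieval = InstructorEmbeddings.pretrained("instructor_base") \
    .setInputCols(["document"]) \
    .setInstruction("Represent the question for retrieving supporting documents: ") \
    .setOutputCol("retrieval_embeddings")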
@@ -21,7 +21,8 @@ class LongformerEmbeddings(AnnotatorModel,
                             HasCaseSensitiveProperties,
                             HasStorageRef,
                             HasBatchedAnnotate,
-                            HasEngine):
+                            HasEngine,
+                            HasLongMaxSentenceLengthLimit):
      """Longformer is a transformer model for long documents. The Longformer
      model was presented in `Longformer: The Long-Document Transformer` by Iz
      Beltagy, Matthew E. Peters, Arman Cohan. longformer-base-4096 is a BERT-like
@@ -38,7 +39,7 @@ class LongformerEmbeddings(AnnotatorModel,
 
      The default model is ``"longformer_base_4096"``, if no name is provided. For
      available pretrained models please see the `Models Hub
-     <https://nlp.johnsnowlabs.com/models?task=Embeddings>`__.
+     <https://sparknlp.org/models?task=Embeddings>`__.
 
      To see which models are compatible and how to import them see
      `Import Transformers into Spark NLP 🚀
@@ -139,11 +140,6 @@ class LongformerEmbeddings(AnnotatorModel,
 
      outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
 
-     maxSentenceLength = Param(Params._dummy(),
-                               "maxSentenceLength",
-                               "Max sentence length to process",
-                               typeConverter=TypeConverters.toInt)
-
      configProtoBytes = Param(Params._dummy(),
                               "configProtoBytes",
                               "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -159,16 +155,6 @@ class LongformerEmbeddings(AnnotatorModel,
          """
          return self._set(configProtoBytes=b)
 
-     def setMaxSentenceLength(self, value):
-         """Sets max sentence length to process, by default 1024.
-
-         Parameters
-         ----------
-         value : int
-             Max sentence length to process
-         """
-         return self._set(maxSentenceLength=value)
-
      @keyword_only
      def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.LongformerEmbeddings", java_model=None):
          super(LongformerEmbeddings, self).__init__(
@@ -0,0 +1,189 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for MiniLMEmbeddings."""
+
+ from sparknlp.common import *
+
+
+ class MiniLMEmbeddings(AnnotatorModel,
+                        HasEmbeddingsProperties,
+                        HasCaseSensitiveProperties,
+                        HasStorageRef,
+                        HasBatchedAnnotate,
+                        HasMaxSentenceLengthLimit):
+     """Sentence embeddings using MiniLM.
+
+     MiniLM is a lightweight and efficient sentence embedding model that can generate
+     text embeddings for various NLP tasks (e.g., classification, retrieval, clustering,
+     and text evaluation).
+     Note that this annotator is only supported for Spark Versions 3.4 and up.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> embeddings = MiniLMEmbeddings.pretrained() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("minilm_embeddings")
+
+
+     The default model is ``"minilm_l6_v2"``, if no name is provided.
+
+     For available pretrained models please see the
+     `Models Hub <https://sparknlp.org/models?q=MiniLM>`__.
+
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT``           ``SENTENCE_EMBEDDINGS``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Size of every batch, by default 8
+     dimension
+         Number of embedding dimensions, by default 384
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default False
+     maxSentenceLength
+         Max sentence length to process, by default 512
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+
+     References
+     ----------
+     `MiniLM: Deep Self-Attention Distillation for Task-Agnostic Compression of Pre-Trained Transformers <https://arxiv.org/abs/2002.10957>`__
+
+     `MiniLM Github Repository <https://github.com/microsoft/unilm/tree/master/minilm>`__
+
+     **Paper abstract**
+
+     *We present a simple and effective approach to compress large pre-trained Transformer models
+     by distilling the self-attention module of the last Transformer layer. The compressed model
+     (called MiniLM) can be trained with task-agnostic distillation and then fine-tuned on various
+     downstream tasks. We evaluate MiniLM on the GLUE benchmark and show that it achieves comparable
+     results with BERT-base while being 4.3x smaller and 5.5x faster. We also show that MiniLM can
+     be further compressed to 22x smaller and 12x faster than BERT-base while maintaining comparable
+     performance.*
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> embeddings = MiniLMEmbeddings.pretrained() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("minilm_embeddings")
+     >>> embeddingsFinisher = EmbeddingsFinisher() \\
+     ...     .setInputCols(["minilm_embeddings"]) \\
+     ...     .setOutputCols("finished_embeddings") \\
+     ...     .setOutputAsVector(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     embeddings,
+     ...     embeddingsFinisher
+     ... ])
+     >>> data = spark.createDataFrame([["This is a sample sentence for embedding generation."],
+     ...     ["Another example sentence to demonstrate MiniLM embeddings."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+     +--------------------------------------------------------------------------------+
+     |                                                                          result|
+     +--------------------------------------------------------------------------------+
+     |[0.1234567, -0.2345678, 0.3456789, -0.4567890, 0.5678901, -0.6789012...         |
+     |[0.2345678, -0.3456789, 0.4567890, -0.5678901, 0.6789012, -0.7890123...         |
+     +--------------------------------------------------------------------------------+
+     """
+
+     name = "MiniLMEmbeddings"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.MiniLMEmbeddings", java_model=None):
+         super(MiniLMEmbeddings, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             dimension=384,
+             batchSize=8,
+             maxSentenceLength=512,
+             caseSensitive=False,
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session, use_openvino=False):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+         use_openvino : bool
+             Whether to use the OpenVINO backend, by default False
+
+         Returns
+         -------
+         MiniLMEmbeddings
+             The restored model
+         """
+         from sparknlp.internal import _MiniLMLoader
+         jModel = _MiniLMLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+         return MiniLMEmbeddings(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="minilm_l6_v2", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default "minilm_l6_v2"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         MiniLMEmbeddings
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(MiniLMEmbeddings, name, lang, remote_loc)
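As with the other new embedding annotators, ``loadSavedModel`` accepts a ``use_openvino`` flag. A minimal sketch, assuming a hypothetical local export at ``/models/minilm_l6_v2``:

import sparknlp
from sparknlp.annotator import MiniLMEmbeddings

spark = sparknlp.start()

# "/models/minilm_l6_v2" is a hypothetical path; use_openvino=True selects
# the OpenVINO backend described in loadSavedModel above.
minilm = MiniLMEmbeddings.loadSavedModel("/models/minilm_l6_v2", spark, use_openvino=True) \
    .setInputCols(["document"]) \
    .setOutputCol("minilm_embeddings")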