spark-nlp 6.0.1rc1__py2.py3-none-any.whl → 6.0.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of spark-nlp might be problematic. Click here for more details.
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/METADATA +13 -6
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/RECORD +36 -30
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/WHEEL +1 -1
- sparknlp/__init__.py +4 -2
- sparknlp/annotator/cv/__init__.py +2 -0
- sparknlp/annotator/cv/florence2_transformer.py +180 -0
- sparknlp/annotator/cv/gemma3_for_multimodal.py +5 -10
- sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
- sparknlp/annotator/cv/janus_for_multimodal.py +8 -13
- sparknlp/annotator/cv/llava_for_multimodal.py +1 -1
- sparknlp/annotator/cv/paligemma_for_multimodal.py +7 -7
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +1 -1
- sparknlp/annotator/cv/qwen2vl_transformer.py +1 -1
- sparknlp/annotator/cv/smolvlm_transformer.py +7 -13
- sparknlp/annotator/date2_chunk.py +1 -1
- sparknlp/annotator/document_character_text_splitter.py +8 -8
- sparknlp/annotator/document_token_splitter.py +7 -7
- sparknlp/annotator/embeddings/bge_embeddings.py +21 -19
- sparknlp/annotator/embeddings/snowflake_embeddings.py +15 -15
- sparknlp/annotator/openai/openai_completion.py +3 -4
- sparknlp/annotator/seq2seq/m2m100_transformer.py +1 -1
- sparknlp/annotator/seq2seq/mistral_transformer.py +2 -3
- sparknlp/annotator/seq2seq/nllb_transformer.py +1 -1
- sparknlp/annotator/seq2seq/qwen_transformer.py +26 -25
- sparknlp/annotator/spell_check/context_spell_checker.py +1 -1
- sparknlp/base/prompt_assembler.py +1 -1
- sparknlp/common/properties.py +7 -7
- sparknlp/internal/__init__.py +19 -0
- sparknlp/partition/__init__.py +16 -0
- sparknlp/partition/partition.py +244 -0
- sparknlp/partition/partition_properties.py +257 -0
- sparknlp/partition/partition_transformer.py +196 -0
- sparknlp/reader/pdf_to_text.py +50 -4
- sparknlp/reader/sparknlp_reader.py +56 -52
- sparknlp/training/spacy_to_annotation.py +7 -7
- {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
from sparknlp.common import *
|
|
2
|
+
|
|
3
|
+
class InternVLForMultiModal(AnnotatorModel,
|
|
4
|
+
HasBatchedAnnotateImage,
|
|
5
|
+
HasImageFeatureProperties,
|
|
6
|
+
HasEngine,
|
|
7
|
+
HasGeneratorProperties):
|
|
8
|
+
"""
|
|
9
|
+
InternVLForMultiModal can load InternVL Vision models for visual question answering.
|
|
10
|
+
The model consists of a vision encoder, a text encoder, a text decoder and a model merger.
|
|
11
|
+
The vision encoder will encode the input image, the text encoder will encode the input text,
|
|
12
|
+
the model merger will merge the image and text embeddings, and the text decoder will output the answer.
|
|
13
|
+
|
|
14
|
+
InternVL 2.5 is an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0,
|
|
15
|
+
maintaining its core model architecture while introducing significant enhancements in training and testing
|
|
16
|
+
strategies as well as data quality. Key features include:
|
|
17
|
+
- Large context window support
|
|
18
|
+
- Multilingual support
|
|
19
|
+
- Multimodal capabilities handling both text and image inputs
|
|
20
|
+
- Optimized for deployment with int4 quantization
|
|
21
|
+
|
|
22
|
+
Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
|
|
23
|
+
>>> visualQA = InternVLForMultiModal.pretrained() \\
|
|
24
|
+
... .setInputCols("image_assembler") \\
|
|
25
|
+
... .setOutputCol("answer")
|
|
26
|
+
|
|
27
|
+
The default model is `"internvl2_5_1b_int4"`, if no name is provided.
|
|
28
|
+
For available pretrained models, refer to the `Models Hub
|
|
29
|
+
<https://sparknlp.org/models?task=Question+Answering>`__.
|
|
30
|
+
|
|
31
|
+
====================== ======================
|
|
32
|
+
Input Annotation types Output Annotation type
|
|
33
|
+
====================== ======================
|
|
34
|
+
``IMAGE`` ``DOCUMENT``
|
|
35
|
+
====================== ======================
|
|
36
|
+
|
|
37
|
+
Parameters
|
|
38
|
+
----------
|
|
39
|
+
batchSize : int, optional
|
|
40
|
+
Batch size. Larger values allow faster processing but require more memory,
|
|
41
|
+
by default 1.
|
|
42
|
+
maxSentenceLength : int, optional
|
|
43
|
+
Maximum sentence length to process, by default 4096.
|
|
44
|
+
|
|
45
|
+
Examples
|
|
46
|
+
--------
|
|
47
|
+
>>> import sparknlp
|
|
48
|
+
>>> from sparknlp.base import *
|
|
49
|
+
>>> from sparknlp.annotator import *
|
|
50
|
+
>>> from pyspark.ml import Pipeline
|
|
51
|
+
>>> from pyspark.sql.functions import lit
|
|
52
|
+
>>> image_df = spark.read.format("image").load(path=images_path)
|
|
53
|
+
>>> test_df = image_df.withColumn(
|
|
54
|
+
... "text",
|
|
55
|
+
... lit("<|im_start|><image>\\nDescribe this image in detail.<|im_end|><|im_start|>assistant\\n")
|
|
56
|
+
... )
|
|
57
|
+
>>> imageAssembler = ImageAssembler() \\
|
|
58
|
+
... .setInputCol("image") \\
|
|
59
|
+
... .setOutputCol("image_assembler")
|
|
60
|
+
>>> visualQA = InternVLForMultiModal.pretrained() \\
|
|
61
|
+
... .setInputCols("image_assembler") \\
|
|
62
|
+
... .setOutputCol("answer")
|
|
63
|
+
>>> pipeline = Pipeline().setStages([
|
|
64
|
+
... imageAssembler,
|
|
65
|
+
... visualQA
|
|
66
|
+
... ])
|
|
67
|
+
|
|
68
|
+
>>> result = pipeline.fit(test_df).transform(test_df)
|
|
69
|
+
>>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
|
|
70
|
+
"""
|
|
71
|
+
|
|
72
|
+
name = "InternVLForMultiModal"
|
|
73
|
+
|
|
74
|
+
inputAnnotatorTypes = [AnnotatorType.IMAGE]
|
|
75
|
+
|
|
76
|
+
outputAnnotatorType = AnnotatorType.DOCUMENT
|
|
77
|
+
|
|
78
|
+
minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
|
|
79
|
+
typeConverter=TypeConverters.toInt)
|
|
80
|
+
|
|
81
|
+
maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
|
|
82
|
+
typeConverter=TypeConverters.toInt)
|
|
83
|
+
|
|
84
|
+
doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
|
|
85
|
+
typeConverter=TypeConverters.toBoolean)
|
|
86
|
+
|
|
87
|
+
temperature = Param(Params._dummy(), "temperature", "The value used to module the next token probabilities",
|
|
88
|
+
typeConverter=TypeConverters.toFloat)
|
|
89
|
+
|
|
90
|
+
topK = Param(Params._dummy(), "topK",
|
|
91
|
+
"The number of highest probability vocabulary tokens to keep for top-k-filtering",
|
|
92
|
+
typeConverter=TypeConverters.toInt)
|
|
93
|
+
|
|
94
|
+
topP = Param(Params._dummy(), "topP",
|
|
95
|
+
"If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
|
|
96
|
+
typeConverter=TypeConverters.toFloat)
|
|
97
|
+
|
|
98
|
+
repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
|
|
99
|
+
"The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
|
|
100
|
+
typeConverter=TypeConverters.toFloat)
|
|
101
|
+
|
|
102
|
+
noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
|
|
103
|
+
"If set to int > 0, all ngrams of that size can only occur once",
|
|
104
|
+
typeConverter=TypeConverters.toInt)
|
|
105
|
+
|
|
106
|
+
ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
|
|
107
|
+
"A list of token ids which are ignored in the decoder's output",
|
|
108
|
+
typeConverter=TypeConverters.toListInt)
|
|
109
|
+
|
|
110
|
+
beamSize = Param(Params._dummy(), "beamSize",
|
|
111
|
+
"The Number of beams for beam search.",
|
|
112
|
+
typeConverter=TypeConverters.toInt)
|
|
113
|
+
|
|
114
|
+
def setMaxSentenceSize(self, value):
|
|
115
|
+
"""Sets Maximum sentence length that the annotator will process, by
|
|
116
|
+
default 4096.
|
|
117
|
+
Parameters
|
|
118
|
+
----------
|
|
119
|
+
value : int
|
|
120
|
+
Maximum sentence length that the annotator will process
|
|
121
|
+
"""
|
|
122
|
+
return self._set(maxSentenceLength=value)
|
|
123
|
+
|
|
124
|
+
def setIgnoreTokenIds(self, value):
|
|
125
|
+
"""A list of token ids which are ignored in the decoder's output.
|
|
126
|
+
Parameters
|
|
127
|
+
----------
|
|
128
|
+
value : List[int]
|
|
129
|
+
The words to be filtered out
|
|
130
|
+
"""
|
|
131
|
+
return self._set(ignoreTokenIds=value)
|
|
132
|
+
|
|
133
|
+
def setMinOutputLength(self, value):
|
|
134
|
+
"""Sets minimum length of the sequence to be generated.
|
|
135
|
+
Parameters
|
|
136
|
+
----------
|
|
137
|
+
value : int
|
|
138
|
+
Minimum length of the sequence to be generated
|
|
139
|
+
"""
|
|
140
|
+
return self._set(minOutputLength=value)
|
|
141
|
+
|
|
142
|
+
def setMaxOutputLength(self, value):
|
|
143
|
+
"""Sets maximum length of output text.
|
|
144
|
+
Parameters
|
|
145
|
+
----------
|
|
146
|
+
value : int
|
|
147
|
+
Maximum length of output text
|
|
148
|
+
"""
|
|
149
|
+
return self._set(maxOutputLength=value)
|
|
150
|
+
|
|
151
|
+
def setDoSample(self, value):
|
|
152
|
+
"""Sets whether or not to use sampling, use greedy decoding otherwise.
|
|
153
|
+
Parameters
|
|
154
|
+
----------
|
|
155
|
+
value : bool
|
|
156
|
+
Whether or not to use sampling; use greedy decoding otherwise
|
|
157
|
+
"""
|
|
158
|
+
return self._set(doSample=value)
|
|
159
|
+
|
|
160
|
+
def setTemperature(self, value):
|
|
161
|
+
"""Sets the value used to modulate the next token probabilities.
|
|
162
|
+
Parameters
|
|
163
|
+
----------
|
|
164
|
+
value : float
|
|
165
|
+
The value used to modulate the next token probabilities
|
|
166
|
+
"""
|
|
167
|
+
return self._set(temperature=value)
|
|
168
|
+
|
|
169
|
+
def setTopK(self, value):
|
|
170
|
+
"""Sets the number of highest probability vocabulary tokens to keep for
|
|
171
|
+
top-k-filtering.
|
|
172
|
+
Parameters
|
|
173
|
+
----------
|
|
174
|
+
value : int
|
|
175
|
+
Number of highest probability vocabulary tokens to keep
|
|
176
|
+
"""
|
|
177
|
+
return self._set(topK=value)
|
|
178
|
+
|
|
179
|
+
def setTopP(self, value):
|
|
180
|
+
"""Sets the top cumulative probability for vocabulary tokens.
|
|
181
|
+
If set to float < 1, only the most probable tokens with probabilities
|
|
182
|
+
that add up to ``topP`` or higher are kept for generation.
|
|
183
|
+
Parameters
|
|
184
|
+
----------
|
|
185
|
+
value : float
|
|
186
|
+
Cumulative probability for vocabulary tokens
|
|
187
|
+
"""
|
|
188
|
+
return self._set(topP=value)
|
|
189
|
+
|
|
190
|
+
def setRepetitionPenalty(self, value):
|
|
191
|
+
"""Sets the parameter for repetition penalty. 1.0 means no penalty.
|
|
192
|
+
Parameters
|
|
193
|
+
----------
|
|
194
|
+
value : float
|
|
195
|
+
The repetition penalty
|
|
196
|
+
References
|
|
197
|
+
----------
|
|
198
|
+
See `Ctrl: A Conditional Transformer Language Model For Controllable
|
|
199
|
+
Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
|
|
200
|
+
"""
|
|
201
|
+
return self._set(repetitionPenalty=value)
|
|
202
|
+
|
|
203
|
+
def setNoRepeatNgramSize(self, value):
|
|
204
|
+
"""Sets size of n-grams that can only occur once.
|
|
205
|
+
If set to int > 0, all ngrams of that size can only occur once.
|
|
206
|
+
Parameters
|
|
207
|
+
----------
|
|
208
|
+
value : int
|
|
209
|
+
N-gram size can only occur once
|
|
210
|
+
"""
|
|
211
|
+
return self._set(noRepeatNgramSize=value)
|
|
212
|
+
|
|
213
|
+
def setBeamSize(self, value):
|
|
214
|
+
"""Sets the number of beams for beam search, by default `1`.
|
|
215
|
+
Parameters
|
|
216
|
+
----------
|
|
217
|
+
value : int
|
|
218
|
+
Number of beams for beam search
|
|
219
|
+
"""
|
|
220
|
+
return self._set(beamSize=value)
|
|
221
|
+
|
|
222
|
+
@keyword_only
|
|
223
|
+
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.InternVLForMultiModal",
|
|
224
|
+
java_model=None):
|
|
225
|
+
super(InternVLForMultiModal, self).__init__(
|
|
226
|
+
classname=classname,
|
|
227
|
+
java_model=java_model
|
|
228
|
+
)
|
|
229
|
+
self._setDefault(
|
|
230
|
+
batchSize=1,
|
|
231
|
+
minOutputLength=0,
|
|
232
|
+
maxOutputLength=20,
|
|
233
|
+
doSample=False,
|
|
234
|
+
temperature=0.6,
|
|
235
|
+
topK=-1,
|
|
236
|
+
topP=0.9,
|
|
237
|
+
repetitionPenalty=1.0,
|
|
238
|
+
noRepeatNgramSize=3,
|
|
239
|
+
ignoreTokenIds=[],
|
|
240
|
+
beamSize=1
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
@staticmethod
|
|
244
|
+
def loadSavedModel(folder, spark_session, use_openvino=False):
|
|
245
|
+
"""Loads a locally saved model.
|
|
246
|
+
Parameters
|
|
247
|
+
----------
|
|
248
|
+
folder : str
|
|
249
|
+
Folder of the saved model
|
|
250
|
+
spark_session : pyspark.sql.SparkSession
|
|
251
|
+
The current SparkSession
|
|
252
|
+
Returns
|
|
253
|
+
-------
|
|
254
|
+
InternVLForMultiModal
|
|
255
|
+
The restored model
|
|
256
|
+
"""
|
|
257
|
+
from sparknlp.internal import _InternVLForMultiModalLoader
|
|
258
|
+
jModel = _InternVLForMultiModalLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
|
|
259
|
+
return InternVLForMultiModal(java_model=jModel)
|
|
260
|
+
|
|
261
|
+
@staticmethod
|
|
262
|
+
def pretrained(name="internvl2_5_1b_int4", lang="en", remote_loc=None):
|
|
263
|
+
"""Downloads and loads a pretrained model.
|
|
264
|
+
Parameters
|
|
265
|
+
----------
|
|
266
|
+
name : str, optional
|
|
267
|
+
Name of the pretrained model, by default
|
|
268
|
+
"internvl2_5_1b_int4"
|
|
269
|
+
lang : str, optional
|
|
270
|
+
Language of the pretrained model, by default "en"
|
|
271
|
+
remote_loc : str, optional
|
|
272
|
+
Optional remote address of the resource, by default None. Will use
|
|
273
|
+
Spark NLP's repositories otherwise.
|
|
274
|
+
Returns
|
|
275
|
+
-------
|
|
276
|
+
InternVLForMultiModal
|
|
277
|
+
The restored model
|
|
278
|
+
"""
|
|
279
|
+
from sparknlp.pretrained import ResourceDownloader
|
|
280
|
+
return ResourceDownloader.downloadModel(InternVLForMultiModal, name, lang, remote_loc)
|
|
@@ -36,8 +36,9 @@ class JanusForMultiModal(AnnotatorModel,
|
|
|
36
36
|
and for image generation, it uses a tokenizer with a downsample rate of 16.
|
|
37
37
|
|
|
38
38
|
Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
|
|
39
|
-
|
|
40
|
-
|
|
39
|
+
|
|
40
|
+
>>> visualQAClassifier = JanusForMultiModal.pretrained() \\
|
|
41
|
+
... .setInputCols(["image_assembler"]) \\
|
|
41
42
|
... .setOutputCol("answer")
|
|
42
43
|
|
|
43
44
|
The default model is `"janus_1_3b_int4"`, if no name is provided.
|
|
@@ -73,29 +74,23 @@ class JanusForMultiModal(AnnotatorModel,
|
|
|
73
74
|
>>> from sparknlp.annotator import *
|
|
74
75
|
>>> from pyspark.ml import Pipeline
|
|
75
76
|
>>> from pyspark.sql.functions import lit
|
|
76
|
-
|
|
77
77
|
>>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
|
|
78
78
|
>>> test_df = image_df.withColumn(
|
|
79
79
|
... "text",
|
|
80
|
-
... lit("User: <image_placeholder>Describe image in details
|
|
80
|
+
... lit("User: <image_placeholder>Describe image in details\\n\\nAssistant:")
|
|
81
81
|
... )
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
... .setInputCol("image") \
|
|
82
|
+
>>> imageAssembler = ImageAssembler() \\
|
|
83
|
+
... .setInputCol("image") \\
|
|
85
84
|
... .setOutputCol("image_assembler")
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
... .setInputCols("image_assembler") \
|
|
85
|
+
>>> visualQAClassifier = JanusForMultiModal.pretrained() \\
|
|
86
|
+
... .setInputCols("image_assembler") \\
|
|
89
87
|
... .setOutputCol("answer")
|
|
90
|
-
|
|
91
88
|
>>> pipeline = Pipeline().setStages([
|
|
92
89
|
... imageAssembler,
|
|
93
90
|
... visualQAClassifier
|
|
94
91
|
... ])
|
|
95
|
-
|
|
96
92
|
>>> result = pipeline.fit(test_df).transform(test_df)
|
|
97
93
|
>>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
|
|
98
|
-
|
|
99
94
|
+--------------------------------------+----------------------------------------------------------------------+
|
|
100
95
|
|origin |result |
|
|
101
96
|
+--------------------------------------+----------------------------------------------------------------------+
|
|
@@ -65,7 +65,7 @@ class LLAVAForMultiModal(AnnotatorModel,
|
|
|
65
65
|
>>> from sparknlp.annotator import *
|
|
66
66
|
>>> from pyspark.ml import Pipeline
|
|
67
67
|
>>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
|
|
68
|
-
>>> test_df = image_df.withColumn("text", lit("USER:
|
|
68
|
+
>>> test_df = image_df.withColumn("text", lit("USER: \\n <|image|> \\n What's this picture about? \\n ASSISTANT:\\n"))
|
|
69
69
|
>>> imageAssembler = ImageAssembler() \\
|
|
70
70
|
... .setInputCol("image") \\
|
|
71
71
|
... .setOutputCol("image_assembler")
|
|
@@ -28,8 +28,8 @@ class PaliGemmaForMultiModal(AnnotatorModel,
|
|
|
28
28
|
Pretrained models can be loaded with :meth:`.pretrained` of the companion
|
|
29
29
|
object:
|
|
30
30
|
|
|
31
|
-
>>> visualQAClassifier = PaliGemmaForMultiModal.pretrained()
|
|
32
|
-
... .setInputCols(["image_assembler"])
|
|
31
|
+
>>> visualQAClassifier = PaliGemmaForMultiModal.pretrained() \\
|
|
32
|
+
... .setInputCols(["image_assembler"]) \\
|
|
33
33
|
... .setOutputCol("answer")
|
|
34
34
|
|
|
35
35
|
The default model is ``"paligemma_3b_pt_224_int4"``, if no name is
|
|
@@ -59,12 +59,12 @@ class PaliGemmaForMultiModal(AnnotatorModel,
|
|
|
59
59
|
>>> from sparknlp.annotator import *
|
|
60
60
|
>>> from pyspark.ml import Pipeline
|
|
61
61
|
>>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
|
|
62
|
-
>>> test_df = image_df.withColumn("text", lit("USER:
|
|
63
|
-
>>> imageAssembler = ImageAssembler()
|
|
64
|
-
... .setInputCol("image")
|
|
62
|
+
>>> test_df = image_df.withColumn("text", lit("USER: \\n <image> \\nDescribe this image. \\nASSISTANT:\\n"))
|
|
63
|
+
>>> imageAssembler = ImageAssembler() \\
|
|
64
|
+
... .setInputCol("image") \\
|
|
65
65
|
... .setOutputCol("image_assembler")
|
|
66
|
-
>>> visualQAClassifier = PaliGemmaForMultiModal.pretrained()
|
|
67
|
-
... .setInputCols("image_assembler")
|
|
66
|
+
>>> visualQAClassifier = PaliGemmaForMultiModal.pretrained() \\
|
|
67
|
+
... .setInputCols("image_assembler") \\
|
|
68
68
|
... .setOutputCol("answer")
|
|
69
69
|
>>> pipeline = Pipeline().setStages([
|
|
70
70
|
... imageAssembler,
|
|
@@ -65,7 +65,7 @@ class Phi3Vision(AnnotatorModel,
|
|
|
65
65
|
>>> from sparknlp.annotator import *
|
|
66
66
|
>>> from pyspark.ml import Pipeline
|
|
67
67
|
>>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
|
|
68
|
-
>>> test_df = image_df.withColumn("text", lit("<|user|>
|
|
68
|
+
>>> test_df = image_df.withColumn("text", lit("<|user|> \\n <|image_1|> \\nWhat is unusual on this picture? <|end|>\\n <|assistant|>\\n"))
|
|
69
69
|
>>> imageAssembler = ImageAssembler() \\
|
|
70
70
|
... .setInputCol("image") \\
|
|
71
71
|
... .setOutputCol("image_assembler")
|
|
@@ -68,7 +68,7 @@ class Qwen2VLTransformer(AnnotatorModel,
|
|
|
68
68
|
>>> from sparknlp.annotator import *
|
|
69
69
|
>>> from pyspark.ml import Pipeline
|
|
70
70
|
>>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
|
|
71
|
-
>>> test_df = image_df.withColumn("text", lit("<|im_start|>system
|
|
71
|
+
>>> test_df = image_df.withColumn("text", lit("<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n"))
|
|
72
72
|
>>> imageAssembler = ImageAssembler() \\
|
|
73
73
|
... .setInputCol("image") \\
|
|
74
74
|
... .setOutputCol("image_assembler")
|
|
@@ -33,8 +33,8 @@ class SmolVLMTransformer(AnnotatorModel,
|
|
|
33
33
|
while maintaining strong performance on multimodal tasks.
|
|
34
34
|
|
|
35
35
|
Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
|
|
36
|
-
>>> visualQA = SmolVLMTransformer.pretrained()
|
|
37
|
-
... .setInputCols(["image_assembler"])
|
|
36
|
+
>>> visualQA = SmolVLMTransformer.pretrained() \\
|
|
37
|
+
... .setInputCols(["image_assembler"]) \\
|
|
38
38
|
... .setOutputCol("answer")
|
|
39
39
|
|
|
40
40
|
The default model is `"smolvlm_instruct_int4"`, if no name is provided.
|
|
@@ -82,29 +82,23 @@ class SmolVLMTransformer(AnnotatorModel,
|
|
|
82
82
|
>>> from sparknlp.annotator import *
|
|
83
83
|
>>> from pyspark.ml import Pipeline
|
|
84
84
|
>>> from pyspark.sql.functions import lit
|
|
85
|
-
|
|
86
85
|
>>> imageDF = spark.read.format("image").load(path=images_path)
|
|
87
86
|
>>> testDF = imageDF.withColumn(
|
|
88
87
|
... "text",
|
|
89
|
-
... lit("<|im_start|>User:<image>Can you describe the image?<end_of_utterance
|
|
88
|
+
... lit("<|im_start|>User:<image>Can you describe the image?<end_of_utterance>\\nAssistant:")
|
|
90
89
|
... )
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
... .setInputCol("image") \
|
|
90
|
+
>>> imageAssembler = ImageAssembler() \\
|
|
91
|
+
... .setInputCol("image") \\
|
|
94
92
|
... .setOutputCol("image_assembler")
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
... .setInputCols("image_assembler") \
|
|
93
|
+
>>> visualQAClassifier = SmolVLMTransformer.pretrained() \\
|
|
94
|
+
... .setInputCols("image_assembler") \\
|
|
98
95
|
... .setOutputCol("answer")
|
|
99
|
-
|
|
100
96
|
>>> pipeline = Pipeline().setStages([
|
|
101
97
|
... imageAssembler,
|
|
102
98
|
... visualQAClassifier
|
|
103
99
|
... ])
|
|
104
|
-
|
|
105
100
|
>>> result = pipeline.fit(testDF).transform(testDF)
|
|
106
101
|
>>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
|
|
107
|
-
|
|
108
102
|
+--------------------------------------+----------------------------------------------------------------------+
|
|
109
103
|
|origin |result |
|
|
110
104
|
+--------------------------------------+----------------------------------------------------------------------+
|
|
@@ -24,7 +24,7 @@ class Date2Chunk(AnnotatorModel):
|
|
|
24
24
|
====================== ======================
|
|
25
25
|
Input Annotation types Output Annotation type
|
|
26
26
|
====================== ======================
|
|
27
|
-
``DATE``
|
|
27
|
+
``DATE`` ``CHUNK``
|
|
28
28
|
====================== ======================
|
|
29
29
|
|
|
30
30
|
Parameters
|
|
@@ -55,7 +55,7 @@ class DocumentCharacterTextSplitter(AnnotatorModel):
|
|
|
55
55
|
chunkOverlap
|
|
56
56
|
Length of the overlap between text chunks, by default `0`.
|
|
57
57
|
splitPatterns
|
|
58
|
-
Patterns to separate the text by in decreasing priority , by default `["
|
|
58
|
+
Patterns to separate the text by in decreasing priority, by default `["\\n\\n", "\\n", " ", ""]`.
|
|
59
59
|
patternsAreRegex
|
|
60
60
|
Whether to interpret the split patterns as regular expressions, by default `False`.
|
|
61
61
|
keepSeparators
|
|
@@ -94,13 +94,13 @@ class DocumentCharacterTextSplitter(AnnotatorModel):
|
|
|
94
94
|
| result|splits[0].begin|splits[0].end|length|
|
|
95
95
|
+--------------------------------------------------------------------------------+---------------+-------------+------+
|
|
96
96
|
|[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...| 0| 19994| 19994|
|
|
97
|
-
|["And Mademoiselle's address?" he asked
|
|
98
|
-
|["How did that help you?"
|
|
99
|
-
|["'But,' said I, 'there would be millions of red-headed men who
|
|
100
|
-
|[My friend was an enthusiastic musician, being himself not only a
|
|
101
|
-
|["And yet I am not convinced of it," I answered. "The cases which
|
|
102
|
-
|["Well, she had a slate-coloured, broad-brimmed straw hat, with a
|
|
103
|
-
|["That sounds a little paradoxical."
|
|
97
|
+
|["And Mademoiselle's address?" he asked.\\n\\n"Is Briony Lodge, Serpentine Aven...| 19798| 39395| 19597|
|
|
98
|
+
|["How did that help you?"\\n\\n"It was all-important. When a woman thinks that ...| 39371| 59242| 19871|
|
|
99
|
+
|["'But,' said I, 'there would be millions of red-headed men who\\nwould apply....| 59166| 77833| 18667|
|
|
100
|
+
|[My friend was an enthusiastic musician, being himself not only a\\nvery capab...| 77835| 97769| 19934|
|
|
101
|
+
|["And yet I am not convinced of it," I answered. "The cases which\\ncome to li...| 97771| 117248| 19477|
|
|
102
|
+
|["Well, she had a slate-coloured, broad-brimmed straw hat, with a\\nfeather of...| 117250| 137242| 19992|
|
|
103
|
+
|["That sounds a little paradoxical."\\n\\n"But it is profoundly True. Singulari...| 137244| 157171| 19927|
|
|
104
104
|
+--------------------------------------------------------------------------------+---------------+-------------+------+
|
|
105
105
|
|
|
106
106
|
"""
|
|
@@ -88,13 +88,13 @@ class DocumentTokenSplitter(AnnotatorModel):
|
|
|
88
88
|
| result|begin| end|length|tokens|
|
|
89
89
|
+--------------------------------------------------------------------------------+-----+-----+------+------+
|
|
90
90
|
|[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...| 0| 3018| 3018| 512|
|
|
91
|
-
|[study of crime, and occupied his
|
|
92
|
-
|[but as I have changed my clothes I can't imagine how you
|
|
93
|
-
|[quarters received. Be in your chamber then at that hour, and do
|
|
94
|
-
|[a pity
|
|
95
|
-
|[person who employs me wishes his agent to be unknown to
|
|
96
|
-
|[letters back."
|
|
97
|
-
|[seven hundred in
|
|
91
|
+
|[study of crime, and occupied his\\nimmense faculties and extraordinary powers...| 2950| 5707| 2757| 512|
|
|
92
|
+
|[but as I have changed my clothes I can't imagine how you\\ndeduce it. As to M...| 5659| 8483| 2824| 512|
|
|
93
|
+
|[quarters received. Be in your chamber then at that hour, and do\\nnot take it...| 8427|11241| 2814| 512|
|
|
94
|
+
|[a pity\\nto miss it."\\n\\n"But your client--"\\n\\n"Never mind him. I may want y...|11188|13970| 2782| 512|
|
|
95
|
+
|[person who employs me wishes his agent to be unknown to\\nyou, and I may conf...|13918|16898| 2980| 512|
|
|
96
|
+
|[letters back."\\n\\n"Precisely so. But how--"\\n\\n"Was there a secret marriage?...|16836|19744| 2908| 512|
|
|
97
|
+
|[seven hundred in\\nnotes," he said.\\n\\nHolmes scribbled a receipt upon a shee...|19683|22551| 2868| 512|
|
|
98
98
|
+--------------------------------------------------------------------------------+-----+-----+------+------+
|
|
99
99
|
|
|
100
100
|
"""
|
|
@@ -49,23 +49,9 @@ class BGEEmbeddings(AnnotatorModel,
|
|
|
49
49
|
``DOCUMENT`` ``SENTENCE_EMBEDDINGS``
|
|
50
50
|
====================== ======================
|
|
51
51
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
Size of every batch , by default 8
|
|
56
|
-
dimension
|
|
57
|
-
Number of embedding dimensions, by default 768
|
|
58
|
-
caseSensitive
|
|
59
|
-
Whether to ignore case in tokens for embeddings matching, by default False
|
|
60
|
-
maxSentenceLength
|
|
61
|
-
Max sentence length to process, by default 512
|
|
62
|
-
configProtoBytes
|
|
63
|
-
ConfigProto from tensorflow, serialized into byte array.
|
|
64
|
-
useCLSToken
|
|
65
|
-
Whether to use the CLS token for sentence embeddings, by default True
|
|
66
|
-
|
|
67
|
-
References
|
|
68
|
-
----------
|
|
52
|
+
|
|
53
|
+
**References**
|
|
54
|
+
|
|
69
55
|
`C-Pack: Packaged Resources To Advance General Chinese Embedding <https://arxiv.org/pdf/2309.07597>`__
|
|
70
56
|
`BGE Github Repository <https://github.com/FlagOpen/FlagEmbedding>`__
|
|
71
57
|
|
|
@@ -84,6 +70,22 @@ class BGEEmbeddings(AnnotatorModel,
|
|
|
84
70
|
benchmark; meanwhile, our released English data is 2 times larger than the Chinese data. All
|
|
85
71
|
these resources are made publicly available at https://github.com/FlagOpen/FlagEmbedding.*
|
|
86
72
|
|
|
73
|
+
|
|
74
|
+
Parameters
|
|
75
|
+
----------
|
|
76
|
+
batchSize
|
|
77
|
+
Size of every batch , by default 8
|
|
78
|
+
dimension
|
|
79
|
+
Number of embedding dimensions, by default 768
|
|
80
|
+
caseSensitive
|
|
81
|
+
Whether to ignore case in tokens for embeddings matching, by default False
|
|
82
|
+
maxSentenceLength
|
|
83
|
+
Max sentence length to process, by default 512
|
|
84
|
+
configProtoBytes
|
|
85
|
+
ConfigProto from tensorflow, serialized into byte array.
|
|
86
|
+
useCLSToken
|
|
87
|
+
Whether to use the CLS token for sentence embeddings, by default True
|
|
88
|
+
|
|
87
89
|
Examples
|
|
88
90
|
--------
|
|
89
91
|
>>> import sparknlp
|
|
@@ -106,8 +108,8 @@ class BGEEmbeddings(AnnotatorModel,
|
|
|
106
108
|
... embeddingsFinisher
|
|
107
109
|
... ])
|
|
108
110
|
>>> data = spark.createDataFrame([["query: how much protein should a female eat",
|
|
109
|
-
... "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." +
|
|
110
|
-
... "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" +
|
|
111
|
+
... "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + \\
|
|
112
|
+
... "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + \\
|
|
111
113
|
... "marathon. Check out the chart below to see how much protein you should be eating each day.",
|
|
112
114
|
... ]]).toDF("text")
|
|
113
115
|
>>> result = pipeline.fit(data).transform(data)
|
|
@@ -47,21 +47,7 @@ class SnowFlakeEmbeddings(AnnotatorModel,
|
|
|
47
47
|
``DOCUMENT`` ``SENTENCE_EMBEDDINGS``
|
|
48
48
|
====================== ======================
|
|
49
49
|
|
|
50
|
-
|
|
51
|
-
----------
|
|
52
|
-
batchSize
|
|
53
|
-
Size of every batch , by default 8
|
|
54
|
-
dimension
|
|
55
|
-
Number of embedding dimensions, by default 768
|
|
56
|
-
caseSensitive
|
|
57
|
-
Whether to ignore case in tokens for embeddings matching, by default False
|
|
58
|
-
maxSentenceLength
|
|
59
|
-
Max sentence length to process, by default 512
|
|
60
|
-
configProtoBytes
|
|
61
|
-
ConfigProto from tensorflow, serialized into byte array.
|
|
62
|
-
|
|
63
|
-
References
|
|
64
|
-
----------
|
|
50
|
+
**References**
|
|
65
51
|
|
|
66
52
|
`Arctic-Embed: Scalable, Efficient, and Accurate Text Embedding Models <https://arxiv.org/abs/2405.05374>`__
|
|
67
53
|
`Snowflake Arctic-Embed Models <https://github.com/Snowflake-Labs/arctic-embed>`__
|
|
@@ -78,6 +64,20 @@ class SnowFlakeEmbeddings(AnnotatorModel,
|
|
|
78
64
|
data curation is crucial to retrieval accuracy. A detailed technical report will be available
|
|
79
65
|
shortly. *
|
|
80
66
|
|
|
67
|
+
Parameters
|
|
68
|
+
----------
|
|
69
|
+
batchSize
|
|
70
|
+
Size of every batch , by default 8
|
|
71
|
+
dimension
|
|
72
|
+
Number of embedding dimensions, by default 768
|
|
73
|
+
caseSensitive
|
|
74
|
+
Whether to ignore case in tokens for embeddings matching, by default False
|
|
75
|
+
maxSentenceLength
|
|
76
|
+
Max sentence length to process, by default 512
|
|
77
|
+
configProtoBytes
|
|
78
|
+
ConfigProto from tensorflow, serialized into byte array.
|
|
79
|
+
|
|
80
|
+
|
|
81
81
|
Examples
|
|
82
82
|
--------
|
|
83
83
|
>>> import sparknlp
|
|
@@ -63,7 +63,6 @@ class OpenAICompletion(AnnotatorModel):
|
|
|
63
63
|
>>> from sparknlp.annotator import *
|
|
64
64
|
>>> from sparknlp.common import *
|
|
65
65
|
>>> from pyspark.ml import Pipeline
|
|
66
|
-
|
|
67
66
|
>>> documentAssembler = DocumentAssembler() \\
|
|
68
67
|
... .setInputCol("text") \\
|
|
69
68
|
... .setOutputCol("document")
|
|
@@ -83,9 +82,9 @@ class OpenAICompletion(AnnotatorModel):
|
|
|
83
82
|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
84
83
|
|completion |
|
|
85
84
|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
86
|
-
|[{document, 0, 258,
|
|
87
|
-
|[{document, 0, 227,
|
|
88
|
-
|[{document, 0, 172,
|
|
85
|
+
|[{document, 0, 258, \\n\\nI had the pleasure of dining at La Fiorita recently, and it was a truly delightful experience! The menu boasted a wonderful selection of classic Italian dishes, all exquisitely prepared and presented. The service staff was friendly and attentive and really, {}, []}]|
|
|
86
|
+
|[{document, 0, 227, \\n\\nI recently visited Barbecue Joe's for dinner and it was amazing! The menu had so many items to choose from including pulled pork, smoked turkey, brisket, pork ribs, and sandwiches. I opted for the pulled pork sandwich and let, {}, []}] |
|
|
87
|
+
|[{document, 0, 172, \\n\\n{ \\n "review": { \\n "overallRating": 4, \\n "reviewBody": "I enjoyed my meal at this restaurant. The food was flavourful, well-prepared and beautifully presented., {}, []}] |
|
|
89
88
|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
|
90
89
|
"""
|
|
91
90
|
|
|
@@ -77,7 +77,7 @@ class M2M100Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
|
|
|
77
77
|
Target Language (Default: `fr`)
|
|
78
78
|
|
|
79
79
|
Languages Covered
|
|
80
|
-
|
|
80
|
+
-----------------
|
|
81
81
|
Afrikaans (af), Amharic (am), Arabic (ar), Asturian (ast), Azerbaijani (az), Bashkir (ba),
|
|
82
82
|
Belarusian (be), Bulgarian (bg), Bengali (bn), Breton (br), Bosnian (bs), Catalan; Valencian
|
|
83
83
|
(ca), Cebuano (ceb), Czech (cs), Welsh (cy), Danish (da), German (de), Greek (el), English
|