spark-nlp 6.0.0__py2.py3-none-any.whl → 6.0.1__py2.py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.


@@ -0,0 +1,432 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from sparknlp.common import *
+
+ class SmolVLMTransformer(AnnotatorModel,
+                          HasBatchedAnnotateImage,
+                          HasImageFeatureProperties,
+                          HasEngine,
+                          HasCandidateLabelsProperties,
+                          HasRescaleFactor):
+     """
+     SmolVLMTransformer can load SmolVLM models for visual question answering. The model
+     consists of a vision encoder, a text encoder as well as a text decoder. The vision encoder
+     will encode the input image, the text encoder will encode the input question together with the
+     encoding of the image, and the text decoder will output the answer to the question.
+
+     SmolVLM is a compact open multimodal model that accepts arbitrary sequences of image and text
+     inputs to produce text outputs. Designed for efficiency, SmolVLM can answer questions about images,
+     describe visual content, create stories grounded on multiple images, or function as a pure language
+     model without visual inputs. Its lightweight architecture makes it suitable for on-device applications
+     while maintaining strong performance on multimodal tasks.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
+     >>> visualQA = SmolVLMTransformer.pretrained() \
+     ...     .setInputCols(["image_assembler"]) \
+     ...     .setOutputCol("answer")
+
+     The default model is `"smolvlm_instruct_int4"`, if no name is provided.
+     For available pretrained models, refer to the `Models Hub
+     <https://sparknlp.org/models?task=Question+Answering>`__.
+
+     Models from the HuggingFace 🤗 Transformers library are also compatible with Spark NLP 🚀.
+     To check compatibility and learn how to import them, see `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+     For extended examples, refer to the `SmolVLMTransformer Test Suite
+     <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/SmolVLMTransformerTest.scala>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``IMAGE``              ``DOCUMENT``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize : int, optional
+         Batch size. Larger values allow faster processing but require more memory,
+         by default 1.
+     configProtoBytes : bytes, optional
+         ConfigProto from TensorFlow, serialized into a byte array.
+     maxSentenceLength : int, optional
+         Maximum sentence length to process, by default 20.
+     doImageSplitting : bool, optional
+         Whether to split the image, by default True.
+     imageToken : int, optional
+         Token ID for image embeddings, by default 49153.
+     numVisionTokens : int, optional
+         Number of vision tokens, by default 81.
+     maxImageSize : int, optional
+         Maximum image size for the model, by default 384.
+     patchSize : int, optional
+         Patch size for the model, by default 14.
+     paddingConstant : int, optional
+         Padding constant for the model, by default 0.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> from pyspark.sql.functions import lit
+
+     >>> imageDF = spark.read.format("image").load(path=images_path)
+     >>> testDF = imageDF.withColumn(
+     ...     "text",
+     ...     lit("<|im_start|>User:<image>Can you describe the image?<end_of_utterance>\nAssistant:")
+     ... )
+
+     >>> imageAssembler = ImageAssembler() \
+     ...     .setInputCol("image") \
+     ...     .setOutputCol("image_assembler")
+
+     >>> visualQAClassifier = SmolVLMTransformer.pretrained() \
+     ...     .setInputCols("image_assembler") \
+     ...     .setOutputCol("answer")
+
+     >>> pipeline = Pipeline().setStages([
+     ...     imageAssembler,
+     ...     visualQAClassifier
+     ... ])
+
+     >>> result = pipeline.fit(testDF).transform(testDF)
+     >>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
+
+     +--------------------------------------+---------------------------------------------------------------------------------------+
+     |origin                                |result                                                                                 |
+     +--------------------------------------+---------------------------------------------------------------------------------------+
+     |[file:///content/images/cat_image.jpg]|[The unusual aspect of this picture is the presence of two cats lying on a pink couch] |
+     +--------------------------------------+---------------------------------------------------------------------------------------+
+     """
+
+     name = "SmolVLMTransformer"
+
+     inputAnnotatorTypes = [AnnotatorType.IMAGE]
+
+     outputAnnotatorType = AnnotatorType.DOCUMENT
+
+     minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+                             typeConverter=TypeConverters.toInt)
+
+     maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+                             typeConverter=TypeConverters.toInt)
+
+     doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+                      typeConverter=TypeConverters.toBoolean)
+
+     temperature = Param(Params._dummy(), "temperature", "The value used to modulate the next token probabilities",
+                         typeConverter=TypeConverters.toFloat)
+
+     topK = Param(Params._dummy(), "topK",
+                  "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+                  typeConverter=TypeConverters.toInt)
+
+     topP = Param(Params._dummy(), "topP",
+                  "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+                  typeConverter=TypeConverters.toFloat)
+
+     repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+                               "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+                               typeConverter=TypeConverters.toFloat)
+
+     noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+                               "If set to int > 0, all ngrams of that size can only occur once",
+                               typeConverter=TypeConverters.toInt)
+
+     ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+                            "A list of token ids which are ignored in the decoder's output",
+                            typeConverter=TypeConverters.toListInt)
+
+     beamSize = Param(Params._dummy(), "beamSize",
+                      "The number of beams for beam search.",
+                      typeConverter=TypeConverters.toInt)
+
+     stopTokenIds = Param(Params._dummy(), "stopTokenIds",
+                          "Stop tokens to terminate the generation",
+                          typeConverter=TypeConverters.toListInt)
+
+     imageToken = Param(Params._dummy(), "imageToken",
+                        "Token id for image embeddings",
+                        typeConverter=TypeConverters.toInt)
+
+     numVisionTokens = Param(Params._dummy(), "numVisionTokens",
+                             "Number of vision tokens",
+                             typeConverter=TypeConverters.toInt)
+
+     maxImageSize = Param(Params._dummy(), "maxImageSize",
+                          "Maximum image size for the model",
+                          typeConverter=TypeConverters.toInt)
+
+     patchSize = Param(Params._dummy(), "patchSize",
+                       "Patch size for the model",
+                       typeConverter=TypeConverters.toInt)
+
+     paddingConstant = Param(Params._dummy(), "paddingConstant",
+                             "Padding constant for the model",
+                             typeConverter=TypeConverters.toInt)
+
+     doImageSplitting = Param(Params._dummy(), "doImageSplitting",
+                              "Whether to split the image",
+                              typeConverter=TypeConverters.toBoolean)
+
+     def setMaxSentenceSize(self, value):
+         """Sets the maximum sentence length that the annotator will process, by
+         default 20.
+         Parameters
+         ----------
+         value : int
+             Maximum sentence length that the annotator will process
+         """
+         return self._set(maxSentenceLength=value)
+
+     def setIgnoreTokenIds(self, value):
+         """A list of token ids which are ignored in the decoder's output.
+         Parameters
+         ----------
+         value : List[int]
+             The words to be filtered out
+         """
+         return self._set(ignoreTokenIds=value)
+
+     def setStopTokenIds(self, value):
+         """Stop tokens to terminate the generation.
+         Parameters
+         ----------
+         value : List[int]
+             The tokens that terminate generation
+         """
+         return self._set(stopTokenIds=value)
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from TensorFlow, serialized into byte array.
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from TensorFlow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setMinOutputLength(self, value):
+         """Sets minimum length of the sequence to be generated.
+         Parameters
+         ----------
+         value : int
+             Minimum length of the sequence to be generated
+         """
+         return self._set(minOutputLength=value)
+
+     def setMaxOutputLength(self, value):
+         """Sets maximum length of output text.
+         Parameters
+         ----------
+         value : int
+             Maximum length of output text
+         """
+         return self._set(maxOutputLength=value)
+
+     def setDoSample(self, value):
+         """Sets whether or not to use sampling; use greedy decoding otherwise.
+         Parameters
+         ----------
+         value : bool
+             Whether or not to use sampling; use greedy decoding otherwise
+         """
+         return self._set(doSample=value)
+
+     def setTemperature(self, value):
+         """Sets the value used to modulate the next token probabilities.
+         Parameters
+         ----------
+         value : float
+             The value used to modulate the next token probabilities
+         """
+         return self._set(temperature=value)
+
+     def setTopK(self, value):
+         """Sets the number of highest probability vocabulary tokens to keep for
+         top-k-filtering.
+         Parameters
+         ----------
+         value : int
+             Number of highest probability vocabulary tokens to keep
+         """
+         return self._set(topK=value)
+
+     def setTopP(self, value):
+         """Sets the top cumulative probability for vocabulary tokens.
+         If set to float < 1, only the most probable tokens with probabilities
+         that add up to ``topP`` or higher are kept for generation.
+         Parameters
+         ----------
+         value : float
+             Cumulative probability for vocabulary tokens
+         """
+         return self._set(topP=value)
+
+     def setRepetitionPenalty(self, value):
+         """Sets the parameter for repetition penalty. 1.0 means no penalty.
+         Parameters
+         ----------
+         value : float
+             The repetition penalty
+         References
+         ----------
+         See `Ctrl: A Conditional Transformer Language Model For Controllable
+         Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+         """
+         return self._set(repetitionPenalty=value)
+
+     def setNoRepeatNgramSize(self, value):
+         """Sets size of n-grams that can only occur once.
+         If set to int > 0, all ngrams of that size can only occur once.
+         Parameters
+         ----------
+         value : int
+             N-gram size that can only occur once
+         """
+         return self._set(noRepeatNgramSize=value)
+
+     def setBeamSize(self, value):
+         """Sets the number of beams for beam search, by default `1`.
+         Parameters
+         ----------
+         value : int
+             Number of beams for beam search
+         """
+         return self._set(beamSize=value)
+
+     def setImageToken(self, value):
+         """Sets the token ID for image embeddings.
+         Parameters
+         ----------
+         value : int
+             Token ID for image embeddings
+         """
+         return self._set(imageToken=value)
+
+     def setNumVisionTokens(self, value):
+         """Sets the number of vision tokens.
+         Parameters
+         ----------
+         value : int
+             Number of vision tokens
+         """
+         return self._set(numVisionTokens=value)
+
+     def setMaxImageSize(self, value):
+         """Sets the maximum image size for the model.
+         Parameters
+         ----------
+         value : int
+             Maximum image size
+         """
+         return self._set(maxImageSize=value)
+
+     def setPatchSize(self, value):
+         """Sets the patch size for the model.
+         Parameters
+         ----------
+         value : int
+             Patch size
+         """
+         return self._set(patchSize=value)
+
+     def setPaddingConstant(self, value):
+         """Sets the padding constant for the model.
+         Parameters
+         ----------
+         value : int
+             Padding constant
+         """
+         return self._set(paddingConstant=value)
+
+     def setDoImageSplitting(self, value):
+         """Sets whether to split the image.
+         Parameters
+         ----------
+         value : bool
+             Whether to split the image
+         """
+         return self._set(doImageSplitting=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.SmolVLMTransformer",
+                  java_model=None):
+         super(SmolVLMTransformer, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=1,
+             minOutputLength=0,
+             maxOutputLength=20,
+             doSample=False,
+             temperature=0.6,
+             topK=-1,
+             topP=0.9,
+             repetitionPenalty=1.0,
+             noRepeatNgramSize=3,
+             ignoreTokenIds=[],
+             beamSize=1,
+             stopTokenIds=[49154],
+             imageToken=49153,
+             numVisionTokens=81,
+             maxImageSize=384,
+             patchSize=14,
+             paddingConstant=0,
+             doImageSplitting=True
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session, use_openvino=False):
+         """Loads a locally saved model.
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+         use_openvino : bool, optional
+             Whether to use OpenVINO for inference, by default False
+         Returns
+         -------
+         SmolVLMTransformer
+             The restored model
+         """
+         from sparknlp.internal import _SmolVLMTransformerLoader
+         jModel = _SmolVLMTransformerLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+         return SmolVLMTransformer(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="smolvlm_instruct_int4", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "smolvlm_instruct_int4"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+         Returns
+         -------
+         SmolVLMTransformer
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(SmolVLMTransformer, name, lang, remote_loc)
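The new annotator follows the usual Spark NLP pattern: assemble images, optionally tune the generation Params defined above, and run a pipeline. A minimal sketch of end-to-end usage, assuming a session started via sparknlp.start() and a placeholder image folder "/tmp/images"; the setter values are illustrative, not recommendations:

import sparknlp
from sparknlp.base import ImageAssembler
from sparknlp.annotator import SmolVLMTransformer
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit

# Assumption: sparknlp.start() provides the SparkSession; any session with a
# matching Spark NLP jar on the classpath works as well.
spark = sparknlp.start()

# "/tmp/images" is a placeholder folder of images.
imageDF = spark.read.format("image").load("/tmp/images")
testDF = imageDF.withColumn(
    "text",
    lit("<|im_start|>User:<image>Can you describe the image?<end_of_utterance>\nAssistant:"))

imageAssembler = ImageAssembler() \
    .setInputCol("image") \
    .setOutputCol("image_assembler")

# Generation setters come straight from the Params in the diff; the values
# here are only examples.
visualQA = SmolVLMTransformer.pretrained() \
    .setInputCols(["image_assembler"]) \
    .setOutputCol("answer") \
    .setMaxOutputLength(50) \
    .setBeamSize(1) \
    .setDoSample(False)

result = Pipeline(stages=[imageAssembler, visualQA]).fit(testDF).transform(testDF)
result.select("answer.result").show(truncate=False)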
@@ -272,6 +272,15 @@ class _GPT2Loader(ExtendedJavaWrapper):
              jspark,
          )
  
+ class _Gemma3ForMultiModalLoader(ExtendedJavaWrapper):
+     def __init__(self, path, jspark, use_openvino=False):
+         super(_Gemma3ForMultiModalLoader, self).__init__(
+             "com.johnsnowlabs.nlp.annotators.cv.Gemma3ForMultiModal.loadSavedModel",
+             path,
+             jspark,
+             use_openvino
+         )
+
  class _JanusForMultiModalLoader(ExtendedJavaWrapper):
      def __init__(self, path, jspark, use_openvino=False):
          super(_JanusForMultiModalLoader, self).__init__(
@@ -1119,3 +1128,21 @@ class _Qwen2VLTransformerLoader(ExtendedJavaWrapper):
              jspark,
              use_openvino,
          )
+
+ class _PaliGemmaForMultiModalLoader(ExtendedJavaWrapper):
+     def __init__(self, path, jspark, use_openvino=False):
+         super(_PaliGemmaForMultiModalLoader, self).__init__(
+             "com.johnsnowlabs.nlp.annotators.cv.PaliGemmaForMultiModal.loadSavedModel",
+             path,
+             jspark,
+             use_openvino,
+         )
+
+ class _SmolVLMTransformerLoader(ExtendedJavaWrapper):
+     def __init__(self, path, jspark, use_openvino=False):
+         super(_SmolVLMTransformerLoader, self).__init__(
+             "com.johnsnowlabs.nlp.annotators.cv.SmolVLMTransformer.loadSavedModel",
+             path,
+             jspark,
+             use_openvino
+         )
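These private wrappers are the bridge that the public loadSavedModel helpers call into; users would normally go through the annotator class rather than the loader. A minimal sketch, assuming an already exported model folder (the "/models/..." paths are placeholders):

from sparknlp.annotator import SmolVLMTransformer

# "/models/smolvlm_openvino" is a placeholder for a folder produced by an
# external export step; use_openvino mirrors the loader's keyword argument.
model = SmolVLMTransformer.loadSavedModel("/models/smolvlm_openvino", spark,
                                          use_openvino=True)

# Persist the imported model so later runs can use SmolVLMTransformer.load(...).
model.write().overwrite().save("/models/smolvlm_spark_nlp")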
@@ -0,0 +1,19 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from enum import Enum
+
+ class TextStripperType(Enum):
+     """Text Stripper Type"""
+     PDF_TEXT_STRIPPER = "PDFTextStripper"
+     PDF_LAYOUT_TEXT_STRIPPER = "PDFLayoutTextStripper"
@@ -4,6 +4,8 @@ from pyspark.ml.param.shared import HasInputCol, HasOutputCol
  from pyspark.ml.util import JavaMLReadable, JavaMLWritable
  from pyspark.ml.wrapper import JavaTransformer
  
+ from sparknlp.reader.enums import TextStripperType
+
  
  class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
                  JavaMLReadable, JavaMLWritable):
@@ -25,6 +27,22 @@ class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
                                "Force to store splitted pdf.",
                                typeConverter=TypeConverters.toBoolean)
  
+     splitPage = Param(Params._dummy(), "splitPage",
+                       "Param for enable/disable splitting document per page",
+                       typeConverter=TypeConverters.toBoolean)
+
+     textStripper = Param(Params._dummy(), "textStripper",
+                          "Text stripper type used for output layout and formatting",
+                          typeConverter=TypeConverters.toString)
+
+     sort = Param(Params._dummy(), "sort",
+                  "Param for enable/disable sort lines",
+                  typeConverter=TypeConverters.toBoolean)
+
+     onlyPageNum = Param(Params._dummy(), "onlyPageNum",
+                         "Force to extract only number of pages",
+                         typeConverter=TypeConverters.toBoolean)
+
      @keyword_only
      def __init__(self):
          """
@@ -33,7 +51,6 @@ class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
          super(PdfToText, self).__init__()
          self._java_obj = self._new_java_obj("com.johnsnowlabs.reader.PdfToText", self.uid)
  
-
      def setInputCol(self, value):
          """
          Sets the value of :py:attr:`inputCol`.
@@ -63,3 +80,32 @@ class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
          Sets the value of :py:attr:`storeSplittedPdf`.
          """
          return self._set(storeSplittedPdf=value)
+
+     def setSplitPage(self, value):
+         """
+         Sets the value of :py:attr:`splitPage`.
+         """
+         return self._set(splitPage=value)
+
+     def setOnlyPageNum(self, value):
+         """
+         Sets the value of :py:attr:`onlyPageNum`.
+         """
+         return self._set(onlyPageNum=value)
+
+     def setTextStripper(self, value):
+         """
+         Sets the value of :py:attr:`textStripper`.
+         """
+         if isinstance(value, TextStripperType):
+             value = value.value
+         if value not in [i.value for i in TextStripperType]:
+             type_value = type(value)
+             raise ValueError(f"Param textStripper must be a 'TextStripperType' enum but got {type_value}.")
+         return self._set(textStripper=str(value))
+
+     def setSort(self, value):
+         """
+         Sets the value of :py:attr:`sort`.
+         """
+         return self._set(sort=value)
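The new PdfToText options pair with the TextStripperType enum above: setTextStripper accepts either an enum member or its string value and rejects anything else. A minimal sketch, assuming PdfToText is importable from sparknlp.reader and that PDFs are read with Spark's binaryFile source (so the raw bytes arrive in a "content" column):

from sparknlp.reader import PdfToText  # assumed import path for the transformer
from sparknlp.reader.enums import TextStripperType

# "/tmp/pdfs" is a placeholder folder; binaryFile exposes file bytes as "content".
pdfDF = spark.read.format("binaryFile").load("/tmp/pdfs")

pdfToText = PdfToText() \
    .setInputCol("content") \
    .setOutputCol("text") \
    .setSplitPage(True) \
    .setSort(True) \
    .setTextStripper(TextStripperType.PDF_LAYOUT_TEXT_STRIPPER)

# Passing a string outside the enum values would raise the ValueError shown above.
pdfToText.transform(pdfDF).select("text").show(truncate=False)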