spark-nlp 5.5.2__py2.py3-none-any.whl → 6.0.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of spark-nlp might be problematic. Click here for more details.
- {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/METADATA +20 -11
- {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/RECORD +33 -18
- sparknlp/__init__.py +2 -2
- sparknlp/annotator/classifier_dl/__init__.py +4 -0
- sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +2 -2
- sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
- sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
- sparknlp/annotator/cleaners/__init__.py +15 -0
- sparknlp/annotator/cleaners/cleaner.py +202 -0
- sparknlp/annotator/cleaners/extractor.py +191 -0
- sparknlp/annotator/cv/__init__.py +6 -1
- sparknlp/annotator/cv/janus_for_multimodal.py +356 -0
- sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
- sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
- sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
- sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
- sparknlp/annotator/embeddings/auto_gguf_embeddings.py +10 -6
- sparknlp/annotator/embeddings/bge_embeddings.py +7 -3
- sparknlp/annotator/seq2seq/__init__.py +3 -0
- sparknlp/annotator/seq2seq/auto_gguf_model.py +8 -503
- sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +333 -0
- sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
- sparknlp/annotator/seq2seq/llama3_transformer.py +4 -4
- sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
- sparknlp/base/image_assembler.py +58 -0
- sparknlp/common/properties.py +632 -96
- sparknlp/internal/__init__.py +100 -2
- sparknlp/reader/pdf_to_text.py +65 -0
- sparknlp/reader/sparknlp_reader.py +260 -60
- spark_nlp-5.5.2.dist-info/.uuid +0 -1
- {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/WHEEL +0 -0
- {spark_nlp-5.5.2.dist-info → spark_nlp-6.0.0.dist-info}/top_level.txt +0 -0
sparknlp/internal/__init__.py
CHANGED
|
@@ -67,6 +67,15 @@ class _AlbertForZeroShotClassificationLoader(ExtendedJavaWrapper):
|
|
|
67
67
|
)
|
|
68
68
|
|
|
69
69
|
|
|
70
|
+
class _AlbertMultipleChoiceLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``AlbertForMultipleChoice.loadSavedModel`` arguments.

    The fully qualified Java name, ``path`` and ``jspark`` are handed, in that
    order, to the ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, path, jspark):
        java_target = "com.johnsnowlabs.nlp.annotators.classifier.dl.AlbertForMultipleChoice.loadSavedModel"
        super(_AlbertMultipleChoiceLoader, self).__init__(java_target, path, jspark)
|
|
77
|
+
|
|
78
|
+
|
|
70
79
|
class _BertLoader(ExtendedJavaWrapper):
|
|
71
80
|
def __init__(self, path, jspark, use_openvino=False):
|
|
72
81
|
super(_BertLoader, self).__init__(
|
|
@@ -121,6 +130,15 @@ class _BertMultipleChoiceLoader(ExtendedJavaWrapper):
|
|
|
121
130
|
jspark,
|
|
122
131
|
)
|
|
123
132
|
|
|
133
|
+
class _CoHereLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``CoHereTransformer.loadSavedModel`` arguments.

    The fully qualified Java name, ``path``, ``jspark`` and ``use_openvino``
    (default ``False``) are handed, in that order, to the
    ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, path, jspark, use_openvino=False):
        java_target = "com.johnsnowlabs.nlp.annotators.seq2seq.CoHereTransformer.loadSavedModel"
        super(_CoHereLoader, self).__init__(java_target, path, jspark, use_openvino)
|
|
141
|
+
|
|
124
142
|
class _DeBERTaLoader(ExtendedJavaWrapper):
|
|
125
143
|
def __init__(self, path, jspark):
|
|
126
144
|
super(_DeBERTaLoader, self).__init__(
|
|
@@ -211,6 +229,15 @@ class _DistilBertQuestionAnsweringLoader(ExtendedJavaWrapper):
|
|
|
211
229
|
)
|
|
212
230
|
|
|
213
231
|
|
|
232
|
+
class _DistilBertMultipleChoiceLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``DistilBertForMultipleChoice.loadSavedModel`` arguments.

    The fully qualified Java name, ``path`` and ``jspark`` are handed, in that
    order, to the ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, path, jspark):
        java_target = "com.johnsnowlabs.nlp.annotators.classifier.dl.DistilBertForMultipleChoice.loadSavedModel"
        super(_DistilBertMultipleChoiceLoader, self).__init__(java_target, path, jspark)
|
|
239
|
+
|
|
240
|
+
|
|
214
241
|
class _ElmoLoader(ExtendedJavaWrapper):
|
|
215
242
|
def __init__(self, path, jspark):
|
|
216
243
|
super(_ElmoLoader, self).__init__(
|
|
@@ -245,6 +272,14 @@ class _GPT2Loader(ExtendedJavaWrapper):
|
|
|
245
272
|
jspark,
|
|
246
273
|
)
|
|
247
274
|
|
|
275
|
+
class _JanusForMultiModalLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``JanusForMultiModal.loadSavedModel`` arguments.

    The fully qualified Java name, ``path``, ``jspark`` and ``use_openvino``
    (default ``False``) are handed, in that order, to the
    ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, path, jspark, use_openvino=False):
        java_target = "com.johnsnowlabs.nlp.annotators.cv.JanusForMultiModal.loadSavedModel"
        super(_JanusForMultiModalLoader, self).__init__(java_target, path, jspark, use_openvino)
|
|
248
283
|
|
|
249
284
|
class _LLAMA2Loader(ExtendedJavaWrapper):
|
|
250
285
|
def __init__(self, path, jspark, use_openvino=False):
|
|
@@ -299,6 +334,14 @@ class _LongformerQuestionAnsweringLoader(ExtendedJavaWrapper):
|
|
|
299
334
|
jspark,
|
|
300
335
|
)
|
|
301
336
|
|
|
337
|
+
class _LLAVAForMultiModalLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``LLAVAForMultiModal.loadSavedModel`` arguments.

    The fully qualified Java name, ``path``, ``jspark`` and ``use_openvino``
    (default ``False``) are handed, in that order, to the
    ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, path, jspark, use_openvino=False):
        java_target = "com.johnsnowlabs.nlp.annotators.cv.LLAVAForMultiModal.loadSavedModel"
        super(_LLAVAForMultiModalLoader, self).__init__(java_target, path, jspark, use_openvino)
|
|
302
345
|
|
|
303
346
|
class _M2M100Loader(ExtendedJavaWrapper):
|
|
304
347
|
def __init__(self, path, jspark, use_openvino=False):
|
|
@@ -318,6 +361,14 @@ class _MistralLoader(ExtendedJavaWrapper):
|
|
|
318
361
|
use_openvino,
|
|
319
362
|
)
|
|
320
363
|
|
|
364
|
+
class _MLLamaForMultimodalLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``MLLamaForMultimodal.loadSavedModel`` arguments.

    The fully qualified Java name, ``path``, ``jspark`` and ``use_openvino``
    (default ``False``) are handed, in that order, to the
    ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, path, jspark, use_openvino=False):
        java_target = "com.johnsnowlabs.nlp.annotators.cv.MLLamaForMultimodal.loadSavedModel"
        super(_MLLamaForMultimodalLoader, self).__init__(java_target, path, jspark, use_openvino)
|
|
321
372
|
|
|
322
373
|
class _NLLBLoader(ExtendedJavaWrapper):
|
|
323
374
|
def __init__(self, path, jspark, use_openvino=False):
|
|
@@ -345,6 +396,10 @@ class _MPNetLoader(ExtendedJavaWrapper):
|
|
|
345
396
|
)
|
|
346
397
|
|
|
347
398
|
|
|
399
|
+
class _OLMoLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``OLMoTransformer.loadSavedModel`` arguments.

    The fully qualified Java name, ``path`` and ``jspark`` are handed, in that
    order, to the ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, path, jspark):
        java_target = "com.johnsnowlabs.nlp.annotators.seq2seq.OLMoTransformer.loadSavedModel"
        super(_OLMoLoader, self).__init__(java_target, path, jspark)
|
|
348
403
|
class _Phi2Loader(ExtendedJavaWrapper):
|
|
349
404
|
def __init__(self, path, jspark, use_openvino=False):
|
|
350
405
|
super(_Phi2Loader, self).__init__(
|
|
@@ -363,6 +418,15 @@ class _Phi3Loader(ExtendedJavaWrapper):
|
|
|
363
418
|
use_openvino,
|
|
364
419
|
)
|
|
365
420
|
|
|
421
|
+
class _Phi3VisionLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``Phi3Vision.loadSavedModel`` arguments.

    The fully qualified Java name, ``path``, ``jspark`` and ``use_openvino``
    (default ``False``) are handed, in that order, to the
    ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, path, jspark, use_openvino=False):
        java_target = "com.johnsnowlabs.nlp.annotators.cv.Phi3Vision.loadSavedModel"
        super(_Phi3VisionLoader, self).__init__(java_target, path, jspark, use_openvino)
|
|
429
|
+
|
|
366
430
|
class _RoBertaLoader(ExtendedJavaWrapper):
|
|
367
431
|
def __init__(self, path, jspark, use_openvino=False):
|
|
368
432
|
super(_RoBertaLoader, self).__init__(
|
|
@@ -409,6 +473,15 @@ class _RoBertaQuestionAnsweringLoader(ExtendedJavaWrapper):
|
|
|
409
473
|
)
|
|
410
474
|
|
|
411
475
|
|
|
476
|
+
class _RoBertaMultipleChoiceLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``RoBertaForMultipleChoice.loadSavedModel`` arguments.

    The fully qualified Java name, ``path`` and ``jspark`` are handed, in that
    order, to the ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, path, jspark):
        java_target = "com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForMultipleChoice.loadSavedModel"
        super(_RoBertaMultipleChoiceLoader, self).__init__(java_target, path, jspark)
|
|
483
|
+
|
|
484
|
+
|
|
412
485
|
class _StarCoderLoader(ExtendedJavaWrapper):
|
|
413
486
|
def __init__(self, path, jspark, use_openvino=False):
|
|
414
487
|
super(_StarCoderLoader, self).__init__(
|
|
@@ -504,6 +577,15 @@ class _XlmRoBertaQuestionAnsweringLoader(ExtendedJavaWrapper):
|
|
|
504
577
|
)
|
|
505
578
|
|
|
506
579
|
|
|
580
|
+
class _XlmRoBertaMultipleChoiceLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``XlmRoBertaForMultipleChoice.loadSavedModel`` arguments.

    The fully qualified Java name, ``path`` and ``jspark`` are handed, in that
    order, to the ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, path, jspark):
        java_target = "com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForMultipleChoice.loadSavedModel"
        super(_XlmRoBertaMultipleChoiceLoader, self).__init__(java_target, path, jspark)
|
|
587
|
+
|
|
588
|
+
|
|
507
589
|
class _XlnetLoader(ExtendedJavaWrapper):
|
|
508
590
|
def __init__(self, path, jspark):
|
|
509
591
|
super(_XlnetLoader, self).__init__(
|
|
@@ -992,8 +1074,8 @@ class _AutoGGUFLoader(ExtendedJavaWrapper):
|
|
|
992
1074
|
def __init__(self, path, jspark):
|
|
993
1075
|
super(_AutoGGUFLoader, self).__init__(
|
|
994
1076
|
"com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel.loadSavedModel", path, jspark)
|
|
995
|
-
|
|
996
|
-
|
|
1077
|
+
|
|
1078
|
+
|
|
997
1079
|
class _MxbaiEmbeddingsLoader(ExtendedJavaWrapper):
|
|
998
1080
|
def __init__(self, path, jspark):
|
|
999
1081
|
super(_MxbaiEmbeddingsLoader, self).__init__(
|
|
@@ -1021,3 +1103,19 @@ class _BLIPForQuestionAnswering(ExtendedJavaWrapper):
|
|
|
1021
1103
|
path,
|
|
1022
1104
|
jspark,
|
|
1023
1105
|
)
|
|
1106
|
+
|
|
1107
|
+
|
|
1108
|
+
class _AutoGGUFVisionLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``AutoGGUFVisionModel.loadSavedModel`` arguments.

    Unlike the single-path loaders, this one takes two paths: the fully
    qualified Java name, ``modelPath``, ``mmprojPath`` and ``jspark`` are
    handed, in that order, to the ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, modelPath, mmprojPath, jspark):
        java_target = "com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFVisionModel.loadSavedModel"
        super(_AutoGGUFVisionLoader, self).__init__(
            java_target,
            modelPath,
            mmprojPath,
            jspark,
        )
|
|
1112
|
+
|
|
1113
|
+
|
|
1114
|
+
class _Qwen2VLTransformerLoader(ExtendedJavaWrapper):
    """Wrapper that forwards ``Qwen2VLTransformer.loadSavedModel`` arguments.

    The fully qualified Java name, ``path``, ``jspark`` and ``use_openvino``
    (default ``False``) are handed, in that order, to the
    ``ExtendedJavaWrapper`` constructor.
    """

    def __init__(self, path, jspark, use_openvino=False):
        java_target = "com.johnsnowlabs.nlp.annotators.cv.Qwen2VLTransformer.loadSavedModel"
        super(_Qwen2VLTransformerLoader, self).__init__(java_target, path, jspark, use_openvino)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
from pyspark import keyword_only
|
|
2
|
+
from pyspark.ml.param import Param, Params, TypeConverters
|
|
3
|
+
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
|
|
4
|
+
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
|
|
5
|
+
from pyspark.ml.wrapper import JavaTransformer
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
                JavaMLReadable, JavaMLWritable):
    """Extract the text of a PDF document.

    The input column holds the binary representation of a PDF document.
    The output is a column with the extracted text, together with a page
    number; the text is produced either as a single string or as one string
    per page. When splitting by page is enabled, each page is exploded into
    its own row.
    """

    # Name of the output column that receives the page number.
    pageNumCol = Param(
        parent=Params._dummy(),
        name="pageNumCol",
        doc="Page number output column name.",
        typeConverter=TypeConverters.toString,
    )

    # Number of partitions; converted with TypeConverters.toInt.
    partitionNum = Param(
        parent=Params._dummy(),
        name="partitionNum",
        doc="Number of partitions.",
        typeConverter=TypeConverters.toInt,
    )

    # Boolean flag forcing the split PDF to be stored.
    storeSplittedPdf = Param(
        parent=Params._dummy(),
        name="storeSplittedPdf",
        doc="Force to store splitted pdf.",
        typeConverter=TypeConverters.toBoolean,
    )

    @keyword_only
    def __init__(self):
        """Initialise the transformer.

        Creates a ``com.johnsnowlabs.reader.PdfToText`` object through
        ``_new_java_obj``, passing this instance's ``uid``, and stores it in
        ``_java_obj``.
        """
        super(PdfToText, self).__init__()
        java_class = "com.johnsnowlabs.reader.PdfToText"
        self._java_obj = self._new_java_obj(java_class, self.uid)

    def setInputCol(self, value):
        """Set :py:attr:`inputCol` to ``value``; returns what ``_set`` returns."""
        updated = self._set(inputCol=value)
        return updated

    def setOutputCol(self, value):
        """Set :py:attr:`outputCol` to ``value``; returns what ``_set`` returns."""
        updated = self._set(outputCol=value)
        return updated

    def setPageNumCol(self, value):
        """Set :py:attr:`pageNumCol` to ``value``; returns what ``_set`` returns."""
        updated = self._set(pageNumCol=value)
        return updated

    def setPartitionNum(self, value):
        """Set :py:attr:`partitionNum` to ``value``; returns what ``_set`` returns."""
        updated = self._set(partitionNum=value)
        return updated

    def setStoreSplittedPdf(self, value):
        """Set :py:attr:`storeSplittedPdf` to ``value``; returns what ``_set`` returns."""
        updated = self._set(storeSplittedPdf=value)
        return updated
|