spark-nlp 5.5.3__py2.py3-none-any.whl → 6.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of spark-nlp has been flagged for review.
Files changed (37)
  1. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/METADATA +20 -11
  2. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/RECORD +36 -17
  3. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/WHEEL +1 -1
  4. sparknlp/__init__.py +2 -2
  5. sparknlp/annotator/classifier_dl/__init__.py +4 -0
  6. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  7. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +2 -2
  8. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  9. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  10. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  11. sparknlp/annotator/cleaners/__init__.py +15 -0
  12. sparknlp/annotator/cleaners/cleaner.py +202 -0
  13. sparknlp/annotator/cleaners/extractor.py +191 -0
  14. sparknlp/annotator/cv/__init__.py +9 -1
  15. sparknlp/annotator/cv/gemma3_for_multimodal.py +351 -0
  16. sparknlp/annotator/cv/janus_for_multimodal.py +356 -0
  17. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  18. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  19. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  20. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  21. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  22. sparknlp/annotator/cv/smolvlm_transformer.py +432 -0
  23. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +10 -6
  24. sparknlp/annotator/seq2seq/__init__.py +3 -0
  25. sparknlp/annotator/seq2seq/auto_gguf_model.py +8 -503
  26. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +333 -0
  27. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  28. sparknlp/annotator/seq2seq/llama3_transformer.py +4 -4
  29. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  30. sparknlp/base/image_assembler.py +58 -0
  31. sparknlp/common/properties.py +605 -96
  32. sparknlp/internal/__init__.py +127 -2
  33. sparknlp/reader/enums.py +19 -0
  34. sparknlp/reader/pdf_to_text.py +111 -0
  35. sparknlp/reader/sparknlp_reader.py +222 -14
  36. spark_nlp-5.5.3.dist-info/.uuid +0 -1
  37. {spark_nlp-5.5.3.dist-info → spark_nlp-6.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,161 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from sparknlp.common import *
+
+class RoBertaForMultipleChoice(AnnotatorModel,
+                               HasCaseSensitiveProperties,
+                               HasBatchedAnnotate,
+                               HasEngine,
+                               HasMaxSentenceLengthLimit):
+    """RoBertaForMultipleChoice can load RoBERTa models with a multiple choice
+    classification head on top (a linear layer on top of the pooled output and
+    a softmax), e.g. for RocStories/SWAG tasks.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> spanClassifier = RoBertaForMultipleChoice.pretrained() \\
+    ...     .setInputCols(["document_question", "document_context"]) \\
+    ...     .setOutputCol("answer")
+
+    The default model is ``"roberta_base_uncased_multiple_choice"``, if no name
+    is provided.
+
+    For available pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?task=Multiple+Choice>`__.
+
+    To see which models are compatible and how to import them see
+    `Import Transformers into Spark NLP 🚀
+    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, DOCUMENT`` ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Batch size. Larger values allow faster processing but require more
+        memory, by default 4
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default
+        False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    choicesDelimiter
+        Delimiter character used to split the choices, by default ","
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = MultiDocumentAssembler() \\
+    ...     .setInputCols(["question", "context"]) \\
+    ...     .setOutputCols(["document_question", "document_context"])
+    >>> questionAnswering = RoBertaForMultipleChoice.pretrained() \\
+    ...     .setInputCols(["document_question", "document_context"]) \\
+    ...     .setOutputCol("answer") \\
+    ...     .setCaseSensitive(False)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     questionAnswering
+    ... ])
+    >>> data = spark.createDataFrame([["The Eiffel Tower is located in which country?", "Germany, France, Italy"]]).toDF("question", "context")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("answer.result").show(truncate=False)
+    +--------------------+
+    |result              |
+    +--------------------+
+    |[France]            |
+    +--------------------+
+    """
+    name = "RoBertaForMultipleChoice"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    choicesDelimiter = Param(Params._dummy(),
+                             "choicesDelimiter",
+                             "Delimiter character used to split the choices",
+                             TypeConverters.toString)
+
+    def setChoicesDelimiter(self, value):
+        """Sets the delimiter character used to split the choices.
+
+        Parameters
+        ----------
+        value : string
+            Delimiter character used to split the choices
+        """
+        return self._set(choicesDelimiter=value)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForMultipleChoice",
+                 java_model=None):
+        super(RoBertaForMultipleChoice, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=4,
+            maxSentenceLength=512,
+            caseSensitive=False,
+            choicesDelimiter=","
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        RoBertaForMultipleChoice
+            The restored model
+        """
+        from sparknlp.internal import _RoBertaMultipleChoiceLoader
+        jModel = _RoBertaMultipleChoiceLoader(folder, spark_session._jsparkSession)._java_obj
+        return RoBertaForMultipleChoice(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="roberta_base_uncased_multiple_choice", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default
+            "roberta_base_uncased_multiple_choice"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        RoBertaForMultipleChoice
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(RoBertaForMultipleChoice, name, lang, remote_loc)
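
The new choicesDelimiter parameter controls how the context string is split into answer candidates. A minimal sketch of how it might be used, assuming a running Spark NLP session; the pipe-separated choices string and column names are illustrative, not taken from the package:

import sparknlp
from sparknlp.base import MultiDocumentAssembler
from sparknlp.annotator import RoBertaForMultipleChoice
from pyspark.ml import Pipeline

spark = sparknlp.start()

documentAssembler = MultiDocumentAssembler() \
    .setInputCols(["question", "context"]) \
    .setOutputCols(["document_question", "document_context"])

# Choices are joined with "|" here, so the default "," delimiter is overridden.
multipleChoice = RoBertaForMultipleChoice.pretrained() \
    .setInputCols(["document_question", "document_context"]) \
    .setOutputCol("answer") \
    .setChoicesDelimiter("|")

pipeline = Pipeline().setStages([documentAssembler, multipleChoice])
data = spark.createDataFrame(
    [["The Eiffel Tower is located in which country?", "Germany|France|Italy"]]
).toDF("question", "context")
pipeline.fit(data).transform(data).select("answer.result").show(truncate=False)
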
@@ -0,0 +1,149 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from sparknlp.common import *
+
+
+class XlmRoBertaForMultipleChoice(AnnotatorModel,
+                                  HasCaseSensitiveProperties,
+                                  HasBatchedAnnotate,
+                                  HasEngine,
+                                  HasMaxSentenceLengthLimit):
+    """XlmRoBertaForMultipleChoice can load XLM-RoBERTa models with a multiple
+    choice classification head on top (a linear layer on top of the pooled
+    output and a softmax), e.g. for RocStories/SWAG tasks.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> spanClassifier = XlmRoBertaForMultipleChoice.pretrained() \\
+    ...     .setInputCols(["document_question", "document_context"]) \\
+    ...     .setOutputCol("answer")
+
+    The default model is ``"xlm_roberta_base_mc"``, if no name is provided.
+
+    For available pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?task=Multiple+Choice>`__.
+
+    To see which models are compatible and how to import them see
+    `Import Transformers into Spark NLP 🚀
+    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, DOCUMENT`` ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Batch size. Larger values allow faster processing but require more
+        memory, by default 8
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default
+        False
+    maxSentenceLength
+        Max sentence length to process, by default 128
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = MultiDocumentAssembler() \\
+    ...     .setInputCols(["question", "context"]) \\
+    ...     .setOutputCols(["document_question", "document_context"])
+    >>> spanClassifier = XlmRoBertaForMultipleChoice.pretrained() \\
+    ...     .setInputCols(["document_question", "document_context"]) \\
+    ...     .setOutputCol("answer") \\
+    ...     .setCaseSensitive(False)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     spanClassifier
+    ... ])
+    >>> data = spark.createDataFrame([["The Eiffel Tower is located in which country?", "Germany, France, Italy"]]).toDF("question", "context")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("answer.result").show(truncate=False)
+    +--------------------+
+    |result              |
+    +--------------------+
+    |[France]            |
+    +--------------------+
+    """
+    name = "XlmRoBertaForMultipleChoice"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForMultipleChoice",
+                 java_model=None):
+        super(XlmRoBertaForMultipleChoice, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=8,
+            maxSentenceLength=128,
+            caseSensitive=False
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        XlmRoBertaForMultipleChoice
+            The restored model
+        """
+        from sparknlp.internal import _XlmRoBertaMultipleChoiceLoader
+        jModel = _XlmRoBertaMultipleChoiceLoader(folder, spark_session._jsparkSession)._java_obj
+        return XlmRoBertaForMultipleChoice(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="xlm_roberta_base_mc", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default
+            "xlm_roberta_base_mc"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        XlmRoBertaForMultipleChoice
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(XlmRoBertaForMultipleChoice, name, lang, remote_loc)
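
As with the other ForMultipleChoice annotators, a locally exported model can be imported with loadSavedModel and then persisted for reuse. A minimal sketch, assuming an active SparkSession named spark; the folder paths are hypothetical placeholders:

from sparknlp.annotator import XlmRoBertaForMultipleChoice

# "/tmp/xlm_roberta_mc_export" is a hypothetical folder containing a model
# exported for Spark NLP import (see the linked discussion above).
model = XlmRoBertaForMultipleChoice.loadSavedModel("/tmp/xlm_roberta_mc_export", spark) \
    .setInputCols(["document_question", "document_context"]) \
    .setOutputCol("answer")

# Persist the imported model so later runs can load it without re-importing.
model.write().overwrite().save("/tmp/xlm_roberta_mc_spark_nlp")
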
@@ -0,0 +1,15 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from sparknlp.annotator.cleaners.extractor import *
+from sparknlp.annotator.cleaners.cleaner import *
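
Because the package initializer wildcard-imports both modules, the new annotators can be reached directly from the package path; a one-line sketch (the Extractor class name is an assumption based on the extractor.py file listed above):

from sparknlp.annotator.cleaners import Cleaner, Extractor  # Extractor name assumed from extractor.py
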
@@ -0,0 +1,202 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for Cleaner."""
+from sparknlp.annotator import MarianTransformer
+from sparknlp.common import *
+
+class Cleaner(MarianTransformer):
+    name = "Cleaner"
+
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    encoding = Param(Params._dummy(),
+                     "encoding",
+                     "The encoding to be used for decoding the byte string (default is utf-8)",
+                     typeConverter=TypeConverters.toString)
+
+    cleanPrefixPattern = Param(Params._dummy(),
+                               "cleanPrefixPattern",
+                               "The pattern for the prefix. Can be a simple string or a regex pattern.",
+                               typeConverter=TypeConverters.toString)
+
+    cleanPostfixPattern = Param(Params._dummy(),
+                                "cleanPostfixPattern",
+                                "The pattern for the postfix. Can be a simple string or a regex pattern.",
+                                typeConverter=TypeConverters.toString)
+
+    cleanerMode = Param(
+        Params._dummy(),
+        "cleanerMode",
+        "possible values: " +
+        "clean, bytes_string_to_string, clean_non_ascii_chars, clean_ordered_bullets, clean_postfix, clean_prefix, remove_punctuation, replace_unicode_quotes",
+        typeConverter=TypeConverters.toString
+    )
+
+    extraWhitespace = Param(Params._dummy(),
+                            "extraWhitespace",
+                            "Whether to remove extra whitespace.",
+                            typeConverter=TypeConverters.toBoolean)
+
+    dashes = Param(Params._dummy(),
+                   "dashes",
+                   "Whether to handle dashes in text.",
+                   typeConverter=TypeConverters.toBoolean)
+
+    bullets = Param(Params._dummy(),
+                    "bullets",
+                    "Whether to handle bullets in text.",
+                    typeConverter=TypeConverters.toBoolean)
+
+    trailingPunctuation = Param(Params._dummy(),
+                                "trailingPunctuation",
+                                "Whether to remove trailing punctuation from text.",
+                                typeConverter=TypeConverters.toBoolean)
+
+    lowercase = Param(Params._dummy(),
+                      "lowercase",
+                      "Whether to convert text to lowercase.",
+                      typeConverter=TypeConverters.toBoolean)
+
+    ignoreCase = Param(Params._dummy(),
+                       "ignoreCase",
+                       "If true, ignores case in the pattern.",
+                       typeConverter=TypeConverters.toBoolean)
+
+    strip = Param(Params._dummy(),
+                  "strip",
+                  "If true, removes leading or trailing whitespace from the cleaned string.",
+                  typeConverter=TypeConverters.toBoolean)
+
+    def setEncoding(self, value):
+        """Sets the encoding to be used for decoding the byte string (default is utf-8).
+
+        Parameters
+        ----------
+        value : str
+            The encoding to be used for decoding the byte string (default is utf-8)
+        """
+        return self._set(encoding=value)
+
+    def setCleanPrefixPattern(self, value):
+        """Sets the pattern for the prefix. Can be a simple string or a regex pattern.
+
+        Parameters
+        ----------
+        value : str
+            The pattern for the prefix. Can be a simple string or a regex pattern.
+        """
+        return self._set(cleanPrefixPattern=value)
+
+    def setCleanPostfixPattern(self, value):
+        """Sets the pattern for the postfix. Can be a simple string or a regex pattern.
+
+        Parameters
+        ----------
+        value : str
+            The pattern for the postfix. Can be a simple string or a regex pattern.
+        """
+        return self._set(cleanPostfixPattern=value)
+
+    def setCleanerMode(self, value):
+        """Sets the cleaner mode.
+
+        Possible values:
+        clean, bytes_string_to_string, clean_non_ascii_chars, clean_ordered_bullets,
+        clean_postfix, clean_prefix, remove_punctuation, replace_unicode_quotes
+
+        Parameters
+        ----------
+        value : str
+            The mode for cleaning operations.
+        """
+        return self._set(cleanerMode=value)
+
+    def setExtraWhitespace(self, value):
+        """Sets whether to remove extra whitespace.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to remove extra whitespace.
+        """
+        return self._set(extraWhitespace=value)
+
+    def setDashes(self, value):
+        """Sets whether to handle dashes in text.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to handle dashes in text.
+        """
+        return self._set(dashes=value)
+
+    def setBullets(self, value):
+        """Sets whether to handle bullets in text.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to handle bullets in text.
+        """
+        return self._set(bullets=value)
+
+    def setTrailingPunctuation(self, value):
+        """Sets whether to remove trailing punctuation from text.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to remove trailing punctuation from text.
+        """
+        return self._set(trailingPunctuation=value)
+
+    def setLowercase(self, value):
+        """Sets whether to convert text to lowercase.
+
+        Parameters
+        ----------
+        value : bool
+            Whether to convert text to lowercase.
+        """
+        return self._set(lowercase=value)
+
+    def setIgnoreCase(self, value):
+        """Sets whether to ignore case in the pattern.
+
+        Parameters
+        ----------
+        value : bool
+            If true, ignores case in the pattern.
+        """
+        return self._set(ignoreCase=value)
+
+    def setStrip(self, value):
+        """Sets whether to remove leading or trailing whitespace from the cleaned string.
+
+        Parameters
+        ----------
+        value : bool
+            If true, removes leading or trailing whitespace from the cleaned string.
+        """
+        return self._set(strip=value)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cleaners.Cleaner", java_model=None):
+        super(Cleaner, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
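
A minimal sketch of how the Cleaner might be wired into a pipeline, following the TOKEN input type declared above. The upstream stages, column names, and sample text are assumptions for illustration, not taken from the package:

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer
from sparknlp.annotator.cleaners import Cleaner
from pyspark.ml import Pipeline

spark = sparknlp.start()

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# "clean" mode with whitespace, dash, and bullet handling enabled; every
# setter used here is defined on the Cleaner class above.
cleaner = Cleaner() \
    .setInputCols(["token"]) \
    .setOutputCol("cleaned") \
    .setCleanerMode("clean") \
    .setExtraWhitespace(True) \
    .setDashes(True) \
    .setBullets(True) \
    .setLowercase(True)

pipeline = Pipeline().setStages([documentAssembler, tokenizer, cleaner])
data = spark.createDataFrame([["- Some TEXT  with   extra   spaces"]]).toDF("text")
pipeline.fit(data).transform(data).select("cleaned.result").show(truncate=False)
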