spark-nlp 6.0.1rc1__py2.py3-none-any.whl → 6.0.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of spark-nlp might be problematic. See the advisory details below for more information.

Files changed (36):
  1. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/METADATA +13 -6
  2. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/RECORD +36 -30
  3. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/WHEEL +1 -1
  4. sparknlp/__init__.py +4 -2
  5. sparknlp/annotator/cv/__init__.py +2 -0
  6. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  7. sparknlp/annotator/cv/gemma3_for_multimodal.py +5 -10
  8. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  9. sparknlp/annotator/cv/janus_for_multimodal.py +8 -13
  10. sparknlp/annotator/cv/llava_for_multimodal.py +1 -1
  11. sparknlp/annotator/cv/paligemma_for_multimodal.py +7 -7
  12. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +1 -1
  13. sparknlp/annotator/cv/qwen2vl_transformer.py +1 -1
  14. sparknlp/annotator/cv/smolvlm_transformer.py +7 -13
  15. sparknlp/annotator/date2_chunk.py +1 -1
  16. sparknlp/annotator/document_character_text_splitter.py +8 -8
  17. sparknlp/annotator/document_token_splitter.py +7 -7
  18. sparknlp/annotator/embeddings/bge_embeddings.py +21 -19
  19. sparknlp/annotator/embeddings/snowflake_embeddings.py +15 -15
  20. sparknlp/annotator/openai/openai_completion.py +3 -4
  21. sparknlp/annotator/seq2seq/m2m100_transformer.py +1 -1
  22. sparknlp/annotator/seq2seq/mistral_transformer.py +2 -3
  23. sparknlp/annotator/seq2seq/nllb_transformer.py +1 -1
  24. sparknlp/annotator/seq2seq/qwen_transformer.py +26 -25
  25. sparknlp/annotator/spell_check/context_spell_checker.py +1 -1
  26. sparknlp/base/prompt_assembler.py +1 -1
  27. sparknlp/common/properties.py +7 -7
  28. sparknlp/internal/__init__.py +19 -0
  29. sparknlp/partition/__init__.py +16 -0
  30. sparknlp/partition/partition.py +244 -0
  31. sparknlp/partition/partition_properties.py +257 -0
  32. sparknlp/partition/partition_transformer.py +196 -0
  33. sparknlp/reader/pdf_to_text.py +50 -4
  34. sparknlp/reader/sparknlp_reader.py +56 -52
  35. sparknlp/training/spacy_to_annotation.py +7 -7
  36. {spark_nlp-6.0.1rc1.dist-info → spark_nlp-6.0.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,280 @@
1
+ from sparknlp.common import *
2
+
3
+ class InternVLForMultiModal(AnnotatorModel,
4
+ HasBatchedAnnotateImage,
5
+ HasImageFeatureProperties,
6
+ HasEngine,
7
+ HasGeneratorProperties):
8
+ """
9
+ InternVLForMultiModal can load InternVL Vision models for visual question answering.
10
+ The model consists of a vision encoder, a text encoder, a text decoder and a model merger.
11
+ The vision encoder will encode the input image, the text encoder will encode the input text,
12
+ the model merger will merge the image and text embeddings, and the text decoder will output the answer.
13
+
14
+ InternVL 2.5 is an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0,
15
+ maintaining its core model architecture while introducing significant enhancements in training and testing
16
+ strategies as well as data quality. Key features include:
17
+ - Large context window support
18
+ - Multilingual support
19
+ - Multimodal capabilities handling both text and image inputs
20
+ - Optimized for deployment with int4 quantization
21
+
22
+ Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
23
+ >>> visualQA = InternVLForMultiModal.pretrained() \\
24
+ ... .setInputCols("image_assembler") \\
25
+ ... .setOutputCol("answer")
26
+
27
+ The default model is `"internvl2_5_1b_int4"`, if no name is provided.
28
+ For available pretrained models, refer to the `Models Hub
29
+ <https://sparknlp.org/models?task=Question+Answering>`__.
30
+
31
+ ====================== ======================
32
+ Input Annotation types Output Annotation type
33
+ ====================== ======================
34
+ ``IMAGE`` ``DOCUMENT``
35
+ ====================== ======================
36
+
37
+ Parameters
38
+ ----------
39
+ batchSize : int, optional
40
+ Batch size. Larger values allow faster processing but require more memory,
41
+ by default 1.
42
+ maxSentenceLength : int, optional
43
+ Maximum sentence length to process, by default 4096.
44
+
45
+ Examples
46
+ --------
47
+ >>> import sparknlp
48
+ >>> from sparknlp.base import *
49
+ >>> from sparknlp.annotator import *
50
+ >>> from pyspark.ml import Pipeline
51
+ >>> from pyspark.sql.functions import lit
52
+ >>> image_df = spark.read.format("image").load(path=images_path)
53
+ >>> test_df = image_df.withColumn(
54
+ ... "text",
55
+ ... lit("<|im_start|><image>\\nDescribe this image in detail.<|im_end|><|im_start|>assistant\\n")
56
+ ... )
57
+ >>> imageAssembler = ImageAssembler() \\
58
+ ... .setInputCol("image") \\
59
+ ... .setOutputCol("image_assembler")
60
+ >>> visualQA = InternVLForMultiModal.pretrained() \\
61
+ ... .setInputCols("image_assembler") \\
62
+ ... .setOutputCol("answer")
63
+ >>> pipeline = Pipeline().setStages([
64
+ ... imageAssembler,
65
+ ... visualQA
66
+ ... ])
67
+
68
+ >>> result = pipeline.fit(test_df).transform(test_df)
69
+ >>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
70
+ """
71
+
72
+ name = "InternVLForMultiModal"
73
+
74
+ inputAnnotatorTypes = [AnnotatorType.IMAGE]
75
+
76
+ outputAnnotatorType = AnnotatorType.DOCUMENT
77
+
78
+ minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
79
+ typeConverter=TypeConverters.toInt)
80
+
81
+ maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
82
+ typeConverter=TypeConverters.toInt)
83
+
84
+ doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
85
+ typeConverter=TypeConverters.toBoolean)
86
+
87
+ temperature = Param(Params._dummy(), "temperature", "The value used to module the next token probabilities",
88
+ typeConverter=TypeConverters.toFloat)
89
+
90
+ topK = Param(Params._dummy(), "topK",
91
+ "The number of highest probability vocabulary tokens to keep for top-k-filtering",
92
+ typeConverter=TypeConverters.toInt)
93
+
94
+ topP = Param(Params._dummy(), "topP",
95
+ "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
96
+ typeConverter=TypeConverters.toFloat)
97
+
98
+ repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
99
+ "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
100
+ typeConverter=TypeConverters.toFloat)
101
+
102
+ noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
103
+ "If set to int > 0, all ngrams of that size can only occur once",
104
+ typeConverter=TypeConverters.toInt)
105
+
106
+ ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
107
+ "A list of token ids which are ignored in the decoder's output",
108
+ typeConverter=TypeConverters.toListInt)
109
+
110
+ beamSize = Param(Params._dummy(), "beamSize",
111
+ "The Number of beams for beam search.",
112
+ typeConverter=TypeConverters.toInt)
113
+
114
+ def setMaxSentenceSize(self, value):
115
+ """Sets Maximum sentence length that the annotator will process, by
116
+ default 4096.
117
+ Parameters
118
+ ----------
119
+ value : int
120
+ Maximum sentence length that the annotator will process
121
+ """
122
+ return self._set(maxSentenceLength=value)
123
+
124
+ def setIgnoreTokenIds(self, value):
125
+ """A list of token ids which are ignored in the decoder's output.
126
+ Parameters
127
+ ----------
128
+ value : List[int]
129
+ The words to be filtered out
130
+ """
131
+ return self._set(ignoreTokenIds=value)
132
+
133
+ def setMinOutputLength(self, value):
134
+ """Sets minimum length of the sequence to be generated.
135
+ Parameters
136
+ ----------
137
+ value : int
138
+ Minimum length of the sequence to be generated
139
+ """
140
+ return self._set(minOutputLength=value)
141
+
142
+ def setMaxOutputLength(self, value):
143
+ """Sets maximum length of output text.
144
+ Parameters
145
+ ----------
146
+ value : int
147
+ Maximum length of output text
148
+ """
149
+ return self._set(maxOutputLength=value)
150
+
151
+ def setDoSample(self, value):
152
+ """Sets whether or not to use sampling, use greedy decoding otherwise.
153
+ Parameters
154
+ ----------
155
+ value : bool
156
+ Whether or not to use sampling; use greedy decoding otherwise
157
+ """
158
+ return self._set(doSample=value)
159
+
160
+ def setTemperature(self, value):
161
+ """Sets the value used to module the next token probabilities.
162
+ Parameters
163
+ ----------
164
+ value : float
165
+ The value used to module the next token probabilities
166
+ """
167
+ return self._set(temperature=value)
168
+
169
+ def setTopK(self, value):
170
+ """Sets the number of highest probability vocabulary tokens to keep for
171
+ top-k-filtering.
172
+ Parameters
173
+ ----------
174
+ value : int
175
+ Number of highest probability vocabulary tokens to keep
176
+ """
177
+ return self._set(topK=value)
178
+
179
+ def setTopP(self, value):
180
+ """Sets the top cumulative probability for vocabulary tokens.
181
+ If set to float < 1, only the most probable tokens with probabilities
182
+ that add up to ``topP`` or higher are kept for generation.
183
+ Parameters
184
+ ----------
185
+ value : float
186
+ Cumulative probability for vocabulary tokens
187
+ """
188
+ return self._set(topP=value)
189
+
190
+ def setRepetitionPenalty(self, value):
191
+ """Sets the parameter for repetition penalty. 1.0 means no penalty.
192
+ Parameters
193
+ ----------
194
+ value : float
195
+ The repetition penalty
196
+ References
197
+ ----------
198
+ See `Ctrl: A Conditional Transformer Language Model For Controllable
199
+ Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
200
+ """
201
+ return self._set(repetitionPenalty=value)
202
+
203
+ def setNoRepeatNgramSize(self, value):
204
+ """Sets size of n-grams that can only occur once.
205
+ If set to int > 0, all ngrams of that size can only occur once.
206
+ Parameters
207
+ ----------
208
+ value : int
209
+ N-gram size can only occur once
210
+ """
211
+ return self._set(noRepeatNgramSize=value)
212
+
213
+ def setBeamSize(self, value):
214
+ """Sets the number of beam size for beam search, by default `1`.
215
+ Parameters
216
+ ----------
217
+ value : int
218
+ Number of beam size for beam search
219
+ """
220
+ return self._set(beamSize=value)
221
+
222
+ @keyword_only
223
+ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.InternVLForMultiModal",
224
+ java_model=None):
225
+ super(InternVLForMultiModal, self).__init__(
226
+ classname=classname,
227
+ java_model=java_model
228
+ )
229
+ self._setDefault(
230
+ batchSize=1,
231
+ minOutputLength=0,
232
+ maxOutputLength=20,
233
+ doSample=False,
234
+ temperature=0.6,
235
+ topK=-1,
236
+ topP=0.9,
237
+ repetitionPenalty=1.0,
238
+ noRepeatNgramSize=3,
239
+ ignoreTokenIds=[],
240
+ beamSize=1
241
+ )
242
+
243
+ @staticmethod
244
+ def loadSavedModel(folder, spark_session, use_openvino=False):
245
+ """Loads a locally saved model.
246
+ Parameters
247
+ ----------
248
+ folder : str
249
+ Folder of the saved model
250
+ spark_session : pyspark.sql.SparkSession
251
+ The current SparkSession
252
+ Returns
253
+ -------
254
+ InternVLForMultiModal
255
+ The restored model
256
+ """
257
+ from sparknlp.internal import _InternVLForMultiModalLoader
258
+ jModel = _InternVLForMultiModalLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
259
+ return InternVLForMultiModal(java_model=jModel)
260
+
261
+ @staticmethod
262
+ def pretrained(name="internvl2_5_1b_int4", lang="en", remote_loc=None):
263
+ """Downloads and loads a pretrained model.
264
+ Parameters
265
+ ----------
266
+ name : str, optional
267
+ Name of the pretrained model, by default
268
+ "internvl2_5_1b_int4"
269
+ lang : str, optional
270
+ Language of the pretrained model, by default "en"
271
+ remote_loc : str, optional
272
+ Optional remote address of the resource, by default None. Will use
273
+ Spark NLPs repositories otherwise.
274
+ Returns
275
+ -------
276
+ InternVLForMultiModal
277
+ The restored model
278
+ """
279
+ from sparknlp.pretrained import ResourceDownloader
280
+ return ResourceDownloader.downloadModel(InternVLForMultiModal, name, lang, remote_loc)
@@ -36,8 +36,9 @@ class JanusForMultiModal(AnnotatorModel,
36
36
  and for image generation, it uses a tokenizer with a downsample rate of 16.
37
37
 
38
38
  Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
39
- >>> visualQAClassifier = JanusForMultiModal.pretrained() \
40
- ... .setInputCols(["image_assembler"]) \
39
+
40
+ >>> visualQAClassifier = JanusForMultiModal.pretrained() \\
41
+ ... .setInputCols(["image_assembler"]) \\
41
42
  ... .setOutputCol("answer")
42
43
 
43
44
  The default model is `"janus_1_3b_int4"`, if no name is provided.
@@ -73,29 +74,23 @@ class JanusForMultiModal(AnnotatorModel,
73
74
  >>> from sparknlp.annotator import *
74
75
  >>> from pyspark.ml import Pipeline
75
76
  >>> from pyspark.sql.functions import lit
76
-
77
77
  >>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
78
78
  >>> test_df = image_df.withColumn(
79
79
  ... "text",
80
- ... lit("User: <image_placeholder>Describe image in details\n\nAssistant:")
80
+ ... lit("User: <image_placeholder>Describe image in details\\n\\nAssistant:")
81
81
  ... )
82
-
83
- >>> imageAssembler = ImageAssembler() \
84
- ... .setInputCol("image") \
82
+ >>> imageAssembler = ImageAssembler() \\
83
+ ... .setInputCol("image") \\
85
84
  ... .setOutputCol("image_assembler")
86
-
87
- >>> visualQAClassifier = JanusForMultiModal.pretrained() \
88
- ... .setInputCols("image_assembler") \
85
+ >>> visualQAClassifier = JanusForMultiModal.pretrained() \\
86
+ ... .setInputCols("image_assembler") \\
89
87
  ... .setOutputCol("answer")
90
-
91
88
  >>> pipeline = Pipeline().setStages([
92
89
  ... imageAssembler,
93
90
  ... visualQAClassifier
94
91
  ... ])
95
-
96
92
  >>> result = pipeline.fit(test_df).transform(test_df)
97
93
  >>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
98
-
99
94
  +--------------------------------------+----------------------------------------------------------------------+
100
95
  |origin |result |
101
96
  +--------------------------------------+----------------------------------------------------------------------+
@@ -65,7 +65,7 @@ class LLAVAForMultiModal(AnnotatorModel,
65
65
  >>> from sparknlp.annotator import *
66
66
  >>> from pyspark.ml import Pipeline
67
67
  >>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
68
- >>> test_df = image_df.withColumn("text", lit("USER: \n <|image|> \n What's this picture about? \n ASSISTANT:\n"))
68
+ >>> test_df = image_df.withColumn("text", lit("USER: \\n <|image|> \\n What's this picture about? \\n ASSISTANT:\\n"))
69
69
  >>> imageAssembler = ImageAssembler() \\
70
70
  ... .setInputCol("image") \\
71
71
  ... .setOutputCol("image_assembler")
@@ -28,8 +28,8 @@ class PaliGemmaForMultiModal(AnnotatorModel,
28
28
  Pretrained models can be loaded with :meth:`.pretrained` of the companion
29
29
  object:
30
30
 
31
- >>> visualQAClassifier = PaliGemmaForMultiModal.pretrained() \
32
- ... .setInputCols(["image_assembler"]) \
31
+ >>> visualQAClassifier = PaliGemmaForMultiModal.pretrained() \\
32
+ ... .setInputCols(["image_assembler"]) \\
33
33
  ... .setOutputCol("answer")
34
34
 
35
35
  The default model is ``"paligemma_3b_pt_224_int4"``, if no name is
@@ -59,12 +59,12 @@ class PaliGemmaForMultiModal(AnnotatorModel,
59
59
  >>> from sparknlp.annotator import *
60
60
  >>> from pyspark.ml import Pipeline
61
61
  >>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
62
- >>> test_df = image_df.withColumn("text", lit("USER: \n <image> \nDescribe this image. \nASSISTANT:\n"))
63
- >>> imageAssembler = ImageAssembler() \
64
- ... .setInputCol("image") \
62
+ >>> test_df = image_df.withColumn("text", lit("USER: \\n <image> \\nDescribe this image. \\nASSISTANT:\\n"))
63
+ >>> imageAssembler = ImageAssembler() \\
64
+ ... .setInputCol("image") \\
65
65
  ... .setOutputCol("image_assembler")
66
- >>> visualQAClassifier = PaliGemmaForMultiModal.pretrained() \
67
- ... .setInputCols("image_assembler") \
66
+ >>> visualQAClassifier = PaliGemmaForMultiModal.pretrained() \\
67
+ ... .setInputCols("image_assembler") \\
68
68
  ... .setOutputCol("answer")
69
69
  >>> pipeline = Pipeline().setStages([
70
70
  ... imageAssembler,
@@ -65,7 +65,7 @@ class Phi3Vision(AnnotatorModel,
65
65
  >>> from sparknlp.annotator import *
66
66
  >>> from pyspark.ml import Pipeline
67
67
  >>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
68
- >>> test_df = image_df.withColumn("text", lit("<|user|> \n <|image_1|> \nWhat is unusual on this picture? <|end|>\n <|assistant|>\n"))
68
+ >>> test_df = image_df.withColumn("text", lit("<|user|> \\n <|image_1|> \\nWhat is unusual on this picture? <|end|>\\n <|assistant|>\\n"))
69
69
  >>> imageAssembler = ImageAssembler() \\
70
70
  ... .setInputCol("image") \\
71
71
  ... .setOutputCol("image_assembler")
@@ -68,7 +68,7 @@ class Qwen2VLTransformer(AnnotatorModel,
68
68
  >>> from sparknlp.annotator import *
69
69
  >>> from pyspark.ml import Pipeline
70
70
  >>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path)
71
- >>> test_df = image_df.withColumn("text", lit("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n"))
71
+ >>> test_df = image_df.withColumn("text", lit("<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n"))
72
72
  >>> imageAssembler = ImageAssembler() \\
73
73
  ... .setInputCol("image") \\
74
74
  ... .setOutputCol("image_assembler")
@@ -33,8 +33,8 @@ class SmolVLMTransformer(AnnotatorModel,
33
33
  while maintaining strong performance on multimodal tasks.
34
34
 
35
35
  Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
36
- >>> visualQA = SmolVLMTransformer.pretrained() \
37
- ... .setInputCols(["image_assembler"]) \
36
+ >>> visualQA = SmolVLMTransformer.pretrained() \\
37
+ ... .setInputCols(["image_assembler"]) \\
38
38
  ... .setOutputCol("answer")
39
39
 
40
40
  The default model is `"smolvlm_instruct_int4"`, if no name is provided.
@@ -82,29 +82,23 @@ class SmolVLMTransformer(AnnotatorModel,
82
82
  >>> from sparknlp.annotator import *
83
83
  >>> from pyspark.ml import Pipeline
84
84
  >>> from pyspark.sql.functions import lit
85
-
86
85
  >>> imageDF = spark.read.format("image").load(path=images_path)
87
86
  >>> testDF = imageDF.withColumn(
88
87
  ... "text",
89
- ... lit("<|im_start|>User:<image>Can you describe the image?<end_of_utterance>\nAssistant:")
88
+ ... lit("<|im_start|>User:<image>Can you describe the image?<end_of_utterance>\\nAssistant:")
90
89
  ... )
91
-
92
- >>> imageAssembler = ImageAssembler() \
93
- ... .setInputCol("image") \
90
+ >>> imageAssembler = ImageAssembler() \\
91
+ ... .setInputCol("image") \\
94
92
  ... .setOutputCol("image_assembler")
95
-
96
- >>> visualQAClassifier = SmolVLMTransformer.pretrained() \
97
- ... .setInputCols("image_assembler") \
93
+ >>> visualQAClassifier = SmolVLMTransformer.pretrained() \\
94
+ ... .setInputCols("image_assembler") \\
98
95
  ... .setOutputCol("answer")
99
-
100
96
  >>> pipeline = Pipeline().setStages([
101
97
  ... imageAssembler,
102
98
  ... visualQAClassifier
103
99
  ... ])
104
-
105
100
  >>> result = pipeline.fit(testDF).transform(testDF)
106
101
  >>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
107
-
108
102
  +--------------------------------------+----------------------------------------------------------------------+
109
103
  |origin |result |
110
104
  +--------------------------------------+----------------------------------------------------------------------+
@@ -24,7 +24,7 @@ class Date2Chunk(AnnotatorModel):
24
24
  ====================== ======================
25
25
  Input Annotation types Output Annotation type
26
26
  ====================== ======================
27
- ``DATE`` ``CHUNK``
27
+ ``DATE`` ``CHUNK``
28
28
  ====================== ======================
29
29
 
30
30
  Parameters
@@ -55,7 +55,7 @@ class DocumentCharacterTextSplitter(AnnotatorModel):
55
55
  chunkOverlap
56
56
  Length of the overlap between text chunks , by default `0`.
57
57
  splitPatterns
58
- Patterns to separate the text by in decreasing priority , by default `["\n\n", "\n", " ", ""]`.
58
+ Patterns to separate the text by in decreasing priority , by default `["\\n\\n", "\\n", " ", ""]`.
59
59
  patternsAreRegex
60
60
  Whether to interpret the split patterns as regular expressions , by default `False`.
61
61
  keepSeparators
@@ -94,13 +94,13 @@ class DocumentCharacterTextSplitter(AnnotatorModel):
94
94
  | result|splits[0].begin|splits[0].end|length|
95
95
  +--------------------------------------------------------------------------------+---------------+-------------+------+
96
96
  |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...| 0| 19994| 19994|
97
- |["And Mademoiselle's address?" he asked.\n\n"Is Briony Lodge, Serpentine Aven...| 19798| 39395| 19597|
98
- |["How did that help you?"\n\n"It was all-important. When a woman thinks that ...| 39371| 59242| 19871|
99
- |["'But,' said I, 'there would be millions of red-headed men who\nwould apply....| 59166| 77833| 18667|
100
- |[My friend was an enthusiastic musician, being himself not only a\nvery capab...| 77835| 97769| 19934|
101
- |["And yet I am not convinced of it," I answered. "The cases which\ncome to li...| 97771| 117248| 19477|
102
- |["Well, she had a slate-coloured, broad-brimmed straw hat, with a\nfeather of...| 117250| 137242| 19992|
103
- |["That sounds a little paradoxical."\n\n"But it is profoundly True. Singulari...| 137244| 157171| 19927|
97
+ |["And Mademoiselle's address?" he asked.\\n\\n"Is Briony Lodge, Serpentine Aven...| 19798| 39395| 19597|
98
+ |["How did that help you?"\\n\\n"It was all-important. When a woman thinks that ...| 39371| 59242| 19871|
99
+ |["'But,' said I, 'there would be millions of red-headed men who\\nwould apply....| 59166| 77833| 18667|
100
+ |[My friend was an enthusiastic musician, being himself not only a\\nvery capab...| 77835| 97769| 19934|
101
+ |["And yet I am not convinced of it," I answered. "The cases which\\ncome to li...| 97771| 117248| 19477|
102
+ |["Well, she had a slate-coloured, broad-brimmed straw hat, with a\\nfeather of...| 117250| 137242| 19992|
103
+ |["That sounds a little paradoxical."\\n\\n"But it is profoundly True. Singulari...| 137244| 157171| 19927|
104
104
  +--------------------------------------------------------------------------------+---------------+-------------+------+
105
105
 
106
106
  """
@@ -88,13 +88,13 @@ class DocumentTokenSplitter(AnnotatorModel):
88
88
  | result|begin| end|length|tokens|
89
89
  +--------------------------------------------------------------------------------+-----+-----+------+------+
90
90
  |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...| 0| 3018| 3018| 512|
91
- |[study of crime, and occupied his\nimmense faculties and extraordinary powers...| 2950| 5707| 2757| 512|
92
- |[but as I have changed my clothes I can't imagine how you\ndeduce it. As to M...| 5659| 8483| 2824| 512|
93
- |[quarters received. Be in your chamber then at that hour, and do\nnot take it...| 8427|11241| 2814| 512|
94
- |[a pity\nto miss it."\n\n"But your client--"\n\n"Never mind him. I may want y...|11188|13970| 2782| 512|
95
- |[person who employs me wishes his agent to be unknown to\nyou, and I may conf...|13918|16898| 2980| 512|
96
- |[letters back."\n\n"Precisely so. But how--"\n\n"Was there a secret marriage?...|16836|19744| 2908| 512|
97
- |[seven hundred in\nnotes," he said.\n\nHolmes scribbled a receipt upon a shee...|19683|22551| 2868| 512|
91
+ |[study of crime, and occupied his\\nimmense faculties and extraordinary powers...| 2950| 5707| 2757| 512|
92
+ |[but as I have changed my clothes I can't imagine how you\\ndeduce it. As to M...| 5659| 8483| 2824| 512|
93
+ |[quarters received. Be in your chamber then at that hour, and do\\nnot take it...| 8427|11241| 2814| 512|
94
+ |[a pity\\nto miss it."\\n\\n"But your client--"\\n\\n"Never mind him. I may want y...|11188|13970| 2782| 512|
95
+ |[person who employs me wishes his agent to be unknown to\\nyou, and I may conf...|13918|16898| 2980| 512|
96
+ |[letters back."\\n\\n"Precisely so. But how--"\\n\\n"Was there a secret marriage?...|16836|19744| 2908| 512|
97
+ |[seven hundred in\\nnotes," he said.\\n\\nHolmes scribbled a receipt upon a shee...|19683|22551| 2868| 512|
98
98
  +--------------------------------------------------------------------------------+-----+-----+------+------+
99
99
 
100
100
  """
@@ -49,23 +49,9 @@ class BGEEmbeddings(AnnotatorModel,
49
49
  ``DOCUMENT`` ``SENTENCE_EMBEDDINGS``
50
50
  ====================== ======================
51
51
 
52
- Parameters
53
- ----------
54
- batchSize
55
- Size of every batch , by default 8
56
- dimension
57
- Number of embedding dimensions, by default 768
58
- caseSensitive
59
- Whether to ignore case in tokens for embeddings matching, by default False
60
- maxSentenceLength
61
- Max sentence length to process, by default 512
62
- configProtoBytes
63
- ConfigProto from tensorflow, serialized into byte array.
64
- useCLSToken
65
- Whether to use the CLS token for sentence embeddings, by default True
66
-
67
- References
68
- ----------
52
+
53
+ **References**
54
+
69
55
  `C-Pack: Packaged Resources To Advance General Chinese Embedding <https://arxiv.org/pdf/2309.07597>`__
70
56
  `BGE Github Repository <https://github.com/FlagOpen/FlagEmbedding>`__
71
57
 
@@ -84,6 +70,22 @@ class BGEEmbeddings(AnnotatorModel,
84
70
  benchmark; meanwhile, our released English data is 2 times larger than the Chinese data. All
85
71
  these resources are made publicly available at https://github.com/FlagOpen/FlagEmbedding.*
86
72
 
73
+
74
+ Parameters
75
+ ----------
76
+ batchSize
77
+ Size of every batch , by default 8
78
+ dimension
79
+ Number of embedding dimensions, by default 768
80
+ caseSensitive
81
+ Whether to ignore case in tokens for embeddings matching, by default False
82
+ maxSentenceLength
83
+ Max sentence length to process, by default 512
84
+ configProtoBytes
85
+ ConfigProto from tensorflow, serialized into byte array.
86
+ useCLSToken
87
+ Whether to use the CLS token for sentence embeddings, by default True
88
+
87
89
  Examples
88
90
  --------
89
91
  >>> import sparknlp
@@ -106,8 +108,8 @@ class BGEEmbeddings(AnnotatorModel,
106
108
  ... embeddingsFinisher
107
109
  ... ])
108
110
  >>> data = spark.createDataFrame([["query: how much protein should a female eat",
109
- ... "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + \
110
- ... "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + \
111
+ ... "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + \\
112
+ ... "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + \\
111
113
  ... "marathon. Check out the chart below to see how much protein you should be eating each day.",
112
114
  ... ]]).toDF("text")
113
115
  >>> result = pipeline.fit(data).transform(data)
@@ -47,21 +47,7 @@ class SnowFlakeEmbeddings(AnnotatorModel,
47
47
  ``DOCUMENT`` ``SENTENCE_EMBEDDINGS``
48
48
  ====================== ======================
49
49
 
50
- Parameters
51
- ----------
52
- batchSize
53
- Size of every batch , by default 8
54
- dimension
55
- Number of embedding dimensions, by default 768
56
- caseSensitive
57
- Whether to ignore case in tokens for embeddings matching, by default False
58
- maxSentenceLength
59
- Max sentence length to process, by default 512
60
- configProtoBytes
61
- ConfigProto from tensorflow, serialized into byte array.
62
-
63
- References
64
- ----------
50
+ **References**
65
51
 
66
52
  `Arctic-Embed: Scalable, Efficient, and Accurate Text Embedding Models <https://arxiv.org/abs/2405.05374>`__
67
53
  `Snowflake Arctic-Embed Models <https://github.com/Snowflake-Labs/arctic-embed>`__
@@ -78,6 +64,20 @@ class SnowFlakeEmbeddings(AnnotatorModel,
78
64
  data curation is crucial to retrieval accuracy. A detailed technical report will be available
79
65
  shortly. *
80
66
 
67
+ Parameters
68
+ ----------
69
+ batchSize
70
+ Size of every batch , by default 8
71
+ dimension
72
+ Number of embedding dimensions, by default 768
73
+ caseSensitive
74
+ Whether to ignore case in tokens for embeddings matching, by default False
75
+ maxSentenceLength
76
+ Max sentence length to process, by default 512
77
+ configProtoBytes
78
+ ConfigProto from tensorflow, serialized into byte array.
79
+
80
+
81
81
  Examples
82
82
  --------
83
83
  >>> import sparknlp
@@ -63,7 +63,6 @@ class OpenAICompletion(AnnotatorModel):
63
63
  >>> from sparknlp.annotator import *
64
64
  >>> from sparknlp.common import *
65
65
  >>> from pyspark.ml import Pipeline
66
-
67
66
  >>> documentAssembler = DocumentAssembler() \\
68
67
  ... .setInputCol("text") \\
69
68
  ... .setOutputCol("document")
@@ -83,9 +82,9 @@ class OpenAICompletion(AnnotatorModel):
83
82
  +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
84
83
  |completion |
85
84
  +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
86
- |[{document, 0, 258, \n\nI had the pleasure of dining at La Fiorita recently, and it was a truly delightful experience! The menu boasted a wonderful selection of classic Italian dishes, all exquisitely prepared and presented. The service staff was friendly and attentive and really, {}, []}]|
87
- |[{document, 0, 227, \n\nI recently visited Barbecue Joe's for dinner and it was amazing! The menu had so many items to choose from including pulled pork, smoked turkey, brisket, pork ribs, and sandwiches. I opted for the pulled pork sandwich and let, {}, []}] |
88
- |[{document, 0, 172, \n\n{ \n "review": { \n "overallRating": 4, \n "reviewBody": "I enjoyed my meal at this restaurant. The food was flavourful, well-prepared and beautifully presented., {}, []}] |
85
+ |[{document, 0, 258, \\n\\nI had the pleasure of dining at La Fiorita recently, and it was a truly delightful experience! The menu boasted a wonderful selection of classic Italian dishes, all exquisitely prepared and presented. The service staff was friendly and attentive and really, {}, []}]|
86
+ |[{document, 0, 227, \\n\\nI recently visited Barbecue Joe's for dinner and it was amazing! The menu had so many items to choose from including pulled pork, smoked turkey, brisket, pork ribs, and sandwiches. I opted for the pulled pork sandwich and let, {}, []}] |
87
+ |[{document, 0, 172, \\n\\n{ \\n "review": { \\n "overallRating": 4, \\n "reviewBody": "I enjoyed my meal at this restaurant. The food was flavourful, well-prepared and beautifully presented., {}, []}] |
89
88
  +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
90
89
  """
91
90
 
@@ -77,7 +77,7 @@ class M2M100Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
77
77
  Target Language (Default: `fr`)
78
78
 
79
79
  Languages Covered
80
- -----
80
+ -----------------
81
81
  Afrikaans (af), Amharic (am), Arabic (ar), Asturian (ast), Azerbaijani (az), Bashkir (ba),
82
82
  Belarusian (be), Bulgarian (bg), Bengali (bn), Breton (br), Bosnian (bs), Catalan; Valencian
83
83
  (ca), Cebuano (ceb), Czech (cs), Welsh (cy), Danish (da), German (de), Greeek (el), English