spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221) hide show
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
@@ -0,0 +1,191 @@
1
+ # Copyright 2017-2025 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Contains classes for Extractor."""
15
+ from sparknlp.common import *
16
+
17
class Extractor(AnnotatorModel):
    """Extracts pieces of text from documents and emits them as chunks.

    The kind of text to extract is selected with ``extractorMode`` (e-mail
    dates, e-mail addresses, IP addresses, IP addresses with names, MAPI IDs,
    US phone numbers, image URLs, bullets, or the text after/before a
    pattern). The remaining parameters specify the patterns associated with
    those kinds of text.

    No defaults are set on the Python side; unset parameters are left to the
    wrapped Java model.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    emailDateTimeTzPattern
        Date-time pattern for email timestamps, including time zone
        formatting.
    emailAddress
        Pattern for email addresses.
    ipAddressPattern
        Pattern for IP addresses.
    ipAddressNamePattern
        Pattern for IP addresses with names.
    mapiIdPattern
        Pattern for MAPI IDs.
    usPhoneNumbersPattern
        Pattern for US phone numbers.
    imageUrlPattern
        Pattern for image URLs.
    textPattern
        Pattern for text after and before.
    extractorMode
        What to extract. Possible values: ``email_date``, ``email_address``,
        ``ip_address``, ``ip_address_name``, ``mapi_id``,
        ``us_phone_numbers``, ``image_urls``, ``bullets``, ``text_after``,
        ``text_before``.
    index
        Index of the pattern to extract in text after or before.
    """

    name = "Extractor"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.CHUNK

    emailDateTimeTzPattern = Param(
        Params._dummy(),
        "emailDateTimeTzPattern",
        "Specifies the date-time pattern for email timestamps, including time zone formatting.",
        typeConverter=TypeConverters.toString
    )

    emailAddress = Param(
        Params._dummy(),
        "emailAddress",
        "Specifies the pattern for email addresses.",
        typeConverter=TypeConverters.toString
    )

    ipAddressPattern = Param(
        Params._dummy(),
        "ipAddressPattern",
        "Specifies the pattern for IP addresses.",
        typeConverter=TypeConverters.toString
    )

    ipAddressNamePattern = Param(
        Params._dummy(),
        "ipAddressNamePattern",
        "Specifies the pattern for IP addresses with names.",
        typeConverter=TypeConverters.toString
    )

    mapiIdPattern = Param(
        Params._dummy(),
        "mapiIdPattern",
        "Specifies the pattern for MAPI IDs.",
        typeConverter=TypeConverters.toString
    )

    usPhoneNumbersPattern = Param(
        Params._dummy(),
        "usPhoneNumbersPattern",
        "Specifies the pattern for US phone numbers.",
        typeConverter=TypeConverters.toString
    )

    imageUrlPattern = Param(
        Params._dummy(),
        "imageUrlPattern",
        "Specifies the pattern for image URLs.",
        typeConverter=TypeConverters.toString
    )

    textPattern = Param(
        Params._dummy(),
        "textPattern",
        "Specifies the pattern for text after and before.",
        typeConverter=TypeConverters.toString
    )

    extractorMode = Param(
        Params._dummy(),
        "extractorMode",
        "possible values: " +
        "email_date, email_address, ip_address, ip_address_name, mapi_id, us_phone_numbers, image_urls, bullets, text_after, text_before",
        typeConverter=TypeConverters.toString
    )

    index = Param(
        Params._dummy(),
        "index",
        "Specifies the index of the pattern to extract in text after or before",
        typeConverter=TypeConverters.toInt
    )

    def setEmailDateTimeTzPattern(self, value):
        """Sets the date-time pattern for email timestamps, including time
        zone formatting.

        Parameters
        ----------
        value : str
            Specifies the date-time pattern for email timestamps, including
            time zone formatting.
        """
        return self._set(emailDateTimeTzPattern=value)

    def setEmailAddress(self, value):
        """Sets the pattern for email addresses.

        Parameters
        ----------
        value : str
            Specifies the pattern for email addresses.
        """
        return self._set(emailAddress=value)

    def setIpAddressPattern(self, value):
        """Sets the pattern for IP addresses.

        Parameters
        ----------
        value : str
            Specifies the pattern for IP addresses.
        """
        return self._set(ipAddressPattern=value)

    def setIpAddressNamePattern(self, value):
        """Sets the pattern for IP addresses with names.

        Parameters
        ----------
        value : str
            Specifies the pattern for IP addresses with names.
        """
        return self._set(ipAddressNamePattern=value)

    def setMapiIdPattern(self, value):
        """Sets the pattern for MAPI IDs.

        Parameters
        ----------
        value : str
            Specifies the pattern for MAPI IDs.
        """
        return self._set(mapiIdPattern=value)

    def setUsPhoneNumbersPattern(self, value):
        """Sets the pattern for US phone numbers.

        Parameters
        ----------
        value : str
            Specifies the pattern for US phone numbers.
        """
        return self._set(usPhoneNumbersPattern=value)

    def setImageUrlPattern(self, value):
        """Sets the pattern for image URLs.

        Parameters
        ----------
        value : str
            Specifies the pattern for image URLs.
        """
        return self._set(imageUrlPattern=value)

    def setTextPattern(self, value):
        """Sets the pattern for text after and before.

        Parameters
        ----------
        value : str
            Specifies the pattern for text after and before.
        """
        return self._set(textPattern=value)

    def setExtractorMode(self, value):
        """Sets the extractor mode, i.e. what kind of text is extracted.

        Parameters
        ----------
        value : str
            Possible values: ``email_date``, ``email_address``,
            ``ip_address``, ``ip_address_name``, ``mapi_id``,
            ``us_phone_numbers``, ``image_urls``, ``bullets``,
            ``text_after``, ``text_before``. The value is not validated
            here; it is stored as given.
        """
        return self._set(extractorMode=value)

    def setIndex(self, value):
        """Sets the index of the pattern to extract in text after or before.

        Parameters
        ----------
        value : int
            Specifies the index of the pattern to extract in text after or
            before.
        """
        return self._set(index=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cleaners.Extractor", java_model=None):
        super(Extractor, self).__init__(
            classname=classname,
            java_model=java_model
        )
@@ -20,7 +20,8 @@ class SpanBertCorefModel(AnnotatorModel,
20
20
  HasEmbeddingsProperties,
21
21
  HasCaseSensitiveProperties,
22
22
  HasStorageRef,
23
- HasEngine):
23
+ HasEngine,
24
+ HasMaxSentenceLengthLimit):
24
25
  """
25
26
  A coreference resolution model based on SpanBert.
26
27
 
@@ -38,10 +39,10 @@ class SpanBertCorefModel(AnnotatorModel,
38
39
 
39
40
  The default model is ``"spanbert_base_coref"``, if no name is provided. For available
40
41
  pretrained models please see the `Models Hub
41
- <https://nlp.johnsnowlabs.com/models?q=coref>`__.
42
+ <https://sparknlp.org/models?q=coref>`__.
42
43
 
43
44
  For extended examples of usage, see the
44
- `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/coreference-resolution/Coreference_Resolution_SpanBertCorefModel.ipynb>`__.
45
+ `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/coreference-resolution/Coreference_Resolution_SpanBertCorefModel.ipynb>`__.
45
46
 
46
47
  ====================== ======================
47
48
  Input Annotation types Output Annotation type
@@ -114,11 +115,6 @@ class SpanBertCorefModel(AnnotatorModel,
114
115
 
115
116
  outputAnnotatorType = AnnotatorType.DEPENDENCY
116
117
 
117
- maxSentenceLength = Param(Params._dummy(),
118
- "maxSentenceLength",
119
- "Max sentence length to process",
120
- typeConverter=TypeConverters.toInt)
121
-
122
118
  maxSegmentLength = Param(Params._dummy(),
123
119
  "maxSegmentLength",
124
120
  "Max segment length",
@@ -144,16 +140,6 @@ class SpanBertCorefModel(AnnotatorModel,
144
140
  """
145
141
  return self._set(configProtoBytes=b)
146
142
 
147
- def setMaxSentenceLength(self, value):
148
- """Sets max sentence length to process.
149
-
150
- Parameters
151
- ----------
152
- value : int
153
- Max sentence length to process
154
- """
155
- return self._set(maxSentenceLength=value)
156
-
157
143
  def setMaxSegmentLength(self, value):
158
144
  """Sets max segment length
159
145
 
@@ -12,3 +12,18 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  from sparknlp.annotator.cv.vit_for_image_classification import *
15
+ from sparknlp.annotator.cv.swin_for_image_classification import *
16
+ from sparknlp.annotator.cv.convnext_for_image_classification import *
17
+ from sparknlp.annotator.cv.vision_encoder_decoder_for_image_captioning import *
18
+ from sparknlp.annotator.cv.clip_for_zero_shot_classification import *
19
+ from sparknlp.annotator.cv.blip_for_question_answering import *
20
+ from sparknlp.annotator.cv.janus_for_multimodal import *
21
+ from sparknlp.annotator.cv.mllama_for_multimodal import *
22
+ from sparknlp.annotator.cv.qwen2vl_transformer import *
23
+ from sparknlp.annotator.cv.llava_for_multimodal import *
24
+ from sparknlp.annotator.cv.phi3_vision_for_multimodal import *
25
+ from sparknlp.annotator.cv.smolvlm_transformer import *
26
+ from sparknlp.annotator.cv.paligemma_for_multimodal import *
27
+ from sparknlp.annotator.cv.gemma3_for_multimodal import *
28
+ from sparknlp.annotator.cv.internvl_for_multimodal import *
29
+ from sparknlp.annotator.cv.florence2_transformer import *
@@ -0,0 +1,172 @@
1
+ # Copyright 2017-2024 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from sparknlp.common import *
16
+
17
class BLIPForQuestionAnswering(AnnotatorModel,
                               HasBatchedAnnotateImage,
                               HasImageFeatureProperties,
                               HasEngine,
                               HasCandidateLabelsProperties,
                               HasRescaleFactor):
    """BLIPForQuestionAnswering can load BLIP models for visual question answering.
    The model consists of a vision encoder, a text encoder as well as a text decoder.
    The vision encoder will encode the input image, the text encoder will encode the input question together
    with the encoding of the image, and the text decoder will output the answer to the question.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> visualQAClassifier = BLIPForQuestionAnswering.pretrained() \\
    ...     .setInputCols(["image_assembler"]) \\
    ...     .setOutputCol("answer")

    The default model is ``"blip_vqa_base"``, if no name is
    provided.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Question+Answering>`__.

    To see which models are compatible and how to import them see
    `Import Transformers into Spark NLP 🚀
    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``IMAGE``              ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Batch size. Large values allows faster processing but requires more
        memory, by default 2
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.
    maxSentenceLength
        Max sentence length to process, by default 50
    size
        Image size setting (see ``setSize``), by default 384

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> from pyspark.sql.functions import lit
    >>> spark = sparknlp.start()
    >>> image_df = spark.read.format("image").load(path=images_path)
    >>> test_df = image_df.withColumn("text", lit("What's this picture about?"))
    >>> imageAssembler = ImageAssembler() \\
    ...     .setInputCol("image") \\
    ...     .setOutputCol("image_assembler")
    >>> visualQAClassifier = BLIPForQuestionAnswering.pretrained() \\
    ...     .setInputCols("image_assembler") \\
    ...     .setOutputCol("answer") \\
    ...     .setSize(384)
    >>> pipeline = Pipeline().setStages([
    ...     imageAssembler,
    ...     visualQAClassifier
    ... ])
    >>> result = pipeline.fit(test_df).transform(test_df)
    >>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
    +--------------------------------------+------+
    |origin                                |result|
    +--------------------------------------+------+
    |[file:///content/images/cat_image.jpg]|[cats]|
    +--------------------------------------+------+
    """

    name = "BLIPForQuestionAnswering"

    inputAnnotatorTypes = [AnnotatorType.IMAGE]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with "
                             "config_proto.SerializeToString()",
                             TypeConverters.toListInt)

    maxSentenceLength = Param(Params._dummy(),
                              "maxSentenceLength",
                              "Maximum sentence length that the annotator will process. Above this, the sentence is skipped",
                              typeConverter=TypeConverters.toInt)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    def setMaxSentenceLength(self, value):
        """Sets maximum sentence length that the annotator will process, by
        default 50.

        Parameters
        ----------
        value : int
            Maximum sentence length that the annotator will process
        """
        return self._set(maxSentenceLength=value)

    def setMaxSentenceSize(self, value):
        """Sets maximum sentence length that the annotator will process, by
        default 50.

        Alias of :meth:`setMaxSentenceLength`, kept so that existing callers
        of the original setter name continue to work; it sets the
        ``maxSentenceLength`` parameter.

        Parameters
        ----------
        value : int
            Maximum sentence length that the annotator will process
        """
        return self.setMaxSentenceLength(value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestionAnswering",
                 java_model=None):
        super(BLIPForQuestionAnswering, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            batchSize=2,
            size=384,
            maxSentenceLength=50
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession

        Returns
        -------
        BLIPForQuestionAnswering
            The restored model
        """
        # Imported locally, as in the original, rather than at module level.
        from sparknlp.internal import _BLIPForQuestionAnswering
        jModel = _BLIPForQuestionAnswering(folder, spark_session._jsparkSession)._java_obj
        return BLIPForQuestionAnswering(java_model=jModel)

    @staticmethod
    def pretrained(name="blip_vqa_base", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default
            "blip_vqa_base"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        BLIPForQuestionAnswering
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(BLIPForQuestionAnswering, name, lang, remote_loc)
@@ -0,0 +1,193 @@
1
+ # Copyright 2017-2022 John Snow Labs
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Contains classes concerning CLIPForZeroShotClassification."""
16
+
17
+ from sparknlp.common import *
18
+
19
+
20
+ class CLIPForZeroShotClassification(AnnotatorModel,
21
+ HasBatchedAnnotateImage,
22
+ HasImageFeatureProperties,
23
+ HasEngine,
24
+ HasCandidateLabelsProperties,
25
+ HasRescaleFactor):
26
+ """Zero Shot Image Classifier based on CLIP.
27
+
28
+ CLIP (Contrastive Language-Image Pre-Training) is a neural network that was trained on image
29
+ and text pairs. It has the ability to predict images without training on any hard-coded
30
+ labels. This makes it very flexible, as labels can be provided during inference. This is
31
+ similar to the zero-shot capabilities of the GPT-2 and GPT-3 models.
32
+
33
+ Pretrained models can be loaded with ``pretrained`` of the companion object:
34
+
35
+
36
+ .. code-block:: python
37
+
38
+ imageClassifier = CLIPForZeroShotClassification.pretrained() \\
39
+ .setInputCols(["image_assembler"]) \\
40
+ .setOutputCol("label")
41
+
42
+
43
+ The default model is ``"zero_shot_classifier_clip_vit_base_patch32"``, if no name is provided.
44
+
45
+ For available pretrained models please see the
46
+ `Models Hub <https://sparknlp.org/models?task=Zero-Shot+Classification>`__.
47
+
48
+ Models from the HuggingFace 🤗 Transformers library are also compatible with Spark NLP 🚀. To
49
+ see which models are compatible and how to import them see
50
+ https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and to see more extended
51
+ examples, see
52
+ `CLIPForZeroShotClassificationTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/CLIPForZeroShotClassificationTestSpec.scala>`__.
53
+
54
+ ====================== ======================
55
+ Input Annotation types Output Annotation type
56
+ ====================== ======================
57
+ ``IMAGE`` ``CATEGORY``
58
+ ====================== ======================
59
+
60
+ Parameters
61
+ ----------
62
+ batchSize
63
+ Batch size, by default `2`.
64
+ candidateLabels
65
+ Array of labels for classification
66
+
67
+ Examples
68
+ --------
69
+ >>> import sparknlp
70
+ >>> from sparknlp.base import *
71
+ >>> from sparknlp.annotator import *
72
+ >>> from pyspark.ml import Pipeline
73
+ >>> imageDF = spark.read \\
74
+ ... .format("image") \\
75
+ ... .option("dropInvalid", value = True) \\
76
+ ... .load("src/test/resources/image/")
77
+ >>> imageAssembler = ImageAssembler() \\
78
+ ... .setInputCol("image") \\
79
+ ... .setOutputCol("image_assembler")
80
+ >>> candidateLabels = [
81
+ ... "a photo of a bird",
82
+ ... "a photo of a cat",
83
+ ... "a photo of a dog",
84
+ ... "a photo of a hen",
85
+ ... "a photo of a hippo",
86
+ ... "a photo of a room",
87
+ ... "a photo of a tractor",
88
+ ... "a photo of an ostrich",
89
+ ... "a photo of an ox"]
90
+ >>> imageClassifier = CLIPForZeroShotClassification \\
91
+ ... .pretrained() \\
92
+ ... .setInputCols(["image_assembler"]) \\
93
+ ... .setOutputCol("label") \\
94
+ ... .setCandidateLabels(candidateLabels)
95
+ >>> pipeline = Pipeline().setStages([imageAssembler, imageClassifier])
96
+ >>> pipelineDF = pipeline.fit(imageDF).transform(imageDF)
97
+ >>> pipelineDF \\
98
+ ... .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "label.result") \\
99
+ ... .show(truncate=False)
100
+ +-----------------+-----------------------+
101
+ |image_name |result |
102
+ +-----------------+-----------------------+
103
+ |palace.JPEG |[a photo of a room] |
104
+ |egyptian_cat.jpeg|[a photo of a cat] |
105
+ |hippopotamus.JPEG|[a photo of a hippo] |
106
+ |hen.JPEG |[a photo of a hen] |
107
+ |ostrich.JPEG |[a photo of an ostrich]|
108
+ |junco.JPEG |[a photo of a bird] |
109
+ |bluetick.jpg |[a photo of a dog] |
110
+ |chihuahua.jpg |[a photo of a dog] |
111
+ |tractor.JPEG |[a photo of a tractor] |
112
+ |ox.JPEG |[a photo of an ox] |
113
+ +-----------------+-----------------------+
114
+ """
115
+ name = "CLIPForZeroShotClassification"
116
+
117
+ inputAnnotatorTypes = [AnnotatorType.IMAGE]
118
+
119
+ outputAnnotatorType = AnnotatorType.CATEGORY
120
+
121
+ configProtoBytes = Param(Params._dummy(),
122
+ "configProtoBytes",
123
+ "ConfigProto from tensorflow, serialized into byte array. Get with "
124
+ "config_proto.SerializeToString()",
125
+ TypeConverters.toListInt)
126
+
127
+ def getCandidateLabels(self):
128
+ """
129
+ Returns the candidate labels configured for classification
130
+ """
131
+ return self._call_java("getCandidateLabels")
132
+
133
+ @keyword_only
134
+ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.CLIPForZeroShotClassification",
135
+ java_model=None):
136
+ super(CLIPForZeroShotClassification, self).__init__(
137
+ classname=classname,
138
+ java_model=java_model
139
+ )
140
+ self._setDefault(
141
+ batchSize=2,
142
+ doNormalize=True,
143
+ doRescale=True,
144
+ doResize=True,
145
+ imageMean=[0.48145466, 0.4578275, 0.40821073],
146
+ imageStd=[0.26862954, 0.26130258, 0.27577711],
147
+ resample=2,
148
+ rescaleFactor=1 / 255.0,
149
+ size=224
150
+ )
151
+
152
+ @staticmethod
153
+ def loadSavedModel(folder, spark_session):
154
+ """Loads a locally saved model.
155
+
156
+ Parameters
157
+ ----------
158
+ folder : str
159
+ Folder of the saved model
160
+ spark_session : pyspark.sql.SparkSession
161
+ The current SparkSession
162
+
163
+ Returns
164
+ -------
165
+ CLIPForZeroShotClassification
166
+ The restored model
167
+ """
168
+ from sparknlp.internal import _CLIPForZeroShotClassification
169
+ jModel = _CLIPForZeroShotClassification(folder, spark_session._jsparkSession)._java_obj
170
+ return CLIPForZeroShotClassification(java_model=jModel)
171
+
172
+ @staticmethod
173
+ def pretrained(name="zero_shot_classifier_clip_vit_base_patch32", lang="en", remote_loc=None):
174
+ """Downloads and loads a pretrained model.
175
+
176
+ Parameters
177
+ ----------
178
+ name : str, optional
179
+ Name of the pretrained model, by default
180
+ "zero_shot_classifier_clip_vit_base_patch32"
181
+ lang : str, optional
182
+ Language of the pretrained model, by default "en"
183
+ remote_loc : str, optional
184
+ Optional remote address of the resource, by default None. Will use
185
+ Spark NLP's repositories otherwise.
186
+
187
+ Returns
188
+ -------
189
+ CLIPForZeroShotClassification
190
+ The restored model
191
+ """
192
+ from sparknlp.pretrained import ResourceDownloader
193
+ return ResourceDownloader.downloadModel(CLIPForZeroShotClassification, name, lang, remote_loc)