spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py
@@ -0,0 +1,202 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for CamemBertForZeroShotClassification."""
+
+ from sparknlp.common import *
+
+
+ class CamemBertForZeroShotClassification(AnnotatorModel,
+                                          HasCaseSensitiveProperties,
+                                          HasBatchedAnnotate,
+                                          HasClassifierActivationProperties,
+                                          HasCandidateLabelsProperties,
+                                          HasEngine,
+                                          HasMaxSentenceLengthLimit):
+     """CamemBertForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language
+     inference) tasks. Equivalent of `CamemBertForSequenceClassification` models, but these models don't require a hardcoded
+     number of potential classes; they can be chosen at runtime. This usually makes them slower, but much more
+     flexible.
+     Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
+     pair and passed to the pretrained model.
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+     >>> sequenceClassifier = CamemBertForZeroShotClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label")
+     The default model is ``"camembert_zero_shot_classifier_xnli_onnx"``, if no name is
+     provided.
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Text+Classification>`__.
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``CATEGORY``
+     ====================== ======================
+     Parameters
+     ----------
+     batchSize
+         Batch size. Large values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+     coalesceSentences
+         Instead of 1 class per sentence (if inputCols is `sentence`) output 1
+         class per document by averaging probabilities in all sentences, by
+         default False
+     activation
+         Whether to calculate logits via Softmax or Sigmoid, by default
+         `"softmax"`.
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> sequenceClassifier = CamemBertForZeroShotClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("multi_class") \\
+     ...     .setCaseSensitive(True) \\
+     ...     .setCandidateLabels(["sport", "politique", "science"])
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     sequenceClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["L'équipe de France joue aujourd'hui au Parc des Princes"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("multi_class.result").show(truncate=False)
+     +-------+
+     |result |
+     +-------+
+     |[sport]|
+     +-------+
+     """
+     name = "CamemBertForZeroShotClassification"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CATEGORY
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     coalesceSentences = Param(Params._dummy(), "coalesceSentences",
+                               "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.",
+                               TypeConverters.toBoolean)
+
+     def getClasses(self):
+         """
+         Returns labels used to train this model
+         """
+         return self._call_java("getClasses")
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setCoalesceSentences(self, value):
+         """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1
+         class per document by averaging probabilities in all sentences, by default False.
+
+         Due to max sequence length limit in almost all transformer models such as BERT
+         (512 tokens), this parameter helps feeding all the sentences into the model and
+         averaging all the probabilities for the entire document instead of probabilities
+         per sentence.
+
+         Parameters
+         ----------
+         value : bool
+             If the output of all sentences will be averaged to one output
+         """
+         return self._set(coalesceSentences=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.CamemBertForZeroShotClassification",
+                  java_model=None):
+         super(CamemBertForZeroShotClassification, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True,
+             coalesceSentences=False,
+             activation="softmax"
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         CamemBertForZeroShotClassification
+             The restored model
+         """
+         from sparknlp.internal import _CamemBertForZeroShotClassificationLoader
+         jModel = _CamemBertForZeroShotClassificationLoader(folder, spark_session._jsparkSession)._java_obj
+         return CamemBertForZeroShotClassification(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="camembert_zero_shot_classifier_xnli_onnx", lang="fr", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "camembert_zero_shot_classifier_xnli_onnx"
+         lang : str, optional
+             Language of the pretrained model, by default "fr"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         CamemBertForZeroShotClassification
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(CamemBertForZeroShotClassification, name, lang, remote_loc)
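
The new file above gives CamemBertForZeroShotClassification the same companion-object API as its sibling classifiers. As a minimal sketch of the other entry point, `loadSavedModel`, here is how a locally exported model could be restored and persisted; the export folder and save path are hypothetical, and only the method signatures come from the diff:

    import sparknlp
    from sparknlp.annotator import CamemBertForZeroShotClassification

    spark = sparknlp.start()

    # Hypothetical folder containing a model exported in the layout Spark NLP
    # expects (see the "Import Transformers into Spark NLP" discussion linked above).
    zero_shot = CamemBertForZeroShotClassification.loadSavedModel("./camembert_zsc_onnx", spark) \
        .setInputCols(["token", "document"]) \
        .setOutputCol("category") \
        .setCandidateLabels(["sport", "politique", "science"])

    # Annotators are Spark ML stages, so they can be saved once and reloaded
    # later without repeating the import step.
    zero_shot.write().overwrite().save("./camembert_zsc_spark_nlp")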
sparknlp/annotator/classifier_dl/classifier_dl.py
@@ -54,8 +54,8 @@ class ClassifierDLApproach(AnnotatorApproach, EvaluationDLParams, ClassifierEnco
      ...     .setLabelColumn("label") \\
      ...     .setTestDataset("test_data")
 
-     For extended examples of usage, see the Spark NLP Workshop
-     `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/5.Text_Classification_with_ClassifierDL.ipynb>`__.
+     For extended examples of usage, see the Examples
+     `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/ClassifierDL_Train_multi_class_news_category_classifier.ipynb>`__.
 
      ======================= ======================
      Input Annotation types Output Annotation type
@@ -203,10 +203,10 @@ class ClassifierDLModel(AnnotatorModel, HasStorageRef, HasEngine):
      dataset.
 
      For available pretrained models please see the
-     `Models Hub <https://nlp.johnsnowlabs.com/models?task=Text+Classification>`__.
+     `Models Hub <https://sparknlp.org/models?task=Text+Classification>`__.
 
      For extended examples of usage, see the
-     `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/5.Text_Classification_with_ClassifierDL.ipynb>`__.
+     `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/ClassifierDL_Train_multi_class_news_category_classifier.ipynb>`__.
 
      ======================= ======================
      Input Annotation types Output Annotation type
sparknlp/annotator/classifier_dl/deberta_for_question_answering.py
@@ -18,7 +18,8 @@ from sparknlp.common import *
  class DeBertaForQuestionAnswering(AnnotatorModel,
                                    HasCaseSensitiveProperties,
                                    HasBatchedAnnotate,
-                                   HasEngine):
+                                   HasEngine,
+                                   HasMaxSentenceLengthLimit):
      """DeBertaForQuestionAnswering can load DeBERTa Models with a span classification head on top for extractive
      question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start
      logits and span end logits).
@@ -34,7 +35,7 @@ class DeBertaForQuestionAnswering(AnnotatorModel,
      provided.
 
      For available pretrained models please see the `Models Hub
-     <https://nlp.johnsnowlabs.com/models?task=Question+Answering>`__.
+     <https://sparknlp.org/models?task=Question+Answering>`__.
 
      To see which models are compatible and how to import them see
      `Import Transformers into Spark NLP 🚀
@@ -91,11 +92,6 @@ class DeBertaForQuestionAnswering(AnnotatorModel,
 
      outputAnnotatorType = AnnotatorType.CHUNK
 
-     maxSentenceLength = Param(Params._dummy(),
-                               "maxSentenceLength",
-                               "Max sentence length to process",
-                               typeConverter=TypeConverters.toInt)
-
      configProtoBytes = Param(Params._dummy(),
                               "configProtoBytes",
                               "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -115,16 +111,6 @@ class DeBertaForQuestionAnswering(AnnotatorModel,
          """
          return self._set(configProtoBytes=b)
 
-     def setMaxSentenceLength(self, value):
-         """Sets max sentence length to process, by default 128.
-
-         Parameters
-         ----------
-         value : int
-             Max sentence length to process
-         """
-         return self._set(maxSentenceLength=value)
-
      @keyword_only
      def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DeBertaForQuestionAnswering",
                   java_model=None):
sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py
@@ -19,7 +19,8 @@ class DeBertaForSequenceClassification(AnnotatorModel,
                                         HasCaseSensitiveProperties,
                                         HasBatchedAnnotate,
                                         HasClassifierActivationProperties,
-                                        HasEngine):
+                                        HasEngine,
+                                        HasMaxSentenceLengthLimit):
      """DeBertaForSequenceClassification can load DeBERTa v2 & v3 Models with sequence classification/regression head on
      top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.
 
@@ -34,7 +35,7 @@ class DeBertaForSequenceClassification(AnnotatorModel,
      provided.
 
      For available pretrained models please see the `Models Hub
-     <https://nlp.johnsnowlabs.com/models?task=Text+Classification>`__.
+     <https://sparknlp.org/models?task=Text+Classification>`__.
 
      To see which models are compatible and how to import them see
      `Import Transformers into Spark NLP 🚀
@@ -60,7 +61,7 @@ class DeBertaForSequenceClassification(AnnotatorModel,
          Max sentence length to process, by default 128
      coalesceSentences
          Instead of 1 class per sentence (if inputCols is `sentence`) output
-         1 class per document by averaging probabilities in all sentences, by
+         1 class per document by averaging probabilities in all sentences, by
          default False.
 
      Examples
@@ -100,11 +101,6 @@ class DeBertaForSequenceClassification(AnnotatorModel,
 
      outputAnnotatorType = AnnotatorType.CATEGORY
 
-     maxSentenceLength = Param(Params._dummy(),
-                               "maxSentenceLength",
-                               "Max sentence length to process",
-                               typeConverter=TypeConverters.toInt)
-
      configProtoBytes = Param(Params._dummy(),
                               "configProtoBytes",
                               "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -130,16 +126,6 @@ class DeBertaForSequenceClassification(AnnotatorModel,
          """
          return self._set(configProtoBytes=b)
 
-     def setMaxSentenceLength(self, value):
-         """Sets max sentence length to process, by default 128.
-
-         Parameters
-         ----------
-         value : int
-             Max sentence length to process
-         """
-         return self._set(maxSentenceLength=value)
-
      def setCoalesceSentences(self, value):
          """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging
          probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as
@@ -210,4 +196,3 @@ class DeBertaForSequenceClassification(AnnotatorModel,
          """
          from sparknlp.pretrained import ResourceDownloader
          return ResourceDownloader.downloadModel(DeBertaForSequenceClassification, name, lang, remote_loc)
-
sparknlp/annotator/classifier_dl/deberta_for_token_classification.py
@@ -19,7 +19,8 @@ from sparknlp.common import *
  class DeBertaForTokenClassification(AnnotatorModel,
                                      HasCaseSensitiveProperties,
                                      HasBatchedAnnotate,
-                                     HasEngine):
+                                     HasEngine,
+                                     HasMaxSentenceLengthLimit):
      """DeBertaForTokenClassification can load DeBERTa v2&v3 Models with a token
      classification head on top (a linear layer on top of the hidden-states
      output) e.g. for Named-Entity-Recognition (NER) tasks.
@@ -35,7 +36,7 @@ class DeBertaForTokenClassification(AnnotatorModel,
      provided.
 
      For available pretrained models please see the `Models Hub
-     <https://nlp.johnsnowlabs.com/models?task=Named+Entity+Recognition>`__.
+     <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
 
      To see which models are compatible and how to import them see
      `Import Transformers into Spark NLP 🚀
@@ -85,11 +86,9 @@ class DeBertaForTokenClassification(AnnotatorModel,
      >>> result = pipeline.fit(data).transform(data)
      >>> result.select("label.result").show(truncate=False)
      +------------------------------------------------------------------------------------+
-     |result
-     |
+     |result                                                                              |
      +------------------------------------------------------------------------------------+
-     |[B-PER, I-PER, O, O, O, B-LOC, O, O, O, B-LOC, O, O, O, O, B-PER, O, O, O,
-     O, B-LOC]|
+     |[B-PER, I-PER, O, O, O, B-LOC, O, O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O, B-LOC]|
      +------------------------------------------------------------------------------------+
      """
      name = "DeBertaForTokenClassification"
@@ -98,11 +97,6 @@ class DeBertaForTokenClassification(AnnotatorModel,
 
      outputAnnotatorType = AnnotatorType.NAMED_ENTITY
 
-     maxSentenceLength = Param(Params._dummy(),
-                               "maxSentenceLength",
-                               "Max sentence length to process",
-                               typeConverter=TypeConverters.toInt)
-
      configProtoBytes = Param(Params._dummy(),
                               "configProtoBytes",
                               "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -124,16 +118,6 @@ class DeBertaForTokenClassification(AnnotatorModel,
          """
          return self._set(configProtoBytes=b)
 
-     def setMaxSentenceLength(self, value):
-         """Sets max sentence length to process, by default 128.
-
-         Parameters
-         ----------
-         value : int
-             Max sentence length to process
-         """
-         return self._set(maxSentenceLength=value)
-
      @keyword_only
      def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DeBertaForTokenClassification",
                   java_model=None):
sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py
@@ -0,0 +1,193 @@
+ # Copyright 2017-2023 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for DeBertaForZeroShotClassification."""
+
+ from sparknlp.common import *
+
+
+ class DeBertaForZeroShotClassification(AnnotatorModel,
+                                        HasCaseSensitiveProperties,
+                                        HasBatchedAnnotate,
+                                        HasClassifierActivationProperties,
+                                        HasCandidateLabelsProperties,
+                                        HasEngine,
+                                        HasMaxSentenceLengthLimit):
+     """DeBertaForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language
+     inference) tasks. Equivalent of `DeBertaForSequenceClassification` models, but these models don't require a hardcoded
+     number of potential classes; they can be chosen at runtime. This usually makes them slower, but much more
+     flexible.
+     Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
+     pair and passed to the pretrained model.
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+     >>> sequenceClassifier = DeBertaForZeroShotClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label")
+     The default model is ``"deberta_base_zero_shot_classifier_mnli_anli_v3"``, if no name is
+     provided.
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Text+Classification>`__.
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``CATEGORY``
+     ====================== ======================
+     Parameters
+     ----------
+     batchSize
+         Batch size. Large values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+     coalesceSentences
+         Instead of 1 class per sentence (if inputCols is `sentence`) output 1
+         class per document by averaging probabilities in all sentences, by
+         default False
+     activation
+         Whether to calculate logits via Softmax or Sigmoid, by default
+         `"softmax"`.
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> sequenceClassifier = DeBertaForZeroShotClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label") \\
+     ...     .setCaseSensitive(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     sequenceClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["I loved this movie when I was a child."], ["It was pretty boring."]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("label.result").show(truncate=False)
+     +------+
+     |result|
+     +------+
+     |[pos] |
+     |[neg] |
+     +------+
+     """
+     name = "DeBertaForZeroShotClassification"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CATEGORY
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     coalesceSentences = Param(Params._dummy(), "coalesceSentences",
+                               "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.",
+                               TypeConverters.toBoolean)
+
+     def getClasses(self):
+         """
+         Returns labels used to train this model
+         """
+         return self._call_java("getClasses")
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     def setCoalesceSentences(self, value):
+         """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging
+         probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as DeBerta
+         (512 tokens), this parameter helps to feed all the sentences into the model and averaging all the probabilities
+         for the entire document instead of probabilities per sentence. (Default: False)
+         Parameters
+         ----------
+         value : bool
+             If the output of all sentences will be averaged to one output
+         """
+         return self._set(coalesceSentences=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DeBertaForZeroShotClassification",
+                  java_model=None):
+         super(DeBertaForZeroShotClassification, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True,
+             coalesceSentences=False,
+             activation="softmax"
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+         Returns
+         -------
+         DeBertaForZeroShotClassification
+             The restored model
+         """
+         from sparknlp.internal import _DeBertaForZeroShotClassification
+         jModel = _DeBertaForZeroShotClassification(folder, spark_session._jsparkSession)._java_obj
+         return DeBertaForZeroShotClassification(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="deberta_base_zero_shot_classifier_mnli_anli_v3", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "deberta_base_zero_shot_classifier_mnli_anli_v3"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+         Returns
+         -------
+         DeBertaForZeroShotClassification
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(DeBertaForZeroShotClassification, name, lang, remote_loc)
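
The docstring example above relies on the model's default label set, but since the class also mixes in HasCandidateLabelsProperties, the labels can be chosen per run. A minimal sketch, assuming an active session (`spark`) from `sparknlp.start()`; the candidate labels are illustrative:

    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import Tokenizer, DeBertaForZeroShotClassification
    from pyspark.ml import Pipeline

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

    zeroShot = DeBertaForZeroShotClassification.pretrained() \
        .setInputCols(["token", "document"]) \
        .setOutputCol("label") \
        .setCandidateLabels(["positive", "negative"]) \
        .setCoalesceSentences(True)  # average probabilities into one label per document

    pipeline = Pipeline().setStages([documentAssembler, tokenizer, zeroShot])
    data = spark.createDataFrame([["I loved this movie when I was a child."],
                                  ["It was pretty boring."]]).toDF("text")
    pipeline.fit(data).transform(data).select("label.result").show(truncate=False)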
sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py
@@ -18,7 +18,8 @@ from sparknlp.common import *
  class DistilBertForQuestionAnswering(AnnotatorModel,
                                       HasCaseSensitiveProperties,
                                       HasBatchedAnnotate,
-                                      HasEngine):
+                                      HasEngine,
+                                      HasMaxSentenceLengthLimit):
      """DistilBertForQuestionAnswering can load DistilBERT Models with a span classification head on top for extractive
      question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start
      logits and span end logits).
@@ -34,7 +35,7 @@ class DistilBertForQuestionAnswering(AnnotatorModel,
      provided.
 
      For available pretrained models please see the `Models Hub
-     <https://nlp.johnsnowlabs.com/models?task=Question+Answering>`__.
+     <https://sparknlp.org/models?task=Question+Answering>`__.
 
      To see which models are compatible and how to import them see
      `Import Transformers into Spark NLP 🚀
@@ -91,11 +92,6 @@ class DistilBertForQuestionAnswering(AnnotatorModel,
 
      outputAnnotatorType = AnnotatorType.CHUNK
 
-     maxSentenceLength = Param(Params._dummy(),
-                               "maxSentenceLength",
-                               "Max sentence length to process",
-                               typeConverter=TypeConverters.toInt)
-
      configProtoBytes = Param(Params._dummy(),
                               "configProtoBytes",
                               "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -115,16 +111,6 @@ class DistilBertForQuestionAnswering(AnnotatorModel,
          """
          return self._set(configProtoBytes=b)
 
-     def setMaxSentenceLength(self, value):
-         """Sets max sentence length to process, by default 128.
-
-         Parameters
-         ----------
-         value : int
-             Max sentence length to process
-         """
-         return self._set(maxSentenceLength=value)
-
      @keyword_only
      def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DistilBertForQuestionAnswering",
                   java_model=None):
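
The recurring pattern in the hunks above (each class's private maxSentenceLength Param and setMaxSentenceLength setter is deleted while HasMaxSentenceLengthLimit joins the class bases) reads as an API-preserving refactor: the setter moves into a shared mixin. A minimal sketch of caller code that should behave the same before and after the upgrade, assuming the mixin keeps the setMaxSentenceLength name implied by the removed copies; the column names follow the usual Spark NLP question-answering convention and the length value is illustrative:

    from sparknlp.annotator import DeBertaForQuestionAnswering

    spanClassifier = DeBertaForQuestionAnswering.pretrained() \
        .setInputCols(["document_question", "document_context"]) \
        .setOutputCol("answer") \
        .setMaxSentenceLength(384)  # assumed mixin setter; the removed docstrings note a default of 128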