spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
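
A pattern that accounts for much of the per-file churn in the classifier diffs below: each *For* annotator previously declared its own maxSentenceLength Param and setMaxSentenceLength setter, and 6.2.1 deletes those per-class copies in favor of the shared HasMaxSentenceLengthLimit mixin added to the class bases. Existing call sites should be unaffected, since the setter is now simply inherited. A minimal sketch of the unchanged user-facing call (column names and the length value are illustrative):

    from sparknlp.annotator import RoBertaForQuestionAnswering

    # In 6.2.1, setMaxSentenceLength comes from the HasMaxSentenceLengthLimit
    # mixin rather than a per-class declaration; the call itself is identical.
    spanClassifier = RoBertaForQuestionAnswering.pretrained() \
        .setInputCols(["document_question", "document_context"]) \
        .setOutputCol("answer") \
        .setMaxSentenceLength(384)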

sparknlp/annotator/classifier_dl/roberta_for_question_answering.py

@@ -18,7 +18,8 @@ from sparknlp.common import *
 class RoBertaForQuestionAnswering(AnnotatorModel,
                                   HasCaseSensitiveProperties,
                                   HasBatchedAnnotate,
-                                  HasEngine):
+                                  HasEngine,
+                                  HasMaxSentenceLengthLimit):
     """RoBertaForQuestionAnswering can load RoBERTa Models with a span classification head on top for extractive
     question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start
     logits and span end logits).
@@ -34,7 +35,7 @@ class RoBertaForQuestionAnswering(AnnotatorModel,
     provided.
 
     For available pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?task=Question+Answering>`__.
+    <https://sparknlp.org/models?task=Question+Answering>`__.
 
     To see which models are compatible and how to import them see
     `Import Transformers into Spark NLP 🚀
@@ -91,11 +92,6 @@ class RoBertaForQuestionAnswering(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.CHUNK
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -115,16 +111,6 @@ class RoBertaForQuestionAnswering(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process, by default 128.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForQuestionAnswering",
                  java_model=None):
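
For reference, a sketch of how the annotator above is typically wired end to end; this assumes a running Spark session via sparknlp.start(), the default pretrained weights, and the MultiDocumentAssembler API with setInputCols/setOutputCols:

    import sparknlp
    from sparknlp.base import MultiDocumentAssembler
    from sparknlp.annotator import RoBertaForQuestionAnswering
    from pyspark.ml import Pipeline

    spark = sparknlp.start()

    # Turn the question and context columns into two DOCUMENT annotation columns.
    documentAssembler = MultiDocumentAssembler() \
        .setInputCols(["question", "context"]) \
        .setOutputCols(["document_question", "document_context"])

    spanClassifier = RoBertaForQuestionAnswering.pretrained() \
        .setInputCols(["document_question", "document_context"]) \
        .setOutputCol("answer") \
        .setMaxSentenceLength(512)  # inherited from HasMaxSentenceLengthLimit as of this release

    pipeline = Pipeline().setStages([documentAssembler, spanClassifier])

    data = spark.createDataFrame(
        [["What's my name?", "My name is Clara and I live in Berkeley."]]
    ).toDF("question", "context")

    pipeline.fit(data).transform(data).select("answer.result").show(truncate=False)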

sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py

@@ -20,7 +20,8 @@ class RoBertaForSequenceClassification(AnnotatorModel,
                                        HasCaseSensitiveProperties,
                                        HasBatchedAnnotate,
                                        HasClassifierActivationProperties,
-                                       HasEngine):
+                                       HasEngine,
+                                       HasMaxSentenceLengthLimit):
     """RoBertaForSequenceClassification can load RoBERTa Models with sequence classification/regression head on
     top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.
 
@@ -35,7 +36,7 @@ class RoBertaForSequenceClassification(AnnotatorModel,
     provided.
 
     For available pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?task=Text+Classification>`__.
+    <https://sparknlp.org/models?task=Text+Classification>`__.
 
     To see which models are compatible and how to import them see
     `Import Transformers into Spark NLP 🚀
@@ -61,7 +62,7 @@ class RoBertaForSequenceClassification(AnnotatorModel,
         Max sentence length to process, by default 128
     coalesceSentences
         Instead of 1 class per sentence (if inputCols is `sentence`) output
-        1 class per document by averaging probabilities in all sentences, by 
+        1 class per document by averaging probabilities in all sentences, by
         default False.
     activation
         Whether to calculate logits via Softmax or Sigmoid, by default
@@ -104,11 +105,6 @@ class RoBertaForSequenceClassification(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.CATEGORY
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -134,16 +130,6 @@ class RoBertaForSequenceClassification(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process, by default 128.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     def setCoalesceSentences(self, value):
         """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.
         Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feeding all the sentences

sparknlp/annotator/classifier_dl/roberta_for_token_classification.py

@@ -35,7 +35,7 @@ class RoBertaForTokenClassification(AnnotatorModel,
     is provided.
 
     For available pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?task=Named+Entity+Recognition>`__.
+    <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
 
     To see which models are compatible and how to import them see
     `Import Transformers into Spark NLP 🚀

sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py

@@ -0,0 +1,225 @@
+# Copyright 2017-2023 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for RoBertaForZeroShotClassification."""
+
+from sparknlp.common import *
+
+
+class RoBertaForZeroShotClassification(AnnotatorModel,
+                                       HasCaseSensitiveProperties,
+                                       HasBatchedAnnotate,
+                                       HasClassifierActivationProperties,
+                                       HasCandidateLabelsProperties,
+                                       HasEngine):
+    """RoBertaForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language
+    inference) tasks. Equivalent of `RoBertaForSequenceClassification` models, but these models don't require a hardcoded
+    number of potential classes, they can be chosen at runtime. It usually means it's slower but it is much more
+    flexible.
+
+    Note that the model will loop through all provided labels. So the more labels you have, the
+    longer this process will take.
+
+    Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
+    pair and passed to the pretrained model.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> sequenceClassifier = RoBertaForZeroShotClassification.pretrained() \\
+    ...     .setInputCols(["token", "document"]) \\
+    ...     .setOutputCol("label")
+
+    The default model is ``"roberta_base_zero_shot_classifier_nli"``, if no name is
+    provided.
+
+    For available pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?task=Text+Classification>`__.
+
+    To see which models are compatible and how to import them see
+    `Import Transformers into Spark NLP 🚀
+    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, TOKEN``    ``CATEGORY``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Batch size. Large values allows faster processing but requires more
+        memory, by default 8
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default
+        True
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+    maxSentenceLength
+        Max sentence length to process, by default 128
+    coalesceSentences
+        Instead of 1 class per sentence (if inputCols is `sentence`) output 1
+        class per document by averaging probabilities in all sentences, by
+        default False
+    activation
+        Whether to calculate logits via Softmax or Sigmoid, by default
+        `"softmax"`.
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> sequenceClassifier = RoBertaForZeroShotClassification.pretrained() \\
+    ...     .setInputCols(["token", "document"]) \\
+    ...     .setOutputCol("label") \\
+    ...     .setCaseSensitive(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     tokenizer,
+    ...     sequenceClassifier
+    ... ])
+    >>> data = spark.createDataFrame([["I loved this movie when I was a child.", "It was pretty boring."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("label.result").show(truncate=False)
+    +------+
+    |result|
+    +------+
+    |[pos] |
+    |[neg] |
+    +------+
+    """
+    name = "RoBertaForZeroShotClassification"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.CATEGORY
+
+    maxSentenceLength = Param(Params._dummy(),
+                              "maxSentenceLength",
+                              "Max sentence length to process",
+                              typeConverter=TypeConverters.toInt)
+
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    coalesceSentences = Param(Params._dummy(), "coalesceSentences",
+                              "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.",
+                              TypeConverters.toBoolean)
+
+    def getClasses(self):
+        """
+        Returns labels used to train this model
+        """
+        return self._call_java("getClasses")
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    def setMaxSentenceLength(self, value):
+        """Sets max sentence length to process, by default 128.
+
+        Parameters
+        ----------
+        value : int
+            Max sentence length to process
+        """
+        return self._set(maxSentenceLength=value)
+
+    def setCoalesceSentences(self, value):
+        """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging
+        probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as RoBerta
+        (512 tokens), this parameter helps to feed all the sentences into the model and averaging all the probabilities
+        for the entire document instead of probabilities per sentence. (Default: true)
+
+        Parameters
+        ----------
+        value : bool
+            If the output of all sentences will be averaged to one output
+        """
+        return self._set(coalesceSentences=value)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForZeroShotClassification",
+                 java_model=None):
+        super(RoBertaForZeroShotClassification, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=8,
+            maxSentenceLength=128,
+            caseSensitive=True,
+            coalesceSentences=False,
+            activation="softmax"
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        RoBertaForZeroShotClassification
+            The restored model
+        """
+        from sparknlp.internal import _RoBertaForZeroShotClassification
+        jModel = _RoBertaForZeroShotClassification(folder, spark_session._jsparkSession)._java_obj
+        return RoBertaForZeroShotClassification(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="roberta_base_zero_shot_classifier_nli", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default
+            "roberta_base_zero_shot_classifier_nli"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        RoBertaForZeroShotClassification
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(RoBertaForZeroShotClassification, name, lang, remote_loc)
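
Because the class mixes in HasCandidateLabelsProperties, the label set is supplied by the caller at runtime rather than baked into the classification head. A minimal sketch, assuming the setCandidateLabels setter that trait provides:

    from sparknlp.annotator import RoBertaForZeroShotClassification

    # Each candidate label is posed as an NLI hypothesis against the input,
    # so runtime grows with the number of labels supplied here.
    zeroShotClassifier = RoBertaForZeroShotClassification.pretrained() \
        .setInputCols(["token", "document"]) \
        .setOutputCol("label") \
        .setCandidateLabels(["sports", "politics", "technology", "movies"])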

sparknlp/annotator/classifier_dl/sentiment_dl.py

@@ -53,7 +53,7 @@ class SentimentDLApproach(AnnotatorApproach, EvaluationDLParams, ClassifierEncoder):
     ...     .setLabelColumn("label") \\
     ...     .setTestDataset("test_data")
 
-    For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/SentimentDL_train_multiclass_sentiment_classifier.ipynb>`__.
+    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/SentimentDL_train_multiclass_sentiment_classifier.ipynb>`__.
 
     ======================= ======================
     Input Annotation types  Output Annotation type
@@ -233,10 +233,10 @@ class SentimentDLModel(AnnotatorModel, HasStorageRef, HasEngine):
     The default model is ``"sentimentdl_use_imdb"``, if no name is provided. It
     is english sentiment analysis trained on the IMDB dataset. For available
     pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?task=Sentiment+Analysis>`__.
+    <https://sparknlp.org/models?task=Sentiment+Analysis>`__.
 
-    For extended examples of usage, see the `Spark NLP Workshop
-    <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/5.Text_Classification_with_ClassifierDL.ipynb>`__.
+    For extended examples of usage, see the `Examples
+    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/SentimentDL_train_multiclass_sentiment_classifier.ipynb>`__.
 
     ======================= ======================
     Input Annotation types  Output Annotation type

sparknlp/annotator/classifier_dl/tapas_for_question_answering.py

@@ -33,12 +33,12 @@ class TapasForQuestionAnswering(BertForQuestionAnswering):
     is provided.
 
     For available pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?task=Question+Answering+Tapas>`__.
+    <https://sparknlp.org/models?task=Question+Answering+Tapas>`__.
 
     ====================== ======================
     Input Annotation types Output Annotation type
     ====================== ======================
-    ``DOCUMENT, TABLE`` ``CHUNK``
+    ``DOCUMENT, TABLE``    ``CHUNK``
     ====================== ======================
 
     Parameters

sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py

@@ -0,0 +1,149 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from sparknlp.common import *
+
+
+class XlmRoBertaForMultipleChoice(AnnotatorModel,
+                                  HasCaseSensitiveProperties,
+                                  HasBatchedAnnotate,
+                                  HasEngine,
+                                  HasMaxSentenceLengthLimit):
+    """XlmRoBertaForMultipleChoice can load XLM-RoBERTa Models with a span classification head on top for extractive
+    question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start
+    logits and span end logits).
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> spanClassifier = XlmRoBertaForMultipleChoice.pretrained() \\
+    ...     .setInputCols(["document_question", "document_context"]) \\
+    ...     .setOutputCol("answer")
+
+    The default model is ``"xlm_roberta_base_qa_squad2"``, if no name is
+    provided.
+
+    For available pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?task=Question+Answering>`__.
+
+    To see which models are compatible and how to import them see
+    `Import Transformers into Spark NLP 🚀
+    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, DOCUMENT`` ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Batch size. Large values allows faster processing but requires more
+        memory, by default 8
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default
+        False
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+    maxSentenceLength
+        Max sentence length to process, by default 128
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = MultiDocumentAssembler() \\
+    ...     .setInputCols(["question", "context"]) \\
+    ...     .setOutputCol(["document_question", "document_context"])
+    >>> spanClassifier = XlmRoBertaForMultipleChoice.pretrained() \\
+    ...     .setInputCols(["document_question", "document_context"]) \\
+    ...     .setOutputCol("answer") \\
+    ...     .setCaseSensitive(False)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     spanClassifier
+    ... ])
+    >>> data = spark.createDataFrame([["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", "context")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("answer.result").show(truncate=False)
+    +--------------------+
+    |result              |
+    +--------------------+
+    |[Clara]             |
+    +--------------------+
+    """
+    name = "XlmRoBertaForMultipleChoice"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForMultipleChoice",
+                 java_model=None):
+        super(XlmRoBertaForMultipleChoice, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=8,
+            maxSentenceLength=128,
+            caseSensitive=False
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        XlmRoBertaForMultipleChoice
+            The restored model
+        """
+        from sparknlp.internal import _XlmRoBertaMultipleChoiceLoader
+        jModel = _XlmRoBertaMultipleChoiceLoader(folder, spark_session._jsparkSession)._java_obj
+        return XlmRoBertaForMultipleChoice(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="xlm_roberta_base_mc", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default
+            "xlm_roberta_base_qa_squad2"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        XlmRoBertaForMultipleChoice
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(XlmRoBertaForMultipleChoice, name, lang, remote_loc)

sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py

@@ -18,7 +18,8 @@ from sparknlp.common import *
 class XlmRoBertaForQuestionAnswering(AnnotatorModel,
                                      HasCaseSensitiveProperties,
                                      HasBatchedAnnotate,
-                                     HasEngine):
+                                     HasEngine,
+                                     HasMaxSentenceLengthLimit):
     """XlmRoBertaForQuestionAnswering can load XLM-RoBERTa Models with a span classification head on top for extractive
     question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start
     logits and span end logits).
@@ -34,7 +35,7 @@ class XlmRoBertaForQuestionAnswering(AnnotatorModel,
     provided.
 
     For available pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?task=Question+Answering>`__.
+    <https://sparknlp.org/models?task=Question+Answering>`__.
 
     To see which models are compatible and how to import them see
     `Import Transformers into Spark NLP 🚀
@@ -91,11 +92,6 @@ class XlmRoBertaForQuestionAnswering(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.CHUNK
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -115,16 +111,6 @@ class XlmRoBertaForQuestionAnswering(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process, by default 128.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForQuestionAnswering",
                  java_model=None):

sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py

@@ -20,7 +20,8 @@ class XlmRoBertaForSequenceClassification(AnnotatorModel,
                                           HasCaseSensitiveProperties,
                                           HasBatchedAnnotate,
                                           HasClassifierActivationProperties,
-                                          HasEngine):
+                                          HasEngine,
+                                          HasMaxSentenceLengthLimit):
     """XlmRoBertaForSequenceClassification can load XLM-RoBERTa Models with sequence classification/regression head on
     top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.
 
@@ -35,7 +36,7 @@ class XlmRoBertaForSequenceClassification(AnnotatorModel,
     provided.
 
     For available pretrained models please see the `Models Hub
-    <https://nlp.johnsnowlabs.com/models?task=Text+Classification>`__.
+    <https://sparknlp.org/models?task=Text+Classification>`__.
 
     To see which models are compatible and how to import them see
     `Import Transformers into Spark NLP 🚀
@@ -61,7 +62,7 @@ class XlmRoBertaForSequenceClassification(AnnotatorModel,
         Max sentence length to process, by default 128
     coalesceSentences
         Instead of 1 class per sentence (if inputCols is `sentence`) output
-        1 class per document by averaging probabilities in all sentences, by 
+        1 class per document by averaging probabilities in all sentences, by
        default False.
    activation
        Whether to calculate logits via Softmax or Sigmoid, by default
@@ -104,11 +105,6 @@ class XlmRoBertaForSequenceClassification(AnnotatorModel,
 
     outputAnnotatorType = AnnotatorType.CATEGORY
 
-    maxSentenceLength = Param(Params._dummy(),
-                              "maxSentenceLength",
-                              "Max sentence length to process",
-                              typeConverter=TypeConverters.toInt)
-
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
@@ -134,16 +130,6 @@ class XlmRoBertaForSequenceClassification(AnnotatorModel,
         """
         return self._set(configProtoBytes=b)
 
-    def setMaxSentenceLength(self, value):
-        """Sets max sentence length to process, by default 128.
-
-        Parameters
-        ----------
-        value : int
-            Max sentence length to process
-        """
-        return self._set(maxSentenceLength=value)
-
     def setCoalesceSentences(self, value):
         """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.
         Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feeding all the sentences