spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py
@@ -0,0 +1,148 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from sparknlp.common import *
+
+
+ class MPNetForQuestionAnswering(AnnotatorModel,
+                                 HasCaseSensitiveProperties,
+                                 HasBatchedAnnotate,
+                                 HasEngine,
+                                 HasMaxSentenceLengthLimit):
+     """MPNetForQuestionAnswering can load MPNet Models with a span classification head on top for extractive
+     question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start
+     logits and span end logits).
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> spanClassifier = MPNetForQuestionAnswering.pretrained() \\
+     ...     .setInputCols(["document_question", "document_context"]) \\
+     ...     .setOutputCol("answer")
+
+     The default model is ``"mpnet_base_question_answering_squad2"``, if no name is
+     provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Question+Answering>`__.
+
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, DOCUMENT`` ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Batch size. Large values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         False
+     maxSentenceLength
+         Max sentence length to process, by default 384
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = MultiDocumentAssembler() \\
+     ...     .setInputCols(["question", "context"]) \\
+     ...     .setOutputCols(["document_question", "document_context"])
+     >>> spanClassifier = MPNetForQuestionAnswering.pretrained() \\
+     ...     .setInputCols(["document_question", "document_context"]) \\
+     ...     .setOutputCol("answer") \\
+     ...     .setCaseSensitive(False)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     spanClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", "context")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("answer.result").show(truncate=False)
+     +--------------------+
+     |result              |
+     +--------------------+
+     |[Clara]             |
+     +--------------------+
+     """
+     name = "MPNetForQuestionAnswering"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.MPNetForQuestionAnswering",
+                  java_model=None):
+         super(MPNetForQuestionAnswering, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=384,
+             caseSensitive=False
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         MPNetForQuestionAnswering
+             The restored model
+         """
+         from sparknlp.internal import _MPNetForQuestionAnsweringLoader
+         jModel = _MPNetForQuestionAnsweringLoader(folder, spark_session._jsparkSession)._java_obj
+         return MPNetForQuestionAnswering(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="mpnet_base_question_answering_squad2", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "mpnet_base_question_answering_squad2"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         MPNetForQuestionAnswering
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(MPNetForQuestionAnswering, name, lang, remote_loc)
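The loadSavedModel/pretrained pair above follows the usual Spark NLP import workflow. A minimal sketch of that round trip (the folder paths are hypothetical, and the export folder is assumed to contain an MPNet QA model prepared as described in the "Import Transformers into Spark NLP" discussion linked in the docstring):

    import sparknlp
    from sparknlp.annotator import MPNetForQuestionAnswering

    spark = sparknlp.start()

    # Load the locally exported model, then configure it like any annotator.
    span_classifier = MPNetForQuestionAnswering.loadSavedModel("/tmp/exported_mpnet_qa", spark) \
        .setInputCols(["document_question", "document_context"]) \
        .setOutputCol("answer")

    # Persist it as a Spark NLP model so later sessions can reload it directly,
    # without going through the export folder again.
    span_classifier.write().overwrite().save("/tmp/mpnet_qa_spark_nlp")
    restored = MPNetForQuestionAnswering.load("/tmp/mpnet_qa_spark_nlp")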
sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py
@@ -0,0 +1,188 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for MPNetForSequenceClassification."""
+
+ from sparknlp.common import *
+
+
+ class MPNetForSequenceClassification(AnnotatorModel,
+                                      HasCaseSensitiveProperties,
+                                      HasBatchedAnnotate,
+                                      HasClassifierActivationProperties,
+                                      HasEngine,
+                                      HasMaxSentenceLengthLimit):
+     """MPNetForSequenceClassification can load MPNet Models with a sequence classification/regression head on
+     top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> sequenceClassifier = MPNetForSequenceClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label")
+
+     The default model is ``"mpnet_sequence_classifier_ukr_message"``, if no name is
+     provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Text+Classification>`__.
+
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``CATEGORY``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Batch size. Large values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     maxSentenceLength
+         Max sentence length to process, by default 128
+     coalesceSentences
+         Instead of 1 class per sentence (if inputCols is `sentence`) output
+         1 class per document by averaging probabilities in all sentences, by
+         default False.
+     activation
+         Whether to calculate logits via Softmax or Sigmoid, by default
+         `"softmax"`.
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> document = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> sequenceClassifier = MPNetForSequenceClassification \\
+     ...     .pretrained() \\
+     ...     .setInputCols(["document", "token"]) \\
+     ...     .setOutputCol("label")
+     >>> data = spark.createDataFrame([
+     ...     ["I love driving my car."],
+     ...     ["The next bus will arrive in 20 minutes."],
+     ...     ["pineapple on pizza is the worst 🤮"],
+     ... ]).toDF("text")
+     >>> pipeline = Pipeline().setStages([document, tokenizer, sequenceClassifier])
+     >>> pipelineModel = pipeline.fit(data)
+     >>> results = pipelineModel.transform(data)
+     >>> results.select("label.result").show()
+     +--------------------+
+     |              result|
+     +--------------------+
+     |     [TRANSPORT/CAR]|
+     |[TRANSPORT/MOVEMENT]|
+     |              [FOOD]|
+     +--------------------+
+     """
+     name = "MPNetForSequenceClassification"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.CATEGORY
+
+
+     coalesceSentences = Param(Params._dummy(), "coalesceSentences",
+                               "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.",
+                               TypeConverters.toBoolean)
+
+     def getClasses(self):
+         """
+         Returns labels used to train this model
+         """
+         return self._call_java("getClasses")
+
+
+     def setCoalesceSentences(self, value):
+         """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.
+         Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feeding all the sentences
+         into the model and averaging all the probabilities for the entire document instead of probabilities per sentence. (Default: False)
+
+         Parameters
+         ----------
+         value : bool
+             If the output of all sentences will be averaged to one output
+         """
+         return self._set(coalesceSentences=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.MPNetForSequenceClassification",
+                  java_model=None):
+         super(MPNetForSequenceClassification, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True,
+             coalesceSentences=False,
+             activation="softmax"
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         MPNetForSequenceClassification
+             The restored model
+         """
+         from sparknlp.internal import _MPNetForSequenceClassificationLoader
+         jModel = _MPNetForSequenceClassificationLoader(folder, spark_session._jsparkSession)._java_obj
+         return MPNetForSequenceClassification(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="mpnet_sequence_classifier_ukr_message", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "mpnet_sequence_classifier_ukr_message"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         MPNetForSequenceClassification
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(MPNetForSequenceClassification, name, lang, remote_loc)
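The coalesceSentences parameter is easiest to see with sentence-level input: each sentence is classified, then the probabilities are averaged into one label per document. A minimal sketch of that wiring, assuming the default pretrained model and illustrative column names:

    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import SentenceDetector, Tokenizer, MPNetForSequenceClassification

    document = DocumentAssembler().setInputCol("text").setOutputCol("document")
    sentence = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
    token = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

    classifier = MPNetForSequenceClassification.pretrained() \
        .setInputCols(["sentence", "token"]) \
        .setOutputCol("label") \
        .setCoalesceSentences(True)  # average sentence probabilities into one label per document

    pipeline = Pipeline().setStages([document, sentence, token, classifier])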
sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py
@@ -0,0 +1,173 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for MPNetForTokenClassification."""
+
+ from sparknlp.common import *
+
+
+ class MPNetForTokenClassification(AnnotatorModel,
+                                   HasCaseSensitiveProperties,
+                                   HasBatchedAnnotate,
+                                   HasEngine,
+                                   HasMaxSentenceLengthLimit):
+     """MPNetForTokenClassification can load MPNet Models with a token
+     classification head on top (a linear layer on top of the hidden-states
+     output) e.g. for Named-Entity-Recognition (NER) tasks.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> token_classifier = MPNetForTokenClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label")
+     The default model is ``"mpnet_base_token_classifier"``, if no
+     name is provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Named+Entity+Recognition>`__.
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, TOKEN``    ``NAMED_ENTITY``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Batch size. Large values allow faster processing but require more
+         memory, by default 8
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         True
+     configProtoBytes
+         ConfigProto from tensorflow, serialized into byte array.
+     maxSentenceLength
+         Max sentence length to process, by default 128
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = DocumentAssembler() \\
+     ...     .setInputCol("text") \\
+     ...     .setOutputCol("document")
+     >>> tokenizer = Tokenizer() \\
+     ...     .setInputCols(["document"]) \\
+     ...     .setOutputCol("token")
+     >>> tokenClassifier = MPNetForTokenClassification.pretrained() \\
+     ...     .setInputCols(["token", "document"]) \\
+     ...     .setOutputCol("label") \\
+     ...     .setCaseSensitive(True)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     tokenizer,
+     ...     tokenClassifier
+     ... ])
+     >>> data = spark.createDataFrame([["John Lennon was born in London and lived in Paris. My name is Sarah and I live in London"]]).toDF("text")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("label.result").show(truncate=False)
+     +------------------------------------------------------------------------------------+
+     |result                                                                              |
+     +------------------------------------------------------------------------------------+
+     |[B-PER, I-PER, O, O, O, B-LOC, O, O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O, B-LOC]|
+     +------------------------------------------------------------------------------------+
+     """
+     name = "MPNetForTokenClassification"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+     outputAnnotatorType = AnnotatorType.NAMED_ENTITY
+
+     configProtoBytes = Param(Params._dummy(),
+                              "configProtoBytes",
+                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                              TypeConverters.toListInt)
+
+     def getClasses(self):
+         """
+         Returns labels used to train this model
+         """
+         return self._call_java("getClasses")
+
+     def setConfigProtoBytes(self, b):
+         """Sets configProto from tensorflow, serialized into byte array.
+
+         Parameters
+         ----------
+         b : List[int]
+             ConfigProto from tensorflow, serialized into byte array
+         """
+         return self._set(configProtoBytes=b)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.MPNetForTokenClassification",
+                  java_model=None):
+         super(MPNetForTokenClassification, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=8,
+             maxSentenceLength=128,
+             caseSensitive=True
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         MPNetForTokenClassification
+             The restored model
+         """
+         from sparknlp.internal import _MPNetForTokenClassifierLoader
+         jModel = _MPNetForTokenClassifierLoader(folder, spark_session._jsparkSession)._java_obj
+         return MPNetForTokenClassification(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="mpnet_base_token_classifier", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "mpnet_base_token_classifier"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         MPNetForTokenClassification
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(MPNetForTokenClassification, name, lang, remote_loc)
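Because this annotator emits per-token NAMED_ENTITY tags, it pairs naturally with NerConverter (also touched in this release, see ner_converter.py in the file list) to merge IOB tags into entity chunks. A minimal sketch extending the docstring pipeline, with column names following that example:

    from sparknlp.annotator import NerConverter

    ner_converter = NerConverter() \
        .setInputCols(["document", "token", "label"]) \
        .setOutputCol("entities")

    # Appended after tokenClassifier in the docstring pipeline, this merges
    # B-PER/I-PER style tags into chunks such as "John Lennon" or "London".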
sparknlp/annotator/classifier_dl/multi_classifier_dl.py
@@ -68,7 +68,7 @@ class MultiClassifierDLApproach(AnnotatorApproach, EvaluationDLParams, Classifie
      ...     .setLabelColumn("label") \\
      ...     .setTestDataset("test_data")
  
-     For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/MultiClassifierDL_train_multi_label_E2E_challenge_classifier.ipynb>`__.
+     For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_E2E_challenge_classifier.ipynb>`__.
  
      ======================= ======================
      Input Annotation types  Output Annotation type
@@ -265,9 +265,9 @@ class MultiClassifierDLModel(AnnotatorModel, HasStorageRef, HasEngine):
  
      The data is based on the
      `Jigsaw Toxic Comment Classification Challenge <https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/overview>`__.
-     For available pretrained models please see the `Models Hub <https://nlp.johnsnowlabs.com/models?task=Text+Classification>`__.
+     For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Text+Classification>`__.
  
-     For extended examples of usage, see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/MultiClassifierDL_train_multi_label_E2E_challenge_classifier.ipynb>`__.
+     For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/classification/MultiClassifierDL_train_multi_label_E2E_challenge_classifier.ipynb>`__.
  
      ======================= ======================
      Input Annotation types  Output Annotation type
sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py
@@ -0,0 +1,161 @@
+ # Copyright 2017-2025 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from sparknlp.common import *
+
+ class RoBertaForMultipleChoice(AnnotatorModel,
+                                HasCaseSensitiveProperties,
+                                HasBatchedAnnotate,
+                                HasEngine,
+                                HasMaxSentenceLengthLimit):
+     """RoBertaForMultipleChoice can load RoBERTa Models with a multiple choice classification head on top
+     (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
+
+     Pretrained models can be loaded with :meth:`.pretrained` of the companion
+     object:
+
+     >>> spanClassifier = RoBertaForMultipleChoice.pretrained() \\
+     ...     .setInputCols(["document_question", "document_context"]) \\
+     ...     .setOutputCol("answer")
+
+     The default model is ``"roberta_base_uncased_multiple_choice"``, if no name is
+     provided.
+
+     For available pretrained models please see the `Models Hub
+     <https://sparknlp.org/models?task=Multiple+Choice>`__.
+
+     To see which models are compatible and how to import them see
+     `Import Transformers into Spark NLP 🚀
+     <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+     ====================== ======================
+     Input Annotation types Output Annotation type
+     ====================== ======================
+     ``DOCUMENT, DOCUMENT`` ``CHUNK``
+     ====================== ======================
+
+     Parameters
+     ----------
+     batchSize
+         Batch size. Large values allow faster processing but require more
+         memory, by default 4
+     caseSensitive
+         Whether to ignore case in tokens for embeddings matching, by default
+         False
+     maxSentenceLength
+         Max sentence length to process, by default 512
+
+     Examples
+     --------
+     >>> import sparknlp
+     >>> from sparknlp.base import *
+     >>> from sparknlp.annotator import *
+     >>> from pyspark.ml import Pipeline
+     >>> documentAssembler = MultiDocumentAssembler() \\
+     ...     .setInputCols(["question", "context"]) \\
+     ...     .setOutputCols(["document_question", "document_context"])
+     >>> questionAnswering = RoBertaForMultipleChoice.pretrained() \\
+     ...     .setInputCols(["document_question", "document_context"]) \\
+     ...     .setOutputCol("answer") \\
+     ...     .setCaseSensitive(False)
+     >>> pipeline = Pipeline().setStages([
+     ...     documentAssembler,
+     ...     questionAnswering
+     ... ])
+     >>> data = spark.createDataFrame([["The Eiffel Tower is located in which country?", "Germany, France, Italy"]]).toDF("question", "context")
+     >>> result = pipeline.fit(data).transform(data)
+     >>> result.select("answer.result").show(truncate=False)
+     +--------------------+
+     |result              |
+     +--------------------+
+     |[France]            |
+     +--------------------+
+     """
+     name = "RoBertaForMultipleChoice"
+
+     inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
+     outputAnnotatorType = AnnotatorType.CHUNK
+
+     choicesDelimiter = Param(Params._dummy(),
+                              "choicesDelimiter",
+                              "Delimiter character used to split the choices",
+                              TypeConverters.toString)
+
+     def setChoicesDelimiter(self, value):
+         """Sets the delimiter character used to split the choices.
+
+         Parameters
+         ----------
+         value : string
+             Delimiter character used to split the choices
+         """
+         return self._set(choicesDelimiter=value)
+
+     @keyword_only
+     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForMultipleChoice",
+                  java_model=None):
+         super(RoBertaForMultipleChoice, self).__init__(
+             classname=classname,
+             java_model=java_model
+         )
+         self._setDefault(
+             batchSize=4,
+             maxSentenceLength=512,
+             caseSensitive=False,
+             choicesDelimiter=","
+         )
+
+     @staticmethod
+     def loadSavedModel(folder, spark_session):
+         """Loads a locally saved model.
+
+         Parameters
+         ----------
+         folder : str
+             Folder of the saved model
+         spark_session : pyspark.sql.SparkSession
+             The current SparkSession
+
+         Returns
+         -------
+         RoBertaForMultipleChoice
+             The restored model
+         """
+         from sparknlp.internal import _RoBertaMultipleChoiceLoader
+         jModel = _RoBertaMultipleChoiceLoader(folder, spark_session._jsparkSession)._java_obj
+         return RoBertaForMultipleChoice(java_model=jModel)
+
+     @staticmethod
+     def pretrained(name="roberta_base_uncased_multiple_choice", lang="en", remote_loc=None):
+         """Downloads and loads a pretrained model.
+
+         Parameters
+         ----------
+         name : str, optional
+             Name of the pretrained model, by default
+             "roberta_base_uncased_multiple_choice"
+         lang : str, optional
+             Language of the pretrained model, by default "en"
+         remote_loc : str, optional
+             Optional remote address of the resource, by default None. Will use
+             Spark NLP's repositories otherwise.
+
+         Returns
+         -------
+         RoBertaForMultipleChoice
+             The restored model
+         """
+         from sparknlp.pretrained import ResourceDownloader
+         return ResourceDownloader.downloadModel(RoBertaForMultipleChoice, name, lang, remote_loc)
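The choicesDelimiter parameter (default ",") controls how the context column is split into candidate answers. A minimal sketch with a custom delimiter, relying on the setChoicesDelimiter setter above; the data values are illustrative:

    # Candidate answers separated by " | " instead of the default ",".
    choices_qa = RoBertaForMultipleChoice.pretrained() \
        .setInputCols(["document_question", "document_context"]) \
        .setOutputCol("answer") \
        .setChoicesDelimiter(" | ")

    data = spark.createDataFrame(
        [["The Eiffel Tower is located in which country?", "Germany | France | Italy"]]
    ).toDF("question", "context")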