spark-nlp 4.2.6__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221) hide show
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains classes concerning HubertForCTC."""

from sparknlp.common import *


class HubertForCTC(AnnotatorModel,
                   HasBatchedAnnotateAudio,
                   HasAudioFeatureProperties,
                   HasEngine):
    """Hubert Model with a language modeling head on top for Connectionist
    Temporal Classification (CTC).

    Hubert was proposed in *HuBERT: Self-Supervised Speech Representation
    Learning by Masked Prediction of Hidden Units* by Wei-Ning Hsu, Benjamin
    Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov and
    Abdelrahman Mohamed.

    The annotator takes audio files and transcribes them as text. The audio
    needs to be provided pre-processed, as an array of floats.

    Note that this annotator is currently not supported on Apple Silicon
    processors such as the M1. This is due to the processor not supporting
    instructions for XLA.

    Pretrained models can be loaded with ``pretrained`` of the companion
    object:

    >>> speechToText = HubertForCTC.pretrained() \\
    ...     .setInputCols(["audio_assembler"]) \\
    ...     .setOutputCol("text")

    The default model is ``"asr_hubert_large_ls960"``, if no name is provided.

    For available pretrained models please see the
    `Models Hub <https://sparknlp.org/models>`__.

    To see which models are compatible and how to import them see
    https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and to see more
    extended examples, see
    `HubertForCTCTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/HubertForCTCTestSpec.scala>`__.

    References
    ----------

    `HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units
    <https://arxiv.org/abs/2106.07447>`__

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``AUDIO``              ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Size of each batch, by default 4

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> audioAssembler = AudioAssembler() \\
    ...     .setInputCol("audio_content") \\
    ...     .setOutputCol("audio_assembler")
    >>> speechToText = HubertForCTC \\
    ...     .pretrained() \\
    ...     .setInputCols(["audio_assembler"]) \\
    ...     .setOutputCol("text")
    >>> pipeline = Pipeline().setStages([audioAssembler, speechToText])
    >>> processedAudioFloats = spark.createDataFrame([[rawFloats]]).toDF("audio_content")
    >>> result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats)
    >>> result.select("text.result").show(truncate = False)
    +------------------------------------------------------------------------------------------+
    |result                                                                                    |
    +------------------------------------------------------------------------------------------+
    |[MISTER QUILTER IS THE APOSTLE OF THE MIDLE CLASES AND WE ARE GLAD TO WELCOME HIS GOSPEL ]|
    +------------------------------------------------------------------------------------------+
    """
    name = "HubertForCTC"

    inputAnnotatorTypes = [AnnotatorType.AUDIO]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    # Optional serialized TensorFlow session configuration.
    configProtoBytes = Param(
        Params._dummy(),
        "configProtoBytes",
        "ConfigProto from tensorflow, serialized into byte array. Get with "
        "config_proto.SerializeToString()",
        TypeConverters.toListInt)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    @keyword_only
    def __init__(self,
                 classname="com.johnsnowlabs.nlp.annotators.audio.HubertForCTC",
                 java_model=None):
        super(HubertForCTC, self).__init__(classname=classname,
                                           java_model=java_model)
        # Batches of 4 audio annotations per inference call by default.
        self._setDefault(batchSize=4)

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession

        Returns
        -------
        HubertForCTC
            The restored model
        """
        from sparknlp.internal import _HubertForCTC
        java_obj = _HubertForCTC(folder, spark_session._jsparkSession)._java_obj
        return HubertForCTC(java_model=java_obj)

    @staticmethod
    def pretrained(name="asr_hubert_large_ls960", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default
            "asr_hubert_large_ls960"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        HubertForCTC
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(HubertForCTC, name, lang, remote_loc)
@@ -21,46 +21,46 @@ class Wav2Vec2ForCTC(AnnotatorModel,
21
21
  HasBatchedAnnotateAudio,
22
22
  HasAudioFeatureProperties,
23
23
  HasEngine):
24
- """Wav2Vec2 Model with a language modeling head on top for Connectionist Temporal
24
+ """Wav2Vec2 Model with a language modeling head on top for Connectionist Temporal
25
25
  Classification (CTC). Wav2Vec2 was proposed in wav2vec 2.0: A Framework for
26
26
  Self-Supervised Learning of Speech Representations by Alexei Baevski, Henry Zhou,
27
27
  Abdelrahman Mohamed, Michael Auli.
28
-
28
+
29
29
  The annotator takes audio files and transcribes it as text. The audio needs to be
30
30
  provided pre-processed an array of floats.
31
-
31
+
32
32
  Note that this annotator is currently not supported on Apple Silicon processors such
33
33
  as the M1. This is due to the processor not supporting instructions for XLA.
34
-
34
+
35
35
  Pretrained models can be loaded with ``pretrained`` of the companion object:
36
-
36
+
37
37
  >>> speechToText = Wav2Vec2ForCTC.pretrained() \\
38
38
  ... .setInputCols(["audio_assembler"]) \\
39
39
  ... .setOutputCol("text")
40
-
41
-
40
+
41
+
42
42
  The default model is ``"asr_wav2vec2_base_960h"``, if no name is provided.
43
-
43
+
44
44
  For available pretrained models please see the
45
- `Models Hub <https://nlp.johnsnowlabs.com/models>`__.
46
-
45
+ `Models Hub <https://sparknlp.org/models>`__.
46
+
47
47
  To see which models are compatible and how to import them see
48
48
  https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and to see more extended
49
49
  examples, see
50
50
  `Wav2Vec2ForCTCTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/Wav2Vec2ForCTCTestSpec.scala>`__.
51
-
51
+
52
52
  ====================== ======================
53
53
  Input Annotation types Output Annotation type
54
54
  ====================== ======================
55
55
  ``AUDIO`` ``DOCUMENT``
56
56
  ====================== ======================
57
-
57
+
58
58
  Parameters
59
59
  ----------
60
-
60
+
61
61
  batchSize
62
62
  Size of each batch, by default 2
63
-
63
+
64
64
  Examples
65
65
  --------
66
66
  >>> import sparknlp
# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains classes concerning WhisperForCTC."""

from sparknlp.common import *


class WhisperForCTC(AnnotatorModel,
                    HasBatchedAnnotateAudio,
                    HasAudioFeatureProperties,
                    HasEngine, HasGeneratorProperties):
    """Whisper Model with a language modeling head on top for Connectionist Temporal Classification
    (CTC).

    Whisper is an automatic speech recognition (ASR) system trained on 680,000 hours of
    multilingual and multitask supervised data collected from the web. It can transcribe in
    multiple languages, as well as translate from those languages into English.

    The audio needs to be provided pre-processed, as an array of floats.

    Note that at the moment, this annotator only supports greedy search and only Spark Versions
    3.4 and up are supported.

    For multilingual models, the language and the task (transcribe or translate) can be set with
    ``setLanguage`` and ``setTask``.

    Pretrained models can be loaded with ``pretrained`` of the companion object:

    .. code-block:: python

        speechToText = WhisperForCTC.pretrained() \\
            .setInputCols(["audio_assembler"]) \\
            .setOutputCol("text")


    The default model is ``"asr_whisper_tiny_opt"``, if no name is provided.

    For available pretrained models please see the `Models Hub <https://sparknlp.org/models>`__.

    To see which models are compatible and how to import them see
    https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and to see more extended
    examples, see
    `WhisperForCTCTestSpec <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala>`__.

    **References:**

    `Robust Speech Recognition via Large-Scale Weak Supervision <https://arxiv.org/abs/2212.04356>`__

    **Paper Abstract:**

    *We study the capabilities of speech processing systems trained simply to predict large
    amounts of transcripts of audio on the internet. When scaled to 680,000 hours of multilingual
    and multitask supervision, the resulting models generalize well to standard benchmarks and are
    often competitive with prior fully supervised results but in a zero- shot transfer setting
    without the need for any fine- tuning. When compared to humans, the models approach their
    accuracy and robustness. We are releasing models and inference code to serve as a foundation
    for further work on robust speech processing.*

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``AUDIO``              ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    task
        The formatted task for the audio. Either `<|translate|>` or `<|transcribe|>`.
    language
        The language for the audio, formatted to e.g. `<|en|>`. Check the model description for
        supported languages.
    isMultilingual
        Whether the model is multilingual
    minOutputLength
        Minimum length of the sequence to be generated
    maxOutputLength
        Maximum length of output text
    doSample
        Whether or not to use sampling; use greedy decoding otherwise
    temperature
        The value used to module the next token probabilities
    topK
        The number of highest probability vocabulary tokens to keep for top-k-filtering
    topP
        If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are
        kept for generation
    repetitionPenalty
        The parameter for repetition penalty. 1.0 means no penalty.
        See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details
    noRepeatNgramSize
        If set to int > 0, all ngrams of that size can only occur once
    beamSize
        The Number of beams for beam search

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> audioAssembler = AudioAssembler() \\
    ...     .setInputCol("audio_content") \\
    ...     .setOutputCol("audio_assembler")
    >>> speechToText = WhisperForCTC.pretrained() \\
    ...     .setInputCols(["audio_assembler"]) \\
    ...     .setOutputCol("text")
    >>> pipeline = Pipeline().setStages([audioAssembler, speechToText])
    >>> processedAudioFloats = spark.createDataFrame([[rawFloats]]).toDF("audio_content")
    >>> result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats)
    >>> result.select("text.result").show(truncate = False)
    +------------------------------------------------------------------------------------------+
    |result                                                                                    |
    +------------------------------------------------------------------------------------------+
    |[ Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.]|
    +------------------------------------------------------------------------------------------+
    """
    name = "WhisperForCTC"

    inputAnnotatorTypes = [AnnotatorType.AUDIO]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    # Optional serialized TensorFlow session configuration.
    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with "
                             "config_proto.SerializeToString()",
                             TypeConverters.toListInt)

    language = Param(Params._dummy(), "language", "Optional parameter to set the language for the transcription.",
                     typeConverter=TypeConverters.toString)

    isMultilingual = Param(Params._dummy(), "isMultilingual", "Whether the model is multilingual.",
                           typeConverter=TypeConverters.toBoolean)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)

    def getLanguage(self):
        """Gets the language for the transcription."""
        return self.getOrDefault(self.language)

    def getIsMultilingual(self):
        """Gets whether the model is multilingual."""
        return self.getOrDefault(self.isMultilingual)

    def setLanguage(self, value):
        """Sets the language for the audio, formatted to e.g. `<|en|>`. Check the model description for
        supported languages.

        Parameters
        ----------
        value : String
            Formatted language code
        """
        # Delegates to the Java side so the model can validate the language code.
        return self._call_java("setLanguage", value)

    def setTask(self, value):
        """Sets the formatted task for the audio. Either `<|translate|>` or `<|transcribe|>`.

        Only multilingual models can do translation.

        Parameters
        ----------
        value : String
            Formatted task
        """
        # Delegates to the Java side so the model can validate the task token.
        return self._call_java("setTask", value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.audio.WhisperForCTC",
                 java_model=None):
        super(WhisperForCTC, self).__init__(
            classname=classname,
            java_model=java_model
        )
        # Defaults correspond to greedy decoding (topK=1, beamSize=1, no sampling).
        self._setDefault(
            minOutputLength=0,
            maxOutputLength=448,
            doSample=False,
            temperature=1.0,
            topK=1,
            topP=1.0,
            repetitionPenalty=1.0,
            noRepeatNgramSize=0,
            batchSize=2,
            beamSize=1,
            nReturnSequences=1,
            isMultilingual=True,
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession

        Returns
        -------
        WhisperForCTC
            The restored model
        """
        from sparknlp.internal import _WhisperForCTC
        jModel = _WhisperForCTC(folder, spark_session._jsparkSession)._java_obj
        return WhisperForCTC(java_model=jModel)

    @staticmethod
    def pretrained(name="asr_whisper_tiny_opt", lang="xx", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default
            "asr_whisper_tiny_opt"
        lang : str, optional
            Language of the pretrained model, by default "xx"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        WhisperForCTC
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(WhisperForCTC, name, lang, remote_loc)
@@ -21,11 +21,9 @@ from sparknlp.internal import AnnotatorTransformer
21
21
 
22
22
 
23
23
  class Chunk2Doc(AnnotatorTransformer, AnnotatorProperties):
24
- """Converts a ``CHUNK`` type column back into ``DOCUMENT``. Useful when
25
- trying to re-tokenize or do further analysis on a ``CHUNK`` result.
26
-
27
- For more extended examples on document pre-processing see the
28
- `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb>`__.
24
+ """Converts a ``CHUNK`` type column back into ``DOCUMENT``.
25
+
26
+ Useful when trying to re-tokenize or do further analysis on a ``CHUNK`` result.
29
27
 
30
28
  ====================== ======================
31
29
  Input Annotation types Output Annotation type
@@ -79,10 +77,9 @@ class Chunk2Doc(AnnotatorTransformer, AnnotatorProperties):
79
77
 
80
78
  @keyword_only
81
79
  def __init__(self):
82
- super(Chunk2Doc, self).__init__(classname="com.johnsnowlabs.nlp.Chunk2Doc")
80
+ super(Chunk2Doc, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Chunk2Doc")
83
81
 
84
82
  @keyword_only
85
83
  def setParams(self):
86
84
  kwargs = self._input_kwargs
87
85
  return self._set(**kwargs)
88
-
@@ -41,7 +41,7 @@ class Chunker(AnnotatorModel):
41
41
  treated as groups, so here specifically ``"<NNP>+"`` means 1 or more nouns
42
42
  in succession.
43
43
 
44
- For more extended examples see the `Spark NLP Workshop <https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/3.SparkNLP_Pretrained_Models.ipynb>`__.
44
+ For more extended examples see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/Chunk_Extraction_with_Chunker.ipynb>`__.
45
45
 
46
46
  ====================== ======================
47
47
  Input Annotation types Output Annotation type
@@ -135,4 +135,3 @@ class Chunker(AnnotatorModel):
135
135
  ... .setRegexParsers(["<NNP>+", "<NNS>+"])
136
136
  """
137
137
  return self._set(regexParsers=value)
138
-
@@ -42,3 +42,20 @@ from sparknlp.annotator.classifier_dl.xlnet_for_token_classification import *
42
42
  from sparknlp.annotator.classifier_dl.camembert_for_token_classification import *
43
43
  from sparknlp.annotator.classifier_dl.tapas_for_question_answering import *
44
44
  from sparknlp.annotator.classifier_dl.camembert_for_sequence_classification import *
45
+ from sparknlp.annotator.classifier_dl.camembert_for_question_answering import *
46
+ from sparknlp.annotator.classifier_dl.bert_for_zero_shot_classification import *
47
+ from sparknlp.annotator.classifier_dl.distil_bert_for_zero_shot_classification import *
48
+ from sparknlp.annotator.classifier_dl.roberta_for_zero_shot_classification import *
49
+ from sparknlp.annotator.classifier_dl.xlm_roberta_for_zero_shot_classification import *
50
+ from sparknlp.annotator.classifier_dl.bart_for_zero_shot_classification import *
51
+ from sparknlp.annotator.classifier_dl.deberta_for_zero_shot_classification import *
52
+ from sparknlp.annotator.classifier_dl.mpnet_for_sequence_classification import *
53
+ from sparknlp.annotator.classifier_dl.mpnet_for_question_answering import *
54
+ from sparknlp.annotator.classifier_dl.mpnet_for_token_classification import *
55
+ from sparknlp.annotator.classifier_dl.albert_for_zero_shot_classification import *
56
+ from sparknlp.annotator.classifier_dl.camembert_for_zero_shot_classification import *
57
+ from sparknlp.annotator.classifier_dl.bert_for_multiple_choice import *
58
+ from sparknlp.annotator.classifier_dl.xlm_roberta_for_multiple_choice import *
59
+ from sparknlp.annotator.classifier_dl.roberta_for_multiple_choice import *
60
+ from sparknlp.annotator.classifier_dl.distilbert_for_multiple_choice import *
61
+ from sparknlp.annotator.classifier_dl.albert_for_multiple_choice import *