spark-nlp 4.2.6 (py2.py3-none-any.whl) → 6.2.1 (py2.py3-none-any.whl)

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (221)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  4. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  5. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  6. sparknlp/__init__.py +81 -28
  7. sparknlp/annotation.py +3 -2
  8. sparknlp/annotator/__init__.py +6 -0
  9. sparknlp/annotator/audio/__init__.py +2 -0
  10. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  11. sparknlp/annotator/audio/wav2vec2_for_ctc.py +14 -14
  12. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  13. sparknlp/{base → annotator}/chunk2_doc.py +4 -7
  14. sparknlp/annotator/chunker.py +1 -2
  15. sparknlp/annotator/classifier_dl/__init__.py +17 -0
  16. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  17. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +3 -15
  18. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +4 -18
  19. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +3 -17
  20. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  21. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  22. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  23. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +6 -20
  24. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +3 -17
  25. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +3 -17
  26. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  27. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  28. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +5 -19
  29. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +5 -19
  30. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  31. sparknlp/annotator/classifier_dl/classifier_dl.py +4 -4
  32. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +3 -17
  33. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +4 -19
  34. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +5 -21
  35. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  36. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +3 -17
  37. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +4 -18
  38. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +3 -17
  39. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  40. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  41. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +3 -17
  42. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +4 -18
  43. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +3 -17
  44. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  45. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  46. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  47. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +3 -3
  48. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  49. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +3 -17
  50. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +4 -18
  51. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +1 -1
  52. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  53. sparknlp/annotator/classifier_dl/sentiment_dl.py +4 -4
  54. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +2 -2
  55. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  56. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +3 -17
  57. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +4 -18
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +6 -20
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  60. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +4 -18
  61. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +3 -17
  62. sparknlp/annotator/cleaners/__init__.py +15 -0
  63. sparknlp/annotator/cleaners/cleaner.py +202 -0
  64. sparknlp/annotator/cleaners/extractor.py +191 -0
  65. sparknlp/annotator/coref/spanbert_coref.py +4 -18
  66. sparknlp/annotator/cv/__init__.py +15 -0
  67. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  68. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  69. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  70. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  71. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  72. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  73. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  74. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  75. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  76. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  77. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  78. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  79. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  80. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  81. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  82. sparknlp/annotator/cv/vit_for_image_classification.py +36 -4
  83. sparknlp/annotator/dataframe_optimizer.py +216 -0
  84. sparknlp/annotator/date2_chunk.py +88 -0
  85. sparknlp/annotator/dependency/dependency_parser.py +2 -3
  86. sparknlp/annotator/dependency/typed_dependency_parser.py +3 -4
  87. sparknlp/annotator/document_character_text_splitter.py +228 -0
  88. sparknlp/annotator/document_normalizer.py +37 -1
  89. sparknlp/annotator/document_token_splitter.py +175 -0
  90. sparknlp/annotator/document_token_splitter_test.py +85 -0
  91. sparknlp/annotator/embeddings/__init__.py +11 -0
  92. sparknlp/annotator/embeddings/albert_embeddings.py +4 -18
  93. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  94. sparknlp/annotator/embeddings/bert_embeddings.py +9 -22
  95. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +12 -24
  96. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  97. sparknlp/annotator/embeddings/camembert_embeddings.py +4 -20
  98. sparknlp/annotator/embeddings/chunk_embeddings.py +1 -2
  99. sparknlp/annotator/embeddings/deberta_embeddings.py +2 -16
  100. sparknlp/annotator/embeddings/distil_bert_embeddings.py +5 -19
  101. sparknlp/annotator/embeddings/doc2vec.py +7 -1
  102. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  103. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  104. sparknlp/annotator/embeddings/elmo_embeddings.py +2 -2
  105. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  106. sparknlp/annotator/embeddings/longformer_embeddings.py +3 -17
  107. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  108. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  109. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  110. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  111. sparknlp/annotator/embeddings/roberta_embeddings.py +9 -21
  112. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +7 -21
  113. sparknlp/annotator/embeddings/sentence_embeddings.py +2 -3
  114. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  115. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  116. sparknlp/annotator/embeddings/universal_sentence_encoder.py +3 -3
  117. sparknlp/annotator/embeddings/word2vec.py +7 -1
  118. sparknlp/annotator/embeddings/word_embeddings.py +4 -5
  119. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +9 -21
  120. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +7 -21
  121. sparknlp/annotator/embeddings/xlnet_embeddings.py +4 -18
  122. sparknlp/annotator/er/entity_ruler.py +37 -23
  123. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +2 -3
  124. sparknlp/annotator/ld_dl/language_detector_dl.py +2 -2
  125. sparknlp/annotator/lemmatizer.py +3 -4
  126. sparknlp/annotator/matcher/date_matcher.py +35 -3
  127. sparknlp/annotator/matcher/multi_date_matcher.py +1 -2
  128. sparknlp/annotator/matcher/regex_matcher.py +3 -3
  129. sparknlp/annotator/matcher/text_matcher.py +2 -3
  130. sparknlp/annotator/n_gram_generator.py +1 -2
  131. sparknlp/annotator/ner/__init__.py +3 -1
  132. sparknlp/annotator/ner/ner_converter.py +18 -0
  133. sparknlp/annotator/ner/ner_crf.py +4 -5
  134. sparknlp/annotator/ner/ner_dl.py +10 -5
  135. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  136. sparknlp/annotator/ner/ner_overwriter.py +2 -2
  137. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  138. sparknlp/annotator/normalizer.py +2 -2
  139. sparknlp/annotator/openai/__init__.py +16 -0
  140. sparknlp/annotator/openai/openai_completion.py +349 -0
  141. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  142. sparknlp/annotator/pos/perceptron.py +6 -7
  143. sparknlp/annotator/sentence/sentence_detector.py +2 -2
  144. sparknlp/annotator/sentence/sentence_detector_dl.py +3 -3
  145. sparknlp/annotator/sentiment/sentiment_detector.py +4 -5
  146. sparknlp/annotator/sentiment/vivekn_sentiment.py +4 -5
  147. sparknlp/annotator/seq2seq/__init__.py +17 -0
  148. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  149. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  150. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  151. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  152. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  153. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  154. sparknlp/annotator/seq2seq/gpt2_transformer.py +1 -1
  155. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  156. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  157. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  158. sparknlp/annotator/seq2seq/marian_transformer.py +124 -3
  159. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  160. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  161. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  162. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  163. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  164. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  165. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  166. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  167. sparknlp/annotator/seq2seq/t5_transformer.py +54 -4
  168. sparknlp/annotator/similarity/__init__.py +0 -0
  169. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  170. sparknlp/annotator/spell_check/context_spell_checker.py +116 -17
  171. sparknlp/annotator/spell_check/norvig_sweeting.py +3 -6
  172. sparknlp/annotator/spell_check/symmetric_delete.py +1 -1
  173. sparknlp/annotator/stemmer.py +2 -3
  174. sparknlp/annotator/stop_words_cleaner.py +3 -4
  175. sparknlp/annotator/tf_ner_dl_graph_builder.py +1 -1
  176. sparknlp/annotator/token/__init__.py +0 -1
  177. sparknlp/annotator/token/recursive_tokenizer.py +2 -3
  178. sparknlp/annotator/token/tokenizer.py +2 -3
  179. sparknlp/annotator/ws/word_segmenter.py +35 -10
  180. sparknlp/base/__init__.py +2 -3
  181. sparknlp/base/doc2_chunk.py +0 -3
  182. sparknlp/base/document_assembler.py +5 -5
  183. sparknlp/base/embeddings_finisher.py +14 -2
  184. sparknlp/base/finisher.py +15 -4
  185. sparknlp/base/gguf_ranking_finisher.py +234 -0
  186. sparknlp/base/image_assembler.py +69 -0
  187. sparknlp/base/light_pipeline.py +53 -21
  188. sparknlp/base/multi_document_assembler.py +9 -13
  189. sparknlp/base/prompt_assembler.py +207 -0
  190. sparknlp/base/token_assembler.py +1 -2
  191. sparknlp/common/__init__.py +2 -0
  192. sparknlp/common/annotator_type.py +1 -0
  193. sparknlp/common/completion_post_processing.py +37 -0
  194. sparknlp/common/match_strategy.py +33 -0
  195. sparknlp/common/properties.py +914 -9
  196. sparknlp/internal/__init__.py +841 -116
  197. sparknlp/internal/annotator_java_ml.py +1 -1
  198. sparknlp/internal/annotator_transformer.py +3 -0
  199. sparknlp/logging/comet.py +2 -2
  200. sparknlp/partition/__init__.py +16 -0
  201. sparknlp/partition/partition.py +244 -0
  202. sparknlp/partition/partition_properties.py +902 -0
  203. sparknlp/partition/partition_transformer.py +200 -0
  204. sparknlp/pretrained/pretrained_pipeline.py +1 -1
  205. sparknlp/pretrained/resource_downloader.py +126 -2
  206. sparknlp/reader/__init__.py +15 -0
  207. sparknlp/reader/enums.py +19 -0
  208. sparknlp/reader/pdf_to_text.py +190 -0
  209. sparknlp/reader/reader2doc.py +124 -0
  210. sparknlp/reader/reader2image.py +136 -0
  211. sparknlp/reader/reader2table.py +44 -0
  212. sparknlp/reader/reader_assembler.py +159 -0
  213. sparknlp/reader/sparknlp_reader.py +461 -0
  214. sparknlp/training/__init__.py +1 -0
  215. sparknlp/training/conll.py +8 -2
  216. sparknlp/training/spacy_to_annotation.py +57 -0
  217. sparknlp/util.py +26 -0
  218. spark_nlp-4.2.6.dist-info/METADATA +0 -1256
  219. spark_nlp-4.2.6.dist-info/RECORD +0 -196
  220. {spark_nlp-4.2.6.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
  221. /sparknlp/annotator/{token/token2_chunk.py → token2_chunk.py} +0 -0
sparknlp/annotator/seq2seq/cohere_transformer.py (added)
@@ -0,0 +1,357 @@
+ # Copyright 2017-2022 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the CoHereTransformer."""
+
+ from sparknlp.common import *
+
+
+ class CoHereTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
+ """Cohere: Command-R Transformer
+
+ C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.
+ Command-R is a large language model with open weights optimized for a variety of use cases including reasoning,
+ summarization, and question answering. Command-R has the capability for multilingual generation evaluated
+ in 10 languages and highly performant RAG capabilities.
+
+ Pretrained models can be loaded with :meth:`.pretrained` of the companion
+ object:
+
+ >>> CoHere = CoHereTransformer.pretrained() \\
+ ... .setInputCols(["document"]) \\
+ ... .setOutputCol("generation")
+
+
+ The default model is ``"c4ai_command_r_v01_int4"``, if no name is provided. For available
+ pretrained models please see the `Models Hub
+ <https://sparknlp.org/models?q=CoHere>`__.
+
+ ====================== ======================
+ Input Annotation types Output Annotation type
+ ====================== ======================
+ ``DOCUMENT`` ``DOCUMENT``
+ ====================== ======================
+
+ Parameters
+ ----------
+ configProtoBytes
+ ConfigProto from tensorflow, serialized into byte array.
+ minOutputLength
+ Minimum length of the sequence to be generated, by default 0
+ maxOutputLength
+ Maximum length of output text, by default 60
+ doSample
+ Whether or not to use sampling; use greedy decoding otherwise, by default False
+ temperature
+ The value used to modulate the next token probabilities, by default 1.0
+ topK
+ The number of highest probability vocabulary tokens to keep for
+ top-k-filtering, by default 40
+ topP
+ Top cumulative probability for vocabulary tokens, by default 1.0
+
+ If set to float < 1, only the most probable tokens with probabilities
+ that add up to ``topP`` or higher are kept for generation.
+ repetitionPenalty
+ The parameter for repetition penalty, 1.0 means no penalty. , by default
+ 1.0
+ noRepeatNgramSize
+ If set to int > 0, all ngrams of that size can only occur once, by
+ default 0
+ ignoreTokenIds
+ A list of token ids which are ignored in the decoder's output, by
+ default []
+
+ Notes
+ -----
+ This is a very computationally expensive module, especially on larger
+ sequences. The use of an accelerator such as GPU is recommended.
+
+ References
+ ----------
+ - `Cohere <https://cohere.for.ai/>`__
+
+
+ Examples
+ --------
+ >>> import sparknlp
+ >>> from sparknlp.base import *
+ >>> from sparknlp.annotator import *
+ >>> from pyspark.ml import Pipeline
+ >>> documentAssembler = DocumentAssembler() \\
+ ... .setInputCol("text") \\
+ ... .setOutputCol("documents")
+ >>> CoHere = CoHereTransformer.pretrained("c4ai_command_r_v01_int4","en") \\
+ ... .setInputCols(["documents"]) \\
+ ... .setMaxOutputLength(60) \\
+ ... .setOutputCol("generation")
+ >>> pipeline = Pipeline().setStages([documentAssembler, CoHere])
+ >>> data = spark.createDataFrame([
+ ... (
+ ... 1,
+ ... "<BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+ ... )
+ ... ]).toDF("id", "text")
+ >>> result = pipeline.fit(data).transform(data)
+ >>> result.select("generation.result").show(truncate=False)
+ +------------------------------------------------+
+ |result |
+ +------------------------------------------------+
+ |[Hello! I'm doing well, thank you for asking! I'm excited to help you with whatever questions you have today. How can I assist you?]|
+ +------------------------------------------------+
+ """
+
+ name = "CoHereTransformer"
+
+ inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+ outputAnnotatorType = AnnotatorType.DOCUMENT
+
+ configProtoBytes = Param(Params._dummy(),
+ "configProtoBytes",
+ "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+ TypeConverters.toListInt)
+
+ minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+ typeConverter=TypeConverters.toInt)
+
+ maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+ typeConverter=TypeConverters.toInt)
+
+ doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+ typeConverter=TypeConverters.toBoolean)
+
+ temperature = Param(Params._dummy(), "temperature", "The value used to module the next token probabilities",
+ typeConverter=TypeConverters.toFloat)
+
+ topK = Param(Params._dummy(), "topK",
+ "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+ typeConverter=TypeConverters.toInt)
+
+ topP = Param(Params._dummy(), "topP",
+ "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+ typeConverter=TypeConverters.toFloat)
+
+ repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+ "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+ typeConverter=TypeConverters.toFloat)
+
+ noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+ "If set to int > 0, all ngrams of that size can only occur once",
+ typeConverter=TypeConverters.toInt)
+
+ ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+ "A list of token ids which are ignored in the decoder's output",
+ typeConverter=TypeConverters.toListInt)
+
+ beamSize = Param(Params._dummy(), "beamSize",
+ "The number of beams to use for beam search",
+ typeConverter=TypeConverters.toInt)
+
+ stopTokenIds = Param(Params._dummy(), "stopTokenIds",
+ "A list of token ids which are considered as stop tokens in the decoder's output",
+ typeConverter=TypeConverters.toListInt)
+
+ def setIgnoreTokenIds(self, value):
+ """A list of token ids which are ignored in the decoder's output.
+
+ Parameters
+ ----------
+ value : List[int]
+ The words to be filtered out
+ """
+ return self._set(ignoreTokenIds=value)
+
+ def setConfigProtoBytes(self, b):
+ """Sets configProto from tensorflow, serialized into byte array.
+
+ Parameters
+ ----------
+ b : List[int]
+ ConfigProto from tensorflow, serialized into byte array
+ """
+ return self._set(configProtoBytes=b)
+
+ def setMinOutputLength(self, value):
+ """Sets minimum length of the sequence to be generated.
+
+ Parameters
+ ----------
+ value : int
+ Minimum length of the sequence to be generated
+ """
+ return self._set(minOutputLength=value)
+
+ def setMaxOutputLength(self, value):
+ """Sets maximum length of output text.
+
+ Parameters
+ ----------
+ value : int
+ Maximum length of output text
+ """
+ return self._set(maxOutputLength=value)
+
+ def setDoSample(self, value):
+ """Sets whether or not to use sampling, use greedy decoding otherwise.
+
+ Parameters
+ ----------
+ value : bool
+ Whether or not to use sampling; use greedy decoding otherwise
+ """
+ return self._set(doSample=value)
+
+ def setTemperature(self, value):
+ """Sets the value used to module the next token probabilities.
+
+ Parameters
+ ----------
+ value : float
+ The value used to module the next token probabilities
+ """
+ return self._set(temperature=value)
+
+ def setTopK(self, value):
+ """Sets the number of highest probability vocabulary tokens to keep for
+ top-k-filtering.
+
+ Parameters
+ ----------
+ value : int
+ Number of highest probability vocabulary tokens to keep
+ """
+ return self._set(topK=value)
+
+ def setTopP(self, value):
+ """Sets the top cumulative probability for vocabulary tokens.
+
+ If set to float < 1, only the most probable tokens with probabilities
+ that add up to ``topP`` or higher are kept for generation.
+
+ Parameters
+ ----------
+ value : float
+ Cumulative probability for vocabulary tokens
+ """
+ return self._set(topP=value)
+
+ def setRepetitionPenalty(self, value):
+ """Sets the parameter for repetition penalty. 1.0 means no penalty.
+
+ Parameters
+ ----------
+ value : float
+ The repetition penalty
+
+ References
+ ----------
+ See `Ctrl: A Conditional Transformer Language Model For Controllable
+ Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+ """
+ return self._set(repetitionPenalty=value)
+
+ def setNoRepeatNgramSize(self, value):
+ """Sets size of n-grams that can only occur once.
+
+ If set to int > 0, all ngrams of that size can only occur once.
+
+ Parameters
+ ----------
+ value : int
+ N-gram size can only occur once
+ """
+ return self._set(noRepeatNgramSize=value)
+
+ def setBeamSize(self, value):
+ """Sets the number of beams to use for beam search.
+
+ Parameters
+ ----------
+ value : int
+ The number of beams to use for beam search
+ """
+ return self._set(beamSize=value)
+
+ def setStopTokenIds(self, value):
+ """Sets a list of token ids which are considered as stop tokens in the decoder's output.
+
+ Parameters
+ ----------
+ value : List[int]
+ The words to be considered as stop tokens
+ """
+ return self._set(stopTokenIds=value)
+
+ @keyword_only
+ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.CoHereTransformer", java_model=None):
+ super(CoHereTransformer, self).__init__(
+ classname=classname,
+ java_model=java_model
+ )
+ self._setDefault(
+ minOutputLength=0,
+ maxOutputLength=20,
+ doSample=False,
+ temperature=0.6,
+ topK=-1,
+ topP=0.9,
+ repetitionPenalty=1.0,
+ noRepeatNgramSize=3,
+ ignoreTokenIds=[],
+ batchSize=1,
+ beamSize=1,
+ stopTokenIds=[128001, ]
+ )
+
+ @staticmethod
+ def loadSavedModel(folder, spark_session, use_openvino=False):
+ """Loads a locally saved model.
+
+ Parameters
+ ----------
+ folder : str
+ Folder of the saved model
+ spark_session : pyspark.sql.SparkSession
+ The current SparkSession
+
+ Returns
+ -------
+ CoHereTransformer
+ The restored model
+ """
+ from sparknlp.internal import _CoHereLoader
+ jModel = _CoHereLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+ return CoHereTransformer(java_model=jModel)
+
+ @staticmethod
+ def pretrained(name="c4ai_command_r_v01_int4", lang="en", remote_loc=None):
+ """Downloads and loads a pretrained model.
+
+ Parameters
+ ----------
+ name : str, optional
+ Name of the pretrained model, by default "c4ai_command_r_v01_int4"
+ lang : str, optional
+ Language of the pretrained model, by default "en"
+ remote_loc : str, optional
+ Optional remote address of the resource, by default None. Will use
+ Spark NLPs repositories otherwise.
+
+ Returns
+ -------
+ CoHereTransformer
+ The restored model
+ """
+ from sparknlp.pretrained import ResourceDownloader
+ return ResourceDownloader.downloadModel(CoHereTransformer, name, lang, remote_loc)
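
The docstring above documents several decoding parameters (doSample, temperature, topK, topP, repetitionPenalty, beamSize, stopTokenIds), but its example only sets setMaxOutputLength. The sketch below shows how those setters could be combined in a pipeline; the parameter values, the prompt text, and the use of sparknlp.start() to obtain a SparkSession are illustrative assumptions, not values recommended by the package.

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import CoHereTransformer
from pyspark.ml import Pipeline

spark = sparknlp.start()  # assumes a default Spark NLP session is acceptable

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("documents")

# Illustrative decoding configuration using the setters defined above.
cohere = CoHereTransformer.pretrained("c4ai_command_r_v01_int4", "en") \
    .setInputCols(["documents"]) \
    .setOutputCol("generation") \
    .setMaxOutputLength(128) \
    .setDoSample(True) \
    .setTemperature(0.7) \
    .setTopK(40) \
    .setTopP(0.9) \
    .setRepetitionPenalty(1.1)

pipeline = Pipeline().setStages([document_assembler, cohere])

# Prompt format copied from the docstring example above.
prompt = ("<BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>"
          "Hello, how are you?"
          "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>")
data = spark.createDataFrame([(1, prompt)]).toDF("id", "text")

pipeline.fit(data).transform(data).select("generation.result").show(truncate=False)

As with comparable text-generation annotators, the sampling-related parameters are expected to take effect only once setDoSample(True) is called; with the default greedy decoding they are ignored.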
sparknlp/annotator/seq2seq/cpm_transformer.py (added)
@@ -0,0 +1,321 @@
+ # Copyright 2017-2024 John Snow Labs
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Contains classes for the CPMTransformer."""
+
+ from sparknlp.common import *
+
+
+ class CPMTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
+ """MiniCPM: Unveiling the Potential of End-side Large Language Models
+
+ MiniCPM is a series of edge-side large language models, with the base model, MiniCPM-2B,
+ having 2.4B non-embedding parameters. It ranks closely with Mistral-7B on comprehensive
+ benchmarks (with better performance in Chinese, mathematics, and coding abilities), surpassing
+ models like Llama2-13B, MPT-30B, and Falcon-40B. On the MTBench benchmark, which is closest to
+ user experience, MiniCPM-2B also outperforms many representative open-source models such as
+ Llama2-70B-Chat, Vicuna-33B, Mistral-7B-Instruct-v0.1, and Zephyr-7B-alpha.
+
+ After DPO, MiniCPM outperforms Llama2-70B-Chat, Vicuna-33B, Mistral-7B-Instruct-v0.1,
+ Zephyr-7B-alpha, etc. on MTBench.
+
+ MiniCPM-V, based on MiniCPM-2B, achieves the best overall performance among multimodel models
+ of the same scale, surpassing existing multimodal large models built on Phi-2 and achieving
+ performance comparable to or even better than 9.6B Qwen-VL-Chat on some tasks.
+
+ MiniCPM can be deployed and infer on smartphones, and the speed of streaming output is
+ relatively higher than the verbal speed of human.
+
+ Pretrained models can be loaded with :meth:`.pretrained` of the companion
+ object:
+
+ >>> cpm = CPMTransformer.pretrained() \\
+ ... .setInputCols(["document"]) \\
+ ... .setOutputCol("generation")
+
+
+ The default model is ``"mini_cpm_2b_8bit"``, if no name is provided. For available
+ pretrained models please see the `Models Hub
+ <https://sparknlp.org/models?q=cpm>`__.
+
+ ====================== ======================
+ Input Annotation types Output Annotation type
+ ====================== ======================
+ ``DOCUMENT`` ``DOCUMENT``
+ ====================== ======================
+
+ Parameters
+ ----------
+ configProtoBytes
+ ConfigProto from tensorflow, serialized into byte array.
+ minOutputLength
+ Minimum length of the sequence to be generated, by default 0
+ maxOutputLength
+ Maximum length of output text, by default 20
+ doSample
+ Whether or not to use sampling; use greedy decoding otherwise, by default False
+ temperature
+ The value used to module the next token probabilities, by default 1.0
+ topK
+ The number of highest probability vocabulary tokens to keep for
+ top-k-filtering, by default 50
+ topP
+ Top cumulative probability for vocabulary tokens, by default 1.0
+
+ If set to float < 1, only the most probable tokens with probabilities
+ that add up to ``topP`` or higher are kept for generation.
+ repetitionPenalty
+ The parameter for repetition penalty, 1.0 means no penalty. , by default
+ 1.0
+ noRepeatNgramSize
+ If set to int > 0, all ngrams of that size can only occur once, by
+ default 0
+ ignoreTokenIds
+ A list of token ids which are ignored in the decoder's output, by
+ default []
+
+ Notes
+ -----
+ This is a very computationally expensive module especially on larger
+ sequence. The use of an accelerator such as GPU is recommended.
+
+ References
+ ----------
+ - `MiniCPM: Unveiling the Potential of End-side Large Language Models
+ <https://shengdinghu.notion.site/MiniCPM-Unveiling-the-Potential-of-End-side-Large-Language-Models-d4d3a8c426424654a4e80e42a711cb20>`
+ - https://github.com/OpenBMB/MiniCPM
+
+ Examples
+ --------
+ >>> import sparknlp
+ >>> from sparknlp.base import *
+ >>> from sparknlp.annotator import *
+ >>> from pyspark.ml import Pipeline
+ >>> documentAssembler = DocumentAssembler() \\
+ ... .setInputCol("text") \\
+ ... .setOutputCol("documents")
+ >>> cpm = CPMTransformer.pretrained("mini_cpm_2b_8bit","xx") \\
+ ... .setInputCols(["documents"]) \\
+ ... .setMaxOutputLength(50) \\
+ ... .setOutputCol("generation")
+ >>> pipeline = Pipeline().setStages([documentAssembler, cpm])
+ >>> data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text")
+ >>> result = pipeline.fit(data).transform(data)
+ >>> result.select("summaries.generation").show(truncate=False)
+ +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+ |result |
+ +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+ |[My name is Leonardo. I am a student at the University of California, Los Angeles. I have a passion for writing and learning about different cultures. I enjoy playing basketball and watching movies]|
+ -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+ """
+
+ name = "CPMTransformer"
+
+ inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+ outputAnnotatorType = AnnotatorType.DOCUMENT
+
+ configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+ "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+ TypeConverters.toListInt)
+
+ minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+ typeConverter=TypeConverters.toInt)
+
+ maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+ typeConverter=TypeConverters.toInt)
+
+ doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+ typeConverter=TypeConverters.toBoolean)
+
+ temperature = Param(Params._dummy(), "temperature", "The value used to module the next token probabilities",
+ typeConverter=TypeConverters.toFloat)
+
+ topK = Param(Params._dummy(), "topK",
+ "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+ typeConverter=TypeConverters.toInt)
+
+ topP = Param(Params._dummy(), "topP",
+ "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+ typeConverter=TypeConverters.toFloat)
+
+ repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+ "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+ typeConverter=TypeConverters.toFloat)
+
+ noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+ "If set to int > 0, all ngrams of that size can only occur once",
+ typeConverter=TypeConverters.toInt)
+
+ ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+ "A list of token ids which are ignored in the decoder's output",
+ typeConverter=TypeConverters.toListInt)
+
+ def setIgnoreTokenIds(self, value):
+ """A list of token ids which are ignored in the decoder's output.
+
+ Parameters
+ ----------
+ value : List[int]
+ The words to be filtered out
+ """
+ return self._set(ignoreTokenIds=value)
+
+ def setConfigProtoBytes(self, b):
+ """Sets configProto from tensorflow, serialized into byte array.
+
+ Parameters
+ ----------
+ b : List[int]
+ ConfigProto from tensorflow, serialized into byte array
+ """
+ return self._set(configProtoBytes=b)
+
+ def setMinOutputLength(self, value):
+ """Sets minimum length of the sequence to be generated.
+
+ Parameters
+ ----------
+ value : int
+ Minimum length of the sequence to be generated
+ """
+ return self._set(minOutputLength=value)
+
+ def setMaxOutputLength(self, value):
+ """Sets maximum length of output text.
+
+ Parameters
+ ----------
+ value : int
+ Maximum length of output text
+ """
+ return self._set(maxOutputLength=value)
+
+ def setDoSample(self, value):
+ """Sets whether or not to use sampling, use greedy decoding otherwise.
+
+ Parameters
+ ----------
+ value : bool
+ Whether or not to use sampling; use greedy decoding otherwise
+ """
+ return self._set(doSample=value)
+
+ def setTemperature(self, value):
+ """Sets the value used to module the next token probabilities.
+
+ Parameters
+ ----------
+ value : float
+ The value used to module the next token probabilities
+ """
+ return self._set(temperature=value)
+
+ def setTopK(self, value):
+ """Sets the number of highest probability vocabulary tokens to keep for
+ top-k-filtering.
+
+ Parameters
+ ----------
+ value : int
+ Number of highest probability vocabulary tokens to keep
+ """
+ return self._set(topK=value)
+
+ def setTopP(self, value):
+ """Sets the top cumulative probability for vocabulary tokens.
+
+ If set to float < 1, only the most probable tokens with probabilities
+ that add up to ``topP`` or higher are kept for generation.
+
+ Parameters
+ ----------
+ value : float
+ Cumulative probability for vocabulary tokens
+ """
+ return self._set(topP=value)
+
+ def setRepetitionPenalty(self, value):
+ """Sets the parameter for repetition penalty. 1.0 means no penalty.
+
+ Parameters
+ ----------
+ value : float
+ The repetition penalty
+
+ References
+ ----------
+ See `Ctrl: A Conditional Transformer Language Model For Controllable
+ Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+ """
+ return self._set(repetitionPenalty=value)
+
+ def setNoRepeatNgramSize(self, value):
+ """Sets size of n-grams that can only occur once.
+
+ If set to int > 0, all ngrams of that size can only occur once.
+
+ Parameters
+ ----------
+ value : int
+ N-gram size can only occur once
+ """
+ return self._set(noRepeatNgramSize=value)
+
+ @keyword_only
+ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.CPMTransformer", java_model=None):
+ super(CPMTransformer, self).__init__(classname=classname, java_model=java_model)
+ self._setDefault(minOutputLength=0, maxOutputLength=50, doSample=False, temperature=0.8, topK=100, topP=0.8,
+ repetitionPenalty=1.0, noRepeatNgramSize=0, ignoreTokenIds=[], batchSize=1)
+
+ @staticmethod
+ def loadSavedModel(folder, spark_session, use_openvino = False):
+ """Loads a locally saved model.
+
+ Parameters
+ ----------
+ folder : str
+ Folder of the saved model
+ spark_session : pyspark.sql.SparkSession
+ The current SparkSession
+
+ Returns
+ -------
+ CPMTransformer
+ The restored model
+ """
+ from sparknlp.internal import _CPMLoader
+ jModel = _CPMLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+ return CPMTransformer(java_model=jModel)
+
+ @staticmethod
+ def pretrained(name="mini_cpm_2b_8bit", lang="xx", remote_loc=None):
+ """Downloads and loads a pretrained model.
+
+ Parameters
+ ----------
+ name : str, optional
+ Name of the pretrained model, by default "mini_cpm_2b_8bit"
+ lang : str, optional
+ Language of the pretrained model, by default "xx"
+ remote_loc : str, optional
+ Optional remote address of the resource, by default None. Will use
+ Spark NLPs repositories otherwise.
+
+ Returns
+ -------
+ CPMTransformer
+ The restored model
+ """
+ from sparknlp.pretrained import ResourceDownloader
+ return ResourceDownloader.downloadModel(CPMTransformer, name, lang, remote_loc)
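
Both new annotators also expose loadSavedModel, which is documented above without a usage example and imports a locally exported model instead of downloading one from the Models Hub. A minimal sketch for CPMTransformer follows; the export folder, the save destination, and the use_openvino flag value are assumptions made for illustration.

import sparknlp
from sparknlp.annotator import CPMTransformer

spark = sparknlp.start()  # assumes a default Spark NLP session

# Hypothetical folder containing a locally exported MiniCPM model.
export_folder = "/models/mini_cpm_2b_export"

# loadSavedModel(folder, spark_session, use_openvino=False) as defined above.
cpm = CPMTransformer.loadSavedModel(export_folder, spark, use_openvino=False) \
    .setInputCols(["documents"]) \
    .setOutputCol("generation") \
    .setMaxOutputLength(50)

# The loaded annotator is a regular Spark ML model, so it can be persisted
# once and then reused in pipelines like the pretrained variant shown above.
cpm.write().overwrite().save("/models/mini_cpm_2b_spark_nlp")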
sparknlp/annotator/seq2seq/gpt2_transformer.py
@@ -43,7 +43,7 @@ class GPT2Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
 
  The default model is ``"gpt2"``, if no name is provided. For available
  pretrained models please see the `Models Hub
- <https://nlp.johnsnowlabs.com/models?q=gpt2>`__.
+ <https://sparknlp.org/models?q=gpt2>`__.
 
  ====================== ======================
  Input Annotation types Output Annotation type