cui-llama.rn 1.0.10 → 1.1.0

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
package/README.md CHANGED
@@ -11,6 +11,7 @@ The following features have been added for Android:
11
11
  - `vocab_only` mode: utilize the llama.cpp tokenizer
12
12
  - tokenizeSync: non-blocking, synchronous tokenizer function
13
13
  - Context Shift taken from [kobold.cpp](https://github.com/LostRuins/koboldcpp)
14
+ - XTC sampling
14
15
 
15
16
  Original repo README.md below.
16
17
 
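For context, XTC ("Exclude Top Choices") sampling probabilistically removes candidates that score above a configurable threshold while always leaving at least one of them in place, so that lower-ranked but still plausible tokens are chosen more often. It is controlled by the `xtc_t` (threshold) and `xtc_p` (probability) parameters added throughout this diff; both default to 0.0, which disables the sampler.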
@@ -218,6 +218,10 @@ public class LlamaContext {
218
218
  params.hasKey("top_p") ? (float) params.getDouble("top_p") : 0.95f,
219
219
  // float min_p,
220
220
  params.hasKey("min_p") ? (float) params.getDouble("min_p") : 0.05f,
221
+ // float xtc_t,
222
+ params.hasKey("xtc_t") ? (float) params.getDouble("xtc_t") : 0.00f,
223
+ // float xtc_p,
224
+ params.hasKey("xtc_p") ? (float) params.getDouble("xtc_p") : 0.00f,
221
225
  // float tfs_z,
222
226
  params.hasKey("tfs_z") ? (float) params.getDouble("tfs_z") : 1.00f,
223
227
  // float typical_p,
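When `xtc_t` and `xtc_p` are not supplied from the JS side they fall back to 0.0f, which leaves XTC disabled: the sampler returns immediately for a non-positive threshold, as the `llama_sample_xtc_impl` hunk further down shows.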
@@ -399,6 +403,8 @@ public class LlamaContext {
399
403
  int top_k,
400
404
  float top_p,
401
405
  float min_p,
406
+ float xtc_t,
407
+ float xtc_p,
402
408
  float tfs_z,
403
409
  float typical_p,
404
410
  int seed,
@@ -370,6 +370,8 @@ Java_com_rnllama_LlamaContext_doCompletion(
370
370
  jint top_k,
371
371
  jfloat top_p,
372
372
  jfloat min_p,
373
+ jfloat xtc_t,
374
+ jfloat xtc_p,
373
375
  jfloat tfs_z,
374
376
  jfloat typical_p,
375
377
  jint seed,
@@ -413,6 +415,8 @@ Java_com_rnllama_LlamaContext_doCompletion(
413
415
  sparams.typical_p = typical_p;
414
416
  sparams.n_probs = n_probs;
415
417
  sparams.grammar = env->GetStringUTFChars(grammar, nullptr);
418
+ sparams.xtc_t = xtc_t;
419
+ sparams.xtc_p = xtc_p;
416
420
 
417
421
  sparams.logit_bias.clear();
418
422
  if (ignore_eos) {
package/cpp/common.cpp CHANGED
@@ -83,6 +83,41 @@ char const *LLAMA_BUILD_TARGET = "unknown";
83
83
 
84
84
  using json = nlohmann::ordered_json;
85
85
 
86
+ //
87
+ // Environment variable utils
88
+ //
89
+
90
+ template<typename T>
91
+ static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
92
+ get_env(std::string name, T & target) {
93
+ char * value = std::getenv(name.c_str());
94
+ target = value ? std::string(value) : target;
95
+ }
96
+
97
+ template<typename T>
98
+ static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
99
+ get_env(std::string name, T & target) {
100
+ char * value = std::getenv(name.c_str());
101
+ target = value ? std::stoi(value) : target;
102
+ }
103
+
104
+ template<typename T>
105
+ static typename std::enable_if<std::is_floating_point<T>::value, void>::type
106
+ get_env(std::string name, T & target) {
107
+ char * value = std::getenv(name.c_str());
108
+ target = value ? std::stof(value) : target;
109
+ }
110
+
111
+ template<typename T>
112
+ static typename std::enable_if<std::is_same<T, bool>::value, void>::type
113
+ get_env(std::string name, T & target) {
114
+ char * value = std::getenv(name.c_str());
115
+ if (value) {
116
+ std::string val(value);
117
+ target = val == "1" || val == "true";
118
+ }
119
+ }
120
+
86
121
  //
87
122
  // CPU utils
88
123
  //
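The four `get_env` overloads above use SFINAE to pick an implementation from the target's type: strings are copied verbatim, integral targets are parsed with `std::stoi`, floating-point targets with `std::stof`, and booleans accept `"1"` or `"true"`. A minimal usage sketch; the local variable names are illustrative, while the environment-variable names are the ones read later in this file:

```cpp
// Sketch only: each call picks the matching get_env overload above.
std::string model_path = "default.gguf";   // illustrative defaults
int32_t     n_ctx      = 2048;
float       defrag     = -1.0f;
bool        flash_attn = false;

get_env("LLAMA_ARG_MODEL",        model_path);  // string overload
get_env("LLAMA_ARG_CTX_SIZE",     n_ctx);       // integral overload (std::stoi)
get_env("LLAMA_ARG_DEFRAG_THOLD", defrag);      // floating-point overload (std::stof)
get_env("LLAMA_ARG_FLASH_ATTN",   flash_attn);  // bool overload ("1" / "true")
```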
@@ -116,8 +151,34 @@ int32_t cpu_get_num_physical_cores() {
116
151
  if (result == 0) {
117
152
  return num_physical_cores;
118
153
  }
119
- #elif defined(_WIN32)
120
- //TODO: Implement
154
+ #elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
155
+ // TODO: windows + arm64 + mingw64
156
+ unsigned int n_threads_win = std::thread::hardware_concurrency();
157
+ unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
158
+
159
+ DWORD buffer_size = 0;
160
+ if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
161
+ if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
162
+ return default_threads;
163
+ }
164
+ }
165
+
166
+ std::vector<char> buffer(buffer_size);
167
+ if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
168
+ return default_threads;
169
+ }
170
+
171
+ int32_t num_physical_cores = 0;
172
+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
173
+ while (buffer_size > 0) {
174
+ if (info->Relationship == RelationProcessorCore) {
175
+ num_physical_cores += info->Processor.GroupCount;
176
+ }
177
+ buffer_size -= info->Size;
178
+ info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
179
+ }
180
+
181
+ return num_physical_cores > 0 ? num_physical_cores : default_threads;
121
182
  #endif
122
183
  unsigned int n_threads = std::thread::hardware_concurrency();
123
184
  return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
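On Windows 7 and later (excluding MinGW-w64 builds), physical cores are now counted by walking the `RelationProcessorCore` records returned by `GetLogicalProcessorInformationEx`; if the query fails, the code falls back to the same `hardware_concurrency()`-based heuristic shown just above: use every logical processor when there are four or fewer, otherwise half of them (for example, 16 logical processors give a default of 8 threads).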
@@ -200,12 +261,6 @@ int32_t cpu_get_num_math() {
200
261
  // CLI argument parsing
201
262
  //
202
263
 
203
- void gpt_params_handle_hf_token(gpt_params & params) {
204
- if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
205
- params.hf_token = std::getenv("HF_TOKEN");
206
- }
207
- }
208
-
209
264
  void gpt_params_handle_model_default(gpt_params & params) {
210
265
  if (!params.hf_repo.empty()) {
211
266
  // short-hand to avoid specifying --hf-file -> default it to --model
@@ -253,7 +308,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
253
308
 
254
309
  gpt_params_handle_model_default(params);
255
310
 
256
- gpt_params_handle_hf_token(params);
311
+ if (params.hf_token.empty()) {
312
+ get_env("HF_TOKEN", params.hf_token);
313
+ }
257
314
 
258
315
  if (params.escape) {
259
316
  string_process_escapes(params.prompt);
@@ -273,6 +330,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
273
330
  return true;
274
331
  }
275
332
 
333
+ void gpt_params_parse_from_env(gpt_params & params) {
334
+ // we only care about server-related params for now
335
+ get_env("LLAMA_ARG_MODEL", params.model);
336
+ get_env("LLAMA_ARG_THREADS", params.n_threads);
337
+ get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
338
+ get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
339
+ get_env("LLAMA_ARG_BATCH", params.n_batch);
340
+ get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
341
+ get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
342
+ get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
343
+ get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
344
+ get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
345
+ get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
346
+ get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
347
+ get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
348
+ get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
349
+ get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
350
+ }
351
+
276
352
  bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
277
353
  const auto params_org = params; // the example can modify the default params
278
354
 
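`gpt_params_parse_from_env` gathers the server-related environment overrides in one place (the old `gpt_params_handle_hf_token` helper was folded into the same `get_env` mechanism above). A minimal sketch of the intended call pattern, assuming it runs after the defaults are constructed:

```cpp
// Sketch only: environment variables, if set, override the compiled-in defaults,
// e.g. `export LLAMA_ARG_CTX_SIZE=4096` before launching the server.
gpt_params params;                  // field defaults from common.h
gpt_params_parse_from_env(params);  // apply any LLAMA_ARG_* overrides
// params.n_ctx, params.n_gpu_layers, params.flash_attn, ... now reflect the environment
```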
@@ -1733,7 +1809,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
1733
1809
  if (params.n_threads_batch != -1) {
1734
1810
  os << " (n_threads_batch = " << params.n_threads_batch << ")";
1735
1811
  }
1812
+ #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
1813
+ // TODO: windows + arm64 + mingw64
1814
+ DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
1815
+ os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
1816
+ #else
1736
1817
  os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
1818
+ #endif
1737
1819
 
1738
1820
  return os.str();
1739
1821
  }
package/cpp/common.h CHANGED
@@ -81,7 +81,7 @@ enum dimre_method {
81
81
  struct gpt_params {
82
82
  uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
83
83
 
84
- bool vocab_only = false;
84
+ bool vocab_only = false;
85
85
  int32_t n_threads = cpu_get_num_math();
86
86
  int32_t n_threads_draft = -1;
87
87
  int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
@@ -279,7 +279,7 @@ struct gpt_params {
279
279
  std::string lora_outfile = "ggml-lora-merged-f16.gguf";
280
280
  };
281
281
 
282
- void gpt_params_handle_hf_token(gpt_params & params);
282
+ void gpt_params_parse_from_env(gpt_params & params);
283
283
  void gpt_params_handle_model_default(gpt_params & params);
284
284
 
285
285
  bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
@@ -1018,10 +1018,6 @@ static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
1018
1018
  #define LM_GGML_SCHED_MAX_BACKENDS 16
1019
1019
  #endif
1020
1020
 
1021
- #ifndef LM_GGML_SCHED_MAX_SPLITS
1022
- #define LM_GGML_SCHED_MAX_SPLITS 2048
1023
- #endif
1024
-
1025
1021
  #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
1026
1022
  #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
1027
1023
  #endif
@@ -1125,7 +1121,8 @@ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sch
1125
1121
  }
1126
1122
 
1127
1123
  #if 0
1128
- static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1124
+ #define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
1125
+ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1129
1126
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1130
1127
  #define GET_CAUSE(node) causes[hash_id(node)]
1131
1128
  #else
@@ -1549,7 +1546,6 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
1549
1546
  sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
1550
1547
  LM_GGML_ASSERT(sched->splits != NULL);
1551
1548
  }
1552
- LM_GGML_ASSERT(i_split < LM_GGML_SCHED_MAX_SPLITS);
1553
1549
  split = &sched->splits[i_split];
1554
1550
  split->backend_id = node_backend_id;
1555
1551
  split->i_start = i;
@@ -1865,13 +1861,14 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
1865
1861
  sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1866
1862
  sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
1867
1863
 
1868
- const size_t nodes_size = graph_size + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
1864
+ const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
1865
+ const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
1869
1866
  sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1870
1867
  sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1871
1868
  sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1872
1869
  sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1873
1870
 
1874
- sched->context_buffer_size = LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
1871
+ sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
1875
1872
  sched->context_buffer = malloc(sched->context_buffer_size);
1876
1873
 
1877
1874
  const int initial_splits_capacity = 16;
@@ -171,6 +171,98 @@ void llama_sample_top_p_impl(struct llama_sampling * smpl, llama_token_data_arra
171
171
  }
172
172
  }
173
173
 
174
+ void llama_sample_xtc_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float xtc_threshold, float xtc_probability, size_t min_keep, std::mt19937 & rng) {
175
+ if(xtc_threshold <= 0.0f || !candidates-> size) {
176
+ return;
177
+ }
178
+ // TODO: xtc impl
179
+ bool xtc_applied = false;
180
+ const int64_t t_start_sample_us = lm_ggml_time_us();
181
+
182
+ // unsorted iteration
183
+ if (!candidates->sorted) {
184
+ std::vector<llama_token_data> top_tokens, low_tokens;
185
+
186
+ // split candidates into two arrays for low and high tokens
187
+ for (size_t i = 0; i < candidates->size; ++i) {
188
+ if (candidates->data[i].logit >= xtc_threshold) {
189
+ top_tokens.push_back(candidates->data[i]);
190
+ } else {
191
+ low_tokens.push_back(candidates-> data[i]);
192
+ }
193
+ }
194
+ // if there is only one or no top_tokens, do not truncate
195
+
196
+ if (top_tokens.size() <= 1) {
197
+ return;
198
+ }
199
+
200
+ // sort top_tokens
201
+ std::sort(top_tokens.begin(), top_tokens.end(), [](const llama_token_data & a, const llama_token_data & b) {
202
+ return a.logit > b.logit;
203
+ });
204
+
205
+ // insert top_tokens with probability. Always insert lowest top_token
206
+ low_tokens.push_back(top_tokens[0]);
207
+ std::uniform_real_distribution<float> random_float(0.0 , 1.0);
208
+ for (size_t i = 1; i < top_tokens.size(); ++i) {
209
+ if(random_float(rng) <= xtc_probability) {
210
+ low_tokens.push_back(top_tokens[i]);
211
+ }
212
+ }
213
+ if(low_tokens.size() >= min_keep) {
214
+ memcpy(candidates->data, low_tokens.data(), low_tokens.size()*sizeof(llama_token_data));
215
+ candidates->size = low_tokens.size();
216
+ xtc_applied = true;
217
+ }
218
+ }
219
+ // sorted iteration
220
+
221
+ if (!xtc_applied) {
222
+ // Sort the logits in descending order
223
+ if (!candidates->sorted) {
224
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
225
+ return a.logit > b.logit;
226
+ });
227
+ candidates->sorted = true;
228
+ }
229
+
230
+ // find last token over threshold
231
+
232
+ size_t last_index = 0;
233
+
234
+ for (; last_index < candidates -> size; ++last_index) {
235
+ if(candidates -> data[last_index].logit < xtc_threshold) {
236
+ break;
237
+ }
238
+ }
239
+ last_index--;
240
+ // check if only 1 last index token or total less than min_keep
241
+ if(last_index <= 1 || candidates-> size - last_index < min_keep) {
242
+ return;
243
+ }
244
+ // indexes to be skipped
245
+ size_t safe_index = 0;
246
+ // remove tokens until last threshold item
247
+ candidates -> data;
248
+ std::uniform_real_distribution<float> random_float(0.0 , 1.0);
249
+ for (size_t i = 0; i < last_index; i++) {
250
+ if(random_float(rng) < xtc_probability) {
251
+ if(i != safe_index) {
252
+ std::swap(candidates-> data[i], candidates->data[safe_index]);
253
+ }
254
+ safe_index++;
255
+ }
256
+ }
257
+ candidates -> data = candidates -> data + safe_index;
258
+ candidates -> size = candidates -> size - safe_index;
259
+ }
260
+
261
+ if (smpl) {
262
+ smpl->t_sample_us += lm_ggml_time_us() - t_start_sample_us;
263
+ }
264
+ }
265
+
174
266
  void llama_sample_min_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
175
267
  if (p <= 0.0f || !candidates->size) {
176
268
  return;
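In the implementation above, a non-positive `xtc_threshold` (or an empty candidate list) disables the sampler. On the unsorted path, candidates at or above the threshold are split off, one of them is always re-inserted and each of the remaining ones is re-added with probability `xtc_probability`, and the truncated list is only committed when at least `min_keep` tokens survive. Otherwise the candidates are sorted in descending order, the last token at or above the threshold is located, and each higher-ranked token is removed with probability `xtc_probability` by compacting the array in place; this path bails out when too few tokens qualify or fewer than `min_keep` tokens would remain. In both cases the elapsed time is charged to `smpl->t_sample_us`.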
@@ -32,6 +32,7 @@ void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_
32
32
  void llama_sample_typical_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
33
33
  void llama_sample_entropy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
34
34
  void llama_sample_temp_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
35
+ void llama_sample_xtc_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float xtc_threshold, float xtc_probability, size_t min_keep, std::mt19937 & rng);
35
36
 
36
37
  void llama_sample_repetition_penalties_impl(
37
38
  struct llama_sampling * smpl,
@@ -321,6 +321,21 @@ private:
321
321
 
322
322
  // TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
323
323
 
324
+ template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
325
+ class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
326
+ public:
327
+ using std::priority_queue<T, Container, Compare>::priority_queue;
328
+
329
+ T pop_move() {
330
+ T item = std::move(this->c.front());
331
+ std::pop_heap(this->c.begin(), this->c.end(), this->comp);
332
+ this->c.pop_back();
333
+ return item;
334
+ }
335
+
336
+ void pop() = delete;
337
+ };
338
+
324
339
  struct llm_bigram_bpe {
325
340
  struct comparator {
326
341
  bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
@@ -329,7 +344,7 @@ struct llm_bigram_bpe {
329
344
  };
330
345
 
331
346
  using queue_storage = std::vector<llm_bigram_bpe>;
332
- using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
347
+ using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
333
348
  llm_symbol::index left;
334
349
  llm_symbol::index right;
335
350
  std::string text;
@@ -388,6 +403,7 @@ struct llm_tokenizer_bpe {
388
403
  case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
389
404
  case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
390
405
  case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
406
+ case LLAMA_VOCAB_PRE_TYPE_EXAONE:
391
407
  regex_exprs = {
392
408
  "\\p{N}",
393
409
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -519,8 +535,7 @@ struct llm_tokenizer_bpe {
519
535
 
520
536
  // build token(s)
521
537
  while (!work_queue.empty()) {
522
- auto bigram = work_queue.top();
523
- work_queue.pop();
538
+ auto bigram = work_queue.pop_move();
524
539
 
525
540
  auto & left_symbol = symbols[bigram.left];
526
541
  auto & right_symbol = symbols[bigram.right];
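`std::priority_queue::top()` only exposes a const reference and `pop()` discards the element, so draining the BPE work queue used to copy each bigram. The `llama_priority_queue::pop_move()` helper introduced above moves the front element out of the underlying container before re-heapifying, and deleting `pop()` forces callers through the move-aware path. A self-contained sketch of the same pattern (type and variable names are illustrative):

```cpp
#include <algorithm>
#include <iostream>
#include <queue>
#include <string>
#include <vector>

// Expose the protected container/comparator of std::priority_queue so the
// front element can be moved out instead of copied.
template <typename T, typename Container = std::vector<T>,
          typename Compare = std::less<typename Container::value_type>>
class movable_priority_queue : public std::priority_queue<T, Container, Compare> {
public:
    using std::priority_queue<T, Container, Compare>::priority_queue;

    T pop_move() {
        T item = std::move(this->c.front());                       // steal the top element
        std::pop_heap(this->c.begin(), this->c.end(), this->comp); // restore the heap
        this->c.pop_back();
        return item;
    }

    void pop() = delete; // force callers through pop_move()
};

int main() {
    movable_priority_queue<std::string> q;
    q.push("alpha");
    q.push("mu");
    q.push("omega");
    while (!q.empty()) {
        std::cout << q.pop_move() << '\n'; // omega, mu, alpha (max-heap order)
    }
}
```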
package/cpp/llama.cpp CHANGED
@@ -221,6 +221,8 @@ enum llm_arch {
221
221
  LLM_ARCH_T5,
222
222
  LLM_ARCH_T5ENCODER,
223
223
  LLM_ARCH_JAIS,
224
+ LLM_ARCH_NEMOTRON,
225
+ LLM_ARCH_EXAONE,
224
226
  LLM_ARCH_UNKNOWN,
225
227
  };
226
228
 
@@ -266,6 +268,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
266
268
  { LLM_ARCH_T5, "t5" },
267
269
  { LLM_ARCH_T5ENCODER, "t5encoder" },
268
270
  { LLM_ARCH_JAIS, "jais" },
271
+ { LLM_ARCH_NEMOTRON, "nemotron" },
272
+ { LLM_ARCH_EXAONE, "exaone" },
269
273
  { LLM_ARCH_UNKNOWN, "(unknown)" },
270
274
  };
271
275
 
@@ -335,6 +339,7 @@ enum llm_kv {
335
339
  LLM_KV_SSM_CONV_KERNEL,
336
340
  LLM_KV_SSM_STATE_SIZE,
337
341
  LLM_KV_SSM_TIME_STEP_RANK,
342
+ LLM_KV_SSM_DT_B_C_RMS,
338
343
 
339
344
  LLM_KV_TOKENIZER_MODEL,
340
345
  LLM_KV_TOKENIZER_PRE,
@@ -433,6 +438,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
433
438
  { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
434
439
  { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
435
440
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
441
+ { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
436
442
 
437
443
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
438
444
  { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
@@ -1307,6 +1313,43 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
1307
1313
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1308
1314
  },
1309
1315
  },
1316
+ {
1317
+ LLM_ARCH_NEMOTRON,
1318
+ {
1319
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1320
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1321
+ { LLM_TENSOR_OUTPUT, "output" },
1322
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
1323
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1324
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1325
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1326
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1327
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1328
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
1329
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1330
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1331
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1332
+ },
1333
+ },
1334
+ {
1335
+ LLM_ARCH_EXAONE,
1336
+ {
1337
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1338
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1339
+ { LLM_TENSOR_OUTPUT, "output" },
1340
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
1341
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1342
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1343
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1344
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1345
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1346
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
1347
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1348
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1349
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1350
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1351
+ },
1352
+ },
1310
1353
  {
1311
1354
  LLM_ARCH_UNKNOWN,
1312
1355
  {
@@ -2207,6 +2250,7 @@ struct llama_hparams {
2207
2250
  uint32_t ssm_d_inner = 0;
2208
2251
  uint32_t ssm_d_state = 0;
2209
2252
  uint32_t ssm_dt_rank = 0;
2253
+ bool ssm_dt_b_c_rms = false;
2210
2254
 
2211
2255
  float f_clamp_kqv = 0.0f;
2212
2256
  float f_max_alibi_bias = 0.0f;
@@ -2256,6 +2300,7 @@ struct llama_hparams {
2256
2300
  if (this->ssm_d_inner != other.ssm_d_inner) return true;
2257
2301
  if (this->ssm_d_state != other.ssm_d_state) return true;
2258
2302
  if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
2303
+ if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
2259
2304
 
2260
2305
  if (this->dec_start_token_id != other.dec_start_token_id) return true;
2261
2306
 
@@ -5022,6 +5067,7 @@ static void llm_load_hparams(
5022
5067
  ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
5023
5068
  ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
5024
5069
  ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
5070
+ ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
5025
5071
 
5026
5072
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5027
5073
 
@@ -5246,6 +5292,23 @@ static void llm_load_hparams(
5246
5292
  default: model.type = e_model::MODEL_UNKNOWN;
5247
5293
  }
5248
5294
  } break;
5295
+ case LLM_ARCH_NEMOTRON:
5296
+ {
5297
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
5298
+ switch (hparams.n_layer) {
5299
+ case 32: model.type = e_model::MODEL_4B; break;
5300
+ default: model.type = e_model::MODEL_UNKNOWN;
5301
+ }
5302
+ } break;
5303
+ case LLM_ARCH_EXAONE:
5304
+ {
5305
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5306
+
5307
+ switch (hparams.n_layer) {
5308
+ case 32: model.type = e_model::MODEL_8B; break;
5309
+ default: model.type = e_model::MODEL_UNKNOWN;
5310
+ }
5311
+ } break;
5249
5312
  default: (void)0;
5250
5313
  }
5251
5314
 
@@ -5484,6 +5547,9 @@ static void llm_load_vocab(
5484
5547
  } else if (
5485
5548
  tokenizer_pre == "gpt3-finnish") {
5486
5549
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
5550
+ } else if (
5551
+ tokenizer_pre == "exaone") {
5552
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
5487
5553
  } else {
5488
5554
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
5489
5555
  }
@@ -5857,6 +5923,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
5857
5923
  LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
5858
5924
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
5859
5925
  LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
5926
+ LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
5860
5927
  }
5861
5928
 
5862
5929
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
@@ -6122,9 +6189,9 @@ static bool llm_load_tensors(
6122
6189
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6123
6190
 
6124
6191
  // optional MLP bias
6125
- layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
6126
- layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
6127
- layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
6192
+ layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
6193
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
6194
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
6128
6195
  } else {
6129
6196
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
6130
6197
 
@@ -6448,7 +6515,7 @@ static bool llm_load_tensors(
6448
6515
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
6449
6516
 
6450
6517
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
6451
- layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
6518
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
6452
6519
 
6453
6520
  layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
6454
6521
  layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
@@ -7579,6 +7646,78 @@ static bool llm_load_tensors(
7579
7646
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
7580
7647
  }
7581
7648
  } break;
7649
+ case LLM_ARCH_NEMOTRON:
7650
+ {
7651
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
7652
+
7653
+ // output
7654
+ {
7655
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
7656
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
7657
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
7658
+ }
7659
+
7660
+ for (int i = 0; i < n_layer; ++i) {
7661
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
7662
+ lm_ggml_context * ctx_split = ctx_for_layer_split(i);
7663
+
7664
+ auto & layer = model.layers[i];
7665
+
7666
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
7667
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
7668
+
7669
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
7670
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
7671
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
7672
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
7673
+
7674
+ // optional bias tensors
7675
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
7676
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
7677
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
7678
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
7679
+
7680
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
7681
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
7682
+
7683
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
7684
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
7685
+
7686
+ // optional MLP bias
7687
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
7688
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
7689
+ }
7690
+ } break;
7691
+ case LLM_ARCH_EXAONE:
7692
+ {
7693
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
7694
+
7695
+ // output
7696
+ {
7697
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
7698
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
7699
+ }
7700
+
7701
+ for (int i = 0; i < n_layer; ++i) {
7702
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
7703
+ lm_ggml_context * ctx_split = ctx_for_layer_split(i);
7704
+
7705
+ auto & layer = model.layers[i];
7706
+
7707
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
7708
+
7709
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
7710
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
7711
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
7712
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
7713
+
7714
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
7715
+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7716
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
7717
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
7718
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
7719
+ }
7720
+ } break;
7582
7721
  default:
7583
7722
  throw std::runtime_error("unknown architecture");
7584
7723
  }
@@ -8265,7 +8404,7 @@ static struct lm_ggml_tensor * llm_build_kqv(
8265
8404
  struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q);
8266
8405
  cb(kq, "kq", il);
8267
8406
 
8268
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
8407
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON) {
8269
8408
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
8270
8409
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
8271
8410
  lm_ggml_mul_mat_set_prec(kq, LM_GGML_PREC_F32);
@@ -12039,6 +12178,10 @@ struct llm_build_context {
12039
12178
  LM_GGML_ASSERT(2 * d_model == d_inner);
12040
12179
  const int64_t d_state = hparams.ssm_d_state;
12041
12180
  const int64_t dt_rank = hparams.ssm_dt_rank;
12181
+ // Some variants of the Mamba arch (e.g. FalconMamba) apply layer norm on the B, C and Dt layers
12182
+ const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
12183
+ // Use the same RMS norm as the final layer norm
12184
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
12042
12185
 
12043
12186
  struct lm_ggml_tensor * cur;
12044
12187
  struct lm_ggml_tensor * inpL;
@@ -12119,6 +12262,13 @@ struct llm_build_context {
12119
12262
  struct lm_ggml_tensor * B = lm_ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], lm_ggml_element_size(x_db)*dt_rank);
12120
12263
  struct lm_ggml_tensor * C = lm_ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], lm_ggml_element_size(x_db)*(dt_rank+d_state));
12121
12264
 
12265
+ // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
12266
+ if (ssm_dt_b_c_rms) {
12267
+ dt = lm_ggml_rms_norm(ctx0, dt, norm_rms_eps);
12268
+ B = lm_ggml_rms_norm(ctx0, B, norm_rms_eps);
12269
+ C = lm_ggml_rms_norm(ctx0, C, norm_rms_eps);
12270
+ }
12271
+
12122
12272
  // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
12123
12273
  dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt);
12124
12274
  dt = lm_ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
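For reference, `lm_ggml_rms_norm` as used here computes the standard (unscaled) RMS normalization over each row, with the model's final-norm epsilon:

$$\operatorname{rms\_norm}(x)_i = \frac{x_i}{\sqrt{\frac{1}{n}\sum_{j=1}^{n} x_j^{2} + \varepsilon}}, \qquad \varepsilon = \texttt{f\_norm\_rms\_eps}$$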
@@ -13766,6 +13916,254 @@ struct llm_build_context {
13766
13916
 
13767
13917
  return gf;
13768
13918
  }
13919
+
13920
+ struct lm_ggml_cgraph * build_nemotron() {
13921
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13922
+
13923
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13924
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13925
+ //LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
13926
+
13927
+ struct lm_ggml_tensor * cur;
13928
+ struct lm_ggml_tensor * inpL;
13929
+
13930
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
13931
+
13932
+ // inp_pos - contains the positions
13933
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
13934
+
13935
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
13936
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
13937
+
13938
+ for (int il = 0; il < n_layer; ++il) {
13939
+ struct lm_ggml_tensor * inpSA = inpL;
13940
+
13941
+ // norm
13942
+ cur = llm_build_norm(ctx0, inpL, hparams,
13943
+ model.layers[il].attn_norm,
13944
+ model.layers[il].attn_norm_b,
13945
+ LLM_NORM, cb, il);
13946
+ cb(cur, "attn_norm", il);
13947
+
13948
+ // self-attention
13949
+ {
13950
+ // compute Q and K and RoPE them
13951
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
13952
+ cb(Qcur, "Qcur", il);
13953
+ if (model.layers[il].bq) {
13954
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
13955
+ cb(Qcur, "Qcur", il);
13956
+ }
13957
+
13958
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
13959
+ cb(Kcur, "Kcur", il);
13960
+ if (model.layers[il].bk) {
13961
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
13962
+ cb(Kcur, "Kcur", il);
13963
+ }
13964
+
13965
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
13966
+ cb(Vcur, "Vcur", il);
13967
+ if (model.layers[il].bv) {
13968
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
13969
+ cb(Vcur, "Vcur", il);
13970
+ }
13971
+
13972
+ Qcur = lm_ggml_rope_ext(
13973
+ ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
13974
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13975
+ ext_factor, attn_factor, beta_fast, beta_slow
13976
+ );
13977
+ cb(Qcur, "Qcur", il);
13978
+
13979
+ Kcur = lm_ggml_rope_ext(
13980
+ ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
13981
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13982
+ ext_factor, attn_factor, beta_fast, beta_slow
13983
+ );
13984
+ cb(Kcur, "Kcur", il);
13985
+
13986
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13987
+ model.layers[il].wo, model.layers[il].bo,
13988
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
13989
+ }
13990
+
13991
+ if (il == n_layer - 1) {
13992
+ // skip computing output for unused tokens
13993
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
13994
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
13995
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
13996
+ }
13997
+
13998
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
13999
+ cb(ffn_inp, "ffn_inp", il);
14000
+
14001
+ // feed-forward network
14002
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
14003
+ model.layers[il].ffn_norm,
14004
+ model.layers[il].ffn_norm_b,
14005
+ LLM_NORM, cb, il);
14006
+ cb(cur, "ffn_norm", il);
14007
+
14008
+ cur = llm_build_ffn(ctx0, lctx, cur,
14009
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
14010
+ NULL, NULL, NULL,
14011
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
14012
+ NULL,
14013
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
14014
+
14015
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
14016
+ cb(cur, "ffn_out", il);
14017
+
14018
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
14019
+ cb(cur, "l_out", il);
14020
+
14021
+ // input for next layer
14022
+ inpL = cur;
14023
+ }
14024
+
14025
+ cur = inpL;
14026
+
14027
+ cur = llm_build_norm(ctx0, cur, hparams,
14028
+ model.output_norm, model.output_norm_b,
14029
+ LLM_NORM, cb, -1);
14030
+ cb(cur, "result_norm", -1);
14031
+
14032
+ // lm_head
14033
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
14034
+ cb(cur, "result_output", -1);
14035
+
14036
+ lm_ggml_build_forward_expand(gf, cur);
14037
+
14038
+ return gf;
14039
+ }
14040
+
14041
+ struct lm_ggml_cgraph * build_exaone() {
14042
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
14043
+
14044
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
14045
+ int32_t n_tokens = this->n_tokens;
14046
+
14047
+ const int64_t n_embd_head = hparams.n_embd_head_v;
14048
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
14049
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
14050
+
14051
+ struct lm_ggml_tensor * cur;
14052
+ struct lm_ggml_tensor * inpL;
14053
+
14054
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
14055
+
14056
+ // inp_pos - contains the positions
14057
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
14058
+
14059
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
14060
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
14061
+
14062
+ for (int il = 0; il < n_layer; ++il) {
14063
+ struct lm_ggml_tensor * inpSA = inpL;
14064
+
14065
+ // norm
14066
+ cur = llm_build_norm(ctx0, inpL, hparams,
14067
+ model.layers[il].attn_norm, NULL,
14068
+ LLM_NORM_RMS, cb, il);
14069
+ cb(cur, "attn_norm", il);
14070
+
14071
+ // self-attention
14072
+ {
14073
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
14074
+ struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
14075
+
14076
+ // compute Q and K and RoPE them
14077
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
14078
+ cb(Qcur, "Qcur", il);
14079
+ if (model.layers[il].bq) {
14080
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
14081
+ cb(Qcur, "Qcur", il);
14082
+ }
14083
+
14084
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
14085
+ cb(Kcur, "Kcur", il);
14086
+ if (model.layers[il].bk) {
14087
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
14088
+ cb(Kcur, "Kcur", il);
14089
+ }
14090
+
14091
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
14092
+ cb(Vcur, "Vcur", il);
14093
+ if (model.layers[il].bv) {
14094
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
14095
+ cb(Vcur, "Vcur", il);
14096
+ }
14097
+
14098
+ Qcur = lm_ggml_rope_ext(
14099
+ ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
14100
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14101
+ ext_factor, attn_factor, beta_fast, beta_slow
14102
+ );
14103
+ cb(Qcur, "Qcur", il);
14104
+
14105
+ Kcur = lm_ggml_rope_ext(
14106
+ ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
14107
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14108
+ ext_factor, attn_factor, beta_fast, beta_slow
14109
+ );
14110
+ cb(Kcur, "Kcur", il);
14111
+
14112
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
14113
+ model.layers[il].wo, model.layers[il].bo,
14114
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
14115
+ }
14116
+
14117
+ if (il == n_layer - 1) {
14118
+ // skip computing output for unused tokens
14119
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
14120
+ n_tokens = n_outputs;
14121
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
14122
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
14123
+ }
14124
+
14125
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
14126
+ cb(ffn_inp, "ffn_inp", il);
14127
+
14128
+ // feed-forward network
14129
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
14130
+ model.layers[il].ffn_norm, NULL,
14131
+ LLM_NORM_RMS, cb, il);
14132
+ cb(cur, "ffn_norm", il);
14133
+
14134
+ cur = llm_build_ffn(ctx0, lctx, cur,
14135
+ model.layers[il].ffn_up, NULL, NULL,
14136
+ model.layers[il].ffn_gate, NULL, NULL,
14137
+ model.layers[il].ffn_down, NULL, NULL,
14138
+ NULL,
14139
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
14140
+ cb(cur, "ffn_out", il);
14141
+
14142
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
14143
+ cb(cur, "ffn_out", il);
14144
+
14145
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
14146
+ cb(cur, "l_out", il);
14147
+
14148
+ // input for next layer
14149
+ inpL = cur;
14150
+ }
14151
+
14152
+ cur = inpL;
14153
+
14154
+ cur = llm_build_norm(ctx0, cur, hparams,
14155
+ model.output_norm, NULL,
14156
+ LLM_NORM_RMS, cb, -1);
14157
+ cb(cur, "result_norm", -1);
14158
+
14159
+ // lm_head
14160
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
14161
+ cb(cur, "result_output", -1);
14162
+
14163
+ lm_ggml_build_forward_expand(gf, cur);
14164
+
14165
+ return gf;
14166
+ }
13769
14167
  };
13770
14168
 
13771
14169
  static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -14021,6 +14419,14 @@ static struct lm_ggml_cgraph * llama_build_graph(
14021
14419
  {
14022
14420
  result = llm.build_jais();
14023
14421
  } break;
14422
+ case LLM_ARCH_NEMOTRON:
14423
+ {
14424
+ result = llm.build_nemotron();
14425
+ } break;
14426
+ case LLM_ARCH_EXAONE:
14427
+ {
14428
+ result = llm.build_exaone();
14429
+ } break;
14024
14430
  default:
14025
14431
  LM_GGML_ABORT("fatal error");
14026
14432
  }
@@ -15727,6 +16133,9 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
15727
16133
  case LM_GGML_TYPE_Q6_K: new_type = LM_GGML_TYPE_Q8_0; break;
15728
16134
  default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
15729
16135
  }
16136
+ if (tensor->ne[0] % lm_ggml_blck_size(new_type) != 0) {
16137
+ new_type = LM_GGML_TYPE_F16;
16138
+ }
15730
16139
  LLAMA_LOG_WARN(" - using fallback quantization %s\n", lm_ggml_type_name(new_type));
15731
16140
  ++qs.n_fallback;
15732
16141
  }
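The added guard covers tensors whose row length (`ne[0]`) is not a multiple of the fallback type's block size (for example 32 elements for Q8_0, or the 256-element super-blocks used by the k-quants); such tensors are now kept as F16 instead of being forced into an incompatible quantization.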
@@ -16055,8 +16464,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
16055
16464
  // do not quantize Mamba's small yet 2D weights
16056
16465
  // NOTE: can't use LLM_TN here because the layer number is not known
16057
16466
  quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
16058
- quantize &= name.find("ssm_x.weight") == std::string::npos;
16059
- quantize &= name.find("ssm_dt.weight") == std::string::npos;
16060
16467
 
16061
16468
  // do not quantize relative position bias (T5)
16062
16469
  quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -17091,6 +17498,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
17091
17498
  case LLM_ARCH_OPENELM:
17092
17499
  case LLM_ARCH_GPTNEOX:
17093
17500
  case LLM_ARCH_CODESHELL:
17501
+ case LLM_ARCH_NEMOTRON:
17502
+ case LLM_ARCH_EXAONE:
17094
17503
  return LLAMA_ROPE_TYPE_NEOX;
17095
17504
 
17096
17505
  // all model arches should be listed explicitly here
@@ -19021,6 +19430,22 @@ static int32_t llama_chat_apply_template_internal(
19021
19430
  if (add_ass) {
19022
19431
  ss << "Assistant:";
19023
19432
  }
19433
+ } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
19434
+ // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
19435
+ // EXAONE-3.0-7.8B-Instruct
19436
+ for (auto message : chat) {
19437
+ std::string role(message->role);
19438
+ if (role == "system") {
19439
+ ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
19440
+ } else if (role == "user") {
19441
+ ss << "[|user|]" << trim(message->content) << "\n";
19442
+ } else if (role == "assistant") {
19443
+ ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
19444
+ }
19445
+ }
19446
+ if (add_ass) {
19447
+ ss << "[|assistant|]";
19448
+ }
19024
19449
  } else {
19025
19450
  // template not supported
19026
19451
  return -1;
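Given the branch above, a short conversation renders roughly as follows (the message text is illustrative; `add_ass` appends the trailing assistant tag):

```
[|system|]You are a helpful assistant.[|endofturn|]
[|user|]Hello
[|assistant|]Hi there![|endofturn|]
[|user|]How are you?
[|assistant|]
```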
@@ -19134,6 +19559,10 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
19134
19559
  llama_sample_min_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
19135
19560
  }
19136
19561
 
19562
+ void llama_sample_xtc(struct llama_context * ctx, llama_token_data_array * candidates, float xtc_threshold, float xtc_probability, size_t min_keep, std::mt19937 rng){
19563
+ llama_sample_xtc_impl(ctx ? &ctx-> sampling: nullptr, candidates, xtc_threshold, xtc_probability, min_keep, rng);
19564
+ }
19565
+
19137
19566
  void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
19138
19567
  llama_sample_tail_free_impl(ctx ? &ctx->sampling : nullptr, candidates, z, min_keep);
19139
19568
  }
package/cpp/llama.h CHANGED
@@ -8,6 +8,7 @@
8
8
  #include <stdint.h>
9
9
  #include <stdio.h>
10
10
  #include <stdbool.h>
11
+ #include <random>
11
12
 
12
13
  #ifdef LLAMA_SHARED
13
14
  # if defined(_WIN32) && !defined(__MINGW32__)
@@ -95,6 +96,7 @@ extern "C" {
95
96
  LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
96
97
  LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
97
98
  LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
99
+ LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
98
100
  };
99
101
 
100
102
  enum llama_rope_type {
@@ -1084,6 +1086,15 @@ extern "C" {
1084
1086
  float p,
1085
1087
  size_t min_keep);
1086
1088
 
1089
+ /// @details XTC sampling
1090
+ LLAMA_API void llama_sample_xtc(
1091
+ struct llama_context * ctx,
1092
+ llama_token_data_array * candidates,
1093
+ float xtc_threshold,
1094
+ float xtc_probability,
1095
+ size_t min_keep,
1096
+ std::mt19937 rng);
1097
+
1087
1098
  /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
1088
1099
  LLAMA_API void llama_sample_tail_free(
1089
1100
  struct llama_context * ctx,
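A minimal sketch of calling the new entry point directly, assuming `ctx` is an initialized `llama_context` and `cur_p` an already-populated `llama_token_data_array` (the values shown are illustrative):

```cpp
// Sketch only: applies XTC to the current candidate list, as the other
// llama_sample_* helpers do; see llama_sample_xtc_impl for the exact behaviour.
std::mt19937 rng(1234);              // caller-supplied RNG (taken by value here)
const float xtc_threshold   = 0.1f;  // <= 0.0 disables the sampler
const float xtc_probability = 0.5f;

llama_sample_xtc(ctx, &cur_p, xtc_threshold, xtc_probability, /*min_keep=*/ 1, rng);
```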
package/cpp/sampling.cpp CHANGED
@@ -229,6 +229,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::strin
229
229
  // no reasons to expose this function in header
230
230
  static void sampler_queue(
231
231
  struct llama_context * ctx_main,
232
+ struct llama_sampling_context * ctx_sampling,
232
233
  const llama_sampling_params & params,
233
234
  llama_token_data_array & cur_p,
234
235
  size_t min_keep) {
@@ -238,6 +239,8 @@ static void sampler_queue(
238
239
  const int32_t top_k = params.top_k;
239
240
  const float top_p = params.top_p;
240
241
  const float min_p = params.min_p;
242
+ const float xtc_t = params.xtc_t;
243
+ const float xtc_p = params.xtc_p;
241
244
  const float tfs_z = params.tfs_z;
242
245
  const float typical_p = params.typical_p;
243
246
  const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
@@ -249,6 +252,7 @@ static void sampler_queue(
249
252
  case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
250
253
  case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
251
254
  case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
255
+ case llama_sampler_type::XTC : llama_sample_xtc (ctx_main, &cur_p, xtc_t, xtc_p, min_keep, ctx_sampling->rng); break;
252
256
  case llama_sampler_type::TEMPERATURE:
253
257
  if (dynatemp_range > 0) {
254
258
  float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
@@ -302,7 +306,7 @@ static llama_token llama_sampling_sample_impl(
302
306
  // temperature sampling
303
307
  size_t min_keep = std::max(1, params.min_keep);
304
308
 
305
- sampler_queue(ctx_main, params, cur_p, min_keep);
309
+ sampler_queue(ctx_main, ctx_sampling, params, cur_p, min_keep);
306
310
 
307
311
  id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
308
312
 
package/cpp/sampling.h CHANGED
@@ -14,6 +14,7 @@ enum class llama_sampler_type : char {
14
14
  TOP_K = 'k',
15
15
  TOP_P = 'p',
16
16
  MIN_P = 'm',
17
+ XTC = 'x',
17
18
  TFS_Z = 'f',
18
19
  TYPICAL_P = 'y',
19
20
  TEMPERATURE = 't'
@@ -27,6 +28,8 @@ typedef struct llama_sampling_params {
27
28
  int32_t top_k = 40; // <= 0 to use vocab size
28
29
  float top_p = 0.95f; // 1.0 = disabled
29
30
  float min_p = 0.05f; // 0.0 = disabled
31
+ float xtc_t = 0.0f; // 0.0 = disabled
32
+ float xtc_p = 0.0f; // controls the probability of XTC removal
30
33
  float tfs_z = 1.00f; // 1.0 = disabled
31
34
  float typical_p = 1.00f; // 1.0 = disabled
32
35
  float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
@@ -48,6 +51,7 @@ typedef struct llama_sampling_params {
48
51
  llama_sampler_type::TYPICAL_P,
49
52
  llama_sampler_type::TOP_P,
50
53
  llama_sampler_type::MIN_P,
54
+ llama_sampler_type::XTC,
51
55
  llama_sampler_type::TEMPERATURE
52
56
  };
53
57
 
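XTC is placed between MIN_P and TEMPERATURE in the default chain above and is addressed by the character `'x'` when a sampler sequence is given as a string. A sketch of overriding the order explicitly, where `params` is an `llama_sampling_params` instance:

```cpp
// Sketch only: a custom sampler order including the new XTC stage.
params.samplers_sequence = {
    llama_sampler_type::TOP_K,
    llama_sampler_type::MIN_P,
    llama_sampler_type::XTC,          // character code 'x'
    llama_sampler_type::TEMPERATURE,
};
```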
@@ -1 +1 @@
1
- {"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"..\\..\\src","sources":["NativeRNLlama.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GA6JnCC,gCAAmB,CAACC,GAAG,CAAO,SAAS,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}
1
+ {"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"..\\..\\src","sources":["NativeRNLlama.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GA+JnCC,gCAAmB,CAACC,GAAG,CAAO,SAAS,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}
@@ -1 +1 @@
1
- {"version":3,"names":["TurboModuleRegistry","get"],"sourceRoot":"..\\..\\src","sources":["NativeRNLlama.ts"],"mappings":"AACA,SAASA,mBAAmB,QAAQ,cAAc;AA6JlD,eAAeA,mBAAmB,CAACC,GAAG,CAAO,SAAS,CAAC"}
1
+ {"version":3,"names":["TurboModuleRegistry","get"],"sourceRoot":"..\\..\\src","sources":["NativeRNLlama.ts"],"mappings":"AACA,SAASA,mBAAmB,QAAQ,cAAc;AA+JlD,eAAeA,mBAAmB,CAACC,GAAG,CAAO,SAAS,CAAC"}
@@ -25,6 +25,8 @@ export type NativeCompletionParams = {
25
25
  top_k?: number;
26
26
  top_p?: number;
27
27
  min_p?: number;
28
+ xtc_t?: number;
29
+ xtc_p?: number;
28
30
  tfs_z?: number;
29
31
  typical_p?: number;
30
32
  temperature?: number;
@@ -1 +1 @@
1
- {"version":3,"file":"NativeRNLlama.d.ts","sourceRoot":"","sources":["../../src/NativeRNLlama.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,cAAc,CAAA;AAG/C,MAAM,MAAM,mBAAmB,GAAG;IAChC,KAAK,EAAE,MAAM,CAAA;IACb,cAAc,CAAC,EAAE,OAAO,CAAA;IAExB,SAAS,CAAC,EAAE,OAAO,CAAA;IAEnB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,OAAO,CAAC,EAAE,MAAM,CAAA;IAEhB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,YAAY,CAAC,EAAE,MAAM,CAAA;IAErB,SAAS,CAAC,EAAE,OAAO,CAAA;IACnB,QAAQ,CAAC,EAAE,OAAO,CAAA;IAClB,UAAU,CAAC,EAAE,OAAO,CAAA;IAEpB,IAAI,CAAC,EAAE,MAAM,CAAA;IACb,WAAW,CAAC,EAAE,MAAM,CAAA;IAEpB,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,eAAe,CAAC,EAAE,MAAM,CAAA;CACzB,CAAA;AAED,MAAM,MAAM,sBAAsB,GAAG;IACnC,MAAM,EAAE,MAAM,CAAA;IACd,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,IAAI,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAA;IAEpB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,eAAe,CAAC,EAAE,MAAM,CAAA;IACxB,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,WAAW,CAAC,EAAE,OAAO,CAAA;IACrB,IAAI,CAAC,EAAE,MAAM,CAAA;IAEb,UAAU,CAAC,EAAE,OAAO,CAAA;IACpB,UAAU,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAA;IAEjC,uBAAuB,EAAE,OAAO,CAAA;CACjC,CAAA;AAED,MAAM,MAAM,6BAA6B,GAAG;IAC1C,OAAO,EAAE,MAAM,CAAA;IACf,IAAI,EAAE,MAAM,CAAA;CACb,CAAA;AAED,MAAM,MAAM,yBAAyB,GAAG;IACtC,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,KAAK,CAAC,6BAA6B,CAAC,CAAA;CAC5C,CAAA;AAED,MAAM,MAAM,6BAA6B,GAAG;IAC1C,QAAQ,EAAE,MAAM,CAAA;IAChB,SAAS,EAAE,MAAM,CAAA;IACjB,mBAAmB,EAAE,MAAM,CAAA;IAC3B,iBAAiB,EAAE,MAAM,CAAA;IACzB,WAAW,EAAE,MAAM,CAAA;IACnB,YAAY,EAAE,MAAM,CAAA;IACpB,sBAAsB,EAAE,MAAM,CAAA;IAC9B,oBAAoB,EAAE,MAAM,CAAA;CAC7B,CAAA;AAED,MAAM,MAAM,sBAAsB,GAAG;IACnC,IAAI,EAAE,MAAM,CAAA;IAEZ,gBAAgB,EAAE,MAAM,CAAA;IACxB,gBAAgB,EAAE,MAAM,CAAA;IACxB,SAAS,EAAE,OAAO,CAAA;IAClB,WAAW,EAAE,OAAO,CAAA;IACpB,YAAY,EAAE,MAAM,CAAA;IACpB,aAAa,EAAE,MAAM,CAAA;IACrB,aAAa,EAAE,MAAM,CAAA;IACrB,aAAa,EAAE,MAAM,CAAA;IACrB,OAAO,EAAE,6BAA6B,CAAA;IAEtC,wBAAwB,CAAC,EAAE,KAAK,CAAC,yBAAyB,CAAC,CAAA;CAC5D,CAAA;AAED,MAAM,MAAM,oBAAoB,GAAG;IACjC,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,CAAA;CACtB,CAAA;AAED,MAAM,MAAM,qBAAqB,GAAG;IAClC,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC,CAAA;CACzB,CAAA;AAED,MAAM,MAAM,kBAAkB,GAAG;IAC/B,SAAS,EAAE,MAAM,CAAA;IACjB,GAAG,EAAE,OAAO,CAAA;IACZ,WAAW,EAAE,MAAM,CAAA;IACnB,KAAK,EAAE,MAAM,CAAA;CACd,CAAA;AAED,MAAM,MAAM,uBAAuB,GAAG;IACpC,aAAa,EAAE,MAAM,CAAA;IACrB,MAAM,EAAE,MAAM,CAAA;CACf,CAAA;AAED,MAAM,MAAM,sBAAsB,GAAG;IACnC,IAAI,EAAE,MAAM,CAAA;IACZ,OAAO,EAAE,MAAM,CAAA;CAChB,CAAA;AAED,MAAM,WAAW,IAAK,SAAQ,WAAW;IACvC,eAAe,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IAC7C,WAAW,CAAC,MAAM,EAAE,mBAAmB,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAAA;IAErE,WAAW,CACT,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,uBAAuB,CAAC,CAAA;IACnC,WAAW,CACT,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,EAChB,IAAI,EAAE,MAAM,GACX,OAAO,CAAC,MAAM,CAAC,CAAA;IAClB,UAAU,CACR,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,sBAAsB,GAC7B,OAAO,CAAC,sBAAsB,CAAC,CAAA;IAClC,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IAChD,aAAa,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,oBAAoB,CAAC,CAAA;IAC7E,YAAY,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,oBAAoB,CAAA;IACnE,gBAAgB,CACd,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,sBAAsB,EAAE,EAClC,YAAY,CAAC,EAAE,MAAM,GACpB,OAAO,CAAC,MAAM,CAAC,CAAA;IAClB,UAAU,CAAC,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,M
AAM,CAAC,CAAA;IAChE,SAAS,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,qBAAqB,CAAC,CAAA;IAC1E,KAAK,CACH,SAAS,EAAE,MAAM,EACjB,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,GACT,OAAO,CAAC,MAAM,CAAC,CAAA;IAElB,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IAEhD,kBAAkB,IAAI,OAAO,CAAC,IAAI,CAAC,CAAA;CACpC;;AAED,wBAA+D"}
1
+ {"version":3,"file":"NativeRNLlama.d.ts","sourceRoot":"","sources":["../../src/NativeRNLlama.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,cAAc,CAAA;AAG/C,MAAM,MAAM,mBAAmB,GAAG;IAChC,KAAK,EAAE,MAAM,CAAA;IACb,cAAc,CAAC,EAAE,OAAO,CAAA;IAExB,SAAS,CAAC,EAAE,OAAO,CAAA;IAEnB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,OAAO,CAAC,EAAE,MAAM,CAAA;IAEhB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,YAAY,CAAC,EAAE,MAAM,CAAA;IAErB,SAAS,CAAC,EAAE,OAAO,CAAA;IACnB,QAAQ,CAAC,EAAE,OAAO,CAAA;IAClB,UAAU,CAAC,EAAE,OAAO,CAAA;IAEpB,IAAI,CAAC,EAAE,MAAM,CAAA;IACb,WAAW,CAAC,EAAE,MAAM,CAAA;IAEpB,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,eAAe,CAAC,EAAE,MAAM,CAAA;CACzB,CAAA;AAED,MAAM,MAAM,sBAAsB,GAAG;IACnC,MAAM,EAAE,MAAM,CAAA;IACd,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,IAAI,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAA;IAEpB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,eAAe,CAAC,EAAE,MAAM,CAAA;IACxB,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,WAAW,CAAC,EAAE,OAAO,CAAA;IACrB,IAAI,CAAC,EAAE,MAAM,CAAA;IAEb,UAAU,CAAC,EAAE,OAAO,CAAA;IACpB,UAAU,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAA;IAEjC,uBAAuB,EAAE,OAAO,CAAA;CACjC,CAAA;AAED,MAAM,MAAM,6BAA6B,GAAG;IAC1C,OAAO,EAAE,MAAM,CAAA;IACf,IAAI,EAAE,MAAM,CAAA;CACb,CAAA;AAED,MAAM,MAAM,yBAAyB,GAAG;IACtC,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,KAAK,CAAC,6BAA6B,CAAC,CAAA;CAC5C,CAAA;AAED,MAAM,MAAM,6BAA6B,GAAG;IAC1C,QAAQ,EAAE,MAAM,CAAA;IAChB,SAAS,EAAE,MAAM,CAAA;IACjB,mBAAmB,EAAE,MAAM,CAAA;IAC3B,iBAAiB,EAAE,MAAM,CAAA;IACzB,WAAW,EAAE,MAAM,CAAA;IACnB,YAAY,EAAE,MAAM,CAAA;IACpB,sBAAsB,EAAE,MAAM,CAAA;IAC9B,oBAAoB,EAAE,MAAM,CAAA;CAC7B,CAAA;AAED,MAAM,MAAM,sBAAsB,GAAG;IACnC,IAAI,EAAE,MAAM,CAAA;IAEZ,gBAAgB,EAAE,MAAM,CAAA;IACxB,gBAAgB,EAAE,MAAM,CAAA;IACxB,SAAS,EAAE,OAAO,CAAA;IAClB,WAAW,EAAE,OAAO,CAAA;IACpB,YAAY,EAAE,MAAM,CAAA;IACpB,aAAa,EAAE,MAAM,CAAA;IACrB,aAAa,EAAE,MAAM,CAAA;IACrB,aAAa,EAAE,MAAM,CAAA;IACrB,OAAO,EAAE,6BAA6B,CAAA;IAEtC,wBAAwB,CAAC,EAAE,KAAK,CAAC,yBAAyB,CAAC,CAAA;CAC5D,CAAA;AAED,MAAM,MAAM,oBAAoB,GAAG;IACjC,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,CAAA;CACtB,CAAA;AAED,MAAM,MAAM,qBAAqB,GAAG;IAClC,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC,CAAA;CACzB,CAAA;AAED,MAAM,MAAM,kBAAkB,GAAG;IAC/B,SAAS,EAAE,MAAM,CAAA;IACjB,GAAG,EAAE,OAAO,CAAA;IACZ,WAAW,EAAE,MAAM,CAAA;IACnB,KAAK,EAAE,MAAM,CAAA;CACd,CAAA;AAED,MAAM,MAAM,uBAAuB,GAAG;IACpC,aAAa,EAAE,MAAM,CAAA;IACrB,MAAM,EAAE,MAAM,CAAA;CACf,CAAA;AAED,MAAM,MAAM,sBAAsB,GAAG;IACnC,IAAI,EAAE,MAAM,CAAA;IACZ,OAAO,EAAE,MAAM,CAAA;CAChB,CAAA;AAED,MAAM,WAAW,IAAK,SAAQ,WAAW;IACvC,eAAe,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IAC7C,WAAW,CAAC,MAAM,EAAE,mBAAmB,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAAA;IAErE,WAAW,CACT,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,uBAAuB,CAAC,CAAA;IACnC,WAAW,CACT,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,EAChB,IAAI,EAAE,MAAM,GACX,OAAO,CAAC,MAAM,CAAC,CAAA;IAClB,UAAU,CACR,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,sBAAsB,GAC7B,OAAO,CAAC,sBAAsB,CAAC,CAAA;IAClC,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IAChD,aAAa,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,oBAAoB,CAAC,CAAA;IAC7E,YAAY,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,oBAAoB,CAAA;IACnE,gBAAgB,CACd,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,sBAAsB,EAAE,EAClC,YAAY,CAAC,EAAE,MAAM,GACpB,OAAO,CAAC,MAAM,CAAC,CAAA;IAClB,UAAU,C
AAC,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAA;IAChE,SAAS,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,qBAAqB,CAAC,CAAA;IAC1E,KAAK,CACH,SAAS,EAAE,MAAM,EACjB,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,GACT,OAAO,CAAC,MAAM,CAAC,CAAA;IAElB,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IAEhD,kBAAkB,IAAI,OAAO,CAAC,IAAI,CAAC,CAAA;CACpC;;AAED,wBAA+D"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cui-llama.rn",
3
- "version": "1.0.10",
3
+ "version": "1.1.0",
4
4
  "description": "Fork of llama.rn for ChatterUI",
5
5
  "main": "lib/commonjs/index",
6
6
  "module": "lib/module/index",
@@ -35,6 +35,8 @@ export type NativeCompletionParams = {
35
35
  top_k?: number
36
36
  top_p?: number
37
37
  min_p?: number
38
+ xtc_t?: number
39
+ xtc_p?: number
38
40
  tfs_z?: number
39
41
  typical_p?: number
40
42
  temperature?: number // -> temp