cui-llama.rn 1.0.9 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/common.cpp CHANGED
@@ -116,8 +116,34 @@ int32_t cpu_get_num_physical_cores() {
  if (result == 0) {
  return num_physical_cores;
  }
- #elif defined(_WIN32)
- //TODO: Implement
+ #elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+ // TODO: windows + arm64 + mingw64
+ unsigned int n_threads_win = std::thread::hardware_concurrency();
+ unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+ DWORD buffer_size = 0;
+ if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+ if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+ return default_threads;
+ }
+ }
+
+ std::vector<char> buffer(buffer_size);
+ if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+ return default_threads;
+ }
+
+ int32_t num_physical_cores = 0;
+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+ while (buffer_size > 0) {
+ if (info->Relationship == RelationProcessorCore) {
+ num_physical_cores += info->Processor.GroupCount;
+ }
+ buffer_size -= info->Size;
+ info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+ }
+
+ return num_physical_cores > 0 ? num_physical_cores : default_threads;
  #endif
  unsigned int n_threads = std::thread::hardware_concurrency();
  return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
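The added Windows branch relies on the standard two-call pattern for GetLogicalProcessorInformationEx: a first call with a null buffer that fails with ERROR_INSUFFICIENT_BUFFER and reports the required size, then a second call that fills the buffer with variable-length RelationProcessorCore records. A minimal standalone sketch of that pattern (the function name and fallback value are illustrative, not part of the package):

    #include <windows.h>
    #include <vector>
    #include <cstdint>

    // Illustrative sketch: count physical cores by walking RelationProcessorCore records.
    static int32_t count_physical_cores_win(int32_t fallback) {
        DWORD size = 0;
        // First call: learn the required buffer size.
        if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &size) &&
            GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
            return fallback;
        }
        std::vector<char> buf(size);
        // Second call: fill the buffer with variable-length records.
        if (!GetLogicalProcessorInformationEx(RelationProcessorCore,
                reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buf.data()), &size)) {
            return fallback;
        }
        int32_t cores = 0;
        auto * info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buf.data());
        while (size > 0) {
            if (info->Relationship == RelationProcessorCore) {
                ++cores; // one record per physical core
            }
            size -= info->Size;
            info  = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(
                        reinterpret_cast<char *>(info) + info->Size);
        }
        return cores > 0 ? cores : fallback;
    }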
@@ -1733,7 +1759,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
  if (params.n_threads_batch != -1) {
  os << " (n_threads_batch = " << params.n_threads_batch << ")";
  }
+ #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+ // TODO: windows + arm64 + mingw64
+ DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+ os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+ #else
  os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+ #endif

  return os.str();
  }
@@ -2709,12 +2741,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
  return text;
  }

- bool llama_should_add_bos_token(const llama_model * model) {
- const int add_bos = llama_add_bos_token(model);
-
- return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
- }
-
  //
  // Chat template utils
  //
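With llama_should_add_bos_token() removed above, callers query the model directly through llama_add_bos_token(), which this release changes to return bool (see the llama.h hunk further down). A minimal sketch of the call-site migration, assuming a loaded llama_model * model:

    // before (1.0.9): helper with an SPM fallback for the "unknown" (-1) case
    // const bool add_bos = llama_should_add_bos_token(model);

    // after (1.0.11): the vocab metadata answers directly
    const bool add_bos = llama_add_bos_token(model);
    if (add_bos) {
        // prepend the BOS token when tokenizing a fresh prompt
    }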
package/cpp/common.h CHANGED
@@ -392,10 +392,6 @@ std::string llama_detokenize(
  const std::vector<llama_token> & tokens,
  bool special = true);

- // Uses the value from the model metadata if possible, otherwise
- // defaults to true when model type is SPM, otherwise false.
- bool llama_should_add_bos_token(const llama_model * model);
-
  //
  // Chat template utils
  //
package/cpp/ggml-backend.c CHANGED
@@ -1018,10 +1018,6 @@ static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
  #define LM_GGML_SCHED_MAX_BACKENDS 16
  #endif

- #ifndef LM_GGML_SCHED_MAX_SPLITS
- #define LM_GGML_SCHED_MAX_SPLITS 2048
- #endif
-
  #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
  #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
  #endif
@@ -1125,7 +1121,8 @@ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sch
  }

  #if 0
- static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
+ #define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
+ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
  #define GET_CAUSE(node) causes[hash_id(node)]
  #else
@@ -1549,7 +1546,6 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
  sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
  LM_GGML_ASSERT(sched->splits != NULL);
  }
- LM_GGML_ASSERT(i_split < LM_GGML_SCHED_MAX_SPLITS);
  split = &sched->splits[i_split];
  split->backend_id = node_backend_id;
  split->i_start = i;
@@ -1865,13 +1861,14 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
  sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
  sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));

- const size_t nodes_size = graph_size + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
+ const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+ const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
  sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
  sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
  sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
  sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

- sched->context_buffer_size = LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
+ sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
  sched->context_buffer = malloc(sched->context_buffer_size);

  const int initial_splits_capacity = 16;
package/cpp/ggml-metal.m CHANGED
@@ -310,7 +310,7 @@ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
  LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

  // Configure context
- struct lm_ggml_backend_metal_context * ctx = malloc(sizeof(struct lm_ggml_backend_metal_context));
+ struct lm_ggml_backend_metal_context * ctx = calloc(1, sizeof(struct lm_ggml_backend_metal_context));
  ctx->device = device;
  ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
  ctx->queue = [ctx->device newCommandQueue];
@@ -2313,7 +2313,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

- const bool is_neox = mode & 2;
+ const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;

  id<MTLComputePipelineState> pipeline = nil;

package/cpp/ggml.c CHANGED
@@ -14094,7 +14094,7 @@ static void lm_ggml_compute_forward_rope_f32(
  float corr_dims[2];
  lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

- const bool is_neox = mode & 2;
+ const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;

  const float * freq_factors = NULL;
  if (src2 != NULL) {
@@ -14219,7 +14219,7 @@ static void lm_ggml_compute_forward_rope_f16(
  float corr_dims[2];
  lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

- const bool is_neox = mode & 2;
+ const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;

  const float * freq_factors = NULL;
  if (src2 != NULL) {
@@ -21129,7 +21129,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
  (int64_t) info->ne[2] *
  (int64_t) info->ne[3];

- if (ne % lm_ggml_blck_size(info->type) != 0) {
+ if (lm_ggml_blck_size(info->type) == 0 || ne % lm_ggml_blck_size(info->type) != 0) {
  fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
  __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type), ne, lm_ggml_blck_size(info->type));
  fclose(file);
package/cpp/ggml.h CHANGED
@@ -244,6 +244,8 @@
  #define LM_GGML_EXIT_SUCCESS 0
  #define LM_GGML_EXIT_ABORTED 1

+ #define LM_GGML_ROPE_TYPE_NEOX 2
+
  #define LM_GGUF_MAGIC "GGUF"

  #define LM_GGUF_VERSION 3
@@ -1453,8 +1455,8 @@ extern "C" {
  struct lm_ggml_tensor * b);

  // rotary position embedding
- // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
- // if mode & 2 == 1, GPT-NeoX style
+ // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+ // if (mode & LM_GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
  //
  // b is an int32 vector with size a->ne[2], it contains the positions
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope(
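The new LM_GGML_ROPE_TYPE_NEOX define gives the `mode & 2` checks in the backends a name (see the ggml.c and ggml-metal.m hunks above). A minimal sketch of testing the mode bits with the constant; the helper is hypothetical, not part of the ggml API:

    // Hypothetical helper: interpret the RoPE mode bits with the named constant
    // instead of the old magic number 2.
    static inline bool rope_is_neox(int mode) {
        return (mode & LM_GGML_ROPE_TYPE_NEOX) != 0;
    }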
package/cpp/grammar-parser.cpp CHANGED
@@ -369,6 +369,9 @@ namespace grammar_parser {
  }
  // Validate the state to ensure that all rules are defined
  for (const auto & rule : state.rules) {
+ if (rule.empty()) {
+ throw std::runtime_error("Undefined rule");
+ }
  for (const auto & elem : rule) {
  if (elem.type == LLAMA_GRETYPE_RULE_REF) {
  // Ensure that the rule at that location exists
package/cpp/llama-sampling.cpp CHANGED
@@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra
  constexpr float bucket_low = -10.0f;
  constexpr float bucket_high = 10.0f;
  constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
- constexpr float bucker_inter = -bucket_low * bucket_scale;
+ constexpr float bucket_inter = -bucket_low * bucket_scale;

  std::vector<int> bucket_idx(candidates->size);
  std::vector<int> histo(nbuckets, 0);

  for (int i = 0; i < (int)candidates->size; ++i) {
  const float val = candidates->data[i].logit;
- int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+ int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
  ib = std::max(0, std::min(nbuckets-1, ib));
  bucket_idx[i] = ib;
  ++histo[ib];
package/cpp/llama-vocab.cpp CHANGED
@@ -388,6 +388,7 @@ struct llm_tokenizer_bpe {
  case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
  case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
  case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+ case LLAMA_VOCAB_PRE_TYPE_EXAONE:
  regex_exprs = {
  "\\p{N}",
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -410,6 +411,8 @@ struct llm_tokenizer_bpe {
  };
  break;
  case LLAMA_VOCAB_PRE_TYPE_PORO:
+ case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+ case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
  regex_exprs = {
  " ?[^(\\s|.,!?…。,、।۔،)]+",
  };
@@ -1466,11 +1469,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
  return vocab.special_pad_id;
  }

- int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) {
+ bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
  return vocab.tokenizer_add_bos;
  }

- int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) {
+ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
  return vocab.tokenizer_add_eos;
  }

package/cpp/llama-vocab.h CHANGED
@@ -95,8 +95,8 @@ llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
  llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
  llama_token llama_token_pad_impl(const struct llama_vocab & vocab);

- int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab);
- int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab);
+ bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+ bool llama_add_eos_token_impl(const struct llama_vocab & vocab);

  llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
  llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
package/cpp/llama.cpp CHANGED
@@ -221,6 +221,8 @@ enum llm_arch {
  LLM_ARCH_T5,
  LLM_ARCH_T5ENCODER,
  LLM_ARCH_JAIS,
+ LLM_ARCH_NEMOTRON,
+ LLM_ARCH_EXAONE,
  LLM_ARCH_UNKNOWN,
  };

@@ -266,6 +268,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_T5, "t5" },
  { LLM_ARCH_T5ENCODER, "t5encoder" },
  { LLM_ARCH_JAIS, "jais" },
+ { LLM_ARCH_NEMOTRON, "nemotron" },
+ { LLM_ARCH_EXAONE, "exaone" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -1307,6 +1311,43 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  },
  },
+ {
+ LLM_ARCH_NEMOTRON,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_EXAONE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -3586,13 +3627,8 @@ namespace GGUFMeta {

  using llama_buf_map = std::unordered_map<uint32_t, lm_ggml_backend_buffer_t>;

- // TODO: update when needed or think of some clever automatic way to do this
- static size_t llama_model_max_nodes(const llama_model & /*model*/) {
- //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
- // return 32768;
- //}
-
- return 8192;
+ static size_t llama_model_max_nodes(const llama_model & model) {
+ return std::max<size_t>(8192, model.tensors_by_name.size()*5);
  }

  struct llama_model_loader {
@@ -4912,7 +4948,6 @@ static void llm_load_hparams(
  } break;
  case LLM_ARCH_PHI3:
  {
- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

  switch (hparams.n_layer) {
@@ -4921,6 +4956,22 @@ static void llm_load_hparams(
  case 40: model.type = e_model::MODEL_14B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
+
+ // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
+ if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
+ // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+ hparams.n_swa = 2047;
+ } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
+ // default value for Phi-3-mini-128k-instruct
+ hparams.n_swa = 262144;
+ } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
+ // default value for Phi-3-medium-128k-instruct
+ hparams.n_swa = 131072;
+ }
+ bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (!found_swa && hparams.n_swa == 0) {
+ throw std::runtime_error("invalid value for sliding_window");
+ }
  } break;
  case LLM_ARCH_PLAMO:
  {
@@ -5236,6 +5287,23 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_NEMOTRON:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_4B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_EXAONE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_8B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -5468,6 +5536,15 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "codeshell") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+ } else if (
+ tokenizer_pre == "bloom") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+ } else if (
+ tokenizer_pre == "gpt3-finnish") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
+ } else if (
+ tokenizer_pre == "exaone") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -7563,6 +7640,78 @@ static bool llm_load_tensors(
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
  }
  } break;
+ case LLM_ARCH_NEMOTRON:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
+ lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ // optional bias tensors
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+ // optional MLP bias
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_EXAONE:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
+ lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -8249,7 +8398,7 @@ static struct lm_ggml_tensor * llm_build_kqv(
  struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);

- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON) {
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
  lm_ggml_mul_mat_set_prec(kq, LM_GGML_PREC_F32);
@@ -13750,6 +13899,254 @@ struct llm_build_context {

  return gf;
  }
+
+ struct lm_ggml_cgraph * build_nemotron() {
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ //LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct lm_ggml_tensor * cur;
+ struct lm_ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct lm_ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = lm_ggml_rope_ext(
+ ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = lm_ggml_rope_ext(
+ ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
+
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ lm_ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct lm_ggml_cgraph * build_exaone() {
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct lm_ggml_tensor * cur;
+ struct lm_ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct lm_ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
+
+ // compute Q and K and RoPE them
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = lm_ggml_rope_ext(
+ ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = lm_ggml_rope_ext(
+ ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ lm_ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };

  static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -14005,6 +14402,14 @@ static struct lm_ggml_cgraph * llama_build_graph(
  {
  result = llm.build_jais();
  } break;
+ case LLM_ARCH_NEMOTRON:
+ {
+ result = llm.build_nemotron();
+ } break;
+ case LLM_ARCH_EXAONE:
+ {
+ result = llm.build_exaone();
+ } break;
  default:
  LM_GGML_ABORT("fatal error");
  }
@@ -14718,12 +15123,15 @@ static int llama_decode_internal(
  res = nullptr;
  embd = nullptr;
  } else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = gf->nodes[gf->n_nodes - 1];
- if (strcmp(embd->name, "result_embd_pooled") != 0) {
- embd = gf->nodes[gf->n_nodes - 2];
+ res = nullptr; // do not extract logits for embedding case
+ embd = nullptr;
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
+ if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
+ embd = gf->nodes[i];
+ break;
+ }
  }
- LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+ LM_GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
  } else {
  embd = nullptr; // do not extract embeddings when not needed
  LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -17072,6 +17480,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_OPENELM:
  case LLM_ARCH_GPTNEOX:
  case LLM_ARCH_CODESHELL:
+ case LLM_ARCH_NEMOTRON:
+ case LLM_ARCH_EXAONE:
  return LLAMA_ROPE_TYPE_NEOX;

  // all model arches should be listed explicitly here
@@ -18697,11 +19107,11 @@ llama_token llama_token_pad(const struct llama_model * model) {
  return llama_token_pad_impl(model->vocab);
  }

- int32_t llama_add_bos_token(const struct llama_model * model) {
+ bool llama_add_bos_token(const struct llama_model * model) {
  return llama_add_bos_token_impl(model->vocab);
  }

- int32_t llama_add_eos_token(const struct llama_model * model) {
+ bool llama_add_eos_token(const struct llama_model * model) {
  return llama_add_eos_token_impl(model->vocab);
  }

@@ -19002,6 +19412,22 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "Assistant:";
  }
+ } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
+ // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
+ // EXAONE-3.0-7.8B-Instruct
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
+ } else if (role == "user") {
+ ss << "[|user|]" << trim(message->content) << "\n";
+ } else if (role == "assistant") {
+ ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
+ }
+ }
+ if (add_ass) {
+ ss << "[|assistant|]";
+ }
  } else {
  // template not supported
  return -1;
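The new exaone3 branch above renders turns as [|system|]/[|user|]/[|assistant|] prefixed blocks terminated by [|endofturn|]. A minimal sketch of exercising it through the public llama_chat_apply_template API; the signature and the nullptr-model shortcut reflect the llama.h of this era as far as the diff shows and are otherwise assumptions, and buffer sizing is kept deliberately simple:

    #include <cstdio>
    #include <string>
    #include <vector>
    #include "llama.h"

    int main() {
        // Two turns rendered with the newly recognized "exaone3" template name.
        std::vector<llama_chat_message> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };
        std::string buf(1024, '\0');
        const int32_t n = llama_chat_apply_template(nullptr, "exaone3",
                                                    chat.data(), chat.size(),
                                                    /*add_ass=*/true,
                                                    buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            buf.resize(n);
            printf("%s", buf.c_str());
            // Expected shape, per the hunk above:
            // [|system|]You are a helpful assistant.[|endofturn|]
            // [|user|]Hello!
            // [|assistant|]
        }
        return 0;
    }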
package/cpp/llama.h CHANGED
@@ -93,15 +93,15 @@ extern "C" {
  LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
  LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
  LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
+ LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
+ LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
+ LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
  };

- // note: these values should be synchronized with lm_ggml_rope
- // TODO: maybe move this enum to ggml.h (lm_ggml_rope_type)
  enum llama_rope_type {
  LLAMA_ROPE_TYPE_NONE = -1,
- LLAMA_ROPE_TYPE_NORM = 0,
- LLAMA_ROPE_TYPE_NEOX = 2,
- LLAMA_ROPE_TYPE_GLM = 4,
+ LLAMA_ROPE_TYPE_NORM = 0,
+ LLAMA_ROPE_TYPE_NEOX = LM_GGML_ROPE_TYPE_NEOX,
  };

  enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -915,11 +915,8 @@ extern "C" {
  LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
  LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding

- // Returns -1 if unknown, 1 for true or 0 for false.
- LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
-
- // Returns -1 if unknown, 1 for true or 0 for false.
- LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
+ LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
+ LLAMA_API bool llama_add_eos_token(const struct llama_model * model);

  // Codellama infill tokens
  LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
package/cpp/rn-llama.hpp CHANGED
@@ -297,7 +297,9 @@ struct llama_rn_context
  }

  // do Context Shift , may be buggy! TODO: Verify functionality
- purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+ if(!params.embedding){
+ purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+ }

  // push the prompt into the sampling context (do not apply grammar)
  for (auto & token : prompt_tokens)
@@ -305,7 +307,7 @@ struct llama_rn_context
  llama_sampling_accept(ctx_sampling, ctx, token, false);
  }
  // compare the evaluated prompt with the new prompt
- n_past = common_part(embd, prompt_tokens);
+ n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
  LLAMA_LOG_INFO("%s: n_past: %zu", __func__, n_past);
  LLAMA_LOG_INFO("%s: embd size: %zu", __func__, embd.size());
  LLAMA_LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
@@ -342,9 +344,9 @@ struct llama_rn_context
  completion_token_output result;
  result.tok = -1;

+ // this truncation should never trigger with good context shifting
  if (embd.size() >= (size_t)params.n_ctx)
  {
- // Shift context

  const int n_left = n_past - params.n_keep - 1;
  const int n_discard = n_left/2;
@@ -546,9 +548,21 @@ struct llama_rn_context
  LOG_WARNING("embedding disabled, embedding: %s", params.embedding);
  return std::vector<float>(n_embd, 0.0f);
  }
- const float *data = llama_get_embeddings(ctx);
- std::vector<float> embedding(data, data + n_embd);
- return embedding;
+ float *data;
+
+ if(params.pooling_type == 0){
+ data = llama_get_embeddings(ctx);
+ }
+ else {
+ data = llama_get_embeddings_seq(ctx, 0);
+ }
+
+ if(!data) {
+ return std::vector<float>(n_embd, 0.0f);
+ }
+ std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
+ llama_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
+ return out;
  }

  std::string bench(int pp, int tg, int pl, int nr)
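The reworked getEmbedding path above picks per-token or pooled embeddings depending on params.pooling_type, guards against a null pointer, and normalizes the result with llama_embd_normalize from the vendored common code. As a rough illustration only (not the library implementation), and assuming the default mode is a Euclidean norm, the normalization step amounts to:

    #include <cmath>

    // Illustrative L2 normalization of an embedding vector; llama_embd_normalize
    // also supports other modes, selected by params.embd_normalize.
    static void l2_normalize(const float * inp, float * out, int n) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            sum += (double) inp[i] * inp[i];
        }
        const double norm = sum > 0.0 ? std::sqrt(sum) : 1.0;
        for (int i = 0; i < n; ++i) {
            out[i] = (float) (inp[i] / norm);
        }
    }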
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "cui-llama.rn",
- "version": "1.0.9",
+ "version": "1.0.11",
  "description": "Fork of llama.rn for ChatterUI",
  "main": "lib/commonjs/index",
  "module": "lib/module/index",