cui-llama.rn 1.0.10 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/common.cpp CHANGED
@@ -116,8 +116,34 @@ int32_t cpu_get_num_physical_cores() {
      if (result == 0) {
          return num_physical_cores;
      }
- #elif defined(_WIN32)
-     //TODO: Implement
+ #elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+     // TODO: windows + arm64 + mingw64
+     unsigned int n_threads_win = std::thread::hardware_concurrency();
+     unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+     DWORD buffer_size = 0;
+     if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+         if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+             return default_threads;
+         }
+     }
+
+     std::vector<char> buffer(buffer_size);
+     if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+         return default_threads;
+     }
+
+     int32_t num_physical_cores = 0;
+     PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+     while (buffer_size > 0) {
+         if (info->Relationship == RelationProcessorCore) {
+             num_physical_cores += info->Processor.GroupCount;
+         }
+         buffer_size -= info->Size;
+         info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+     }
+
+     return num_physical_cores > 0 ? num_physical_cores : default_threads;
  #endif
      unsigned int n_threads = std::thread::hardware_concurrency();
      return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@@ -1733,7 +1759,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
      if (params.n_threads_batch != -1) {
          os << " (n_threads_batch = " << params.n_threads_batch << ")";
      }
+ #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+     // TODO: windows + arm64 + mingw64
+     DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+     os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+ #else
      os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+ #endif

      return os.str();
  }
@@ -1018,10 +1018,6 @@ static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
  #define LM_GGML_SCHED_MAX_BACKENDS 16
  #endif

- #ifndef LM_GGML_SCHED_MAX_SPLITS
- #define LM_GGML_SCHED_MAX_SPLITS 2048
- #endif
-
  #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
  #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
  #endif
@@ -1125,7 +1121,8 @@ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sch
  }

  #if 0
- static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
+ #define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
+ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
  #define GET_CAUSE(node) causes[hash_id(node)]
  #else
@@ -1549,7 +1546,6 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
          sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
          LM_GGML_ASSERT(sched->splits != NULL);
      }
-     LM_GGML_ASSERT(i_split < LM_GGML_SCHED_MAX_SPLITS);
      split = &sched->splits[i_split];
      split->backend_id = node_backend_id;
      split->i_start = i;
@@ -1865,13 +1861,14 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
      sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
      sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));

-     const size_t nodes_size = graph_size + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
+     const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+     const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
      sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
      sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
      sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
      sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

-     sched->context_buffer_size = LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
+     sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
      sched->context_buffer = malloc(sched->context_buffer_size);

      const int initial_splits_capacity = 16;
@@ -388,6 +388,7 @@ struct llm_tokenizer_bpe {
      case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
      case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
      case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+     case LLAMA_VOCAB_PRE_TYPE_EXAONE:
          regex_exprs = {
              "\\p{N}",
              "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
package/cpp/llama.cpp CHANGED
@@ -221,6 +221,8 @@ enum llm_arch {
      LLM_ARCH_T5,
      LLM_ARCH_T5ENCODER,
      LLM_ARCH_JAIS,
+     LLM_ARCH_NEMOTRON,
+     LLM_ARCH_EXAONE,
      LLM_ARCH_UNKNOWN,
  };

@@ -266,6 +268,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
      { LLM_ARCH_T5, "t5" },
      { LLM_ARCH_T5ENCODER, "t5encoder" },
      { LLM_ARCH_JAIS, "jais" },
+     { LLM_ARCH_NEMOTRON, "nemotron" },
+     { LLM_ARCH_EXAONE, "exaone" },
      { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -1307,6 +1311,43 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
          { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
      },
  },
+ {
+     LLM_ARCH_NEMOTRON,
+     {
+         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+         { LLM_TENSOR_OUTPUT, "output" },
+         { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+         { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+         { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+         { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+         { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+         { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+         { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+     },
+ },
+ {
+     LLM_ARCH_EXAONE,
+     {
+         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+         { LLM_TENSOR_OUTPUT, "output" },
+         { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+         { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+         { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+         { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+         { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+         { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+         { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+         { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+     },
+ },
  {
      LLM_ARCH_UNKNOWN,
      {
@@ -5246,6 +5287,23 @@ static void llm_load_hparams(
              default: model.type = e_model::MODEL_UNKNOWN;
          }
      } break;
+     case LLM_ARCH_NEMOTRON:
+         {
+             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+             switch (hparams.n_layer) {
+                 case 32: model.type = e_model::MODEL_4B; break;
+                 default: model.type = e_model::MODEL_UNKNOWN;
+             }
+         } break;
+     case LLM_ARCH_EXAONE:
+         {
+             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+             switch (hparams.n_layer) {
+                 case 32: model.type = e_model::MODEL_8B; break;
+                 default: model.type = e_model::MODEL_UNKNOWN;
+             }
+         } break;
      default: (void)0;
  }

@@ -5484,6 +5542,9 @@ static void llm_load_vocab(
          } else if (
              tokenizer_pre == "gpt3-finnish") {
              vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
+         } else if (
+             tokenizer_pre == "exaone") {
+             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
          } else {
              throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
          }
@@ -7579,6 +7640,78 @@ static bool llm_load_tensors(
                  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
              }
          } break;
+     case LLM_ARCH_NEMOTRON:
+         {
+             model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+             // output
+             {
+                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+             }
+
+             for (int i = 0; i < n_layer; ++i) {
+                 lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                 lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                 auto & layer = model.layers[i];
+
+                 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                 layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                 layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                 layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                 layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                 // optional bias tensors
+                 layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                 layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                 layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                 layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                 layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                 layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+                 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+                 // optional MLP bias
+                 layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                 layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+             }
+         } break;
+     case LLM_ARCH_EXAONE:
+         {
+             model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+             // output
+             {
+                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+             }
+
+             for (int i = 0; i < n_layer; ++i) {
+                 lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                 lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                 auto & layer = model.layers[i];
+
+                 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                 layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                 layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                 layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+                 layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                 layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+             }
+         } break;
      default:
          throw std::runtime_error("unknown architecture");
  }
@@ -8265,7 +8398,7 @@ static struct lm_ggml_tensor * llm_build_kqv(
      struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q);
      cb(kq, "kq", il);

-     if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
+     if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON) {
          // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
          // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
          lm_ggml_mul_mat_set_prec(kq, LM_GGML_PREC_F32);
@@ -13766,6 +13899,254 @@ struct llm_build_context {

          return gf;
      }
+
+     struct lm_ggml_cgraph * build_nemotron() {
+         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+         const int64_t n_embd_head = hparams.n_embd_head_v;
+         LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+         //LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+         struct lm_ggml_tensor * cur;
+         struct lm_ggml_tensor * inpL;
+
+         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+         // inp_pos - contains the positions
+         struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+         struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+         for (int il = 0; il < n_layer; ++il) {
+             struct lm_ggml_tensor * inpSA = inpL;
+
+             // norm
+             cur = llm_build_norm(ctx0, inpL, hparams,
+                     model.layers[il].attn_norm,
+                     model.layers[il].attn_norm_b,
+                     LLM_NORM, cb, il);
+             cb(cur, "attn_norm", il);
+
+             // self-attention
+             {
+                 // compute Q and K and RoPE them
+                 struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                 cb(Qcur, "Qcur", il);
+                 if (model.layers[il].bq) {
+                     Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                     cb(Qcur, "Qcur", il);
+                 }
+
+                 struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                 cb(Kcur, "Kcur", il);
+                 if (model.layers[il].bk) {
+                     Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                     cb(Kcur, "Kcur", il);
+                 }
+
+                 struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                 cb(Vcur, "Vcur", il);
+                 if (model.layers[il].bv) {
+                     Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                     cb(Vcur, "Vcur", il);
+                 }
+
+                 Qcur = lm_ggml_rope_ext(
+                         ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                         ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+                 cb(Qcur, "Qcur", il);
+
+                 Kcur = lm_ggml_rope_ext(
+                         ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                         ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+                 cb(Kcur, "Kcur", il);
+
+                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                         model.layers[il].wo, model.layers[il].bo,
+                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+             }
+
+             if (il == n_layer - 1) {
+                 // skip computing output for unused tokens
+                 struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                 cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                 inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+             }
+
+             struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+             cb(ffn_inp, "ffn_inp", il);
+
+             // feed-forward network
+             cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                     model.layers[il].ffn_norm,
+                     model.layers[il].ffn_norm_b,
+                     LLM_NORM, cb, il);
+             cb(cur, "ffn_norm", il);
+
+             cur = llm_build_ffn(ctx0, lctx, cur,
+                     model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                     NULL, NULL, NULL,
+                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                     NULL,
+                     LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
+
+             cur = lm_ggml_add(ctx0, cur, ffn_inp);
+             cb(cur, "ffn_out", il);
+
+             cur = lctx.cvec.apply_to(ctx0, cur, il);
+             cb(cur, "l_out", il);
+
+             // input for next layer
+             inpL = cur;
+         }
+
+         cur = inpL;
+
+         cur = llm_build_norm(ctx0, cur, hparams,
+                 model.output_norm, model.output_norm_b,
+                 LLM_NORM, cb, -1);
+         cb(cur, "result_norm", -1);
+
+         // lm_head
+         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+         cb(cur, "result_output", -1);
+
+         lm_ggml_build_forward_expand(gf, cur);
+
+         return gf;
+     }
+
+     struct lm_ggml_cgraph * build_exaone() {
+         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+         // mutable variable, needed during the last layer of the computation to skip unused tokens
+         int32_t n_tokens = this->n_tokens;
+
+         const int64_t n_embd_head = hparams.n_embd_head_v;
+         LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+         LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+         struct lm_ggml_tensor * cur;
+         struct lm_ggml_tensor * inpL;
+
+         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+         // inp_pos - contains the positions
+         struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+         struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+         for (int il = 0; il < n_layer; ++il) {
+             struct lm_ggml_tensor * inpSA = inpL;
+
+             // norm
+             cur = llm_build_norm(ctx0, inpL, hparams,
+                     model.layers[il].attn_norm, NULL,
+                     LLM_NORM_RMS, cb, il);
+             cb(cur, "attn_norm", il);
+
+             // self-attention
+             {
+                 // rope freq factors for llama3; may return nullptr for llama2 and other models
+                 struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
+
+                 // compute Q and K and RoPE them
+                 struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                 cb(Qcur, "Qcur", il);
+                 if (model.layers[il].bq) {
+                     Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                     cb(Qcur, "Qcur", il);
+                 }
+
+                 struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                 cb(Kcur, "Kcur", il);
+                 if (model.layers[il].bk) {
+                     Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                     cb(Kcur, "Kcur", il);
+                 }
+
+                 struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                 cb(Vcur, "Vcur", il);
+                 if (model.layers[il].bv) {
+                     Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                     cb(Vcur, "Vcur", il);
+                 }
+
+                 Qcur = lm_ggml_rope_ext(
+                         ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                         ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+                 cb(Qcur, "Qcur", il);
+
+                 Kcur = lm_ggml_rope_ext(
+                         ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                         ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+                 cb(Kcur, "Kcur", il);
+
+                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                         model.layers[il].wo, model.layers[il].bo,
+                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+             }
+
+             if (il == n_layer - 1) {
+                 // skip computing output for unused tokens
+                 struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                 n_tokens = n_outputs;
+                 cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                 inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+             }
+
+             struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+             cb(ffn_inp, "ffn_inp", il);
+
+             // feed-forward network
+             cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                     model.layers[il].ffn_norm, NULL,
+                     LLM_NORM_RMS, cb, il);
+             cb(cur, "ffn_norm", il);
+
+             cur = llm_build_ffn(ctx0, lctx, cur,
+                     model.layers[il].ffn_up, NULL, NULL,
+                     model.layers[il].ffn_gate, NULL, NULL,
+                     model.layers[il].ffn_down, NULL, NULL,
+                     NULL,
+                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+             cb(cur, "ffn_out", il);
+
+             cur = lm_ggml_add(ctx0, cur, ffn_inp);
+             cb(cur, "ffn_out", il);
+
+             cur = lctx.cvec.apply_to(ctx0, cur, il);
+             cb(cur, "l_out", il);
+
+             // input for next layer
+             inpL = cur;
+         }
+
+         cur = inpL;
+
+         cur = llm_build_norm(ctx0, cur, hparams,
+                 model.output_norm, NULL,
+                 LLM_NORM_RMS, cb, -1);
+         cb(cur, "result_norm", -1);
+
+         // lm_head
+         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+         cb(cur, "result_output", -1);
+
+         lm_ggml_build_forward_expand(gf, cur);
+
+         return gf;
+     }
  };

  static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -14021,6 +14402,14 @@ static struct lm_ggml_cgraph * llama_build_graph(
          {
              result = llm.build_jais();
          } break;
+     case LLM_ARCH_NEMOTRON:
+         {
+             result = llm.build_nemotron();
+         } break;
+     case LLM_ARCH_EXAONE:
+         {
+             result = llm.build_exaone();
+         } break;
      default:
          LM_GGML_ABORT("fatal error");
  }
@@ -17091,6 +17480,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
          case LLM_ARCH_OPENELM:
          case LLM_ARCH_GPTNEOX:
          case LLM_ARCH_CODESHELL:
+         case LLM_ARCH_NEMOTRON:
+         case LLM_ARCH_EXAONE:
              return LLAMA_ROPE_TYPE_NEOX;

          // all model arches should be listed explicitly here
@@ -19021,6 +19412,22 @@ static int32_t llama_chat_apply_template_internal(
          if (add_ass) {
              ss << "Assistant:";
          }
+     } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
+         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
+         // EXAONE-3.0-7.8B-Instruct
+         for (auto message : chat) {
+             std::string role(message->role);
+             if (role == "system") {
+                 ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
+             } else if (role == "user") {
+                 ss << "[|user|]" << trim(message->content) << "\n";
+             } else if (role == "assistant") {
+                 ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
+             }
+         }
+         if (add_ass) {
+             ss << "[|assistant|]";
+         }
      } else {
          // template not supported
          return -1;
package/cpp/llama.h CHANGED
@@ -95,6 +95,7 @@ extern "C" {
      LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
      LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
      LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
+     LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
  };

  enum llama_rope_type {
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "cui-llama.rn",
-   "version": "1.0.10",
+   "version": "1.0.11",
    "description": "Fork of llama.rn for ChatterUI",
    "main": "lib/commonjs/index",
    "module": "lib/module/index",