cui-llama.rn 1.0.10 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/common.cpp +34 -2
- package/cpp/ggml-backend.c +5 -8
- package/cpp/llama-vocab.cpp +1 -0
- package/cpp/llama.cpp +408 -1
- package/cpp/llama.h +1 -0
- package/package.json +1 -1
package/cpp/common.cpp
CHANGED
@@ -116,8 +116,34 @@ int32_t cpu_get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    //TODO:
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
+    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+    DWORD buffer_size = 0;
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+            return default_threads;
+        }
+    }
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+        return default_threads;
+    }
+
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    while (buffer_size > 0) {
+        if (info->Relationship == RelationProcessorCore) {
+            num_physical_cores += info->Processor.GroupCount;
+        }
+        buffer_size -= info->Size;
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+    }
+
+    return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@@ -1733,7 +1759,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     if (params.n_threads_batch != -1) {
         os << " (n_threads_batch = " << params.n_threads_batch << ")";
     }
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+#else
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+#endif
 
     return os.str();
 }
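The new Windows branch replaces the `std::thread::hardware_concurrency()` guess with a walk over `RelationProcessorCore` records, and the system-info string now reports logical processors via `GetActiveProcessorCount(ALL_PROCESSOR_GROUPS)`. Below is a standalone sketch (not part of the package) of the same two Win32 calls, assuming Windows 7+ with `_WIN32_WINNT >= 0x0601`:

```cpp
// Standalone sketch mirroring the pattern added in common.cpp; not the package's code.
#include <windows.h>
#include <cstdio>
#include <vector>

static int count_physical_cores() {
    DWORD size = 0;
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &size) &&
        GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
        return -1; // unexpected failure
    }
    std::vector<char> buffer(size);
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore,
            reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &size)) {
        return -1;
    }
    int cores = 0;
    auto * info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
    while (size > 0) {
        if (info->Relationship == RelationProcessorCore) {
            cores += info->Processor.GroupCount; // one record per physical core; GroupCount is usually 1
        }
        size -= info->Size;
        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(
            reinterpret_cast<char *>(info) + info->Size);
    }
    return cores;
}

int main() {
    printf("physical cores     : %d\n", count_physical_cores());
    printf("logical processors : %lu\n", (unsigned long) GetActiveProcessorCount(ALL_PROCESSOR_GROUPS));
    return 0;
}
```

Summing the `RelationProcessorCore` records separates physical cores from SMT threads, which is why the default thread count no longer over-subscribes hyper-threaded CPUs.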
package/cpp/ggml-backend.c
CHANGED
@@ -1018,10 +1018,6 @@ static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
 #define LM_GGML_SCHED_MAX_BACKENDS 16
 #endif
 
-#ifndef LM_GGML_SCHED_MAX_SPLITS
-#define LM_GGML_SCHED_MAX_SPLITS 2048
-#endif
-
 #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
 #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
 #endif
@@ -1125,7 +1121,8 @@ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sch
 }
 
 #if 0
-static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
+#define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
+static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1549,7 +1546,6 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
                 sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
                 LM_GGML_ASSERT(sched->splits != NULL);
             }
-            LM_GGML_ASSERT(i_split < LM_GGML_SCHED_MAX_SPLITS);
             split = &sched->splits[i_split];
             split->backend_id = node_backend_id;
             split->i_start = i;
@@ -1865,13 +1861,14 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
     sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
     sched->hv_tensor_copies      = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
 
-    const size_t nodes_size = graph_size + LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+    const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
     sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
     sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
-    sched->context_buffer_size = LM_GGML_SCHED_MAX_SPLITS*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
+    sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
     sched->context_buffer = malloc(sched->context_buffer_size);
 
     const int initial_splits_capacity = 16;
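The net effect of this file's changes is that the fixed `LM_GGML_SCHED_MAX_SPLITS` cap (2048) is gone: the scheduler now assumes at most one split per graph node and sizes both its backend-id arrays and its tensor context from `graph_size`. The sketch below restates that arithmetic with illustrative names; it is not the library code, and `max_split_inputs`, `tensor_bytes`, and `graph_overhead` merely stand in for `LM_GGML_SCHED_MAX_SPLIT_INPUTS`, `sizeof(struct lm_ggml_tensor)`, and `lm_ggml_graph_overhead_custom(graph_size, false)`:

```cpp
// Illustrative restatement of the new scheduler sizing rule (not the library code).
#include <cstddef>

struct sched_sizes {
    std::size_t nodes_size;          // slots in the node/leaf backend-id arrays
    std::size_t context_buffer_size; // bytes reserved for the scheduler's tensor context
};

// The concrete values of the last three parameters are implementation details of
// the library and are only passed in here to keep the sketch self-contained.
static sched_sizes compute_sched_sizes(std::size_t graph_size,
                                       std::size_t max_split_inputs,
                                       std::size_t tensor_bytes,
                                       std::size_t graph_overhead) {
    const std::size_t max_splits = graph_size; // at most one split for each node
    return {
        graph_size + max_splits * max_split_inputs * 2,
        max_splits * max_split_inputs * 2 * tensor_bytes + graph_overhead,
    };
}
```

Because the buffers scale with the actual graph, large graphs no longer trip the removed `i_split < LM_GGML_SCHED_MAX_SPLITS` assertion.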
package/cpp/llama-vocab.cpp
CHANGED
@@ -388,6 +388,7 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
package/cpp/llama.cpp
CHANGED
@@ -221,6 +221,8 @@ enum llm_arch {
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
+    LLM_ARCH_NEMOTRON,
+    LLM_ARCH_EXAONE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -266,6 +268,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5,              "t5"           },
     { LLM_ARCH_T5ENCODER,       "t5encoder"    },
     { LLM_ARCH_JAIS,            "jais"         },
+    { LLM_ARCH_NEMOTRON,        "nemotron"     },
+    { LLM_ARCH_EXAONE,          "exaone"       },
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
 
@@ -1307,6 +1311,43 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_NEMOTRON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_EXAONE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -5246,6 +5287,23 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_NEMOTRON:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_4B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_EXAONE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -5484,6 +5542,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "gpt3-finnish") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
+            } else if (
+                tokenizer_pre == "exaone") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -7579,6 +7640,78 @@ static bool llm_load_tensors(
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                 }
             } break;
+        case LLM_ARCH_NEMOTRON:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                    lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                    // optional bias tensors
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+
+                    // optional MLP bias
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                }
+            } break;
+        case LLM_ARCH_EXAONE:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                    lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    layer.ffn_gate   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -8265,7 +8398,7 @@ static struct lm_ggml_tensor * llm_build_kqv(
     struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);
 
-    if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
+    if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON) {
         // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
         // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
         lm_ggml_mul_mat_set_prec(kq, LM_GGML_PREC_F32);
@@ -13766,6 +13899,254 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct lm_ggml_cgraph * build_nemotron() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        //LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = lm_ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm,
+                    model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                    NULL,                      NULL,                        NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct lm_ggml_cgraph * build_exaone() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = lm_ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -14021,6 +14402,14 @@ static struct lm_ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_jais();
             } break;
+        case LLM_ARCH_NEMOTRON:
+            {
+                result = llm.build_nemotron();
+            } break;
+        case LLM_ARCH_EXAONE:
+            {
+                result = llm.build_exaone();
+            } break;
         default:
             LM_GGML_ABORT("fatal error");
     }
@@ -17091,6 +17480,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_NEMOTRON:
+        case LLM_ARCH_EXAONE:
             return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
@@ -19021,6 +19412,22 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "Assistant:";
         }
+    } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
+        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
+        // EXAONE-3.0-7.8B-Instruct
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
+            } else if (role == "user") {
+                ss << "[|user|]" << trim(message->content) << "\n";
+            } else if (role == "assistant") {
+                ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
+            }
+        }
+        if (add_ass) {
+            ss << "[|assistant|]";
+        }
     } else {
         // template not supported
         return -1;
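The chat-template branch above makes the EXAONE format reachable through the public `llama_chat_apply_template()` API declared in `llama.h`. A minimal sketch of calling it with the `exaone3` template name follows; the messages and buffer size are made up, and a null model handle is acceptable because the template name is passed explicitly:

```cpp
// Minimal sketch (not shipped with the package) of formatting a conversation with
// the newly supported "exaone3" chat template.
#include "llama.h"
#include <cstdio>
#include <vector>

int main() {
    const llama_chat_message chat[] = {
        { "system",    "You are a helpful assistant." },
        { "user",      "Hello!"                       },
        { "assistant", "Hi, how can I help?"          },
        { "user",      "Summarize this diff."         },
    };
    const size_t n_msg = sizeof(chat) / sizeof(chat[0]);

    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template(
        /*model   =*/ nullptr,
        /*tmpl    =*/ "exaone3",
        chat, n_msg,
        /*add_ass =*/ true,
        buf.data(), (int32_t) buf.size());
    if (n < 0 || n > (int32_t) buf.size()) {
        fprintf(stderr, "template not supported or buffer too small\n");
        return 1;
    }
    // Expected shape, per the new branch in llama.cpp:
    // [|system|]...[|endofturn|]\n[|user|]...\n[|assistant|]...[|endofturn|]\n[|user|]...\n[|assistant|]
    printf("%.*s\n", n, buf.data());
    return 0;
}
```

On a version without this branch, the same call would fall through to the unsupported-template path and return -1.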
package/cpp/llama.h
CHANGED