cui-llama.rn 1.0.9 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/common.cpp +34 -8
- package/cpp/common.h +0 -4
- package/cpp/ggml-backend.c +5 -8
- package/cpp/ggml-metal.m +2 -2
- package/cpp/ggml.c +3 -3
- package/cpp/ggml.h +4 -2
- package/cpp/grammar-parser.cpp +3 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +5 -2
- package/cpp/llama-vocab.h +2 -2
- package/cpp/llama.cpp +442 -16
- package/cpp/llama.h +7 -10
- package/cpp/rn-llama.hpp +20 -6
- package/package.json +1 -1
package/cpp/common.cpp
CHANGED
@@ -116,8 +116,34 @@ int32_t cpu_get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    //TODO:
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
+    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+    DWORD buffer_size = 0;
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+            return default_threads;
+        }
+    }
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+        return default_threads;
+    }
+
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    while (buffer_size > 0) {
+        if (info->Relationship == RelationProcessorCore) {
+            num_physical_cores += info->Processor.GroupCount;
+        }
+        buffer_size -= info->Size;
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+    }
+
+    return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@@ -1733,7 +1759,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     if (params.n_threads_batch != -1) {
         os << " (n_threads_batch = " << params.n_threads_batch << ")";
     }
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+#else
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+#endif

     return os.str();
 }
@@ -2709,12 +2741,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
     return text;
 }

-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // Chat template utils
 //
package/cpp/common.h
CHANGED
@@ -392,10 +392,6 @@ std::string llama_detokenize(
        const std::vector<llama_token> & tokens,
                          bool special = true);

-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //
package/cpp/ggml-backend.c
CHANGED
@@ -1018,10 +1018,6 @@ static bool lm_ggml_is_view_op(enum lm_ggml_op op) {
 #define LM_GGML_SCHED_MAX_BACKENDS 16
 #endif

-#ifndef LM_GGML_SCHED_MAX_SPLITS
-#define LM_GGML_SCHED_MAX_SPLITS 2048
-#endif
-
 #ifndef LM_GGML_SCHED_MAX_SPLIT_INPUTS
 #define LM_GGML_SCHED_MAX_SPLIT_INPUTS LM_GGML_MAX_SRC
 #endif
@@ -1125,7 +1121,8 @@ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sch
 }

 #if 0
-
+#define LM_GGML_SCHED_MAX_SPLITS_DEBUG 4096
+static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBUG*LM_GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1549,7 +1546,6 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
                 sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
                 LM_GGML_ASSERT(sched->splits != NULL);
             }
-            LM_GGML_ASSERT(i_split < LM_GGML_SCHED_MAX_SPLITS);
             split = &sched->splits[i_split];
             split->backend_id = node_backend_id;
             split->i_start = i;
@@ -1865,13 +1861,14 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
     sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
     sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));

-    const size_t
+    const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+    const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
     sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
     sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

-    sched->context_buffer_size =
+    sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
     sched->context_buffer = malloc(sched->context_buffer_size);

     const int initial_splits_capacity = 16;
package/cpp/ggml-metal.m
CHANGED
@@ -310,7 +310,7 @@ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

     // Configure context
-    struct lm_ggml_backend_metal_context * ctx =
+    struct lm_ggml_backend_metal_context * ctx = calloc(1, sizeof(struct lm_ggml_backend_metal_context));
     ctx->device = device;
     ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
     ctx->queue = [ctx->device newCommandQueue];
@@ -2313,7 +2313,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
                 memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
                 memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

-                const bool is_neox = mode &
+                const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;

                 id<MTLComputePipelineState> pipeline = nil;

package/cpp/ggml.c
CHANGED
@@ -14094,7 +14094,7 @@ static void lm_ggml_compute_forward_rope_f32(
     float corr_dims[2];
     lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

-    const bool is_neox = mode &
+    const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;

     const float * freq_factors = NULL;
     if (src2 != NULL) {
@@ -14219,7 +14219,7 @@ static void lm_ggml_compute_forward_rope_f16(
     float corr_dims[2];
     lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

-    const bool is_neox = mode &
+    const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;

     const float * freq_factors = NULL;
     if (src2 != NULL) {
@@ -21129,7 +21129,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
                 (int64_t) info->ne[2] *
                 (int64_t) info->ne[3];

-            if (ne % lm_ggml_blck_size(info->type) != 0) {
+            if (lm_ggml_blck_size(info->type) == 0 || ne % lm_ggml_blck_size(info->type) != 0) {
                 fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                         __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type), ne, lm_ggml_blck_size(info->type));
                 fclose(file);
package/cpp/ggml.h
CHANGED
@@ -244,6 +244,8 @@
 #define LM_GGML_EXIT_SUCCESS 0
 #define LM_GGML_EXIT_ABORTED 1

+#define LM_GGML_ROPE_TYPE_NEOX 2
+
 #define LM_GGUF_MAGIC "GGUF"

 #define LM_GGUF_VERSION 3
@@ -1453,8 +1455,8 @@ extern "C" {
             struct lm_ggml_tensor * b);

     // rotary position embedding
-    // if mode & 1
-    // if mode &
+    // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+    // if (mode & LM_GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope(
package/cpp/grammar-parser.cpp
CHANGED
@@ -369,6 +369,9 @@ namespace grammar_parser {
         }
         // Validate the state to ensure that all rules are defined
         for (const auto & rule : state.rules) {
+            if (rule.empty()) {
+                throw std::runtime_error("Undefined rule");
+            }
             for (const auto & elem : rule) {
                 if (elem.type == LLAMA_GRETYPE_RULE_REF) {
                     // Ensure that the rule at that location exists
package/cpp/llama-sampling.cpp
CHANGED
@@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra
         constexpr float bucket_low = -10.0f;
         constexpr float bucket_high = 10.0f;
         constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
-        constexpr float
+        constexpr float bucket_inter = -bucket_low * bucket_scale;

         std::vector<int> bucket_idx(candidates->size);
         std::vector<int> histo(nbuckets, 0);

         for (int i = 0; i < (int)candidates->size; ++i) {
             const float val = candidates->data[i].logit;
-            int ib = int(bucket_scale * val +
+            int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
             ib = std::max(0, std::min(nbuckets-1, ib));
             bucket_idx[i] = ib;
             ++histo[ib];
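For context, the `bucket_inter` constant in the hunk above just precomputes the offset of the linear bucket mapping, so `bucket_scale * val + bucket_inter` equals `nbuckets * (val - bucket_low) / (bucket_high - bucket_low)` from the inline comment. A minimal standalone sketch of that mapping (not part of the package; the `nbuckets` value here is assumed for illustration):

    #include <algorithm>
    #include <cstdio>

    int main() {
        // bucket count assumed for illustration; the remaining constants mirror llama-sampling.cpp
        constexpr int   nbuckets     = 128;
        constexpr float bucket_low   = -10.0f;
        constexpr float bucket_high  =  10.0f;
        constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
        constexpr float bucket_inter = -bucket_low * bucket_scale;

        const float val = 3.7f;                            // example logit
        int ib = int(bucket_scale * val + bucket_inter);   // fused form used in the sampler
        ib = std::max(0, std::min(nbuckets - 1, ib));      // clamp to a valid bucket
        std::printf("logit %.2f -> bucket %d (unfused form: %d)\n", val, ib,
                    int(nbuckets * (val - bucket_low) / (bucket_high - bucket_low)));
        return 0;
    }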
package/cpp/llama-vocab.cpp
CHANGED
@@ -388,6 +388,7 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -410,6 +411,8 @@ struct llm_tokenizer_bpe {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_PORO:
+            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
                 regex_exprs = {
                     " ?[^(\\s|.,!?…。,、।۔،)]+",
                 };
@@ -1466,11 +1469,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
     return vocab.special_pad_id;
 }

-
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
     return vocab.tokenizer_add_bos;
 }

-
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
     return vocab.tokenizer_add_eos;
 }

package/cpp/llama-vocab.h
CHANGED
@@ -95,8 +95,8 @@ llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
 llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
 llama_token llama_token_pad_impl(const struct llama_vocab & vocab);

-
-
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab);

 llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
package/cpp/llama.cpp
CHANGED
@@ -221,6 +221,8 @@ enum llm_arch {
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
+    LLM_ARCH_NEMOTRON,
+    LLM_ARCH_EXAONE,
     LLM_ARCH_UNKNOWN,
 };

@@ -266,6 +268,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
     { LLM_ARCH_JAIS, "jais" },
+    { LLM_ARCH_NEMOTRON, "nemotron" },
+    { LLM_ARCH_EXAONE, "exaone" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -1307,6 +1311,43 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_NEMOTRON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_EXAONE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -3586,13 +3627,8 @@ namespace GGUFMeta {

 using llama_buf_map = std::unordered_map<uint32_t, lm_ggml_backend_buffer_t>;

-
-
-    //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
-    // return 32768;
-    //}
-
-    return 8192;
+static size_t llama_model_max_nodes(const llama_model & model) {
+    return std::max<size_t>(8192, model.tensors_by_name.size()*5);
 }

 struct llama_model_loader {
@@ -4912,7 +4948,6 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PHI3:
             {
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                 switch (hparams.n_layer) {
@@ -4921,6 +4956,22 @@ static void llm_load_hparams(
                     case 40: model.type = e_model::MODEL_14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+
+                // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
+                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
+                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+                    hparams.n_swa = 2047;
+                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-mini-128k-instruct
+                    hparams.n_swa = 262144;
+                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-medium-128k-instruct
+                    hparams.n_swa = 131072;
+                }
+                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (!found_swa && hparams.n_swa == 0) {
+                    throw std::runtime_error("invalid value for sliding_window");
+                }
             } break;
         case LLM_ARCH_PLAMO:
             {
@@ -5236,6 +5287,23 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_NEMOTRON:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_4B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_EXAONE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -5468,6 +5536,15 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "codeshell") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+            } else if (
+                    tokenizer_pre == "bloom") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+            } else if (
+                    tokenizer_pre == "gpt3-finnish") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
+            } else if (
+                    tokenizer_pre == "exaone") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -7563,6 +7640,78 @@ static bool llm_load_tensors(
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                     }
                 } break;
+            case LLM_ARCH_NEMOTRON:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                        lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        // optional bias tensors
+                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+                        // optional MLP bias
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    }
+                } break;
+            case LLM_ARCH_EXAONE:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                        lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -8249,7 +8398,7 @@ static struct lm_ggml_tensor * llm_build_kqv(
     struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);

-    if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
+    if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON) {
         // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
         // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
         lm_ggml_mul_mat_set_prec(kq, LM_GGML_PREC_F32);
@@ -13750,6 +13899,254 @@ struct llm_build_context {

         return gf;
     }
+
+    struct lm_ggml_cgraph * build_nemotron() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        //LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm,
+                    model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    NULL, NULL, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct lm_ggml_cgraph * build_exaone() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };

 static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -14005,6 +14402,14 @@ static struct lm_ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_jais();
            } break;
+        case LLM_ARCH_NEMOTRON:
+            {
+                result = llm.build_nemotron();
+            } break;
+        case LLM_ARCH_EXAONE:
+            {
+                result = llm.build_exaone();
+            } break;
         default:
             LM_GGML_ABORT("fatal error");
     }
@@ -14718,12 +15123,15 @@ static int llama_decode_internal(
             res = nullptr;
             embd = nullptr;
         } else if (cparams.embeddings) {
-            res
-            embd =
-
-
+            res = nullptr; // do not extract logits for embedding case
+            embd = nullptr;
+            for (int i = gf->n_nodes - 1; i >= 0; --i) {
+                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
+                    embd = gf->nodes[i];
+                    break;
+                }
             }
-            LM_GGML_ASSERT(
+            LM_GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
         } else {
             embd = nullptr; // do not extract embeddings when not needed
             LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -17072,6 +17480,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_NEMOTRON:
+        case LLM_ARCH_EXAONE:
             return LLAMA_ROPE_TYPE_NEOX;

         // all model arches should be listed explicitly here
@@ -18697,11 +19107,11 @@ llama_token llama_token_pad(const struct llama_model * model) {
     return llama_token_pad_impl(model->vocab);
 }

-
+bool llama_add_bos_token(const struct llama_model * model) {
     return llama_add_bos_token_impl(model->vocab);
 }

-
+bool llama_add_eos_token(const struct llama_model * model) {
     return llama_add_eos_token_impl(model->vocab);
 }

@@ -19002,6 +19412,22 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "Assistant:";
         }
+    } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
+        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
+        // EXAONE-3.0-7.8B-Instruct
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
+            } else if (role == "user") {
+                ss << "[|user|]" << trim(message->content) << "\n";
+            } else if (role == "assistant") {
+                ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
+            }
+        }
+        if (add_ass) {
+            ss << "[|assistant|]";
+        }
     } else {
         // template not supported
         return -1;
package/cpp/llama.h
CHANGED
@@ -93,15 +93,15 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
         LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
         LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
+        LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
+        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
+        LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
     };

-    // note: these values should be synchronized with lm_ggml_rope
-    // TODO: maybe move this enum to ggml.h (lm_ggml_rope_type)
     enum llama_rope_type {
         LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM =
-        LLAMA_ROPE_TYPE_NEOX =
-        LLAMA_ROPE_TYPE_GLM = 4,
+        LLAMA_ROPE_TYPE_NORM = 0,
+        LLAMA_ROPE_TYPE_NEOX = LM_GGML_ROPE_TYPE_NEOX,
     };

     enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -915,11 +915,8 @@ extern "C" {
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
     LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding

-
-    LLAMA_API
-
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);

     // Codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
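With this change, llama.h exposes llama_add_bos_token and llama_add_eos_token as plain bool queries of the vocab metadata; the old int32_t return (where -1 meant unknown) and the llama_should_add_bos_token() helper removed from common.h/common.cpp are gone. A hedged caller-side sketch of the new API (the wrapper function and its name are illustrative, not part of the package):

    #include <vector>
    #include "llama.h"

    // Illustrative helper (not from cui-llama.rn): prepend/append the special
    // tokens according to the model's vocab flags, now exposed as plain bools.
    static std::vector<llama_token> wrap_with_special_tokens(
            const llama_model * model, std::vector<llama_token> tokens) {
        if (llama_add_bos_token(model)) {        // previously int32_t, -1 = unknown
            tokens.insert(tokens.begin(), llama_token_bos(model));
        }
        if (llama_add_eos_token(model)) {
            tokens.push_back(llama_token_eos(model));
        }
        return tokens;
    }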
package/cpp/rn-llama.hpp
CHANGED
@@ -297,7 +297,9 @@ struct llama_rn_context
         }

         // do Context Shift , may be buggy! TODO: Verify functionality
-
+        if(!params.embedding){
+            purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+        }

         // push the prompt into the sampling context (do not apply grammar)
         for (auto & token : prompt_tokens)
@@ -305,7 +307,7 @@ struct llama_rn_context
             llama_sampling_accept(ctx_sampling, ctx, token, false);
         }
         // compare the evaluated prompt with the new prompt
-        n_past = common_part(embd, prompt_tokens);
+        n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
         LLAMA_LOG_INFO("%s: n_past: %zu", __func__, n_past);
         LLAMA_LOG_INFO("%s: embd size: %zu", __func__, embd.size());
         LLAMA_LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
@@ -342,9 +344,9 @@ struct llama_rn_context
         completion_token_output result;
         result.tok = -1;

+        // this truncation should never trigger with good context shifting
         if (embd.size() >= (size_t)params.n_ctx)
         {
-            // Shift context

             const int n_left = n_past - params.n_keep - 1;
             const int n_discard = n_left/2;
@@ -546,9 +548,21 @@ struct llama_rn_context
             LOG_WARNING("embedding disabled, embedding: %s", params.embedding);
             return std::vector<float>(n_embd, 0.0f);
         }
-
-
-
+        float *data;
+
+        if(params.pooling_type == 0){
+            data = llama_get_embeddings(ctx);
+        }
+        else {
+            data = llama_get_embeddings_seq(ctx, 0);
+        }
+
+        if(!data) {
+            return std::vector<float>(n_embd, 0.0f);
+        }
+        std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
+        llama_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
+        return out;
     }

     std::string bench(int pp, int tg, int pl, int nr)