cui-llama.rn 1.1.6 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/java/com/rnllama/LlamaContext.java +11 -3
- package/android/src/main/jni.cpp +28 -4
- package/cpp/common.cpp +3 -0
- package/cpp/common.h +2 -0
- package/cpp/ggml-aarch64.c +1794 -1368
- package/cpp/ggml-alloc.c +6 -0
- package/cpp/ggml-backend-impl.h +10 -9
- package/cpp/ggml-backend.c +25 -0
- package/cpp/ggml-backend.h +2 -1
- package/cpp/ggml-cpu-impl.h +614 -0
- package/cpp/ggml-impl.h +13 -609
- package/cpp/ggml-metal.m +1 -0
- package/cpp/ggml-quants.c +1 -0
- package/cpp/ggml.c +457 -144
- package/cpp/ggml.h +37 -8
- package/cpp/llama-impl.h +2 -0
- package/cpp/llama-sampling.cpp +7 -5
- package/cpp/llama-vocab.cpp +1 -5
- package/cpp/llama-vocab.h +9 -5
- package/cpp/llama.cpp +202 -30
- package/cpp/llama.h +2 -0
- package/cpp/log.cpp +1 -1
- package/cpp/log.h +2 -0
- package/cpp/sampling.cpp +9 -1
- package/cpp/sgemm.cpp +1 -0
- package/cpp/unicode.cpp +1 -0
- package/lib/commonjs/index.js +8 -1
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/index.js +8 -1
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/index.d.ts +1 -1
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/index.ts +18 -4
package/cpp/ggml.h
CHANGED
@@ -218,6 +218,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 
 #define LM_GGML_FILE_MAGIC 0x67676d6c // "ggml"
 #define LM_GGML_FILE_VERSION 2
@@ -534,6 +535,7 @@ extern "C" {
 
         LM_GGML_OP_CROSS_ENTROPY_LOSS,
         LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+        LM_GGML_OP_OPT_STEP_ADAMW,
 
         LM_GGML_OP_COUNT,
     };
@@ -569,12 +571,15 @@ extern "C" {
         LM_GGML_LOG_LEVEL_WARN  = 2,
         LM_GGML_LOG_LEVEL_ERROR = 3,
         LM_GGML_LOG_LEVEL_DEBUG = 4,
+        LM_GGML_LOG_LEVEL_CONT  = 5, // continue previous log
     };
 
+    // this tensor...
     enum lm_ggml_tensor_flag {
-        LM_GGML_TENSOR_FLAG_INPUT
-        LM_GGML_TENSOR_FLAG_OUTPUT
-        LM_GGML_TENSOR_FLAG_PARAM
+        LM_GGML_TENSOR_FLAG_INPUT  = 1, // ...is an input for the GGML compute graph
+        LM_GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
+        LM_GGML_TENSOR_FLAG_PARAM  = 4, // ...contains trainable parameters
+        LM_GGML_TENSOR_FLAG_LOSS   = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
     };
 
     // n-dimensional tensor
@@ -1976,6 +1981,9 @@ extern "C" {
     typedef void (*lm_ggml_custom2_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, int ith, int nth, void * userdata);
     typedef void (*lm_ggml_custom3_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, const struct lm_ggml_tensor * c, int ith, int nth, void * userdata);
 
+    #define LM_GGML_N_TASKS_MAX (-1)
+    // n_tasks == LM_GGML_N_TASKS_MAX means to use max number of tasks
+
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom1(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
@@ -2037,23 +2045,44 @@ extern "C" {
             struct lm_ggml_tensor  * b,
             struct lm_ggml_tensor  * c);
 
+    // AdamW optimizer step
+    // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
+    // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_opt_step_adamw(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            float                    alpha,
+            float                    beta1,
+            float                    beta2,
+            float                    eps,
+            float                    wd); // weight decay
+
     //
     // automatic differentiation
     //
 
-    LM_GGML_API void lm_ggml_set_param(
-
-            struct lm_ggml_tensor * tensor);
+    LM_GGML_API void lm_ggml_set_param(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor);
+    LM_GGML_API void lm_ggml_set_loss(struct lm_ggml_tensor * tensor);
 
     LM_GGML_API void lm_ggml_build_forward_expand (struct lm_ggml_cgraph * cgraph, struct lm_ggml_tensor * tensor);
-    LM_GGML_API void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, struct lm_ggml_cgraph * gb, bool keep);
+    LM_GGML_API void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, struct lm_ggml_cgraph * gb, bool accumulate, bool keep);
+
+    LM_GGML_API void lm_ggml_build_opt_adamw(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_cgraph  * gf,
+            struct lm_ggml_cgraph  * gb,
+            float                    alpha,
+            float                    beta1,
+            float                    beta2,
+            float                    eps,
+            float                    wd); // weight decay
 
     // graph allocation in a context
     LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph        (struct lm_ggml_context * ctx); // size = LM_GGML_DEFAULT_GRAPH_SIZE, grads = false
     LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph_custom (struct lm_ggml_context * ctx, size_t size, bool grads);
     LM_GGML_API struct lm_ggml_cgraph * lm_ggml_graph_dup        (struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph);
     LM_GGML_API void                    lm_ggml_graph_cpy        (struct lm_ggml_cgraph * src, struct lm_ggml_cgraph * dst);
-    LM_GGML_API void                    lm_ggml_graph_reset      (struct lm_ggml_cgraph * cgraph);
+    LM_GGML_API void                    lm_ggml_graph_reset      (struct lm_ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
    LM_GGML_API void                    lm_ggml_graph_clear      (struct lm_ggml_cgraph * cgraph);
 
     LM_GGML_API int                     lm_ggml_graph_size       (struct lm_ggml_cgraph * cgraph);
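Taken together, these ggml.h additions sketch a training-capable graph API: tensors can be flagged as parameters or as the loss, the backward graph can accumulate gradients, and AdamW update steps can be appended to it. The following is a minimal, hypothetical sketch (not code from this package) of how the new entry points could be wired together, using only the declarations shown above; the loss construction and hyper-parameter values are illustrative.

    #include "ggml.h"

    // Hypothetical wiring of the new training entry points (lm_ggml_* prefixed API).
    // `ctx` is an initialized lm_ggml_context; `w`, `x`, `labels` are existing tensors.
    static void build_adamw_training_graphs(struct lm_ggml_context * ctx,
                                            struct lm_ggml_tensor  * w,
                                            struct lm_ggml_tensor  * x,
                                            struct lm_ggml_tensor  * labels) {
        struct lm_ggml_tensor * logits = lm_ggml_mul_mat(ctx, w, x);
        struct lm_ggml_tensor * loss   = lm_ggml_cross_entropy_loss(ctx, logits, labels);

        lm_ggml_set_param(ctx, w); // LM_GGML_TENSOR_FLAG_PARAM: trainable
        lm_ggml_set_loss(loss);    // LM_GGML_TENSOR_FLAG_LOSS:  optimization target

        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx, LM_GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
        lm_ggml_build_forward_expand(gf, loss);

        struct lm_ggml_cgraph * gb = lm_ggml_graph_dup(ctx, gf);
        lm_ggml_build_backward_expand(ctx, gf, gb, /*accumulate =*/ false, /*keep =*/ false);

        // append one LM_GGML_OP_OPT_STEP_ADAMW update per parameter tensor
        lm_ggml_build_opt_adamw(ctx, gf, gb, /*alpha =*/ 1e-3f, /*beta1 =*/ 0.9f,
                                /*beta2 =*/ 0.999f, /*eps =*/ 1e-8f, /*wd =*/ 0.0f);

        lm_ggml_graph_reset(gb); // zero grads and AdamW momenta, set the loss gradient to 1
        // ... evaluate gb with a backend once per batch ...
    }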
package/cpp/llama-impl.h
CHANGED
@@ -28,6 +28,8 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
 #define LLAMA_LOG_INFO(...)  llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_DEBUG(...) llama_log_internal(LM_GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define LLAMA_LOG_CONT(...)  llama_log_internal(LM_GGML_LOG_LEVEL_CONT , __VA_ARGS__)
 
 //
 // helpers
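The two new macros fill out the level set used elsewhere in this release; the progress-dot change in llama.cpp below relies on LLAMA_LOG_CONT. A small illustrative fragment, assuming it runs inside llama.cpp where these internal macros are visible and where `n_tensors` is a placeholder count:

    // LLAMA_LOG_CONT continues the previous log line instead of starting a new record.
    LLAMA_LOG_DEBUG("%s: loading %d tensors ", __func__, n_tensors);
    for (int i = 0; i < n_tensors; ++i) {
        // ... load tensor i ...
        LLAMA_LOG_CONT(".");
    }
    LLAMA_LOG_CONT("\n");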
package/cpp/llama-sampling.cpp
CHANGED
@@ -3,13 +3,14 @@
 #include "llama-vocab.h"
 #include "llama-grammar.h"
 
-#include <cassert>
 #include <algorithm>
-#include <
-#include <ctime>
+#include <cassert>
 #include <cfloat>
 #include <chrono>
 #include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
 #include <numeric>
 #include <random>
 #include <unordered_map>
@@ -236,9 +237,10 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte
     const int n_vocab = llama_n_vocab(llama_get_model(ctx));
 
     // TODO: do not allocate each time
-    std::vector<llama_token_data> cur
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur
+        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
     }
 
     llama_token_data_array cur_p = {
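The rewritten loop preallocates the candidate buffer and constructs each entry in place. As a standalone illustration of the same pattern, a sketch that builds a candidate array from raw logits (field order of llama_token_data_array as declared in llama.h; the caller supplies the vector, logits and vocab size):

    #include <vector>
    #include "llama.h"

    // Sketch: build a llama_token_data_array over all logits without repeated reallocation.
    static llama_token_data_array make_candidates(std::vector<llama_token_data> & cur,
                                                  const float * logits, int n_vocab) {
        cur.clear();
        cur.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }
        return { cur.data(), cur.size(), /*selected =*/ -1, /*sorted =*/ false };
    }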
package/cpp/llama-vocab.cpp
CHANGED
@@ -1570,11 +1570,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 }
 
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
-    return token != -1 && (
-        token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab) ||
-        token == llama_token_eom_impl(vocab)
-    );
+    return token != -1 && vocab.special_eog_ids.count(token) > 0;
 }
 
 bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
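The user-visible effect is that the public llama_token_is_eog() now recognizes any token collected in special_eog_ids, not only EOS/EOT/EOM. A hedged sketch of the typical stop check in a generation loop (model, context and sampler are assumed to be set up elsewhere):

    #include "llama.h"

    // Returns true when the next sampled token should end generation.
    static bool sampled_eog(const llama_model * model, llama_context * ctx, llama_sampler * smpl) {
        const llama_token tok = llama_sampler_sample(smpl, ctx, /*idx =*/ -1);
        // true for EOS, EOT, EOM and any other id gathered into special_eog_ids
        return llama_token_is_eog(model, tok);
    }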
package/cpp/llama-vocab.h
CHANGED
@@ -6,6 +6,7 @@
 #include <vector>
 #include <unordered_map>
 #include <map>
+#include <set>
 
 struct llama_vocab {
     using id = llama_token;
@@ -49,12 +50,15 @@ struct llama_vocab {
     id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
     id special_eom_id = -1;
 
+    // set of all tokens that cause "end of generation"
+    std::set<id> special_eog_ids;
+
     // tokenizer flags
-    bool tokenizer_add_space_prefix
-    bool tokenizer_add_bos
-    bool tokenizer_add_eos
-    bool tokenizer_ignore_merges
-    bool tokenizer_clean_spaces
+    bool tokenizer_add_space_prefix         = false;
+    bool tokenizer_add_bos                  = false;
+    bool tokenizer_add_eos                  = false;
+    bool tokenizer_ignore_merges            = false;
+    bool tokenizer_clean_spaces             = false; // clean_up_tokenization_spaces
     bool tokenizer_remove_extra_whitespaces = false;
     bool tokenizer_escape_whitespaces       = true;
     bool tokenizer_treat_whitespace_as_suffix = false;
package/cpp/llama.cpp
CHANGED
@@ -225,6 +225,8 @@ enum llm_arch {
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
+    LLM_ARCH_GRANITE,
+    LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -275,6 +277,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_NEMOTRON, "nemotron" },
     { LLM_ARCH_EXAONE, "exaone" },
     { LLM_ARCH_RWKV6, "rwkv6" },
+    { LLM_ARCH_GRANITE, "granite" },
+    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -314,6 +318,8 @@ enum llm_kv {
     LLM_KV_RESCALE_EVERY_N_LAYERS,
     LLM_KV_TIME_MIX_EXTRA_DIM,
     LLM_KV_TIME_DECAY_EXTRA_DIM,
+    LLM_KV_RESIDUAL_SCALE,
+    LLM_KV_EMBEDDING_SCALE,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -328,6 +334,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SCALE,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -418,6 +425,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
     { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
     { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
+    { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
+    { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -432,6 +441,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
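The key names are printf-style templates that get the architecture name substituted in, so for the new Granite entries the loader ends up looking for the GGUF metadata keys shown in this illustrative (hypothetical) helper, derived from the format strings above:

    #include <cstdio>

    // Resolved GGUF keys when `arch` is "granite" or "granitemoe".
    static void print_granite_scale_keys(const char * arch) {
        printf("%s.residual_scale\n",  arch); // LLM_KV_RESIDUAL_SCALE
        printf("%s.embedding_scale\n", arch); // LLM_KV_EMBEDDING_SCALE
        printf("%s.attention.scale\n", arch); // LLM_KV_ATTENTION_SCALE
    }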
@@ -1465,6 +1475,41 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2383,6 +2428,11 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;
 
+    // Additional scale factors (Granite/Granite MoE)
+    float f_residual_scale  = 0.0f;
+    float f_embedding_scale = 0.0f;
+    float f_attention_scale = 0.0f;
+
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
@@ -2445,6 +2495,9 @@ struct llama_hparams {
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
         if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
         if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
+        if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;
 
         return false;
     }
@@ -3035,18 +3088,14 @@ struct llama_sbatch {
         } else {
             // simple split
             if (batch->n_seq_id) {
-
-                ubatch.n_seq_id = batch->n_seq_id + seq.offset;
-                }
+                ubatch.n_seq_id = batch->n_seq_id + seq.offset;
             } else {
                 for (size_t i = 0; i < length; ++i) {
                     ubatch.n_seq_id[ubatch.n_seqs + i] = 1;
                 }
             }
             if (batch->seq_id) {
-
-                ubatch.seq_id = batch->seq_id + seq.offset;
-                }
+                ubatch.seq_id = batch->seq_id + seq.offset;
             } else {
                 for (size_t i = 0; i < length; ++i) {
                     ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id;
@@ -6030,6 +6079,22 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_3B; break;
+                    // Add additional layer/vocab/etc checks here for other model sizes
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -6072,8 +6137,15 @@ static void llm_load_vocab(
         vocab.special_mask_id = -1;
         vocab.linefeed_id = -1;
 
+        // read vocab size from metadata
+        if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
+            vocab.n_vocab = 0;
+            LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab);
+        }
         return;
-    }
+    }
+
+    if (tokenizer_model == "llama") {
         vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
         // default special tokens
@@ -6471,21 +6543,21 @@ static void llm_load_vocab(
         // for now, we apply this workaround to find the EOT token based on its text
         if (vocab.special_eot_id == -1) {
             for (const auto & t : vocab.token_to_id) {
-                if (
+                if (false
                         // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
                         // need to fix convert script
                         //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
-
-
-
-
-
-
+                        || t.first == "<|eot_id|>"
+                        || t.first == "<|im_end|>"
+                        || t.first == "<|end|>"
+                        || t.first == "<end_of_turn>"
+                        || t.first == "<|endoftext|>"
+                        || t.first == "<EOT>"
                    ) {
                     vocab.special_eot_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-
+                            __func__, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                     break;
@@ -6508,6 +6580,44 @@ static void llm_load_vocab(
                 }
             }
         }
+
+        // maintain a list of tokens that cause "end of generation"
+        // this is currently determined based on the token text, which is obviously not ideal
+        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+        vocab.special_eog_ids.clear();
+        for (const auto & t : vocab.token_to_id) {
+            if (false
+                    || t.first == "<|eot_id|>"
+                    || t.first == "<|im_end|>"
+                    || t.first == "<|end|>"
+                    || t.first == "<end_of_turn>"
+                    || t.first == "<|endoftext|>"
+                    || t.first == "<|eom_id|>"
+                    || t.first == "<EOT>"
+               ) {
+                vocab.special_eog_ids.insert(t.second);
+                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                        __func__, t.first.c_str());
+                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                }
+            }
+        }
+
+        if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eos_id);
+            LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eot_id);
+            LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eom_id);
+            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
     }
 
     // build special tokens cache
@@ -6711,6 +6821,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
     if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
     if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+    if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+
+    for (const auto & id : vocab.special_eog_ids) {
+        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
+    }
 
     LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
 
@@ -6728,6 +6843,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+
+    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+    }
 }
 
 // Returns false if cancelled by progress_callback
@@ -6896,6 +7017,8 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
@@ -8879,6 +9002,11 @@ static struct lm_ggml_tensor * llm_build_inp_embd(
         lm_ggml_set_input(lctx.inp_embd);
     }
 
+    // For Granite architecture
+    if (hparams.f_embedding_scale != 0.0f) {
+        inpL = lm_ggml_scale(ctx, inpL, hparams.f_embedding_scale);
+    }
+
     cb(inpL, "inp_embd", -1);
 
     return inpL;
@@ -9880,17 +10008,36 @@ struct llm_build_context {
             const int64_t n_head_kv = hparams.n_head_kv(il);
             const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
             struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
-            struct lm_ggml_tensor *
+            struct lm_ggml_tensor * k =
+                lm_ggml_view_3d(ctx0, kv_self.k_l[il],
+                    n_embd_head_k, n_head_kv, n_ctx,
+                    lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+                    lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                    0);
+
+            struct lm_ggml_tensor * tmp;
+            if (lm_ggml_is_quantized(k->type)) {
+                // dequantize to f32 -> RoPE -> quantize back
+                tmp = lm_ggml_cast(ctx0, k, LM_GGML_TYPE_F32);
+                cb(tmp, "K_f32", il);
+                for (auto * backend : lctx.backends) {
+                    // Figure out which backend KV cache belongs to
+                    if (lm_ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft)) {
+                        lm_ggml_backend_sched_set_tensor_backend(lctx.sched, tmp, backend);
+                        break;
+                    }
+                }
+                tmp = lm_ggml_rope_ext_inplace(ctx0, tmp,
+                    lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(tmp, "K_shifted_f32", il);
+                tmp = lm_ggml_cpy(ctx0, tmp, k);
+            } else {
                 // we rotate only the first n_rot dimensions
-                lm_ggml_rope_ext_inplace(ctx0,
-                    lm_ggml_view_3d(ctx0, kv_self.k_l[il],
-                        n_embd_head_k, n_head_kv, n_ctx,
-                        lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
-                        lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        0),
+                tmp = lm_ggml_rope_ext_inplace(ctx0, k,
                     lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
-
+            }
             cb(tmp, "K_shifted", il);
             lm_ggml_build_forward_expand(gf, tmp);
         }
@@ -10157,6 +10304,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct lm_ggml_tensor * inpSA = inpL;
 
@@ -10209,7 +10357,7 @@
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10220,6 +10368,11 @@
                 inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -10256,6 +10409,11 @@
                 cb(cur, "ffn_moe_out", il);
             }
 
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             cur = lm_ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -10275,6 +10433,12 @@
 
         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);
 
         lm_ggml_build_forward_expand(gf, cur);
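Since Granite and Granite MoE reuse the llama graph builder (see the llama_build_graph hunk below), the architecture-specific behaviour reduces to the four scale factors introduced above. A compilable summary of where each one enters, under the assumption (matching the checks in the diff) that a zero value means the scale is disabled:

    #include <cmath>

    // Summary of the Granite scale factors added in this release (names follow llama_hparams).
    struct granite_scales {
        float f_embedding_scale = 0.0f; // multiplies the token embeddings
        float f_attention_scale = 0.0f; // replaces 1/sqrt(n_embd_head) as the KQ scale when non-zero
        float f_residual_scale  = 0.0f; // multiplies each attention/FFN branch before the residual add
        float f_logit_scale     = 0.0f; // final logits are divided by this when non-zero
    };

    static float kq_scale(const granite_scales & s, int n_embd_head) {
        return s.f_attention_scale == 0.0f ? 1.0f / std::sqrt((float) n_embd_head)
                                           : s.f_attention_scale;
    }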
@@ -15800,6 +15964,8 @@ static struct lm_ggml_cgraph * llama_build_graph(
 
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;
@@ -16588,7 +16754,7 @@ static int llama_decode_internal(
     const uint32_t n_tokens_all = batch_all.n_tokens;
 
     if (n_tokens_all == 0) {
-        LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
+        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }
 
@@ -16601,7 +16767,7 @@ static int llama_decode_internal(
     if (batch_all.token) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]);
                 return -1;
             }
         }
@@ -16889,7 +17055,7 @@ static int llama_encode_internal(
     const uint32_t n_tokens = batch.n_tokens;
 
     if (n_tokens == 0) {
-        LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
+        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }
 
@@ -16902,7 +17068,7 @@ static int llama_encode_internal(
     if (batch.token) {
         for (uint32_t i = 0; i < n_tokens; ++i) {
             if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
                 return -1;
             }
         }
@@ -18584,9 +18750,9 @@ struct llama_model * llama_load_model_from_file(
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
                 *cur_percentage_p = percentage;
-
+                LLAMA_LOG_CONT(".");
                 if (percentage >= 100) {
-
+                    LLAMA_LOG_CONT("\n");
                 }
             }
             return true;
@@ -19058,6 +19224,10 @@ int32_t llama_n_layer(const struct llama_model * model) {
     return model->hparams.n_layer;
 }
 
+int32_t llama_n_head(const struct llama_model * model) {
+    return model->hparams.n_head();
+}
+
 const struct llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }
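The new getter rounds out the model introspection API next to llama_n_embd() and llama_n_layer(); a small usage sketch against the public header:

    #include <cstdio>
    #include "llama.h"

    static void print_model_dims(const llama_model * model) {
        printf("n_embd  = %d\n", llama_n_embd(model));
        printf("n_layer = %d\n", llama_n_layer(model));
        printf("n_head  = %d\n", llama_n_head(model)); // added in this release
    }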
@@ -19096,6 +19266,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
package/cpp/llama.h
CHANGED
@@ -442,6 +442,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
     LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_head     (const struct llama_model * model);
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
@@ -1066,6 +1067,7 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
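The added note warns against running the softmax-based samplers over the full vocabulary. A hedged sketch of the recommended ordering with the sampler-chain API from the same header, truncating with top-k before the final distribution sampler (the value 40 is an arbitrary illustration):

    #include "llama.h"

    // Build a sampler chain that never sorts the full vocabulary.
    static llama_sampler * make_sampler(void) {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));                // prune first
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); // then sample
        return chain;
    }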
package/cpp/log.cpp
CHANGED