cui-llama.rn 1.1.6 → 1.2.0

package/cpp/ggml.h CHANGED
@@ -218,6 +218,7 @@
  #include <stddef.h>
  #include <stdint.h>
  #include <stdio.h>
+ #include <string.h>

  #define LM_GGML_FILE_MAGIC 0x67676d6c // "ggml"
  #define LM_GGML_FILE_VERSION 2
@@ -534,6 +535,7 @@ extern "C" {

  LM_GGML_OP_CROSS_ENTROPY_LOSS,
  LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+ LM_GGML_OP_OPT_STEP_ADAMW,

  LM_GGML_OP_COUNT,
  };
@@ -569,12 +571,15 @@ extern "C" {
  LM_GGML_LOG_LEVEL_WARN = 2,
  LM_GGML_LOG_LEVEL_ERROR = 3,
  LM_GGML_LOG_LEVEL_DEBUG = 4,
+ LM_GGML_LOG_LEVEL_CONT = 5, // continue previous log
  };

+ // this tensor...
  enum lm_ggml_tensor_flag {
- LM_GGML_TENSOR_FLAG_INPUT = 1,
- LM_GGML_TENSOR_FLAG_OUTPUT = 2,
- LM_GGML_TENSOR_FLAG_PARAM = 4,
+ LM_GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
+ LM_GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
+ LM_GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
+ LM_GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
  };

  // n-dimensional tensor
@@ -1976,6 +1981,9 @@ extern "C" {
  typedef void (*lm_ggml_custom2_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, int ith, int nth, void * userdata);
  typedef void (*lm_ggml_custom3_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, const struct lm_ggml_tensor * c, int ith, int nth, void * userdata);

+ #define LM_GGML_N_TASKS_MAX (-1)
+ // n_tasks == LM_GGML_N_TASKS_MAX means to use max number of tasks
+
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom1(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
@@ -2037,23 +2045,44 @@ extern "C" {
  struct lm_ggml_tensor * b,
  struct lm_ggml_tensor * c);

+ // AdamW optimizer step
+ // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
+ // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_opt_step_adamw(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ float alpha,
+ float beta1,
+ float beta2,
+ float eps,
+ float wd); // weight decay
+
  //
  // automatic differentiation
  //

- LM_GGML_API void lm_ggml_set_param(
- struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * tensor);
+ LM_GGML_API void lm_ggml_set_param(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor);
+ LM_GGML_API void lm_ggml_set_loss(struct lm_ggml_tensor * tensor);

  LM_GGML_API void lm_ggml_build_forward_expand (struct lm_ggml_cgraph * cgraph, struct lm_ggml_tensor * tensor);
- LM_GGML_API void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, struct lm_ggml_cgraph * gb, bool keep);
+ LM_GGML_API void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, struct lm_ggml_cgraph * gb, bool accumulate, bool keep);
+
+ LM_GGML_API void lm_ggml_build_opt_adamw(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_cgraph * gf,
+ struct lm_ggml_cgraph * gb,
+ float alpha,
+ float beta1,
+ float beta2,
+ float eps,
+ float wd); // weight decay

  // graph allocation in a context
  LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph (struct lm_ggml_context * ctx); // size = LM_GGML_DEFAULT_GRAPH_SIZE, grads = false
  LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph_custom(struct lm_ggml_context * ctx, size_t size, bool grads);
  LM_GGML_API struct lm_ggml_cgraph * lm_ggml_graph_dup (struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph);
  LM_GGML_API void lm_ggml_graph_cpy (struct lm_ggml_cgraph * src, struct lm_ggml_cgraph * dst);
- LM_GGML_API void lm_ggml_graph_reset (struct lm_ggml_cgraph * cgraph); // zero grads
+ LM_GGML_API void lm_ggml_graph_reset (struct lm_ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
  LM_GGML_API void lm_ggml_graph_clear (struct lm_ggml_cgraph * cgraph);

  LM_GGML_API int lm_ggml_graph_size (struct lm_ggml_cgraph * cgraph);
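The header changes above add a complete training path to the GGML API: a tensor can be flagged as loss (LM_GGML_TENSOR_FLAG_LOSS via lm_ggml_set_loss), and lm_ggml_build_opt_adamw appends LM_GGML_OP_OPT_STEP_ADAMW nodes that update the trainable parameters in place. The following is a minimal sketch of how these calls fit together; it is an illustration, not code from the package, and it assumes the remaining ggml helpers (lm_ggml_mul_mat, lm_ggml_cross_entropy_loss, LM_GGML_DEFAULT_GRAPH_SIZE) behave as in upstream ggml.

```cpp
// Hypothetical sketch of the new AdamW training flow; only the functions
// declared in the diff above are guaranteed by this header, the rest is
// assumed to match upstream ggml.
#include "ggml.h"

static void build_train_graphs(struct lm_ggml_context * ctx,
                               struct lm_ggml_tensor  * weights, // trainable matrix
                               struct lm_ggml_tensor  * inputs,
                               struct lm_ggml_tensor  * labels,
                               struct lm_ggml_cgraph ** out_gf,
                               struct lm_ggml_cgraph ** out_gb) {
    lm_ggml_set_param(ctx, weights); // LM_GGML_TENSOR_FLAG_PARAM: trainable

    struct lm_ggml_tensor * logits = lm_ggml_mul_mat(ctx, weights, inputs);
    struct lm_ggml_tensor * loss   = lm_ggml_cross_entropy_loss(ctx, logits, labels);
    lm_ggml_set_loss(loss);          // LM_GGML_TENSOR_FLAG_LOSS: backward seeds its grad with 1

    struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx, LM_GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
    lm_ggml_build_forward_expand(gf, loss);

    struct lm_ggml_cgraph * gb = lm_ggml_graph_dup(ctx, gf);
    lm_ggml_build_backward_expand(ctx, gf, gb, /*accumulate =*/ false, /*keep =*/ true);

    // append LM_GGML_OP_OPT_STEP_ADAMW nodes that update the FLAG_PARAM tensors in place
    lm_ggml_build_opt_adamw(ctx, gf, gb,
                            /*alpha =*/ 1e-3f, /*beta1 =*/ 0.9f, /*beta2 =*/ 0.999f,
                            /*eps =*/ 1e-8f, /*wd =*/ 0.0f);

    // initialize: grads + optimizer momenta to 0, loss grad to 1 (see comment in the diff)
    lm_ggml_graph_reset(gb);

    *out_gf = gf;
    *out_gb = gb;
}
```

Each training iteration would then evaluate gb on a backend (for example with lm_ggml_backend_graph_compute), which performs the forward pass, the backward pass, and the in-place AdamW update, before the next batch is loaded into the input tensors.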
package/cpp/llama-impl.h CHANGED
@@ -28,6 +28,8 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
  #define LLAMA_LOG_INFO(...) llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
  #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
  #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+ #define LLAMA_LOG_DEBUG(...) llama_log_internal(LM_GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+ #define LLAMA_LOG_CONT(...) llama_log_internal(LM_GGML_LOG_LEVEL_CONT , __VA_ARGS__)

  //
  // helpers
@@ -3,13 +3,14 @@
  #include "llama-vocab.h"
  #include "llama-grammar.h"

- #include <cassert>
  #include <algorithm>
- #include <cstring>
- #include <ctime>
+ #include <cassert>
  #include <cfloat>
  #include <chrono>
  #include <cmath>
+ #include <cstdlib>
+ #include <cstring>
+ #include <ctime>
  #include <numeric>
  #include <random>
  #include <unordered_map>
@@ -236,9 +237,10 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte
  const int n_vocab = llama_n_vocab(llama_get_model(ctx));

  // TODO: do not allocate each time
- std::vector<llama_token_data> cur(n_vocab);
+ std::vector<llama_token_data> cur;
+ cur.reserve(n_vocab);
  for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
- cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+ cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
  }

  llama_token_data_array cur_p = {
@@ -1570,11 +1570,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
  }

  bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
- return token != -1 && (
- token == llama_token_eos_impl(vocab) ||
- token == llama_token_eot_impl(vocab) ||
- token == llama_token_eom_impl(vocab)
- );
+ return token != -1 && vocab.special_eog_ids.count(token) > 0;
  }

  bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
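With the set-based lookup above, any token collected into special_eog_ids stops generation, not just EOS/EOT/EOM. A hedged sketch of the usual stop check through the public llama_token_is_eog() wrapper, assuming the 4-argument llama_batch_get_one() of this llama.cpp vintage:

```cpp
// Hedged sketch: stop condition in a token-generation loop.
// llama_token_is_eog() is the public wrapper around llama_token_is_eog_impl().
#include "llama.h"

bool generate_until_eog(llama_context * ctx, llama_sampler * smpl, int n_past, int max_tokens) {
    for (int i = 0; i < max_tokens; ++i) {
        llama_token tok = llama_sampler_sample(smpl, ctx, -1);
        if (llama_token_is_eog(llama_get_model(ctx), tok)) {
            return true; // any token in special_eog_ids (e.g. <|eot_id|>, <|im_end|>, <EOT>) stops generation
        }
        // feed the sampled token back; positions continue from the already-decoded prefix
        llama_batch batch = llama_batch_get_one(&tok, 1, n_past + i, /*seq_id =*/ 0);
        if (llama_decode(ctx, batch) != 0) {
            return false; // decode failed
        }
    }
    return false; // token budget exhausted before an EOG token
}
```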
package/cpp/llama-vocab.h CHANGED
@@ -6,6 +6,7 @@
  #include <vector>
  #include <unordered_map>
  #include <map>
+ #include <set>

  struct llama_vocab {
  using id = llama_token;
@@ -49,12 +50,15 @@ struct llama_vocab {
  id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
  id special_eom_id = -1;

+ // set of all tokens that cause "end of generation"
+ std::set<id> special_eog_ids;
+
  // tokenizer flags
- bool tokenizer_add_space_prefix = false;
- bool tokenizer_add_bos = false;
- bool tokenizer_add_eos = false;
- bool tokenizer_ignore_merges = false;
- bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
+ bool tokenizer_add_space_prefix = false;
+ bool tokenizer_add_bos = false;
+ bool tokenizer_add_eos = false;
+ bool tokenizer_ignore_merges = false;
+ bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
  bool tokenizer_remove_extra_whitespaces = false;
  bool tokenizer_escape_whitespaces = true;
  bool tokenizer_treat_whitespace_as_suffix = false;
package/cpp/llama.cpp CHANGED
@@ -225,6 +225,8 @@ enum llm_arch {
  LLM_ARCH_NEMOTRON,
  LLM_ARCH_EXAONE,
  LLM_ARCH_RWKV6,
+ LLM_ARCH_GRANITE,
+ LLM_ARCH_GRANITE_MOE,
  LLM_ARCH_UNKNOWN,
  };

@@ -275,6 +277,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_NEMOTRON, "nemotron" },
  { LLM_ARCH_EXAONE, "exaone" },
  { LLM_ARCH_RWKV6, "rwkv6" },
+ { LLM_ARCH_GRANITE, "granite" },
+ { LLM_ARCH_GRANITE_MOE, "granitemoe" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -314,6 +318,8 @@ enum llm_kv {
  LLM_KV_RESCALE_EVERY_N_LAYERS,
  LLM_KV_TIME_MIX_EXTRA_DIM,
  LLM_KV_TIME_DECAY_EXTRA_DIM,
+ LLM_KV_RESIDUAL_SCALE,
+ LLM_KV_EMBEDDING_SCALE,

  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -328,6 +334,7 @@ enum llm_kv {
  LLM_KV_ATTENTION_KV_LORA_RANK,
  LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
  LLM_KV_ATTENTION_SLIDING_WINDOW,
+ LLM_KV_ATTENTION_SCALE,

  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_FREQ_BASE,
@@ -418,6 +425,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
  { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
  { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
+ { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
+ { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -432,6 +441,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
  { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
  { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -1465,6 +1475,41 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
  },
  },
+ {
+ LLM_ARCH_GRANITE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_GRANITE_MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -2383,6 +2428,11 @@ struct llama_hparams {
  float f_max_alibi_bias = 0.0f;
  float f_logit_scale = 0.0f;

+ // Additional scale factors (Granite/Granite MoE)
+ float f_residual_scale = 0.0f;
+ float f_embedding_scale = 0.0f;
+ float f_attention_scale = 0.0f;
+
  bool causal_attn = true;
  bool use_alibi = false;
  bool attn_soft_cap = false;
@@ -2445,6 +2495,9 @@ struct llama_hparams {
  if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
  if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
  if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
+ if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
+ if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
+ if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;

  return false;
  }
@@ -3035,18 +3088,14 @@ struct llama_sbatch {
  } else {
  // simple split
  if (batch->n_seq_id) {
- for (size_t i = 0; i < length; ++i) {
- ubatch.n_seq_id = batch->n_seq_id + seq.offset;
- }
+ ubatch.n_seq_id = batch->n_seq_id + seq.offset;
  } else {
  for (size_t i = 0; i < length; ++i) {
  ubatch.n_seq_id[ubatch.n_seqs + i] = 1;
  }
  }
  if (batch->seq_id) {
- for (size_t i = 0; i < length; ++i) {
- ubatch.seq_id = batch->seq_id + seq.offset;
- }
+ ubatch.seq_id = batch->seq_id + seq.offset;
  } else {
  for (size_t i = 0; i < length; ++i) {
  ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id;
@@ -6030,6 +6079,22 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_3B; break;
+ case 40: model.type = e_model::MODEL_3B; break;
+ // Add additional layer/vocab/etc checks here for other model sizes
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -6072,8 +6137,15 @@ static void llm_load_vocab(
  vocab.special_mask_id = -1;
  vocab.linefeed_id = -1;

+ // read vocab size from metadata
+ if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
+ vocab.n_vocab = 0;
+ LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab);
+ }
  return;
- } else if (tokenizer_model == "llama") {
+ }
+
+ if (tokenizer_model == "llama") {
  vocab.type = LLAMA_VOCAB_TYPE_SPM;

  // default special tokens
@@ -6471,21 +6543,21 @@ static void llm_load_vocab(
  // for now, we apply this workaround to find the EOT token based on its text
  if (vocab.special_eot_id == -1) {
  for (const auto & t : vocab.token_to_id) {
- if (
+ if (false
  // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
  // need to fix convert script
  //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
- (t.first == "<|eot_id|>" ||
- t.first == "<|im_end|>" ||
- t.first == "<|end|>" ||
- t.first == "<end_of_turn>" ||
- t.first == "<|endoftext|>"
- )
+ || t.first == "<|eot_id|>"
+ || t.first == "<|im_end|>"
+ || t.first == "<|end|>"
+ || t.first == "<end_of_turn>"
+ || t.first == "<|endoftext|>"
+ || t.first == "<EOT>"
  ) {
  vocab.special_eot_id = t.second;
  if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.first.c_str());
+ __func__, t.first.c_str());
  vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  }
  break;
@@ -6508,6 +6580,44 @@ static void llm_load_vocab(
  }
  }
  }
+
+ // maintain a list of tokens that cause end-of-generation
+ // this is currently determined based on the token text, which is obviously not ideal
+ // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+ vocab.special_eog_ids.clear();
+ for (const auto & t : vocab.token_to_id) {
+ if (false
+ || t.first == "<|eot_id|>"
+ || t.first == "<|im_end|>"
+ || t.first == "<|end|>"
+ || t.first == "<end_of_turn>"
+ || t.first == "<|endoftext|>"
+ || t.first == "<|eom_id|>"
+ || t.first == "<EOT>"
+ ) {
+ vocab.special_eog_ids.insert(t.second);
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+ vocab.special_eog_ids.insert(vocab.special_eos_id);
+ LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
+
+ if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+ vocab.special_eog_ids.insert(vocab.special_eot_id);
+ LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
+
+ if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+ vocab.special_eog_ids.insert(vocab.special_eom_id);
+ LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
  }

  // build special tokens cache
@@ -6711,6 +6821,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
  if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
  if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+ if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+
+ for (const auto & id : vocab.special_eog_ids) {
+ LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
+ }

  LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);

@@ -6728,6 +6843,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  }
+
+ if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+ LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+ LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+ LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+ }
  }

  // Returns false if cancelled by progress_callback
@@ -6896,6 +7017,8 @@ static bool llm_load_tensors(
  case LLM_ARCH_LLAMA:
  case LLM_ARCH_REFACT:
  case LLM_ARCH_MINICPM:
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -8879,6 +9002,11 @@ static struct lm_ggml_tensor * llm_build_inp_embd(
  lm_ggml_set_input(lctx.inp_embd);
  }

+ // For Granite architecture
+ if (hparams.f_embedding_scale != 0.0f) {
+ inpL = lm_ggml_scale(ctx, inpL, hparams.f_embedding_scale);
+ }
+
  cb(inpL, "inp_embd", -1);

  return inpL;
@@ -9880,17 +10008,36 @@ struct llm_build_context {
  const int64_t n_head_kv = hparams.n_head_kv(il);
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
  struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
- struct lm_ggml_tensor * tmp =
+ struct lm_ggml_tensor * k =
+ lm_ggml_view_3d(ctx0, kv_self.k_l[il],
+ n_embd_head_k, n_head_kv, n_ctx,
+ lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+ lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+ 0);
+
+ struct lm_ggml_tensor * tmp;
+ if (lm_ggml_is_quantized(k->type)) {
+ // dequantize to f32 -> RoPE -> quantize back
+ tmp = lm_ggml_cast(ctx0, k, LM_GGML_TYPE_F32);
+ cb(tmp, "K_f32", il);
+ for (auto * backend : lctx.backends) {
+ // Figure out which backend KV cache belongs to
+ if (lm_ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft)) {
+ lm_ggml_backend_sched_set_tensor_backend(lctx.sched, tmp, backend);
+ break;
+ }
+ }
+ tmp = lm_ggml_rope_ext_inplace(ctx0, tmp,
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(tmp, "K_shifted_f32", il);
+ tmp = lm_ggml_cpy(ctx0, tmp, k);
+ } else {
  // we rotate only the first n_rot dimensions
- lm_ggml_rope_ext_inplace(ctx0,
- lm_ggml_view_3d(ctx0, kv_self.k_l[il],
- n_embd_head_k, n_head_kv, n_ctx,
- lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
- lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- 0),
+ tmp = lm_ggml_rope_ext_inplace(ctx0, k,
  lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);
-
+ }
  cb(tmp, "K_shifted", il);
  lm_ggml_build_forward_expand(gf, tmp);
  }
@@ -10157,6 +10304,7 @@ struct llm_build_context {
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();

+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  for (int il = 0; il < n_layer; ++il) {
  struct lm_ggml_tensor * inpSA = inpL;

@@ -10209,7 +10357,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
  }

  if (il == n_layer - 1) {
@@ -10220,6 +10368,11 @@ struct llm_build_context {
  inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
  }

+ // For Granite architecture
+ if (hparams.f_residual_scale) {
+ cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+ }
+
  struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
  cb(ffn_inp, "ffn_inp", il);

@@ -10256,6 +10409,11 @@ struct llm_build_context {
  cb(cur, "ffn_moe_out", il);
  }

+ // For Granite architecture
+ if (hparams.f_residual_scale) {
+ cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+ }
+
  cur = lm_ggml_add(ctx0, cur, ffn_inp);
  cb(cur, "ffn_out", il);

@@ -10275,6 +10433,12 @@ struct llm_build_context {

  // lm_head
  cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+ // For Granite architecture
+ if (hparams.f_logit_scale) {
+ cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+ }
+
  cb(cur, "result_output", -1);

  lm_ggml_build_forward_expand(gf, cur);
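Taken together, the Granite hunks touch the shared llama graph in four places: token embeddings are multiplied by f_embedding_scale, each attention and FFN branch by f_residual_scale before the residual add, the KQ product uses f_attention_scale instead of 1/sqrt(n_embd_head) when it is non-zero, and the final logits are divided by f_logit_scale. A hypothetical summary helper (not part of the diff; field names as in llama_hparams above) restating the attention-scale rule:

```cpp
// Rough restatement of the Granite scale factors; the real work happens in the
// graph-building code shown in the hunks above.
#include <cmath>

struct granite_scales {
    float f_embedding_scale; // multiplies the token embeddings
    float f_residual_scale;  // multiplies each attention/FFN branch before the residual add
    float f_attention_scale; // replaces 1/sqrt(n_embd_head) as the KQ scale when non-zero
    float f_logit_scale;     // final logits are divided by this when non-zero
};

inline float granite_kq_scale(const granite_scales & s, int n_embd_head) {
    return s.f_attention_scale == 0.0f ? 1.0f / std::sqrt(float(n_embd_head))
                                       : s.f_attention_scale;
}
```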
@@ -15800,6 +15964,8 @@ static struct lm_ggml_cgraph * llama_build_graph(

  switch (model.arch) {
  case LLM_ARCH_LLAMA:
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
  {
  result = llm.build_llama();
  } break;
@@ -16588,7 +16754,7 @@ static int llama_decode_internal(
  const uint32_t n_tokens_all = batch_all.n_tokens;

  if (n_tokens_all == 0) {
- LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
+ LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
  return -1;
  }

@@ -16601,7 +16767,7 @@ static int llama_decode_internal(
  if (batch_all.token) {
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
  if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
- LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]);
  return -1;
  }
  }
@@ -16889,7 +17055,7 @@ static int llama_encode_internal(
  const uint32_t n_tokens = batch.n_tokens;

  if (n_tokens == 0) {
- LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
+ LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
  return -1;
  }

@@ -16902,7 +17068,7 @@ static int llama_encode_internal(
  if (batch.token) {
  for (uint32_t i = 0; i < n_tokens; ++i) {
  if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
- LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
  return -1;
  }
  }
@@ -18584,9 +18750,9 @@ struct llama_model * llama_load_model_from_file(
  unsigned percentage = (unsigned) (100 * progress);
  while (percentage > *cur_percentage_p) {
  *cur_percentage_p = percentage;
- LLAMA_LOG(".");
+ LLAMA_LOG_CONT(".");
  if (percentage >= 100) {
- LLAMA_LOG("\n");
+ LLAMA_LOG_CONT("\n");
  }
  }
  return true;
@@ -19058,6 +19224,10 @@ int32_t llama_n_layer(const struct llama_model * model) {
  return model->hparams.n_layer;
  }

+ int32_t llama_n_head(const struct llama_model * model) {
+ return model->hparams.n_head();
+ }
+
  const struct llama_model * llama_get_model(const struct llama_context * ctx) {
  return &ctx->model;
  }
@@ -19096,6 +19266,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_ARCTIC:
  case LLM_ARCH_DEEPSEEK2:
  case LLM_ARCH_CHATGLM:
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
  return LLAMA_ROPE_TYPE_NORM;

  // the pairs of head values are offset by n_rot/2
package/cpp/llama.h CHANGED
@@ -442,6 +442,7 @@ extern "C" {
  LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
  LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
  LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
+ LLAMA_API int32_t llama_n_head (const struct llama_model * model);

  LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);

@@ -1066,6 +1067,7 @@ extern "C" {
  LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);

  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+ /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
  LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);

  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
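The NOTE added to llama_sampler_init_softmax() suggests narrowing the candidate set before any stage that sorts the whole vocabulary. A hedged sketch of a sampler chain built that way, assuming the llama_sampler_chain_* / llama_sampler_init_* constructors of this llama.cpp vintage:

```cpp
// Hedged sketch: cheap truncation (top-k) first, sort-heavy work afterwards.
#include "llama.h"

llama_sampler * make_sampler(uint32_t seed) {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));        // truncate the candidate set
    llama_sampler_chain_add(chain, llama_sampler_init_top_p(0.95f, 1));  // min_keep = 1
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));       // final draw
    return chain; // release later with llama_sampler_free(chain)
}
```

llama_sampler_sample(chain, ctx, -1) then draws from the truncated distribution instead of sorting all n_vocab candidates.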
package/cpp/log.cpp CHANGED
@@ -82,7 +82,7 @@ struct gpt_log_entry {
  }
  }

- if (level != LM_GGML_LOG_LEVEL_NONE && prefix) {
+ if (level != LM_GGML_LOG_LEVEL_NONE && level != LM_GGML_LOG_LEVEL_CONT && prefix) {
  if (timestamp) {
  // [M.s.ms.us]
  fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",