llama_cpp 0.9.2 → 0.9.3

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.
@@ -91,6 +91,8 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif

+ #define LLAMA_MAX_NODES 4096
+
  //
  // logging
  //
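
Note: the new LLAMA_MAX_NODES constant moves the graph node budget from ggml (GGML_MAX_NODES) into llama.cpp itself. The later hunks in this diff use it in two places, roughly:

    // per-architecture graph builders allocate graphs with an explicit node cap
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

    // and the compute buffer is sized to hold that many tensor/graph structs
    ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());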
@@ -190,6 +192,7 @@ enum llm_arch {
  LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BLOOM,
+ LLM_ARCH_STABLELM,
  LLM_ARCH_UNKNOWN,
  };

@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
  };

  enum llm_kv {
@@ -251,6 +255,8 @@ enum llm_kv {
  LLM_KV_TOKENIZER_UNK_ID,
  LLM_KV_TOKENIZER_SEP_ID,
  LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_ADD_BOS,
+ LLM_KV_TOKENIZER_ADD_EOS,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
  };
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+ { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
  };
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  },
  },
+ {
+ LLM_ARCH_STABLELM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1055,9 +1082,9 @@ enum e_model {
  MODEL_70B,
  };

- static const size_t kB = 1024;
- static const size_t MB = 1024*kB;
- static const size_t GB = 1024*MB;
+ static const size_t kiB = 1024;
+ static const size_t MiB = 1024*kiB;
+ static const size_t GiB = 1024*MiB;

  struct llama_hparams {
  bool vocab_only;
@@ -1248,6 +1275,9 @@ struct llama_vocab {
  id special_sep_id = -1;
  id special_pad_id = -1;

+ int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+ int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
  id linefeed_id = 13;
  id special_prefix_id = 32007;
  id special_middle_id = 32009;
@@ -1453,7 +1483,7 @@ static bool llama_kv_cache_init(
  vram_kv_cache += ggml_nbytes(cache.k);
  }
  if (vram_kv_cache > 0) {
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
  }
  }
  #endif
@@ -2209,6 +2239,16 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+
  default: (void)0;
  }

@@ -2350,6 +2390,23 @@ static void llm_load_vocab(
  __func__, key.c_str(), id, old_id);
  id = old_id;
  }
+
+ }
+
+ // Handle add_bos_token and add_eos_token
+ std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+ int kid = gguf_find_key(ctx, key.c_str());
+ enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+ }
+ key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+ kid = gguf_find_key(ctx, key.c_str());
+ ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
  }
  }
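
Note: the two lookups above (add_bos_token, add_eos_token) repeat the same find-key / check-type / read-bool sequence. A possible refactor, not part of this diff and using a hypothetical helper name, could fold it into one function built from the same gguf calls:

    // hypothetical helper sketched from the calls used above
    static int gguf_get_optional_bool(const struct gguf_context * ctx, const std::string & key) {
        const int kid = gguf_find_key(ctx, key.c_str());
        if (kid < 0) {
            return -1; // key absent -> unknown
        }
        if (gguf_get_kv_type(ctx, kid) != GGUF_TYPE_BOOL) {
            return -1; // wrong type -> treat as unknown (caller may log a warning)
        }
        return gguf_get_val_bool(ctx, kid) ? 1 : 0;
    }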

@@ -2481,8 +2538,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
  LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
- if (ml.n_bytes < GB) {
- LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+ if (ml.n_bytes < GiB) {
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  } else {
  LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  }
@@ -2520,7 +2577,7 @@ static void llm_load_tensors(

  ml.calc_sizes(ctx_size, mmapped_size);

- LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);

  // create the ggml context
  {
@@ -2872,6 +2929,13 @@ static void llm_load_tensors(
  ggml_backend_type backend_output;

  if (n_gpu_layers > int(n_layer)) {
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > int(n_layer + 1)) {
+ LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+ __func__, n_layer + 1);
+ throw std::runtime_error("Persimmon CUDA offload failed");
+ }
+ #endif
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
@@ -3073,6 +3137,81 @@ static void llm_load_tensors(
  }
  }
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = llama_backend_offload;
+ #else
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+ #endif // _WIN32
+
+ backend_output = llama_backend_offload_split;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ /*
+ llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
+ */
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+ }
+ }
+ } break;
+
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -3087,7 +3226,7 @@ static void llm_load_tensors(
  ctx_size +
  mmapped_size - vram_weights; // weights in VRAM not in memory

- LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);

  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3106,7 +3245,7 @@ static void llm_load_tensors(
  #endif // GGML_USE_CUBLAS

  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
  #else
  (void) n_gpu_layers;
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3606,7 +3745,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_llama() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -3718,7 +3857,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_baichuan() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -3838,7 +3977,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_falcon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -3960,7 +4099,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_starcoder() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * pos;
@@ -4059,7 +4198,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_persimmon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  const int64_t n_rot = n_embd_head / 2;

@@ -4204,7 +4343,7 @@ struct llm_build_context {
  struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
  cb(Kcur, "Kcur", il);

- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
  cb(Q, "Q", il);

  Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
@@ -4269,7 +4408,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_refact() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4360,7 +4499,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_bloom() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4454,7 +4593,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_mpt() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4551,6 +4690,177 @@ struct llm_build_context {

  return gf;
  }
+
+ struct ggml_cgraph * build_stablelm() {
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_scale
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ cb(KQ_scale, "KQ_scale", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(tmpq, "tmpq", il);
+
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(tmpk, "tmpk", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // RoPE the first n_rot of q/k, pass the other half, and concat.
+ struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
+ ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
+ ggml_element_size(tmpq) * n_embd_head,
+ ggml_element_size(tmpq) * n_embd_head * n_head,
+ 0
+ ));
+ cb(qrot, "qrot", il);
+
+ struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
+ ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
+ ggml_element_size(tmpk) * n_embd_head,
+ ggml_element_size(tmpk) * n_embd_head * n_head_kv,
+ 0
+ ));
+ cb(krot, "krot", il);
+
+ // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
+ struct ggml_tensor * qpass = ggml_view_3d(
+ ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
+ ggml_element_size(tmpq) * n_embd_head,
+ ggml_element_size(tmpq) * n_embd_head * n_head,
+ ggml_element_size(tmpq) * hparams.n_rot
+ );
+ cb(qpass, "qpass", il);
+
+ struct ggml_tensor * kpass = ggml_view_3d(
+ ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
+ ggml_element_size(tmpk) * (n_embd_head),
+ ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
+ ggml_element_size(tmpk) * hparams.n_rot
+ );
+ cb(kpass, "kpass", il);
+
+ struct ggml_tensor * qrotated = ggml_rope_custom(
+ ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(qrotated, "qrotated", il);
+
+ struct ggml_tensor * krotated = ggml_rope_custom(
+ ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(krotated, "krotated", il);
+
+ // ggml currently only supports concatenation on dim=2
+ // so we need to permute qrot, qpass, concat, then permute back.
+ qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
+ cb(qrotated, "qrotated", il);
+
+ krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
+ cb(krotated, "krotated", il);
+
+ qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
+ cb(qpass, "qpass", il);
+
+ kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
+ cb(kpass, "kpass", il);
+
+ struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
+ cb(Q, "Q", il);
+
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+ cb(Kcur, "Kcur", il);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, hparams, kv_self,
+ model.layers[il].wo, NULL,
+ Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };

  //
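
Note: build_stablelm() applies RoPE only to the first n_rot dimensions of each attention head (the qrot/krot views) and passes the remaining dimensions through unrotated (qpass/kpass), concatenating the two pieces afterwards. A minimal plain-float sketch of that idea, using the classic adjacent-pair rotation for readability (ggml's NEOX rope mode pairs dimensions differently, so this is an illustration, not a drop-in equivalent of the graph above):

    #include <cmath>

    // rotate only the first n_rot dims of one head vector; leave the rest untouched
    static void partial_rope_head(float * x, int n_embd_head, int n_rot, int pos, float freq_base) {
        for (int i = 0; i + 1 < n_rot; i += 2) {
            const float theta = pos * std::pow(freq_base, -(float) i / (float) n_rot);
            const float x0 = x[i];
            const float x1 = x[i + 1];
            x[i]     = x0 * std::cos(theta) - x1 * std::sin(theta);
            x[i + 1] = x0 * std::sin(theta) + x1 * std::cos(theta);
        }
        // dims [n_rot, n_embd_head) are the pass-through part (the qpass/kpass views above)
        (void) n_embd_head;
    }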
@@ -5020,6 +5330,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_mpt();
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ result = llm.build_stablelm();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -5195,7 +5509,8 @@ static int llama_decode_internal(
  model.arch == LLM_ARCH_FALCON ||
  model.arch == LLM_ARCH_REFACT ||
  model.arch == LLM_ARCH_MPT ||
- model.arch == LLM_ARCH_STARCODER;
+ model.arch == LLM_ARCH_STARCODER ||
+ model.arch == LLM_ARCH_STABLELM;

  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -5987,7 +6302,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
  // and passing 'add space prefix' as bool argument
  //
- auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ if (&fragment == &fragment_buffer.front()) {
+ raw_text = " " + raw_text; // prefix with space if the first token is not special
+ }

  #ifdef PRETOKENIZERDEBUG
  fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
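
Note: the tokenizer change above makes the injected space prefix depend on fragment position rather than on the `special` flag: only the very first fragment of the input gets a leading space. A rough illustration, assuming SPM-style tokenization with special-token parsing enabled (token texts are indicative only):

    // input: "Hello</s>world" -> fragments: raw("Hello"), special("</s>"), raw("world")
    // raw fragment "Hello" is fragment_buffer.front() -> tokenized as " Hello"
    // raw fragment "world" is not the first fragment  -> tokenized as "world" (no injected space)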
@@ -7639,7 +7957,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  workers.clear();
  }

- LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
  int64_t tot_count = 0;
  for (size_t i = 0; i < hist_cur.size(); i++) {
  hist_all[i] += hist_cur[i];
@@ -8179,7 +8497,7 @@ struct llama_context * llama_new_context_with_model(

  {
  const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
- LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
  }

  // resized during inference
@@ -8196,7 +8514,7 @@ struct llama_context * llama_new_context_with_model(
  {
  static const size_t tensor_alignment = 32;
  // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
- ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+ ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

  // create measure allocator
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
@@ -8224,7 +8542,7 @@ struct llama_context * llama_new_context_with_model(
  // measure memory requirements for the graph
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

- LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);

  // recreate allocator with exact memory requirements
  ggml_allocr_free(ctx->alloc);
@@ -8238,7 +8556,7 @@ struct llama_context * llama_new_context_with_model(
  #endif
  #ifdef GGML_USE_CUBLAS
  ggml_cuda_set_scratch_size(alloc_size);
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

  // calculate total VRAM usage
  auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8258,10 +8576,10 @@ struct llama_context * llama_new_context_with_model(
  size_t ctx_vram_size = alloc_size + kv_vram_size;
  size_t total_vram_size = model_vram_size + ctx_vram_size;

- LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
  total_vram_size / 1024.0 / 1024.0,
  model_vram_size / 1024.0 / 1024.0,
- ctx_vram_size / 1024.0 / 1024.0);
+ ctx_vram_size / 1024.0 / 1024.0);
  #endif
  }

@@ -8282,7 +8600,7 @@ struct llama_context * llama_new_context_with_model(

  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);

  #define LLAMA_METAL_CHECK_BUF(result) \
  if (!(result)) { \
@@ -8585,8 +8903,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  if (kv_buf_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);

- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8604,9 +8922,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  kv_head, n_embd, n_layer,
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
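
Note on the new scratch-context size: with no_alloc = true the context only has to hold tensor and graph metadata, and this copy path creates exactly six tensors (kout3d, vout3d, the k3d/v3d views, and the two tensors produced by ggml_cpy), which is the assumed accounting behind the constant used above:

    const size_t cpy_ctx_size = 6*ggml_tensor_overhead() + ggml_graph_overhead();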

@@ -8713,8 +9031,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  const size_t elt_size = ggml_element_size(kv_self.k);

- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
  kin3d->data = (void *) inp;
@@ -8732,9 +9050,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  kv_head, n_embd, n_layer,
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -8989,6 +9307,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
  return model->vocab.linefeed_id;
  }

+ int llama_add_bos_token(const struct llama_model * model) {
+ return model->vocab.special_add_bos;
+ }
+
+ int llama_add_eos_token(const struct llama_model * model) {
+ return model->vocab.special_add_eos;
+ }
+
  llama_token llama_token_prefix(const struct llama_model * model) {
  return model->vocab.special_prefix_id;
  }
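
Note: the two new accessors expose the tri-state parsed in llm_load_vocab above (-1 = not specified in the GGUF metadata, 0 = don't add, 1 = add). A hedged usage sketch; the fallback policy for -1 is the caller's choice, not something this diff prescribes:

    const int add_bos = llama_add_bos_token(model);
    const bool prepend_bos = (add_bos == -1) ? true /* fall back to a model-type default */ : (add_bos != 0);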