llama_cpp 0.9.2 → 0.9.3

@@ -91,6 +91,8 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif

+ #define LLAMA_MAX_NODES 4096
+
  //
  // logging
  //
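Note: LLAMA_MAX_NODES is the graph-size cap used by the ggml_new_graph_custom calls and the compute-buffer sizing further down in this diff. A minimal sketch of that sizing arithmetic, assuming only the ggml.h overhead helpers the diff itself already calls (the function name here is illustrative, not part of the change):

    #include "ggml.h"
    #include <cstddef>

    // Sketch: upper bound for the metadata buffer that holds tensor and graph structs
    // (cf. ctx->buf_compute.resize(...) later in this diff).
    static size_t compute_buf_upper_bound_sketch() {
        const size_t max_nodes = 4096; // mirrors LLAMA_MAX_NODES
        return ggml_tensor_overhead()*max_nodes + ggml_graph_overhead();
    }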
@@ -190,6 +192,7 @@ enum llm_arch {
  LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BLOOM,
+ LLM_ARCH_STABLELM,
  LLM_ARCH_UNKNOWN,
  };

@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
  };

  enum llm_kv {
@@ -251,6 +255,8 @@ enum llm_kv {
  LLM_KV_TOKENIZER_UNK_ID,
  LLM_KV_TOKENIZER_SEP_ID,
  LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_ADD_BOS,
+ LLM_KV_TOKENIZER_ADD_EOS,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
  };
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+ { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
  };
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  },
  },
+ {
+ LLM_ARCH_STABLELM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1055,9 +1082,9 @@ enum e_model {
  MODEL_70B,
  };

- static const size_t kB = 1024;
- static const size_t MB = 1024*kB;
- static const size_t GB = 1024*MB;
+ static const size_t kiB = 1024;
+ static const size_t MiB = 1024*kiB;
+ static const size_t GiB = 1024*MiB;

  struct llama_hparams {
  bool vocab_only;
@@ -1248,6 +1275,9 @@ struct llama_vocab {
  id special_sep_id = -1;
  id special_pad_id = -1;

+ int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+ int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
  id linefeed_id = 13;
  id special_prefix_id = 32007;
  id special_middle_id = 32009;
@@ -1453,7 +1483,7 @@ static bool llama_kv_cache_init(
  vram_kv_cache += ggml_nbytes(cache.k);
  }
  if (vram_kv_cache > 0) {
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
  }
  }
  #endif
@@ -2209,6 +2239,16 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+
  default: (void)0;
  }

@@ -2350,6 +2390,23 @@ static void llm_load_vocab(
  __func__, key.c_str(), id, old_id);
  id = old_id;
  }
+
+ }
+
+ // Handle add_bos_token and add_eos_token
+ std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+ int kid = gguf_find_key(ctx, key.c_str());
+ enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+ }
+ key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+ kid = gguf_find_key(ctx, key.c_str());
+ ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
  }
  }

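The add_bos/add_eos handling above follows a reusable pattern: look up an optional GGUF key, check its type, and fall back to "unknown" otherwise. A minimal standalone sketch of that pattern, assuming a loaded gguf_context and the gguf API from ggml.h; the helper name is illustrative, not part of this diff:

    #include "ggml.h"

    // Sketch: read an optional GGUF bool key into a tri-state flag (-1 unknown, 0 false, 1 true),
    // mirroring the special_add_bos/special_add_eos handling above.
    static int gguf_read_optional_bool_sketch(const struct gguf_context * ctx, const char * key) {
        const int kid = gguf_find_key(ctx, key);
        if (kid < 0) {
            return -1; // key absent -> unknown
        }
        if (gguf_get_kv_type(ctx, kid) != GGUF_TYPE_BOOL) {
            return -1; // wrong type -> treat as unknown (the loader above also logs a warning)
        }
        return gguf_get_val_bool(ctx, kid) ? 1 : 0;
    }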
@@ -2481,8 +2538,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
  LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
- if (ml.n_bytes < GB) {
- LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+ if (ml.n_bytes < GiB) {
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  } else {
  LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  }
@@ -2520,7 +2577,7 @@ static void llm_load_tensors(

  ml.calc_sizes(ctx_size, mmapped_size);

- LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);

  // create the ggml context
  {
@@ -2872,6 +2929,13 @@ static void llm_load_tensors(
  ggml_backend_type backend_output;

  if (n_gpu_layers > int(n_layer)) {
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > int(n_layer + 1)) {
+ LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+ __func__, n_layer + 1);
+ throw std::runtime_error("Persimmon CUDA offload failed");
+ }
+ #endif
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
@@ -3073,6 +3137,81 @@ static void llm_load_tensors(
  }
  }
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = llama_backend_offload;
+ #else
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+ #endif // _WIN32
+
+ backend_output = llama_backend_offload_split;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ /*
+ llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
+ */
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+ }
+ }
+ } break;
+
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -3087,7 +3226,7 @@ static void llm_load_tensors(
  ctx_size +
  mmapped_size - vram_weights; // weights in VRAM not in memory

- LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);

  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3106,7 +3245,7 @@ static void llm_load_tensors(
  #endif // GGML_USE_CUBLAS

  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
  #else
  (void) n_gpu_layers;
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3606,7 +3745,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_llama() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -3718,7 +3857,7 @@
  }

  struct ggml_cgraph * build_baichuan() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -3838,7 +3977,7 @@
  }

  struct ggml_cgraph * build_falcon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -3960,7 +4099,7 @@
  }

  struct ggml_cgraph * build_starcoder() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * pos;
@@ -4059,7 +4198,7 @@
  }

  struct ggml_cgraph * build_persimmon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  const int64_t n_rot = n_embd_head / 2;

@@ -4204,7 +4343,7 @@
  struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
  cb(Kcur, "Kcur", il);

- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
  cb(Q, "Q", il);

  Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
@@ -4269,7 +4408,7 @@
  }

  struct ggml_cgraph * build_refact() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4360,7 +4499,7 @@
  }

  struct ggml_cgraph * build_bloom() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4454,7 +4593,7 @@
  }

  struct ggml_cgraph * build_mpt() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4551,6 +4690,177 @@ struct llm_build_context {

  return gf;
  }
+
+ struct ggml_cgraph * build_stablelm() {
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_scale
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ cb(KQ_scale, "KQ_scale", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(tmpq, "tmpq", il);
+
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(tmpk, "tmpk", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // RoPE the first n_rot of q/k, pass the other half, and concat.
+ struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
+ ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
+ ggml_element_size(tmpq) * n_embd_head,
+ ggml_element_size(tmpq) * n_embd_head * n_head,
+ 0
+ ));
+ cb(qrot, "qrot", il);
+
+ struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
+ ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
+ ggml_element_size(tmpk) * n_embd_head,
+ ggml_element_size(tmpk) * n_embd_head * n_head_kv,
+ 0
+ ));
+ cb(krot, "krot", il);
+
+ // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
+ struct ggml_tensor * qpass = ggml_view_3d(
+ ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
+ ggml_element_size(tmpq) * n_embd_head,
+ ggml_element_size(tmpq) * n_embd_head * n_head,
+ ggml_element_size(tmpq) * hparams.n_rot
+ );
+ cb(qpass, "qpass", il);
+
+ struct ggml_tensor * kpass = ggml_view_3d(
+ ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
+ ggml_element_size(tmpk) * (n_embd_head),
+ ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
+ ggml_element_size(tmpk) * hparams.n_rot
+ );
+ cb(kpass, "kpass", il);
+
+ struct ggml_tensor * qrotated = ggml_rope_custom(
+ ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(qrotated, "qrotated", il);
+
+ struct ggml_tensor * krotated = ggml_rope_custom(
+ ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(krotated, "krotated", il);
+
+ // ggml currently only supports concatenation on dim=2
+ // so we need to permute qrot, qpass, concat, then permute back.
+ qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
+ cb(qrotated, "qrotated", il);
+
+ krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
+ cb(krotated, "krotated", il);
+
+ qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
+ cb(qpass, "qpass", il);
+
+ kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
+ cb(kpass, "kpass", il);
+
+ struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
+ cb(Q, "Q", il);
+
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+ cb(Kcur, "Kcur", il);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, hparams, kv_self,
+ model.layers[il].wo, NULL,
+ Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };

  //
@@ -5020,6 +5330,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_mpt();
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ result = llm.build_stablelm();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -5195,7 +5509,8 @@ static int llama_decode_internal(
  model.arch == LLM_ARCH_FALCON ||
  model.arch == LLM_ARCH_REFACT ||
  model.arch == LLM_ARCH_MPT ||
- model.arch == LLM_ARCH_STARCODER;
+ model.arch == LLM_ARCH_STARCODER ||
+ model.arch == LLM_ARCH_STABLELM;

  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -5987,7 +6302,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
  // and passing 'add space prefix' as bool argument
  //
- auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ if (&fragment == &fragment_buffer.front()) {
+ raw_text = " " + raw_text; // prefix with space if the first token is not special
+ }

  #ifdef PRETOKENIZERDEBUG
  fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
@@ -7639,7 +7957,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  workers.clear();
  }

- LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
  int64_t tot_count = 0;
  for (size_t i = 0; i < hist_cur.size(); i++) {
  hist_all[i] += hist_cur[i];
@@ -8179,7 +8497,7 @@ struct llama_context * llama_new_context_with_model(

  {
  const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
- LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
  }

  // resized during inference
@@ -8196,7 +8514,7 @@ struct llama_context * llama_new_context_with_model(
  {
  static const size_t tensor_alignment = 32;
  // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
- ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+ ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

  // create measure allocator
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
@@ -8224,7 +8542,7 @@
  // measure memory requirements for the graph
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

- LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);

  // recreate allocator with exact memory requirements
  ggml_allocr_free(ctx->alloc);
@@ -8238,7 +8556,7 @@
  #endif
  #ifdef GGML_USE_CUBLAS
  ggml_cuda_set_scratch_size(alloc_size);
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

  // calculate total VRAM usage
  auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8258,10 +8576,10 @@
  size_t ctx_vram_size = alloc_size + kv_vram_size;
  size_t total_vram_size = model_vram_size + ctx_vram_size;

- LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
  total_vram_size / 1024.0 / 1024.0,
  model_vram_size / 1024.0 / 1024.0,
- ctx_vram_size / 1024.0 / 1024.0);
+ ctx_vram_size / 1024.0 / 1024.0);
  #endif
  }

@@ -8282,7 +8600,7 @@

  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);

  #define LLAMA_METAL_CHECK_BUF(result) \
  if (!(result)) { \
@@ -8585,8 +8903,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  if (kv_buf_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);

- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8604,9 +8922,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  kv_head, n_embd, n_layer,
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);

@@ -8713,8 +9031,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  const size_t elt_size = ggml_element_size(kv_self.k);

- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
  kin3d->data = (void *) inp;
@@ -8732,9 +9050,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  kv_head, n_embd, n_layer,
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
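The session save/load paths above now size their scratch ggml context from the metadata overhead helpers instead of a hard-coded 4096 bytes. A minimal sketch of that sizing, under the assumption that the context only ever holds tensor/graph bookkeeping (no_alloc is true, so tensor data lives in external buffers); the helper name is illustrative, not part of this diff:

    #include "ggml.h"

    // Sketch: a metadata-only ggml context big enough for n_tensors tensor headers plus one graph header.
    static struct ggml_context * new_meta_ctx_sketch(size_t n_tensors) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ n_tensors*ggml_tensor_overhead() + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // do not allocate tensor data inside this context
        };
        return ggml_init(params);
    }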
@@ -8989,6 +9307,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
  return model->vocab.linefeed_id;
  }

+ int llama_add_bos_token(const struct llama_model * model) {
+ return model->vocab.special_add_bos;
+ }
+
+ int llama_add_eos_token(const struct llama_model * model) {
+ return model->vocab.special_add_eos;
+ }
+
  llama_token llama_token_prefix(const struct llama_model * model) {
  return model->vocab.special_prefix_id;
  }
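A minimal caller-side sketch of the two new accessors, assuming they are exported through llama.h; the wrapper and its fallback policy are illustrative, not part of this diff. The tri-state return matches the special_add_bos/special_add_eos fields introduced above (-1 unknown, 0 don't add, 1 add):

    #include "llama.h"

    // Sketch: decide whether to prepend BOS before tokenizing.
    // Policy chosen here (an assumption, not the library's): default to adding BOS when the
    // GGUF metadata did not say either way (-1).
    static bool should_add_bos_sketch(const struct llama_model * model) {
        const int add_bos = llama_add_bos_token(model); // -1 unknown, 0 no, 1 yes
        return add_bos != 0;
    }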