llama_cpp 0.3.4 → 0.3.6

@@ -56,8 +56,14 @@
56
56
  #pragma warning(disable: 4244 4267) // possible loss of data
57
57
  #endif
58
58
 
59
+ #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
60
+ #include "ggml-alloc.h"
61
+ #define LLAMA_USE_ALLOCATOR
62
+ #else
59
63
  #define LLAMA_USE_SCRATCH
60
64
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
65
+ #endif
66
+
61
67
 
62
68
  // available llama models
63
69
  enum e_model {
@@ -67,6 +73,7 @@ enum e_model {
67
73
  MODEL_13B,
68
74
  MODEL_30B,
69
75
  MODEL_65B,
76
+ MODEL_70B,
70
77
  };
71
78
 
72
79
  static const size_t kB = 1024;
@@ -98,18 +105,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
98
105
  }
99
106
 
100
107
  //
101
- // memory sizes
108
+ // memory sizes (calculated for n_batch == 512)
102
109
  //
103
110
 
104
111
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
105
112
  {
106
113
  static std::map<e_model, size_t> k_sizes = {
107
- /* empirical scaling, still a guess */
108
- { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB },
109
- { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB },
110
- { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB },
111
- { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB },
112
- { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB },
114
+ { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
115
+ { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
116
+ { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
117
+ { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
118
+ { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
119
+ { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
113
120
  };
114
121
  return k_sizes;
115
122
  }
@@ -117,38 +124,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
117
124
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
118
125
  {
119
126
  static std::map<e_model, size_t> k_sizes = {
120
- { MODEL_3B, 256ull * MB },
121
- { MODEL_7B, 512ull * MB },
122
- { MODEL_13B, 512ull * MB },
123
- { MODEL_30B, 512ull * MB },
124
- { MODEL_65B, 1024ull * MB },
125
- };
126
- return k_sizes;
127
- }
128
-
129
- // 2*n_embd*n_ctx*n_layer*sizeof(float16)
130
- static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
131
- {
132
- static std::map<e_model, size_t> k_sizes = {
133
- { MODEL_3B, 682ull * MB },
134
- { MODEL_7B, 1026ull * MB },
135
- { MODEL_13B, 1608ull * MB },
136
- { MODEL_30B, 3124ull * MB },
137
- { MODEL_65B, 5120ull * MB },
127
+ { MODEL_3B, 128ull * MB },
128
+ { MODEL_7B, 160ull * MB },
129
+ { MODEL_13B, 192ull * MB },
130
+ { MODEL_30B, 256ull * MB },
131
+ { MODEL_65B, 384ull * MB }, // guess
132
+ { MODEL_70B, 304ull * MB },
138
133
  };
139
134
  return k_sizes;
140
135
  }
141
136
 
142
- // this is mostly needed for temporary mul_mat buffers to dequantize the data
143
- // not actually needed if BLAS is disabled
144
- static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
137
+ // used to store the compute graph tensors + non-scratch data
138
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
145
139
  {
146
140
  static std::map<e_model, size_t> k_sizes = {
147
- { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
148
- { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
149
- { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
150
- { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
151
- { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
141
+ { MODEL_3B, 8ull * MB },
142
+ { MODEL_7B, 10ull * MB },
143
+ { MODEL_13B, 12ull * MB },
144
+ { MODEL_30B, 16ull * MB },
145
+ { MODEL_65B, 24ull * MB }, // guess
146
+ { MODEL_70B, 24ull * MB },
152
147
  };
153
148
  return k_sizes;
154
149
  }
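
For orientation, the retuned tables above (now calibrated for n_batch == 512) add up as follows for a 7B model at n_ctx = 2048 on the scratch-buffer path, i.e. when LLAMA_USE_ALLOCATOR is not defined. A minimal sketch, with MB being the same 1024*1024 constant llama.cpp already defines:

    // assumed: MODEL_7B, n_ctx = 2048, scratch-buffer path (no ggml-alloc)
    const size_t MB = 1024ull*1024ull;
    const size_t scratch0 = (2048ull/16ull + 100ull)*MB;  // 228 MB, from MEM_REQ_SCRATCH0
    const size_t scratch1 = 160ull*MB;                    // 160 MB, from MEM_REQ_SCRATCH1
    const size_t eval     =  10ull*MB;                    //  10 MB, from MEM_REQ_EVAL
    // ~398 MB of compute buffers on top of the model weights and the KV cache
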
@@ -163,6 +158,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
163
158
  { MODEL_13B, 640ull * kB },
164
159
  { MODEL_30B, 768ull * kB },
165
160
  { MODEL_65B, 1536ull * kB },
161
+ { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
166
162
  };
167
163
  return k_sizes;
168
164
  }
@@ -177,19 +173,26 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
177
173
  { MODEL_13B, 160ull },
178
174
  { MODEL_30B, 208ull },
179
175
  { MODEL_65B, 416ull },
176
+ { MODEL_70B, 416ull }, // TODO (likely can be reduced)
180
177
  };
181
178
  return k_sizes;
182
179
  }
183
180
 
184
181
  // default hparams (LLaMA 7B)
185
182
  struct llama_hparams {
186
- uint32_t n_vocab = 32000;
187
- uint32_t n_ctx = 512; // this is provided as user input?
188
- uint32_t n_embd = 4096;
189
- uint32_t n_mult = 256;
190
- uint32_t n_head = 32;
191
- uint32_t n_layer = 32;
192
- uint32_t n_rot = 64;
183
+ uint32_t n_vocab = 32000;
184
+ uint32_t n_ctx = 512; // this is provided as user input?
185
+ uint32_t n_embd = 4096;
186
+ uint32_t n_mult = 256;
187
+ uint32_t n_head = 32;
188
+ uint32_t n_head_kv = 32;
189
+ uint32_t n_layer = 32;
190
+ uint32_t n_rot = 64;
191
+
192
+ // LLaMAv2
193
+ // TODO: load from model data hparams
194
+ float f_ffn_mult = 1.0f;
195
+ float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
193
196
 
194
197
  float rope_freq_base = 10000.0f;
195
198
  float rope_freq_scale = 1.0f;
@@ -197,7 +200,28 @@ struct llama_hparams {
197
200
  enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
198
201
 
199
202
  bool operator!=(const llama_hparams & other) const {
200
- return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
203
+ return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
204
+ }
205
+
206
+ uint32_t n_gqa() const {
207
+ return n_head/n_head_kv;
208
+ }
209
+
210
+ uint32_t n_embd_head() const {
211
+ return n_embd/n_head;
212
+ }
213
+
214
+ uint32_t n_embd_gqa() const {
215
+ return n_embd/n_gqa();
216
+ }
217
+
218
+ size_t kv_size() const {
219
+ size_t result = 2ull;
220
+ result *= (size_t) n_embd_gqa();
221
+ result *= (size_t) n_ctx;
222
+ result *= (size_t) n_layer;
223
+ result *= sizeof(ggml_fp16_t);
224
+ return result;
201
225
  }
202
226
  };
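
A worked example of the new helpers, using dimensions assumed from the LLaMA-2 70B params.json (n_embd = 8192, n_head = 64, n_head_kv = 8, n_layer = 80) at n_ctx = 4096 — kv_size() replaces the old fixed MEM_REQ_KV_SELF table. A sketch (assert requires <cassert>):

    llama_hparams h;
    h.n_embd = 8192; h.n_head = 64; h.n_head_kv = 8; h.n_layer = 80; h.n_ctx = 4096;
    // h.n_gqa()      = 64/8   = 8
    // h.n_embd_gqa() = 8192/8 = 1024  -> each cached K/V row is 8x smaller than n_embd
    // h.kv_size()    = 2 * 1024 * 4096 * 80 * sizeof(ggml_fp16_t)
    //                = 1,342,177,280 bytes (~1280 MB)
    assert(h.kv_size() == 2ull*1024*4096*80*sizeof(ggml_fp16_t));
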
203
227
 
@@ -309,13 +333,22 @@ struct llama_model {
309
333
 
310
334
  struct llama_context {
311
335
  llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
312
- #ifdef GGML_USE_METAL
313
336
  ~llama_context() {
337
+ if (model_owner) {
338
+ delete &model;
339
+ }
340
+ #ifdef GGML_USE_METAL
314
341
  if (ctx_metal) {
315
342
  ggml_metal_free(ctx_metal);
316
343
  }
317
- }
318
344
  #endif
345
+ #ifdef LLAMA_USE_ALLOCATOR
346
+ if (alloc) {
347
+ ggml_allocr_free(alloc);
348
+ }
349
+ #endif
350
+ }
351
+
319
352
  std::mt19937 rng;
320
353
 
321
354
  bool has_evaluated_once = false;
@@ -353,7 +386,17 @@ struct llama_context {
353
386
  // memory buffers used to evaluate the model
354
387
  // TODO: move in llama_state
355
388
  llama_ctx_buffer buf_compute;
389
+
390
+ #ifdef LLAMA_USE_ALLOCATOR
391
+ llama_ctx_buffer buf_alloc;
392
+ ggml_allocr * alloc = NULL;
393
+ #endif
394
+
395
+ #ifdef LLAMA_USE_SCRATCH
356
396
  llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
397
+ int buf_last = 0;
398
+ size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
399
+ #endif
357
400
 
358
401
  #ifdef GGML_USE_METAL
359
402
  ggml_metal_context * ctx_metal = NULL;
@@ -363,9 +406,6 @@ struct llama_context {
363
406
  ggml_mpi_context * ctx_mpi = NULL;
364
407
  #endif
365
408
 
366
- int buf_last = 0;
367
- size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
368
-
369
409
  void use_buf(struct ggml_context * ctx, int i) {
370
410
  #if defined(LLAMA_USE_SCRATCH)
371
411
  size_t last_size = 0;
@@ -499,12 +539,16 @@ struct llama_file_loader {
499
539
  }
500
540
  void read_hparams() {
501
541
  hparams.n_vocab = file.read_u32();
502
- hparams.n_embd = file.read_u32();
503
- hparams.n_mult = file.read_u32();
504
- hparams.n_head = file.read_u32();
542
+ hparams.n_embd = file.read_u32();
543
+ hparams.n_mult = file.read_u32();
544
+ hparams.n_head = file.read_u32();
505
545
  hparams.n_layer = file.read_u32();
506
- hparams.n_rot = file.read_u32();
507
- hparams.ftype = (enum llama_ftype) file.read_u32();
546
+ hparams.n_rot = file.read_u32();
547
+ hparams.ftype = (enum llama_ftype) file.read_u32();
548
+
549
+ // LLaMAv2
550
+ // TODO: read from header
551
+ hparams.n_head_kv = hparams.n_head;
508
552
  }
509
553
  void read_vocab() {
510
554
  vocab.id_to_token.resize(hparams.n_vocab);
@@ -803,7 +847,7 @@ static bool kv_cache_init(
803
847
  ggml_type wtype,
804
848
  int n_ctx,
805
849
  int n_gpu_layers) {
806
- const int n_embd = hparams.n_embd;
850
+ const int n_embd = hparams.n_embd_gqa();
807
851
  const int n_layer = hparams.n_layer;
808
852
 
809
853
  const int64_t n_mem = n_layer*n_ctx;
@@ -847,6 +891,8 @@ struct llama_context_params llama_context_default_params() {
847
891
  /*.seed =*/ LLAMA_DEFAULT_SEED,
848
892
  /*.n_ctx =*/ 512,
849
893
  /*.n_batch =*/ 512,
894
+ /*.n_gqa =*/ 1,
895
+ /*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
850
896
  /*.gpu_layers =*/ 0,
851
897
  /*.main_gpu =*/ 0,
852
898
  /*.tensor_split =*/ nullptr,
@@ -855,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
855
901
  /*.progress_callback =*/ nullptr,
856
902
  /*.progress_callback_user_data =*/ nullptr,
857
903
  /*.low_vram =*/ false,
904
+ /*.mul_mat_q =*/ false,
858
905
  /*.f16_kv =*/ true,
859
906
  /*.logits_all =*/ false,
860
907
  /*.vocab_only =*/ false,
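
Because n_gqa, rms_norm_eps and mul_mat_q are not read from the model file yet, callers have to set them explicitly when loading a LLaMA-2 70B checkpoint. A minimal usage sketch (the model path is hypothetical; 1e-5 is the norm_eps from the LLaMA-2 reference params.json):

    llama_context_params params = llama_context_default_params();
    params.n_ctx        = 4096;
    params.n_gqa        = 8;       // 70B: 64 query heads, 8 key/value heads
    params.rms_norm_eps = 1e-5f;   // LLaMA-2 value; the default remains LLAMA_DEFAULT_RMS_EPS
    params.mul_mat_q    = true;    // opt into the quantized mat-mul kernels on CUBLAS builds

    llama_model   * model = llama_load_model_from_file("llama-2-70b.ggmlv3.q4_K_M.bin", params);
    llama_context * ctx   = llama_new_context_with_model(model, params);
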
@@ -966,6 +1013,7 @@ static const char *llama_model_type_name(e_model type) {
966
1013
  case MODEL_13B: return "13B";
967
1014
  case MODEL_30B: return "30B";
968
1015
  case MODEL_65B: return "65B";
1016
+ case MODEL_70B: return "70B";
969
1017
  default: LLAMA_ASSERT(false);
970
1018
  }
971
1019
  }
@@ -976,9 +1024,12 @@ static void llama_model_load_internal(
976
1024
  llama_vocab & vocab,
977
1025
  int n_ctx,
978
1026
  int n_batch,
1027
+ int n_gqa,
1028
+ float rms_norm_eps,
979
1029
  int n_gpu_layers,
980
1030
  int main_gpu,
981
1031
  const float * tensor_split,
1032
+ const bool mul_mat_q,
982
1033
  float rope_freq_base,
983
1034
  float rope_freq_scale,
984
1035
  bool low_vram,
@@ -997,8 +1048,12 @@ static void llama_model_load_internal(
997
1048
  model.hparams = ml->file_loader->hparams;
998
1049
  model.n_gpu_layers = n_gpu_layers;
999
1050
  llama_file_version file_version = ml->file_loader->file_version;
1051
+
1000
1052
  auto & hparams = model.hparams;
1001
1053
 
1054
+ // TODO: read from file
1055
+ hparams.f_rms_norm_eps = rms_norm_eps;
1056
+
1002
1057
  {
1003
1058
  switch (hparams.n_layer) {
1004
1059
  case 26: model.type = e_model::MODEL_3B; break;
@@ -1016,11 +1071,25 @@ static void llama_model_load_internal(
1016
1071
 
1017
1072
  hparams.n_ctx = n_ctx;
1018
1073
 
1074
+ // LLaMAv2
1075
+ // TODO: temporary until GGUF
1076
+ LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
1077
+ hparams.n_head_kv = hparams.n_head / n_gqa;
1078
+ if (model.type == e_model::MODEL_65B && n_gqa == 8) {
1079
+ fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
1080
+ model.type = e_model::MODEL_70B;
1081
+ hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
1082
+ }
1083
+
1019
1084
  hparams.rope_freq_base = rope_freq_base;
1020
1085
  hparams.rope_freq_scale = rope_freq_scale;
1021
1086
  }
1022
1087
 
1023
- const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
1088
+ // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
1089
+ const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
1090
+ const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
1091
+ const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
1092
+ //const uint32_t n_ff = 28672;
1024
1093
 
1025
1094
  {
1026
1095
  fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
@@ -1029,12 +1098,15 @@ static void llama_model_load_internal(
1029
1098
  fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
1030
1099
  fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
1031
1100
  fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
1101
+ fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
1032
1102
  fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
1033
- fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
1103
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
1104
+ fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
1105
+ fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
1106
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
1034
1107
  fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
1035
1108
  fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1036
1109
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
1037
- fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
1038
1110
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
1039
1111
  }
1040
1112
 
@@ -1069,7 +1141,7 @@ static void llama_model_load_internal(
1069
1141
  {
1070
1142
  model.buf.resize(ctx_size);
1071
1143
  if (use_mlock) {
1072
- model.mlock_buf.init(model.buf.addr);
1144
+ model.mlock_buf.init (model.buf.addr);
1073
1145
  model.mlock_buf.grow_to(model.buf.size);
1074
1146
  }
1075
1147
 
@@ -1086,9 +1158,11 @@ static void llama_model_load_internal(
1086
1158
  }
1087
1159
 
1088
1160
  (void) main_gpu;
1161
+ (void) mul_mat_q;
1089
1162
  #if defined(GGML_USE_CUBLAS)
1090
1163
  fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
1091
1164
  ggml_cuda_set_main_device(main_gpu);
1165
+ ggml_cuda_set_mul_mat_q(mul_mat_q);
1092
1166
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1093
1167
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1094
1168
  #elif defined(GGML_USE_CLBLAST)
@@ -1104,9 +1178,10 @@ static void llama_model_load_internal(
1104
1178
  size_t vram_weights = 0;
1105
1179
  size_t vram_scratch = 0;
1106
1180
  {
1107
- const uint32_t n_embd = hparams.n_embd;
1108
- const uint32_t n_layer = hparams.n_layer;
1109
- const uint32_t n_vocab = hparams.n_vocab;
1181
+ const uint32_t n_embd = hparams.n_embd;
1182
+ const uint32_t n_embd_gqa = hparams.n_embd_gqa();
1183
+ const uint32_t n_layer = hparams.n_layer;
1184
+ const uint32_t n_vocab = hparams.n_vocab;
1110
1185
 
1111
1186
  ml->ggml_ctx = ctx;
1112
1187
 
@@ -1154,16 +1229,16 @@ static void llama_model_load_internal(
1154
1229
 
1155
1230
  layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
1156
1231
 
1157
- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
1158
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
1159
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
1160
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
1232
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
1233
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
1234
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
1235
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
1161
1236
 
1162
1237
  layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
1163
1238
 
1164
- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
1165
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
1166
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
1239
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
1240
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
1241
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
1167
1242
 
1168
1243
  if (backend == GGML_BACKEND_GPU) {
1169
1244
  vram_weights +=
@@ -1181,16 +1256,20 @@ static void llama_model_load_internal(
1181
1256
  const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
1182
1257
 
1183
1258
  // this is the total memory required to run the inference
1184
- const size_t mem_required =
1259
+ size_t mem_required =
1185
1260
  ctx_size +
1186
- mmapped_size - vram_weights + // weights in VRAM not in memory
1261
+ mmapped_size - vram_weights; // weights in VRAM not in memory
1262
+
1263
+ #ifndef LLAMA_USE_ALLOCATOR
1264
+ mem_required +=
1187
1265
  MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
1188
1266
  MEM_REQ_SCRATCH1().at(model.type) +
1189
- MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
1267
+ MEM_REQ_EVAL().at(model.type);
1268
+ #endif
1190
1269
 
1191
1270
  // this is the memory required by one llama_state
1192
1271
  const size_t mem_required_state =
1193
- scale*MEM_REQ_KV_SELF().at(model.type);
1272
+ scale*hparams.kv_size();
1194
1273
 
1195
1274
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
1196
1275
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1231,7 +1310,7 @@ static void llama_model_load_internal(
1231
1310
  fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
1232
1311
  } else {
1233
1312
  fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
1234
- vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1313
+ vram_kv_cache += hparams.kv_size() / 2;
1235
1314
  }
1236
1315
  }
1237
1316
  if (n_gpu_layers > (int) hparams.n_layer + 2) {
@@ -1239,7 +1318,7 @@ static void llama_model_load_internal(
1239
1318
  fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
1240
1319
  } else {
1241
1320
  fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
1242
- vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1321
+ vram_kv_cache += hparams.kv_size() / 2;
1243
1322
  }
1244
1323
  }
1245
1324
  #elif defined(GGML_USE_CLBLAST)
@@ -1287,9 +1366,12 @@ static bool llama_model_load(
1287
1366
  llama_vocab & vocab,
1288
1367
  int n_ctx,
1289
1368
  int n_batch,
1369
+ int n_gqa,
1370
+ float rms_norm_eps,
1290
1371
  int n_gpu_layers,
1291
1372
  int main_gpu,
1292
1373
  const float * tensor_split,
1374
+ const bool mul_mat_q,
1293
1375
  float rope_freq_base,
1294
1376
  float rope_freq_scale,
1295
1377
  bool low_vram,
@@ -1300,7 +1382,8 @@ static bool llama_model_load(
1300
1382
  llama_progress_callback progress_callback,
1301
1383
  void *progress_callback_user_data) {
1302
1384
  try {
1303
- llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
1385
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
1386
+ main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
1304
1387
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1305
1388
  return true;
1306
1389
  } catch (const std::exception & err) {
@@ -1309,32 +1392,15 @@ static bool llama_model_load(
1309
1392
  }
1310
1393
  }
1311
1394
 
1312
- // evaluate the transformer
1313
- //
1314
- // - lctx: llama context
1315
- // - tokens: new batch of tokens to process
1316
- // - embd embeddings input
1317
- // - n_tokens number of tokens
1318
- // - n_past: the context size so far
1319
- // - n_threads: number of threads to use
1320
- //
1321
- static bool llama_eval_internal(
1395
+ static struct ggml_cgraph * llama_build_graph(
1322
1396
  llama_context & lctx,
1323
1397
  const llama_token * tokens,
1324
1398
  const float * embd,
1325
1399
  int n_tokens,
1326
- int n_past,
1327
- int n_threads,
1328
- const char * cgraph_fname) {
1400
+ int n_past) {
1329
1401
 
1330
1402
  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1331
1403
 
1332
- #ifdef GGML_USE_MPI
1333
- ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
1334
- #endif
1335
-
1336
- const int64_t t_start_us = ggml_time_us();
1337
-
1338
1404
  const int N = n_tokens;
1339
1405
 
1340
1406
  const auto & model = lctx.model;
@@ -1344,40 +1410,54 @@ static bool llama_eval_internal(
1344
1410
 
1345
1411
  LLAMA_ASSERT(!!kv_self.ctx);
1346
1412
 
1347
- const int n_embd = hparams.n_embd;
1348
- const int n_layer = hparams.n_layer;
1349
- const int n_ctx = hparams.n_ctx;
1350
- const int n_head = hparams.n_head;
1351
- const int n_vocab = hparams.n_vocab;
1352
- const int n_rot = hparams.n_embd/hparams.n_head;
1353
- const int n_gpu_layers = model.n_gpu_layers;
1413
+ const int64_t n_embd = hparams.n_embd;
1414
+ const int64_t n_layer = hparams.n_layer;
1415
+ const int64_t n_ctx = hparams.n_ctx;
1416
+ const int64_t n_head = hparams.n_head;
1417
+ const int64_t n_head_kv = hparams.n_head_kv;
1418
+ const int64_t n_embd_head = hparams.n_embd_head();
1419
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
1420
+
1421
+ LLAMA_ASSERT(n_embd_head == hparams.n_rot);
1354
1422
 
1355
1423
  const float freq_base = hparams.rope_freq_base;
1356
1424
  const float freq_scale = hparams.rope_freq_scale;
1425
+ const float rms_norm_eps = hparams.f_rms_norm_eps;
1426
+
1427
+ const int n_gpu_layers = model.n_gpu_layers;
1357
1428
 
1358
1429
  auto & mem_per_token = lctx.mem_per_token;
1359
1430
  auto & buf_compute = lctx.buf_compute;
1360
1431
 
1432
+
1361
1433
  struct ggml_init_params params = {
1362
1434
  /*.mem_size =*/ buf_compute.size,
1363
1435
  /*.mem_buffer =*/ buf_compute.addr,
1364
1436
  /*.no_alloc =*/ false,
1365
1437
  };
1366
1438
 
1367
- struct ggml_context * ctx0 = ggml_init(params);
1439
+ #ifdef LLAMA_USE_ALLOCATOR
1440
+ params.no_alloc = true;
1441
+ #endif
1368
1442
 
1369
- ggml_cgraph gf = {};
1443
+ struct ggml_context * ctx0 = ggml_init(params);
1370
1444
 
1371
- // for big prompts, if BLAS is enabled, it is better to use only one thread
1372
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
1373
- n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1445
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
1374
1446
 
1375
1447
  struct ggml_tensor * cur;
1376
1448
  struct ggml_tensor * inpL;
1377
1449
 
1378
1450
  if (tokens) {
1379
1451
  struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1452
+
1453
+ #ifdef LLAMA_USE_ALLOCATOR
1454
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
1455
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1456
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
1457
+ }
1458
+ #else
1380
1459
  memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
1460
+ #endif
1381
1461
  ggml_set_name(inp_tokens, "inp_tokens");
1382
1462
 
1383
1463
  inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1387,7 +1467,15 @@ static bool llama_eval_internal(
1387
1467
  #endif
1388
1468
 
1389
1469
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
1470
+
1471
+ #ifdef LLAMA_USE_ALLOCATOR
1472
+ ggml_allocr_alloc(lctx.alloc, inpL);
1473
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1474
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1475
+ }
1476
+ #else
1390
1477
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1478
+ #endif
1391
1479
  }
1392
1480
 
1393
1481
  const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1414,6 +1502,17 @@ static bool llama_eval_internal(
1414
1502
  }
1415
1503
  #endif // GGML_USE_CUBLAS
1416
1504
 
1505
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
1506
+ #ifdef LLAMA_USE_ALLOCATOR
1507
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
1508
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1509
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
1510
+ }
1511
+ #else
1512
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
1513
+ #endif
1514
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
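
The same allocate-or-measure pattern guards every graph input above (inp_tokens, inpL, KQ_scale): the tensor is first registered with the allocator, and its data is written only when this is not a measure pass. Condensed into one sketch, where t is any input tensor and fill_input() is a hypothetical stand-in for the memcpy/ggml_set_f32 call:

    #ifdef LLAMA_USE_ALLOCATOR
        ggml_allocr_alloc(lctx.alloc, t);
        if (!ggml_allocr_is_measure(lctx.alloc)) {
            fill_input(t);  // t->data points at real memory only outside the measure pass
        }
    #else
        fill_input(t);      // without the allocator, ggml_init allocated t->data eagerly
    #endif
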
1515
+
1417
1516
  for (int il = 0; il < n_layer; ++il) {
1418
1517
  ggml_format_name(inpL, "layer_inp_%d", il);
1419
1518
 
@@ -1431,7 +1530,7 @@ static bool llama_eval_internal(
1431
1530
 
1432
1531
  // norm
1433
1532
  {
1434
- cur = ggml_rms_norm(ctx0, inpL);
1533
+ cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
1435
1534
  offload_func(cur);
1436
1535
  ggml_set_name(cur, "rms_norm_0");
1437
1536
 
@@ -1452,11 +1551,11 @@ static bool llama_eval_internal(
1452
1551
  offload_func_kq(tmpq);
1453
1552
  ggml_set_name(tmpq, "tmpq");
1454
1553
 
1455
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
1554
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
1456
1555
  offload_func_kq(Kcur);
1457
1556
  ggml_set_name(Kcur, "Kcur");
1458
1557
 
1459
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
1558
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
1460
1559
  offload_func_kq(Qcur);
1461
1560
  ggml_set_name(Qcur, "Qcur");
1462
1561
 
@@ -1468,23 +1567,23 @@ static bool llama_eval_internal(
1468
1567
  offload_func_v(tmpv);
1469
1568
  ggml_set_name(tmpv, "tmpv");
1470
1569
 
1471
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
1570
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
1472
1571
  offload_func_v(Vcur);
1473
1572
  ggml_set_name(Vcur, "Vcur");
1474
1573
 
1475
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
1574
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
1476
1575
  offload_func_kq(k);
1477
1576
  ggml_set_name(k, "k");
1478
1577
 
1479
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
1578
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
1480
1579
  ( n_ctx)*ggml_element_size(kv_self.v),
1481
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
1580
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
1482
1581
  offload_func_v(v);
1483
1582
  ggml_set_name(v, "v");
1484
1583
 
1485
1584
  // important: storing RoPE-ed version of K in the KV cache!
1486
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
1487
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
1585
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
1586
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
1488
1587
  }
1489
1588
 
1490
1589
  struct ggml_tensor * Q =
@@ -1497,8 +1596,8 @@ static bool llama_eval_internal(
1497
1596
  struct ggml_tensor * K =
1498
1597
  ggml_permute(ctx0,
1499
1598
  ggml_reshape_3d(ctx0,
1500
- ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
1501
- n_embd/n_head, n_head, n_past + N),
1599
+ ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
1600
+ n_embd_head, n_head_kv, n_past + N),
1502
1601
  0, 2, 1, 3);
1503
1602
  offload_func_kq(K);
1504
1603
  ggml_set_name(K, "K");
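
For reference, the tensor shapes produced by the grouped-query attention path, assuming LLaMA-2 70B dimensions (n_embd = 8192, n_head = 64, n_head_kv = 8):

    const int64_t n_embd_head = 8192/64;      // 128
    const int64_t n_embd_gqa  = 8192/(64/8);  // 1024
    // Qcur: [n_embd_head, n_head,    N] = [128, 64, N]
    // Kcur: [n_embd_head, n_head_kv, N] = [128,  8, N]  -> 8x fewer K rows cached per token
    // Vcur: [N, n_embd_gqa]             = [N, 1024]     -> 8x fewer V rows cached per token
    // K (above, after the permute)      : [n_embd_head, n_past + N, n_head_kv]
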
@@ -1508,10 +1607,7 @@ static bool llama_eval_internal(
1508
1607
  offload_func_kq(KQ);
1509
1608
  ggml_set_name(KQ, "KQ");
1510
1609
 
1511
- // KQ_scaled = KQ / sqrt(n_embd/n_head)
1512
- struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
1513
- ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
1514
-
1610
+ // KQ_scaled = KQ / sqrt(n_embd_head)
1515
1611
  // KQ_scaled shape [n_past + N, N, n_head, 1]
1516
1612
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
1517
1613
  offload_func_kq(KQ_scaled);
@@ -1530,10 +1626,10 @@ static bool llama_eval_internal(
1530
1626
  // split cached V into n_head heads
1531
1627
  struct ggml_tensor * V =
1532
1628
  ggml_view_3d(ctx0, kv_self.v,
1533
- n_past + N, n_embd/n_head, n_head,
1629
+ n_past + N, n_embd_head, n_head_kv,
1534
1630
  n_ctx*ggml_element_size(kv_self.v),
1535
- n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
1536
- il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
1631
+ n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
1632
+ n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
1537
1633
  offload_func_v(V);
1538
1634
  ggml_set_name(V, "V");
1539
1635
 
@@ -1545,7 +1641,7 @@ static bool llama_eval_internal(
1545
1641
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
1546
1642
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
1547
1643
  // is there a better way?
1548
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
1644
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
1549
1645
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
1550
1646
  #endif
1551
1647
 
@@ -1579,7 +1675,7 @@ static bool llama_eval_internal(
1579
1675
  {
1580
1676
  // norm
1581
1677
  {
1582
- cur = ggml_rms_norm(ctx0, inpFF);
1678
+ cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
1583
1679
  offload_func(cur);
1584
1680
  ggml_set_name(cur, "rms_norm_1");
1585
1681
 
@@ -1627,12 +1723,9 @@ static bool llama_eval_internal(
1627
1723
 
1628
1724
  lctx.use_buf(ctx0, 0);
1629
1725
 
1630
- // used at the end to optionally extract the embeddings
1631
- struct ggml_tensor * embeddings = NULL;
1632
-
1633
1726
  // norm
1634
1727
  {
1635
- cur = ggml_rms_norm(ctx0, inpL);
1728
+ cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
1636
1729
  offload_func_nr(cur);
1637
1730
  ggml_set_name(cur, "rms_norm_2");
1638
1731
 
@@ -1640,8 +1733,6 @@ static bool llama_eval_internal(
1640
1733
  cur = ggml_mul(ctx0, cur, model.norm);
1641
1734
  // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
1642
1735
  ggml_set_name(cur, "result_norm");
1643
-
1644
- embeddings = cur;
1645
1736
  }
1646
1737
 
1647
1738
  // lm_head
@@ -1653,18 +1744,103 @@ static bool llama_eval_internal(
1653
1744
  // logits -> probs
1654
1745
  //cur = ggml_soft_max_inplace(ctx0, cur);
1655
1746
 
1656
- // run the computation
1657
- ggml_build_forward_expand(&gf, cur);
1747
+ ggml_build_forward_expand(gf, cur);
1748
+
1749
+ if (mem_per_token == 0) {
1750
+ mem_per_token = ggml_used_mem(ctx0)/N;
1751
+ }
1752
+
1753
+ #if 0
1754
+ printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
1755
+ ggml_used_mem(ctx0)/1024.0/1024.0,
1756
+ lctx.get_buf_max_mem(0)/1024.0/1024.0,
1757
+ lctx.get_buf_max_mem(1)/1024.0/1024.0,
1758
+ lctx.work_buffer.size()/1024.0/1024.0,
1759
+ n_past, N);
1760
+ #endif
1761
+
1762
+ ggml_free(ctx0);
1763
+
1764
+ return gf;
1765
+ }
1766
+
1767
+ // evaluate the transformer
1768
+ //
1769
+ // - lctx: llama context
1770
+ // - tokens: new batch of tokens to process
1771
+ // - embd embeddings input
1772
+ // - n_tokens number of tokens
1773
+ // - n_past: the context size so far
1774
+ // - n_threads: number of threads to use
1775
+ //
1776
+ static bool llama_eval_internal(
1777
+ llama_context & lctx,
1778
+ const llama_token * tokens,
1779
+ const float * embd,
1780
+ int n_tokens,
1781
+ int n_past,
1782
+ int n_threads,
1783
+ const char * cgraph_fname) {
1784
+
1785
+ LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1786
+
1787
+ const int64_t t_start_us = ggml_time_us();
1788
+
1789
+ #ifdef GGML_USE_MPI
1790
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
1791
+ #endif
1792
+
1793
+ const int N = n_tokens;
1794
+
1795
+ const auto & model = lctx.model;
1796
+ const auto & hparams = model.hparams;
1797
+
1798
+ const auto & kv_self = lctx.kv_self;
1799
+
1800
+ LLAMA_ASSERT(!!kv_self.ctx);
1801
+
1802
+ const int64_t n_embd = hparams.n_embd;
1803
+ const int64_t n_vocab = hparams.n_vocab;
1804
+
1805
+ #ifdef LLAMA_USE_ALLOCATOR
1806
+ ggml_allocr_reset(lctx.alloc);
1807
+ #endif
1808
+
1809
+ ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
1810
+
1811
+ #ifdef LLAMA_USE_ALLOCATOR
1812
+ ggml_allocr_alloc_graph(lctx.alloc, gf);
1813
+ #endif
1814
+
1815
+ // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
1816
+
1817
+ // for big prompts, if BLAS is enabled, it is better to use only one thread
1818
+ // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
1819
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1820
+
1821
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
1822
+ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
1823
+
1824
+ LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
1825
+ LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
1658
1826
 
1659
1827
  #if GGML_USE_MPI
1660
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
1828
+ const int64_t n_layer = hparams.n_layer;
1829
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
1661
1830
  #endif
1662
1831
 
1663
1832
  #ifdef GGML_USE_METAL
1664
1833
  if (lctx.ctx_metal && N == 1) {
1834
+ // TODO: disabled until #2413 is resolved
1835
+ //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
1836
+ // ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
1837
+ //}
1665
1838
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
1666
- ggml_metal_graph_compute(lctx.ctx_metal, &gf);
1667
- ggml_metal_get_tensor (lctx.ctx_metal, cur);
1839
+ ggml_metal_graph_compute(lctx.ctx_metal, gf);
1840
+ ggml_metal_get_tensor (lctx.ctx_metal, res);
1841
+ if (!lctx.embedding.empty()) {
1842
+ ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
1843
+ }
1668
1844
  } else {
1669
1845
  // IMPORTANT:
1670
1846
  // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1682,34 +1858,32 @@ static bool llama_eval_internal(
1682
1858
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
1683
1859
  }
1684
1860
 
1685
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
1861
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
1686
1862
  }
1687
1863
  #else
1688
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
1864
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
1689
1865
  #endif
1690
1866
 
1691
1867
  #if GGML_USE_MPI
1692
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
1868
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
1693
1869
  #endif
1694
1870
 
1695
1871
  // update kv token count
1696
1872
  lctx.kv_self.n = n_past + N;
1697
1873
 
1698
- struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
1699
-
1700
1874
  if (cgraph_fname) {
1701
- ggml_graph_export(&gf, cgraph_fname);
1875
+ ggml_graph_export(gf, cgraph_fname);
1702
1876
  }
1703
1877
 
1704
1878
  #ifdef GGML_PERF
1705
1879
  // print timing information per ggml operation (for debugging purposes)
1706
1880
  // requires GGML_PERF to be defined
1707
- ggml_graph_print(&gf);
1881
+ ggml_graph_print(gf);
1708
1882
  #endif
1709
1883
 
1710
1884
  // plot the computation graph in dot format (for debugging purposes)
1711
1885
  //if (n_past%100 == 0) {
1712
- // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
1886
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
1713
1887
  //}
1714
1888
 
1715
1889
  // extract logits
@@ -1734,19 +1908,6 @@ static bool llama_eval_internal(
1734
1908
  memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
1735
1909
  }
1736
1910
 
1737
- if (mem_per_token == 0) {
1738
- mem_per_token = ggml_used_mem(ctx0)/N;
1739
- }
1740
-
1741
- #if 0
1742
- printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
1743
- ggml_used_mem(ctx0)/1024.0/1024.0,
1744
- lctx.get_buf_max_mem(0)/1024.0/1024.0,
1745
- lctx.get_buf_max_mem(1)/1024.0/1024.0);
1746
- #endif
1747
-
1748
- ggml_free(ctx0);
1749
-
1750
1911
  // measure the performance only for the single-token evals
1751
1912
  if (N == 1) {
1752
1913
  lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1858,7 +2019,9 @@ struct llama_tokenizer {
1858
2019
  if (token == vocab_.token_to_id.end()) {
1859
2020
  // output any symbols that did not form tokens as bytes.
1860
2021
  for (int j = 0; j < (int) symbol.n; ++j) {
1861
- llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
2022
+ // NOTE: old version, before #2420 - not sure what the implications of this are
2023
+ //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
2024
+ llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
1862
2025
  output.push_back(token_id);
1863
2026
  }
1864
2027
  } else {
@@ -1915,6 +2078,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
1915
2078
  return output;
1916
2079
  }
1917
2080
 
2081
+ //
2082
+ // grammar - internal
2083
+ //
2084
+
2085
+ struct llama_grammar {
2086
+ const std::vector<std::vector<llama_grammar_element>> rules;
2087
+ std::vector<std::vector<const llama_grammar_element *>> stacks;
2088
+ };
2089
+
2090
+ struct llama_grammar_candidate {
2091
+ size_t index;
2092
+ const uint32_t * code_points;
2093
+ };
2094
+
2095
+ // NOTE: assumes valid utf8 (but checks for overrun)
2096
+ // adds a terminating 0 for use as pointer
2097
+ std::vector<uint32_t> decode_utf8(const char * src) {
2098
+ static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
2099
+ const char * pos = src;
2100
+ std::vector<uint32_t> code_points;
2101
+ while (*pos != 0) {
2102
+ uint8_t first_byte = static_cast<uint8_t>(*pos);
2103
+ uint8_t highbits = first_byte >> 4;
2104
+ int len = lookup[highbits];
2105
+ uint8_t mask = (1 << (8 - len)) - 1;
2106
+ uint32_t value = first_byte & mask;
2107
+ const char * end = pos + len; // may overrun!
2108
+ ++pos;
2109
+ for ( ; pos < end && *pos != 0; ++pos) {
2110
+ value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
2111
+ }
2112
+ code_points.push_back(value);
2113
+ }
2114
+ code_points.push_back(0);
2115
+ return code_points;
2116
+ }
2117
+
2118
+ // returns true iff pos points to the end of one of the definitions of a rule
2119
+ static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
2120
+ switch (pos->type) {
2121
+ case LLAMA_GRETYPE_END: return true;
2122
+ case LLAMA_GRETYPE_ALT: return true;
2123
+ default: return false;
2124
+ }
2125
+ }
2126
+
2127
+ // returns true iff chr satisfies the char range at pos (regular or inverse range)
2128
+ // asserts that pos is pointing to a char range element
2129
+ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
2130
+ const llama_grammar_element * pos,
2131
+ const uint32_t chr) {
2132
+
2133
+ bool found = false;
2134
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
2135
+ LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
2136
+
2137
+ do {
2138
+ if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
2139
+ // inclusive range, e.g. [a-z]
2140
+ found = found || (pos->value <= chr && chr <= pos[1].value);
2141
+ pos += 2;
2142
+ } else {
2143
+ // exact char match, e.g. [a] or "a"
2144
+ found = found || pos->value == chr;
2145
+ pos += 1;
2146
+ }
2147
+ } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
2148
+
2149
+ return std::make_pair(found == is_positive_char, pos);
2150
+ }
2151
+
2152
+ // transforms a grammar pushdown stack into N possible stacks, all ending
2153
+ // at a character range (terminal element)
2154
+ static void llama_grammar_advance_stack(
2155
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2156
+ const std::vector<const llama_grammar_element *> & stack,
2157
+ std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
2158
+
2159
+ if (stack.empty()) {
2160
+ new_stacks.push_back(stack);
2161
+ return;
2162
+ }
2163
+
2164
+ const llama_grammar_element * pos = stack.back();
2165
+
2166
+ switch (pos->type) {
2167
+ case LLAMA_GRETYPE_RULE_REF: {
2168
+ const size_t rule_id = static_cast<size_t>(pos->value);
2169
+ const llama_grammar_element * subpos = rules[rule_id].data();
2170
+ do {
2171
+ // init new stack without the top (pos)
2172
+ std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
2173
+ if (!llama_grammar_is_end_of_sequence(pos + 1)) {
2174
+ // if this rule ref is followed by another element, add that to stack
2175
+ new_stack.push_back(pos + 1);
2176
+ }
2177
+ if (!llama_grammar_is_end_of_sequence(subpos)) {
2178
+ // if alternate is nonempty, add to stack
2179
+ new_stack.push_back(subpos);
2180
+ }
2181
+ llama_grammar_advance_stack(rules, new_stack, new_stacks);
2182
+ while (!llama_grammar_is_end_of_sequence(subpos)) {
2183
+ // scan to end of alternate def
2184
+ subpos++;
2185
+ }
2186
+ if (subpos->type == LLAMA_GRETYPE_ALT) {
2187
+ // there's another alternate def of this rule to process
2188
+ subpos++;
2189
+ } else {
2190
+ break;
2191
+ }
2192
+ } while (true);
2193
+ break;
2194
+ }
2195
+ case LLAMA_GRETYPE_CHAR:
2196
+ case LLAMA_GRETYPE_CHAR_NOT:
2197
+ new_stacks.push_back(stack);
2198
+ break;
2199
+ default:
2200
+ // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
2201
+ // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
2202
+ // those
2203
+ LLAMA_ASSERT(false);
2204
+ }
2205
+ }
2206
+
2207
+ // takes a set of possible pushdown stacks on a grammar, which are required to
2208
+ // be positioned at a character range (see `llama_grammar_advance_stack`), and
2209
+ // produces the N possible stacks if the given char is accepted at those
2210
+ // positions
2211
+ static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
2212
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2213
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
2214
+ const uint32_t chr) {
2215
+
2216
+ std::vector<std::vector<const llama_grammar_element *>> new_stacks;
2217
+
2218
+ for (const auto & stack : stacks) {
2219
+ if (stack.empty()) {
2220
+ continue;
2221
+ }
2222
+
2223
+ auto match = llama_grammar_match_char(stack.back(), chr);
2224
+ if (match.first) {
2225
+ const llama_grammar_element * pos = match.second;
2226
+
2227
+ // update top of stack to next element, if any
2228
+ std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
2229
+ if (!llama_grammar_is_end_of_sequence(pos)) {
2230
+ new_stack.push_back(pos);
2231
+ }
2232
+ llama_grammar_advance_stack(rules, new_stack, new_stacks);
2233
+ }
2234
+ }
2235
+
2236
+ return new_stacks;
2237
+ }
2238
+
2239
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
2240
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2241
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
2242
+ const std::vector<llama_grammar_candidate> & candidates);
2243
+
2244
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
2245
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2246
+ const std::vector<const llama_grammar_element *> & stack,
2247
+ const std::vector<llama_grammar_candidate> & candidates) {
2248
+
2249
+ std::vector<llama_grammar_candidate> rejects;
2250
+
2251
+ if (stack.empty()) {
2252
+ // accept nothing; EOS is handled elsewhere
2253
+ rejects.insert(rejects.end(), candidates.begin(), candidates.end());
2254
+ return rejects;
2255
+ }
2256
+
2257
+ const llama_grammar_element * stack_pos = stack.back();
2258
+
2259
+ std::vector<llama_grammar_candidate> next_candidates;
2260
+ for (auto tok : candidates) {
2261
+ if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
2262
+ if (tok.code_points[1] != 0) {
2263
+ next_candidates.push_back({ tok.index, tok.code_points + 1 });
2264
+ }
2265
+ } else {
2266
+ rejects.push_back(tok);
2267
+ }
2268
+ }
2269
+
2270
+ auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
2271
+
2272
+ // update top of stack to next element, if any
2273
+ std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
2274
+ if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
2275
+ stack_after.push_back(stack_pos_after);
2276
+ }
2277
+ std::vector<std::vector<const llama_grammar_element *>> next_stacks;
2278
+ llama_grammar_advance_stack(rules, stack_after, next_stacks);
2279
+
2280
+ auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
2281
+ for (auto tok : next_rejects) {
2282
+ rejects.push_back({ tok.index, tok.code_points - 1 });
2283
+ }
2284
+
2285
+ return rejects;
2286
+ }
2287
+
2288
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
2289
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2290
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
2291
+ const std::vector<llama_grammar_candidate> & candidates) {
2292
+ LLAMA_ASSERT(!stacks.empty()); // REVIEW
2293
+
2294
+ if (candidates.empty()) {
2295
+ return std::vector<llama_grammar_candidate>();
2296
+ }
2297
+
2298
+ auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
2299
+
2300
+ for (size_t i = 1, size = stacks.size(); i < size; ++i) {
2301
+ rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
2302
+ }
2303
+ return rejects;
2304
+ }
2305
+
2306
+ //
2307
+ // grammar - external
2308
+ //
2309
+
2310
+ struct llama_grammar * llama_grammar_init(
2311
+ const llama_grammar_element ** rules,
2312
+ size_t n_rules,
2313
+ size_t start_rule_index) {
2314
+ const llama_grammar_element * pos;
2315
+
2316
+ // copy rule definitions into vectors
2317
+ std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
2318
+ for (size_t i = 0; i < n_rules; i++) {
2319
+ for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
2320
+ vec_rules[i].push_back(*pos);
2321
+ }
2322
+ vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
2323
+ }
2324
+
2325
+ // loop over alternates of start rule to build initial stacks
2326
+ std::vector<std::vector<const llama_grammar_element *>> stacks;
2327
+ pos = rules[start_rule_index];
2328
+ do {
2329
+ std::vector<const llama_grammar_element *> stack;
2330
+ if (!llama_grammar_is_end_of_sequence(pos)) {
2331
+ // if alternate is nonempty, add to stack
2332
+ stack.push_back(pos);
2333
+ }
2334
+ llama_grammar_advance_stack(vec_rules, stack, stacks);
2335
+ while (!llama_grammar_is_end_of_sequence(pos)) {
2336
+ // scan to end of alternate def
2337
+ pos++;
2338
+ }
2339
+ if (pos->type == LLAMA_GRETYPE_ALT) {
2340
+ // there's another alternate def of this rule to process
2341
+ pos++;
2342
+ } else {
2343
+ break;
2344
+ }
2345
+ } while (true);
2346
+
2347
+ return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
2348
+ }
2349
+
2350
+ void llama_grammar_free(struct llama_grammar * grammar) {
2351
+ delete grammar;
2352
+ }
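
The grammar API operates on flat arrays of llama_grammar_element terminated by LLAMA_GRETYPE_END, with LLAMA_GRETYPE_ALT separating alternative definitions of a rule. A minimal, hypothetical example that builds the one-rule grammar root ::= "a" | "b":

    // root ::= "a" | "b"
    const llama_grammar_element root_rule[] = {
        { LLAMA_GRETYPE_CHAR, 'a' },
        { LLAMA_GRETYPE_ALT,  0   },
        { LLAMA_GRETYPE_CHAR, 'b' },
        { LLAMA_GRETYPE_END,  0   },
    };
    const llama_grammar_element * rules[] = { root_rule };

    llama_grammar * grammar = llama_grammar_init(rules, /*n_rules =*/ 1, /*start_rule_index =*/ 0);
    // ... constrain sampling with llama_sample_grammar() / llama_grammar_accept_token() ...
    llama_grammar_free(grammar);
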
2353
+
1918
2354
  //
1919
2355
  // sampling
1920
2356
  //
@@ -2200,6 +2636,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
2200
2636
  }
2201
2637
  }
2202
2638
 
2639
+ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
2640
+ assert(ctx);
2641
+ const int64_t t_start_sample_us = ggml_time_us();
2642
+
2643
+ bool allow_eos = false;
2644
+ for (const auto & stack : grammar->stacks) {
2645
+ if (stack.empty()) {
2646
+ allow_eos = true;
2647
+ break;
2648
+ }
2649
+ }
2650
+
2651
+ const llama_token eos = llama_token_eos();
2652
+
2653
+ std::vector<std::vector<uint32_t>> candidates_decoded;
2654
+ std::vector<llama_grammar_candidate> candidates_grammar;
2655
+
2656
+ for (size_t i = 0; i < candidates->size; ++i) {
2657
+ const llama_token id = candidates->data[i].id;
2658
+ const char * str = llama_token_to_str(ctx, id);
2659
+ if (id == eos) {
2660
+ if (!allow_eos) {
2661
+ candidates->data[i].logit = -INFINITY;
2662
+ }
2663
+ } else if (*str == 0) {
2664
+ candidates->data[i].logit = -INFINITY;
2665
+ } else {
2666
+ candidates_decoded.push_back(decode_utf8(str));
2667
+ candidates_grammar.push_back({ i, candidates_decoded.back().data() });
2668
+ }
2669
+ }
2670
+
2671
+ const auto rejects =
2672
+ llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
2673
+ for (auto & reject : rejects) {
2674
+ candidates->data[reject.index].logit = -INFINITY;
2675
+ }
2676
+
2677
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2678
+ }
2679
+
2203
2680
  static void llama_log_softmax(float * array, size_t size) {
2204
2681
  float max_l = *std::max_element(array, array + size);
2205
2682
  float sum = 0.f;
@@ -2375,6 +2852,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
2375
2852
  return result;
2376
2853
  }
2377
2854
 
2855
+ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
2856
+ const int64_t t_start_sample_us = ggml_time_us();
2857
+
2858
+ if (token == llama_token_eos()) {
2859
+ for (const auto & stack : grammar->stacks) {
2860
+ if (stack.empty()) {
2861
+ return;
2862
+ }
2863
+ }
2864
+ LLAMA_ASSERT(false);
2865
+ }
2866
+
2867
+ const char * str = llama_token_to_str(ctx, token);
2868
+ // Note terminating 0 in decoded string
2869
+ auto code_points = decode_utf8(str);
2870
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
2871
+ grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
2872
+ }
2873
+ LLAMA_ASSERT(!grammar->stacks.empty());
2874
+
2875
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2876
+ }
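
Putting the two entry points together, a sketch of one grammar-constrained sampling step (greedy sampling for brevity; ctx and grammar are assumed to exist, the other names come from llama.h):

    const int     n_vocab = llama_n_vocab(ctx);
    const float * logits  = llama_get_logits(ctx);

    std::vector<llama_token_data> cands;
    cands.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cands.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array cand_arr = { cands.data(), cands.size(), false };

    llama_sample_grammar(ctx, &cand_arr, grammar);     // sets rejected tokens' logits to -INFINITY
    const llama_token tok = llama_sample_token_greedy(ctx, &cand_arr);
    llama_grammar_accept_token(ctx, grammar, tok);     // advance the grammar stacks past tok
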
2877
+
2378
2878
  //
2379
2879
  // quantization
2380
2880
  //
@@ -2448,8 +2948,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2448
2948
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
2449
2949
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
2450
2950
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
2451
- case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
2452
- case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
2951
+ case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
2952
+ case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
2453
2953
 
2454
2954
  #ifdef GGML_USE_K_QUANTS
2455
2955
  // K-quants
@@ -2533,16 +3033,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2533
3033
  } else {
2534
3034
  new_type = quantized_type;
2535
3035
  #ifdef GGML_USE_K_QUANTS
2536
- bool convert_incompatible_tensor = false;
2537
- if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
2538
- quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
2539
- int nx = tensor.ne.at(0);
2540
- int ny = tensor.ne.at(1);
2541
- if (nx % QK_K != 0 || ny % QK_K != 0) {
2542
- fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
2543
- convert_incompatible_tensor = true;
2544
- }
2545
- }
2546
3036
  if (tensor.name == "output.weight") {
2547
3037
  int nx = tensor.ne.at(0);
2548
3038
  int ny = tensor.ne.at(1);
@@ -2568,6 +3058,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2568
3058
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2569
3059
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2570
3060
  }
3061
+ bool convert_incompatible_tensor = false;
3062
+ if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
3063
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
3064
+ int nx = tensor.ne.at(0);
3065
+ int ny = tensor.ne.at(1);
3066
+ if (nx % QK_K != 0 || ny % QK_K != 0) {
3067
+ fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
3068
+ convert_incompatible_tensor = true;
3069
+ }
3070
+ }
2571
3071
  if (convert_incompatible_tensor) {
2572
3072
  if (tensor.name == "output.weight") {
2573
3073
  new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
@@ -2594,7 +3094,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2594
3094
  f32_data = (float *) f32_conv_buf.addr;
2595
3095
  }
2596
3096
 
2597
- printf("quantizing .. ");
3097
+ printf("quantizing to %s .. ", ggml_type_name(new_type));
2598
3098
  fflush(stdout);
2599
3099
 
2600
3100
  work.resize(nelements * 4); // upper bound on size
@@ -2697,8 +3197,8 @@ struct llama_model * llama_load_model_from_file(
2697
3197
 
2698
3198
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2699
3199
 
2700
- if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
2701
- params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
3200
+ if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
3201
+ params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
2702
3202
  memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
2703
3203
  params.progress_callback_user_data)) {
2704
3204
  delete model;
@@ -2775,10 +3275,47 @@ struct llama_context * llama_new_context_with_model(
2775
3275
  ctx->embedding.resize(hparams.n_embd);
2776
3276
  }
2777
3277
 
2778
- ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
3278
+ #ifdef LLAMA_USE_ALLOCATOR
3279
+ {
3280
+ static const size_t tensor_alignment = 32;
3281
+ // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
3282
+ ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
3283
+
3284
+ // create measure allocator
3285
+ ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
3286
+
3287
+ // build worst-case graph
3288
+ int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
3289
+ int n_past = hparams.n_ctx - n_tokens;
3290
+ llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
3291
+ ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
3292
+
3293
+ // measure memory requirements for the graph
3294
+ size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
2779
3295
 
3296
+ fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
3297
+
3298
+ // debug - for comparison with scratch buffer
3299
+ //size_t prev_req =
3300
+ // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
3301
+ // MEM_REQ_SCRATCH1().at(ctx->model.type) +
3302
+ // MEM_REQ_EVAL().at(ctx->model.type);
3303
+ //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
3304
+
3305
+ // recreate allocator with exact memory requirements
3306
+ ggml_allocr_free(ctx->alloc);
3307
+
3308
+ ctx->buf_alloc.resize(alloc_size);
3309
+ ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
3310
+ }
3311
+ #else
3312
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
3313
+ #endif
3314
+
3315
+ #ifdef LLAMA_USE_SCRATCH
2780
3316
  ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
2781
3317
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
3318
+ #endif
2782
3319
  }
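
In short, the allocator path replaces the fixed scratch tables with a measure-then-allocate pass: a measure allocator sizes a worst-case graph, a real allocator is then created with exactly that much memory, and it is reset and reused on every eval. A condensed sketch of the pattern, with build_worst_case_graph() as a hypothetical stand-in for the llama_build_graph() call above:

    static const size_t tensor_alignment = 32;

    // 1) measure: the measure allocator only tracks offsets, no tensor data is touched
    ggml_allocr * measure = ggml_allocr_new_measure(tensor_alignment);
    ggml_cgraph * gf      = build_worst_case_graph(measure);            // hypothetical helper
    const size_t needed   = ggml_allocr_alloc_graph(measure, gf) + tensor_alignment;
    ggml_allocr_free(measure);

    // 2) allocate: back a real allocator with exactly the measured size
    std::vector<uint8_t> buf(needed);
    ggml_allocr * alloc = ggml_allocr_new(buf.data(), buf.size(), tensor_alignment);

    // 3) per eval: reset, rebuild the graph, allocate its tensors into the same buffer
    ggml_allocr_reset(alloc);
    // ggml_allocr_alloc_graph(alloc, llama_build_graph(...));
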
2783
3320
 
2784
3321
  #ifdef GGML_USE_METAL
@@ -2799,7 +3336,7 @@ struct llama_context * llama_new_context_with_model(
2799
3336
 
2800
3337
  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
2801
3338
 
2802
- printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
3339
+ fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
2803
3340
 
2804
3341
  #define LLAMA_METAL_CHECK_BUF(result) \
2805
3342
  if (!(result)) { \
@@ -2848,9 +3385,6 @@ struct llama_context * llama_init_from_file(
2848
3385
  }
2849
3386
 
2850
3387
  void llama_free(struct llama_context * ctx) {
2851
- if (ctx->model_owner) {
2852
- delete &ctx->model;
2853
- }
2854
3388
  delete ctx;
2855
3389
  }
2856
3390
 
@@ -3260,7 +3794,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3260
3794
  const auto & kv_self = ctx->kv_self;
3261
3795
  const auto & hparams = ctx->model.hparams;
3262
3796
  const int n_layer = hparams.n_layer;
3263
- const int n_embd = hparams.n_embd;
3797
+ const int n_embd = hparams.n_embd_gqa();
3264
3798
  const int n_ctx = hparams.n_ctx;
3265
3799
 
3266
3800
  const size_t kv_size = kv_self.buf.size;
@@ -3363,7 +3897,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
3363
3897
  const auto & kv_self = ctx->kv_self;
3364
3898
  const auto & hparams = ctx->model.hparams;
3365
3899
  const int n_layer = hparams.n_layer;
3366
- const int n_embd = hparams.n_embd;
3900
+ const int n_embd = hparams.n_embd_gqa();
3367
3901
  const int n_ctx = hparams.n_ctx;
3368
3902
 
3369
3903
  size_t kv_size;