llama_cpp 0.3.4 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -56,8 +56,14 @@
56
56
  #pragma warning(disable: 4244 4267) // possible loss of data
57
57
  #endif
58
58
 
59
+ #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
60
+ #include "ggml-alloc.h"
61
+ #define LLAMA_USE_ALLOCATOR
62
+ #else
59
63
  #define LLAMA_USE_SCRATCH
60
64
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
65
+ #endif
66
+
61
67
 
62
68
  // available llama models
63
69
  enum e_model {
@@ -67,6 +73,7 @@ enum e_model {
67
73
  MODEL_13B,
68
74
  MODEL_30B,
69
75
  MODEL_65B,
76
+ MODEL_70B,
70
77
  };
71
78
 
72
79
  static const size_t kB = 1024;
@@ -98,18 +105,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
98
105
  }
99
106
 
100
107
  //
101
- // memory sizes
108
+ // memory sizes (calculated for n_batch == 512)
102
109
  //
103
110
 
104
111
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
105
112
  {
106
113
  static std::map<e_model, size_t> k_sizes = {
107
- /* empirical scaling, still a guess */
108
- { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB },
109
- { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB },
110
- { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB },
111
- { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB },
112
- { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB },
114
+ { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
115
+ { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
116
+ { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
117
+ { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
118
+ { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
119
+ { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
113
120
  };
114
121
  return k_sizes;
115
122
  }
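The scratch0 table is recalibrated for n_batch == 512 (per the new comment above) and still scales with n_ctx. As a quick worked check of the new constants, not something this diff states, the 7B entry at an illustrative n_ctx of 2048 becomes 2048/16 + 100 = 228 MB, down from 2048/16 + 256 = 384 MB with the old guess:

    // Worked example only; MB matches the 1024*1024 constant used by llama.cpp.
    #include <cstddef>
    #include <cstdio>
    int main() {
        const size_t MB     = 1024*1024;
        const size_t n_ctx  = 2048;                   // illustrative context size
        const size_t old_7b = (n_ctx/16 + 256) * MB;  // 0.3.4 table entry
        const size_t new_7b = (n_ctx/16 + 100) * MB;  // 0.3.6 table entry
        printf("MODEL_7B scratch0: old %zu MB, new %zu MB\n", old_7b/MB, new_7b/MB);
        return 0;
    }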
@@ -117,38 +124,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
117
124
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
118
125
  {
119
126
  static std::map<e_model, size_t> k_sizes = {
120
- { MODEL_3B, 256ull * MB },
121
- { MODEL_7B, 512ull * MB },
122
- { MODEL_13B, 512ull * MB },
123
- { MODEL_30B, 512ull * MB },
124
- { MODEL_65B, 1024ull * MB },
125
- };
126
- return k_sizes;
127
- }
128
-
129
- // 2*n_embd*n_ctx*n_layer*sizeof(float16)
130
- static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
131
- {
132
- static std::map<e_model, size_t> k_sizes = {
133
- { MODEL_3B, 682ull * MB },
134
- { MODEL_7B, 1026ull * MB },
135
- { MODEL_13B, 1608ull * MB },
136
- { MODEL_30B, 3124ull * MB },
137
- { MODEL_65B, 5120ull * MB },
127
+ { MODEL_3B, 128ull * MB },
128
+ { MODEL_7B, 160ull * MB },
129
+ { MODEL_13B, 192ull * MB },
130
+ { MODEL_30B, 256ull * MB },
131
+ { MODEL_65B, 384ull * MB }, // guess
132
+ { MODEL_70B, 304ull * MB },
138
133
  };
139
134
  return k_sizes;
140
135
  }
141
136
 
142
- // this is mostly needed for temporary mul_mat buffers to dequantize the data
143
- // not actually needed if BLAS is disabled
144
- static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
137
+ // used to store the compute graph tensors + non-scratch data
138
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
145
139
  {
146
140
  static std::map<e_model, size_t> k_sizes = {
147
- { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
148
- { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
149
- { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
150
- { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
151
- { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
141
+ { MODEL_3B, 8ull * MB },
142
+ { MODEL_7B, 10ull * MB },
143
+ { MODEL_13B, 12ull * MB },
144
+ { MODEL_30B, 16ull * MB },
145
+ { MODEL_65B, 24ull * MB }, // guess
146
+ { MODEL_70B, 24ull * MB },
152
147
  };
153
148
  return k_sizes;
154
149
  }
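With the allocator path enabled, MEM_REQ_EVAL no longer has to cover intermediate tensor data, only the ggml tensor and graph bookkeeping, which is why the 7B entry drops from (n_ctx/256 + 768) MB to a flat 10 MB; the data itself now lives in the allocator buffer that is measured in llama_new_context_with_model further down. The old MEM_REQ_KV_SELF table disappears because the same quantity is now computed exactly by the new hparams.kv_size() helper.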
@@ -163,6 +158,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
163
158
  { MODEL_13B, 640ull * kB },
164
159
  { MODEL_30B, 768ull * kB },
165
160
  { MODEL_65B, 1536ull * kB },
161
+ { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
166
162
  };
167
163
  return k_sizes;
168
164
  }
@@ -177,19 +173,26 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
177
173
  { MODEL_13B, 160ull },
178
174
  { MODEL_30B, 208ull },
179
175
  { MODEL_65B, 416ull },
176
+ { MODEL_70B, 416ull }, // TODO (likely can be reduced)
180
177
  };
181
178
  return k_sizes;
182
179
  }
183
180
 
184
181
  // default hparams (LLaMA 7B)
185
182
  struct llama_hparams {
186
- uint32_t n_vocab = 32000;
187
- uint32_t n_ctx = 512; // this is provided as user input?
188
- uint32_t n_embd = 4096;
189
- uint32_t n_mult = 256;
190
- uint32_t n_head = 32;
191
- uint32_t n_layer = 32;
192
- uint32_t n_rot = 64;
183
+ uint32_t n_vocab = 32000;
184
+ uint32_t n_ctx = 512; // this is provided as user input?
185
+ uint32_t n_embd = 4096;
186
+ uint32_t n_mult = 256;
187
+ uint32_t n_head = 32;
188
+ uint32_t n_head_kv = 32;
189
+ uint32_t n_layer = 32;
190
+ uint32_t n_rot = 64;
191
+
192
+ // LLaMAv2
193
+ // TODO: load from model data hparams
194
+ float f_ffn_mult = 1.0f;
195
+ float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
193
196
 
194
197
  float rope_freq_base = 10000.0f;
195
198
  float rope_freq_scale = 1.0f;
@@ -197,7 +200,28 @@ struct llama_hparams {
197
200
  enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
198
201
 
199
202
  bool operator!=(const llama_hparams & other) const {
200
- return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
203
+ return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
204
+ }
205
+
206
+ uint32_t n_gqa() const {
207
+ return n_head/n_head_kv;
208
+ }
209
+
210
+ uint32_t n_embd_head() const {
211
+ return n_embd/n_head;
212
+ }
213
+
214
+ uint32_t n_embd_gqa() const {
215
+ return n_embd/n_gqa();
216
+ }
217
+
218
+ size_t kv_size() const {
219
+ size_t result = 2ull;
220
+ result *= (size_t) n_embd_gqa();
221
+ result *= (size_t) n_ctx;
222
+ result *= (size_t) n_layer;
223
+ result *= sizeof(ggml_fp16_t);
224
+ return result;
201
225
  }
202
226
  };
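The new helpers express the KV-cache footprint in terms of the KV heads rather than assuming full multi-head attention. With the defaults shown above (n_embd = 4096, n_head = n_head_kv = 32, n_layer = 32, n_ctx = 512), n_gqa() is 1, n_embd_gqa() is 4096 and kv_size() is 256 MiB, i.e. identical to the old behaviour; grouped-query models shrink the cache by the GQA factor. A minimal sketch of the arithmetic, where the 70B numbers (n_embd = 8192, n_head = 64, n_head_kv = 8, n_layer = 80) are assumed from the published LLaMA-2 configuration rather than read from this file:

    #include <cstdint>
    #include <cstdio>
    // Mirrors llama_hparams::kv_size(); sizeof(uint16_t) stands in for sizeof(ggml_fp16_t).
    static unsigned long long kv_size(uint32_t n_embd, uint32_t n_head, uint32_t n_head_kv,
                                      uint32_t n_ctx, uint32_t n_layer) {
        const uint32_t n_gqa      = n_head / n_head_kv;
        const uint32_t n_embd_gqa = n_embd / n_gqa;
        return 2ull * n_embd_gqa * n_ctx * n_layer * sizeof(uint16_t); // K and V, fp16
    }
    int main() {
        printf("7B  @ n_ctx=512 : %llu MiB\n", kv_size(4096, 32, 32,  512, 32) / (1024*1024)); // 256
        printf("70B @ n_ctx=4096: %llu MiB\n", kv_size(8192, 64,  8, 4096, 80) / (1024*1024)); // 1280
        return 0;
    }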
203
227
 
@@ -309,13 +333,22 @@ struct llama_model {
309
333
 
310
334
  struct llama_context {
311
335
  llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
312
- #ifdef GGML_USE_METAL
313
336
  ~llama_context() {
337
+ if (model_owner) {
338
+ delete &model;
339
+ }
340
+ #ifdef GGML_USE_METAL
314
341
  if (ctx_metal) {
315
342
  ggml_metal_free(ctx_metal);
316
343
  }
317
- }
318
344
  #endif
345
+ #ifdef LLAMA_USE_ALLOCATOR
346
+ if (alloc) {
347
+ ggml_allocr_free(alloc);
348
+ }
349
+ #endif
350
+ }
351
+
319
352
  std::mt19937 rng;
320
353
 
321
354
  bool has_evaluated_once = false;
@@ -353,7 +386,17 @@ struct llama_context {
353
386
  // memory buffers used to evaluate the model
354
387
  // TODO: move in llama_state
355
388
  llama_ctx_buffer buf_compute;
389
+
390
+ #ifdef LLAMA_USE_ALLOCATOR
391
+ llama_ctx_buffer buf_alloc;
392
+ ggml_allocr * alloc = NULL;
393
+ #endif
394
+
395
+ #ifdef LLAMA_USE_SCRATCH
356
396
  llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
397
+ int buf_last = 0;
398
+ size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
399
+ #endif
357
400
 
358
401
  #ifdef GGML_USE_METAL
359
402
  ggml_metal_context * ctx_metal = NULL;
@@ -363,9 +406,6 @@ struct llama_context {
363
406
  ggml_mpi_context * ctx_mpi = NULL;
364
407
  #endif
365
408
 
366
- int buf_last = 0;
367
- size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
368
-
369
409
  void use_buf(struct ggml_context * ctx, int i) {
370
410
  #if defined(LLAMA_USE_SCRATCH)
371
411
  size_t last_size = 0;
@@ -499,12 +539,16 @@ struct llama_file_loader {
499
539
  }
500
540
  void read_hparams() {
501
541
  hparams.n_vocab = file.read_u32();
502
- hparams.n_embd = file.read_u32();
503
- hparams.n_mult = file.read_u32();
504
- hparams.n_head = file.read_u32();
542
+ hparams.n_embd = file.read_u32();
543
+ hparams.n_mult = file.read_u32();
544
+ hparams.n_head = file.read_u32();
505
545
  hparams.n_layer = file.read_u32();
506
- hparams.n_rot = file.read_u32();
507
- hparams.ftype = (enum llama_ftype) file.read_u32();
546
+ hparams.n_rot = file.read_u32();
547
+ hparams.ftype = (enum llama_ftype) file.read_u32();
548
+
549
+ // LLaMAv2
550
+ // TODO: read from header
551
+ hparams.n_head_kv = hparams.n_head;
508
552
  }
509
553
  void read_vocab() {
510
554
  vocab.id_to_token.resize(hparams.n_vocab);
@@ -803,7 +847,7 @@ static bool kv_cache_init(
803
847
  ggml_type wtype,
804
848
  int n_ctx,
805
849
  int n_gpu_layers) {
806
- const int n_embd = hparams.n_embd;
850
+ const int n_embd = hparams.n_embd_gqa();
807
851
  const int n_layer = hparams.n_layer;
808
852
 
809
853
  const int64_t n_mem = n_layer*n_ctx;
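Sizing the cache with n_embd_gqa() means K and V rows exist only for the KV heads. As a rough check (my numbers, not from the diff): the default 7B shape costs 2 * 4096 * 32 * 2 bytes = 512 KiB of fp16 cache per token, while a GQA model with n_gqa == 8 and the assumed 70B shape (n_embd = 8192, n_layer = 80) needs 2 * 1024 * 80 * 2 bytes = 320 KiB per token instead of the 2.5 MiB a full-head cache would take.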
@@ -847,6 +891,8 @@ struct llama_context_params llama_context_default_params() {
847
891
  /*.seed =*/ LLAMA_DEFAULT_SEED,
848
892
  /*.n_ctx =*/ 512,
849
893
  /*.n_batch =*/ 512,
894
+ /*.n_gqa =*/ 1,
895
+ /*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
850
896
  /*.gpu_layers =*/ 0,
851
897
  /*.main_gpu =*/ 0,
852
898
  /*.tensor_split =*/ nullptr,
@@ -855,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
855
901
  /*.progress_callback =*/ nullptr,
856
902
  /*.progress_callback_user_data =*/ nullptr,
857
903
  /*.low_vram =*/ false,
904
+ /*.mul_mat_q =*/ false,
858
905
  /*.f16_kv =*/ true,
859
906
  /*.logits_all =*/ false,
860
907
  /*.vocab_only =*/ false,
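The three new fields (n_gqa, rms_norm_eps, mul_mat_q) are plain pass-throughs to the loader and the CUDA backend below. A minimal usage sketch, assuming the llama.h shipped with this version and a placeholder model path; the n_gqa = 8 and 1e-5 values are what the 70B loader further down expects, and LLaMA-2 models are trained with an RMS-norm epsilon of 1e-5:

    #include "llama.h"

    // Sketch only: the path is a placeholder, not something defined by this diff.
    static struct llama_context * load_llama2_70b(void) {
        struct llama_context_params params = llama_context_default_params();
        params.n_ctx        = 4096;
        params.n_gqa        = 8;      // grouped-query factor; triggers the 70B detection below
        params.rms_norm_eps = 1e-5f;  // LLaMA-2 norm epsilon
        params.mul_mat_q    = true;   // route mat-muls through the new quantized CUDA kernels
        struct llama_model * model = llama_load_model_from_file(
            "models/llama-2-70b.ggmlv3.q4_0.bin", params);
        if (!model) {
            return NULL;
        }
        return llama_new_context_with_model(model, params);
    }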
@@ -966,6 +1013,7 @@ static const char *llama_model_type_name(e_model type) {
966
1013
  case MODEL_13B: return "13B";
967
1014
  case MODEL_30B: return "30B";
968
1015
  case MODEL_65B: return "65B";
1016
+ case MODEL_70B: return "70B";
969
1017
  default: LLAMA_ASSERT(false);
970
1018
  }
971
1019
  }
@@ -976,9 +1024,12 @@ static void llama_model_load_internal(
976
1024
  llama_vocab & vocab,
977
1025
  int n_ctx,
978
1026
  int n_batch,
1027
+ int n_gqa,
1028
+ float rms_norm_eps,
979
1029
  int n_gpu_layers,
980
1030
  int main_gpu,
981
1031
  const float * tensor_split,
1032
+ const bool mul_mat_q,
982
1033
  float rope_freq_base,
983
1034
  float rope_freq_scale,
984
1035
  bool low_vram,
@@ -997,8 +1048,12 @@ static void llama_model_load_internal(
997
1048
  model.hparams = ml->file_loader->hparams;
998
1049
  model.n_gpu_layers = n_gpu_layers;
999
1050
  llama_file_version file_version = ml->file_loader->file_version;
1051
+
1000
1052
  auto & hparams = model.hparams;
1001
1053
 
1054
+ // TODO: read from file
1055
+ hparams.f_rms_norm_eps = rms_norm_eps;
1056
+
1002
1057
  {
1003
1058
  switch (hparams.n_layer) {
1004
1059
  case 26: model.type = e_model::MODEL_3B; break;
@@ -1016,11 +1071,25 @@ static void llama_model_load_internal(
1016
1071
 
1017
1072
  hparams.n_ctx = n_ctx;
1018
1073
 
1074
+ // LLaMAv2
1075
+ // TODO: temporary until GGUF
1076
+ LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
1077
+ hparams.n_head_kv = hparams.n_head / n_gqa;
1078
+ if (model.type == e_model::MODEL_65B && n_gqa == 8) {
1079
+ fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
1080
+ model.type = e_model::MODEL_70B;
1081
+ hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
1082
+ }
1083
+
1019
1084
  hparams.rope_freq_base = rope_freq_base;
1020
1085
  hparams.rope_freq_scale = rope_freq_scale;
1021
1086
  }
1022
1087
 
1023
- const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
1088
+ // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
1089
+ const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
1090
+ const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
1091
+ const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
1092
+ //const uint32_t n_ff = 28672;
1024
1093
 
1025
1094
  {
1026
1095
  fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
@@ -1029,12 +1098,15 @@ static void llama_model_load_internal(
1029
1098
  fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
1030
1099
  fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
1031
1100
  fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
1101
+ fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
1032
1102
  fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
1033
- fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
1103
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
1104
+ fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
1105
+ fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
1106
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
1034
1107
  fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
1035
1108
  fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1036
1109
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
1037
- fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
1038
1110
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
1039
1111
  }
1040
1112
 
@@ -1069,7 +1141,7 @@ static void llama_model_load_internal(
1069
1141
  {
1070
1142
  model.buf.resize(ctx_size);
1071
1143
  if (use_mlock) {
1072
- model.mlock_buf.init(model.buf.addr);
1144
+ model.mlock_buf.init (model.buf.addr);
1073
1145
  model.mlock_buf.grow_to(model.buf.size);
1074
1146
  }
1075
1147
 
@@ -1086,9 +1158,11 @@ static void llama_model_load_internal(
1086
1158
  }
1087
1159
 
1088
1160
  (void) main_gpu;
1161
+ (void) mul_mat_q;
1089
1162
  #if defined(GGML_USE_CUBLAS)
1090
1163
  fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
1091
1164
  ggml_cuda_set_main_device(main_gpu);
1165
+ ggml_cuda_set_mul_mat_q(mul_mat_q);
1092
1166
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1093
1167
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1094
1168
  #elif defined(GGML_USE_CLBLAST)
@@ -1104,9 +1178,10 @@ static void llama_model_load_internal(
1104
1178
  size_t vram_weights = 0;
1105
1179
  size_t vram_scratch = 0;
1106
1180
  {
1107
- const uint32_t n_embd = hparams.n_embd;
1108
- const uint32_t n_layer = hparams.n_layer;
1109
- const uint32_t n_vocab = hparams.n_vocab;
1181
+ const uint32_t n_embd = hparams.n_embd;
1182
+ const uint32_t n_embd_gqa = hparams.n_embd_gqa();
1183
+ const uint32_t n_layer = hparams.n_layer;
1184
+ const uint32_t n_vocab = hparams.n_vocab;
1110
1185
 
1111
1186
  ml->ggml_ctx = ctx;
1112
1187
 
@@ -1154,16 +1229,16 @@ static void llama_model_load_internal(
1154
1229
 
1155
1230
  layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
1156
1231
 
1157
- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
1158
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
1159
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
1160
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
1232
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
1233
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
1234
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
1235
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
1161
1236
 
1162
1237
  layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
1163
1238
 
1164
- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
1165
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
1166
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
1239
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
1240
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
1241
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
1167
1242
 
1168
1243
  if (backend == GGML_BACKEND_GPU) {
1169
1244
  vram_weights +=
@@ -1181,16 +1256,20 @@ static void llama_model_load_internal(
1181
1256
  const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
1182
1257
 
1183
1258
  // this is the total memory required to run the inference
1184
- const size_t mem_required =
1259
+ size_t mem_required =
1185
1260
  ctx_size +
1186
- mmapped_size - vram_weights + // weights in VRAM not in memory
1261
+ mmapped_size - vram_weights; // weights in VRAM not in memory
1262
+
1263
+ #ifndef LLAMA_USE_ALLOCATOR
1264
+ mem_required +=
1187
1265
  MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
1188
1266
  MEM_REQ_SCRATCH1().at(model.type) +
1189
- MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
1267
+ MEM_REQ_EVAL().at(model.type);
1268
+ #endif
1190
1269
 
1191
1270
  // this is the memory required by one llama_state
1192
1271
  const size_t mem_required_state =
1193
- scale*MEM_REQ_KV_SELF().at(model.type);
1272
+ scale*hparams.kv_size();
1194
1273
 
1195
1274
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
1196
1275
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1231,7 +1310,7 @@ static void llama_model_load_internal(
1231
1310
  fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
1232
1311
  } else {
1233
1312
  fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
1234
- vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1313
+ vram_kv_cache += hparams.kv_size() / 2;
1235
1314
  }
1236
1315
  }
1237
1316
  if (n_gpu_layers > (int) hparams.n_layer + 2) {
@@ -1239,7 +1318,7 @@ static void llama_model_load_internal(
1239
1318
  fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
1240
1319
  } else {
1241
1320
  fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
1242
- vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1321
+ vram_kv_cache += hparams.kv_size() / 2;
1243
1322
  }
1244
1323
  }
1245
1324
  #elif defined(GGML_USE_CLBLAST)
@@ -1287,9 +1366,12 @@ static bool llama_model_load(
1287
1366
  llama_vocab & vocab,
1288
1367
  int n_ctx,
1289
1368
  int n_batch,
1369
+ int n_gqa,
1370
+ float rms_norm_eps,
1290
1371
  int n_gpu_layers,
1291
1372
  int main_gpu,
1292
1373
  const float * tensor_split,
1374
+ const bool mul_mat_q,
1293
1375
  float rope_freq_base,
1294
1376
  float rope_freq_scale,
1295
1377
  bool low_vram,
@@ -1300,7 +1382,8 @@ static bool llama_model_load(
1300
1382
  llama_progress_callback progress_callback,
1301
1383
  void *progress_callback_user_data) {
1302
1384
  try {
1303
- llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
1385
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
1386
+ main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
1304
1387
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1305
1388
  return true;
1306
1389
  } catch (const std::exception & err) {
@@ -1309,32 +1392,15 @@ static bool llama_model_load(
1309
1392
  }
1310
1393
  }
1311
1394
 
1312
- // evaluate the transformer
1313
- //
1314
- // - lctx: llama context
1315
- // - tokens: new batch of tokens to process
1316
- // - embd embeddings input
1317
- // - n_tokens number of tokens
1318
- // - n_past: the context size so far
1319
- // - n_threads: number of threads to use
1320
- //
1321
- static bool llama_eval_internal(
1395
+ static struct ggml_cgraph * llama_build_graph(
1322
1396
  llama_context & lctx,
1323
1397
  const llama_token * tokens,
1324
1398
  const float * embd,
1325
1399
  int n_tokens,
1326
- int n_past,
1327
- int n_threads,
1328
- const char * cgraph_fname) {
1400
+ int n_past) {
1329
1401
 
1330
1402
  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1331
1403
 
1332
- #ifdef GGML_USE_MPI
1333
- ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
1334
- #endif
1335
-
1336
- const int64_t t_start_us = ggml_time_us();
1337
-
1338
1404
  const int N = n_tokens;
1339
1405
 
1340
1406
  const auto & model = lctx.model;
@@ -1344,40 +1410,54 @@ static bool llama_eval_internal(
1344
1410
 
1345
1411
  LLAMA_ASSERT(!!kv_self.ctx);
1346
1412
 
1347
- const int n_embd = hparams.n_embd;
1348
- const int n_layer = hparams.n_layer;
1349
- const int n_ctx = hparams.n_ctx;
1350
- const int n_head = hparams.n_head;
1351
- const int n_vocab = hparams.n_vocab;
1352
- const int n_rot = hparams.n_embd/hparams.n_head;
1353
- const int n_gpu_layers = model.n_gpu_layers;
1413
+ const int64_t n_embd = hparams.n_embd;
1414
+ const int64_t n_layer = hparams.n_layer;
1415
+ const int64_t n_ctx = hparams.n_ctx;
1416
+ const int64_t n_head = hparams.n_head;
1417
+ const int64_t n_head_kv = hparams.n_head_kv;
1418
+ const int64_t n_embd_head = hparams.n_embd_head();
1419
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
1420
+
1421
+ LLAMA_ASSERT(n_embd_head == hparams.n_rot);
1354
1422
 
1355
1423
  const float freq_base = hparams.rope_freq_base;
1356
1424
  const float freq_scale = hparams.rope_freq_scale;
1425
+ const float rms_norm_eps = hparams.f_rms_norm_eps;
1426
+
1427
+ const int n_gpu_layers = model.n_gpu_layers;
1357
1428
 
1358
1429
  auto & mem_per_token = lctx.mem_per_token;
1359
1430
  auto & buf_compute = lctx.buf_compute;
1360
1431
 
1432
+
1361
1433
  struct ggml_init_params params = {
1362
1434
  /*.mem_size =*/ buf_compute.size,
1363
1435
  /*.mem_buffer =*/ buf_compute.addr,
1364
1436
  /*.no_alloc =*/ false,
1365
1437
  };
1366
1438
 
1367
- struct ggml_context * ctx0 = ggml_init(params);
1439
+ #ifdef LLAMA_USE_ALLOCATOR
1440
+ params.no_alloc = true;
1441
+ #endif
1368
1442
 
1369
- ggml_cgraph gf = {};
1443
+ struct ggml_context * ctx0 = ggml_init(params);
1370
1444
 
1371
- // for big prompts, if BLAS is enabled, it is better to use only one thread
1372
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
1373
- n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1445
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
1374
1446
 
1375
1447
  struct ggml_tensor * cur;
1376
1448
  struct ggml_tensor * inpL;
1377
1449
 
1378
1450
  if (tokens) {
1379
1451
  struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1452
+
1453
+ #ifdef LLAMA_USE_ALLOCATOR
1454
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
1455
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1456
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
1457
+ }
1458
+ #else
1380
1459
  memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
1460
+ #endif
1381
1461
  ggml_set_name(inp_tokens, "inp_tokens");
1382
1462
 
1383
1463
  inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1387,7 +1467,15 @@ static bool llama_eval_internal(
1387
1467
  #endif
1388
1468
 
1389
1469
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
1470
+
1471
+ #ifdef LLAMA_USE_ALLOCATOR
1472
+ ggml_allocr_alloc(lctx.alloc, inpL);
1473
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1474
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1475
+ }
1476
+ #else
1390
1477
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1478
+ #endif
1391
1479
  }
1392
1480
 
1393
1481
  const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1414,6 +1502,17 @@ static bool llama_eval_internal(
1414
1502
  }
1415
1503
  #endif // GGML_USE_CUBLAS
1416
1504
 
1505
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
1506
+ #ifdef LLAMA_USE_ALLOCATOR
1507
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
1508
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1509
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
1510
+ }
1511
+ #else
1512
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
1513
+ #endif
1514
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
1515
+
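KQ_scale is now created once as a one-element graph input instead of via ggml_new_f32 inside the attention block (see the removal further down), so the allocator can place it like any other tensor, and it is only written outside the measuring pass. Its value is unchanged: 1/sqrt(n_embd/n_head), about 0.088 for the default 7B shape (head size 4096/32 = 128).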
1417
1516
  for (int il = 0; il < n_layer; ++il) {
1418
1517
  ggml_format_name(inpL, "layer_inp_%d", il);
1419
1518
 
@@ -1431,7 +1530,7 @@ static bool llama_eval_internal(
1431
1530
 
1432
1531
  // norm
1433
1532
  {
1434
- cur = ggml_rms_norm(ctx0, inpL);
1533
+ cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
1435
1534
  offload_func(cur);
1436
1535
  ggml_set_name(cur, "rms_norm_0");
1437
1536
 
@@ -1452,11 +1551,11 @@ static bool llama_eval_internal(
1452
1551
  offload_func_kq(tmpq);
1453
1552
  ggml_set_name(tmpq, "tmpq");
1454
1553
 
1455
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
1554
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
1456
1555
  offload_func_kq(Kcur);
1457
1556
  ggml_set_name(Kcur, "Kcur");
1458
1557
 
1459
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
1558
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
1460
1559
  offload_func_kq(Qcur);
1461
1560
  ggml_set_name(Qcur, "Qcur");
1462
1561
 
@@ -1468,23 +1567,23 @@ static bool llama_eval_internal(
1468
1567
  offload_func_v(tmpv);
1469
1568
  ggml_set_name(tmpv, "tmpv");
1470
1569
 
1471
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
1570
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
1472
1571
  offload_func_v(Vcur);
1473
1572
  ggml_set_name(Vcur, "Vcur");
1474
1573
 
1475
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
1574
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
1476
1575
  offload_func_kq(k);
1477
1576
  ggml_set_name(k, "k");
1478
1577
 
1479
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
1578
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
1480
1579
  ( n_ctx)*ggml_element_size(kv_self.v),
1481
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
1580
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
1482
1581
  offload_func_v(v);
1483
1582
  ggml_set_name(v, "v");
1484
1583
 
1485
1584
  // important: storing RoPE-ed version of K in the KV cache!
1486
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
1487
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
1585
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
1586
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
1488
1587
  }
1489
1588
 
1490
1589
  struct ggml_tensor * Q =
@@ -1497,8 +1596,8 @@ static bool llama_eval_internal(
1497
1596
  struct ggml_tensor * K =
1498
1597
  ggml_permute(ctx0,
1499
1598
  ggml_reshape_3d(ctx0,
1500
- ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
1501
- n_embd/n_head, n_head, n_past + N),
1599
+ ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
1600
+ n_embd_head, n_head_kv, n_past + N),
1502
1601
  0, 2, 1, 3);
1503
1602
  offload_func_kq(K);
1504
1603
  ggml_set_name(K, "K");
@@ -1508,10 +1607,7 @@ static bool llama_eval_internal(
1508
1607
  offload_func_kq(KQ);
1509
1608
  ggml_set_name(KQ, "KQ");
1510
1609
 
1511
- // KQ_scaled = KQ / sqrt(n_embd/n_head)
1512
- struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
1513
- ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
1514
-
1610
+ // KQ_scaled = KQ / sqrt(n_embd_head)
1515
1611
  // KQ_scaled shape [n_past + N, N, n_head, 1]
1516
1612
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
1517
1613
  offload_func_kq(KQ_scaled);
@@ -1530,10 +1626,10 @@ static bool llama_eval_internal(
1530
1626
  // split cached V into n_head heads
1531
1627
  struct ggml_tensor * V =
1532
1628
  ggml_view_3d(ctx0, kv_self.v,
1533
- n_past + N, n_embd/n_head, n_head,
1629
+ n_past + N, n_embd_head, n_head_kv,
1534
1630
  n_ctx*ggml_element_size(kv_self.v),
1535
- n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
1536
- il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
1631
+ n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
1632
+ n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
1537
1633
  offload_func_v(V);
1538
1634
  ggml_set_name(V, "V");
1539
1635
 
@@ -1545,7 +1641,7 @@ static bool llama_eval_internal(
1545
1641
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
1546
1642
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
1547
1643
  // is there a better way?
1548
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
1644
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
1549
1645
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
1550
1646
  #endif
1551
1647
 
@@ -1579,7 +1675,7 @@ static bool llama_eval_internal(
1579
1675
  {
1580
1676
  // norm
1581
1677
  {
1582
- cur = ggml_rms_norm(ctx0, inpFF);
1678
+ cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
1583
1679
  offload_func(cur);
1584
1680
  ggml_set_name(cur, "rms_norm_1");
1585
1681
 
@@ -1627,12 +1723,9 @@ static bool llama_eval_internal(
1627
1723
 
1628
1724
  lctx.use_buf(ctx0, 0);
1629
1725
 
1630
- // used at the end to optionally extract the embeddings
1631
- struct ggml_tensor * embeddings = NULL;
1632
-
1633
1726
  // norm
1634
1727
  {
1635
- cur = ggml_rms_norm(ctx0, inpL);
1728
+ cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
1636
1729
  offload_func_nr(cur);
1637
1730
  ggml_set_name(cur, "rms_norm_2");
1638
1731
 
@@ -1640,8 +1733,6 @@ static bool llama_eval_internal(
1640
1733
  cur = ggml_mul(ctx0, cur, model.norm);
1641
1734
  // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
1642
1735
  ggml_set_name(cur, "result_norm");
1643
-
1644
- embeddings = cur;
1645
1736
  }
1646
1737
 
1647
1738
  // lm_head
@@ -1653,18 +1744,103 @@ static bool llama_eval_internal(
1653
1744
  // logits -> probs
1654
1745
  //cur = ggml_soft_max_inplace(ctx0, cur);
1655
1746
 
1656
- // run the computation
1657
- ggml_build_forward_expand(&gf, cur);
1747
+ ggml_build_forward_expand(gf, cur);
1748
+
1749
+ if (mem_per_token == 0) {
1750
+ mem_per_token = ggml_used_mem(ctx0)/N;
1751
+ }
1752
+
1753
+ #if 0
1754
+ printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
1755
+ ggml_used_mem(ctx0)/1024.0/1024.0,
1756
+ lctx.get_buf_max_mem(0)/1024.0/1024.0,
1757
+ lctx.get_buf_max_mem(1)/1024.0/1024.0,
1758
+ lctx.work_buffer.size()/1024.0/1024.0,
1759
+ n_past, N);
1760
+ #endif
1761
+
1762
+ ggml_free(ctx0);
1763
+
1764
+ return gf;
1765
+ }
1766
+
1767
+ // evaluate the transformer
1768
+ //
1769
+ // - lctx: llama context
1770
+ // - tokens: new batch of tokens to process
1771
+ // - embd embeddings input
1772
+ // - n_tokens number of tokens
1773
+ // - n_past: the context size so far
1774
+ // - n_threads: number of threads to use
1775
+ //
1776
+ static bool llama_eval_internal(
1777
+ llama_context & lctx,
1778
+ const llama_token * tokens,
1779
+ const float * embd,
1780
+ int n_tokens,
1781
+ int n_past,
1782
+ int n_threads,
1783
+ const char * cgraph_fname) {
1784
+
1785
+ LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1786
+
1787
+ const int64_t t_start_us = ggml_time_us();
1788
+
1789
+ #ifdef GGML_USE_MPI
1790
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
1791
+ #endif
1792
+
1793
+ const int N = n_tokens;
1794
+
1795
+ const auto & model = lctx.model;
1796
+ const auto & hparams = model.hparams;
1797
+
1798
+ const auto & kv_self = lctx.kv_self;
1799
+
1800
+ LLAMA_ASSERT(!!kv_self.ctx);
1801
+
1802
+ const int64_t n_embd = hparams.n_embd;
1803
+ const int64_t n_vocab = hparams.n_vocab;
1804
+
1805
+ #ifdef LLAMA_USE_ALLOCATOR
1806
+ ggml_allocr_reset(lctx.alloc);
1807
+ #endif
1808
+
1809
+ ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
1810
+
1811
+ #ifdef LLAMA_USE_ALLOCATOR
1812
+ ggml_allocr_alloc_graph(lctx.alloc, gf);
1813
+ #endif
1814
+
1815
+ // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
1816
+
1817
+ // for big prompts, if BLAS is enabled, it is better to use only one thread
1818
+ // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
1819
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1820
+
1821
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
1822
+ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
1823
+
1824
+ LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
1825
+ LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
1658
1826
 
1659
1827
  #if GGML_USE_MPI
1660
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
1828
+ const int64_t n_layer = hparams.n_layer;
1829
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
1661
1830
  #endif
1662
1831
 
1663
1832
  #ifdef GGML_USE_METAL
1664
1833
  if (lctx.ctx_metal && N == 1) {
1834
+ // TODO: disabled until #2413 is resolved
1835
+ //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
1836
+ // ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
1837
+ //}
1665
1838
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
1666
- ggml_metal_graph_compute(lctx.ctx_metal, &gf);
1667
- ggml_metal_get_tensor (lctx.ctx_metal, cur);
1839
+ ggml_metal_graph_compute(lctx.ctx_metal, gf);
1840
+ ggml_metal_get_tensor (lctx.ctx_metal, res);
1841
+ if (!lctx.embedding.empty()) {
1842
+ ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
1843
+ }
1668
1844
  } else {
1669
1845
  // IMPORTANT:
1670
1846
  // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1682,34 +1858,32 @@ static bool llama_eval_internal(
1682
1858
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
1683
1859
  }
1684
1860
 
1685
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
1861
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
1686
1862
  }
1687
1863
  #else
1688
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
1864
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
1689
1865
  #endif
1690
1866
 
1691
1867
  #if GGML_USE_MPI
1692
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
1868
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
1693
1869
  #endif
1694
1870
 
1695
1871
  // update kv token count
1696
1872
  lctx.kv_self.n = n_past + N;
1697
1873
 
1698
- struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
1699
-
1700
1874
  if (cgraph_fname) {
1701
- ggml_graph_export(&gf, cgraph_fname);
1875
+ ggml_graph_export(gf, cgraph_fname);
1702
1876
  }
1703
1877
 
1704
1878
  #ifdef GGML_PERF
1705
1879
  // print timing information per ggml operation (for debugging purposes)
1706
1880
  // requires GGML_PERF to be defined
1707
- ggml_graph_print(&gf);
1881
+ ggml_graph_print(gf);
1708
1882
  #endif
1709
1883
 
1710
1884
  // plot the computation graph in dot format (for debugging purposes)
1711
1885
  //if (n_past%100 == 0) {
1712
- // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
1886
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
1713
1887
  //}
1714
1888
 
1715
1889
  // extract logits
@@ -1734,19 +1908,6 @@ static bool llama_eval_internal(
1734
1908
  memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
1735
1909
  }
1736
1910
 
1737
- if (mem_per_token == 0) {
1738
- mem_per_token = ggml_used_mem(ctx0)/N;
1739
- }
1740
-
1741
- #if 0
1742
- printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
1743
- ggml_used_mem(ctx0)/1024.0/1024.0,
1744
- lctx.get_buf_max_mem(0)/1024.0/1024.0,
1745
- lctx.get_buf_max_mem(1)/1024.0/1024.0);
1746
- #endif
1747
-
1748
- ggml_free(ctx0);
1749
-
1750
1911
  // measure the performance only for the single-token evals
1751
1912
  if (N == 1) {
1752
1913
  lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1858,7 +2019,9 @@ struct llama_tokenizer {
1858
2019
  if (token == vocab_.token_to_id.end()) {
1859
2020
  // output any symbols that did not form tokens as bytes.
1860
2021
  for (int j = 0; j < (int) symbol.n; ++j) {
1861
- llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
2022
+ // NOTE: old version, before #2420 - not sure what are the implications of this
2023
+ //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
2024
+ llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
1862
2025
  output.push_back(token_id);
1863
2026
  }
1864
2027
  } else {
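The byte-fallback change above is a behavioural one, flagged by the in-code NOTE. The removed line relied on the LLaMA SentencePiece vocabulary layout, where the raw-byte tokens <0x00>..<0xFF> sit at ids 3..258, so the byte 0x41 ('A') mapped straight to id 0x41 + 3 = 68. The replacement looks the one-character string up in token_to_id instead, and std::map::at throws std::out_of_range if the vocabulary has no such single-character token, which is presumably why the old mapping is kept around in the comment.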
@@ -1915,6 +2078,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
1915
2078
  return output;
1916
2079
  }
1917
2080
 
2081
+ //
2082
+ // grammar - internal
2083
+ //
2084
+
2085
+ struct llama_grammar {
2086
+ const std::vector<std::vector<llama_grammar_element>> rules;
2087
+ std::vector<std::vector<const llama_grammar_element *>> stacks;
2088
+ };
2089
+
2090
+ struct llama_grammar_candidate {
2091
+ size_t index;
2092
+ const uint32_t * code_points;
2093
+ };
2094
+
2095
+ // NOTE: assumes valid utf8 (but checks for overrun)
2096
+ // adds a terminating 0 for use as pointer
2097
+ std::vector<uint32_t> decode_utf8(const char * src) {
2098
+ static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
2099
+ const char * pos = src;
2100
+ std::vector<uint32_t> code_points;
2101
+ while (*pos != 0) {
2102
+ uint8_t first_byte = static_cast<uint8_t>(*pos);
2103
+ uint8_t highbits = first_byte >> 4;
2104
+ int len = lookup[highbits];
2105
+ uint8_t mask = (1 << (8 - len)) - 1;
2106
+ uint32_t value = first_byte & mask;
2107
+ const char * end = pos + len; // may overrun!
2108
+ ++pos;
2109
+ for ( ; pos < end && *pos != 0; ++pos) {
2110
+ value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
2111
+ }
2112
+ code_points.push_back(value);
2113
+ }
2114
+ code_points.push_back(0);
2115
+ return code_points;
2116
+ }
2117
+
2118
+ // returns true iff pos points to the end of one of the definitions of a rule
2119
+ static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
2120
+ switch (pos->type) {
2121
+ case LLAMA_GRETYPE_END: return true;
2122
+ case LLAMA_GRETYPE_ALT: return true;
2123
+ default: return false;
2124
+ }
2125
+ }
2126
+
2127
+ // returns true iff chr satisfies the char range at pos (regular or inverse range)
2128
+ // asserts that pos is pointing to a char range element
2129
+ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
2130
+ const llama_grammar_element * pos,
2131
+ const uint32_t chr) {
2132
+
2133
+ bool found = false;
2134
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
2135
+ LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
2136
+
2137
+ do {
2138
+ if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
2139
+ // inclusive range, e.g. [a-z]
2140
+ found = found || (pos->value <= chr && chr <= pos[1].value);
2141
+ pos += 2;
2142
+ } else {
2143
+ // exact char match, e.g. [a] or "a"
2144
+ found = found || pos->value == chr;
2145
+ pos += 1;
2146
+ }
2147
+ } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
2148
+
2149
+ return std::make_pair(found == is_positive_char, pos);
2150
+ }
2151
+
2152
+ // transforms a grammar pushdown stack into N possible stacks, all ending
2153
+ // at a character range (terminal element)
2154
+ static void llama_grammar_advance_stack(
2155
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2156
+ const std::vector<const llama_grammar_element *> & stack,
2157
+ std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
2158
+
2159
+ if (stack.empty()) {
2160
+ new_stacks.push_back(stack);
2161
+ return;
2162
+ }
2163
+
2164
+ const llama_grammar_element * pos = stack.back();
2165
+
2166
+ switch (pos->type) {
2167
+ case LLAMA_GRETYPE_RULE_REF: {
2168
+ const size_t rule_id = static_cast<size_t>(pos->value);
2169
+ const llama_grammar_element * subpos = rules[rule_id].data();
2170
+ do {
2171
+ // init new stack without the top (pos)
2172
+ std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
2173
+ if (!llama_grammar_is_end_of_sequence(pos + 1)) {
2174
+ // if this rule ref is followed by another element, add that to stack
2175
+ new_stack.push_back(pos + 1);
2176
+ }
2177
+ if (!llama_grammar_is_end_of_sequence(subpos)) {
2178
+ // if alternate is nonempty, add to stack
2179
+ new_stack.push_back(subpos);
2180
+ }
2181
+ llama_grammar_advance_stack(rules, new_stack, new_stacks);
2182
+ while (!llama_grammar_is_end_of_sequence(subpos)) {
2183
+ // scan to end of alternate def
2184
+ subpos++;
2185
+ }
2186
+ if (subpos->type == LLAMA_GRETYPE_ALT) {
2187
+ // there's another alternate def of this rule to process
2188
+ subpos++;
2189
+ } else {
2190
+ break;
2191
+ }
2192
+ } while (true);
2193
+ break;
2194
+ }
2195
+ case LLAMA_GRETYPE_CHAR:
2196
+ case LLAMA_GRETYPE_CHAR_NOT:
2197
+ new_stacks.push_back(stack);
2198
+ break;
2199
+ default:
2200
+ // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
2201
+ // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
2202
+ // those
2203
+ LLAMA_ASSERT(false);
2204
+ }
2205
+ }
2206
+
2207
+ // takes a set of possible pushdown stacks on a grammar, which are required to
2208
+ // be positioned at a character range (see `llama_grammar_advance_stack`), and
2209
+ // produces the N possible stacks if the given char is accepted at those
2210
+ // positions
2211
+ static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
2212
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2213
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
2214
+ const uint32_t chr) {
2215
+
2216
+ std::vector<std::vector<const llama_grammar_element *>> new_stacks;
2217
+
2218
+ for (const auto & stack : stacks) {
2219
+ if (stack.empty()) {
2220
+ continue;
2221
+ }
2222
+
2223
+ auto match = llama_grammar_match_char(stack.back(), chr);
2224
+ if (match.first) {
2225
+ const llama_grammar_element * pos = match.second;
2226
+
2227
+ // update top of stack to next element, if any
2228
+ std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
2229
+ if (!llama_grammar_is_end_of_sequence(pos)) {
2230
+ new_stack.push_back(pos);
2231
+ }
2232
+ llama_grammar_advance_stack(rules, new_stack, new_stacks);
2233
+ }
2234
+ }
2235
+
2236
+ return new_stacks;
2237
+ }
2238
+
2239
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
2240
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2241
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
2242
+ const std::vector<llama_grammar_candidate> & candidates);
2243
+
2244
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
2245
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2246
+ const std::vector<const llama_grammar_element *> & stack,
2247
+ const std::vector<llama_grammar_candidate> & candidates) {
2248
+
2249
+ std::vector<llama_grammar_candidate> rejects;
2250
+
2251
+ if (stack.empty()) {
2252
+ // accept nothing; EOS is handled elsewhere
2253
+ rejects.insert(rejects.end(), candidates.begin(), candidates.end());
2254
+ return rejects;
2255
+ }
2256
+
2257
+ const llama_grammar_element * stack_pos = stack.back();
2258
+
2259
+ std::vector<llama_grammar_candidate> next_candidates;
2260
+ for (auto tok : candidates) {
2261
+ if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
2262
+ if (tok.code_points[1] != 0) {
2263
+ next_candidates.push_back({ tok.index, tok.code_points + 1 });
2264
+ }
2265
+ } else {
2266
+ rejects.push_back(tok);
2267
+ }
2268
+ }
2269
+
2270
+ auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
2271
+
2272
+ // update top of stack to next element, if any
2273
+ std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
2274
+ if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
2275
+ stack_after.push_back(stack_pos_after);
2276
+ }
2277
+ std::vector<std::vector<const llama_grammar_element *>> next_stacks;
2278
+ llama_grammar_advance_stack(rules, stack_after, next_stacks);
2279
+
2280
+ auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
2281
+ for (auto tok : next_rejects) {
2282
+ rejects.push_back({ tok.index, tok.code_points - 1 });
2283
+ }
2284
+
2285
+ return rejects;
2286
+ }
2287
+
2288
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
2289
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2290
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
2291
+ const std::vector<llama_grammar_candidate> & candidates) {
2292
+ LLAMA_ASSERT(!stacks.empty()); // REVIEW
2293
+
2294
+ if (candidates.empty()) {
2295
+ return std::vector<llama_grammar_candidate>();
2296
+ }
2297
+
2298
+ auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
2299
+
2300
+ for (size_t i = 1, size = stacks.size(); i < size; ++i) {
2301
+ rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
2302
+ }
2303
+ return rejects;
2304
+ }
2305
+
2306
+ //
2307
+ // grammar - external
2308
+ //
2309
+
2310
+ struct llama_grammar * llama_grammar_init(
2311
+ const llama_grammar_element ** rules,
2312
+ size_t n_rules,
2313
+ size_t start_rule_index) {
2314
+ const llama_grammar_element * pos;
2315
+
2316
+ // copy rule definitions into vectors
2317
+ std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
2318
+ for (size_t i = 0; i < n_rules; i++) {
2319
+ for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
2320
+ vec_rules[i].push_back(*pos);
2321
+ }
2322
+ vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
2323
+ }
2324
+
2325
+ // loop over alternates of start rule to build initial stacks
2326
+ std::vector<std::vector<const llama_grammar_element *>> stacks;
2327
+ pos = rules[start_rule_index];
2328
+ do {
2329
+ std::vector<const llama_grammar_element *> stack;
2330
+ if (!llama_grammar_is_end_of_sequence(pos)) {
2331
+ // if alternate is nonempty, add to stack
2332
+ stack.push_back(pos);
2333
+ }
2334
+ llama_grammar_advance_stack(vec_rules, stack, stacks);
2335
+ while (!llama_grammar_is_end_of_sequence(pos)) {
2336
+ // scan to end of alternate def
2337
+ pos++;
2338
+ }
2339
+ if (pos->type == LLAMA_GRETYPE_ALT) {
2340
+ // there's another alternate def of this rule to process
2341
+ pos++;
2342
+ } else {
2343
+ break;
2344
+ }
2345
+ } while (true);
2346
+
2347
+ return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
2348
+ }
2349
+
2350
+ void llama_grammar_free(struct llama_grammar * grammar) {
2351
+ delete grammar;
2352
+ }
2353
+
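The rule arrays passed to llama_grammar_init are normally produced by a grammar parser on the application side, which this diff does not include; a hand-built rule set is enough to show the encoding. A minimal sketch for the grammar root ::= "yes" | "no", using only the element types handled by llama_grammar_advance_stack above (illustration only, assuming the llama.h that ships with this version):

    #include "llama.h"

    static void grammar_demo(void) {
        // Hand-written rules for:  root ::= "y" "e" "s" | "n" "o"
        static const llama_grammar_element root_rule[] = {
            { LLAMA_GRETYPE_CHAR, 'y' }, { LLAMA_GRETYPE_CHAR, 'e' }, { LLAMA_GRETYPE_CHAR, 's' },
            { LLAMA_GRETYPE_ALT,  0   },                              // second alternate follows
            { LLAMA_GRETYPE_CHAR, 'n' }, { LLAMA_GRETYPE_CHAR, 'o' },
            { LLAMA_GRETYPE_END,  0   },
        };
        const llama_grammar_element * grammar_rules[] = { root_rule };

        struct llama_grammar * grammar = llama_grammar_init(grammar_rules, 1, 0);
        // ... constrain sampling via llama_sample_grammar()/llama_grammar_accept_token() ...
        llama_grammar_free(grammar);
    }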
1918
2354
  //
1919
2355
  // sampling
1920
2356
  //
@@ -2200,6 +2636,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
2200
2636
  }
2201
2637
  }
2202
2638
 
2639
+ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
2640
+ assert(ctx);
2641
+ const int64_t t_start_sample_us = ggml_time_us();
2642
+
2643
+ bool allow_eos = false;
2644
+ for (const auto & stack : grammar->stacks) {
2645
+ if (stack.empty()) {
2646
+ allow_eos = true;
2647
+ break;
2648
+ }
2649
+ }
2650
+
2651
+ const llama_token eos = llama_token_eos();
2652
+
2653
+ std::vector<std::vector<uint32_t>> candidates_decoded;
2654
+ std::vector<llama_grammar_candidate> candidates_grammar;
2655
+
2656
+ for (size_t i = 0; i < candidates->size; ++i) {
2657
+ const llama_token id = candidates->data[i].id;
2658
+ const char * str = llama_token_to_str(ctx, id);
2659
+ if (id == eos) {
2660
+ if (!allow_eos) {
2661
+ candidates->data[i].logit = -INFINITY;
2662
+ }
2663
+ } else if (*str == 0) {
2664
+ candidates->data[i].logit = -INFINITY;
2665
+ } else {
2666
+ candidates_decoded.push_back(decode_utf8(str));
2667
+ candidates_grammar.push_back({ i, candidates_decoded.back().data() });
2668
+ }
2669
+ }
2670
+
2671
+ const auto rejects =
2672
+ llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
2673
+ for (auto & reject : rejects) {
2674
+ candidates->data[reject.index].logit = -INFINITY;
2675
+ }
2676
+
2677
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2678
+ }
2679
+
2203
2680
  static void llama_log_softmax(float * array, size_t size) {
2204
2681
  float max_l = *std::max_element(array, array + size);
2205
2682
  float sum = 0.f;
@@ -2375,6 +2852,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
2375
2852
  return result;
2376
2853
  }
2377
2854
 
2855
+ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
2856
+ const int64_t t_start_sample_us = ggml_time_us();
2857
+
2858
+ if (token == llama_token_eos()) {
2859
+ for (const auto & stack : grammar->stacks) {
2860
+ if (stack.empty()) {
2861
+ return;
2862
+ }
2863
+ }
2864
+ LLAMA_ASSERT(false);
2865
+ }
2866
+
2867
+ const char * str = llama_token_to_str(ctx, token);
2868
+ // Note terminating 0 in decoded string
2869
+ auto code_points = decode_utf8(str);
2870
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
2871
+ grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
2872
+ }
2873
+ LLAMA_ASSERT(!grammar->stacks.empty());
2874
+
2875
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2876
+ }
2877
+
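A sketch of how the two new entry points are meant to sit in a sampling loop; llama_sample_token and llama_token_data_array are the existing sampling API, and the candidate-vector setup is abbreviated:

    // `grammar` was built with llama_grammar_init(); `candidates_vec` is the usual
    // std::vector<llama_token_data> filled from the current logits (omitted here).
    llama_token_data_array candidates = { candidates_vec.data(), candidates_vec.size(), false };

    llama_sample_grammar(ctx, &candidates, grammar);              // mask tokens the grammar rejects
    const llama_token tok = llama_sample_token(ctx, &candidates); // any existing sampler works here
    llama_grammar_accept_token(ctx, grammar, tok);                // advance the grammar stacks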
2378
2878
  //
2379
2879
  // quantization
2380
2880
  //
@@ -2448,8 +2948,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2448
2948
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
2449
2949
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
2450
2950
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
2451
- case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
2452
- case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
2951
+ case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
2952
+ case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
2453
2953
 
2454
2954
  #ifdef GGML_USE_K_QUANTS
2455
2955
  // K-quants
@@ -2533,16 +3033,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2533
3033
  } else {
2534
3034
  new_type = quantized_type;
2535
3035
  #ifdef GGML_USE_K_QUANTS
2536
- bool convert_incompatible_tensor = false;
2537
- if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
2538
- quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
2539
- int nx = tensor.ne.at(0);
2540
- int ny = tensor.ne.at(1);
2541
- if (nx % QK_K != 0 || ny % QK_K != 0) {
2542
- fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
2543
- convert_incompatible_tensor = true;
2544
- }
2545
- }
2546
3036
  if (tensor.name == "output.weight") {
2547
3037
  int nx = tensor.ne.at(0);
2548
3038
  int ny = tensor.ne.at(1);
@@ -2568,6 +3058,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2568
3058
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2569
3059
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2570
3060
  }
3061
+ bool convert_incompatible_tensor = false;
3062
+ if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
3063
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
3064
+ int nx = tensor.ne.at(0);
3065
+ int ny = tensor.ne.at(1);
3066
+ if (nx % QK_K != 0 || ny % QK_K != 0) {
3067
+ fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
3068
+ convert_incompatible_tensor = true;
3069
+ }
3070
+ }
2571
3071
  if (convert_incompatible_tensor) {
2572
3072
  if (tensor.name == "output.weight") {
2573
3073
  new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
@@ -2594,7 +3094,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2594
3094
  f32_data = (float *) f32_conv_buf.addr;
2595
3095
  }
2596
3096
 
2597
- printf("quantizing .. ");
3097
+ printf("quantizing to %s .. ", ggml_type_name(new_type));
2598
3098
  fflush(stdout);
2599
3099
 
2600
3100
  work.resize(nelements * 4); // upper bound on size
@@ -2697,8 +3197,8 @@ struct llama_model * llama_load_model_from_file(
2697
3197
 
2698
3198
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2699
3199
 
2700
- if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
2701
- params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
3200
+ if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
3201
+ params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
2702
3202
  memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
2703
3203
  params.progress_callback_user_data)) {
2704
3204
  delete model;
@@ -2775,10 +3275,47 @@ struct llama_context * llama_new_context_with_model(
2775
3275
  ctx->embedding.resize(hparams.n_embd);
2776
3276
  }
2777
3277
 
2778
- ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
3278
+ #ifdef LLAMA_USE_ALLOCATOR
3279
+ {
3280
+ static const size_t tensor_alignment = 32;
3281
+ // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
3282
+ ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
3283
+
3284
+ // create measure allocator
3285
+ ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
3286
+
3287
+ // build worst-case graph
3288
+ int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
3289
+ int n_past = hparams.n_ctx - n_tokens;
3290
+ llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
3291
+ ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
3292
+
3293
+ // measure memory requirements for the graph
3294
+ size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
2779
3295
 
3296
+ fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
3297
+
3298
+ // debug - for comparison with scratch buffer
3299
+ //size_t prev_req =
3300
+ // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
3301
+ // MEM_REQ_SCRATCH1().at(ctx->model.type) +
3302
+ // MEM_REQ_EVAL().at(ctx->model.type);
3303
+ //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
3304
+
3305
+ // recreate allocator with exact memory requirements
3306
+ ggml_allocr_free(ctx->alloc);
3307
+
3308
+ ctx->buf_alloc.resize(alloc_size);
3309
+ ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
3310
+ }
3311
+ #else
3312
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
3313
+ #endif
3314
+
3315
+ #ifdef LLAMA_USE_SCRATCH
2780
3316
  ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
2781
3317
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
3318
+ #endif
2782
3319
  }
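The buffer setup above boils down to a three-step ggml-alloc pattern: measure the worst-case graph with a dummy allocator, rebuild a real allocator over a buffer of exactly the measured size, then reset and re-run ggml_allocr_alloc_graph on every evaluation (see llama_eval_internal earlier in the diff). A generic sketch using only the calls that appear here, where build_graph() is a hypothetical stand-in for llama_build_graph:

    #include "ggml-alloc.h"
    #include <cstdint>
    #include <vector>

    // build_graph() is a placeholder: it must construct the same graph for both passes.
    static ggml_allocr * make_exact_allocator(std::vector<uint8_t> & buf) {
        static const size_t tensor_alignment = 32;
        // 1) measure: the dummy allocator only records how much memory the graph would need
        ggml_allocr * measure = ggml_allocr_new_measure(tensor_alignment);
        const size_t needed   = ggml_allocr_alloc_graph(measure, build_graph(/*worst case*/)) + tensor_alignment;
        ggml_allocr_free(measure);
        // 2) allocate: a real allocator over a buffer of exactly that size
        buf.resize(needed);
        return ggml_allocr_new(buf.data(), buf.size(), tensor_alignment);
    }
    // 3) per evaluation (as in llama_eval_internal above):
    //      ggml_allocr_reset(alloc); ggml_allocr_alloc_graph(alloc, build_graph(/*actual batch*/));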
2783
3320
 
2784
3321
  #ifdef GGML_USE_METAL
@@ -2799,7 +3336,7 @@ struct llama_context * llama_new_context_with_model(
2799
3336
 
2800
3337
  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
2801
3338
 
2802
- printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
3339
+ fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
2803
3340
 
2804
3341
  #define LLAMA_METAL_CHECK_BUF(result) \
2805
3342
  if (!(result)) { \
@@ -2848,9 +3385,6 @@ struct llama_context * llama_init_from_file(
2848
3385
  }
2849
3386
 
2850
3387
  void llama_free(struct llama_context * ctx) {
2851
- if (ctx->model_owner) {
2852
- delete &ctx->model;
2853
- }
2854
3388
  delete ctx;
2855
3389
  }
2856
3390
 
@@ -3260,7 +3794,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3260
3794
  const auto & kv_self = ctx->kv_self;
3261
3795
  const auto & hparams = ctx->model.hparams;
3262
3796
  const int n_layer = hparams.n_layer;
3263
- const int n_embd = hparams.n_embd;
3797
+ const int n_embd = hparams.n_embd_gqa();
3264
3798
  const int n_ctx = hparams.n_ctx;
3265
3799
 
3266
3800
  const size_t kv_size = kv_self.buf.size;
@@ -3363,7 +3897,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
3363
3897
  const auto & kv_self = ctx->kv_self;
3364
3898
  const auto & hparams = ctx->model.hparams;
3365
3899
  const int n_layer = hparams.n_layer;
3366
- const int n_embd = hparams.n_embd;
3900
+ const int n_embd = hparams.n_embd_gqa();
3367
3901
  const int n_ctx = hparams.n_ctx;
3368
3902
 
3369
3903
  size_t kv_size;