llama_cpp 0.3.4 → 0.3.5

@@ -67,6 +67,7 @@ enum e_model {
  MODEL_13B,
  MODEL_30B,
  MODEL_65B,
+ MODEL_70B,
  };

  static const size_t kB = 1024;
@@ -98,18 +99,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
  }

  //
- // memory sizes
+ // memory sizes (calculated for n_batch == 512)
  //

  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
  {
  static std::map<e_model, size_t> k_sizes = {
- /* empirical scaling, still a guess */
- { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB },
- { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB },
- { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB },
- { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB },
- { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB },
+ { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
+ { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
+ { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
+ { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
+ { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
+ { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
  };
  return k_sizes;
  }
@@ -117,38 +118,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  {
  static std::map<e_model, size_t> k_sizes = {
- { MODEL_3B, 256ull * MB },
- { MODEL_7B, 512ull * MB },
- { MODEL_13B, 512ull * MB },
- { MODEL_30B, 512ull * MB },
- { MODEL_65B, 1024ull * MB },
+ { MODEL_3B, 128ull * MB },
+ { MODEL_7B, 160ull * MB },
+ { MODEL_13B, 192ull * MB },
+ { MODEL_30B, 256ull * MB },
+ { MODEL_65B, 384ull * MB }, // guess
+ { MODEL_70B, 304ull * MB },
  };
  return k_sizes;
  }

- // 2*n_embd*n_ctx*n_layer*sizeof(float16)
- static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+ // used to store the compute graph tensors + non-scratch data
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  {
  static std::map<e_model, size_t> k_sizes = {
- { MODEL_3B, 682ull * MB },
- { MODEL_7B, 1026ull * MB },
- { MODEL_13B, 1608ull * MB },
- { MODEL_30B, 3124ull * MB },
- { MODEL_65B, 5120ull * MB },
- };
- return k_sizes;
- }
-
- // this is mostly needed for temporary mul_mat buffers to dequantize the data
- // not actually needed if BLAS is disabled
- static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
- {
- static std::map<e_model, size_t> k_sizes = {
- { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
- { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
- { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
- { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
- { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
+ { MODEL_3B, 8ull * MB },
+ { MODEL_7B, 10ull * MB },
+ { MODEL_13B, 12ull * MB },
+ { MODEL_30B, 16ull * MB },
+ { MODEL_65B, 24ull * MB }, // guess
+ { MODEL_70B, 24ull * MB },
  };
  return k_sizes;
  }
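For a sense of scale, the MODEL_7B row of the new MEM_REQ_SCRATCH0 table works out as follows at a hypothetical n_ctx of 2048 (an illustration of the formula above, not output produced by the gem):

  // illustrative only: MEM_REQ_SCRATCH0 for MODEL_7B at n_ctx = 2048
  // 0.3.5: (2048 / 16 + 100) * MB = 228 MB   vs   0.3.4: (2048 / 16 + 256) * MB = 384 MB
  const size_t scratch0_7b = ((size_t) 2048 / 16ull + 100ull) * MB;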
@@ -163,6 +152,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
  { MODEL_13B, 640ull * kB },
  { MODEL_30B, 768ull * kB },
  { MODEL_65B, 1536ull * kB },
+ { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
  };
  return k_sizes;
  }
@@ -177,19 +167,26 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
  { MODEL_13B, 160ull },
  { MODEL_30B, 208ull },
  { MODEL_65B, 416ull },
+ { MODEL_70B, 416ull }, // TODO (likely can be reduced)
  };
  return k_sizes;
  }

  // default hparams (LLaMA 7B)
  struct llama_hparams {
- uint32_t n_vocab = 32000;
- uint32_t n_ctx = 512; // this is provided as user input?
- uint32_t n_embd = 4096;
- uint32_t n_mult = 256;
- uint32_t n_head = 32;
- uint32_t n_layer = 32;
- uint32_t n_rot = 64;
+ uint32_t n_vocab = 32000;
+ uint32_t n_ctx = 512; // this is provided as user input?
+ uint32_t n_embd = 4096;
+ uint32_t n_mult = 256;
+ uint32_t n_head = 32;
+ uint32_t n_head_kv = 32;
+ uint32_t n_layer = 32;
+ uint32_t n_rot = 64;
+
+ // LLaMAv2
+ // TODO: load from model data hparams
+ float f_ffn_mult = 1.0f;
+ float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;

  float rope_freq_base = 10000.0f;
  float rope_freq_scale = 1.0f;
@@ -197,7 +194,28 @@ struct llama_hparams {
  enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

  bool operator!=(const llama_hparams & other) const {
- return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
+ return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+ }
+
+ uint32_t n_gqa() const {
+ return n_head/n_head_kv;
+ }
+
+ uint32_t n_embd_head() const {
+ return n_embd/n_head;
+ }
+
+ uint32_t n_embd_gqa() const {
+ return n_embd/n_gqa();
+ }
+
+ size_t kv_size() const {
+ size_t result = 2ull;
+ result *= (size_t) n_embd_gqa();
+ result *= (size_t) n_ctx;
+ result *= (size_t) n_layer;
+ result *= sizeof(ggml_fp16_t);
+ return result;
  }
  };
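The new kv_size() helper replaces the old MEM_REQ_KV_SELF table with the exact formula from the removed comment (2*n_embd*n_ctx*n_layer*sizeof(float16)), now using n_embd_gqa so that grouped-query models get a proportionally smaller cache. A rough worked example with the default 7B hparams above, where n_head_kv == n_head and therefore n_embd_gqa == n_embd:

  // illustrative only: kv_size() for the default hparams shown above (n_ctx = 512)
  // 2 * n_embd_gqa * n_ctx * n_layer * sizeof(ggml_fp16_t)
  // = 2 * 4096 * 512 * 32 * 2 bytes = 256 MiB of combined K and V cache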

@@ -499,12 +517,16 @@ struct llama_file_loader {
  }
  void read_hparams() {
  hparams.n_vocab = file.read_u32();
- hparams.n_embd = file.read_u32();
- hparams.n_mult = file.read_u32();
- hparams.n_head = file.read_u32();
+ hparams.n_embd = file.read_u32();
+ hparams.n_mult = file.read_u32();
+ hparams.n_head = file.read_u32();
  hparams.n_layer = file.read_u32();
- hparams.n_rot = file.read_u32();
- hparams.ftype = (enum llama_ftype) file.read_u32();
+ hparams.n_rot = file.read_u32();
+ hparams.ftype = (enum llama_ftype) file.read_u32();
+
+ // LLaMAv2
+ // TODO: read from header
+ hparams.n_head_kv = hparams.n_head;
  }
  void read_vocab() {
  vocab.id_to_token.resize(hparams.n_vocab);
@@ -803,7 +825,7 @@ static bool kv_cache_init(
  ggml_type wtype,
  int n_ctx,
  int n_gpu_layers) {
- const int n_embd = hparams.n_embd;
+ const int n_embd = hparams.n_embd_gqa();
  const int n_layer = hparams.n_layer;

  const int64_t n_mem = n_layer*n_ctx;
@@ -847,6 +869,8 @@ struct llama_context_params llama_context_default_params() {
  /*.seed =*/ LLAMA_DEFAULT_SEED,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
+ /*.n_gqa =*/ 1,
+ /*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
  /*.gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
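The two added fields are how callers describe grouped-query models to this release; a minimal sketch of their intended use via the C API (the model path and the eps value are placeholders taken from the upstream llama.cpp instructions for LLaMA-2 70B, not something this diff enforces):

  // illustrative only: configuring the context for a LLaMA-2 70B (GQA) model
  struct llama_context_params params = llama_context_default_params();
  params.n_gqa        = 8;      // 70B: n_head / n_head_kv = 64 / 8
  params.rms_norm_eps = 1e-5f;  // value reported in the model's params.json
  struct llama_model * model = llama_load_model_from_file("llama-2-70b.ggmlv3.q4_0.bin", params);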
@@ -966,6 +990,7 @@ static const char *llama_model_type_name(e_model type) {
  case MODEL_13B: return "13B";
  case MODEL_30B: return "30B";
  case MODEL_65B: return "65B";
+ case MODEL_70B: return "70B";
  default: LLAMA_ASSERT(false);
  }
  }
@@ -976,6 +1001,8 @@ static void llama_model_load_internal(
  llama_vocab & vocab,
  int n_ctx,
  int n_batch,
+ int n_gqa,
+ float rms_norm_eps,
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
@@ -997,8 +1024,12 @@ static void llama_model_load_internal(
  model.hparams = ml->file_loader->hparams;
  model.n_gpu_layers = n_gpu_layers;
  llama_file_version file_version = ml->file_loader->file_version;
+
  auto & hparams = model.hparams;

+ // TODO: read from file
+ hparams.f_rms_norm_eps = rms_norm_eps;
+
  {
  switch (hparams.n_layer) {
  case 26: model.type = e_model::MODEL_3B; break;
@@ -1016,11 +1047,25 @@ static void llama_model_load_internal(

  hparams.n_ctx = n_ctx;

+ // LLaMAv2
+ // TODO: temporary until GGUF
+ LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
+ hparams.n_head_kv = hparams.n_head / n_gqa;
+ if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+ fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+ model.type = e_model::MODEL_70B;
+ hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
+ }
+
  hparams.rope_freq_base = rope_freq_base;
  hparams.rope_freq_scale = rope_freq_scale;
  }

- const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+ // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
+ const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
+ const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
+ const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+ //const uint32_t n_ff = 28672;

  {
  fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
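As a sanity check on the new feed-forward sizing, here is how the formula above reproduces the commented-out n_ff = 28672 for the 70B case, assuming n_mult = 4096 (the "multiple_of" value from the 70B params.json) together with the f_ffn_mult = 1.3 set in the GQA branch:

  // illustrative only: n_ff derivation for the 70B model (n_embd = 8192, n_head = 64, n_gqa = 8)
  // n_head_kv = 64 / 8 = 8, n_embd_gqa = 8192 / 8 = 1024
  // n_ff_raw  = 2 * (4 * 8192) / 3                  = 21845
  // n_ff_mult = (uint32_t)(1.3f * 21845)            = 28398
  // n_ff      = ((28398 + 4096 - 1) / 4096) * 4096  = 28672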
@@ -1029,12 +1074,15 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
  fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
  fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+ fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
  fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
- fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+ fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+ fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
  fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
  fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
- fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

@@ -1069,7 +1117,7 @@ static void llama_model_load_internal(
  {
  model.buf.resize(ctx_size);
  if (use_mlock) {
- model.mlock_buf.init(model.buf.addr);
+ model.mlock_buf.init (model.buf.addr);
  model.mlock_buf.grow_to(model.buf.size);
  }

@@ -1104,9 +1152,10 @@ static void llama_model_load_internal(
  size_t vram_weights = 0;
  size_t vram_scratch = 0;
  {
- const uint32_t n_embd = hparams.n_embd;
- const uint32_t n_layer = hparams.n_layer;
- const uint32_t n_vocab = hparams.n_vocab;
+ const uint32_t n_embd = hparams.n_embd;
+ const uint32_t n_embd_gqa = hparams.n_embd_gqa();
+ const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_vocab = hparams.n_vocab;

  ml->ggml_ctx = ctx;

@@ -1154,16 +1203,16 @@ static void llama_model_load_internal(

  layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);

  layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);

  if (backend == GGML_BACKEND_GPU) {
  vram_weights +=
@@ -1186,11 +1235,11 @@ static void llama_model_load_internal(
  mmapped_size - vram_weights + // weights in VRAM not in memory
  MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
  MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
+ MEM_REQ_EVAL().at(model.type);

  // this is the memory required by one llama_state
  const size_t mem_required_state =
- scale*MEM_REQ_KV_SELF().at(model.type);
+ scale*hparams.kv_size();

  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1231,7 +1280,7 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
  } else {
  fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
- vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+ vram_kv_cache += hparams.kv_size() / 2;
  }
  }
  if (n_gpu_layers > (int) hparams.n_layer + 2) {
@@ -1239,7 +1288,7 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
  } else {
  fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
- vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+ vram_kv_cache += hparams.kv_size() / 2;
  }
  }
  #elif defined(GGML_USE_CLBLAST)
@@ -1287,6 +1336,8 @@ static bool llama_model_load(
  llama_vocab & vocab,
  int n_ctx,
  int n_batch,
+ int n_gqa,
+ float rms_norm_eps,
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
@@ -1300,7 +1351,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1344,16 +1395,23 @@ static bool llama_eval_internal(

  LLAMA_ASSERT(!!kv_self.ctx);

- const int n_embd = hparams.n_embd;
- const int n_layer = hparams.n_layer;
- const int n_ctx = hparams.n_ctx;
- const int n_head = hparams.n_head;
- const int n_vocab = hparams.n_vocab;
- const int n_rot = hparams.n_embd/hparams.n_head;
- const int n_gpu_layers = model.n_gpu_layers;
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_ctx = hparams.n_ctx;
+ const int64_t n_head = hparams.n_head;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_head = hparams.n_embd_head();
+ const int64_t n_vocab = hparams.n_vocab;
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+
+ LLAMA_ASSERT(n_embd_head == hparams.n_rot);

  const float freq_base = hparams.rope_freq_base;
  const float freq_scale = hparams.rope_freq_scale;
+ const float rms_norm_eps = hparams.f_rms_norm_eps;
+
+ const int n_gpu_layers = model.n_gpu_layers;

  auto & mem_per_token = lctx.mem_per_token;
  auto & buf_compute = lctx.buf_compute;
@@ -1366,7 +1424,7 @@ static bool llama_eval_internal(

  struct ggml_context * ctx0 = ggml_init(params);

- ggml_cgraph gf = {};
+ ggml_cgraph * gf = ggml_new_graph(ctx0);

  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
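The graph is now allocated inside the ggml context instead of on the stack, which is why the per-eval MEM_REQ_EVAL figures shrank and why llama_new_context_with_model later adds ggml_graph_overhead() to the compute buffer. A rough sketch of the pattern, using only calls that appear in this diff plus the standard ggml_init (the buffer size here is arbitrary):

  // illustrative only: reserving room for tensor metadata plus the graph itself
  struct ggml_init_params params = {
      /*.mem_size   =*/ 16u*1024u*1024u + ggml_graph_overhead(),
      /*.mem_buffer =*/ NULL,
      /*.no_alloc   =*/ false,
  };
  struct ggml_context * ctx0 = ggml_init(params);
  struct ggml_cgraph  * gf   = ggml_new_graph(ctx0); // replaces the old `ggml_cgraph gf = {};`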
@@ -1431,7 +1489,7 @@ static bool llama_eval_internal(

  // norm
  {
- cur = ggml_rms_norm(ctx0, inpL);
+ cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
  offload_func(cur);
  ggml_set_name(cur, "rms_norm_0");

@@ -1452,11 +1510,11 @@ static bool llama_eval_internal(
  offload_func_kq(tmpq);
  ggml_set_name(tmpq, "tmpq");

- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
  offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");

- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
  offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");

@@ -1468,23 +1526,23 @@ static bool llama_eval_internal(
  offload_func_v(tmpv);
  ggml_set_name(tmpv, "tmpv");

- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
  offload_func_v(Vcur);
  ggml_set_name(Vcur, "Vcur");

- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
  offload_func_kq(k);
  ggml_set_name(k, "k");

- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
  ( n_ctx)*ggml_element_size(kv_self.v),
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
  offload_func_v(v);
  ggml_set_name(v, "v");

  // important: storing RoPE-ed version of K in the KV cache!
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
  }

  struct ggml_tensor * Q =
@@ -1497,8 +1555,8 @@ static bool llama_eval_internal(
  struct ggml_tensor * K =
  ggml_permute(ctx0,
  ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
- n_embd/n_head, n_head, n_past + N),
+ ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
+ n_embd_head, n_head_kv, n_past + N),
  0, 2, 1, 3);
  offload_func_kq(K);
  ggml_set_name(K, "K");
@@ -1508,9 +1566,9 @@ static bool llama_eval_internal(
  offload_func_kq(KQ);
  ggml_set_name(KQ, "KQ");

- // KQ_scaled = KQ / sqrt(n_embd/n_head)
+ // KQ_scaled = KQ / sqrt(n_embd_head)
  struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
- ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");

  // KQ_scaled shape [n_past + N, N, n_head, 1]
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
@@ -1530,10 +1588,10 @@ static bool llama_eval_internal(
  // split cached V into n_head heads
  struct ggml_tensor * V =
  ggml_view_3d(ctx0, kv_self.v,
- n_past + N, n_embd/n_head, n_head,
+ n_past + N, n_embd_head, n_head_kv,
  n_ctx*ggml_element_size(kv_self.v),
- n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
- il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+ n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
+ n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
  offload_func_v(V);
  ggml_set_name(V, "V");

@@ -1545,7 +1603,7 @@ static bool llama_eval_internal(
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
  // is there a better way?
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
  #endif

@@ -1579,7 +1637,7 @@ static bool llama_eval_internal(
  {
  // norm
  {
- cur = ggml_rms_norm(ctx0, inpFF);
+ cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
  offload_func(cur);
  ggml_set_name(cur, "rms_norm_1");

@@ -1632,7 +1690,7 @@ static bool llama_eval_internal(

  // norm
  {
- cur = ggml_rms_norm(ctx0, inpL);
+ cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
  offload_func_nr(cur);
  ggml_set_name(cur, "rms_norm_2");

@@ -1654,16 +1712,22 @@ static bool llama_eval_internal(
  //cur = ggml_soft_max_inplace(ctx0, cur);

  // run the computation
- ggml_build_forward_expand(&gf, cur);
+ ggml_build_forward_expand(gf, cur);
+
+ // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);

  #if GGML_USE_MPI
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
  #endif

  #ifdef GGML_USE_METAL
  if (lctx.ctx_metal && N == 1) {
+ // TODO: disabled until #2413 is resolved
+ //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
+ // ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
+ //}
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
- ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+ ggml_metal_graph_compute(lctx.ctx_metal, gf);
  ggml_metal_get_tensor (lctx.ctx_metal, cur);
  } else {
  // IMPORTANT:
@@ -1682,34 +1746,34 @@ static bool llama_eval_internal(
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
  }

- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
  }
  #else
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
  #endif

  #if GGML_USE_MPI
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
  #endif

  // update kv token count
  lctx.kv_self.n = n_past + N;

- struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];

  if (cgraph_fname) {
- ggml_graph_export(&gf, cgraph_fname);
+ ggml_graph_export(gf, cgraph_fname);
  }

  #ifdef GGML_PERF
  // print timing information per ggml operation (for debugging purposes)
  // requires GGML_PERF to be defined
- ggml_graph_print(&gf);
+ ggml_graph_print(gf);
  #endif

  // plot the computation graph in dot format (for debugging purposes)
  //if (n_past%100 == 0) {
- // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
  //}

  // extract logits
@@ -1739,10 +1803,12 @@ static bool llama_eval_internal(
  }

  #if 0
- printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
+ printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
  ggml_used_mem(ctx0)/1024.0/1024.0,
  lctx.get_buf_max_mem(0)/1024.0/1024.0,
- lctx.get_buf_max_mem(1)/1024.0/1024.0);
+ lctx.get_buf_max_mem(1)/1024.0/1024.0,
+ lctx.work_buffer.size()/1024.0/1024.0,
+ n_past, N);
  #endif

  ggml_free(ctx0);
@@ -1915,6 +1981,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
  return output;
  }

+ //
+ // grammar - internal
+ //
+
+ struct llama_grammar {
+ const std::vector<std::vector<llama_grammar_element>> rules;
+ std::vector<std::vector<const llama_grammar_element *>> stacks;
+ };
+
+ struct llama_grammar_candidate {
+ size_t index;
+ const uint32_t * code_points;
+ };
+
+ // NOTE: assumes valid utf8 (but checks for overrun)
+ // adds a terminating 0 for use as pointer
+ std::vector<uint32_t> decode_utf8(const char * src) {
+ static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+ const char * pos = src;
+ std::vector<uint32_t> code_points;
+ while (*pos != 0) {
+ uint8_t first_byte = static_cast<uint8_t>(*pos);
+ uint8_t highbits = first_byte >> 4;
+ int len = lookup[highbits];
+ uint8_t mask = (1 << (8 - len)) - 1;
+ uint32_t value = first_byte & mask;
+ const char * end = pos + len; // may overrun!
+ ++pos;
+ for ( ; pos < end && *pos != 0; ++pos) {
+ value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+ }
+ code_points.push_back(value);
+ }
+ code_points.push_back(0);
+ return code_points;
+ }
+
+ // returns true iff pos points to the end of one of the definitions of a rule
+ static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
+ switch (pos->type) {
+ case LLAMA_GRETYPE_END: return true;
+ case LLAMA_GRETYPE_ALT: return true;
+ default: return false;
+ }
+ }
+
+ // returns true iff chr satisfies the char range at pos (regular or inverse range)
+ // asserts that pos is pointing to a char range element
+ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
+ const llama_grammar_element * pos,
+ const uint32_t chr) {
+
+ bool found = false;
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+ LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+ do {
+ if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+ // inclusive range, e.g. [a-z]
+ found = found || (pos->value <= chr && chr <= pos[1].value);
+ pos += 2;
+ } else {
+ // exact char match, e.g. [a] or "a"
+ found = found || pos->value == chr;
+ pos += 1;
+ }
+ } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+ return std::make_pair(found == is_positive_char, pos);
+ }
+
+ // transforms a grammar pushdown stack into N possible stacks, all ending
+ // at a character range (terminal element)
+ static void llama_grammar_advance_stack(
+ const std::vector<std::vector<llama_grammar_element>> & rules,
+ const std::vector<const llama_grammar_element *> & stack,
+ std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
+
+ if (stack.empty()) {
+ new_stacks.push_back(stack);
+ return;
+ }
+
+ const llama_grammar_element * pos = stack.back();
+
+ switch (pos->type) {
+ case LLAMA_GRETYPE_RULE_REF: {
+ const size_t rule_id = static_cast<size_t>(pos->value);
+ const llama_grammar_element * subpos = rules[rule_id].data();
+ do {
+ // init new stack without the top (pos)
+ std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
+ if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+ // if this rule ref is followed by another element, add that to stack
+ new_stack.push_back(pos + 1);
+ }
+ if (!llama_grammar_is_end_of_sequence(subpos)) {
+ // if alternate is nonempty, add to stack
+ new_stack.push_back(subpos);
+ }
+ llama_grammar_advance_stack(rules, new_stack, new_stacks);
+ while (!llama_grammar_is_end_of_sequence(subpos)) {
+ // scan to end of alternate def
+ subpos++;
+ }
+ if (subpos->type == LLAMA_GRETYPE_ALT) {
+ // there's another alternate def of this rule to process
+ subpos++;
+ } else {
+ break;
+ }
+ } while (true);
+ break;
+ }
+ case LLAMA_GRETYPE_CHAR:
+ case LLAMA_GRETYPE_CHAR_NOT:
+ new_stacks.push_back(stack);
+ break;
+ default:
+ // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
+ // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
+ // those
+ LLAMA_ASSERT(false);
+ }
+ }
+
+ // takes a set of possible pushdown stacks on a grammar, which are required to
+ // be positioned at a character range (see `llama_grammar_advance_stack`), and
+ // produces the N possible stacks if the given char is accepted at those
+ // positions
+ static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+ const std::vector<std::vector<llama_grammar_element>> & rules,
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+ const uint32_t chr) {
+
+ std::vector<std::vector<const llama_grammar_element *>> new_stacks;
+
+ for (const auto & stack : stacks) {
+ if (stack.empty()) {
+ continue;
+ }
+
+ auto match = llama_grammar_match_char(stack.back(), chr);
+ if (match.first) {
+ const llama_grammar_element * pos = match.second;
+
+ // update top of stack to next element, if any
+ std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
+ if (!llama_grammar_is_end_of_sequence(pos)) {
+ new_stack.push_back(pos);
+ }
+ llama_grammar_advance_stack(rules, new_stack, new_stacks);
+ }
+ }
+
+ return new_stacks;
+ }
+
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
+ const std::vector<std::vector<llama_grammar_element>> & rules,
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+ const std::vector<llama_grammar_candidate> & candidates);
+
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
+ const std::vector<std::vector<llama_grammar_element>> & rules,
+ const std::vector<const llama_grammar_element *> & stack,
+ const std::vector<llama_grammar_candidate> & candidates) {
+
+ std::vector<llama_grammar_candidate> rejects;
+
+ if (stack.empty()) {
+ // accept nothing; EOS is handled elsewhere
+ rejects.insert(rejects.end(), candidates.begin(), candidates.end());
+ return rejects;
+ }
+
+ const llama_grammar_element * stack_pos = stack.back();
+
+ std::vector<llama_grammar_candidate> next_candidates;
+ for (auto tok : candidates) {
+ if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
+ if (tok.code_points[1] != 0) {
+ next_candidates.push_back({ tok.index, tok.code_points + 1 });
+ }
+ } else {
+ rejects.push_back(tok);
+ }
+ }
+
+ auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
+
+ // update top of stack to next element, if any
+ std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
+ if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
+ stack_after.push_back(stack_pos_after);
+ }
+ std::vector<std::vector<const llama_grammar_element *>> next_stacks;
+ llama_grammar_advance_stack(rules, stack_after, next_stacks);
+
+ auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
+ for (auto tok : next_rejects) {
+ rejects.push_back({ tok.index, tok.code_points - 1 });
+ }
+
+ return rejects;
+ }
+
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
+ const std::vector<std::vector<llama_grammar_element>> & rules,
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+ const std::vector<llama_grammar_candidate> & candidates) {
+ LLAMA_ASSERT(!stacks.empty()); // REVIEW
+
+ if (candidates.empty()) {
+ return std::vector<llama_grammar_candidate>();
+ }
+
+ auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
+
+ for (size_t i = 1, size = stacks.size(); i < size; ++i) {
+ rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
+ }
+ return rejects;
+ }
+
+ //
+ // grammar - external
+ //
+
+ struct llama_grammar * llama_grammar_init(
+ const llama_grammar_element ** rules,
+ size_t n_rules,
+ size_t start_rule_index) {
+ const llama_grammar_element * pos;
+
+ // copy rule definitions into vectors
+ std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
+ for (size_t i = 0; i < n_rules; i++) {
+ for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
+ vec_rules[i].push_back(*pos);
+ }
+ vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+ }
+
+ // loop over alternates of start rule to build initial stacks
+ std::vector<std::vector<const llama_grammar_element *>> stacks;
+ pos = rules[start_rule_index];
+ do {
+ std::vector<const llama_grammar_element *> stack;
+ if (!llama_grammar_is_end_of_sequence(pos)) {
+ // if alternate is nonempty, add to stack
+ stack.push_back(pos);
+ }
+ llama_grammar_advance_stack(vec_rules, stack, stacks);
+ while (!llama_grammar_is_end_of_sequence(pos)) {
+ // scan to end of alternate def
+ pos++;
+ }
+ if (pos->type == LLAMA_GRETYPE_ALT) {
+ // there's another alternate def of this rule to process
+ pos++;
+ } else {
+ break;
+ }
+ } while (true);
+
+ return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
+ }
+
+ void llama_grammar_free(struct llama_grammar * grammar) {
+ delete grammar;
+ }
+
  //
  // sampling
  //
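The grammar engine above is driven entirely by flat llama_grammar_element arrays; a hand-built sketch of the encoding for a trivial rule root ::= "a" | "b" (the rule itself is an illustration, not something shipped with the gem):

  // illustrative only: one rule with two alternates, terminated by LLAMA_GRETYPE_END
  const llama_grammar_element rule_root[] = {
      { LLAMA_GRETYPE_CHAR, 'a' },
      { LLAMA_GRETYPE_ALT,  0   },
      { LLAMA_GRETYPE_CHAR, 'b' },
      { LLAMA_GRETYPE_END,  0   },
  };
  const llama_grammar_element * rules[] = { rule_root };

  struct llama_grammar * grammar = llama_grammar_init(rules, /*n_rules =*/ 1, /*start_rule_index =*/ 0);
  // ... sample with it (see the hunks below), then release it
  llama_grammar_free(grammar);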
@@ -2200,6 +2539,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
  }
  }

+ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
+ assert(ctx);
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ bool allow_eos = false;
+ for (const auto & stack : grammar->stacks) {
+ if (stack.empty()) {
+ allow_eos = true;
+ break;
+ }
+ }
+
+ const llama_token eos = llama_token_eos();
+
+ std::vector<std::vector<uint32_t>> candidates_decoded;
+ std::vector<llama_grammar_candidate> candidates_grammar;
+
+ for (size_t i = 0; i < candidates->size; ++i) {
+ const llama_token id = candidates->data[i].id;
+ const char * str = llama_token_to_str(ctx, id);
+ if (id == eos) {
+ if (!allow_eos) {
+ candidates->data[i].logit = -INFINITY;
+ }
+ } else if (*str == 0) {
+ candidates->data[i].logit = -INFINITY;
+ } else {
+ candidates_decoded.push_back(decode_utf8(str));
+ candidates_grammar.push_back({ i, candidates_decoded.back().data() });
+ }
+ }
+
+ const auto rejects =
+ llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+ for (auto & reject : rejects) {
+ candidates->data[reject.index].logit = -INFINITY;
+ }
+
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+
  static void llama_log_softmax(float * array, size_t size) {
  float max_l = *std::max_element(array, array + size);
  float sum = 0.f;
@@ -2375,6 +2755,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
  return result;
  }

+ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ if (token == llama_token_eos()) {
+ for (const auto & stack : grammar->stacks) {
+ if (stack.empty()) {
+ return;
+ }
+ }
+ LLAMA_ASSERT(false);
+ }
+
+ const char * str = llama_token_to_str(ctx, token);
+ // Note terminating 0 in decoded string
+ auto code_points = decode_utf8(str);
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+ grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+ }
+ LLAMA_ASSERT(!grammar->stacks.empty());
+
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+
  //
  // quantization
  //
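Together, the two new entry points slot into an ordinary sampling loop: constrain the candidate logits, pick a token, then advance the grammar with the choice. A rough sketch for one step, assuming candidates is an already-filled std::vector<llama_token_data>:

  // illustrative only: grammar-constrained greedy sampling for a single token
  llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

  llama_sample_grammar(ctx, &candidates_p, grammar);        // rejected tokens get logit = -INFINITY
  const llama_token id = llama_sample_token_greedy(ctx, &candidates_p);

  llama_grammar_accept_token(ctx, grammar, id);             // advance the grammar stacks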
@@ -2448,8 +2851,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
- case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
- case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
+ case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+ case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;

  #ifdef GGML_USE_K_QUANTS
  // K-quants
@@ -2533,16 +2936,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
- bool convert_incompatible_tensor = false;
- if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
- quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
- int nx = tensor.ne.at(0);
- int ny = tensor.ne.at(1);
- if (nx % QK_K != 0 || ny % QK_K != 0) {
- fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
- convert_incompatible_tensor = true;
- }
- }
  if (tensor.name == "output.weight") {
  int nx = tensor.ne.at(0);
  int ny = tensor.ne.at(1);
@@ -2568,6 +2961,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  }
+ bool convert_incompatible_tensor = false;
+ if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+ int nx = tensor.ne.at(0);
+ int ny = tensor.ne.at(1);
+ if (nx % QK_K != 0 || ny % QK_K != 0) {
+ fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ convert_incompatible_tensor = true;
+ }
+ }
  if (convert_incompatible_tensor) {
  if (tensor.name == "output.weight") {
  new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
@@ -2594,7 +2997,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  f32_data = (float *) f32_conv_buf.addr;
  }

- printf("quantizing .. ");
+ printf("quantizing to %s .. ", ggml_type_name(new_type));
  fflush(stdout);

  work.resize(nelements * 4); // upper bound on size
@@ -2697,7 +3100,7 @@ struct llama_model * llama_load_model_from_file(

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+ if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
  params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
  memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
  params.progress_callback_user_data)) {
@@ -2775,7 +3178,7 @@ struct llama_context * llama_new_context_with_model(
  ctx->embedding.resize(hparams.n_embd);
  }

- ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());

  ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
@@ -2799,7 +3202,7 @@ struct llama_context * llama_new_context_with_model(

  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

- printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+ fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

  #define LLAMA_METAL_CHECK_BUF(result) \
  if (!(result)) { \