llama_cpp 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -67,6 +67,7 @@ enum e_model {
     MODEL_13B,
     MODEL_30B,
     MODEL_65B,
+    MODEL_70B,
 };
 
 static const size_t kB = 1024;
@@ -98,18 +99,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 }
 
 //
-// memory sizes
+// memory sizes (calculated for n_batch == 512)
 //
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        /* empirical scaling, still a guess */
-        { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB },
-        { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB },
-        { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB },
-        { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB },
-        { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB },
+        { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
+        { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
+        { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
     };
     return k_sizes;
 }
@@ -117,38 +118,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B, 256ull * MB },
-        { MODEL_7B, 512ull * MB },
-        { MODEL_13B, 512ull * MB },
-        { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 1024ull * MB },
+        { MODEL_3B, 128ull * MB },
+        { MODEL_7B, 160ull * MB },
+        { MODEL_13B, 192ull * MB },
+        { MODEL_30B, 256ull * MB },
+        { MODEL_65B, 384ull * MB }, // guess
+        { MODEL_70B, 304ull * MB },
     };
     return k_sizes;
 }
 
-// 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+// used to store the compute graph tensors + non-scratch data
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B, 682ull * MB },
-        { MODEL_7B, 1026ull * MB },
-        { MODEL_13B, 1608ull * MB },
-        { MODEL_30B, 3124ull * MB },
-        { MODEL_65B, 5120ull * MB },
-    };
-    return k_sizes;
-}
-
-// this is mostly needed for temporary mul_mat buffers to dequantize the data
-// not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
-        { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
-        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
-        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
-        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
+        { MODEL_3B, 8ull * MB },
+        { MODEL_7B, 10ull * MB },
+        { MODEL_13B, 12ull * MB },
+        { MODEL_30B, 16ull * MB },
+        { MODEL_65B, 24ull * MB }, // guess
+        { MODEL_70B, 24ull * MB },
     };
     return k_sizes;
 }
@@ -163,6 +152,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_13B, 640ull * kB },
         { MODEL_30B, 768ull * kB },
         { MODEL_65B, 1536ull * kB },
+        { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
     };
     return k_sizes;
 }
@@ -177,19 +167,26 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
         { MODEL_65B, 416ull },
+        { MODEL_70B, 416ull }, // TODO (likely can be reduced)
     };
     return k_sizes;
 }
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab = 32000;
-    uint32_t n_ctx = 512; // this is provided as user input?
-    uint32_t n_embd = 4096;
-    uint32_t n_mult = 256;
-    uint32_t n_head = 32;
-    uint32_t n_layer = 32;
-    uint32_t n_rot = 64;
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx = 512; // this is provided as user input?
+    uint32_t n_embd = 4096;
+    uint32_t n_mult = 256;
+    uint32_t n_head = 32;
+    uint32_t n_head_kv = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot = 64;
+
+    // LLaMAv2
+    // TODO: load from model data hparams
+    float f_ffn_mult = 1.0f;
+    float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 
     float rope_freq_base = 10000.0f;
     float rope_freq_scale = 1.0f;
@@ -197,7 +194,28 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+    }
+
+    uint32_t n_gqa() const {
+        return n_head/n_head_kv;
+    }
+
+    uint32_t n_embd_head() const {
+        return n_embd/n_head;
+    }
+
+    uint32_t n_embd_gqa() const {
+        return n_embd/n_gqa();
+    }
+
+    size_t kv_size() const {
+        size_t result = 2ull;
+        result *= (size_t) n_embd_gqa();
+        result *= (size_t) n_ctx;
+        result *= (size_t) n_layer;
+        result *= sizeof(ggml_fp16_t);
+        return result;
     }
 };
 
@@ -499,12 +517,16 @@ struct llama_file_loader {
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
-        hparams.n_embd = file.read_u32();
-        hparams.n_mult = file.read_u32();
-        hparams.n_head = file.read_u32();
+        hparams.n_embd = file.read_u32();
+        hparams.n_mult = file.read_u32();
+        hparams.n_head = file.read_u32();
         hparams.n_layer = file.read_u32();
-        hparams.n_rot = file.read_u32();
-        hparams.ftype = (enum llama_ftype) file.read_u32();
+        hparams.n_rot = file.read_u32();
+        hparams.ftype = (enum llama_ftype) file.read_u32();
+
+        // LLaMAv2
+        // TODO: read from header
+        hparams.n_head_kv = hparams.n_head;
     }
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
@@ -803,7 +825,7 @@ static bool kv_cache_init(
         ggml_type wtype,
         int n_ctx,
         int n_gpu_layers) {
-    const int n_embd = hparams.n_embd;
+    const int n_embd = hparams.n_embd_gqa();
    const int n_layer = hparams.n_layer;
 
    const int64_t n_mem = n_layer*n_ctx;
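
The switch from hparams.n_embd to hparams.n_embd_gqa() above is what lets grouped-query attention shrink the KV cache: K and V only carry n_head_kv of the n_head heads. A minimal standalone sketch of the same arithmetic as hparams.kv_size(), using illustrative LLaMA-2-70B-style values (n_embd = 8192, n_head = 64, n_head_kv = 8, n_layer = 80, n_ctx = 4096 are assumptions, not values read from this diff):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Mirrors llama_hparams::kv_size() from the hunk above:
    // 2 (K and V) * n_embd_gqa * n_ctx * n_layer * sizeof(fp16).
    static size_t kv_cache_bytes(uint32_t n_embd, uint32_t n_head, uint32_t n_head_kv,
                                 uint32_t n_layer, uint32_t n_ctx) {
        const uint32_t n_gqa      = n_head / n_head_kv;    // grouped-query factor
        const uint32_t n_embd_gqa = n_embd / n_gqa;        // K/V rows per token
        return 2ull * n_embd_gqa * n_ctx * n_layer * 2ull; // 2 bytes per fp16 value
    }

    int main() {
        // illustrative 70B-style hyperparameters (assumed)
        printf("MHA  (n_head_kv == n_head): %.2f MB\n", kv_cache_bytes(8192, 64, 64, 80, 4096) / 1024.0 / 1024.0); // 10240.00 MB
        printf("GQA8 (n_head_kv == 8):      %.2f MB\n", kv_cache_bytes(8192, 64,  8, 80, 4096) / 1024.0 / 1024.0); //  1280.00 MB
        return 0;
    }
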
@@ -847,6 +869,8 @@ struct llama_context_params llama_context_default_params() {
         /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
+        /*.n_gqa =*/ 1,
+        /*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
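
The two new fields default to n_gqa = 1 and rms_norm_eps = LLAMA_DEFAULT_RMS_EPS, so existing callers keep the old behaviour. A hypothetical caller-side sketch (not part of the diff) of how a LLaMA-2 70B GGML file would be loaded until GGUF carries these values in the file header:

    #include "llama.h"

    // the path name below is a placeholder; n_gqa = 8 matches the
    // MODEL_65B && n_gqa == 8 detection later in this diff
    struct llama_model * load_70b(const char * path /* e.g. "llama-2-70b.ggmlv3.q4_0.bin" */) {
        struct llama_context_params params = llama_context_default_params();
        params.n_ctx        = 4096;
        params.n_gqa        = 8;                     // grouped-query attention factor of the 70B model
        params.rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // override here if the checkpoint expects a different epsilon
        return llama_load_model_from_file(path, params);
    }
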
@@ -966,6 +990,7 @@ static const char *llama_model_type_name(e_model type) {
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";
+        case MODEL_70B: return "70B";
         default: LLAMA_ASSERT(false);
     }
 }
@@ -976,6 +1001,8 @@ static void llama_model_load_internal(
         llama_vocab & vocab,
         int n_ctx,
         int n_batch,
+        int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
@@ -997,8 +1024,12 @@ static void llama_model_load_internal(
     model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loader->file_version;
+
     auto & hparams = model.hparams;
 
+    // TODO: read from file
+    hparams.f_rms_norm_eps = rms_norm_eps;
+
     {
         switch (hparams.n_layer) {
             case 26: model.type = e_model::MODEL_3B; break;
@@ -1016,11 +1047,25 @@ static void llama_model_load_internal(
 
         hparams.n_ctx = n_ctx;
 
+        // LLaMAv2
+        // TODO: temporary until GGUF
+        LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
+        hparams.n_head_kv = hparams.n_head / n_gqa;
+        if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+            fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+            model.type = e_model::MODEL_70B;
+            hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
+        }
+
         hparams.rope_freq_base = rope_freq_base;
         hparams.rope_freq_scale = rope_freq_scale;
     }
 
-    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+    // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
+    const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
+    const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
+    const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+    //const uint32_t n_ff = 28672;
 
     {
         fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
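
For reference, a worked example of the new FFN sizing (all inputs assumed here: dim = 8192, ffn_dim_multiplier = 1.3 and multiple_of = 4096 as published for the 70B params.json; the diff itself reads n_embd and n_mult from the model file). It lands exactly on the 28672 noted in the commented-out line:

    #include <cstdint>

    static uint32_t n_ff_example() {
        const uint32_t n_embd    = 8192;                      // assumed 70B embedding width
        const uint32_t n_mult    = 4096;                      // assumed 70B multiple_of
        const uint32_t n_ff_raw  = 2*(4*n_embd)/3;            // = 21845
        const uint32_t n_ff_mult = (uint32_t)(1.3f*n_ff_raw); // = 28398 (f_ffn_mult = 1.3)
        return ((n_ff_mult + n_mult - 1)/n_mult)*n_mult;      // = 28672, the value in the commented-out line
    }
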
@@ -1029,12 +1074,15 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
         fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
         fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
         fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+        fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+        fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
+        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
         fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
         fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
@@ -1069,7 +1117,7 @@ static void llama_model_load_internal(
     {
         model.buf.resize(ctx_size);
         if (use_mlock) {
-            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.init (model.buf.addr);
             model.mlock_buf.grow_to(model.buf.size);
         }
 
@@ -1104,9 +1152,10 @@ static void llama_model_load_internal(
     size_t vram_weights = 0;
     size_t vram_scratch = 0;
     {
-        const uint32_t n_embd = hparams.n_embd;
-        const uint32_t n_layer = hparams.n_layer;
-        const uint32_t n_vocab = hparams.n_vocab;
+        const uint32_t n_embd = hparams.n_embd;
+        const uint32_t n_embd_gqa = hparams.n_embd_gqa();
+        const uint32_t n_layer = hparams.n_layer;
+        const uint32_t n_vocab = hparams.n_vocab;
 
         ml->ggml_ctx = ctx;
 
@@ -1154,16 +1203,16 @@ static void llama_model_load_internal(
 
             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
 
             layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
 
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
@@ -1186,11 +1235,11 @@ static void llama_model_load_internal(
             mmapped_size - vram_weights + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
+            MEM_REQ_EVAL().at(model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
+            scale*hparams.kv_size();
 
         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1231,7 +1280,7 @@ static void llama_model_load_internal(
                 fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
             } else {
                 fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+                vram_kv_cache += hparams.kv_size() / 2;
             }
         }
         if (n_gpu_layers > (int) hparams.n_layer + 2) {
@@ -1239,7 +1288,7 @@ static void llama_model_load_internal(
                 fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
             } else {
                 fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+                vram_kv_cache += hparams.kv_size() / 2;
             }
         }
 #elif defined(GGML_USE_CLBLAST)
@@ -1287,6 +1336,8 @@ static bool llama_model_load(
         llama_vocab & vocab,
         int n_ctx,
         int n_batch,
+        int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
@@ -1300,7 +1351,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1344,16 +1395,23 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
-    const int n_embd = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx = hparams.n_ctx;
-    const int n_head = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot = hparams.n_embd/hparams.n_head;
-    const int n_gpu_layers = model.n_gpu_layers;
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = hparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_vocab = hparams.n_vocab;
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+
+    LLAMA_ASSERT(n_embd_head == hparams.n_rot);
 
     const float freq_base = hparams.rope_freq_base;
     const float freq_scale = hparams.rope_freq_scale;
+    const float rms_norm_eps = hparams.f_rms_norm_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
 
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;
@@ -1366,7 +1424,7 @@ static bool llama_eval_internal(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    ggml_cgraph gf = {};
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
@@ -1431,7 +1489,7 @@ static bool llama_eval_internal(
 
         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             offload_func(cur);
             ggml_set_name(cur, "rms_norm_0");
 
@@ -1452,11 +1510,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
@@ -1468,23 +1526,23 @@ static bool llama_eval_internal(
                 offload_func_v(tmpv);
                 ggml_set_name(tmpv, "tmpv");
 
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
                 offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
                 offload_func_kq(k);
                 ggml_set_name(k, "k");
 
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
                         ( n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
                 offload_func_v(v);
                 ggml_set_name(v, "v");
 
                 // important: storing RoPE-ed version of K in the KV cache!
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }
 
             struct ggml_tensor * Q =
@@ -1497,8 +1555,8 @@ static bool llama_eval_internal(
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
                         ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
+                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
+                            n_embd_head, n_head_kv, n_past + N),
                         0, 2, 1, 3);
             offload_func_kq(K);
             ggml_set_name(K, "K");
@@ -1508,9 +1566,9 @@ static bool llama_eval_internal(
             offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");
 
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
+            // KQ_scaled = KQ / sqrt(n_embd_head)
             struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-            ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+            ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 
             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
@@ -1530,10 +1588,10 @@ static bool llama_eval_internal(
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
-                        n_past + N, n_embd/n_head, n_head,
+                        n_past + N, n_embd_head, n_head_kv,
                         n_ctx*ggml_element_size(kv_self.v),
-                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
+                        n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
             offload_func_v(V);
             ggml_set_name(V, "V");
 
@@ -1545,7 +1603,7 @@ static bool llama_eval_internal(
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
             // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
             // is there a better way?
-            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
 #endif
 
@@ -1579,7 +1637,7 @@ static bool llama_eval_internal(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                 offload_func(cur);
                 ggml_set_name(cur, "rms_norm_1");
 
@@ -1632,7 +1690,7 @@ static bool llama_eval_internal(
 
     // norm
     {
-        cur = ggml_rms_norm(ctx0, inpL);
+        cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         offload_func_nr(cur);
         ggml_set_name(cur, "rms_norm_2");
 
@@ -1654,16 +1712,22 @@ static bool llama_eval_internal(
     //cur = ggml_soft_max_inplace(ctx0, cur);
 
     // run the computation
-    ggml_build_forward_expand(&gf, cur);
+    ggml_build_forward_expand(gf, cur);
+
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
 
 #if GGML_USE_MPI
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        // TODO: disabled until #2413 is resolved
+        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
+        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
+        //}
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
-        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+        ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
     } else {
         // IMPORTANT:
@@ -1682,34 +1746,34 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
 
-        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #else
-    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+    ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif
 
 #if GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif
 
     // update kv token count
     lctx.kv_self.n = n_past + N;
 
-    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
 
     if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        ggml_graph_export(gf, cgraph_fname);
     }
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
-    ggml_graph_print(&gf);
+    ggml_graph_print(gf);
 #endif
 
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
+    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
     //}
 
     // extract logits
@@ -1739,10 +1803,12 @@ static bool llama_eval_internal(
     }
 
 #if 0
-    printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
             ggml_used_mem(ctx0)/1024.0/1024.0,
             lctx.get_buf_max_mem(0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(1)/1024.0/1024.0);
+            lctx.get_buf_max_mem(1)/1024.0/1024.0,
+            lctx.work_buffer.size()/1024.0/1024.0,
+            n_past, N);
 #endif
 
     ggml_free(ctx0);
@@ -1915,6 +1981,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     return output;
 }
 
+//
+// grammar - internal
+//
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>> rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+};
+
+struct llama_grammar_candidate {
+    size_t index;
+    const uint32_t * code_points;
+};
+
+// NOTE: assumes valid utf8 (but checks for overrun)
+// adds a terminating 0 for use as pointer
+std::vector<uint32_t> decode_utf8(const char * src) {
+    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    const char * pos = src;
+    std::vector<uint32_t> code_points;
+    while (*pos != 0) {
+        uint8_t first_byte = static_cast<uint8_t>(*pos);
+        uint8_t highbits = first_byte >> 4;
+        int len = lookup[highbits];
+        uint8_t mask = (1 << (8 - len)) - 1;
+        uint32_t value = first_byte & mask;
+        const char * end = pos + len; // may overrun!
+        ++pos;
+        for ( ; pos < end && *pos != 0; ++pos) {
+            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+        }
+        code_points.push_back(value);
+    }
+    code_points.push_back(0);
+    return code_points;
+}
+
+// returns true iff pos points to the end of one of the definitions of a rule
+static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
+    switch (pos->type) {
+        case LLAMA_GRETYPE_END: return true;
+        case LLAMA_GRETYPE_ALT: return true;
+        default: return false;
+    }
+}
+
+// returns true iff chr satisfies the char range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
+        const llama_grammar_element * pos,
+        const uint32_t chr) {
+
+    bool found = false;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            found = found || (pos->value <= chr && chr <= pos[1].value);
+            pos += 2;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            found = found || pos->value == chr;
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return std::make_pair(found == is_positive_char, pos);
+}
+
+// transforms a grammar pushdown stack into N possible stacks, all ending
+// at a character range (terminal element)
+static void llama_grammar_advance_stack(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<const llama_grammar_element *> & stack,
+        std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
+
+    if (stack.empty()) {
+        new_stacks.push_back(stack);
+        return;
+    }
+
+    const llama_grammar_element * pos = stack.back();
+
+    switch (pos->type) {
+        case LLAMA_GRETYPE_RULE_REF: {
+            const size_t rule_id = static_cast<size_t>(pos->value);
+            const llama_grammar_element * subpos = rules[rule_id].data();
+            do {
+                // init new stack without the top (pos)
+                std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
+                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                    // if this rule ref is followed by another element, add that to stack
+                    new_stack.push_back(pos + 1);
+                }
+                if (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // if alternate is nonempty, add to stack
+                    new_stack.push_back(subpos);
+                }
+                llama_grammar_advance_stack(rules, new_stack, new_stacks);
+                while (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // scan to end of alternate def
+                    subpos++;
+                }
+                if (subpos->type == LLAMA_GRETYPE_ALT) {
+                    // there's another alternate def of this rule to process
+                    subpos++;
+                } else {
+                    break;
+                }
+            } while (true);
+            break;
+        }
+        case LLAMA_GRETYPE_CHAR:
+        case LLAMA_GRETYPE_CHAR_NOT:
+            new_stacks.push_back(stack);
+            break;
+        default:
+            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
+            // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
+            // those
+            LLAMA_ASSERT(false);
+    }
+}
+
+// takes a set of possible pushdown stacks on a grammar, which are required to
+// be positioned at a character range (see `llama_grammar_advance_stack`), and
+// produces the N possible stacks if the given char is accepted at those
+// positions
+static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t chr) {
+
+    std::vector<std::vector<const llama_grammar_element *>> new_stacks;
+
+    for (const auto & stack : stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        auto match = llama_grammar_match_char(stack.back(), chr);
+        if (match.first) {
+            const llama_grammar_element * pos = match.second;
+
+            // update top of stack to next element, if any
+            std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
+            if (!llama_grammar_is_end_of_sequence(pos)) {
+                new_stack.push_back(pos);
+            }
+            llama_grammar_advance_stack(rules, new_stack, new_stacks);
+        }
+    }
+
+    return new_stacks;
+}
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const std::vector<llama_grammar_candidate> & candidates);
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<const llama_grammar_element *> & stack,
+        const std::vector<llama_grammar_candidate> & candidates) {
+
+    std::vector<llama_grammar_candidate> rejects;
+
+    if (stack.empty()) {
+        // accept nothing; EOS is handled elsewhere
+        rejects.insert(rejects.end(), candidates.begin(), candidates.end());
+        return rejects;
+    }
+
+    const llama_grammar_element * stack_pos = stack.back();
+
+    std::vector<llama_grammar_candidate> next_candidates;
+    for (auto tok : candidates) {
+        if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
+            if (tok.code_points[1] != 0) {
+                next_candidates.push_back({ tok.index, tok.code_points + 1 });
+            }
+        } else {
+            rejects.push_back(tok);
+        }
+    }
+
+    auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
+
+    // update top of stack to next element, if any
+    std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
+    if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
+        stack_after.push_back(stack_pos_after);
+    }
+    std::vector<std::vector<const llama_grammar_element *>> next_stacks;
+    llama_grammar_advance_stack(rules, stack_after, next_stacks);
+
+    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
+    for (auto tok : next_rejects) {
+        rejects.push_back({ tok.index, tok.code_points - 1 });
+    }
+
+    return rejects;
+}
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const std::vector<llama_grammar_candidate> & candidates) {
+    LLAMA_ASSERT(!stacks.empty()); // REVIEW
+
+    if (candidates.empty()) {
+        return std::vector<llama_grammar_candidate>();
+    }
+
+    auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
+
+    for (size_t i = 1, size = stacks.size(); i < size; ++i) {
+        rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
+    }
+    return rejects;
+}
+
+//
+// grammar - external
+//
+
+struct llama_grammar * llama_grammar_init(
+        const llama_grammar_element ** rules,
+        size_t n_rules,
+        size_t start_rule_index) {
+    const llama_grammar_element * pos;
+
+    // copy rule definitions into vectors
+    std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
+            vec_rules[i].push_back(*pos);
+        }
+        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+    }
+
+    // loop over alternates of start rule to build initial stacks
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+    pos = rules[start_rule_index];
+    do {
+        std::vector<const llama_grammar_element *> stack;
+        if (!llama_grammar_is_end_of_sequence(pos)) {
+            // if alternate is nonempty, add to stack
+            stack.push_back(pos);
+        }
+        llama_grammar_advance_stack(vec_rules, stack, stacks);
+        while (!llama_grammar_is_end_of_sequence(pos)) {
+            // scan to end of alternate def
+            pos++;
+        }
+        if (pos->type == LLAMA_GRETYPE_ALT) {
+            // there's another alternate def of this rule to process
+            pos++;
+        } else {
+            break;
+        }
+    } while (true);
+
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
+}
+
+void llama_grammar_free(struct llama_grammar * grammar) {
+    delete grammar;
+}
+
 //
 // sampling
 //
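
A hypothetical usage sketch (not part of the diff) tying the new external grammar API together: a one-rule grammar root ::= "y" | "n" built from llama_grammar_element values in the {type, value} layout used above, then applied for a single sampling step. The ctx and candidates arguments are assumed to come from the usual sampling setup:

    #include "llama.h"

    static llama_token sample_yes_no(struct llama_context * ctx, llama_token_data_array * candidates) {
        static const llama_grammar_element rule_root[] = {
            { LLAMA_GRETYPE_CHAR, 'y' }, // first alternate: the single character 'y'
            { LLAMA_GRETYPE_ALT,  0   }, // end of this alternate, another one follows
            { LLAMA_GRETYPE_CHAR, 'n' }, // second alternate: the single character 'n'
            { LLAMA_GRETYPE_END,  0   }, // end of the rule
        };
        static const llama_grammar_element * rules[] = { rule_root };

        struct llama_grammar * grammar = llama_grammar_init(rules, 1, 0);

        llama_sample_grammar(ctx, candidates, grammar);        // mask tokens the grammar cannot accept
        const llama_token tok = llama_sample_token(ctx, candidates);
        llama_grammar_accept_token(ctx, grammar, tok);         // advance the grammar state

        llama_grammar_free(grammar);
        return tok;
    }
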
@@ -2200,6 +2539,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
     }
 }
 
+void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    bool allow_eos = false;
+    for (const auto & stack : grammar->stacks) {
+        if (stack.empty()) {
+            allow_eos = true;
+            break;
+        }
+    }
+
+    const llama_token eos = llama_token_eos();
+
+    std::vector<std::vector<uint32_t>> candidates_decoded;
+    std::vector<llama_grammar_candidate> candidates_grammar;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const llama_token id = candidates->data[i].id;
+        const char * str = llama_token_to_str(ctx, id);
+        if (id == eos) {
+            if (!allow_eos) {
+                candidates->data[i].logit = -INFINITY;
+            }
+        } else if (*str == 0) {
+            candidates->data[i].logit = -INFINITY;
+        } else {
+            candidates_decoded.push_back(decode_utf8(str));
+            candidates_grammar.push_back({ i, candidates_decoded.back().data() });
+        }
+    }
+
+    const auto rejects =
+        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (auto & reject : rejects) {
+        candidates->data[reject.index].logit = -INFINITY;
+    }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 static void llama_log_softmax(float * array, size_t size) {
     float max_l = *std::max_element(array, array + size);
     float sum = 0.f;
@@ -2375,6 +2755,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     return result;
 }
 
+void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    if (token == llama_token_eos()) {
+        for (const auto & stack : grammar->stacks) {
+            if (stack.empty()) {
+                return;
+            }
+        }
+        LLAMA_ASSERT(false);
+    }
+
+    const char * str = llama_token_to_str(ctx, token);
+    // Note terminating 0 in decoded string
+    auto code_points = decode_utf8(str);
+    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+    }
+    LLAMA_ASSERT(!grammar->stacks.empty());
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 //
 // quantization
 //
@@ -2448,8 +2851,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
-        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
+        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
 
 #ifdef GGML_USE_K_QUANTS
         // K-quants
@@ -2533,16 +2936,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            bool convert_incompatible_tensor = false;
-            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
-                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
-                int nx = tensor.ne.at(0);
-                int ny = tensor.ne.at(1);
-                if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
             if (tensor.name == "output.weight") {
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
@@ -2568,6 +2961,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+            bool convert_incompatible_tensor = false;
+            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    convert_incompatible_tensor = true;
+                }
+            }
             if (convert_incompatible_tensor) {
                 if (tensor.name == "output.weight") {
                     new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
@@ -2594,7 +2997,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 f32_data = (float *) f32_conv_buf.addr;
             }
 
-            printf("quantizing .. ");
+            printf("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
             work.resize(nelements * 4); // upper bound on size
@@ -2697,7 +3100,7 @@ struct llama_model * llama_load_model_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
@@ -2775,7 +3178,7 @@ struct llama_context * llama_new_context_with_model(
         ctx->embedding.resize(hparams.n_embd);
     }
 
-    ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
+    ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
 
     ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
     ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
@@ -2799,7 +3202,7 @@ struct llama_context * llama_new_context_with_model(
 
     const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-    printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+    fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \