llama_cpp 0.3.3 → 0.3.5

@@ -67,6 +67,7 @@ enum e_model {
67
67
  MODEL_13B,
68
68
  MODEL_30B,
69
69
  MODEL_65B,
70
+ MODEL_70B,
70
71
  };
71
72
 
72
73
  static const size_t kB = 1024;
@@ -98,17 +99,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
98
99
  }
99
100
 
100
101
  //
101
- // memory sizes
102
+ // memory sizes (calculated for n_batch == 512)
102
103
  //
103
104
 
104
- static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
105
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
105
106
  {
106
107
  static std::map<e_model, size_t> k_sizes = {
107
- { MODEL_3B, 256ull * MB },
108
- { MODEL_7B, 512ull * MB },
109
- { MODEL_13B, 512ull * MB },
110
- { MODEL_30B, 512ull * MB },
111
- { MODEL_65B, 1024ull * MB },
108
+ { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
109
+ { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
110
+ { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
111
+ { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
112
+ { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
113
+ { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
112
114
  };
113
115
  return k_sizes;
114
116
  }
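
Scratch buffer 0 is now sized from the context length instead of a fixed per-model constant. As a worked example of the MODEL_7B row above (the arithmetic is illustrative, with MB being the source's 1024*1024 constant):

    ( 512/16 + 100) * MB = 132 MB
    (2048/16 + 100) * MB = 228 MB
    (4096/16 + 100) * MB = 356 MB
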
@@ -116,38 +118,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
116
118
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
117
119
  {
118
120
  static std::map<e_model, size_t> k_sizes = {
119
- { MODEL_3B, 256ull * MB },
120
- { MODEL_7B, 512ull * MB },
121
- { MODEL_13B, 512ull * MB },
122
- { MODEL_30B, 512ull * MB },
123
- { MODEL_65B, 1024ull * MB },
121
+ { MODEL_3B, 128ull * MB },
122
+ { MODEL_7B, 160ull * MB },
123
+ { MODEL_13B, 192ull * MB },
124
+ { MODEL_30B, 256ull * MB },
125
+ { MODEL_65B, 384ull * MB }, // guess
126
+ { MODEL_70B, 304ull * MB },
124
127
  };
125
128
  return k_sizes;
126
129
  }
127
130
 
128
- // 2*n_embd*n_ctx*n_layer*sizeof(float16)
129
- static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
130
- {
131
- static std::map<e_model, size_t> k_sizes = {
132
- { MODEL_3B, 682ull * MB },
133
- { MODEL_7B, 1026ull * MB },
134
- { MODEL_13B, 1608ull * MB },
135
- { MODEL_30B, 3124ull * MB },
136
- { MODEL_65B, 5120ull * MB },
137
- };
138
- return k_sizes;
139
- }
140
-
141
- // this is mostly needed for temporary mul_mat buffers to dequantize the data
142
- // not actually needed if BLAS is disabled
131
+ // used to store the compute graph tensors + non-scratch data
143
132
  static const std::map<e_model, size_t> & MEM_REQ_EVAL()
144
133
  {
145
134
  static std::map<e_model, size_t> k_sizes = {
146
- { MODEL_3B, 512ull * MB },
147
- { MODEL_7B, 768ull * MB },
148
- { MODEL_13B, 1024ull * MB },
149
- { MODEL_30B, 1280ull * MB },
150
- { MODEL_65B, 1536ull * MB },
135
+ { MODEL_3B, 8ull * MB },
136
+ { MODEL_7B, 10ull * MB },
137
+ { MODEL_13B, 12ull * MB },
138
+ { MODEL_30B, 16ull * MB },
139
+ { MODEL_65B, 24ull * MB }, // guess
140
+ { MODEL_70B, 24ull * MB },
151
141
  };
152
142
  return k_sizes;
153
143
  }
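
The fixed MEM_REQ_KV_SELF table is dropped because the KV cache size follows directly from the formula in the removed comment, 2*n_embd*n_ctx*n_layer*sizeof(float16); later hunks compute it on the fly via hparams.kv_size(). A worked example for a 7B model (n_embd = 4096, n_layer = 32; these values are assumptions, not stated in this diff) at the default n_ctx = 512:

    2 * 4096 * 512 * 32 * 2 bytes = 268435456 bytes = 256 MB  (128 MB each for K and V)
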
@@ -162,6 +152,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
162
152
  { MODEL_13B, 640ull * kB },
163
153
  { MODEL_30B, 768ull * kB },
164
154
  { MODEL_65B, 1536ull * kB },
155
+ { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
165
156
  };
166
157
  return k_sizes;
167
158
  }
@@ -176,23 +167,55 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
176
167
  { MODEL_13B, 160ull },
177
168
  { MODEL_30B, 208ull },
178
169
  { MODEL_65B, 416ull },
170
+ { MODEL_70B, 416ull }, // TODO (likely can be reduced)
179
171
  };
180
172
  return k_sizes;
181
173
  }
182
174
 
183
175
  // default hparams (LLaMA 7B)
184
176
  struct llama_hparams {
185
- uint32_t n_vocab = 32000;
186
- uint32_t n_ctx = 512; // this is provided as user input?
187
- uint32_t n_embd = 4096;
188
- uint32_t n_mult = 256;
189
- uint32_t n_head = 32;
190
- uint32_t n_layer = 32;
191
- uint32_t n_rot = 64;
177
+ uint32_t n_vocab = 32000;
178
+ uint32_t n_ctx = 512; // this is provided as user input?
179
+ uint32_t n_embd = 4096;
180
+ uint32_t n_mult = 256;
181
+ uint32_t n_head = 32;
182
+ uint32_t n_head_kv = 32;
183
+ uint32_t n_layer = 32;
184
+ uint32_t n_rot = 64;
185
+
186
+ // LLaMAv2
187
+ // TODO: load from model data hparams
188
+ float f_ffn_mult = 1.0f;
189
+ float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
190
+
191
+ float rope_freq_base = 10000.0f;
192
+ float rope_freq_scale = 1.0f;
193
+
192
194
  enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
193
195
 
194
196
  bool operator!=(const llama_hparams & other) const {
195
- return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
197
+ return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
198
+ }
199
+
200
+ uint32_t n_gqa() const {
201
+ return n_head/n_head_kv;
202
+ }
203
+
204
+ uint32_t n_embd_head() const {
205
+ return n_embd/n_head;
206
+ }
207
+
208
+ uint32_t n_embd_gqa() const {
209
+ return n_embd/n_gqa();
210
+ }
211
+
212
+ size_t kv_size() const {
213
+ size_t result = 2ull;
214
+ result *= (size_t) n_embd_gqa();
215
+ result *= (size_t) n_ctx;
216
+ result *= (size_t) n_layer;
217
+ result *= sizeof(ggml_fp16_t);
218
+ return result;
196
219
  }
197
220
  };
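
The new n_head_kv field and the helpers above are the grouped-query attention (GQA) plumbing: n_gqa() query heads share each K/V head, so the K/V projections and the KV cache shrink by that factor. A sketch with LLaMA-2 70B-style numbers (n_embd = 8192, n_head = 64, n_head_kv = 8, n_layer = 80; these concrete values are assumptions, not read from this diff):

    llama_hparams hp;
    hp.n_embd    = 8192;
    hp.n_head    = 64;
    hp.n_head_kv = 8;     // i.e. n_gqa == 8
    hp.n_layer   = 80;
    hp.n_ctx     = 4096;

    // hp.n_gqa()       == 8    -> 64 query heads grouped over 8 K/V heads
    // hp.n_embd_head() == 128  -> 8192 / 64
    // hp.n_embd_gqa()  == 1024 -> 8192 / 8; wk/wv shrink from 8192x8192 to 8192x1024 (see the tensor-shape hunk below)
    // hp.kv_size()     == 2 * 1024 * 4096 * 80 * sizeof(ggml_fp16_t) bytes = 1280 MB
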
198
221
 
@@ -303,7 +326,7 @@ struct llama_model {
303
326
  };
304
327
 
305
328
  struct llama_context {
306
- llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
329
+ llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
307
330
  #ifdef GGML_USE_METAL
308
331
  ~llama_context() {
309
332
  if (ctx_metal) {
@@ -324,7 +347,6 @@ struct llama_context {
324
347
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
325
348
 
326
349
  const llama_model & model;
327
- const llama_vocab & vocab;
328
350
 
329
351
  bool model_owner = false;
330
352
 
@@ -495,12 +517,16 @@ struct llama_file_loader {
495
517
  }
496
518
  void read_hparams() {
497
519
  hparams.n_vocab = file.read_u32();
498
- hparams.n_embd = file.read_u32();
499
- hparams.n_mult = file.read_u32();
500
- hparams.n_head = file.read_u32();
520
+ hparams.n_embd = file.read_u32();
521
+ hparams.n_mult = file.read_u32();
522
+ hparams.n_head = file.read_u32();
501
523
  hparams.n_layer = file.read_u32();
502
- hparams.n_rot = file.read_u32();
503
- hparams.ftype = (enum llama_ftype) file.read_u32();
524
+ hparams.n_rot = file.read_u32();
525
+ hparams.ftype = (enum llama_ftype) file.read_u32();
526
+
527
+ // LLaMAv2
528
+ // TODO: read from header
529
+ hparams.n_head_kv = hparams.n_head;
504
530
  }
505
531
  void read_vocab() {
506
532
  vocab.id_to_token.resize(hparams.n_vocab);
@@ -551,7 +577,9 @@ struct llama_file_loader {
551
577
  }
552
578
 
553
579
  // skip to the next multiple of 32 bytes
554
- file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
580
+ if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
581
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
582
+ }
555
583
 
556
584
  tensor.file_off = file.tell();
557
585
  tensor.name = name;
@@ -648,7 +676,7 @@ struct llama_model_loader {
648
676
  *ctx_size_p = *mmapped_size_p = 0;
649
677
  for (const llama_load_tensor & lt : tensors_map.tensors) {
650
678
  *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
651
- *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
679
+ *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
652
680
  }
653
681
  }
654
682
 
@@ -797,7 +825,7 @@ static bool kv_cache_init(
797
825
  ggml_type wtype,
798
826
  int n_ctx,
799
827
  int n_gpu_layers) {
800
- const int n_embd = hparams.n_embd;
828
+ const int n_embd = hparams.n_embd_gqa();
801
829
  const int n_layer = hparams.n_layer;
802
830
 
803
831
  const int64_t n_mem = n_layer*n_ctx;
@@ -841,9 +869,13 @@ struct llama_context_params llama_context_default_params() {
841
869
  /*.seed =*/ LLAMA_DEFAULT_SEED,
842
870
  /*.n_ctx =*/ 512,
843
871
  /*.n_batch =*/ 512,
872
+ /*.n_gqa =*/ 1,
873
+ /*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
844
874
  /*.gpu_layers =*/ 0,
845
875
  /*.main_gpu =*/ 0,
846
- /*.tensor_split =*/ {0},
876
+ /*.tensor_split =*/ nullptr,
877
+ /*.rope_freq_base =*/ 10000.0f,
878
+ /*.rope_freq_scale =*/ 1.0f,
847
879
  /*.progress_callback =*/ nullptr,
848
880
  /*.progress_callback_user_data =*/ nullptr,
849
881
  /*.low_vram =*/ false,
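
The new n_gqa, rms_norm_eps, rope_freq_base and rope_freq_scale defaults are forwarded into model loading in the hunks below. A minimal usage sketch against the public API of this version (the model path is made up; 70B files need n_gqa = 8 until the file format stores the value):

    struct llama_context_params params = llama_context_default_params();
    params.n_ctx = 4096;
    params.n_gqa = 8;                 // required for LLaMA-2 70B files until the GGUF format stores it
    // params.rope_freq_scale = 0.5f; // optional linear RoPE scaling (halving the scale roughly doubles usable context)

    struct llama_model   * model = llama_load_model_from_file("models/llama-2-70b.ggmlv3.q4_0.bin", params);
    struct llama_context * ctx   = llama_new_context_with_model(model, params);
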
@@ -869,6 +901,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
869
901
  return result;
870
902
  }
871
903
 
904
+ int llama_max_devices() {
905
+ return LLAMA_MAX_DEVICES;
906
+ }
907
+
872
908
  bool llama_mmap_supported() {
873
909
  return llama_mmap::SUPPORTED;
874
910
  }
@@ -954,6 +990,7 @@ static const char *llama_model_type_name(e_model type) {
954
990
  case MODEL_13B: return "13B";
955
991
  case MODEL_30B: return "30B";
956
992
  case MODEL_65B: return "65B";
993
+ case MODEL_70B: return "70B";
957
994
  default: LLAMA_ASSERT(false);
958
995
  }
959
996
  }
@@ -964,9 +1001,13 @@ static void llama_model_load_internal(
964
1001
  llama_vocab & vocab,
965
1002
  int n_ctx,
966
1003
  int n_batch,
1004
+ int n_gqa,
1005
+ float rms_norm_eps,
967
1006
  int n_gpu_layers,
968
1007
  int main_gpu,
969
1008
  const float * tensor_split,
1009
+ float rope_freq_base,
1010
+ float rope_freq_scale,
970
1011
  bool low_vram,
971
1012
  ggml_type memory_type,
972
1013
  bool use_mmap,
@@ -983,8 +1024,12 @@ static void llama_model_load_internal(
983
1024
  model.hparams = ml->file_loader->hparams;
984
1025
  model.n_gpu_layers = n_gpu_layers;
985
1026
  llama_file_version file_version = ml->file_loader->file_version;
1027
+
986
1028
  auto & hparams = model.hparams;
987
1029
 
1030
+ // TODO: read from file
1031
+ hparams.f_rms_norm_eps = rms_norm_eps;
1032
+
988
1033
  {
989
1034
  switch (hparams.n_layer) {
990
1035
  case 26: model.type = e_model::MODEL_3B; break;
@@ -1001,22 +1046,44 @@ static void llama_model_load_internal(
1001
1046
  }
1002
1047
 
1003
1048
  hparams.n_ctx = n_ctx;
1049
+
1050
+ // LLaMAv2
1051
+ // TODO: temporary until GGUF
1052
+ LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
1053
+ hparams.n_head_kv = hparams.n_head / n_gqa;
1054
+ if (model.type == e_model::MODEL_65B && n_gqa == 8) {
1055
+ fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
1056
+ model.type = e_model::MODEL_70B;
1057
+ hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
1058
+ }
1059
+
1060
+ hparams.rope_freq_base = rope_freq_base;
1061
+ hparams.rope_freq_scale = rope_freq_scale;
1004
1062
  }
1005
1063
 
1006
- const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
1064
+ // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
1065
+ const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
1066
+ const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
1067
+ const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
1068
+ //const uint32_t n_ff = 28672;
1007
1069
 
1008
1070
  {
1009
- fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
1010
- fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
1011
- fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
1012
- fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
1013
- fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
1014
- fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
1015
- fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
1016
- fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
1071
+ fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
1072
+ fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
1073
+ fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
1074
+ fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
1075
+ fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
1076
+ fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
1077
+ fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
1078
+ fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
1079
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
1080
+ fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
1081
+ fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
1082
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
1083
+ fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
1084
+ fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1017
1085
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
1018
- fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
1019
- fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
1086
+ fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
1020
1087
  }
1021
1088
 
1022
1089
  if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
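
The feed-forward width now follows the referenced model.py recipe, applying the LLaMA-2 FFN multiplier before rounding up to a multiple of n_mult. Working the 70B case through by hand (assuming n_embd = 8192, n_mult = 4096 and f_ffn_mult = 1.3, which come from Meta's 70B params.json rather than from this diff):

    n_ff_raw  = 2*(4*8192)/3                   = 21845
    n_ff_mult = 1.3 * 21845                    = 28398   (truncated to uint32_t)
    n_ff      = ((28398 + 4096 - 1)/4096)*4096 = 28672   (matches the commented-out constant above)
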
@@ -1050,7 +1117,7 @@ static void llama_model_load_internal(
1050
1117
  {
1051
1118
  model.buf.resize(ctx_size);
1052
1119
  if (use_mlock) {
1053
- model.mlock_buf.init(model.buf.addr);
1120
+ model.mlock_buf.init (model.buf.addr);
1054
1121
  model.mlock_buf.grow_to(model.buf.size);
1055
1122
  }
1056
1123
 
@@ -1085,9 +1152,10 @@ static void llama_model_load_internal(
1085
1152
  size_t vram_weights = 0;
1086
1153
  size_t vram_scratch = 0;
1087
1154
  {
1088
- const uint32_t n_embd = hparams.n_embd;
1089
- const uint32_t n_layer = hparams.n_layer;
1090
- const uint32_t n_vocab = hparams.n_vocab;
1155
+ const uint32_t n_embd = hparams.n_embd;
1156
+ const uint32_t n_embd_gqa = hparams.n_embd_gqa();
1157
+ const uint32_t n_layer = hparams.n_layer;
1158
+ const uint32_t n_vocab = hparams.n_vocab;
1091
1159
 
1092
1160
  ml->ggml_ctx = ctx;
1093
1161
 
@@ -1135,16 +1203,16 @@ static void llama_model_load_internal(
1135
1203
 
1136
1204
  layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
1137
1205
 
1138
- layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
1139
- layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
1140
- layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
1141
- layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
1206
+ layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
1207
+ layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
1208
+ layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
1209
+ layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
1142
1210
 
1143
1211
  layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
1144
1212
 
1145
- layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
1146
- layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
1147
- layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
1213
+ layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
1214
+ layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
1215
+ layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
1148
1216
 
1149
1217
  if (backend == GGML_BACKEND_GPU) {
1150
1218
  vram_weights +=
@@ -1165,13 +1233,13 @@ static void llama_model_load_internal(
1165
1233
  const size_t mem_required =
1166
1234
  ctx_size +
1167
1235
  mmapped_size - vram_weights + // weights in VRAM not in memory
1168
- MEM_REQ_SCRATCH0().at(model.type) +
1236
+ MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
1169
1237
  MEM_REQ_SCRATCH1().at(model.type) +
1170
- MEM_REQ_EVAL().at (model.type);
1238
+ MEM_REQ_EVAL().at(model.type);
1171
1239
 
1172
1240
  // this is the memory required by one llama_state
1173
1241
  const size_t mem_required_state =
1174
- scale*MEM_REQ_KV_SELF().at(model.type);
1242
+ scale*hparams.kv_size();
1175
1243
 
1176
1244
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
1177
1245
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1212,7 +1280,7 @@ static void llama_model_load_internal(
1212
1280
  fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
1213
1281
  } else {
1214
1282
  fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
1215
- vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1283
+ vram_kv_cache += hparams.kv_size() / 2;
1216
1284
  }
1217
1285
  }
1218
1286
  if (n_gpu_layers > (int) hparams.n_layer + 2) {
@@ -1220,7 +1288,7 @@ static void llama_model_load_internal(
1220
1288
  fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
1221
1289
  } else {
1222
1290
  fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
1223
- vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1291
+ vram_kv_cache += hparams.kv_size() / 2;
1224
1292
  }
1225
1293
  }
1226
1294
  #elif defined(GGML_USE_CLBLAST)
@@ -1268,9 +1336,13 @@ static bool llama_model_load(
1268
1336
  llama_vocab & vocab,
1269
1337
  int n_ctx,
1270
1338
  int n_batch,
1339
+ int n_gqa,
1340
+ float rms_norm_eps,
1271
1341
  int n_gpu_layers,
1272
1342
  int main_gpu,
1273
- float * tensor_split,
1343
+ const float * tensor_split,
1344
+ float rope_freq_base,
1345
+ float rope_freq_scale,
1274
1346
  bool low_vram,
1275
1347
  ggml_type memory_type,
1276
1348
  bool use_mmap,
@@ -1279,7 +1351,7 @@ static bool llama_model_load(
1279
1351
  llama_progress_callback progress_callback,
1280
1352
  void *progress_callback_user_data) {
1281
1353
  try {
1282
- llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
1354
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
1283
1355
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1284
1356
  return true;
1285
1357
  } catch (const std::exception & err) {
@@ -1323,12 +1395,22 @@ static bool llama_eval_internal(
1323
1395
 
1324
1396
  LLAMA_ASSERT(!!kv_self.ctx);
1325
1397
 
1326
- const int n_embd = hparams.n_embd;
1327
- const int n_layer = hparams.n_layer;
1328
- const int n_ctx = hparams.n_ctx;
1329
- const int n_head = hparams.n_head;
1330
- const int n_vocab = hparams.n_vocab;
1331
- const int n_rot = hparams.n_embd/hparams.n_head;
1398
+ const int64_t n_embd = hparams.n_embd;
1399
+ const int64_t n_layer = hparams.n_layer;
1400
+ const int64_t n_ctx = hparams.n_ctx;
1401
+ const int64_t n_head = hparams.n_head;
1402
+ const int64_t n_head_kv = hparams.n_head_kv;
1403
+ const int64_t n_embd_head = hparams.n_embd_head();
1404
+ const int64_t n_vocab = hparams.n_vocab;
1405
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
1406
+
1407
+
1408
+ LLAMA_ASSERT(n_embd_head == hparams.n_rot);
1409
+
1410
+ const float freq_base = hparams.rope_freq_base;
1411
+ const float freq_scale = hparams.rope_freq_scale;
1412
+ const float rms_norm_eps = hparams.f_rms_norm_eps;
1413
+
1332
1414
  const int n_gpu_layers = model.n_gpu_layers;
1333
1415
 
1334
1416
  auto & mem_per_token = lctx.mem_per_token;
@@ -1342,7 +1424,7 @@ static bool llama_eval_internal(
1342
1424
 
1343
1425
  struct ggml_context * ctx0 = ggml_init(params);
1344
1426
 
1345
- ggml_cgraph gf = {};
1427
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
1346
1428
 
1347
1429
  // for big prompts, if BLAS is enabled, it is better to use only one thread
1348
1430
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
@@ -1407,7 +1489,7 @@ static bool llama_eval_internal(
1407
1489
 
1408
1490
  // norm
1409
1491
  {
1410
- cur = ggml_rms_norm(ctx0, inpL);
1492
+ cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
1411
1493
  offload_func(cur);
1412
1494
  ggml_set_name(cur, "rms_norm_0");
1413
1495
 
@@ -1428,11 +1510,11 @@ static bool llama_eval_internal(
1428
1510
  offload_func_kq(tmpq);
1429
1511
  ggml_set_name(tmpq, "tmpq");
1430
1512
 
1431
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
1513
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
1432
1514
  offload_func_kq(Kcur);
1433
1515
  ggml_set_name(Kcur, "Kcur");
1434
1516
 
1435
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
1517
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
1436
1518
  offload_func_kq(Qcur);
1437
1519
  ggml_set_name(Qcur, "Qcur");
1438
1520
 
@@ -1444,23 +1526,23 @@ static bool llama_eval_internal(
1444
1526
  offload_func_v(tmpv);
1445
1527
  ggml_set_name(tmpv, "tmpv");
1446
1528
 
1447
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
1529
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
1448
1530
  offload_func_v(Vcur);
1449
1531
  ggml_set_name(Vcur, "Vcur");
1450
1532
 
1451
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
1533
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
1452
1534
  offload_func_kq(k);
1453
1535
  ggml_set_name(k, "k");
1454
1536
 
1455
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
1537
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
1456
1538
  ( n_ctx)*ggml_element_size(kv_self.v),
1457
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
1539
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
1458
1540
  offload_func_v(v);
1459
1541
  ggml_set_name(v, "v");
1460
1542
 
1461
1543
  // important: storing RoPE-ed version of K in the KV cache!
1462
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
1463
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
1544
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
1545
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
1464
1546
  }
1465
1547
 
1466
1548
  struct ggml_tensor * Q =
@@ -1473,8 +1555,8 @@ static bool llama_eval_internal(
1473
1555
  struct ggml_tensor * K =
1474
1556
  ggml_permute(ctx0,
1475
1557
  ggml_reshape_3d(ctx0,
1476
- ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
1477
- n_embd/n_head, n_head, n_past + N),
1558
+ ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
1559
+ n_embd_head, n_head_kv, n_past + N),
1478
1560
  0, 2, 1, 3);
1479
1561
  offload_func_kq(K);
1480
1562
  ggml_set_name(K, "K");
@@ -1484,9 +1566,9 @@ static bool llama_eval_internal(
1484
1566
  offload_func_kq(KQ);
1485
1567
  ggml_set_name(KQ, "KQ");
1486
1568
 
1487
- // KQ_scaled = KQ / sqrt(n_embd/n_head)
1569
+ // KQ_scaled = KQ / sqrt(n_embd_head)
1488
1570
  struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
1489
- ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
1571
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
1490
1572
 
1491
1573
  // KQ_scaled shape [n_past + N, N, n_head, 1]
1492
1574
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
@@ -1506,10 +1588,10 @@ static bool llama_eval_internal(
1506
1588
  // split cached V into n_head heads
1507
1589
  struct ggml_tensor * V =
1508
1590
  ggml_view_3d(ctx0, kv_self.v,
1509
- n_past + N, n_embd/n_head, n_head,
1591
+ n_past + N, n_embd_head, n_head_kv,
1510
1592
  n_ctx*ggml_element_size(kv_self.v),
1511
- n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
1512
- il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
1593
+ n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
1594
+ n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
1513
1595
  offload_func_v(V);
1514
1596
  ggml_set_name(V, "V");
1515
1597
 
@@ -1521,7 +1603,7 @@ static bool llama_eval_internal(
1521
1603
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
1522
1604
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
1523
1605
  // is there a better way?
1524
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
1606
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
1525
1607
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
1526
1608
  #endif
1527
1609
 
@@ -1555,7 +1637,7 @@ static bool llama_eval_internal(
1555
1637
  {
1556
1638
  // norm
1557
1639
  {
1558
- cur = ggml_rms_norm(ctx0, inpFF);
1640
+ cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
1559
1641
  offload_func(cur);
1560
1642
  ggml_set_name(cur, "rms_norm_1");
1561
1643
 
@@ -1608,7 +1690,7 @@ static bool llama_eval_internal(
1608
1690
 
1609
1691
  // norm
1610
1692
  {
1611
- cur = ggml_rms_norm(ctx0, inpL);
1693
+ cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
1612
1694
  offload_func_nr(cur);
1613
1695
  ggml_set_name(cur, "rms_norm_2");
1614
1696
 
@@ -1630,16 +1712,22 @@ static bool llama_eval_internal(
1630
1712
  //cur = ggml_soft_max_inplace(ctx0, cur);
1631
1713
 
1632
1714
  // run the computation
1633
- ggml_build_forward_expand(&gf, cur);
1715
+ ggml_build_forward_expand(gf, cur);
1716
+
1717
+ // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
1634
1718
 
1635
1719
  #if GGML_USE_MPI
1636
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
1720
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
1637
1721
  #endif
1638
1722
 
1639
1723
  #ifdef GGML_USE_METAL
1640
1724
  if (lctx.ctx_metal && N == 1) {
1725
+ // TODO: disabled until #2413 is resolved
1726
+ //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
1727
+ // ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
1728
+ //}
1641
1729
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
1642
- ggml_metal_graph_compute(lctx.ctx_metal, &gf);
1730
+ ggml_metal_graph_compute(lctx.ctx_metal, gf);
1643
1731
  ggml_metal_get_tensor (lctx.ctx_metal, cur);
1644
1732
  } else {
1645
1733
  // IMPORTANT:
@@ -1658,34 +1746,34 @@ static bool llama_eval_internal(
1658
1746
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
1659
1747
  }
1660
1748
 
1661
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
1749
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
1662
1750
  }
1663
1751
  #else
1664
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
1752
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
1665
1753
  #endif
1666
1754
 
1667
1755
  #if GGML_USE_MPI
1668
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
1756
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
1669
1757
  #endif
1670
1758
 
1671
1759
  // update kv token count
1672
1760
  lctx.kv_self.n = n_past + N;
1673
1761
 
1674
- struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
1762
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
1675
1763
 
1676
1764
  if (cgraph_fname) {
1677
- ggml_graph_export(&gf, cgraph_fname);
1765
+ ggml_graph_export(gf, cgraph_fname);
1678
1766
  }
1679
1767
 
1680
1768
  #ifdef GGML_PERF
1681
1769
  // print timing information per ggml operation (for debugging purposes)
1682
1770
  // requires GGML_PERF to be defined
1683
- ggml_graph_print(&gf);
1771
+ ggml_graph_print(gf);
1684
1772
  #endif
1685
1773
 
1686
1774
  // plot the computation graph in dot format (for debugging purposes)
1687
1775
  //if (n_past%100 == 0) {
1688
- // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
1776
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
1689
1777
  //}
1690
1778
 
1691
1779
  // extract logits
@@ -1715,10 +1803,12 @@ static bool llama_eval_internal(
1715
1803
  }
1716
1804
 
1717
1805
  #if 0
1718
- printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
1806
+ printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
1719
1807
  ggml_used_mem(ctx0)/1024.0/1024.0,
1720
1808
  lctx.get_buf_max_mem(0)/1024.0/1024.0,
1721
- lctx.get_buf_max_mem(1)/1024.0/1024.0);
1809
+ lctx.get_buf_max_mem(1)/1024.0/1024.0,
1810
+ lctx.work_buffer.size()/1024.0/1024.0,
1811
+ n_past, N);
1722
1812
  #endif
1723
1813
 
1724
1814
  ggml_free(ctx0);
@@ -1891,6 +1981,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
1891
1981
  return output;
1892
1982
  }
1893
1983
 
1984
+ //
1985
+ // grammar - internal
1986
+ //
1987
+
1988
+ struct llama_grammar {
1989
+ const std::vector<std::vector<llama_grammar_element>> rules;
1990
+ std::vector<std::vector<const llama_grammar_element *>> stacks;
1991
+ };
1992
+
1993
+ struct llama_grammar_candidate {
1994
+ size_t index;
1995
+ const uint32_t * code_points;
1996
+ };
1997
+
1998
+ // NOTE: assumes valid utf8 (but checks for overrun)
1999
+ // adds a terminating 0 for use as pointer
2000
+ std::vector<uint32_t> decode_utf8(const char * src) {
2001
+ static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
2002
+ const char * pos = src;
2003
+ std::vector<uint32_t> code_points;
2004
+ while (*pos != 0) {
2005
+ uint8_t first_byte = static_cast<uint8_t>(*pos);
2006
+ uint8_t highbits = first_byte >> 4;
2007
+ int len = lookup[highbits];
2008
+ uint8_t mask = (1 << (8 - len)) - 1;
2009
+ uint32_t value = first_byte & mask;
2010
+ const char * end = pos + len; // may overrun!
2011
+ ++pos;
2012
+ for ( ; pos < end && *pos != 0; ++pos) {
2013
+ value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
2014
+ }
2015
+ code_points.push_back(value);
2016
+ }
2017
+ code_points.push_back(0);
2018
+ return code_points;
2019
+ }
2020
+
2021
+ // returns true iff pos points to the end of one of the definitions of a rule
2022
+ static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
2023
+ switch (pos->type) {
2024
+ case LLAMA_GRETYPE_END: return true;
2025
+ case LLAMA_GRETYPE_ALT: return true;
2026
+ default: return false;
2027
+ }
2028
+ }
2029
+
2030
+ // returns true iff chr satisfies the char range at pos (regular or inverse range)
2031
+ // asserts that pos is pointing to a char range element
2032
+ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
2033
+ const llama_grammar_element * pos,
2034
+ const uint32_t chr) {
2035
+
2036
+ bool found = false;
2037
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
2038
+ LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
2039
+
2040
+ do {
2041
+ if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
2042
+ // inclusive range, e.g. [a-z]
2043
+ found = found || (pos->value <= chr && chr <= pos[1].value);
2044
+ pos += 2;
2045
+ } else {
2046
+ // exact char match, e.g. [a] or "a"
2047
+ found = found || pos->value == chr;
2048
+ pos += 1;
2049
+ }
2050
+ } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
2051
+
2052
+ return std::make_pair(found == is_positive_char, pos);
2053
+ }
2054
+
2055
+ // transforms a grammar pushdown stack into N possible stacks, all ending
2056
+ // at a character range (terminal element)
2057
+ static void llama_grammar_advance_stack(
2058
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2059
+ const std::vector<const llama_grammar_element *> & stack,
2060
+ std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
2061
+
2062
+ if (stack.empty()) {
2063
+ new_stacks.push_back(stack);
2064
+ return;
2065
+ }
2066
+
2067
+ const llama_grammar_element * pos = stack.back();
2068
+
2069
+ switch (pos->type) {
2070
+ case LLAMA_GRETYPE_RULE_REF: {
2071
+ const size_t rule_id = static_cast<size_t>(pos->value);
2072
+ const llama_grammar_element * subpos = rules[rule_id].data();
2073
+ do {
2074
+ // init new stack without the top (pos)
2075
+ std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
2076
+ if (!llama_grammar_is_end_of_sequence(pos + 1)) {
2077
+ // if this rule ref is followed by another element, add that to stack
2078
+ new_stack.push_back(pos + 1);
2079
+ }
2080
+ if (!llama_grammar_is_end_of_sequence(subpos)) {
2081
+ // if alternate is nonempty, add to stack
2082
+ new_stack.push_back(subpos);
2083
+ }
2084
+ llama_grammar_advance_stack(rules, new_stack, new_stacks);
2085
+ while (!llama_grammar_is_end_of_sequence(subpos)) {
2086
+ // scan to end of alternate def
2087
+ subpos++;
2088
+ }
2089
+ if (subpos->type == LLAMA_GRETYPE_ALT) {
2090
+ // there's another alternate def of this rule to process
2091
+ subpos++;
2092
+ } else {
2093
+ break;
2094
+ }
2095
+ } while (true);
2096
+ break;
2097
+ }
2098
+ case LLAMA_GRETYPE_CHAR:
2099
+ case LLAMA_GRETYPE_CHAR_NOT:
2100
+ new_stacks.push_back(stack);
2101
+ break;
2102
+ default:
2103
+ // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
2104
+ // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
2105
+ // those
2106
+ LLAMA_ASSERT(false);
2107
+ }
2108
+ }
2109
+
2110
+ // takes a set of possible pushdown stacks on a grammar, which are required to
2111
+ // be positioned at a character range (see `llama_grammar_advance_stack`), and
2112
+ // produces the N possible stacks if the given char is accepted at those
2113
+ // positions
2114
+ static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
2115
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2116
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
2117
+ const uint32_t chr) {
2118
+
2119
+ std::vector<std::vector<const llama_grammar_element *>> new_stacks;
2120
+
2121
+ for (const auto & stack : stacks) {
2122
+ if (stack.empty()) {
2123
+ continue;
2124
+ }
2125
+
2126
+ auto match = llama_grammar_match_char(stack.back(), chr);
2127
+ if (match.first) {
2128
+ const llama_grammar_element * pos = match.second;
2129
+
2130
+ // update top of stack to next element, if any
2131
+ std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
2132
+ if (!llama_grammar_is_end_of_sequence(pos)) {
2133
+ new_stack.push_back(pos);
2134
+ }
2135
+ llama_grammar_advance_stack(rules, new_stack, new_stacks);
2136
+ }
2137
+ }
2138
+
2139
+ return new_stacks;
2140
+ }
2141
+
2142
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
2143
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2144
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
2145
+ const std::vector<llama_grammar_candidate> & candidates);
2146
+
2147
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
2148
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2149
+ const std::vector<const llama_grammar_element *> & stack,
2150
+ const std::vector<llama_grammar_candidate> & candidates) {
2151
+
2152
+ std::vector<llama_grammar_candidate> rejects;
2153
+
2154
+ if (stack.empty()) {
2155
+ // accept nothing; EOS is handled elsewhere
2156
+ rejects.insert(rejects.end(), candidates.begin(), candidates.end());
2157
+ return rejects;
2158
+ }
2159
+
2160
+ const llama_grammar_element * stack_pos = stack.back();
2161
+
2162
+ std::vector<llama_grammar_candidate> next_candidates;
2163
+ for (auto tok : candidates) {
2164
+ if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
2165
+ if (tok.code_points[1] != 0) {
2166
+ next_candidates.push_back({ tok.index, tok.code_points + 1 });
2167
+ }
2168
+ } else {
2169
+ rejects.push_back(tok);
2170
+ }
2171
+ }
2172
+
2173
+ auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
2174
+
2175
+ // update top of stack to next element, if any
2176
+ std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
2177
+ if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
2178
+ stack_after.push_back(stack_pos_after);
2179
+ }
2180
+ std::vector<std::vector<const llama_grammar_element *>> next_stacks;
2181
+ llama_grammar_advance_stack(rules, stack_after, next_stacks);
2182
+
2183
+ auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
2184
+ for (auto tok : next_rejects) {
2185
+ rejects.push_back({ tok.index, tok.code_points - 1 });
2186
+ }
2187
+
2188
+ return rejects;
2189
+ }
2190
+
2191
+ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
2192
+ const std::vector<std::vector<llama_grammar_element>> & rules,
2193
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
2194
+ const std::vector<llama_grammar_candidate> & candidates) {
2195
+ LLAMA_ASSERT(!stacks.empty()); // REVIEW
2196
+
2197
+ if (candidates.empty()) {
2198
+ return std::vector<llama_grammar_candidate>();
2199
+ }
2200
+
2201
+ auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
2202
+
2203
+ for (size_t i = 1, size = stacks.size(); i < size; ++i) {
2204
+ rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
2205
+ }
2206
+ return rejects;
2207
+ }
2208
+
2209
+ //
2210
+ // grammar - external
2211
+ //
2212
+
2213
+ struct llama_grammar * llama_grammar_init(
2214
+ const llama_grammar_element ** rules,
2215
+ size_t n_rules,
2216
+ size_t start_rule_index) {
2217
+ const llama_grammar_element * pos;
2218
+
2219
+ // copy rule definitions into vectors
2220
+ std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
2221
+ for (size_t i = 0; i < n_rules; i++) {
2222
+ for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
2223
+ vec_rules[i].push_back(*pos);
2224
+ }
2225
+ vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
2226
+ }
2227
+
2228
+ // loop over alternates of start rule to build initial stacks
2229
+ std::vector<std::vector<const llama_grammar_element *>> stacks;
2230
+ pos = rules[start_rule_index];
2231
+ do {
2232
+ std::vector<const llama_grammar_element *> stack;
2233
+ if (!llama_grammar_is_end_of_sequence(pos)) {
2234
+ // if alternate is nonempty, add to stack
2235
+ stack.push_back(pos);
2236
+ }
2237
+ llama_grammar_advance_stack(vec_rules, stack, stacks);
2238
+ while (!llama_grammar_is_end_of_sequence(pos)) {
2239
+ // scan to end of alternate def
2240
+ pos++;
2241
+ }
2242
+ if (pos->type == LLAMA_GRETYPE_ALT) {
2243
+ // there's another alternate def of this rule to process
2244
+ pos++;
2245
+ } else {
2246
+ break;
2247
+ }
2248
+ } while (true);
2249
+
2250
+ return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
2251
+ }
2252
+
2253
+ void llama_grammar_free(struct llama_grammar * grammar) {
2254
+ delete grammar;
2255
+ }
2256
+
1894
2257
  //
1895
2258
  // sampling
1896
2259
  //
@@ -2006,9 +2369,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
2006
2369
  }
2007
2370
 
2008
2371
  // Normalize the second derivatives
2009
- float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
2010
- for (float & value : second_derivatives) {
2011
- value /= second_derivatives_sum;
2372
+ {
2373
+ const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
2374
+
2375
+ if (second_derivatives_sum > 1e-6f) {
2376
+ for (float & value : second_derivatives) {
2377
+ value /= second_derivatives_sum;
2378
+ }
2379
+ } else {
2380
+ for (float & value : second_derivatives) {
2381
+ value = 1.0f / second_derivatives.size();
2382
+ }
2383
+ }
2012
2384
  }
2013
2385
 
2014
2386
  float cum_sum = 0.0f;
@@ -2167,6 +2539,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
2167
2539
  }
2168
2540
  }
2169
2541
 
2542
+ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
2543
+ assert(ctx);
2544
+ const int64_t t_start_sample_us = ggml_time_us();
2545
+
2546
+ bool allow_eos = false;
2547
+ for (const auto & stack : grammar->stacks) {
2548
+ if (stack.empty()) {
2549
+ allow_eos = true;
2550
+ break;
2551
+ }
2552
+ }
2553
+
2554
+ const llama_token eos = llama_token_eos();
2555
+
2556
+ std::vector<std::vector<uint32_t>> candidates_decoded;
2557
+ std::vector<llama_grammar_candidate> candidates_grammar;
2558
+
2559
+ for (size_t i = 0; i < candidates->size; ++i) {
2560
+ const llama_token id = candidates->data[i].id;
2561
+ const char * str = llama_token_to_str(ctx, id);
2562
+ if (id == eos) {
2563
+ if (!allow_eos) {
2564
+ candidates->data[i].logit = -INFINITY;
2565
+ }
2566
+ } else if (*str == 0) {
2567
+ candidates->data[i].logit = -INFINITY;
2568
+ } else {
2569
+ candidates_decoded.push_back(decode_utf8(str));
2570
+ candidates_grammar.push_back({ i, candidates_decoded.back().data() });
2571
+ }
2572
+ }
2573
+
2574
+ const auto rejects =
2575
+ llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
2576
+ for (auto & reject : rejects) {
2577
+ candidates->data[reject.index].logit = -INFINITY;
2578
+ }
2579
+
2580
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2581
+ }
2582
+
2170
2583
  static void llama_log_softmax(float * array, size_t size) {
2171
2584
  float max_l = *std::max_element(array, array + size);
2172
2585
  float sum = 0.f;
@@ -2185,9 +2598,8 @@ void llama_sample_classifier_free_guidance(
2185
2598
  struct llama_context * ctx,
2186
2599
  llama_token_data_array * candidates,
2187
2600
  struct llama_context * guidance_ctx,
2188
- float scale,
2189
- float smooth_factor) {
2190
- int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
2601
+ float scale) {
2602
+ int64_t t_start_sample_us = ggml_time_us();
2191
2603
 
2192
2604
  assert(ctx);
2193
2605
  auto n_vocab = llama_n_vocab(ctx);
@@ -2207,16 +2619,7 @@ void llama_sample_classifier_free_guidance(
2207
2619
  for (int i = 0; i < n_vocab; ++i) {
2208
2620
  float logit_guidance = logits_guidance[i];
2209
2621
  float logit_base = logits_base[i];
2210
- logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
2211
- }
2212
-
2213
- llama_log_softmax(logits_guidance, n_vocab);
2214
-
2215
- for (int i = 0; i < n_vocab; ++i) {
2216
- float logit_base = logits_base[i];
2217
- float logit_guidance = logits_guidance[i];
2218
-
2219
- candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
2622
+ candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
2220
2623
  }
2221
2624
 
2222
2625
  if (ctx) {
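
With smooth_factor removed, classifier-free guidance reduces to a single interpolation in logit space; restating the assignment above:

    logit = logit_guidance + scale * (logit_base - logit_guidance)

so scale = 1 leaves the base logits untouched, and for example scale = 1.5 with logit_base = 2.0, logit_guidance = 0.5 gives 0.5 + 1.5*1.5 = 2.75, pushing further away from the guidance distribution.
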
@@ -2352,6 +2755,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
2352
2755
  return result;
2353
2756
  }
2354
2757
 
2758
+ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
2759
+ const int64_t t_start_sample_us = ggml_time_us();
2760
+
2761
+ if (token == llama_token_eos()) {
2762
+ for (const auto & stack : grammar->stacks) {
2763
+ if (stack.empty()) {
2764
+ return;
2765
+ }
2766
+ }
2767
+ LLAMA_ASSERT(false);
2768
+ }
2769
+
2770
+ const char * str = llama_token_to_str(ctx, token);
2771
+ // Note terminating 0 in decoded string
2772
+ auto code_points = decode_utf8(str);
2773
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
2774
+ grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
2775
+ }
2776
+ LLAMA_ASSERT(!grammar->stacks.empty());
2777
+
2778
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2779
+ }
2780
+
2355
2781
  //
2356
2782
  // quantization
2357
2783
  //
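
Tying the two new entry points together: a sampling step first masks candidates the grammar cannot accept, samples a token, then advances the grammar with it. A sketch (the candidate-array setup is the usual llama.cpp pattern, assumed here rather than shown in this diff):

    std::vector<llama_token_data> cand;
    const float * logits = llama_get_logits(ctx);
    const int n_vocab = llama_n_vocab(ctx);
    cand.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cand.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array cand_p = { cand.data(), cand.size(), false };

    llama_sample_grammar(ctx, &cand_p, grammar);            // sets rejected tokens' logits to -INFINITY
    const llama_token tok = llama_sample_token(ctx, &cand_p);
    llama_grammar_accept_token(ctx, grammar, tok);           // advance the grammar stacks with the chosen token
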
@@ -2425,8 +2851,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2425
2851
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
2426
2852
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
2427
2853
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
2428
- case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
2429
- case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
2854
+ case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
2855
+ case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
2430
2856
 
2431
2857
  #ifdef GGML_USE_K_QUANTS
2432
2858
  // K-quants
@@ -2510,16 +2936,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2510
2936
  } else {
2511
2937
  new_type = quantized_type;
2512
2938
  #ifdef GGML_USE_K_QUANTS
2513
- bool convert_incompatible_tensor = false;
2514
- if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
2515
- quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
2516
- int nx = tensor.ne.at(0);
2517
- int ny = tensor.ne.at(1);
2518
- if (nx % QK_K != 0 || ny % QK_K != 0) {
2519
- fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
2520
- convert_incompatible_tensor = true;
2521
- }
2522
- }
2523
2939
  if (tensor.name == "output.weight") {
2524
2940
  int nx = tensor.ne.at(0);
2525
2941
  int ny = tensor.ne.at(1);
@@ -2545,6 +2961,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2545
2961
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2546
2962
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2547
2963
  }
2964
+ bool convert_incompatible_tensor = false;
2965
+ if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
2966
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
2967
+ int nx = tensor.ne.at(0);
2968
+ int ny = tensor.ne.at(1);
2969
+ if (nx % QK_K != 0 || ny % QK_K != 0) {
2970
+ fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
2971
+ convert_incompatible_tensor = true;
2972
+ }
2973
+ }
2548
2974
  if (convert_incompatible_tensor) {
2549
2975
  if (tensor.name == "output.weight") {
2550
2976
  new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
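
Moving the k-quant compatibility check after the per-tensor type adjustments means it now validates the type that will actually be written. The check itself is a divisibility test against the k-quant super-block size QK_K (256 in the default build); for example, a hypothetical 3200-wide row fails it:

    3200 % 256 = 128  (!= 0)  ->  convert_incompatible_tensor = true, taking the fallback path above instead of failing
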
@@ -2571,7 +2997,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2571
2997
  f32_data = (float *) f32_conv_buf.addr;
2572
2998
  }
2573
2999
 
2574
- printf("quantizing .. ");
3000
+ printf("quantizing to %s .. ", ggml_type_name(new_type));
2575
3001
  fflush(stdout);
2576
3002
 
2577
3003
  work.resize(nelements * 4); // upper bound on size
@@ -2674,9 +3100,10 @@ struct llama_model * llama_load_model_from_file(
2674
3100
 
2675
3101
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2676
3102
 
2677
- if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
2678
- params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
2679
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
3103
+ if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
3104
+ params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
3105
+ memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
3106
+ params.progress_callback_user_data)) {
2680
3107
  delete model;
2681
3108
  fprintf(stderr, "%s: failed to load model\n", __func__);
2682
3109
  return nullptr;
@@ -2697,7 +3124,7 @@ struct llama_context * llama_new_context_with_model(
2697
3124
  return nullptr;
2698
3125
  }
2699
3126
 
2700
- llama_context * ctx = new llama_context(*model, model->vocab);
3127
+ llama_context * ctx = new llama_context(*model);
2701
3128
 
2702
3129
  if (params.seed == LLAMA_DEFAULT_SEED) {
2703
3130
  params.seed = time(NULL);
@@ -2751,9 +3178,9 @@ struct llama_context * llama_new_context_with_model(
2751
3178
  ctx->embedding.resize(hparams.n_embd);
2752
3179
  }
2753
3180
 
2754
- ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
3181
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
2755
3182
 
2756
- ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
3183
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
2757
3184
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
2758
3185
  }
2759
3186
 
@@ -2775,7 +3202,7 @@ struct llama_context * llama_new_context_with_model(
2775
3202
 
2776
3203
  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
2777
3204
 
2778
- printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
3205
+ fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
2779
3206
 
2780
3207
  #define LLAMA_METAL_CHECK_BUF(result) \
2781
3208
  if (!(result)) { \
@@ -3535,13 +3962,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
3535
3962
  return 0;
3536
3963
  }
3537
3964
 
3538
- int llama_tokenize(
3539
- struct llama_context * ctx,
3965
+ int llama_tokenize_with_model(
3966
+ const struct llama_model * model,
3540
3967
  const char * text,
3541
3968
  llama_token * tokens,
3542
3969
  int n_max_tokens,
3543
3970
  bool add_bos) {
3544
- auto res = llama_tokenize(ctx->vocab, text, add_bos);
3971
+ auto res = llama_tokenize(model->vocab, text, add_bos);
3545
3972
 
3546
3973
  if (n_max_tokens < (int) res.size()) {
3547
3974
  fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3555,8 +3982,29 @@ int llama_tokenize(
3555
3982
  return res.size();
3556
3983
  }
3557
3984
 
3985
+ int llama_tokenize(
3986
+ struct llama_context * ctx,
3987
+ const char * text,
3988
+ llama_token * tokens,
3989
+ int n_max_tokens,
3990
+ bool add_bos) {
3991
+ return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
3992
+ }
3993
+
3994
+ int llama_n_vocab_from_model(const struct llama_model * model) {
3995
+ return model->vocab.id_to_token.size();
3996
+ }
3997
+
3998
+ int llama_n_ctx_from_model(const struct llama_model * model) {
3999
+ return model->hparams.n_ctx;
4000
+ }
4001
+
4002
+ int llama_n_embd_from_model(const struct llama_model * model) {
4003
+ return model->hparams.n_embd;
4004
+ }
4005
+
3558
4006
  int llama_n_vocab(const struct llama_context * ctx) {
3559
- return ctx->vocab.id_to_token.size();
4007
+ return ctx->model.vocab.id_to_token.size();
3560
4008
  }
3561
4009
 
3562
4010
  int llama_n_ctx(const struct llama_context * ctx) {
@@ -3567,19 +4015,27 @@ int llama_n_embd(const struct llama_context * ctx) {
3567
4015
  return ctx->model.hparams.n_embd;
3568
4016
  }
3569
4017
 
3570
- int llama_get_vocab(
3571
- const struct llama_context * ctx,
4018
+ int llama_get_vocab_from_model(
4019
+ const struct llama_model * model,
3572
4020
  const char * * strings,
3573
4021
  float * scores,
3574
4022
  int capacity) {
3575
- int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
4023
+ int n = std::min(capacity, (int) model->vocab.id_to_token.size());
3576
4024
  for (int i = 0; i<n; ++i) {
3577
- strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
3578
- scores[i] = ctx->vocab.id_to_token[i].score;
4025
+ strings[i] = model->vocab.id_to_token[i].tok.c_str();
4026
+ scores[i] = model->vocab.id_to_token[i].score;
3579
4027
  }
3580
4028
  return n;
3581
4029
  }
3582
4030
 
4031
+ int llama_get_vocab(
4032
+ const struct llama_context * ctx,
4033
+ const char * * strings,
4034
+ float * scores,
4035
+ int capacity) {
4036
+ return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
4037
+ }
4038
+
3583
4039
  float * llama_get_logits(struct llama_context * ctx) {
3584
4040
  return ctx->logits.data();
3585
4041
  }
@@ -3588,12 +4044,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
3588
4044
  return ctx->embedding.data();
3589
4045
  }
3590
4046
 
3591
- const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
3592
- if (token >= llama_n_vocab(ctx)) {
4047
+ const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
4048
+ if (token >= llama_n_vocab_from_model(model)) {
3593
4049
  return nullptr;
3594
4050
  }
3595
4051
 
3596
- return ctx->vocab.id_to_token[token].tok.c_str();
4052
+ return model->vocab.id_to_token[token].tok.c_str();
4053
+ }
4054
+
4055
+ const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
4056
+ return llama_token_to_str_with_model(&ctx->model, token);
3597
4057
  }
3598
4058
 
3599
4059
  llama_token llama_token_bos() {