llama_cpp 0.3.3 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -67,6 +67,7 @@ enum e_model {
     MODEL_13B,
     MODEL_30B,
     MODEL_65B,
+    MODEL_70B,
 };
 
 static const size_t kB = 1024;
@@ -98,17 +99,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 }
 
 //
-// memory sizes
+// memory sizes (calculated for n_batch == 512)
 //
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B, 256ull * MB },
-        { MODEL_7B, 512ull * MB },
-        { MODEL_13B, 512ull * MB },
-        { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 1024ull * MB },
+        { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
+        { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
+        { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
     };
     return k_sizes;
 }
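
The scratch0 buffer requirement is now a linear function of the context size instead of a fixed per-model constant. A minimal standalone sketch (not part of the diff) that evaluates the 7B entry above, assuming MB = 1024*1024 in line with the kB constant in this file:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t MB    = 1024ull * 1024ull;  // assumed to match llama.cpp's MB constant
        const size_t n_ctx = 2048;               // example context length
        const size_t scratch0_7b = ((size_t) n_ctx / 16ull + 100ull) * MB;
        printf("7B scratch0 at n_ctx = 2048: %zu MB\n", scratch0_7b / MB); // prints 228
        return 0;
    }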
@@ -116,38 +118,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B, 256ull * MB },
-        { MODEL_7B, 512ull * MB },
-        { MODEL_13B, 512ull * MB },
-        { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 1024ull * MB },
+        { MODEL_3B, 128ull * MB },
+        { MODEL_7B, 160ull * MB },
+        { MODEL_13B, 192ull * MB },
+        { MODEL_30B, 256ull * MB },
+        { MODEL_65B, 384ull * MB }, // guess
+        { MODEL_70B, 304ull * MB },
     };
     return k_sizes;
 }
 
-// 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B, 682ull * MB },
-        { MODEL_7B, 1026ull * MB },
-        { MODEL_13B, 1608ull * MB },
-        { MODEL_30B, 3124ull * MB },
-        { MODEL_65B, 5120ull * MB },
-    };
-    return k_sizes;
-}
-
-// this is mostly needed for temporary mul_mat buffers to dequantize the data
-// not actually needed if BLAS is disabled
+// used to store the compute graph tensors + non-scratch data
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B, 512ull * MB },
-        { MODEL_7B, 768ull * MB },
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_3B, 8ull * MB },
+        { MODEL_7B, 10ull * MB },
+        { MODEL_13B, 12ull * MB },
+        { MODEL_30B, 16ull * MB },
+        { MODEL_65B, 24ull * MB }, // guess
+        { MODEL_70B, 24ull * MB },
     };
     return k_sizes;
 }
@@ -162,6 +152,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_13B, 640ull * kB },
         { MODEL_30B, 768ull * kB },
         { MODEL_65B, 1536ull * kB },
+        { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
     };
     return k_sizes;
 }
@@ -176,23 +167,55 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
         { MODEL_65B, 416ull },
+        { MODEL_70B, 416ull }, // TODO (likely can be reduced)
     };
     return k_sizes;
 }
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab = 32000;
-    uint32_t n_ctx = 512; // this is provided as user input?
-    uint32_t n_embd = 4096;
-    uint32_t n_mult = 256;
-    uint32_t n_head = 32;
-    uint32_t n_layer = 32;
-    uint32_t n_rot = 64;
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx = 512; // this is provided as user input?
+    uint32_t n_embd = 4096;
+    uint32_t n_mult = 256;
+    uint32_t n_head = 32;
+    uint32_t n_head_kv = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot = 64;
+
+    // LLaMAv2
+    // TODO: load from model data hparams
+    float f_ffn_mult = 1.0f;
+    float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+
+    float rope_freq_base = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+    }
+
+    uint32_t n_gqa() const {
+        return n_head/n_head_kv;
+    }
+
+    uint32_t n_embd_head() const {
+        return n_embd/n_head;
+    }
+
+    uint32_t n_embd_gqa() const {
+        return n_embd/n_gqa();
+    }
+
+    size_t kv_size() const {
+        size_t result = 2ull;
+        result *= (size_t) n_embd_gqa();
+        result *= (size_t) n_ctx;
+        result *= (size_t) n_layer;
+        result *= sizeof(ggml_fp16_t);
+        return result;
     }
 };
 
@@ -303,7 +326,7 @@ struct llama_model {
 };
 
 struct llama_context {
-    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 #ifdef GGML_USE_METAL
     ~llama_context() {
         if (ctx_metal) {
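
The fixed MEM_REQ_KV_SELF table is gone; the KV cache size is now computed from the hyperparameters via the kv_size() helper added to llama_hparams above, which shrinks under grouped-query attention because it uses n_embd_gqa rather than n_embd. A standalone sketch (not part of the diff) of that formula, evaluated with typical LLaMA-2 70B values (n_embd = 8192, n_head = 64, n_head_kv = 8, n_layer = 80 are assumptions here, not values read from the diff):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t n_embd = 8192, n_head = 64, n_head_kv = 8, n_layer = 80, n_ctx = 4096;
        const uint64_t n_gqa      = n_head/n_head_kv; // 8
        const uint64_t n_embd_gqa = n_embd/n_gqa;     // 1024
        // 2 (K and V) * n_embd_gqa * n_ctx * n_layer * sizeof(ggml_fp16_t)
        const uint64_t kv_size = 2ull * n_embd_gqa * n_ctx * n_layer * 2ull;
        printf("kv_size = %llu MB\n", (unsigned long long) (kv_size / (1024*1024))); // prints 1280
        return 0;
    }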
@@ -324,7 +347,6 @@ struct llama_context {
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
     const llama_model & model;
-    const llama_vocab & vocab;
 
     bool model_owner = false;
 
@@ -495,12 +517,16 @@ struct llama_file_loader {
     }
     void read_hparams() {
        hparams.n_vocab = file.read_u32();
-        hparams.n_embd = file.read_u32();
-        hparams.n_mult = file.read_u32();
-        hparams.n_head = file.read_u32();
+        hparams.n_embd = file.read_u32();
+        hparams.n_mult = file.read_u32();
+        hparams.n_head = file.read_u32();
         hparams.n_layer = file.read_u32();
-        hparams.n_rot = file.read_u32();
-        hparams.ftype = (enum llama_ftype) file.read_u32();
+        hparams.n_rot = file.read_u32();
+        hparams.ftype = (enum llama_ftype) file.read_u32();
+
+        // LLaMAv2
+        // TODO: read from header
+        hparams.n_head_kv = hparams.n_head;
     }
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
@@ -551,7 +577,9 @@ struct llama_file_loader {
            }
 
            // skip to the next multiple of 32 bytes
-            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            }
 
            tensor.file_off = file.tell();
            tensor.name = name;
@@ -648,7 +676,7 @@ struct llama_model_loader {
        *ctx_size_p = *mmapped_size_p = 0;
        for (const llama_load_tensor & lt : tensors_map.tensors) {
            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
        }
    }
 
@@ -797,7 +825,7 @@ static bool kv_cache_init(
        ggml_type wtype,
        int n_ctx,
        int n_gpu_layers) {
-    const int n_embd = hparams.n_embd;
+    const int n_embd = hparams.n_embd_gqa();
     const int n_layer = hparams.n_layer;
 
     const int64_t n_mem = n_layer*n_ctx;
@@ -841,9 +869,13 @@ struct llama_context_params llama_context_default_params() {
        /*.seed =*/ LLAMA_DEFAULT_SEED,
        /*.n_ctx =*/ 512,
        /*.n_batch =*/ 512,
+        /*.n_gqa =*/ 1,
+        /*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
        /*.gpu_layers =*/ 0,
        /*.main_gpu =*/ 0,
-        /*.tensor_split =*/ {0},
+        /*.tensor_split =*/ nullptr,
+        /*.rope_freq_base =*/ 10000.0f,
+        /*.rope_freq_scale =*/ 1.0f,
        /*.progress_callback =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.low_vram =*/ false,
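
llama_context_params gains n_gqa, rms_norm_eps, rope_freq_base and rope_freq_scale, and tensor_split becomes a plain pointer. A hedged sketch of how a caller of the bundled llama.h might fill in the new fields; the header name and the concrete values are illustrative assumptions, not taken from the diff:

    #include "llama.h"  // assumed: the header shipped with this package

    static struct llama_context_params make_70b_params() {
        struct llama_context_params params = llama_context_default_params();
        params.n_ctx           = 4096;
        params.n_gqa           = 8;       // grouped-query attention factor (n_head / n_head_kv) for a 70B model
        params.rms_norm_eps    = 1e-5f;   // hypothetical override of LLAMA_DEFAULT_RMS_EPS
        params.rope_freq_base  = 10000.0f;
        params.rope_freq_scale = 0.5f;    // hypothetical linear RoPE scaling for a longer context
        return params;
    }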
@@ -869,6 +901,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
 
+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
@@ -954,6 +990,7 @@ static const char *llama_model_type_name(e_model type) {
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";
+        case MODEL_70B: return "70B";
         default: LLAMA_ASSERT(false);
     }
 }
@@ -964,9 +1001,13 @@ static void llama_model_load_internal(
        llama_vocab & vocab,
        int n_ctx,
        int n_batch,
+        int n_gqa,
+        float rms_norm_eps,
        int n_gpu_layers,
        int main_gpu,
        const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
@@ -983,8 +1024,12 @@ static void llama_model_load_internal(
    model.hparams = ml->file_loader->hparams;
    model.n_gpu_layers = n_gpu_layers;
    llama_file_version file_version = ml->file_loader->file_version;
+
    auto & hparams = model.hparams;
 
+    // TODO: read from file
+    hparams.f_rms_norm_eps = rms_norm_eps;
+
    {
        switch (hparams.n_layer) {
            case 26: model.type = e_model::MODEL_3B; break;
@@ -1001,22 +1046,44 @@ static void llama_model_load_internal(
        }
 
        hparams.n_ctx = n_ctx;
+
+        // LLaMAv2
+        // TODO: temporary until GGUF
+        LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
+        hparams.n_head_kv = hparams.n_head / n_gqa;
+        if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+            fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+            model.type = e_model::MODEL_70B;
+            hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
+        }
+
+        hparams.rope_freq_base = rope_freq_base;
+        hparams.rope_freq_scale = rope_freq_scale;
    }
 
-    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+    // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
+    const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
+    const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
+    const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+    //const uint32_t n_ff = 28672;
 
    {
-        fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
-        fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
-        fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
-        fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
-        fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
-        fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
+        fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
+        fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+        fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+        fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
+        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
+        fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
        fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
-        fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
+        fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
    }
 
    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
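
The feed-forward width is now derived using the ffn_dim_multiplier logic referenced from Meta's model.py instead of a single fixed expression. A standalone sketch (not part of the diff) that evaluates the new formula with typical LLaMA-2 70B values (n_embd = 8192, n_mult = 4096, f_ffn_mult = 1.3 are assumptions here); it reproduces the 28672 noted in the commented-out constant above:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_embd     = 8192;
        const uint32_t n_mult     = 4096;
        const float    f_ffn_mult = 1.3f;

        const uint32_t n_ff_raw  = 2*(4*n_embd)/3;                           // 21845
        const uint32_t n_ff_mult = f_ffn_mult*n_ff_raw;                      // 28398
        const uint32_t n_ff      = ((n_ff_mult + n_mult - 1)/n_mult)*n_mult; // rounded up to a multiple of n_mult

        printf("n_ff = %u\n", n_ff); // prints 28672
        return 0;
    }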
@@ -1050,7 +1117,7 @@ static void llama_model_load_internal(
     {
         model.buf.resize(ctx_size);
         if (use_mlock) {
-            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.init (model.buf.addr);
             model.mlock_buf.grow_to(model.buf.size);
         }
 
@@ -1085,9 +1152,10 @@ static void llama_model_load_internal(
     size_t vram_weights = 0;
     size_t vram_scratch = 0;
     {
-        const uint32_t n_embd = hparams.n_embd;
-        const uint32_t n_layer = hparams.n_layer;
-        const uint32_t n_vocab = hparams.n_vocab;
+        const uint32_t n_embd = hparams.n_embd;
+        const uint32_t n_embd_gqa = hparams.n_embd_gqa();
+        const uint32_t n_layer = hparams.n_layer;
+        const uint32_t n_vocab = hparams.n_vocab;
 
         ml->ggml_ctx = ctx;
 
@@ -1135,16 +1203,16 @@ static void llama_model_load_internal(
 
             layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
 
             layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
 
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
@@ -1165,13 +1233,13 @@ static void llama_model_load_internal(
        const size_t mem_required =
            ctx_size +
            mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at (model.type);
+            MEM_REQ_EVAL().at(model.type);
 
        // this is the memory required by one llama_state
        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
+            scale*hparams.kv_size();
 
        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1212,7 +1280,7 @@ static void llama_model_load_internal(
                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
            } else {
                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+                vram_kv_cache += hparams.kv_size() / 2;
            }
        }
        if (n_gpu_layers > (int) hparams.n_layer + 2) {
@@ -1220,7 +1288,7 @@ static void llama_model_load_internal(
                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
            } else {
                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+                vram_kv_cache += hparams.kv_size() / 2;
            }
        }
 #elif defined(GGML_USE_CLBLAST)
@@ -1268,9 +1336,13 @@ static bool llama_model_load(
        llama_vocab & vocab,
        int n_ctx,
        int n_batch,
+        int n_gqa,
+        float rms_norm_eps,
        int n_gpu_layers,
        int main_gpu,
-        float * tensor_split,
+        const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
@@ -1279,7 +1351,7 @@ static bool llama_model_load(
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
            use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
        return true;
    } catch (const std::exception & err) {
@@ -1323,12 +1395,22 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
-    const int n_embd = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx = hparams.n_ctx;
-    const int n_head = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot = hparams.n_embd/hparams.n_head;
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = hparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_vocab = hparams.n_vocab;
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+
+    LLAMA_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float freq_base = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+    const float rms_norm_eps = hparams.f_rms_norm_eps;
+
     const int n_gpu_layers = model.n_gpu_layers;
 
     auto & mem_per_token = lctx.mem_per_token;
@@ -1342,7 +1424,7 @@ static bool llama_eval_internal(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    ggml_cgraph gf = {};
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
@@ -1407,7 +1489,7 @@ static bool llama_eval_internal(
 
         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             offload_func(cur);
             ggml_set_name(cur, "rms_norm_0");
 
@@ -1428,11 +1510,11 @@ static bool llama_eval_internal(
            offload_func_kq(tmpq);
            ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
            offload_func_kq(Kcur);
            ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
            offload_func_kq(Qcur);
            ggml_set_name(Qcur, "Qcur");
 
@@ -1444,23 +1526,23 @@ static bool llama_eval_internal(
                offload_func_v(tmpv);
                ggml_set_name(tmpv, "tmpv");
 
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
                offload_func_v(Vcur);
                ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
                offload_func_kq(k);
                ggml_set_name(k, "k");
 
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
                        ( n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
                offload_func_v(v);
                ggml_set_name(v, "v");
 
                // important: storing RoPE-ed version of K in the KV cache!
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
            }
 
            struct ggml_tensor * Q =
@@ -1473,8 +1555,8 @@ static bool llama_eval_internal(
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                    ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                        n_embd/n_head, n_head, n_past + N),
+                        ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
+                        n_embd_head, n_head_kv, n_past + N),
                    0, 2, 1, 3);
            offload_func_kq(K);
            ggml_set_name(K, "K");
@@ -1484,9 +1566,9 @@ static bool llama_eval_internal(
            offload_func_kq(KQ);
            ggml_set_name(KQ, "KQ");
 
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
+            // KQ_scaled = KQ / sqrt(n_embd_head)
            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-            ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+            ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
@@ -1506,10 +1588,10 @@ static bool llama_eval_internal(
            // split cached V into n_head heads
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, kv_self.v,
-                    n_past + N, n_embd/n_head, n_head,
+                    n_past + N, n_embd_head, n_head_kv,
                    n_ctx*ggml_element_size(kv_self.v),
-                    n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
-                    il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+                    n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
+                    n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
            offload_func_v(V);
            ggml_set_name(V, "V");
 
@@ -1521,7 +1603,7 @@ static bool llama_eval_internal(
            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
            // is there a better way?
-            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
 #endif
 
@@ -1555,7 +1637,7 @@ static bool llama_eval_internal(
        {
            // norm
            {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                offload_func(cur);
                ggml_set_name(cur, "rms_norm_1");
 
@@ -1608,7 +1690,7 @@ static bool llama_eval_internal(
 
     // norm
     {
-        cur = ggml_rms_norm(ctx0, inpL);
+        cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         offload_func_nr(cur);
         ggml_set_name(cur, "rms_norm_2");
 
@@ -1630,16 +1712,22 @@ static bool llama_eval_internal(
     //cur = ggml_soft_max_inplace(ctx0, cur);
 
     // run the computation
-    ggml_build_forward_expand(&gf, cur);
+    ggml_build_forward_expand(gf, cur);
+
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
 
 #if GGML_USE_MPI
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        // TODO: disabled until #2413 is resolved
+        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
+        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
+        //}
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
-        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+        ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
     } else {
         // IMPORTANT:
@@ -1658,34 +1746,34 @@ static bool llama_eval_internal(
            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
        }
 
-        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #else
-    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+    ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif
 
 #if GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif
 
     // update kv token count
     lctx.kv_self.n = n_past + N;
 
-    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
 
     if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        ggml_graph_export(gf, cgraph_fname);
     }
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
-    ggml_graph_print(&gf);
+    ggml_graph_print(gf);
 #endif
 
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
+    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
     //}
 
     // extract logits
@@ -1715,10 +1803,12 @@ static bool llama_eval_internal(
     }
 
 #if 0
-    printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
            ggml_used_mem(ctx0)/1024.0/1024.0,
            lctx.get_buf_max_mem(0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(1)/1024.0/1024.0);
+            lctx.get_buf_max_mem(1)/1024.0/1024.0,
+            lctx.work_buffer.size()/1024.0/1024.0,
+            n_past, N);
 #endif
 
     ggml_free(ctx0);
@@ -1891,6 +1981,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     return output;
 }
 
+//
+// grammar - internal
+//
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>> rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+};
+
+struct llama_grammar_candidate {
+    size_t index;
+    const uint32_t * code_points;
+};
+
+// NOTE: assumes valid utf8 (but checks for overrun)
+// adds a terminating 0 for use as pointer
+std::vector<uint32_t> decode_utf8(const char * src) {
+    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    const char * pos = src;
+    std::vector<uint32_t> code_points;
+    while (*pos != 0) {
+        uint8_t first_byte = static_cast<uint8_t>(*pos);
+        uint8_t highbits = first_byte >> 4;
+        int len = lookup[highbits];
+        uint8_t mask = (1 << (8 - len)) - 1;
+        uint32_t value = first_byte & mask;
+        const char * end = pos + len; // may overrun!
+        ++pos;
+        for ( ; pos < end && *pos != 0; ++pos) {
+            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+        }
+        code_points.push_back(value);
+    }
+    code_points.push_back(0);
+    return code_points;
+}
+
+// returns true iff pos points to the end of one of the definitions of a rule
+static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
+    switch (pos->type) {
+        case LLAMA_GRETYPE_END: return true;
+        case LLAMA_GRETYPE_ALT: return true;
+        default: return false;
+    }
+}
+
+// returns true iff chr satisfies the char range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
+        const llama_grammar_element * pos,
+        const uint32_t chr) {
+
+    bool found = false;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            found = found || (pos->value <= chr && chr <= pos[1].value);
+            pos += 2;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            found = found || pos->value == chr;
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return std::make_pair(found == is_positive_char, pos);
+}
+
+// transforms a grammar pushdown stack into N possible stacks, all ending
+// at a character range (terminal element)
+static void llama_grammar_advance_stack(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<const llama_grammar_element *> & stack,
+        std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
+
+    if (stack.empty()) {
+        new_stacks.push_back(stack);
+        return;
+    }
+
+    const llama_grammar_element * pos = stack.back();
+
+    switch (pos->type) {
+        case LLAMA_GRETYPE_RULE_REF: {
+            const size_t rule_id = static_cast<size_t>(pos->value);
+            const llama_grammar_element * subpos = rules[rule_id].data();
+            do {
+                // init new stack without the top (pos)
+                std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
+                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                    // if this rule ref is followed by another element, add that to stack
+                    new_stack.push_back(pos + 1);
+                }
+                if (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // if alternate is nonempty, add to stack
+                    new_stack.push_back(subpos);
+                }
+                llama_grammar_advance_stack(rules, new_stack, new_stacks);
+                while (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // scan to end of alternate def
+                    subpos++;
+                }
+                if (subpos->type == LLAMA_GRETYPE_ALT) {
+                    // there's another alternate def of this rule to process
+                    subpos++;
+                } else {
+                    break;
+                }
+            } while (true);
+            break;
+        }
+        case LLAMA_GRETYPE_CHAR:
+        case LLAMA_GRETYPE_CHAR_NOT:
+            new_stacks.push_back(stack);
+            break;
+        default:
+            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
+            // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
+            // those
+            LLAMA_ASSERT(false);
+    }
+}
+
+// takes a set of possible pushdown stacks on a grammar, which are required to
+// be positioned at a character range (see `llama_grammar_advance_stack`), and
+// produces the N possible stacks if the given char is accepted at those
+// positions
+static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t chr) {
+
+    std::vector<std::vector<const llama_grammar_element *>> new_stacks;
+
+    for (const auto & stack : stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        auto match = llama_grammar_match_char(stack.back(), chr);
+        if (match.first) {
+            const llama_grammar_element * pos = match.second;
+
+            // update top of stack to next element, if any
+            std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
+            if (!llama_grammar_is_end_of_sequence(pos)) {
+                new_stack.push_back(pos);
+            }
+            llama_grammar_advance_stack(rules, new_stack, new_stacks);
+        }
+    }
+
+    return new_stacks;
+}
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const std::vector<llama_grammar_candidate> & candidates);
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<const llama_grammar_element *> & stack,
+        const std::vector<llama_grammar_candidate> & candidates) {
+
+    std::vector<llama_grammar_candidate> rejects;
+
+    if (stack.empty()) {
+        // accept nothing; EOS is handled elsewhere
+        rejects.insert(rejects.end(), candidates.begin(), candidates.end());
+        return rejects;
+    }
+
+    const llama_grammar_element * stack_pos = stack.back();
+
+    std::vector<llama_grammar_candidate> next_candidates;
+    for (auto tok : candidates) {
+        if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
+            if (tok.code_points[1] != 0) {
+                next_candidates.push_back({ tok.index, tok.code_points + 1 });
+            }
+        } else {
+            rejects.push_back(tok);
+        }
+    }
+
+    auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
+
+    // update top of stack to next element, if any
+    std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
+    if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
+        stack_after.push_back(stack_pos_after);
+    }
+    std::vector<std::vector<const llama_grammar_element *>> next_stacks;
+    llama_grammar_advance_stack(rules, stack_after, next_stacks);
+
+    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
+    for (auto tok : next_rejects) {
+        rejects.push_back({ tok.index, tok.code_points - 1 });
+    }
+
+    return rejects;
+}
+
+static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const std::vector<llama_grammar_candidate> & candidates) {
+    LLAMA_ASSERT(!stacks.empty()); // REVIEW
+
+    if (candidates.empty()) {
+        return std::vector<llama_grammar_candidate>();
+    }
+
+    auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
+
+    for (size_t i = 1, size = stacks.size(); i < size; ++i) {
+        rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
+    }
+    return rejects;
+}
+
+//
+// grammar - external
+//
+
+struct llama_grammar * llama_grammar_init(
+        const llama_grammar_element ** rules,
+        size_t n_rules,
+        size_t start_rule_index) {
+    const llama_grammar_element * pos;
+
+    // copy rule definitions into vectors
+    std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
+            vec_rules[i].push_back(*pos);
+        }
+        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+    }
+
+    // loop over alternates of start rule to build initial stacks
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+    pos = rules[start_rule_index];
+    do {
+        std::vector<const llama_grammar_element *> stack;
+        if (!llama_grammar_is_end_of_sequence(pos)) {
+            // if alternate is nonempty, add to stack
+            stack.push_back(pos);
+        }
+        llama_grammar_advance_stack(vec_rules, stack, stacks);
+        while (!llama_grammar_is_end_of_sequence(pos)) {
+            // scan to end of alternate def
+            pos++;
+        }
+        if (pos->type == LLAMA_GRETYPE_ALT) {
+            // there's another alternate def of this rule to process
+            pos++;
+        } else {
+            break;
+        }
+    } while (true);
+
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
+}
+
+void llama_grammar_free(struct llama_grammar * grammar) {
+    delete grammar;
+}
+
 //
 // sampling
 //
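
A hedged sketch (not part of the diff) of the flat rule encoding these grammar functions operate on, for the trivial grammar root ::= "a" | "b"; the {type, value} element layout is inferred from llama_grammar_init and llama_grammar_match_char above:

    // assumes the llama_grammar_element type and LLAMA_GRETYPE_* constants from the bundled llama.h
    const llama_grammar_element rule_root[] = {
        { LLAMA_GRETYPE_CHAR, 'a' }, // first alternate: the literal character 'a'
        { LLAMA_GRETYPE_ALT,  0   }, // separator between alternates
        { LLAMA_GRETYPE_CHAR, 'b' }, // second alternate: the literal character 'b'
        { LLAMA_GRETYPE_END,  0   }, // end of the rule
    };
    const llama_grammar_element * rules[] = { rule_root };

    struct llama_grammar * grammar = llama_grammar_init(rules, /*n_rules =*/ 1, /*start_rule_index =*/ 0);
    // ... pass to llama_sample_grammar() and llama_grammar_accept_token() while sampling ...
    llama_grammar_free(grammar);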
@@ -2006,9 +2369,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
 
     // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-    for (float & value : second_derivatives) {
-        value /= second_derivatives_sum;
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }
 
     float cum_sum = 0.0f;
@@ -2167,6 +2539,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
     }
 }
 
+void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    bool allow_eos = false;
+    for (const auto & stack : grammar->stacks) {
+        if (stack.empty()) {
+            allow_eos = true;
+            break;
+        }
+    }
+
+    const llama_token eos = llama_token_eos();
+
+    std::vector<std::vector<uint32_t>> candidates_decoded;
+    std::vector<llama_grammar_candidate> candidates_grammar;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const llama_token id = candidates->data[i].id;
+        const char * str = llama_token_to_str(ctx, id);
+        if (id == eos) {
+            if (!allow_eos) {
+                candidates->data[i].logit = -INFINITY;
+            }
+        } else if (*str == 0) {
+            candidates->data[i].logit = -INFINITY;
+        } else {
+            candidates_decoded.push_back(decode_utf8(str));
+            candidates_grammar.push_back({ i, candidates_decoded.back().data() });
+        }
+    }
+
+    const auto rejects =
+        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (auto & reject : rejects) {
+        candidates->data[reject.index].logit = -INFINITY;
+    }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 static void llama_log_softmax(float * array, size_t size) {
     float max_l = *std::max_element(array, array + size);
     float sum = 0.f;
@@ -2185,9 +2598,8 @@ void llama_sample_classifier_free_guidance(
          struct llama_context * ctx,
        llama_token_data_array * candidates,
          struct llama_context * guidance_ctx,
-        float scale,
-        float smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+        float scale) {
+    int64_t t_start_sample_us = ggml_time_us();
 
     assert(ctx);
     auto n_vocab = llama_n_vocab(ctx);
@@ -2207,16 +2619,7 @@ void llama_sample_classifier_free_guidance(
     for (int i = 0; i < n_vocab; ++i) {
         float logit_guidance = logits_guidance[i];
         float logit_base = logits_base[i];
-        logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
-    }
-
-    llama_log_softmax(logits_guidance, n_vocab);
-
-    for (int i = 0; i < n_vocab; ++i) {
-        float logit_base = logits_base[i];
-        float logit_guidance = logits_guidance[i];
-
-        candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+        candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
     }
 
     if (ctx) {
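
The smooth_factor mixing pass is removed: the guided logit is now written directly into the candidate array in a single step. A minimal sketch (not part of the diff) of the per-logit blend that remains, mirroring the line above:

    // scale == 1.0f reproduces the base logits; larger values push further away from the guidance run
    static float cfg_blend(float logit_base, float logit_guidance, float scale) {
        return scale * (logit_base - logit_guidance) + logit_guidance;
    }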
@@ -2352,6 +2755,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     return result;
 }
 
+void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    if (token == llama_token_eos()) {
+        for (const auto & stack : grammar->stacks) {
+            if (stack.empty()) {
+                return;
+            }
+        }
+        LLAMA_ASSERT(false);
+    }
+
+    const char * str = llama_token_to_str(ctx, token);
+    // Note terminating 0 in decoded string
+    auto code_points = decode_utf8(str);
+    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+    }
+    LLAMA_ASSERT(!grammar->stacks.empty());
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 //
 // quantization
 //
@@ -2425,8 +2851,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
        case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
-        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
+        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
 
 #ifdef GGML_USE_K_QUANTS
        // K-quants
@@ -2510,16 +2936,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        } else {
            new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            bool convert_incompatible_tensor = false;
-            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
-                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
-                int nx = tensor.ne.at(0);
-                int ny = tensor.ne.at(1);
-                if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
            if (tensor.name == "output.weight") {
                int nx = tensor.ne.at(0);
                int ny = tensor.ne.at(1);
@@ -2545,6 +2961,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
            }
+            bool convert_incompatible_tensor = false;
+            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    convert_incompatible_tensor = true;
+                }
+            }
            if (convert_incompatible_tensor) {
                if (tensor.name == "output.weight") {
                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
@@ -2571,7 +2997,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                f32_data = (float *) f32_conv_buf.addr;
            }
 
-            printf("quantizing .. ");
+            printf("quantizing to %s .. ", ggml_type_name(new_type));
            fflush(stdout);
 
            work.resize(nelements * 4); // upper bound on size
@@ -2674,9 +3100,10 @@ struct llama_model * llama_load_model_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-            params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-            params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
+            params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+            memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+            params.progress_callback_user_data)) {
        delete model;
        fprintf(stderr, "%s: failed to load model\n", __func__);
        return nullptr;
@@ -2697,7 +3124,7 @@ struct llama_context * llama_new_context_with_model(
        return nullptr;
    }
 
-    llama_context * ctx = new llama_context(*model, model->vocab);
+    llama_context * ctx = new llama_context(*model);
 
    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
@@ -2751,9 +3178,9 @@ struct llama_context * llama_new_context_with_model(
            ctx->embedding.resize(hparams.n_embd);
        }
 
-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
 
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
    }
 
@@ -2775,7 +3202,7 @@ struct llama_context * llama_new_context_with_model(
 
        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
    if (!(result)) { \
@@ -3535,13 +3962,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     return 0;
 }
 
-int llama_tokenize(
-        struct llama_context * ctx,
+int llama_tokenize_with_model(
+        const struct llama_model * model,
        const char * text,
        llama_token * tokens,
        int n_max_tokens,
        bool add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+    auto res = llama_tokenize(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3555,8 +3982,29 @@ int llama_tokenize(
     return res.size();
 }
 
+int llama_tokenize(
+        struct llama_context * ctx,
+        const char * text,
+        llama_token * tokens,
+        int n_max_tokens,
+        bool add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }
 
 int llama_n_ctx(const struct llama_context * ctx) {
@@ -3567,19 +4015,27 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
-int llama_get_vocab(
-        const struct llama_context * ctx,
+int llama_get_vocab_from_model(
+        const struct llama_model * model,
        const char * * strings,
        float * scores,
        int capacity) {
-    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
     for (int i = 0; i<n; ++i) {
-        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
-        scores[i] = ctx->vocab.id_to_token[i].score;
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
+        scores[i] = model->vocab.id_to_token[i].score;
     }
     return n;
 }
 
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
@@ -3588,12 +4044,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    if (token >= llama_n_vocab(ctx)) {
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+    if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }
 
-    return ctx->vocab.id_to_token[token].tok.c_str();
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }
 
 llama_token llama_token_bos() {