llama_cpp 0.3.2 → 0.3.4

@@ -19,6 +19,9 @@
  #ifdef GGML_USE_METAL
  #include "ggml-metal.h"
  #endif
+ #ifdef GGML_USE_MPI
+ #include "ggml-mpi.h"
+ #endif
  #ifdef GGML_USE_K_QUANTS
  #ifndef QK_K
  #ifdef GGML_QKK_64
@@ -79,14 +82,34 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
  (void) tensor;
  }

- static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+ //
+ // ggml helpers
+ //
+
+ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+ if (plan.work_size > 0) {
+ buf.resize(plan.work_size);
+ plan.work_data = buf.data();
+ }
+
+ ggml_graph_compute(graph, &plan);
+ }
+
+ //
+ // memory sizes
+ //
+
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
  {
  static std::map<e_model, size_t> k_sizes = {
- { MODEL_3B, 256ull * MB },
- { MODEL_7B, 512ull * MB },
- { MODEL_13B, 512ull * MB },
- { MODEL_30B, 512ull * MB },
- { MODEL_65B, 1024ull * MB },
+ /* empirical scaling, still a guess */
+ { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB },
+ { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB },
+ { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB },
+ { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB },
+ { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB },
  };
  return k_sizes;
  }
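
Note: the new ggml_graph_compute_helper() reflects ggml's two-step compute API used throughout this release: the thread count no longer lives in ggml_cgraph.n_threads; instead a ggml_cplan is built per graph and the caller supplies the plan's work buffer. A minimal sketch of the same pattern (assuming a graph gf already built with ggml_build_forward_expand; buf plays the role of llama_context::work_buffer):

    std::vector<uint8_t> buf;                                  // reusable, caller-owned work buffer
    struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);  // per-graph plan replaces gf.n_threads
    if (plan.work_size > 0) {
        buf.resize(plan.work_size);                            // grow only when this graph needs scratch
        plan.work_data = buf.data();
    }
    ggml_graph_compute(&gf, &plan);                            // was ggml_graph_compute(ctx0, &gf) in 0.3.2
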
@@ -118,14 +141,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
- static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
  {
  static std::map<e_model, size_t> k_sizes = {
- { MODEL_3B, 512ull * MB },
- { MODEL_7B, 768ull * MB },
- { MODEL_13B, 1024ull * MB },
- { MODEL_30B, 1280ull * MB },
- { MODEL_65B, 1536ull * MB },
+ { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
+ { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
+ { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+ { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
+ { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
  };
  return k_sizes;
  }
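
Note: both the scratch and eval buffers now scale with the context length instead of being fixed. A quick worked example (my own arithmetic, not from the diff) for a 7B model at n_ctx = 2048:

    // MB is 1024*1024 in llama.cpp
    size_t scratch0 = 2048 / 16  + 256;   // = 384 MB (0.3.2 used a fixed 512 MB)
    size_t eval     = 2048 / 256 + 768;   // = 776 MB (0.3.2 used a fixed 768 MB)
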
@@ -167,6 +190,10 @@ struct llama_hparams {
  uint32_t n_head = 32;
  uint32_t n_layer = 32;
  uint32_t n_rot = 64;
+
+ float rope_freq_base = 10000.0f;
+ float rope_freq_scale = 1.0f;
+
  enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

  bool operator!=(const llama_hparams & other) const {
@@ -281,7 +308,7 @@ struct llama_model {
  };

  struct llama_context {
- llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+ llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
  #ifdef GGML_USE_METAL
  ~llama_context() {
  if (ctx_metal) {
@@ -302,7 +329,6 @@ struct llama_context {
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

  const llama_model & model;
- const llama_vocab & vocab;

  bool model_owner = false;

@@ -321,6 +347,9 @@ struct llama_context {
  // input embedding (1-dimensional array: [n_embd])
  std::vector<float> embedding;

+ // reusable buffer for `struct ggml_graph_plan.work_data`
+ std::vector<uint8_t> work_buffer;
+
  // memory buffers used to evaluate the model
  // TODO: move in llama_state
  llama_ctx_buffer buf_compute;
@@ -330,6 +359,10 @@ struct llama_context {
  ggml_metal_context * ctx_metal = NULL;
  #endif

+ #ifdef GGML_USE_MPI
+ ggml_mpi_context * ctx_mpi = NULL;
+ #endif
+
  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -522,7 +555,9 @@ struct llama_file_loader {
  }

  // skip to the next multiple of 32 bytes
- file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+ if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+ }

  tensor.file_off = file.tell();
  tensor.name = name;
@@ -619,7 +654,7 @@ struct llama_model_loader {
  *ctx_size_p = *mmapped_size_p = 0;
  for (const llama_load_tensor & lt : tensors_map.tensors) {
  *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
- *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+ *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
  }
  }

@@ -758,7 +793,6 @@ struct llama_model_loader {

  };

-
  //
  // kv cache
  //
@@ -815,7 +849,9 @@ struct llama_context_params llama_context_default_params() {
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
- /*.tensor_split =*/ {0},
+ /*.tensor_split =*/ nullptr,
+ /*.rope_freq_base =*/ 10000.0f,
+ /*.rope_freq_scale =*/ 1.0f,
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
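
Note: rope_freq_base and rope_freq_scale expose RoPE frequency scaling (used by extended-context fine-tunes) through the public context parameters. A hedged usage sketch — the 4096-token context, the 0.5 scale factor and the model path are illustrative assumptions, not values from this diff:

    struct llama_context_params params = llama_context_default_params();
    params.n_ctx           = 4096;
    params.rope_freq_base  = 10000.0f;  // default base frequency
    params.rope_freq_scale = 0.5f;      // linear scaling: compress positions 2x for a 2x longer context
    struct llama_model   * model = llama_load_model_from_file("/path/to/model.bin", params);  // placeholder path
    struct llama_context * ctx   = llama_new_context_with_model(model, params);
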
@@ -841,6 +877,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  return result;
  }

+ int llama_max_devices() {
+ return LLAMA_MAX_DEVICES;
+ }
+
  bool llama_mmap_supported() {
  return llama_mmap::SUPPORTED;
  }
@@ -849,7 +889,7 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

- void llama_init_backend(bool numa) {
+ void llama_backend_init(bool numa) {
  ggml_time_init();

  // needed to initialize f16 tables
@@ -862,6 +902,16 @@ void llama_init_backend(bool numa) {
  if (numa) {
  ggml_numa_init();
  }
+
+ #ifdef GGML_USE_MPI
+ ggml_mpi_backend_init();
+ #endif
+ }
+
+ void llama_backend_free() {
+ #ifdef GGML_USE_MPI
+ ggml_mpi_backend_free();
+ #endif
  }

  int64_t llama_time_us() {
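
Note the rename: llama_init_backend() is now llama_backend_init(), and a matching llama_backend_free() was added so the optional MPI backend can be torn down. A minimal lifecycle sketch:

    llama_backend_init(false);   // numa = false; this call was llama_init_backend() in 0.3.2
    // ... load models, create contexts, evaluate ...
    llama_backend_free();        // new; a no-op unless the library was built with GGML_USE_MPI
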
@@ -929,6 +979,8 @@ static void llama_model_load_internal(
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
+ float rope_freq_base,
+ float rope_freq_scale,
  bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
@@ -963,22 +1015,27 @@ static void llama_model_load_internal(
  }

  hparams.n_ctx = n_ctx;
+
+ hparams.rope_freq_base = rope_freq_base;
+ hparams.rope_freq_scale = rope_freq_scale;
  }

  const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

  {
- fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
- fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
- fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
- fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
- fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
- fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
- fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
- fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+ fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
+ fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+ fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+ fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
+ fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
+ fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+ fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+ fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+ fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
- fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
- fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
+ fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

  if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1127,9 +1184,9 @@ static void llama_model_load_internal(
  const size_t mem_required =
  ctx_size +
  mmapped_size - vram_weights + // weights in VRAM not in memory
- MEM_REQ_SCRATCH0().at(model.type) +
+ MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
  MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL().at (model.type);
+ MEM_REQ_EVAL(hparams.n_ctx).at(model.type);

  // this is the memory required by one llama_state
  const size_t mem_required_state =
@@ -1232,7 +1289,9 @@ static bool llama_model_load(
  int n_batch,
  int n_gpu_layers,
  int main_gpu,
- float * tensor_split,
+ const float * tensor_split,
+ float rope_freq_base,
+ float rope_freq_scale,
  bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
@@ -1241,7 +1300,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1263,18 +1322,16 @@ static bool llama_eval_internal(
  llama_context & lctx,
  const llama_token * tokens,
  const float * embd,
- const int n_tokens,
- const int n_past,
- const int n_threads,
+ int n_tokens,
+ int n_past,
+ int n_threads,
  const char * cgraph_fname) {

  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

- // enforce that the first token is BOS
- if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
- fprintf(stderr, "%s: first token must be BOS\n", __func__);
- return false;
- }
+ #ifdef GGML_USE_MPI
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+ #endif

  const int64_t t_start_us = ggml_time_us();

@@ -1295,6 +1352,9 @@ static bool llama_eval_internal(
  const int n_rot = hparams.n_embd/hparams.n_head;
  const int n_gpu_layers = model.n_gpu_layers;

+ const float freq_base = hparams.rope_freq_base;
+ const float freq_scale = hparams.rope_freq_scale;
+
  auto & mem_per_token = lctx.mem_per_token;
  auto & buf_compute = lctx.buf_compute;

@@ -1306,20 +1366,26 @@ static bool llama_eval_internal(

  struct ggml_context * ctx0 = ggml_init(params);

+ ggml_cgraph gf = {};
+
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

  if (tokens) {
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_set_name(embd, "embd");
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ ggml_set_name(inp_tokens, "inp_tokens");
+
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
  } else {
+ #ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+ #endif
+
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
  }
@@ -1337,18 +1403,20 @@ static bool llama_eval_internal(
  offload_func_t offload_func_v = llama_nop;

  #ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers;
- }
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers;
+ }
  #endif // GGML_USE_CUBLAS

  for (int il = 0; il < n_layer; ++il) {
+ ggml_format_name(inpL, "layer_inp_%d", il);
+
  offload_func_t offload_func = llama_nop;

  #ifdef GGML_USE_CUBLAS
@@ -1384,11 +1452,11 @@ static bool llama_eval_internal(
  offload_func_kq(tmpq);
  ggml_set_name(tmpq, "tmpq");

- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
  offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");

- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
  offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");

@@ -1555,7 +1623,6 @@ static bool llama_eval_internal(

  // input for next layer
  inpL = cur;
-
  }

  lctx.use_buf(ctx0, 0);
@@ -1563,7 +1630,6 @@ static bool llama_eval_internal(

  // used at the end to optionally extract the embeddings
  struct ggml_tensor * embeddings = NULL;
-

  // norm
  {
@@ -1578,7 +1644,6 @@ static bool llama_eval_internal(
  embeddings = cur;
  }

-
  // lm_head
  cur = ggml_mul_mat(ctx0, model.output, cur);
  ggml_set_name(cur, "result_output");
@@ -1591,8 +1656,13 @@ static bool llama_eval_internal(
  // run the computation
  ggml_build_forward_expand(&gf, cur);

+ #if GGML_USE_MPI
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+ #endif
+
  #ifdef GGML_USE_METAL
  if (lctx.ctx_metal && N == 1) {
+ ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
  ggml_metal_graph_compute(lctx.ctx_metal, &gf);
  ggml_metal_get_tensor (lctx.ctx_metal, cur);
  } else {
@@ -1612,12 +1682,21 @@ static bool llama_eval_internal(
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
  }

- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
  }
  #else
- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+ #endif
+
+ #if GGML_USE_MPI
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
  #endif

+ // update kv token count
+ lctx.kv_self.n = n_past + N;
+
+ struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
  if (cgraph_fname) {
  ggml_graph_export(&gf, cgraph_fname);
  }
@@ -1633,23 +1712,17 @@ static bool llama_eval_internal(
  // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
  //}

- //embd_w.resize(n_vocab*N);
- //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
- // update kv token count
- lctx.kv_self.n = n_past + N;
-
  // extract logits
  {
  auto & logits_out = lctx.logits;

  if (lctx.logits_all) {
  logits_out.resize(n_vocab * N);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
  } else {
  // return result for just the last token
  logits_out.resize(n_vocab);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
  }
  }

@@ -1957,9 +2030,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
  }

  // Normalize the second derivatives
- float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
- for (float & value : second_derivatives) {
- value /= second_derivatives_sum;
+ {
+ const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+ if (second_derivatives_sum > 1e-6f) {
+ for (float & value : second_derivatives) {
+ value /= second_derivatives_sum;
+ }
+ } else {
+ for (float & value : second_derivatives) {
+ value = 1.0f / second_derivatives.size();
+ }
+ }
  }

  float cum_sum = 0.0f;
@@ -2118,6 +2200,52 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
  }
  }

+ static void llama_log_softmax(float * array, size_t size) {
+ float max_l = *std::max_element(array, array + size);
+ float sum = 0.f;
+ for (size_t i = 0; i < size; ++i) {
+ float p = expf(array[i] - max_l);
+ sum += p;
+ array[i] = p;
+ }
+
+ for (size_t i = 0; i < size; ++i) {
+ array[i] = logf(array[i] / sum);
+ }
+ }
+
+ void llama_sample_classifier_free_guidance(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ struct llama_context * guidance_ctx,
+ float scale) {
+ int64_t t_start_sample_us = ggml_time_us();
+
+ assert(ctx);
+ auto n_vocab = llama_n_vocab(ctx);
+ assert(n_vocab == (int)candidates->size);
+ assert(!candidates->sorted);
+
+ std::vector<float> logits_base;
+ logits_base.reserve(candidates->size);
+ for (size_t i = 0; i < candidates->size; ++i) {
+ logits_base.push_back(candidates->data[i].logit);
+ }
+ llama_log_softmax(logits_base.data(), candidates->size);
+
+ float* logits_guidance = llama_get_logits(guidance_ctx);
+ llama_log_softmax(logits_guidance, n_vocab);
+
+ for (int i = 0; i < n_vocab; ++i) {
+ float logit_guidance = logits_guidance[i];
+ float logit_base = logits_base[i];
+ candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
+ }
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }

  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
  assert(ctx);
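
Note: llama_sample_classifier_free_guidance() is a new sampling API. It mixes the logits of the main context with those of a second "guidance" context (typically the same model evaluated on a negative prompt) in log-softmax space, as logit = scale * (logit_base - logit_guidance) + logit_guidance. A hedged usage sketch — the guidance-context setup and the scale value are assumptions:

    // build the usual candidate array from the main context's logits
    const int n_vocab = llama_n_vocab(ctx);
    float * logits = llama_get_logits(ctx);
    std::vector<llama_token_data> cand;
    cand.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cand.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array cand_p = { cand.data(), cand.size(), false };

    // guidance_ctx is assumed to have been evaluated on the negative prompt beforehand
    llama_sample_classifier_free_guidance(ctx, &cand_p, guidance_ctx, 1.5f); // scale > 1 pushes away from the negative prompt
    llama_token tok = llama_sample_token_greedy(ctx, &cand_p);
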
@@ -2405,15 +2533,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
+ bool convert_incompatible_tensor = false;
  if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
  quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
  int nx = tensor.ne.at(0);
  int ny = tensor.ne.at(1);
  if (nx % QK_K != 0 || ny % QK_K != 0) {
- fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
- fprintf(stderr, "This is required to be able to use k-quants for now!\n");
- fprintf(stderr, "========================================================================================\n\n");
- throw std::runtime_error("Unsupported tensor size encountered\n");
+ fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ convert_incompatible_tensor = true;
  }
  }
  if (tensor.name == "output.weight") {
@@ -2441,6 +2568,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  }
+ if (convert_incompatible_tensor) {
+ if (tensor.name == "output.weight") {
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+ fprintf(stderr, "F16 will be used for this tensor instead.\n");
+ } else if (tensor.name == "tok_embeddings.weight") {
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+ fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+ } else {
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
+ }
  #endif

  float * f32_data;
@@ -2560,8 +2698,9 @@ struct llama_model * llama_load_model_from_file(
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

  if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
- params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+ params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+ memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+ params.progress_callback_user_data)) {
  delete model;
  fprintf(stderr, "%s: failed to load model\n", __func__);
  return nullptr;
@@ -2575,14 +2714,14 @@ void llama_free_model(struct llama_model * model) {
  }

  struct llama_context * llama_new_context_with_model(
- struct llama_model * model,
- struct llama_context_params params) {
+ struct llama_model * model,
+ struct llama_context_params params) {

  if (!model) {
  return nullptr;
  }

- llama_context * ctx = new llama_context(*model, model->vocab);
+ llama_context * ctx = new llama_context(*model);

  if (params.seed == LLAMA_DEFAULT_SEED) {
  params.seed = time(NULL);
@@ -2636,16 +2775,16 @@ struct llama_context * llama_new_context_with_model(
  ctx->embedding.resize(hparams.n_embd);
  }

- ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+ ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));

- ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
  }

  #ifdef GGML_USE_METAL
  if (params.n_gpu_layers > 0) {
  // this allocates all Metal resources and memory buffers
- ctx->ctx_metal = ggml_metal_init();
+ ctx->ctx_metal = ggml_metal_init(1);

  void * data_ptr = NULL;
  size_t data_size = 0;
@@ -2680,6 +2819,18 @@ struct llama_context * llama_new_context_with_model(
  }
  #endif

+ #ifdef GGML_USE_MPI
+ ctx->ctx_mpi = ggml_mpi_init();
+
+ if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+ // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+ const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+ while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+ llama_backend_free();
+ exit(1);
+ }
+ #endif
+
  return ctx;
  }

@@ -2802,6 +2953,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  // read tensors and apply
  bool warned = false;
  int n_tensors = 0;
+
+ std::vector<uint8_t> work_buffer;
+
  while (true) {
  int32_t n_dims;
  int32_t length;
@@ -2966,8 +3120,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  }

  struct ggml_cgraph gf = ggml_build_forward(r);
- gf.n_threads = n_threads;
- ggml_graph_compute(lora_ctx, &gf);
+
+ ggml_graph_compute_helper(work_buffer, &gf, n_threads);

  // we won't need these tensors again, reset the context to save memory
  ggml_free(lora_ctx);
@@ -3120,7 +3274,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
- gf.n_threads = 1;

  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
  kout3d->data = out;
@@ -3140,7 +3293,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -3226,7 +3379,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
- gf.n_threads = 1;

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
  kin3d->data = (void *) inp;
@@ -3246,7 +3398,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -3407,13 +3559,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
  return 0;
  }

- int llama_tokenize(
- struct llama_context * ctx,
+ int llama_tokenize_with_model(
+ const struct llama_model * model,
  const char * text,
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos) {
- auto res = llama_tokenize(ctx->vocab, text, add_bos);
+ auto res = llama_tokenize(model->vocab, text, add_bos);

  if (n_max_tokens < (int) res.size()) {
  fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3427,8 +3579,29 @@ int llama_tokenize(
  return res.size();
  }

+ int llama_tokenize(
+ struct llama_context * ctx,
+ const char * text,
+ llama_token * tokens,
+ int n_max_tokens,
+ bool add_bos) {
+ return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+ }
+
+ int llama_n_vocab_from_model(const struct llama_model * model) {
+ return model->vocab.id_to_token.size();
+ }
+
+ int llama_n_ctx_from_model(const struct llama_model * model) {
+ return model->hparams.n_ctx;
+ }
+
+ int llama_n_embd_from_model(const struct llama_model * model) {
+ return model->hparams.n_embd;
+ }
+
  int llama_n_vocab(const struct llama_context * ctx) {
- return ctx->vocab.id_to_token.size();
+ return ctx->model.vocab.id_to_token.size();
  }

  int llama_n_ctx(const struct llama_context * ctx) {
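
Note: tokenization and vocabulary queries can now be made against a bare llama_model; the llama_context variants simply forward to the new *_with_model / *_from_model functions. A small hedged sketch (the prompt text is arbitrary):

    std::vector<llama_token> toks(llama_n_ctx_from_model(model));
    const int n = llama_tokenize_with_model(model, "Hello world", toks.data(), (int) toks.size(), /*add_bos*/ true);
    if (n >= 0) {
        toks.resize(n);   // n is the number of tokens written on success
    }
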
@@ -3439,19 +3612,27 @@ int llama_n_embd(const struct llama_context * ctx) {
  return ctx->model.hparams.n_embd;
  }

- int llama_get_vocab(
- const struct llama_context * ctx,
+ int llama_get_vocab_from_model(
+ const struct llama_model * model,
  const char * * strings,
  float * scores,
  int capacity) {
- int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+ int n = std::min(capacity, (int) model->vocab.id_to_token.size());
  for (int i = 0; i<n; ++i) {
- strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
- scores[i] = ctx->vocab.id_to_token[i].score;
+ strings[i] = model->vocab.id_to_token[i].tok.c_str();
+ scores[i] = model->vocab.id_to_token[i].score;
  }
  return n;
  }

+ int llama_get_vocab(
+ const struct llama_context * ctx,
+ const char * * strings,
+ float * scores,
+ int capacity) {
+ return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+ }
+
  float * llama_get_logits(struct llama_context * ctx) {
  return ctx->logits.data();
  }
@@ -3460,12 +3641,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
  return ctx->embedding.data();
  }

- const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
- if (token >= llama_n_vocab(ctx)) {
+ const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+ if (token >= llama_n_vocab_from_model(model)) {
  return nullptr;
  }

- return ctx->vocab.id_to_token[token].tok.c_str();
+ return model->vocab.id_to_token[token].tok.c_str();
+ }
+
+ const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+ return llama_token_to_str_with_model(&ctx->model, token);
  }

  llama_token llama_token_bos() {