llama_cpp 0.3.2 → 0.3.4

This diff shows the changes between publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
@@ -19,6 +19,9 @@
  #ifdef GGML_USE_METAL
  #include "ggml-metal.h"
  #endif
+ #ifdef GGML_USE_MPI
+ #include "ggml-mpi.h"
+ #endif
  #ifdef GGML_USE_K_QUANTS
  #ifndef QK_K
  #ifdef GGML_QKK_64
@@ -79,14 +82,34 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
  (void) tensor;
  }

- static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+ //
+ // ggml helpers
+ //
+
+ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+ if (plan.work_size > 0) {
+ buf.resize(plan.work_size);
+ plan.work_data = buf.data();
+ }
+
+ ggml_graph_compute(graph, &plan);
+ }
+
+ //
+ // memory sizes
+ //
+
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
  {
  static std::map<e_model, size_t> k_sizes = {
- { MODEL_3B, 256ull * MB },
- { MODEL_7B, 512ull * MB },
- { MODEL_13B, 512ull * MB },
- { MODEL_30B, 512ull * MB },
- { MODEL_65B, 1024ull * MB },
+ /* empirical scaling, still a guess */
+ { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB },
+ { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB },
+ { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB },
+ { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB },
+ { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB },
  };
  return k_sizes;
  }
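The helper above tracks the upstream ggml API change that removed `ggml_cgraph::n_threads` in favor of an explicit `ggml_cplan`. A minimal caller sketch, not part of the diff (the work buffer name and thread count are illustrative):

    // a graph `gf` built via ggml_build_forward_expand(&gf, ...)
    std::vector<uint8_t> work_buffer;                      // reused across evals
    ggml_graph_compute_helper(work_buffer, &gf, /*n_threads=*/4);

Reusing one buffer avoids reallocating the plan's work area on every call; later hunks in this diff apply the same pattern to `llama_eval_internal`, the LoRA loader, and the state save/load paths.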
@@ -118,14 +141,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
- static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
  {
  static std::map<e_model, size_t> k_sizes = {
- { MODEL_3B, 512ull * MB },
- { MODEL_7B, 768ull * MB },
- { MODEL_13B, 1024ull * MB },
- { MODEL_30B, 1280ull * MB },
- { MODEL_65B, 1536ull * MB },
+ { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
+ { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
+ { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+ { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
+ { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
  };
  return k_sizes;
  }
@@ -167,6 +190,10 @@ struct llama_hparams {
  uint32_t n_head = 32;
  uint32_t n_layer = 32;
  uint32_t n_rot = 64;
+
+ float rope_freq_base = 10000.0f;
+ float rope_freq_scale = 1.0f;
+
  enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

  bool operator!=(const llama_hparams & other) const {
@@ -281,7 +308,7 @@ struct llama_model {
  };

  struct llama_context {
- llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+ llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
  #ifdef GGML_USE_METAL
  ~llama_context() {
  if (ctx_metal) {
@@ -302,7 +329,6 @@ struct llama_context {
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

  const llama_model & model;
- const llama_vocab & vocab;

  bool model_owner = false;

@@ -321,6 +347,9 @@ struct llama_context {
  // input embedding (1-dimensional array: [n_embd])
  std::vector<float> embedding;

+ // reusable buffer for `struct ggml_graph_plan.work_data`
+ std::vector<uint8_t> work_buffer;
+
  // memory buffers used to evaluate the model
  // TODO: move in llama_state
  llama_ctx_buffer buf_compute;
@@ -330,6 +359,10 @@ struct llama_context {
  ggml_metal_context * ctx_metal = NULL;
  #endif

+ #ifdef GGML_USE_MPI
+ ggml_mpi_context * ctx_mpi = NULL;
+ #endif
+
  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -522,7 +555,9 @@ struct llama_file_loader {
  }

  // skip to the next multiple of 32 bytes
- file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+ if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+ }

  tensor.file_off = file.tell();
  tensor.name = name;
@@ -619,7 +654,7 @@ struct llama_model_loader {
  *ctx_size_p = *mmapped_size_p = 0;
  for (const llama_load_tensor & lt : tensors_map.tensors) {
  *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
- *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+ *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
  }
  }

@@ -758,7 +793,6 @@ struct llama_model_loader {

  };

-
  //
  // kv cache
  //
@@ -815,7 +849,9 @@ struct llama_context_params llama_context_default_params() {
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
- /*.tensor_split =*/ {0},
+ /*.tensor_split =*/ nullptr,
+ /*.rope_freq_base =*/ 10000.0f,
+ /*.rope_freq_scale =*/ 1.0f,
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
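The two new RoPE parameters are plain fields on `llama_context_params`. A hedged sketch of how a caller might set them, not part of the diff (the 0.5f scale is only an illustrative value for linear RoPE scaling to a longer context):

    struct llama_context_params params = llama_context_default_params();
    params.n_ctx           = 4096;       // extended context window
    params.rope_freq_base  = 10000.0f;   // default base frequency
    params.rope_freq_scale = 0.5f;       // e.g. linear scaling for ~2x the trained context
    // pass `params` to llama_load_model_from_file() / llama_new_context_with_model() as usual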
@@ -841,6 +877,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  return result;
  }

+ int llama_max_devices() {
+ return LLAMA_MAX_DEVICES;
+ }
+
  bool llama_mmap_supported() {
  return llama_mmap::SUPPORTED;
  }
@@ -849,7 +889,7 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

- void llama_init_backend(bool numa) {
+ void llama_backend_init(bool numa) {
  ggml_time_init();

  // needed to initialize f16 tables
@@ -862,6 +902,16 @@ void llama_init_backend(bool numa) {
  if (numa) {
  ggml_numa_init();
  }
+
+ #ifdef GGML_USE_MPI
+ ggml_mpi_backend_init();
+ #endif
+ }
+
+ void llama_backend_free() {
+ #ifdef GGML_USE_MPI
+ ggml_mpi_backend_free();
+ #endif
  }

  int64_t llama_time_us() {
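Initialization is now paired with an explicit teardown call, which matters mainly for the MPI backend. A usage sketch, not part of the diff:

    llama_backend_init(/*numa=*/false);   // renamed from llama_init_backend() in 0.3.2
    // ... load models, create contexts, run inference ...
    llama_backend_free();                 // releases MPI state when built with GGML_USE_MPI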
@@ -929,6 +979,8 @@ static void llama_model_load_internal(
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
+ float rope_freq_base,
+ float rope_freq_scale,
  bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
@@ -963,22 +1015,27 @@ static void llama_model_load_internal(
  }

  hparams.n_ctx = n_ctx;
+
+ hparams.rope_freq_base = rope_freq_base;
+ hparams.rope_freq_scale = rope_freq_scale;
  }

  const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

  {
- fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
- fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
- fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
- fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
- fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
- fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
- fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
- fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+ fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
+ fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+ fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+ fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
+ fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
+ fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+ fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+ fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+ fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
- fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
- fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
+ fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

  if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1127,9 +1184,9 @@ static void llama_model_load_internal(
  const size_t mem_required =
  ctx_size +
  mmapped_size - vram_weights + // weights in VRAM not in memory
- MEM_REQ_SCRATCH0().at(model.type) +
+ MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
  MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL().at (model.type);
+ MEM_REQ_EVAL(hparams.n_ctx).at(model.type);

  // this is the memory required by one llama_state
  const size_t mem_required_state =
@@ -1232,7 +1289,9 @@ static bool llama_model_load(
  int n_batch,
  int n_gpu_layers,
  int main_gpu,
- float * tensor_split,
+ const float * tensor_split,
+ float rope_freq_base,
+ float rope_freq_scale,
  bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
@@ -1241,7 +1300,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1263,18 +1322,16 @@ static bool llama_eval_internal(
  llama_context & lctx,
  const llama_token * tokens,
  const float * embd,
- const int n_tokens,
- const int n_past,
- const int n_threads,
+ int n_tokens,
+ int n_past,
+ int n_threads,
  const char * cgraph_fname) {

  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

- // enforce that the first token is BOS
- if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
- fprintf(stderr, "%s: first token must be BOS\n", __func__);
- return false;
- }
+ #ifdef GGML_USE_MPI
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+ #endif

  const int64_t t_start_us = ggml_time_us();

@@ -1295,6 +1352,9 @@ static bool llama_eval_internal(
  const int n_rot = hparams.n_embd/hparams.n_head;
  const int n_gpu_layers = model.n_gpu_layers;

+ const float freq_base = hparams.rope_freq_base;
+ const float freq_scale = hparams.rope_freq_scale;
+
  auto & mem_per_token = lctx.mem_per_token;
  auto & buf_compute = lctx.buf_compute;

@@ -1306,20 +1366,26 @@ static bool llama_eval_internal(

  struct ggml_context * ctx0 = ggml_init(params);

+ ggml_cgraph gf = {};
+
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

  if (tokens) {
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_set_name(embd, "embd");
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ ggml_set_name(inp_tokens, "inp_tokens");
+
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
  } else {
+ #ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+ #endif
+
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
  }
@@ -1337,18 +1403,20 @@ static bool llama_eval_internal(
  offload_func_t offload_func_v = llama_nop;

  #ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers;
- }
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers;
+ }
  #endif // GGML_USE_CUBLAS

  for (int il = 0; il < n_layer; ++il) {
+ ggml_format_name(inpL, "layer_inp_%d", il);
+
  offload_func_t offload_func = llama_nop;

  #ifdef GGML_USE_CUBLAS
@@ -1384,11 +1452,11 @@ static bool llama_eval_internal(
  offload_func_kq(tmpq);
  ggml_set_name(tmpq, "tmpq");

- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
  offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");

- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
  offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");

@@ -1555,7 +1623,6 @@ static bool llama_eval_internal(

  // input for next layer
  inpL = cur;
-
  }

  lctx.use_buf(ctx0, 0);
@@ -1563,7 +1630,6 @@ static bool llama_eval_internal(
  // used at the end to optionally extract the embeddings
  struct ggml_tensor * embeddings = NULL;

-
  // norm
  {
  cur = ggml_rms_norm(ctx0, inpL);
@@ -1578,7 +1644,6 @@ static bool llama_eval_internal(
  embeddings = cur;
  }

-
  // lm_head
  cur = ggml_mul_mat(ctx0, model.output, cur);
  ggml_set_name(cur, "result_output");
@@ -1591,8 +1656,13 @@ static bool llama_eval_internal(
  // run the computation
  ggml_build_forward_expand(&gf, cur);

+ #if GGML_USE_MPI
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+ #endif
+
  #ifdef GGML_USE_METAL
  if (lctx.ctx_metal && N == 1) {
+ ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
  ggml_metal_graph_compute(lctx.ctx_metal, &gf);
  ggml_metal_get_tensor (lctx.ctx_metal, cur);
  } else {
@@ -1612,12 +1682,21 @@ static bool llama_eval_internal(
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
  }

- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
  }
  #else
- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+ #endif
+
+ #if GGML_USE_MPI
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
  #endif

+ // update kv token count
+ lctx.kv_self.n = n_past + N;
+
+ struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
  if (cgraph_fname) {
  ggml_graph_export(&gf, cgraph_fname);
  }
@@ -1633,23 +1712,17 @@ static bool llama_eval_internal(
  // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
  //}

- //embd_w.resize(n_vocab*N);
- //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
- // update kv token count
- lctx.kv_self.n = n_past + N;
-
  // extract logits
  {
  auto & logits_out = lctx.logits;

  if (lctx.logits_all) {
  logits_out.resize(n_vocab * N);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
  } else {
  // return result for just the last token
  logits_out.resize(n_vocab);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
  }
  }

@@ -1957,9 +2030,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
  }

  // Normalize the second derivatives
- float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
- for (float & value : second_derivatives) {
- value /= second_derivatives_sum;
+ {
+ const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+ if (second_derivatives_sum > 1e-6f) {
+ for (float & value : second_derivatives) {
+ value /= second_derivatives_sum;
+ }
+ } else {
+ for (float & value : second_derivatives) {
+ value = 1.0f / second_derivatives.size();
+ }
+ }
  }

  float cum_sum = 0.0f;
@@ -2118,6 +2200,52 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
  }
  }

+ static void llama_log_softmax(float * array, size_t size) {
+ float max_l = *std::max_element(array, array + size);
+ float sum = 0.f;
+ for (size_t i = 0; i < size; ++i) {
+ float p = expf(array[i] - max_l);
+ sum += p;
+ array[i] = p;
+ }
+
+ for (size_t i = 0; i < size; ++i) {
+ array[i] = logf(array[i] / sum);
+ }
+ }
+
+ void llama_sample_classifier_free_guidance(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ struct llama_context * guidance_ctx,
+ float scale) {
+ int64_t t_start_sample_us = ggml_time_us();
+
+ assert(ctx);
+ auto n_vocab = llama_n_vocab(ctx);
+ assert(n_vocab == (int)candidates->size);
+ assert(!candidates->sorted);
+
+ std::vector<float> logits_base;
+ logits_base.reserve(candidates->size);
+ for (size_t i = 0; i < candidates->size; ++i) {
+ logits_base.push_back(candidates->data[i].logit);
+ }
+ llama_log_softmax(logits_base.data(), candidates->size);
+
+ float* logits_guidance = llama_get_logits(guidance_ctx);
+ llama_log_softmax(logits_guidance, n_vocab);
+
+ for (int i = 0; i < n_vocab; ++i) {
+ float logit_guidance = logits_guidance[i];
+ float logit_base = logits_base[i];
+ candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
+ }
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }

  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
  assert(ctx);
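The new classifier-free guidance sampler blends the candidates' logits (from the main context) with the logits of a second context evaluated on a guidance prompt. A hedged usage sketch, not from the diff; `ctx`, `guidance_ctx`, and `candidates` are assumed to be prepared by the caller, and 1.5f is only an example scale:

    // apply CFG first, then continue with the usual sampling chain
    llama_sample_classifier_free_guidance(ctx, &candidates, guidance_ctx, /*scale=*/1.5f);
    // e.g. llama_sample_top_k(ctx, &candidates, 40, 1);
    //      llama_token id = llama_sample_token(ctx, &candidates);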
@@ -2405,15 +2533,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
+ bool convert_incompatible_tensor = false;
  if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
  quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
  int nx = tensor.ne.at(0);
  int ny = tensor.ne.at(1);
  if (nx % QK_K != 0 || ny % QK_K != 0) {
- fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
- fprintf(stderr, "This is required to be able to use k-quants for now!\n");
- fprintf(stderr, "========================================================================================\n\n");
- throw std::runtime_error("Unsupported tensor size encountered\n");
+ fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ convert_incompatible_tensor = true;
  }
  }
  if (tensor.name == "output.weight") {
@@ -2441,6 +2568,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  }
+ if (convert_incompatible_tensor) {
+ if (tensor.name == "output.weight") {
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+ fprintf(stderr, "F16 will be used for this tensor instead.\n");
+ } else if (tensor.name == "tok_embeddings.weight") {
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+ fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+ } else {
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
+ }
  #endif

  float * f32_data;
@@ -2560,8 +2698,9 @@ struct llama_model * llama_load_model_from_file(
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

  if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
- params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+ params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+ memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+ params.progress_callback_user_data)) {
  delete model;
  fprintf(stderr, "%s: failed to load model\n", __func__);
  return nullptr;
@@ -2575,14 +2714,14 @@ void llama_free_model(struct llama_model * model) {
  }

  struct llama_context * llama_new_context_with_model(
- struct llama_model * model,
- struct llama_context_params params) {
+ struct llama_model * model,
+ struct llama_context_params params) {

  if (!model) {
  return nullptr;
  }

- llama_context * ctx = new llama_context(*model, model->vocab);
+ llama_context * ctx = new llama_context(*model);

  if (params.seed == LLAMA_DEFAULT_SEED) {
  params.seed = time(NULL);
@@ -2636,16 +2775,16 @@ struct llama_context * llama_new_context_with_model(
  ctx->embedding.resize(hparams.n_embd);
  }

- ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+ ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));

- ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
  }

  #ifdef GGML_USE_METAL
  if (params.n_gpu_layers > 0) {
  // this allocates all Metal resources and memory buffers
- ctx->ctx_metal = ggml_metal_init();
+ ctx->ctx_metal = ggml_metal_init(1);

  void * data_ptr = NULL;
  size_t data_size = 0;
@@ -2680,6 +2819,18 @@ struct llama_context * llama_new_context_with_model(
  }
  #endif

+ #ifdef GGML_USE_MPI
+ ctx->ctx_mpi = ggml_mpi_init();
+
+ if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+ // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+ const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+ while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+ llama_backend_free();
+ exit(1);
+ }
+ #endif
+
  return ctx;
  }

@@ -2802,6 +2953,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  // read tensors and apply
  bool warned = false;
  int n_tensors = 0;
+
+ std::vector<uint8_t> work_buffer;
+
  while (true) {
  int32_t n_dims;
  int32_t length;
@@ -2966,8 +3120,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  }

  struct ggml_cgraph gf = ggml_build_forward(r);
- gf.n_threads = n_threads;
- ggml_graph_compute(lora_ctx, &gf);
+
+ ggml_graph_compute_helper(work_buffer, &gf, n_threads);

  // we won't need these tensors again, reset the context to save memory
  ggml_free(lora_ctx);
@@ -3120,7 +3274,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
- gf.n_threads = 1;

  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
  kout3d->data = out;
@@ -3140,7 +3293,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -3226,7 +3379,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
- gf.n_threads = 1;

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
  kin3d->data = (void *) inp;
@@ -3246,7 +3398,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -3407,13 +3559,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
  return 0;
  }

- int llama_tokenize(
- struct llama_context * ctx,
+ int llama_tokenize_with_model(
+ const struct llama_model * model,
  const char * text,
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos) {
- auto res = llama_tokenize(ctx->vocab, text, add_bos);
+ auto res = llama_tokenize(model->vocab, text, add_bos);

  if (n_max_tokens < (int) res.size()) {
  fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3427,8 +3579,29 @@ int llama_tokenize(
  return res.size();
  }

+ int llama_tokenize(
+ struct llama_context * ctx,
+ const char * text,
+ llama_token * tokens,
+ int n_max_tokens,
+ bool add_bos) {
+ return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+ }
+
+ int llama_n_vocab_from_model(const struct llama_model * model) {
+ return model->vocab.id_to_token.size();
+ }
+
+ int llama_n_ctx_from_model(const struct llama_model * model) {
+ return model->hparams.n_ctx;
+ }
+
+ int llama_n_embd_from_model(const struct llama_model * model) {
+ return model->hparams.n_embd;
+ }
+
  int llama_n_vocab(const struct llama_context * ctx) {
- return ctx->vocab.id_to_token.size();
+ return ctx->model.vocab.id_to_token.size();
  }

  int llama_n_ctx(const struct llama_context * ctx) {
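The new `*_with_model` / `*_from_model` entry points allow tokenization and vocab queries from a bare `llama_model`, without allocating a `llama_context`. A hedged sketch, not part of the diff (buffer size and prompt are illustrative):

    // assumes a loaded `struct llama_model * model`
    std::vector<llama_token> toks(llama_n_ctx_from_model(model));
    const int n = llama_tokenize_with_model(model, "Hello world", toks.data(), (int) toks.size(), /*add_bos=*/true);
    if (n >= 0) {
        toks.resize(n);
        printf("%d tokens, vocab size %d\n", n, llama_n_vocab_from_model(model));
    }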
@@ -3439,19 +3612,27 @@ int llama_n_embd(const struct llama_context * ctx) {
  return ctx->model.hparams.n_embd;
  }

- int llama_get_vocab(
- const struct llama_context * ctx,
+ int llama_get_vocab_from_model(
+ const struct llama_model * model,
  const char * * strings,
  float * scores,
  int capacity) {
- int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+ int n = std::min(capacity, (int) model->vocab.id_to_token.size());
  for (int i = 0; i<n; ++i) {
- strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
- scores[i] = ctx->vocab.id_to_token[i].score;
+ strings[i] = model->vocab.id_to_token[i].tok.c_str();
+ scores[i] = model->vocab.id_to_token[i].score;
  }
  return n;
  }

+ int llama_get_vocab(
+ const struct llama_context * ctx,
+ const char * * strings,
+ float * scores,
+ int capacity) {
+ return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+ }
+
  float * llama_get_logits(struct llama_context * ctx) {
  return ctx->logits.data();
  }
@@ -3460,12 +3641,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
  return ctx->embedding.data();
  }

- const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
- if (token >= llama_n_vocab(ctx)) {
+ const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+ if (token >= llama_n_vocab_from_model(model)) {
  return nullptr;
  }

- return ctx->vocab.id_to_token[token].tok.c_str();
+ return model->vocab.id_to_token[token].tok.c_str();
+ }
+
+ const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+ return llama_token_to_str_with_model(&ctx->model, token);
  }

  llama_token llama_token_bos() {