llama_cpp 0.3.1 → 0.3.3

@@ -19,6 +19,9 @@
  #ifdef GGML_USE_METAL
  #include "ggml-metal.h"
  #endif
+ #ifdef GGML_USE_MPI
+ #include "ggml-mpi.h"
+ #endif
  #ifdef GGML_USE_K_QUANTS
  #ifndef QK_K
  #ifdef GGML_QKK_64
@@ -66,6 +69,7 @@ enum e_model {
  MODEL_65B,
  };

+ static const size_t kB = 1024;
  static const size_t MB = 1024*1024;

  // computed for n_ctx == 2048
@@ -78,6 +82,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
  (void) tensor;
  }

+ //
+ // ggml helpers
+ //
+
+ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+ if (plan.work_size > 0) {
+ buf.resize(plan.work_size);
+ plan.work_data = buf.data();
+ }
+
+ ggml_graph_compute(graph, &plan);
+ }
+
+ //
+ // memory sizes
+ //
+
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
  static std::map<e_model, size_t> k_sizes = {
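For context: this helper replaces the old pattern of setting gf.n_threads on the graph and calling ggml_graph_compute(ctx, &gf) directly; the thread count now goes through ggml_graph_plan() and the caller supplies a reusable work buffer. A minimal sketch of the calling pattern (the tensor and buffer names are illustrative):

    // sketch: build a graph, then drive it through the helper so the plan's
    // work buffer is allocated once and reused by the caller
    std::vector<uint8_t> work_buffer;                 // kept alive across evaluations
    ggml_cgraph gf = {};
    ggml_build_forward_expand(&gf, output_tensor);    // output_tensor is illustrative
    ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 4);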
@@ -129,6 +152,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  return k_sizes;
  }

+ // amount of VRAM needed per batch size to hold temporary results
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+ {
+ static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 512ull * kB },
+ { MODEL_7B, 512ull * kB },
+ { MODEL_13B, 640ull * kB },
+ { MODEL_30B, 768ull * kB },
+ { MODEL_65B, 1536ull * kB },
+ };
+ return k_sizes;
+ }
+
+ // amount of VRAM needed per batch size and context to hold temporary results
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+ {
+ static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 128ull },
+ { MODEL_7B, 128ull },
+ { MODEL_13B, 160ull },
+ { MODEL_30B, 208ull },
+ { MODEL_65B, 416ull },
+ };
+ return k_sizes;
+ }
+
  // default hparams (LLaMA 7B)
  struct llama_hparams {
  uint32_t n_vocab = 32000;
@@ -165,8 +216,8 @@ struct llama_layer {
  };

  struct llama_kv_cache {
- struct ggml_tensor * k;
- struct ggml_tensor * v;
+ struct ggml_tensor * k = NULL;
+ struct ggml_tensor * v = NULL;

  struct ggml_context * ctx = NULL;

@@ -253,7 +304,13 @@ struct llama_model {

  struct llama_context {
  llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+ #ifdef GGML_USE_METAL
+ ~llama_context() {
+ if (ctx_metal) {
+ ggml_metal_free(ctx_metal);
+ }
+ }
+ #endif
  std::mt19937 rng;

  bool has_evaluated_once = false;
@@ -286,6 +343,9 @@ struct llama_context {
  // input embedding (1-dimensional array: [n_embd])
  std::vector<float> embedding;

+ // reusable buffer for `struct ggml_graph_plan.work_data`
+ std::vector<uint8_t> work_buffer;
+
  // memory buffers used to evaluate the model
  // TODO: move in llama_state
  llama_ctx_buffer buf_compute;
@@ -295,6 +355,10 @@ struct llama_context {
  ggml_metal_context * ctx_metal = NULL;
  #endif

+ #ifdef GGML_USE_MPI
+ ggml_mpi_context * ctx_mpi = NULL;
+ #endif
+
  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -446,9 +510,7 @@ struct llama_file_loader {
  std::string word = file.read_string(len);

  float score = 0.0f;
- if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
- file.read_raw(&score, sizeof(score));
- }
+ file.read_raw(&score, sizeof(score));

  vocab.token_to_id[word] = i;

@@ -725,7 +787,6 @@ struct llama_model_loader {

  };

-
  //
  // kv cache
  //
@@ -816,7 +877,7 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

- void llama_init_backend(bool numa) {
+ void llama_backend_init(bool numa) {
  ggml_time_init();

  // needed to initialize f16 tables
@@ -829,6 +890,16 @@ void llama_init_backend(bool numa) {
  if (numa) {
  ggml_numa_init();
  }
+
+ #ifdef GGML_USE_MPI
+ ggml_mpi_backend_init();
+ #endif
+ }
+
+ void llama_backend_free() {
+ #ifdef GGML_USE_MPI
+ ggml_mpi_backend_free();
+ #endif
  }

  int64_t llama_time_us() {
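The backend entry point is renamed from llama_init_backend() to llama_backend_init(), and a matching llama_backend_free() is added so that MPI (when compiled in) is shut down cleanly. A hedged sketch of how a host program is expected to pair them:

    // sketch: initialize once at startup, free once at shutdown
    llama_backend_init(/*numa*/ false);   // formerly llama_init_backend()
    // ... load the model, create contexts, evaluate, sample ...
    llama_backend_free();                 // finalizes MPI when GGML_USE_MPI is defined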
@@ -1112,14 +1183,18 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
  ggml_cuda_set_scratch_size(0); // disable scratch
  } else {
- vram_scratch = n_batch * MB;
+ const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+ const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+ vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
  ggml_cuda_set_scratch_size(vram_scratch);
  if (n_gpu_layers > 0) {
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
- __func__, vram_scratch / MB);
+ fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+ __func__, vram_scratch_base / kB, vram_scratch_per_context,
+ (vram_scratch + MB - 1) / MB); // round up
  }
  }
  #endif // GGML_USE_CUBLAS
+
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

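As a worked example of the new sizing (values from the VRAM_REQ_SCRATCH tables above): a 7B model with n_batch = 512 and n_ctx = 2048 gets vram_scratch = 512 * (512 kB + 2048 * 128 B) = 512 * 768 kB = 384 MB, instead of the flat n_batch * 1 MB = 512 MB reserved by the old code.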
@@ -1128,6 +1203,10 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
  }
  size_t vram_kv_cache = 0;
+
+ #ifdef GGML_USE_CUBLAS
+ const int max_backend_supported_layers = hparams.n_layer + 3;
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
  if (n_gpu_layers > (int) hparams.n_layer + 1) {
  if (low_vram) {
  fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1144,14 +1223,18 @@ static void llama_model_load_internal(
  vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
  }
  }
- const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+ #elif defined(GGML_USE_CLBLAST)
+ const int max_backend_supported_layers = hparams.n_layer + 1;
+ const int max_offloadable_layers = hparams.n_layer + 1;
+ #endif // GGML_USE_CUBLAS
+
  fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
- __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  fprintf(stderr, "%s: total VRAM used: %zu MB\n",
  __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
  #else
  (void) n_gpu_layers;
- #endif
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  }

  // populate `tensors_by_name`
@@ -1218,18 +1301,16 @@ static bool llama_eval_internal(
  llama_context & lctx,
  const llama_token * tokens,
  const float * embd,
- const int n_tokens,
- const int n_past,
- const int n_threads,
+ int n_tokens,
+ int n_past,
+ int n_threads,
  const char * cgraph_fname) {

  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

- // enforce that the first token is BOS
- if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
- fprintf(stderr, "%s: first token must be BOS\n", __func__);
- return false;
- }
+ #ifdef GGML_USE_MPI
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+ #endif

  const int64_t t_start_us = ggml_time_us();

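Note that with the removed BOS check, llama_eval_internal() no longer rejects prompts whose first token is not BOS; callers that relied on that guard should add the BOS token themselves. A hedged sketch (the prompt variable is illustrative):

    // sketch: tokenize a fresh prompt with an explicit BOS token
    std::vector<llama_token> tokens(prompt.size() + 1);
    const int n = llama_tokenize(ctx, prompt.c_str(), tokens.data(), (int) tokens.size(), /*add_bos*/ true);
    if (n >= 0) {
        tokens.resize(n);
    }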
@@ -1261,20 +1342,26 @@ static bool llama_eval_internal(

  struct ggml_context * ctx0 = ggml_init(params);

+ ggml_cgraph gf = {};
+
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

  if (tokens) {
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_set_name(embd, "embd");
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ ggml_set_name(inp_tokens, "inp_tokens");
+
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
  } else {
+ #ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+ #endif
+
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
  }
@@ -1292,18 +1379,20 @@ static bool llama_eval_internal(
  offload_func_t offload_func_v = llama_nop;

  #ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers;
- }
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers;
+ }
  #endif // GGML_USE_CUBLAS

  for (int il = 0; il < n_layer; ++il) {
+ ggml_format_name(inpL, "layer_inp_%d", il);
+
  offload_func_t offload_func = llama_nop;

  #ifdef GGML_USE_CUBLAS
@@ -1510,7 +1599,6 @@ static bool llama_eval_internal(

  // input for next layer
  inpL = cur;
-
  }

  lctx.use_buf(ctx0, 0);
@@ -1518,7 +1606,6 @@ static bool llama_eval_internal(
  // used at the end to optionally extract the embeddings
  struct ggml_tensor * embeddings = NULL;

-
  // norm
  {
  cur = ggml_rms_norm(ctx0, inpL);
@@ -1533,7 +1620,6 @@ static bool llama_eval_internal(
  embeddings = cur;
  }

-
  // lm_head
  cur = ggml_mul_mat(ctx0, model.output, cur);
  ggml_set_name(cur, "result_output");
@@ -1546,8 +1632,13 @@ static bool llama_eval_internal(
  // run the computation
  ggml_build_forward_expand(&gf, cur);

+ #if GGML_USE_MPI
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+ #endif
+
  #ifdef GGML_USE_METAL
  if (lctx.ctx_metal && N == 1) {
+ ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
  ggml_metal_graph_compute(lctx.ctx_metal, &gf);
  ggml_metal_get_tensor (lctx.ctx_metal, cur);
  } else {
@@ -1567,12 +1658,21 @@ static bool llama_eval_internal(
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
  }

- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
  }
  #else
- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
  #endif

+ #if GGML_USE_MPI
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+ #endif
+
+ // update kv token count
+ lctx.kv_self.n = n_past + N;
+
+ struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
  if (cgraph_fname) {
  ggml_graph_export(&gf, cgraph_fname);
  }
@@ -1588,23 +1688,17 @@ static bool llama_eval_internal(
  // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
  //}

- //embd_w.resize(n_vocab*N);
- //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
- // update kv token count
- lctx.kv_self.n = n_past + N;
-
  // extract logits
  {
  auto & logits_out = lctx.logits;

  if (lctx.logits_all) {
  logits_out.resize(n_vocab * N);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
  } else {
  // return result for just the last token
  logits_out.resize(n_vocab);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
  }
  }

@@ -1860,10 +1954,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
  return;
  }

- const int64_t t_start_sample_us = ggml_time_us();
-
  llama_sample_softmax(ctx, candidates);

+ const int64_t t_start_sample_us = ggml_time_us();
+
  // Compute the cumulative probabilities
  float cum_sum = 0.0f;
  size_t last_idx = candidates->size;
@@ -1892,9 +1986,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
  return;
  }

- const int64_t t_start_sample_us = ggml_time_us();
-
  llama_sample_softmax(nullptr, candidates);
+ const int64_t t_start_sample_us = ggml_time_us();

  // Compute the first and second derivatives
  std::vector<float> first_derivatives(candidates->size - 1);
@@ -1946,11 +2039,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
  return;
  }

- const int64_t t_start_sample_us = ggml_time_us();
-
  // Compute the softmax of logits and calculate entropy
  llama_sample_softmax(nullptr, candidates);

+ const int64_t t_start_sample_us = ggml_time_us();
+
  float entropy = 0.0f;
  for (size_t i = 0; i < candidates->size; ++i) {
  entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2074,6 +2167,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
  }
  }

+ static void llama_log_softmax(float * array, size_t size) {
+ float max_l = *std::max_element(array, array + size);
+ float sum = 0.f;
+ for (size_t i = 0; i < size; ++i) {
+ float p = expf(array[i] - max_l);
+ sum += p;
+ array[i] = p;
+ }
+
+ for (size_t i = 0; i < size; ++i) {
+ array[i] = logf(array[i] / sum);
+ }
+ }
+
+ void llama_sample_classifier_free_guidance(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ struct llama_context * guidance_ctx,
+ float scale,
+ float smooth_factor) {
+ int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+
+ assert(ctx);
+ auto n_vocab = llama_n_vocab(ctx);
+ assert(n_vocab == (int)candidates->size);
+ assert(!candidates->sorted);
+
+ std::vector<float> logits_base;
+ logits_base.reserve(candidates->size);
+ for (size_t i = 0; i < candidates->size; ++i) {
+ logits_base.push_back(candidates->data[i].logit);
+ }
+ llama_log_softmax(logits_base.data(), candidates->size);
+
+ float* logits_guidance = llama_get_logits(guidance_ctx);
+ llama_log_softmax(logits_guidance, n_vocab);
+
+ for (int i = 0; i < n_vocab; ++i) {
+ float logit_guidance = logits_guidance[i];
+ float logit_base = logits_base[i];
+ logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
+ }
+
+ llama_log_softmax(logits_guidance, n_vocab);
+
+ for (int i = 0; i < n_vocab; ++i) {
+ float logit_base = logits_base[i];
+ float logit_guidance = logits_guidance[i];
+
+ candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+ }
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }

  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
  assert(ctx);
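A hedged usage sketch for the new classifier-free-guidance sampler: it assumes the same token has already been evaluated on both the main context and a second context primed with the negative (guidance) prompt, and the candidate array is built the same way as for the other llama_sample_* calls; the scale and smooth_factor values are illustrative.

    // sketch: blend logits from ctx (main prompt) and guidance_ctx (negative prompt)
    const int n_vocab = llama_n_vocab(ctx);
    float * logits = llama_get_logits(ctx);
    std::vector<llama_token_data> cand;
    cand.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cand.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array cand_p = { cand.data(), cand.size(), false };
    llama_sample_classifier_free_guidance(ctx, &cand_p, guidance_ctx,
                                          /*scale*/ 1.5f, /*smooth_factor*/ 0.0f);
    llama_token tok = llama_sample_token_greedy(ctx, &cand_p);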
@@ -2119,13 +2268,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- ctx->n_sample++;
  }
  return X;
  }

  llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
- assert(ctx);
  int64_t t_start_sample_us;
  t_start_sample_us = ggml_time_us();

@@ -2140,13 +2287,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
  candidates->size = 1;
  }

+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+
  // Normalize the probabilities of the remaining words
  llama_sample_softmax(ctx, candidates);

  // Sample the next word X from the remaining words
- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- }
  llama_token X = llama_sample_token(ctx, candidates);
  t_start_sample_us = ggml_time_us();

@@ -2214,10 +2362,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  }
  float * f32_output = (float *) output.addr;

- quantize_fns_t qtype;
+ ggml_type_traits_t qtype;
  if (ggml_is_quantized(tensor.type)) {
- qtype = ggml_internal_get_quantize_fn(tensor.type);
- if (qtype.dequantize_row_q == NULL) {
+ qtype = ggml_internal_get_type_traits(tensor.type);
+ if (qtype.to_float == NULL) {
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
  }
  } else if (tensor.type != GGML_TYPE_F16) {
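This tracks the upstream ggml rename from quantize_fns_t / dequantize_row_q to ggml_type_traits_t / to_float; the call shape is unchanged. A brief sketch (the buffers and element count are illustrative):

    // sketch: dequantize a quantized row buffer into float32
    const ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
    if (traits.to_float != NULL) {
        traits.to_float(q_data, f32_out, n_elements);   // q_data/f32_out/n_elements are illustrative
    }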
@@ -2228,7 +2376,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  if (tensor.type == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
  } else if (ggml_is_quantized(tensor.type)) {
- qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+ qtype.to_float(tensor.data, f32_output, nelements);
  } else {
  LLAMA_ASSERT(false); // unreachable
  }
@@ -2253,7 +2401,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  if (typ == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
  } else {
- qtype.dequantize_row_q(inbuf, outbuf, nels);
+ qtype.to_float(inbuf, outbuf, nels);
  }
  };
  workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
@@ -2362,15 +2510,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
+ bool convert_incompatible_tensor = false;
  if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
  quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
  int nx = tensor.ne.at(0);
  int ny = tensor.ne.at(1);
  if (nx % QK_K != 0 || ny % QK_K != 0) {
- fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
- fprintf(stderr, "This is required to be able to use k-quants for now!\n");
- fprintf(stderr, "========================================================================================\n\n");
- throw std::runtime_error("Unsupported tensor size encountered\n");
+ fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ convert_incompatible_tensor = true;
  }
  }
  if (tensor.name == "output.weight") {
@@ -2398,6 +2545,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  }
+ if (convert_incompatible_tensor) {
+ if (tensor.name == "output.weight") {
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+ fprintf(stderr, "F16 will be used for this tensor instead.\n");
+ } else if (tensor.name == "tok_embeddings.weight") {
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+ fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+ } else {
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
+ }
  #endif

  float * f32_data;
@@ -2532,8 +2690,8 @@ void llama_free_model(struct llama_model * model) {
  }

  struct llama_context * llama_new_context_with_model(
- struct llama_model * model,
- struct llama_context_params params) {
+ struct llama_model * model,
+ struct llama_context_params params) {

  if (!model) {
  return nullptr;
@@ -2602,7 +2760,7 @@ struct llama_context * llama_new_context_with_model(
  #ifdef GGML_USE_METAL
  if (params.n_gpu_layers > 0) {
  // this allocates all Metal resources and memory buffers
- ctx->ctx_metal = ggml_metal_init();
+ ctx->ctx_metal = ggml_metal_init(1);

  void * data_ptr = NULL;
  size_t data_size = 0;
@@ -2637,6 +2795,18 @@ struct llama_context * llama_new_context_with_model(
  }
  #endif

+ #ifdef GGML_USE_MPI
+ ctx->ctx_mpi = ggml_mpi_init();
+
+ if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+ // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+ const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+ while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+ llama_backend_free();
+ exit(1);
+ }
+ #endif
+
  return ctx;
  }

@@ -2759,6 +2929,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  // read tensors and apply
  bool warned = false;
  int n_tensors = 0;
+
+ std::vector<uint8_t> work_buffer;
+
  while (true) {
  int32_t n_dims;
  int32_t length;
@@ -2923,8 +3096,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  }

  struct ggml_cgraph gf = ggml_build_forward(r);
- gf.n_threads = n_threads;
- ggml_graph_compute(lora_ctx, &gf);
+
+ ggml_graph_compute_helper(work_buffer, &gf, n_threads);

  // we won't need these tensors again, reset the context to save memory
  ggml_free(lora_ctx);
@@ -3077,7 +3250,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
- gf.n_threads = 1;

  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
  kout3d->data = out;
@@ -3097,7 +3269,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -3183,7 +3355,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
- gf.n_threads = 1;

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
  kin3d->data = (void *) inp;
@@ -3203,7 +3374,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -3219,7 +3390,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  return nread;
  }

- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  llama_file file(path_session, "rb");

  // sanity checks
@@ -3273,6 +3444,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
  return true;
  }

+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ try {
+ return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+ } catch (const std::exception & err) {
+ fprintf(stderr, "error loading session file: %s\n", err.what());
+ return false;
+ }
+ }
+
  bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  llama_file file(path_session, "wb");

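llama_load_session_file() is now a thin wrapper that catches exceptions from the internal loader and reports them instead of propagating. A hedged usage sketch (the path and buffer size are illustrative):

    // sketch: restore a saved session, falling back to a cold start on failure
    std::vector<llama_token> session_tokens(n_ctx);
    size_t n_loaded = 0;
    if (!llama_load_session_file(ctx, "session.bin",
                                 session_tokens.data(), session_tokens.size(), &n_loaded)) {
        session_tokens.clear();   // a corrupt or missing file no longer throws out of the API
    } else {
        session_tokens.resize(n_loaded);
    }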
@@ -3428,23 +3608,35 @@ llama_token llama_token_nl() {
  return 13;
  }

+ struct llama_timings llama_get_timings(struct llama_context * ctx) {
+ struct llama_timings result = {
+ /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+ /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
+ /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
+ /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+ /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+ /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,

- void llama_print_timings(struct llama_context * ctx) {
- const int64_t t_end_us = ggml_time_us();
+ /*.n_sample =*/ std::max(1, ctx->n_sample),
+ /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+ /*.n_eval =*/ std::max(1, ctx->n_eval),
+ };

- const int32_t n_sample = std::max(1, ctx->n_sample);
- const int32_t n_eval = std::max(1, ctx->n_eval);
- const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+ return result;
+ }
+
+ void llama_print_timings(struct llama_context * ctx) {
+ const llama_timings timings = llama_get_timings(ctx);

  fprintf(stderr, "\n");
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+ fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
  fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+ __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+ __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
  fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
+ __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+ fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
  }

  void llama_reset_timings(struct llama_context * ctx) {
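llama_print_timings() is now built on the new llama_get_timings() accessor, so callers can read the same numbers programmatically instead of parsing stderr. A hedged sketch:

    // sketch: pull aggregate timings from the context
    const llama_timings timings = llama_get_timings(ctx);
    fprintf(stdout, "eval: %.2f ms/token over %d tokens\n",
            timings.t_eval_ms / timings.n_eval, timings.n_eval);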