llama_cpp 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,9 @@
  #ifdef GGML_USE_METAL
  #include "ggml-metal.h"
  #endif
+ #ifdef GGML_USE_MPI
+ #include "ggml-mpi.h"
+ #endif
  #ifdef GGML_USE_K_QUANTS
  #ifndef QK_K
  #ifdef GGML_QKK_64
@@ -66,6 +69,7 @@ enum e_model {
      MODEL_65B,
  };

+ static const size_t kB = 1024;
  static const size_t MB = 1024*1024;

  // computed for n_ctx == 2048
@@ -78,6 +82,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
      (void) tensor;
  }

+ //
+ // ggml helpers
+ //
+
+ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+     if (plan.work_size > 0) {
+         buf.resize(plan.work_size);
+         plan.work_data = buf.data();
+     }
+
+     ggml_graph_compute(graph, &plan);
+ }
+
+ //
+ // memory sizes
+ //
+
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
      static std::map<e_model, size_t> k_sizes = {
@@ -129,6 +152,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
      return k_sizes;
  }

+ // amount of VRAM needed per batch size to hold temporary results
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+ {
+     static std::map<e_model, size_t> k_sizes = {
+         { MODEL_3B,   512ull * kB },
+         { MODEL_7B,   512ull * kB },
+         { MODEL_13B,  640ull * kB },
+         { MODEL_30B,  768ull * kB },
+         { MODEL_65B, 1536ull * kB },
+     };
+     return k_sizes;
+ }
+
+ // amount of VRAM needed per batch size and context to hold temporary results
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+ {
+     static std::map<e_model, size_t> k_sizes = {
+         { MODEL_3B,  128ull },
+         { MODEL_7B,  128ull },
+         { MODEL_13B, 160ull },
+         { MODEL_30B, 208ull },
+         { MODEL_65B, 416ull },
+     };
+     return k_sizes;
+ }
+
  // default hparams (LLaMA 7B)
  struct llama_hparams {
      uint32_t n_vocab = 32000;
@@ -165,8 +216,8 @@ struct llama_layer {
  };

  struct llama_kv_cache {
-     struct ggml_tensor * k;
-     struct ggml_tensor * v;
+     struct ggml_tensor * k = NULL;
+     struct ggml_tensor * v = NULL;

      struct ggml_context * ctx = NULL;

@@ -253,7 +304,13 @@ struct llama_model {

  struct llama_context {
      llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+ #ifdef GGML_USE_METAL
+     ~llama_context() {
+         if (ctx_metal) {
+             ggml_metal_free(ctx_metal);
+         }
+     }
+ #endif
      std::mt19937 rng;

      bool has_evaluated_once = false;
@@ -286,6 +343,9 @@ struct llama_context {
      // input embedding (1-dimensional array: [n_embd])
      std::vector<float> embedding;

+     // reusable buffer for `struct ggml_graph_plan.work_data`
+     std::vector<uint8_t> work_buffer;
+
      // memory buffers used to evaluate the model
      // TODO: move in llama_state
      llama_ctx_buffer buf_compute;
@@ -295,6 +355,10 @@ struct llama_context {
      ggml_metal_context * ctx_metal = NULL;
  #endif

+ #ifdef GGML_USE_MPI
+     ggml_mpi_context * ctx_mpi = NULL;
+ #endif
+
      int buf_last = 0;
      size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -446,9 +510,7 @@ struct llama_file_loader {
          std::string word = file.read_string(len);

          float score = 0.0f;
-         if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
-             file.read_raw(&score, sizeof(score));
-         }
+         file.read_raw(&score, sizeof(score));

          vocab.token_to_id[word] = i;

@@ -725,7 +787,6 @@ struct llama_model_loader {

  };

-
  //
  // kv cache
  //
@@ -816,7 +877,7 @@ bool llama_mlock_supported() {
      return llama_mlock::SUPPORTED;
  }

- void llama_init_backend(bool numa) {
+ void llama_backend_init(bool numa) {
      ggml_time_init();

      // needed to initialize f16 tables
@@ -829,6 +890,16 @@ void llama_init_backend(bool numa) {
      if (numa) {
          ggml_numa_init();
      }
+
+ #ifdef GGML_USE_MPI
+     ggml_mpi_backend_init();
+ #endif
+ }
+
+ void llama_backend_free() {
+ #ifdef GGML_USE_MPI
+     ggml_mpi_backend_free();
+ #endif
  }

  int64_t llama_time_us() {
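
The hunk above renames `llama_init_backend` to `llama_backend_init` and adds a matching `llama_backend_free`, which is also what tears down the MPI backend when `GGML_USE_MPI` is defined. A minimal sketch of the updated lifecycle, assuming a caller built directly against the bundled C API (model loading and inference omitted):

```cpp
#include "llama.h"

int main() {
    // renamed in this version: initialize the backend once per process
    llama_backend_init(/*numa =*/ false);

    // ... load a model, create a context, and run inference here ...

    // new in this version: releases backend resources
    // (calls ggml_mpi_backend_free() when built with GGML_USE_MPI)
    llama_backend_free();
    return 0;
}
```

Code that still calls the old `llama_init_backend` name will no longer compile or link against this version.
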
@@ -1112,14 +1183,18 @@ static void llama_model_load_internal(
          fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
          ggml_cuda_set_scratch_size(0); // disable scratch
      } else {
-         vram_scratch = n_batch * MB;
+         const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+         const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+         vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
          ggml_cuda_set_scratch_size(vram_scratch);
          if (n_gpu_layers > 0) {
-             fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
-                     __func__, vram_scratch / MB);
+             fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+                     __func__, vram_scratch_base / kB, vram_scratch_per_context,
+                     (vram_scratch + MB - 1) / MB); // round up
          }
      }
  #endif // GGML_USE_CUBLAS
+
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
      const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

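With this change the CUDA scratch buffer is no longer a flat `n_batch * MB`; it scales as `vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context)`, using the per-model tables added earlier in this diff. A small self-contained illustration of the arithmetic, with assumed (not package-default) values for `n_batch` and `n_ctx`:

```cpp
#include <cstdio>

int main() {
    const size_t kB = 1024, MB = 1024 * 1024;

    // assumed example settings, not values taken from this diff
    const size_t n_batch = 512;
    const size_t n_ctx   = 2048;

    // MODEL_7B entries from VRAM_REQ_SCRATCH_BASE / VRAM_REQ_SCRATCH_PER_CONTEXT above
    const size_t base_7b    = 512 * kB; // bytes per batch element
    const size_t per_ctx_7b = 128;      // bytes per batch element per context token

    const size_t vram_scratch = n_batch * (base_7b + n_ctx * per_ctx_7b);
    printf("7B scratch: %zu MB\n", (vram_scratch + MB - 1) / MB); // prints 384 MB
    return 0;
}
```
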
@@ -1128,6 +1203,10 @@ static void llama_model_load_internal(
          fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
      }
      size_t vram_kv_cache = 0;
+
+ #ifdef GGML_USE_CUBLAS
+     const int max_backend_supported_layers = hparams.n_layer + 3;
+     const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
      if (n_gpu_layers > (int) hparams.n_layer + 1) {
          if (low_vram) {
              fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1144,14 +1223,18 @@ static void llama_model_load_internal(
              vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
          }
      }
-     const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+ #elif defined(GGML_USE_CLBLAST)
+     const int max_backend_supported_layers = hparams.n_layer + 1;
+     const int max_offloadable_layers = hparams.n_layer + 1;
+ #endif // GGML_USE_CUBLAS
+
      fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-             __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+             __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
      fprintf(stderr, "%s: total VRAM used: %zu MB\n",
              __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
  #else
      (void) n_gpu_layers;
- #endif
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  }

      // populate `tensors_by_name`
@@ -1218,18 +1301,16 @@ static bool llama_eval_internal(
          llama_context & lctx,
          const llama_token * tokens,
          const float * embd,
-         const int n_tokens,
-         const int n_past,
-         const int n_threads,
+         int n_tokens,
+         int n_past,
+         int n_threads,
          const char * cgraph_fname) {

      LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

-     // enforce that the first token is BOS
-     if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
-         fprintf(stderr, "%s: first token must be BOS\n", __func__);
-         return false;
-     }
+ #ifdef GGML_USE_MPI
+     ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+ #endif

      const int64_t t_start_us = ggml_time_us();

@@ -1261,20 +1342,26 @@ static bool llama_eval_internal(

      struct ggml_context * ctx0 = ggml_init(params);

+     ggml_cgraph gf = {};
+
      // for big prompts, if BLAS is enabled, it is better to use only one thread
      // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-     ggml_cgraph gf = {};
-     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+     n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

      struct ggml_tensor * cur;
      struct ggml_tensor * inpL;

      if (tokens) {
-         struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-         ggml_set_name(embd, "embd");
-         memcpy(embd->data, tokens, N*ggml_element_size(embd));
-         inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+         memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+         ggml_set_name(inp_tokens, "inp_tokens");
+
+         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
      } else {
+ #ifdef GGML_USE_MPI
+         GGML_ASSERT(false && "not implemented");
+ #endif
+
          inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
          memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
      }
1292
1379
  offload_func_t offload_func_v = llama_nop;
1293
1380
 
1294
1381
  #ifdef GGML_USE_CUBLAS
1295
- if (n_gpu_layers > n_layer) {
1296
- offload_func_nr = ggml_cuda_assign_buffers;
1297
- }
1298
- if (n_gpu_layers > n_layer + 1) {
1299
- offload_func_v = ggml_cuda_assign_buffers;
1300
- }
1301
- if (n_gpu_layers > n_layer + 2) {
1302
- offload_func_kq = ggml_cuda_assign_buffers;
1303
- }
1382
+ if (n_gpu_layers > n_layer) {
1383
+ offload_func_nr = ggml_cuda_assign_buffers;
1384
+ }
1385
+ if (n_gpu_layers > n_layer + 1) {
1386
+ offload_func_v = ggml_cuda_assign_buffers;
1387
+ }
1388
+ if (n_gpu_layers > n_layer + 2) {
1389
+ offload_func_kq = ggml_cuda_assign_buffers;
1390
+ }
1304
1391
  #endif // GGML_USE_CUBLAS
1305
1392
 
1306
1393
  for (int il = 0; il < n_layer; ++il) {
1394
+ ggml_format_name(inpL, "layer_inp_%d", il);
1395
+
1307
1396
  offload_func_t offload_func = llama_nop;
1308
1397
 
1309
1398
  #ifdef GGML_USE_CUBLAS
@@ -1510,7 +1599,6 @@ static bool llama_eval_internal(

          // input for next layer
          inpL = cur;
-
      }

      lctx.use_buf(ctx0, 0);
@@ -1518,7 +1606,6 @@ static bool llama_eval_internal(
      // used at the end to optionally extract the embeddings
      struct ggml_tensor * embeddings = NULL;

-
      // norm
      {
          cur = ggml_rms_norm(ctx0, inpL);
@@ -1533,7 +1620,6 @@ static bool llama_eval_internal(
          embeddings = cur;
      }

-
      // lm_head
      cur = ggml_mul_mat(ctx0, model.output, cur);
      ggml_set_name(cur, "result_output");
@@ -1546,8 +1632,13 @@ static bool llama_eval_internal(
      // run the computation
      ggml_build_forward_expand(&gf, cur);

+ #if GGML_USE_MPI
+     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+ #endif
+
  #ifdef GGML_USE_METAL
      if (lctx.ctx_metal && N == 1) {
+         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
          ggml_metal_graph_compute(lctx.ctx_metal, &gf);
          ggml_metal_get_tensor (lctx.ctx_metal, cur);
      } else {
@@ -1567,12 +1658,21 @@ static bool llama_eval_internal(
              ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
          }

-         ggml_graph_compute(ctx0, &gf);
+         ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
      }
  #else
-     ggml_graph_compute(ctx0, &gf);
+     ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
  #endif

+ #if GGML_USE_MPI
+     ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+ #endif
+
+     // update kv token count
+     lctx.kv_self.n = n_past + N;
+
+     struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
      if (cgraph_fname) {
          ggml_graph_export(&gf, cgraph_fname);
      }
@@ -1588,23 +1688,17 @@ static bool llama_eval_internal(
      // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
      //}

-     //embd_w.resize(n_vocab*N);
-     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
-     // update kv token count
-     lctx.kv_self.n = n_past + N;
-
      // extract logits
      {
          auto & logits_out = lctx.logits;

          if (lctx.logits_all) {
              logits_out.resize(n_vocab * N);
-             memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
+             memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
          } else {
              // return result for just the last token
              logits_out.resize(n_vocab);
-             memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+             memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
          }
      }

@@ -1860,10 +1954,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
          return;
      }

-     const int64_t t_start_sample_us = ggml_time_us();
-
      llama_sample_softmax(ctx, candidates);

+     const int64_t t_start_sample_us = ggml_time_us();
+
      // Compute the cumulative probabilities
      float cum_sum = 0.0f;
      size_t last_idx = candidates->size;
@@ -1892,9 +1986,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
          return;
      }

-     const int64_t t_start_sample_us = ggml_time_us();
-
      llama_sample_softmax(nullptr, candidates);
+     const int64_t t_start_sample_us = ggml_time_us();

      // Compute the first and second derivatives
      std::vector<float> first_derivatives(candidates->size - 1);
@@ -1946,11 +2039,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
          return;
      }

-     const int64_t t_start_sample_us = ggml_time_us();
-
      // Compute the softmax of logits and calculate entropy
      llama_sample_softmax(nullptr, candidates);

+     const int64_t t_start_sample_us = ggml_time_us();
+
      float entropy = 0.0f;
      for (size_t i = 0; i < candidates->size; ++i) {
          entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2074,6 +2167,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
          }
      }

+ static void llama_log_softmax(float * array, size_t size) {
+     float max_l = *std::max_element(array, array + size);
+     float sum = 0.f;
+     for (size_t i = 0; i < size; ++i) {
+         float p = expf(array[i] - max_l);
+         sum += p;
+         array[i] = p;
+     }
+
+     for (size_t i = 0; i < size; ++i) {
+         array[i] = logf(array[i] / sum);
+     }
+ }
+
+ void llama_sample_classifier_free_guidance(
+         struct llama_context * ctx,
+         llama_token_data_array * candidates,
+         struct llama_context * guidance_ctx,
+         float scale,
+         float smooth_factor) {
+     int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+
+     assert(ctx);
+     auto n_vocab = llama_n_vocab(ctx);
+     assert(n_vocab == (int)candidates->size);
+     assert(!candidates->sorted);
+
+     std::vector<float> logits_base;
+     logits_base.reserve(candidates->size);
+     for (size_t i = 0; i < candidates->size; ++i) {
+         logits_base.push_back(candidates->data[i].logit);
+     }
+     llama_log_softmax(logits_base.data(), candidates->size);
+
+     float* logits_guidance = llama_get_logits(guidance_ctx);
+     llama_log_softmax(logits_guidance, n_vocab);
+
+     for (int i = 0; i < n_vocab; ++i) {
+         float logit_guidance = logits_guidance[i];
+         float logit_base = logits_base[i];
+         logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
+     }
+
+     llama_log_softmax(logits_guidance, n_vocab);
+
+     for (int i = 0; i < n_vocab; ++i) {
+         float logit_base = logits_base[i];
+         float logit_guidance = logits_guidance[i];
+
+         candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+     }
+
+     if (ctx) {
+         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+     }
+ }

  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
      assert(ctx);
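
`llama_sample_classifier_free_guidance` is a new public sampler in this release. A hedged usage sketch, not taken from the package itself: it assumes `ctx` and `guidance_ctx` were created from the same model and already evaluated with the main prompt and the guidance (negative) prompt respectively, and it shows only the per-token sampling step.

```cpp
#include "llama.h"
#include <vector>

// Sketch only: the helper name and parameter values are illustrative assumptions.
static llama_token sample_with_cfg(llama_context * ctx, llama_context * guidance_ctx,
                                   float scale, float smooth_factor) {
    const int     n_vocab = llama_n_vocab(ctx);
    const float * logits  = llama_get_logits(ctx);

    // build the candidate array from the main context's logits
    std::vector<llama_token_data> cand;
    cand.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cand.push_back(llama_token_data{ id, logits[id], 0.0f });
    }
    llama_token_data_array candidates = { cand.data(), cand.size(), /*sorted =*/ false };

    // scale > 1 amplifies the difference between the base and guidance distributions;
    // smooth_factor blends the guided logits (1.0) with the unmodified base logits (0.0)
    llama_sample_classifier_free_guidance(ctx, &candidates, guidance_ctx, scale, smooth_factor);

    return llama_sample_token_greedy(ctx, &candidates);
}
```

Per the implementation above, the guidance logits are rescaled as `scale * (base - guidance) + guidance` before being blended back into the candidate array.
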
@@ -2119,13 +2268,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

      if (ctx) {
          ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-         ctx->n_sample++;
      }
      return X;
  }

  llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-     assert(ctx);
      int64_t t_start_sample_us;
      t_start_sample_us = ggml_time_us();

@@ -2140,13 +2287,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
          candidates->size = 1;
      }

+     if (ctx) {
+         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+     }
+
      // Normalize the probabilities of the remaining words
      llama_sample_softmax(ctx, candidates);

      // Sample the next word X from the remaining words
-     if (ctx) {
-         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-     }
      llama_token X = llama_sample_token(ctx, candidates);
      t_start_sample_us = ggml_time_us();

@@ -2214,10 +2362,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
      }
      float * f32_output = (float *) output.addr;

-     quantize_fns_t qtype;
+     ggml_type_traits_t qtype;
      if (ggml_is_quantized(tensor.type)) {
-         qtype = ggml_internal_get_quantize_fn(tensor.type);
-         if (qtype.dequantize_row_q == NULL) {
+         qtype = ggml_internal_get_type_traits(tensor.type);
+         if (qtype.to_float == NULL) {
              throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
          }
      } else if (tensor.type != GGML_TYPE_F16) {
@@ -2228,7 +2376,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
      if (tensor.type == GGML_TYPE_F16) {
          ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
      } else if (ggml_is_quantized(tensor.type)) {
-         qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+         qtype.to_float(tensor.data, f32_output, nelements);
      } else {
          LLAMA_ASSERT(false); // unreachable
      }
@@ -2253,7 +2401,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
          if (typ == GGML_TYPE_F16) {
              ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
          } else {
-             qtype.dequantize_row_q(inbuf, outbuf, nels);
+             qtype.to_float(inbuf, outbuf, nels);
          }
      };
      workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
@@ -2362,15 +2510,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
          } else {
              new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
+             bool convert_incompatible_tensor = false;
              if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
                  quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
                  int nx = tensor.ne.at(0);
                  int ny = tensor.ne.at(1);
                  if (nx % QK_K != 0 || ny % QK_K != 0) {
-                     fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
-                     fprintf(stderr, "This is required to be able to use k-quants for now!\n");
-                     fprintf(stderr, "========================================================================================\n\n");
-                     throw std::runtime_error("Unsupported tensor size encountered\n");
+                     fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                     convert_incompatible_tensor = true;
                  }
              }
              if (tensor.name == "output.weight") {
@@ -2398,6 +2545,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
              }
+             if (convert_incompatible_tensor) {
+                 if (tensor.name == "output.weight") {
+                     new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+                     fprintf(stderr, "F16 will be used for this tensor instead.\n");
+                 } else if (tensor.name == "tok_embeddings.weight") {
+                     new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+                     fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+                 } else {
+                     throw std::runtime_error("Unsupported tensor size encountered\n");
+                 }
+             }
  #endif

              float * f32_data;
@@ -2532,8 +2690,8 @@ void llama_free_model(struct llama_model * model) {
  }

  struct llama_context * llama_new_context_with_model(
-         struct llama_model * model,
-         struct llama_context_params params) {
+         struct llama_model * model,
+         struct llama_context_params params) {

      if (!model) {
          return nullptr;
@@ -2602,7 +2760,7 @@ struct llama_context * llama_new_context_with_model(
  #ifdef GGML_USE_METAL
      if (params.n_gpu_layers > 0) {
          // this allocates all Metal resources and memory buffers
-         ctx->ctx_metal = ggml_metal_init();
+         ctx->ctx_metal = ggml_metal_init(1);

          void * data_ptr = NULL;
          size_t data_size = 0;
@@ -2637,6 +2795,18 @@ struct llama_context * llama_new_context_with_model(
      }
  #endif

+ #ifdef GGML_USE_MPI
+     ctx->ctx_mpi = ggml_mpi_init();
+
+     if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+         const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+         while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+         llama_backend_free();
+         exit(1);
+     }
+ #endif
+
      return ctx;
  }

@@ -2759,6 +2929,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
      // read tensors and apply
      bool warned = false;
      int n_tensors = 0;
+
+     std::vector<uint8_t> work_buffer;
+
      while (true) {
          int32_t n_dims;
          int32_t length;
@@ -2923,8 +3096,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
          }

          struct ggml_cgraph gf = ggml_build_forward(r);
-         gf.n_threads = n_threads;
-         ggml_graph_compute(lora_ctx, &gf);
+
+         ggml_graph_compute_helper(work_buffer, &gf, n_threads);

          // we won't need these tensors again, reset the context to save memory
          ggml_free(lora_ctx);
@@ -3077,7 +3250,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

          ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
          ggml_cgraph gf{};
-         gf.n_threads = 1;

          ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
          kout3d->data = out;
@@ -3097,7 +3269,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

          ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
          ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-         ggml_graph_compute(cpy_ctx, &gf);
+         ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

          ggml_free(cpy_ctx);
      }
@@ -3183,7 +3355,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

          ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
          ggml_cgraph gf{};
-         gf.n_threads = 1;

          ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
          kin3d->data = (void *) inp;
@@ -3203,7 +3374,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

          ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
          ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-         ggml_graph_compute(cpy_ctx, &gf);
+         ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

          ggml_free(cpy_ctx);
      }
@@ -3219,7 +3390,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

      return nread;
  }

- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
      llama_file file(path_session, "rb");

      // sanity checks
@@ -3273,6 +3444,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
      return true;
  }

+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+     try {
+         return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+     } catch (const std::exception & err) {
+         fprintf(stderr, "error loading session file: %s\n", err.what());
+         return false;
+     }
+ }
+
  bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
      llama_file file(path_session, "wb");

@@ -3428,23 +3608,35 @@ llama_token llama_token_nl() {
      return 13;
  }

+ struct llama_timings llama_get_timings(struct llama_context * ctx) {
+     struct llama_timings result = {
+         /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
+         /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
+         /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
+         /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+         /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+         /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,

- void llama_print_timings(struct llama_context * ctx) {
-     const int64_t t_end_us = ggml_time_us();
+         /*.n_sample =*/ std::max(1, ctx->n_sample),
+         /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+         /*.n_eval   =*/ std::max(1, ctx->n_eval),
+     };

-     const int32_t n_sample = std::max(1, ctx->n_sample);
-     const int32_t n_eval = std::max(1, ctx->n_eval);
-     const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+     return result;
+ }
+
+ void llama_print_timings(struct llama_context * ctx) {
+     const llama_timings timings = llama_get_timings(ctx);

      fprintf(stderr, "\n");
-     fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+     fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, timings.t_load_ms);
      fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-             __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+             __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
      fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-             __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
      fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-             __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
-     fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
+             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+     fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
  }

  void llama_reset_timings(struct llama_context * ctx) {