llama_cpp 0.3.2 → 0.3.3

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -65,7 +65,7 @@
  // ggml_set_f32(a, 3.0f);
  // ggml_set_f32(b, 4.0f);
  //
- // ggml_graph_compute(ctx0, &gf);
+ // ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
  //
  // printf("f = %f\n", ggml_get_f32_1d(f, 0));
  //
@@ -132,10 +132,10 @@
  // {
  // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
  //
- // // a[1, 2] = 1.0f;
+ // // a[2, 1] = 1.0f;
  // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
  //
- // // a[2, 0] = 2.0f;
+ // // a[0, 2] = 2.0f;
  // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
  //
  // ...
@@ -197,12 +197,17 @@
  #define GGML_MAX_NODES 4096
  #define GGML_MAX_PARAMS 256
  #define GGML_MAX_CONTEXTS 64
- #define GGML_MAX_OPT 4
+ #define GGML_MAX_SRC 6
  #define GGML_MAX_NAME 48
  #define GGML_DEFAULT_N_THREADS 4

+
+ #define GGML_EXIT_SUCCESS 0
+ #define GGML_EXIT_ABORTED 1
+
  #define GGML_UNUSED(x) (void)(x)

+
  #define GGML_ASSERT(x) \
  do { \
  if (!(x)) { \
@@ -363,6 +368,8 @@ extern "C" {
  GGML_OP_CLAMP,
  GGML_OP_CONV_1D,
  GGML_OP_CONV_2D,
+ GGML_OP_POOL_1D,
+ GGML_OP_POOL_2D,

  GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_FF,
@@ -414,12 +421,7 @@ extern "C" {
  bool is_param;

  struct ggml_tensor * grad;
- struct ggml_tensor * src0;
- struct ggml_tensor * src1;
- struct ggml_tensor * opt[GGML_MAX_OPT];
-
- // thread scheduling
- int n_tasks;
+ struct ggml_tensor * src[GGML_MAX_SRC];

  // performance
  int perf_runs;
@@ -432,19 +434,31 @@ extern "C" {

  void * extra; // extra things e.g. for ggml-cuda.cu

- char padding[4];
+ char padding[8];
  };

  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+ // the compute plan that needs to be prepared for ggml_graph_compute()
+ // since https://github.com/ggerganov/ggml/issues/287
+ struct ggml_cplan {
+ size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+ uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+ int n_threads;
+
+ // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+ int n_tasks[GGML_MAX_NODES];
+
+ // abort ggml_graph_compute when true
+ bool (*abort_callback)(void * data);
+ void * abort_callback_data;
+ };
+
  // computation graph
  struct ggml_cgraph {
  int n_nodes;
  int n_leafs;
- int n_threads;
-
- size_t work_size;
- struct ggml_tensor * work;

  struct ggml_tensor * nodes[GGML_MAX_NODES];
  struct ggml_tensor * grads[GGML_MAX_NODES];
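Note: the hunks above replace the old per-graph threading fields (`n_threads`, `work_size`, `work`) with a separate `ggml_cplan`. A minimal sketch of how a caller might drive the new plan-based API follows; the tensor `f`, the thread count, and the helper name are illustrative, not part of this diff.

    #include <vector>
    #include <cstdint>
    #include "ggml.h"

    // Build a graph as before, then plan it and supply the work buffer yourself.
    static int compute_graph_example(struct ggml_tensor * f) {
        struct ggml_cgraph gf  = ggml_build_forward(f);
        struct ggml_cplan plan = ggml_graph_plan(&gf, /*n_threads*/ 4);

        std::vector<uint8_t> work;               // the caller now owns the work buffer
        if (plan.work_size > 0) {
            work.resize(plan.work_size);
            plan.work_data = work.data();
        }

        // optional: cooperative cancellation through the new abort callback
        plan.abort_callback      = nullptr;
        plan.abort_callback_data = nullptr;

        return ggml_graph_compute(&gf, &plan);   // GGML_EXIT_SUCCESS or GGML_EXIT_ABORTED
    }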
@@ -1161,6 +1175,31 @@ extern "C" {
  int s,
  int d);

+ enum ggml_op_pool {
+ GGML_OP_POOL_MAX,
+ GGML_OP_POOL_AVG,
+ GGML_OP_POOL_COUNT,
+ };
+
+ GGML_API struct ggml_tensor* ggml_pool_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_op_pool op,
+ int k0, // kernel size
+ int s0, // stride
+ int p0); // padding
+
+ GGML_API struct ggml_tensor* ggml_pool_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_op_pool op,
+ int k0,
+ int k1,
+ int s0,
+ int s1,
+ int p0,
+ int p1);
+
  GGML_API struct ggml_tensor * ggml_flash_attn(
  struct ggml_context * ctx,
  struct ggml_tensor * q,
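Note: a hedged usage sketch of the new pooling ops. The (kernel, stride, padding) argument order follows the comments on `ggml_pool_1d` above; the tensor shapes here are illustrative assumptions, not taken from this diff.

    #include "ggml.h"

    static void pool_example(struct ggml_context * ctx) {
        // 1D max pooling over a length-64 row: kernel 2, stride 2, no padding
        struct ggml_tensor * x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
        struct ggml_tensor * y  = ggml_pool_1d(ctx, x, GGML_OP_POOL_MAX, 2, 2, 0);

        // 2D average pooling over a 28x28 map: 2x2 kernel, stride 2, no padding
        struct ggml_tensor * im = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 28, 28);
        struct ggml_tensor * z  = ggml_pool_2d(ctx, im, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);

        (void) y; (void) z;
    }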
@@ -1290,15 +1329,22 @@ extern "C" {

  GGML_API void ggml_set_param(
  struct ggml_context * ctx,
- struct ggml_tensor * tensor);
+ struct ggml_tensor * tensor);

  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

- GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
- GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+ // ggml_graph_plan() has to be called before ggml_graph_compute()
+ // when plan.work_size > 0, caller must allocate memory for plan.work_data
+ GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+ GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+
+ // same as ggml_graph_compute() but the work data is allocated as a part of the context
+ // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+ GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

  GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
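Note: for callers that previously set `gf.n_threads` and passed a context to `ggml_graph_compute()`, the smallest migration is the convenience wrapper declared above. A sketch, using the same variable names that appear elsewhere in this diff:

    // before (0.3.2, old ggml API):
    //   gf.n_threads = n_threads;
    //   ggml_graph_compute(ctx0, &gf);
    //
    // after (0.3.3), provided ctx0 has enough spare memory for the work data:
    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);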

@@ -19,6 +19,9 @@
  #ifdef GGML_USE_METAL
  #include "ggml-metal.h"
  #endif
+ #ifdef GGML_USE_MPI
+ #include "ggml-mpi.h"
+ #endif
  #ifdef GGML_USE_K_QUANTS
  #ifndef QK_K
  #ifdef GGML_QKK_64
@@ -79,6 +82,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
  (void) tensor;
  }

+ //
+ // ggml helpers
+ //
+
+ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+ if (plan.work_size > 0) {
+ buf.resize(plan.work_size);
+ plan.work_data = buf.data();
+ }
+
+ ggml_graph_compute(graph, &plan);
+ }
+
+ //
+ // memory sizes
+ //
+
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
  static std::map<e_model, size_t> k_sizes = {
@@ -321,6 +343,9 @@ struct llama_context {
  // input embedding (1-dimensional array: [n_embd])
  std::vector<float> embedding;

+ // reusable buffer for `struct ggml_graph_plan.work_data`
+ std::vector<uint8_t> work_buffer;
+
  // memory buffers used to evaluate the model
  // TODO: move in llama_state
  llama_ctx_buffer buf_compute;
@@ -330,6 +355,10 @@ struct llama_context {
  ggml_metal_context * ctx_metal = NULL;
  #endif

+ #ifdef GGML_USE_MPI
+ ggml_mpi_context * ctx_mpi = NULL;
+ #endif
+
  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -758,7 +787,6 @@ struct llama_model_loader {

  };

-
  //
  // kv cache
  //
@@ -849,7 +877,7 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

- void llama_init_backend(bool numa) {
+ void llama_backend_init(bool numa) {
  ggml_time_init();

  // needed to initialize f16 tables
@@ -862,6 +890,16 @@ void llama_init_backend(bool numa) {
  if (numa) {
  ggml_numa_init();
  }
+
+ #ifdef GGML_USE_MPI
+ ggml_mpi_backend_init();
+ #endif
+ }
+
+ void llama_backend_free() {
+ #ifdef GGML_USE_MPI
+ ggml_mpi_backend_free();
+ #endif
  }

  int64_t llama_time_us() {
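Note: a minimal sketch of the renamed backend lifecycle calls; model loading and generation are elided.

    #include "llama.h"

    int main() {
        llama_backend_init(/*numa*/ false);   // was: llama_init_backend(false)

        // ... load the model, create contexts, evaluate and sample ...

        llama_backend_free();                 // new; only does work when built with GGML_USE_MPI
        return 0;
    }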
@@ -1263,18 +1301,16 @@ static bool llama_eval_internal(
  llama_context & lctx,
  const llama_token * tokens,
  const float * embd,
- const int n_tokens,
- const int n_past,
- const int n_threads,
+ int n_tokens,
+ int n_past,
+ int n_threads,
  const char * cgraph_fname) {

  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

- // enforce that the first token is BOS
- if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
- fprintf(stderr, "%s: first token must be BOS\n", __func__);
- return false;
- }
+ #ifdef GGML_USE_MPI
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+ #endif

  const int64_t t_start_us = ggml_time_us();

@@ -1306,20 +1342,26 @@ static bool llama_eval_internal(

  struct ggml_context * ctx0 = ggml_init(params);

+ ggml_cgraph gf = {};
+
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

  if (tokens) {
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_set_name(embd, "embd");
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ ggml_set_name(inp_tokens, "inp_tokens");
+
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
  } else {
+ #ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+ #endif
+
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
  }
@@ -1337,18 +1379,20 @@ static bool llama_eval_internal(
  offload_func_t offload_func_v = llama_nop;

  #ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers;
- }
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers;
+ }
  #endif // GGML_USE_CUBLAS

  for (int il = 0; il < n_layer; ++il) {
+ ggml_format_name(inpL, "layer_inp_%d", il);
+
  offload_func_t offload_func = llama_nop;

  #ifdef GGML_USE_CUBLAS
@@ -1555,7 +1599,6 @@ static bool llama_eval_internal(

  // input for next layer
  inpL = cur;
-
  }

  lctx.use_buf(ctx0, 0);
@@ -1563,7 +1606,6 @@ static bool llama_eval_internal(
  // used at the end to optionally extract the embeddings
  struct ggml_tensor * embeddings = NULL;

-
  // norm
  {
  cur = ggml_rms_norm(ctx0, inpL);
@@ -1578,7 +1620,6 @@ static bool llama_eval_internal(
  embeddings = cur;
  }

-
  // lm_head
  cur = ggml_mul_mat(ctx0, model.output, cur);
  ggml_set_name(cur, "result_output");
@@ -1591,8 +1632,13 @@ static bool llama_eval_internal(
  // run the computation
  ggml_build_forward_expand(&gf, cur);

+ #if GGML_USE_MPI
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+ #endif
+
  #ifdef GGML_USE_METAL
  if (lctx.ctx_metal && N == 1) {
+ ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
  ggml_metal_graph_compute(lctx.ctx_metal, &gf);
  ggml_metal_get_tensor (lctx.ctx_metal, cur);
  } else {
@@ -1612,12 +1658,21 @@ static bool llama_eval_internal(
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
  }

- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
  }
  #else
- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
  #endif

+ #if GGML_USE_MPI
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+ #endif
+
+ // update kv token count
+ lctx.kv_self.n = n_past + N;
+
+ struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
  if (cgraph_fname) {
  ggml_graph_export(&gf, cgraph_fname);
  }
@@ -1633,23 +1688,17 @@ static bool llama_eval_internal(
  // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
  //}

- //embd_w.resize(n_vocab*N);
- //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
- // update kv token count
- lctx.kv_self.n = n_past + N;
-
  // extract logits
  {
  auto & logits_out = lctx.logits;

  if (lctx.logits_all) {
  logits_out.resize(n_vocab * N);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
  } else {
  // return result for just the last token
  logits_out.resize(n_vocab);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
  }
  }

@@ -2118,6 +2167,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
  }
  }

+ static void llama_log_softmax(float * array, size_t size) {
+ float max_l = *std::max_element(array, array + size);
+ float sum = 0.f;
+ for (size_t i = 0; i < size; ++i) {
+ float p = expf(array[i] - max_l);
+ sum += p;
+ array[i] = p;
+ }
+
+ for (size_t i = 0; i < size; ++i) {
+ array[i] = logf(array[i] / sum);
+ }
+ }
+
+ void llama_sample_classifier_free_guidance(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ struct llama_context * guidance_ctx,
+ float scale,
+ float smooth_factor) {
+ int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+
+ assert(ctx);
+ auto n_vocab = llama_n_vocab(ctx);
+ assert(n_vocab == (int)candidates->size);
+ assert(!candidates->sorted);
+
+ std::vector<float> logits_base;
+ logits_base.reserve(candidates->size);
+ for (size_t i = 0; i < candidates->size; ++i) {
+ logits_base.push_back(candidates->data[i].logit);
+ }
+ llama_log_softmax(logits_base.data(), candidates->size);
+
+ float* logits_guidance = llama_get_logits(guidance_ctx);
+ llama_log_softmax(logits_guidance, n_vocab);
+
+ for (int i = 0; i < n_vocab; ++i) {
+ float logit_guidance = logits_guidance[i];
+ float logit_base = logits_base[i];
+ logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
+ }
+
+ llama_log_softmax(logits_guidance, n_vocab);
+
+ for (int i = 0; i < n_vocab; ++i) {
+ float logit_base = logits_base[i];
+ float logit_guidance = logits_guidance[i];
+
+ candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+ }
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }

  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
  assert(ctx);
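Note: read as math, the guidance mixing added above is, with $\ell^b$ and $\ell^g$ the log-softmaxed base and guidance logits, $s$ = scale and $\alpha$ = smooth_factor:

$$\ell^{\mathrm{out}}_i = \alpha \cdot \operatorname{log\_softmax}\big(\ell^g + s\,(\ell^b - \ell^g)\big)_i + (1-\alpha)\,\ell^b_i$$

So $s = 1$ collapses the mix back to the base logits (no guidance), and $\alpha = 0$ returns the original logits unchanged, matching the parameter descriptions in the llama.h hunk further below.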
@@ -2405,15 +2510,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
+ bool convert_incompatible_tensor = false;
  if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
  quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
  int nx = tensor.ne.at(0);
  int ny = tensor.ne.at(1);
  if (nx % QK_K != 0 || ny % QK_K != 0) {
- fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
- fprintf(stderr, "This is required to be able to use k-quants for now!\n");
- fprintf(stderr, "========================================================================================\n\n");
- throw std::runtime_error("Unsupported tensor size encountered\n");
+ fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ convert_incompatible_tensor = true;
  }
  }
  if (tensor.name == "output.weight") {
@@ -2441,6 +2545,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  }
+ if (convert_incompatible_tensor) {
+ if (tensor.name == "output.weight") {
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+ fprintf(stderr, "F16 will be used for this tensor instead.\n");
+ } else if (tensor.name == "tok_embeddings.weight") {
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+ fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+ } else {
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
+ }
  #endif

  float * f32_data;
@@ -2575,8 +2690,8 @@ void llama_free_model(struct llama_model * model) {
  }

  struct llama_context * llama_new_context_with_model(
- struct llama_model * model,
- struct llama_context_params params) {
+ struct llama_model * model,
+ struct llama_context_params params) {

  if (!model) {
  return nullptr;
@@ -2645,7 +2760,7 @@ struct llama_context * llama_new_context_with_model(
  #ifdef GGML_USE_METAL
  if (params.n_gpu_layers > 0) {
  // this allocates all Metal resources and memory buffers
- ctx->ctx_metal = ggml_metal_init();
+ ctx->ctx_metal = ggml_metal_init(1);

  void * data_ptr = NULL;
  size_t data_size = 0;
@@ -2680,6 +2795,18 @@ struct llama_context * llama_new_context_with_model(
  }
  #endif

+ #ifdef GGML_USE_MPI
+ ctx->ctx_mpi = ggml_mpi_init();
+
+ if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+ // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+ const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+ while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+ llama_backend_free();
+ exit(1);
+ }
+ #endif
+
  return ctx;
  }

@@ -2802,6 +2929,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  // read tensors and apply
  bool warned = false;
  int n_tensors = 0;
+
+ std::vector<uint8_t> work_buffer;
+
  while (true) {
  int32_t n_dims;
  int32_t length;
@@ -2966,8 +3096,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  }

  struct ggml_cgraph gf = ggml_build_forward(r);
- gf.n_threads = n_threads;
- ggml_graph_compute(lora_ctx, &gf);
+
+ ggml_graph_compute_helper(work_buffer, &gf, n_threads);

  // we won't need these tensors again, reset the context to save memory
  ggml_free(lora_ctx);
@@ -3120,7 +3250,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
- gf.n_threads = 1;

  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
  kout3d->data = out;
@@ -3140,7 +3269,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -3226,7 +3355,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
- gf.n_threads = 1;

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
  kin3d->data = (void *) inp;
@@ -3246,7 +3374,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -158,7 +158,9 @@ extern "C" {
  // Initialize the llama + ggml backend
  // If numa is true, use NUMA optimizations
  // Call once at the start of the program
- LLAMA_API void llama_init_backend(bool numa);
+ LLAMA_API void llama_backend_init(bool numa);
+ // Call once at the end of the program - currently only used for MPI
+ LLAMA_API void llama_backend_free();

  LLAMA_API int64_t llama_time_us();

@@ -307,6 +309,18 @@ extern "C" {
  /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
  LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);

+ /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+ /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+ /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+ /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+ LLAMA_API void llama_sample_classifier_free_guidance(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ struct llama_context * guidance_ctx,
+ float scale,
+ float smooth_factor);
+
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
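Note: a hedged usage sketch of the new sampler. The surrounding calls (`llama_n_vocab`, `llama_get_logits`, `llama_sample_token`) come from the rest of llama.h rather than this diff, and `guidance_ctx` is assumed to be a second context on the same model, evaluated with the negative prompt followed by the same generated tokens, as the @params above describe.

    #include "llama.h"
    #include <vector>

    static llama_token sample_with_cfg(llama_context * ctx, llama_context * guidance_ctx) {
        const int     n_vocab = llama_n_vocab(ctx);
        const float * logits  = llama_get_logits(ctx);

        // build an unsorted candidate array directly from the main context's logits
        std::vector<llama_token_data> cand;
        cand.reserve(n_vocab);
        for (llama_token id = 0; id < n_vocab; ++id) {
            cand.push_back({ id, logits[id], 0.0f });
        }
        llama_token_data_array candidates = { cand.data(), cand.size(), false };

        // scale > 1.0f strengthens the guidance; smooth_factor 1.0f keeps only the
        // guided logits (both values here are illustrative)
        llama_sample_classifier_free_guidance(ctx, &candidates, guidance_ctx, 1.5f, 1.0f);

        return llama_sample_token(ctx, &candidates);
    }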

@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.2'
+ VERSION = '0.3.3'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-481f793'
+ LLAMA_CPP_VERSION = 'master-32c5411'
  end
data/lib/llama_cpp.rb CHANGED
@@ -108,4 +108,4 @@ module LLaMACpp
  end
  end

- LLaMACpp.init_backend
+ LLaMACpp.backend_init
data/sig/llama_cpp.rbs CHANGED
@@ -26,7 +26,8 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
  LLAMA_FTYPE_MOSTLY_Q6_K: Integer

- def self?.init_backend: (?numa: bool) -> void
+ def self?.backend_init: (?numa: bool) -> void
+ def self?.backend_free: () -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
  def self?.generate: (::LLaMACpp::Context, String,
  ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -108,6 +109,7 @@ module LLaMACpp
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
  def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+ def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void