llama_cpp 0.3.2 → 0.3.3

@@ -65,7 +65,7 @@
  // ggml_set_f32(a, 3.0f);
  // ggml_set_f32(b, 4.0f);
  //
- // ggml_graph_compute(ctx0, &gf);
+ // ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
  //
  // printf("f = %f\n", ggml_get_f32_1d(f, 0));
  //
@@ -132,10 +132,10 @@
  // {
  // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
  //
- // // a[1, 2] = 1.0f;
+ // // a[2, 1] = 1.0f;
  // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
  //
- // // a[2, 0] = 2.0f;
+ // // a[0, 2] = 2.0f;
  // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
  //
  // ...
@@ -197,12 +197,17 @@
  #define GGML_MAX_NODES 4096
  #define GGML_MAX_PARAMS 256
  #define GGML_MAX_CONTEXTS 64
- #define GGML_MAX_OPT 4
+ #define GGML_MAX_SRC 6
  #define GGML_MAX_NAME 48
  #define GGML_DEFAULT_N_THREADS 4

+
+ #define GGML_EXIT_SUCCESS 0
+ #define GGML_EXIT_ABORTED 1
+
  #define GGML_UNUSED(x) (void)(x)

+
  #define GGML_ASSERT(x) \
  do { \
  if (!(x)) { \
@@ -363,6 +368,8 @@ extern "C" {
  GGML_OP_CLAMP,
  GGML_OP_CONV_1D,
  GGML_OP_CONV_2D,
+ GGML_OP_POOL_1D,
+ GGML_OP_POOL_2D,

  GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_FF,
@@ -414,12 +421,7 @@ extern "C" {
  bool is_param;

  struct ggml_tensor * grad;
- struct ggml_tensor * src0;
- struct ggml_tensor * src1;
- struct ggml_tensor * opt[GGML_MAX_OPT];
-
- // thread scheduling
- int n_tasks;
+ struct ggml_tensor * src[GGML_MAX_SRC];

  // performance
  int perf_runs;
@@ -432,19 +434,31 @@ extern "C" {

  void * extra; // extra things e.g. for ggml-cuda.cu

- char padding[4];
+ char padding[8];
  };

  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+ // the compute plan that needs to be prepared for ggml_graph_compute()
+ // since https://github.com/ggerganov/ggml/issues/287
+ struct ggml_cplan {
+ size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+ uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+ int n_threads;
+
+ // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+ int n_tasks[GGML_MAX_NODES];
+
+ // abort ggml_graph_compute when true
+ bool (*abort_callback)(void * data);
+ void * abort_callback_data;
+ };
+
  // computation graph
  struct ggml_cgraph {
  int n_nodes;
  int n_leafs;
- int n_threads;
-
- size_t work_size;
- struct ggml_tensor * work;

  struct ggml_tensor * nodes[GGML_MAX_NODES];
  struct ggml_tensor * grads[GGML_MAX_NODES];
@@ -1161,6 +1175,31 @@ extern "C" {
  int s,
  int d);

+ enum ggml_op_pool {
+ GGML_OP_POOL_MAX,
+ GGML_OP_POOL_AVG,
+ GGML_OP_POOL_COUNT,
+ };
+
+ GGML_API struct ggml_tensor* ggml_pool_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_op_pool op,
+ int k0, // kernel size
+ int s0, // stride
+ int p0); // padding
+
+ GGML_API struct ggml_tensor* ggml_pool_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_op_pool op,
+ int k0,
+ int k1,
+ int s0,
+ int s1,
+ int p0,
+ int p1);
+
  GGML_API struct ggml_tensor * ggml_flash_attn(
  struct ggml_context * ctx,
  struct ggml_tensor * q,
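Note: the new pooling operators take an explicit kernel size, stride and padding, in that order, as annotated in the ggml_pool_1d declaration above. A small illustrative fragment, not part of this diff, assuming an already-initialized ggml_context named ctx; the tensor length and pooling parameters are placeholders:

    // 1D max-pool: kernel 2, stride 2, no padding -> halves the length (16 -> 8)
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * y = ggml_pool_1d(ctx, x, GGML_OP_POOL_MAX, /*k0*/ 2, /*s0*/ 2, /*p0*/ 0);
    // as with other ops, y only holds data after it is placed in a graph and the graph is computed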
@@ -1290,15 +1329,22 @@ extern "C" {

  GGML_API void ggml_set_param(
  struct ggml_context * ctx,
- struct ggml_tensor * tensor);
+ struct ggml_tensor * tensor);

  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

- GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
- GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+ // ggml_graph_plan() has to be called before ggml_graph_compute()
+ // when plan.work_size > 0, caller must allocate memory for plan.work_data
+ GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+ GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+
+ // same as ggml_graph_compute() but the work data is allocated as a part of the context
+ // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+ GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

  GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

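Note: the hunk above replaces the old context-bound ggml_graph_compute() with a plan-based API. A minimal standalone sketch of the new caller-side flow (not part of this diff; the context size, tensor shapes and thread count are arbitrary placeholders):

    #include "ggml.h"
    #include <cstdio>
    #include <vector>

    int main() {
        // small context; the work buffer is now owned by the caller, not the context
        struct ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_f32(a, 3.0f);
        ggml_set_f32(b, 4.0f);

        struct ggml_tensor * f = ggml_mul(ctx, a, b);
        struct ggml_cgraph gf = ggml_build_forward(f);

        // 1) plan, 2) allocate plan.work_data if needed, 3) compute
        struct ggml_cplan plan = ggml_graph_plan(&gf, /*n_threads*/ 4);
        std::vector<uint8_t> work;
        if (plan.work_size > 0) {
            work.resize(plan.work_size);
            plan.work_data = work.data();
        }
        int rc = ggml_graph_compute(&gf, &plan); // GGML_EXIT_SUCCESS or GGML_EXIT_ABORTED

        printf("rc = %d, f[0] = %f\n", rc, ggml_get_f32_1d(f, 0));

        // convenience wrapper when the context has spare memory for the work buffer:
        // ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads*/ 4);

        ggml_free(ctx);
        return 0;
    }

The ggml_graph_compute_helper() added to llama.cpp below does the same buffer handling with a reusable std::vector.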
@@ -19,6 +19,9 @@
  #ifdef GGML_USE_METAL
  #include "ggml-metal.h"
  #endif
+ #ifdef GGML_USE_MPI
+ #include "ggml-mpi.h"
+ #endif
  #ifdef GGML_USE_K_QUANTS
  #ifndef QK_K
  #ifdef GGML_QKK_64
@@ -79,6 +82,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
  (void) tensor;
  }

+ //
+ // ggml helpers
+ //
+
+ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+ if (plan.work_size > 0) {
+ buf.resize(plan.work_size);
+ plan.work_data = buf.data();
+ }
+
+ ggml_graph_compute(graph, &plan);
+ }
+
+ //
+ // memory sizes
+ //
+
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  {
  static std::map<e_model, size_t> k_sizes = {
@@ -321,6 +343,9 @@ struct llama_context {
  // input embedding (1-dimensional array: [n_embd])
  std::vector<float> embedding;

+ // reusable buffer for `struct ggml_graph_plan.work_data`
+ std::vector<uint8_t> work_buffer;
+
  // memory buffers used to evaluate the model
  // TODO: move in llama_state
  llama_ctx_buffer buf_compute;
@@ -330,6 +355,10 @@ struct llama_context {
  ggml_metal_context * ctx_metal = NULL;
  #endif

+ #ifdef GGML_USE_MPI
+ ggml_mpi_context * ctx_mpi = NULL;
+ #endif
+
  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -758,7 +787,6 @@ struct llama_model_loader {

  };

-
  //
  // kv cache
  //
@@ -849,7 +877,7 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

- void llama_init_backend(bool numa) {
+ void llama_backend_init(bool numa) {
  ggml_time_init();

  // needed to initialize f16 tables
@@ -862,6 +890,16 @@ void llama_init_backend(bool numa) {
  if (numa) {
  ggml_numa_init();
  }
+
+ #ifdef GGML_USE_MPI
+ ggml_mpi_backend_init();
+ #endif
+ }
+
+ void llama_backend_free() {
+ #ifdef GGML_USE_MPI
+ ggml_mpi_backend_free();
+ #endif
  }

  int64_t llama_time_us() {
@@ -1263,18 +1301,16 @@ static bool llama_eval_internal(
  llama_context & lctx,
  const llama_token * tokens,
  const float * embd,
- const int n_tokens,
- const int n_past,
- const int n_threads,
+ int n_tokens,
+ int n_past,
+ int n_threads,
  const char * cgraph_fname) {

  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

- // enforce that the first token is BOS
- if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
- fprintf(stderr, "%s: first token must be BOS\n", __func__);
- return false;
- }
+ #ifdef GGML_USE_MPI
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+ #endif

  const int64_t t_start_us = ggml_time_us();

@@ -1306,20 +1342,26 @@

  struct ggml_context * ctx0 = ggml_init(params);

+ ggml_cgraph gf = {};
+
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

  if (tokens) {
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_set_name(embd, "embd");
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ ggml_set_name(inp_tokens, "inp_tokens");
+
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
  } else {
+ #ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+ #endif
+
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
  }
@@ -1337,18 +1379,20 @@ static bool llama_eval_internal(
  offload_func_t offload_func_v = llama_nop;

  #ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func_nr = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 1) {
- offload_func_v = ggml_cuda_assign_buffers;
- }
- if (n_gpu_layers > n_layer + 2) {
- offload_func_kq = ggml_cuda_assign_buffers;
- }
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers;
+ }
  #endif // GGML_USE_CUBLAS

  for (int il = 0; il < n_layer; ++il) {
+ ggml_format_name(inpL, "layer_inp_%d", il);
+
  offload_func_t offload_func = llama_nop;

  #ifdef GGML_USE_CUBLAS
@@ -1555,7 +1599,6 @@ static bool llama_eval_internal(

  // input for next layer
  inpL = cur;
-
  }

  lctx.use_buf(ctx0, 0);
@@ -1563,7 +1606,6 @@ static bool llama_eval_internal(
  // used at the end to optionally extract the embeddings
  struct ggml_tensor * embeddings = NULL;

-
  // norm
  {
  cur = ggml_rms_norm(ctx0, inpL);
@@ -1578,7 +1620,6 @@ static bool llama_eval_internal(
  embeddings = cur;
  }

-
  // lm_head
  cur = ggml_mul_mat(ctx0, model.output, cur);
  ggml_set_name(cur, "result_output");
@@ -1591,8 +1632,13 @@ static bool llama_eval_internal(
  // run the computation
  ggml_build_forward_expand(&gf, cur);

+ #if GGML_USE_MPI
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+ #endif
+
  #ifdef GGML_USE_METAL
  if (lctx.ctx_metal && N == 1) {
+ ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
  ggml_metal_graph_compute(lctx.ctx_metal, &gf);
  ggml_metal_get_tensor (lctx.ctx_metal, cur);
  } else {
@@ -1612,12 +1658,21 @@ static bool llama_eval_internal(
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
  }

- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
  }
  #else
- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
  #endif

+ #if GGML_USE_MPI
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+ #endif
+
+ // update kv token count
+ lctx.kv_self.n = n_past + N;
+
+ struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+
  if (cgraph_fname) {
  ggml_graph_export(&gf, cgraph_fname);
  }
@@ -1633,23 +1688,17 @@ static bool llama_eval_internal(
  // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
  //}

- //embd_w.resize(n_vocab*N);
- //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
-
- // update kv token count
- lctx.kv_self.n = n_past + N;
-
  // extract logits
  {
  auto & logits_out = lctx.logits;

  if (lctx.logits_all) {
  logits_out.resize(n_vocab * N);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
  } else {
  // return result for just the last token
  logits_out.resize(n_vocab);
- memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
  }
  }

@@ -2118,6 +2167,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
  }
  }

+ static void llama_log_softmax(float * array, size_t size) {
+ float max_l = *std::max_element(array, array + size);
+ float sum = 0.f;
+ for (size_t i = 0; i < size; ++i) {
+ float p = expf(array[i] - max_l);
+ sum += p;
+ array[i] = p;
+ }
+
+ for (size_t i = 0; i < size; ++i) {
+ array[i] = logf(array[i] / sum);
+ }
+ }
+
+ void llama_sample_classifier_free_guidance(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ struct llama_context * guidance_ctx,
+ float scale,
+ float smooth_factor) {
+ int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+
+ assert(ctx);
+ auto n_vocab = llama_n_vocab(ctx);
+ assert(n_vocab == (int)candidates->size);
+ assert(!candidates->sorted);
+
+ std::vector<float> logits_base;
+ logits_base.reserve(candidates->size);
+ for (size_t i = 0; i < candidates->size; ++i) {
+ logits_base.push_back(candidates->data[i].logit);
+ }
+ llama_log_softmax(logits_base.data(), candidates->size);
+
+ float* logits_guidance = llama_get_logits(guidance_ctx);
+ llama_log_softmax(logits_guidance, n_vocab);
+
+ for (int i = 0; i < n_vocab; ++i) {
+ float logit_guidance = logits_guidance[i];
+ float logit_base = logits_base[i];
+ logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
+ }
+
+ llama_log_softmax(logits_guidance, n_vocab);
+
+ for (int i = 0; i < n_vocab; ++i) {
+ float logit_base = logits_base[i];
+ float logit_guidance = logits_guidance[i];
+
+ candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+ }
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }

  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
  assert(ctx);
@@ -2405,15 +2510,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
+ bool convert_incompatible_tensor = false;
  if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
  quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
  int nx = tensor.ne.at(0);
  int ny = tensor.ne.at(1);
  if (nx % QK_K != 0 || ny % QK_K != 0) {
- fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
- fprintf(stderr, "This is required to be able to use k-quants for now!\n");
- fprintf(stderr, "========================================================================================\n\n");
- throw std::runtime_error("Unsupported tensor size encountered\n");
+ fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ convert_incompatible_tensor = true;
  }
  }
  if (tensor.name == "output.weight") {
@@ -2441,6 +2545,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  }
+ if (convert_incompatible_tensor) {
+ if (tensor.name == "output.weight") {
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+ fprintf(stderr, "F16 will be used for this tensor instead.\n");
+ } else if (tensor.name == "tok_embeddings.weight") {
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+ fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+ } else {
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
+ }
  #endif

  float * f32_data;
@@ -2575,8 +2690,8 @@ void llama_free_model(struct llama_model * model) {
  }

  struct llama_context * llama_new_context_with_model(
- struct llama_model * model,
- struct llama_context_params params) {
+ struct llama_model * model,
+ struct llama_context_params params) {

  if (!model) {
  return nullptr;
@@ -2645,7 +2760,7 @@ struct llama_context * llama_new_context_with_model(
  #ifdef GGML_USE_METAL
  if (params.n_gpu_layers > 0) {
  // this allocates all Metal resources and memory buffers
- ctx->ctx_metal = ggml_metal_init();
+ ctx->ctx_metal = ggml_metal_init(1);

  void * data_ptr = NULL;
  size_t data_size = 0;
@@ -2680,6 +2795,18 @@ struct llama_context * llama_new_context_with_model(
  }
  #endif

+ #ifdef GGML_USE_MPI
+ ctx->ctx_mpi = ggml_mpi_init();
+
+ if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+ // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
+ const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
+ while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+ llama_backend_free();
+ exit(1);
+ }
+ #endif
+
  return ctx;
  }

@@ -2802,6 +2929,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  // read tensors and apply
  bool warned = false;
  int n_tensors = 0;
+
+ std::vector<uint8_t> work_buffer;
+
  while (true) {
  int32_t n_dims;
  int32_t length;
@@ -2966,8 +3096,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  }

  struct ggml_cgraph gf = ggml_build_forward(r);
- gf.n_threads = n_threads;
- ggml_graph_compute(lora_ctx, &gf);
+
+ ggml_graph_compute_helper(work_buffer, &gf, n_threads);

  // we won't need these tensors again, reset the context to save memory
  ggml_free(lora_ctx);
@@ -3120,7 +3250,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
- gf.n_threads = 1;

  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
  kout3d->data = out;
@@ -3140,7 +3269,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -3226,7 +3355,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
- gf.n_threads = 1;

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
  kin3d->data = (void *) inp;
@@ -3246,7 +3374,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -158,7 +158,9 @@ extern "C" {
  // Initialize the llama + ggml backend
  // If numa is true, use NUMA optimizations
  // Call once at the start of the program
- LLAMA_API void llama_init_backend(bool numa);
+ LLAMA_API void llama_backend_init(bool numa);
+ // Call once at the end of the program - currently only used for MPI
+ LLAMA_API void llama_backend_free();

  LLAMA_API int64_t llama_time_us();

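Note: llama_init_backend() is renamed to llama_backend_init() and gains a matching llama_backend_free(). A minimal sketch of the intended lifecycle for C API callers (not part of this diff; the model path and default parameters are placeholders):

    #include "llama.h"

    int main() {
        llama_backend_init(/*numa*/ false);   // once, at program start

        llama_context_params params = llama_context_default_params();
        llama_model * model = llama_load_model_from_file("model.bin", params);
        llama_context * ctx = llama_new_context_with_model(model, params);

        // ... tokenize, llama_eval(), sampling ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();                 // once, at program end (currently MPI teardown)
        return 0;
    }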
@@ -307,6 +309,18 @@ extern "C" {
  /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
  LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);

+ /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+ /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+ /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+ /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+ LLAMA_API void llama_sample_classifier_free_guidance(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ struct llama_context * guidance_ctx,
+ float scale,
+ float smooth_factor);
+
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

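Note: a rough sketch of how the new classifier-free-guidance sampler slots into an existing sampling loop, following the doc comments above. This fragment is not part of the diff; it assumes ctx and guidance_ctx come from the same model and have both been evaluated up to the current position, and the scale/smooth_factor values are arbitrary:

    // build the candidate array from the main context's unsorted logits
    const int n_vocab = llama_n_vocab(ctx);
    float * logits = llama_get_logits(ctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        candidates.push_back(llama_token_data{ id, logits[id], 0.0f });
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    // apply guidance before any other samplers, while the array is still unsorted
    llama_sample_classifier_free_guidance(ctx, &candidates_p, guidance_ctx, /*scale*/ 1.5f, /*smooth_factor*/ 0.2f);

    llama_token tok = llama_sample_token_greedy(ctx, &candidates_p);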
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.2'
+ VERSION = '0.3.3'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-481f793'
+ LLAMA_CPP_VERSION = 'master-32c5411'
  end
data/lib/llama_cpp.rb CHANGED
@@ -108,4 +108,4 @@ module LLaMACpp
  end
  end

- LLaMACpp.init_backend
+ LLaMACpp.backend_init
data/sig/llama_cpp.rbs CHANGED
@@ -26,7 +26,8 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
  LLAMA_FTYPE_MOSTLY_Q6_K: Integer

- def self?.init_backend: (?numa: bool) -> void
+ def self?.backend_init: (?numa: bool) -> void
+ def self?.backend_free: () -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
  def self?.generate: (::LLaMACpp::Context, String,
  ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -108,6 +109,7 @@ module LLaMACpp
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
  def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+ def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void