llama_cpp 0.9.1 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
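The headline changes in the vendored llama.cpp source are: a new LLM_ARCH_STABLELM architecture (tensor names, hyperparameter loading and a build_stablelm() graph builder), a runtime ggml_cublas_loaded() check in place of compile-time-only cuBLAS paths, a fixed LLAMA_MAX_NODES graph budget, new tokenizer.ggml.add_bos_token / tokenizer.ggml.add_eos_token metadata with matching llama_add_bos_token() / llama_add_eos_token() accessors, MiB labels in log output, and a negative sentinel (instead of NAN) for an unset yarn_ext_factor. The sketch below is not taken from the package; it only illustrates the yarn_ext_factor change, assuming the usual llama.h entry points (the helper name is illustrative):

    #include "llama.h"

    // Minimal sketch: llama_context_default_params() now initializes
    // yarn_ext_factor to -1.0f rather than NAN, and any negative value is
    // treated as "not set" (resolved to 1.0f for LLAMA_ROPE_SCALING_YARN,
    // 0.0f otherwise) when the context is created.
    static struct llama_context_params example_context_params(void) {
        struct llama_context_params cparams = llama_context_default_params();
        // Leave cparams.yarn_ext_factor negative to keep the automatic
        // choice, or set it explicitly:
        // cparams.yarn_ext_factor = 1.0f;
        return cparams;
    }
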
@@ -91,6 +91,8 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif

+ #define LLAMA_MAX_NODES 4096
+
  //
  // logging
  //
@@ -190,6 +192,7 @@ enum llm_arch {
  LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BLOOM,
+ LLM_ARCH_STABLELM,
  LLM_ARCH_UNKNOWN,
  };

@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
  };

  enum llm_kv {
@@ -251,6 +255,8 @@ enum llm_kv {
  LLM_KV_TOKENIZER_UNK_ID,
  LLM_KV_TOKENIZER_SEP_ID,
  LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_ADD_BOS,
+ LLM_KV_TOKENIZER_ADD_EOS,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
  };
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+ { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
  };
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  },
  },
+ {
+ LLM_ARCH_STABLELM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -596,19 +623,37 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
  // llama helpers
  //

+ inline void * llama_host_malloc(size_t n) {
  #ifdef GGML_USE_CUBLAS
- # define llama_host_malloc(n) ggml_cuda_host_malloc(n)
- # define llama_host_free(data) ggml_cuda_host_free(data)
+ if (ggml_cublas_loaded()) {
+ return ggml_cuda_host_malloc(n);
+ } else {
+ return malloc(n);
+ }
+ #elif GGML_USE_METAL
+ return ggml_metal_host_malloc(n);
+ #elif GGML_USE_CPU_HBM
+ return hbw_malloc(n);
+ #else
+ return malloc(n);
+ #endif
+ }
+
+ inline void llama_host_free(void * ptr) {
+ #ifdef GGML_USE_CUBLAS
+ if (ggml_cublas_loaded()) {
+ return ggml_cuda_host_free(ptr);
+ } else {
+ return free(ptr);
+ }
  #elif GGML_USE_METAL
- # define llama_host_malloc(n) ggml_metal_host_malloc(n)
- # define llama_host_free(data) ggml_metal_host_free(data)
+ return ggml_metal_host_free(ptr);
  #elif GGML_USE_CPU_HBM
- # define llama_host_malloc(n) hbw_malloc(n)
- # define llama_host_free(data) if (data != NULL) hbw_free(data)
+ return hbw_free(ptr);
  #else
- # define llama_host_malloc(n) malloc(n)
- # define llama_host_free(data) free(data)
+ return free(ptr);
  #endif
+ }

  #if defined(_WIN32)
  static std::string llama_format_win_err(DWORD err) {
@@ -1037,9 +1082,9 @@ enum e_model {
  MODEL_70B,
  };

- static const size_t kB = 1024;
- static const size_t MB = 1024*kB;
- static const size_t GB = 1024*MB;
+ static const size_t kiB = 1024;
+ static const size_t MiB = 1024*kiB;
+ static const size_t GiB = 1024*MiB;

  struct llama_hparams {
  bool vocab_only;
@@ -1195,9 +1240,11 @@ struct llama_kv_cache {
  }

  #ifdef GGML_USE_CUBLAS
- ggml_cuda_free_data(k);
- ggml_cuda_free_data(v);
- #endif // GGML_USE_CUBLAS
+ if (ggml_cublas_loaded()) {
+ ggml_cuda_free_data(k);
+ ggml_cuda_free_data(v);
+ }
+ #endif
  }
  };

@@ -1228,6 +1275,9 @@ struct llama_vocab {
  id special_sep_id = -1;
  id special_pad_id = -1;

+ int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+ int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
  id linefeed_id = 13;
  id special_prefix_id = 32007;
  id special_middle_id = 32009;
@@ -1297,11 +1347,15 @@ struct llama_model {
  }

  #ifdef GGML_USE_CUBLAS
- for (size_t i = 0; i < tensors_by_name.size(); ++i) {
- ggml_cuda_free_data(tensors_by_name[i].second);
+ if (ggml_cublas_loaded()) {
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+ ggml_cuda_free_data(tensors_by_name[i].second);
+ }
+ ggml_cuda_free_scratch();
  }
- ggml_cuda_free_scratch();
- #elif defined(GGML_USE_CLBLAST)
+ #endif
+
+ #if defined(GGML_USE_CLBLAST)
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
  ggml_cl_free_data(tensors_by_name[i].second);
  }
@@ -1413,23 +1467,26 @@ static bool llama_kv_cache_init(
  ggml_set_name(cache.v, "cache_v");

  (void) n_gpu_layers;
+
  #ifdef GGML_USE_CUBLAS
- size_t vram_kv_cache = 0;
+ if (ggml_cublas_loaded()) {
+ size_t vram_kv_cache = 0;

- if (n_gpu_layers > (int)n_layer + 1) {
- ggml_cuda_assign_buffers_no_scratch(cache.v);
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
- vram_kv_cache += ggml_nbytes(cache.v);
- }
- if (n_gpu_layers > (int)n_layer + 2) {
- ggml_cuda_assign_buffers_no_scratch(cache.k);
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
- vram_kv_cache += ggml_nbytes(cache.k);
- }
- if (vram_kv_cache > 0) {
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ if (n_gpu_layers > (int)n_layer + 1) {
+ ggml_cuda_assign_buffers_no_scratch(cache.v);
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+ vram_kv_cache += ggml_nbytes(cache.v);
+ }
+ if (n_gpu_layers > (int)n_layer + 2) {
+ ggml_cuda_assign_buffers_no_scratch(cache.k);
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+ vram_kv_cache += ggml_nbytes(cache.k);
+ }
+ if (vram_kv_cache > 0) {
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ }
  }
- #endif // GGML_USE_CUBLAS
+ #endif

  return true;
  }
@@ -2182,6 +2239,16 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+
  default: (void)0;
  }

@@ -2323,6 +2390,23 @@ static void llm_load_vocab(
  __func__, key.c_str(), id, old_id);
  id = old_id;
  }
+
+ }
+
+ // Handle add_bos_token and add_eos_token
+ std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+ int kid = gguf_find_key(ctx, key.c_str());
+ enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+ }
+ key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+ kid = gguf_find_key(ctx, key.c_str());
+ ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
  }
  }

@@ -2454,8 +2538,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
  LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
- if (ml.n_bytes < GB) {
- LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+ if (ml.n_bytes < GiB) {
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  } else {
  LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  }
@@ -2493,7 +2577,7 @@ static void llm_load_tensors(

  ml.calc_sizes(ctx_size, mmapped_size);

- LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);

  // create the ggml context
  {
@@ -2516,18 +2600,22 @@ static void llm_load_tensors(
  }

  (void) main_gpu;
+
+ enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+ enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
+
  #ifdef GGML_USE_CUBLAS
- LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
- ggml_cuda_set_main_device(main_gpu);
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
- #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+ if (ggml_cublas_loaded()) {
+ LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+ ggml_cuda_set_main_device(main_gpu);
+
+ llama_backend_offload = GGML_BACKEND_GPU;
+ llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
+ }
  #elif defined(GGML_USE_CLBLAST)
- LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
- #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
- #else
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
- #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
+ LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+ llama_backend_offload = GGML_BACKEND_GPU;
+ llama_backend_offload_split = GGML_BACKEND_GPU;
  #endif

  // prepare memory for the weights
@@ -2554,12 +2642,12 @@ static void llm_load_tensors(
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
  #else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
  #endif // _WIN32

- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
  } else {
  backend_norm = GGML_BACKEND_CPU;
  backend_output = GGML_BACKEND_CPU;
@@ -2583,8 +2671,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);

  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

  auto & layer = model.layers[i];

@@ -2620,12 +2708,12 @@ static void llm_load_tensors(
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
  #else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
  #endif // _WIN32

- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
  } else {
  backend_norm = GGML_BACKEND_CPU;
  backend_output = GGML_BACKEND_CPU;
@@ -2649,8 +2737,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);

  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

  auto & layer = model.layers[i];

@@ -2690,12 +2778,12 @@ static void llm_load_tensors(
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
  #else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
  #endif // _WIN32

- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
  } else {
  backend_norm = GGML_BACKEND_CPU;
  backend_output = GGML_BACKEND_CPU;
@@ -2721,8 +2809,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);

  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

  auto & layer = model.layers[i];

@@ -2767,12 +2855,12 @@ static void llm_load_tensors(
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
  #else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
  #endif // _WIN32

- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
  } else {
  backend_norm = GGML_BACKEND_CPU;
  backend_output = GGML_BACKEND_CPU;
@@ -2798,8 +2886,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);

  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

  auto & layer = model.layers[i];

@@ -2841,15 +2929,22 @@ static void llm_load_tensors(
  ggml_backend_type backend_output;

  if (n_gpu_layers > int(n_layer)) {
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > int(n_layer + 1)) {
+ LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+ __func__, n_layer + 1);
+ throw std::runtime_error("Persimmon CUDA offload failed");
+ }
+ #endif
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
  #else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
  #endif // _WIN32

- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
  } else {
  backend_norm = GGML_BACKEND_CPU;
  backend_output = GGML_BACKEND_CPU;
@@ -2872,8 +2967,8 @@ static void llm_load_tensors(
  const int i_gpu_start = n_layer - n_gpu_layers;
  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
  auto & layer = model.layers[i];
  layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2910,12 +3005,12 @@ static void llm_load_tensors(
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
  #else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
  #endif // _WIN32

- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
  } else {
  backend_norm = GGML_BACKEND_CPU;
  backend_output = GGML_BACKEND_CPU;
@@ -2941,8 +3036,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);

  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

  auto & layer = model.layers[i];

@@ -2988,12 +3083,12 @@ static void llm_load_tensors(
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
  #else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
  #endif // _WIN32

- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
  } else {
  backend_norm = GGML_BACKEND_CPU;
  backend_output = GGML_BACKEND_CPU;
@@ -3017,8 +3112,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);

  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

  auto & layer = model.layers[i];

@@ -3042,6 +3137,81 @@ static void llm_load_tensors(
  }
  }
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = llama_backend_offload;
+ #else
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+ #endif // _WIN32
+
+ backend_output = llama_backend_offload_split;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ /*
+ llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
+ */
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+ }
+ }
+ } break;
+
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -3056,7 +3226,7 @@ static void llm_load_tensors(
  ctx_size +
  mmapped_size - vram_weights; // weights in VRAM not in memory

- LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);

  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3075,7 +3245,7 @@ static void llm_load_tensors(
  #endif // GGML_USE_CUBLAS

  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
  #else
  (void) n_gpu_layers;
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3575,7 +3745,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_llama() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -3687,7 +3857,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_baichuan() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -3807,7 +3977,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_falcon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -3929,7 +4099,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_starcoder() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * pos;
@@ -4028,7 +4198,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_persimmon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  const int64_t n_rot = n_embd_head / 2;

@@ -4173,7 +4343,7 @@ struct llm_build_context {
  struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
  cb(Kcur, "Kcur", il);

- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
  cb(Q, "Q", il);

  Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
@@ -4238,7 +4408,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_refact() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4329,7 +4499,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_bloom() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4423,7 +4593,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_mpt() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4520,6 +4690,177 @@ struct llm_build_context {

  return gf;
  }
+
+ struct ggml_cgraph * build_stablelm() {
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_scale
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ cb(KQ_scale, "KQ_scale", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(tmpq, "tmpq", il);
+
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(tmpk, "tmpk", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ // RoPE the first n_rot of q/k, pass the other half, and concat.
+ struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
+ ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
+ ggml_element_size(tmpq) * n_embd_head,
+ ggml_element_size(tmpq) * n_embd_head * n_head,
+ 0
+ ));
+ cb(qrot, "qrot", il);
+
+ struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
+ ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
+ ggml_element_size(tmpk) * n_embd_head,
+ ggml_element_size(tmpk) * n_embd_head * n_head_kv,
+ 0
+ ));
+ cb(krot, "krot", il);
+
+ // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
+ struct ggml_tensor * qpass = ggml_view_3d(
+ ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
+ ggml_element_size(tmpq) * n_embd_head,
+ ggml_element_size(tmpq) * n_embd_head * n_head,
+ ggml_element_size(tmpq) * hparams.n_rot
+ );
+ cb(qpass, "qpass", il);
+
+ struct ggml_tensor * kpass = ggml_view_3d(
+ ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
+ ggml_element_size(tmpk) * (n_embd_head),
+ ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
+ ggml_element_size(tmpk) * hparams.n_rot
+ );
+ cb(kpass, "kpass", il);
+
+ struct ggml_tensor * qrotated = ggml_rope_custom(
+ ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(qrotated, "qrotated", il);
+
+ struct ggml_tensor * krotated = ggml_rope_custom(
+ ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(krotated, "krotated", il);
+
+ // ggml currently only supports concatenation on dim=2
+ // so we need to permute qrot, qpass, concat, then permute back.
+ qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
+ cb(qrotated, "qrotated", il);
+
+ krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
+ cb(krotated, "krotated", il);
+
+ qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
+ cb(qpass, "qpass", il);
+
+ kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
+ cb(kpass, "kpass", il);
+
+ struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
+ cb(Q, "Q", il);
+
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+ cb(Kcur, "Kcur", il);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, hparams, kv_self,
+ model.layers[il].wo, NULL,
+ Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };

  //
@@ -4989,6 +5330,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_mpt();
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ result = llm.build_stablelm();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -5159,11 +5504,13 @@ static int llama_decode_internal(

  // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
  const bool full_offload_supported =
- model.arch == LLM_ARCH_LLAMA ||
- model.arch == LLM_ARCH_BAICHUAN ||
- model.arch == LLM_ARCH_FALCON ||
- model.arch == LLM_ARCH_REFACT ||
- model.arch == LLM_ARCH_MPT;
+ model.arch == LLM_ARCH_LLAMA ||
+ model.arch == LLM_ARCH_BAICHUAN ||
+ model.arch == LLM_ARCH_FALCON ||
+ model.arch == LLM_ARCH_REFACT ||
+ model.arch == LLM_ARCH_MPT ||
+ model.arch == LLM_ARCH_STARCODER ||
+ model.arch == LLM_ARCH_STABLELM;

  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -5955,7 +6302,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
  // and passing 'add space prefix' as bool argument
  //
- auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ if (&fragment == &fragment_buffer.front()) {
+ raw_text = " " + raw_text; // prefix with space if the first token is not special
+ }

  #ifdef PRETOKENIZERDEBUG
  fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
@@ -7607,7 +7957,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  workers.clear();
  }

- LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
  int64_t tot_count = 0;
  for (size_t i = 0; i < hist_cur.size(); i++) {
  hist_all[i] += hist_cur[i];
@@ -7977,7 +8327,7 @@ struct llama_context_params llama_context_default_params() {
  /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
  /*.rope_freq_base =*/ 0.0f,
  /*.rope_freq_scale =*/ 0.0f,
- /*.yarn_ext_factor =*/ NAN,
+ /*.yarn_ext_factor =*/ -1.0f,
  /*.yarn_attn_factor =*/ 1.0f,
  /*.yarn_beta_fast =*/ 32.0f,
  /*.yarn_beta_slow =*/ 1.0f,
@@ -8120,7 +8470,7 @@ struct llama_context * llama_new_context_with_model(
  cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
  }

- if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set'
+ if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
  cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
  }

@@ -8147,7 +8497,7 @@ struct llama_context * llama_new_context_with_model(

  {
  const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
- LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
  }

  // resized during inference
@@ -8164,7 +8514,7 @@ struct llama_context * llama_new_context_with_model(
  {
  static const size_t tensor_alignment = 32;
  // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
- ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+ ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

  // create measure allocator
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
@@ -8192,7 +8542,7 @@ struct llama_context * llama_new_context_with_model(
  // measure memory requirements for the graph
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

- LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);

  // recreate allocator with exact memory requirements
  ggml_allocr_free(ctx->alloc);
@@ -8206,7 +8556,7 @@ struct llama_context * llama_new_context_with_model(
  #endif
  #ifdef GGML_USE_CUBLAS
  ggml_cuda_set_scratch_size(alloc_size);
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

  // calculate total VRAM usage
  auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8226,10 +8576,10 @@ struct llama_context * llama_new_context_with_model(
  size_t ctx_vram_size = alloc_size + kv_vram_size;
  size_t total_vram_size = model_vram_size + ctx_vram_size;

- LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
  total_vram_size / 1024.0 / 1024.0,
  model_vram_size / 1024.0 / 1024.0,
- ctx_vram_size / 1024.0 / 1024.0);
+ ctx_vram_size / 1024.0 / 1024.0);
  #endif
  }

@@ -8250,7 +8600,7 @@ struct llama_context * llama_new_context_with_model(

  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);

  #define LLAMA_METAL_CHECK_BUF(result) \
  if (!(result)) { \
@@ -8553,8 +8903,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  if (kv_buf_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);

- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8572,9 +8922,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  kv_head, n_embd, n_layer,
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);

@@ -8681,8 +9031,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  const size_t elt_size = ggml_element_size(kv_self.k);

- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
  kin3d->data = (void *) inp;
@@ -8700,9 +9050,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  kv_head, n_embd, n_layer,
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }
@@ -8957,6 +9307,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
  return model->vocab.linefeed_id;
  }

+ int llama_add_bos_token(const struct llama_model * model) {
+ return model->vocab.special_add_bos;
+ }
+
+ int llama_add_eos_token(const struct llama_model * model) {
+ return model->vocab.special_add_eos;
+ }
+
  llama_token llama_token_prefix(const struct llama_model * model) {
  return model->vocab.special_prefix_id;
  }
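
The final hunk above adds two public accessors, llama_add_bos_token() and llama_add_eos_token(), which expose the tokenizer.ggml.add_bos_token / tokenizer.ggml.add_eos_token GGUF metadata read in llm_load_vocab(). A minimal usage sketch, assuming a loaded llama_model pointer (the helper name and fallback flag are illustrative, not part of the package):

    #include "llama.h"

    // The accessors return 1 (add), 0 (don't add) or -1 (the GGUF file
    // carried no add_bos_token metadata), so callers still need their own
    // default for the unknown case.
    static bool should_add_bos(const struct llama_model * model, bool fallback) {
        const int add_bos = llama_add_bos_token(model);
        return add_bos < 0 ? fallback : add_bos != 0;
    }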