llama_cpp 0.9.1 → 0.9.3

@@ -91,6 +91,8 @@
91
91
  #define LLAMA_ATTRIBUTE_FORMAT(...)
92
92
  #endif
93
93
 
94
+ #define LLAMA_MAX_NODES 4096
95
+
94
96
  //
95
97
  // logging
96
98
  //
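The new LLAMA_MAX_NODES constant gives llama.cpp its own graph-size budget: later hunks switch every graph builder from ggml_new_graph to ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false) and size the compute buffer from the same constant. A minimal sketch (not part of the diff) of how that budget translates into an allocation, assuming the ggml headers vendored with this gem:

```cpp
// Sketch: sizing a scratch context from the node budget, as the later
// buf_compute change does, then creating a graph with an explicit node cap.
#include "ggml.h"
#include <cstdio>
#include <vector>

#define LLAMA_MAX_NODES 4096

int main() {
    // room for LLAMA_MAX_NODES tensor headers plus the graph object itself
    const size_t buf_size = ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead();
    std::vector<uint8_t> buf_compute(buf_size);

    struct ggml_init_params params = { buf_size, buf_compute.data(), /*no_alloc =*/ true };
    struct ggml_context * ctx0 = ggml_init(params);

    // graphs now carry an explicit capacity instead of the ggml default
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, /*grads =*/ false);
    printf("allocated a graph with room for %d nodes\n", LLAMA_MAX_NODES);

    (void) gf;
    ggml_free(ctx0);
    return 0;
}
```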
@@ -190,6 +192,7 @@ enum llm_arch {
190
192
  LLM_ARCH_PERSIMMON,
191
193
  LLM_ARCH_REFACT,
192
194
  LLM_ARCH_BLOOM,
195
+ LLM_ARCH_STABLELM,
193
196
  LLM_ARCH_UNKNOWN,
194
197
  };
195
198
 
@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
205
208
  { LLM_ARCH_PERSIMMON, "persimmon" },
206
209
  { LLM_ARCH_REFACT, "refact" },
207
210
  { LLM_ARCH_BLOOM, "bloom" },
211
+ { LLM_ARCH_STABLELM, "stablelm" },
208
212
  };
209
213
 
210
214
  enum llm_kv {
@@ -251,6 +255,8 @@ enum llm_kv {
251
255
  LLM_KV_TOKENIZER_UNK_ID,
252
256
  LLM_KV_TOKENIZER_SEP_ID,
253
257
  LLM_KV_TOKENIZER_PAD_ID,
258
+ LLM_KV_TOKENIZER_ADD_BOS,
259
+ LLM_KV_TOKENIZER_ADD_EOS,
254
260
  LLM_KV_TOKENIZER_HF_JSON,
255
261
  LLM_KV_TOKENIZER_RWKV,
256
262
  };
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
299
305
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
300
306
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
301
307
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
308
+ { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
309
+ { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
302
310
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
303
311
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
304
312
  };
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
493
501
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
494
502
  },
495
503
  },
504
+ {
505
+ LLM_ARCH_STABLELM,
506
+ {
507
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
508
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
509
+ { LLM_TENSOR_OUTPUT, "output" },
510
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
511
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
512
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
513
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
514
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
515
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
516
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
517
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
518
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
519
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
520
+ },
521
+ },
522
+
496
523
  {
497
524
  LLM_ARCH_UNKNOWN,
498
525
  {
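The per-architecture tables above hold printf-style templates; the loader substitutes the layer index and appends a suffix when resolving tensor names. A standalone sketch of that expansion (the helper name is illustrative, not the loader's actual API):

```cpp
#include <cstdio>
#include <string>

// "blk.%d.attn_q" + layer 7 + "weight" -> "blk.7.attn_q.weight"
static std::string format_tensor_name(const std::string & tmpl, int layer, const char * suffix) {
    char buf[256];
    snprintf(buf, sizeof(buf), tmpl.c_str(), layer);
    return std::string(buf) + "." + suffix;
}

int main() {
    printf("%s\n", format_tensor_name("blk.%d.attn_q", 7, "weight").c_str());
    return 0;
}
```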
@@ -596,19 +623,37 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
596
623
  // llama helpers
597
624
  //
598
625
 
626
+ inline void * llama_host_malloc(size_t n) {
599
627
  #ifdef GGML_USE_CUBLAS
600
- # define llama_host_malloc(n) ggml_cuda_host_malloc(n)
601
- # define llama_host_free(data) ggml_cuda_host_free(data)
628
+ if (ggml_cublas_loaded()) {
629
+ return ggml_cuda_host_malloc(n);
630
+ } else {
631
+ return malloc(n);
632
+ }
633
+ #elif GGML_USE_METAL
634
+ return ggml_metal_host_malloc(n);
635
+ #elif GGML_USE_CPU_HBM
636
+ return hbw_malloc(n);
637
+ #else
638
+ return malloc(n);
639
+ #endif
640
+ }
641
+
642
+ inline void llama_host_free(void * ptr) {
643
+ #ifdef GGML_USE_CUBLAS
644
+ if (ggml_cublas_loaded()) {
645
+ return ggml_cuda_host_free(ptr);
646
+ } else {
647
+ return free(ptr);
648
+ }
602
649
  #elif GGML_USE_METAL
603
- # define llama_host_malloc(n) ggml_metal_host_malloc(n)
604
- # define llama_host_free(data) ggml_metal_host_free(data)
650
+ return ggml_metal_host_free(ptr);
605
651
  #elif GGML_USE_CPU_HBM
606
- # define llama_host_malloc(n) hbw_malloc(n)
607
- # define llama_host_free(data) if (data != NULL) hbw_free(data)
652
+ return hbw_free(ptr);
608
653
  #else
609
- # define llama_host_malloc(n) malloc(n)
610
- # define llama_host_free(data) free(data)
654
+ return free(ptr);
611
655
  #endif
656
+ }
612
657
 
613
658
  #if defined(_WIN32)
614
659
  static std::string llama_format_win_err(DWORD err) {
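The host-memory helpers change from macros to inline functions so that a cuBLAS build which did not actually load CUDA at runtime (ggml_cublas_loaded() returns false) falls back to plain malloc/free instead of calling into the CUDA allocator. A hedged caller-side sketch; the RAII wrapper is illustrative, and the CPU-only stand-ins below take the place of the real helpers:

```cpp
#include <cstddef>
#include <cstdlib>

// stand-ins for the inline helpers introduced above (CPU fallback path only)
inline void * llama_host_malloc(size_t n)  { return malloc(n); }
inline void   llama_host_free(void * ptr)  { free(ptr); }

// small RAII wrapper pairing the two calls; not part of llama.cpp
struct host_buffer {
    void * data = nullptr;
    size_t size = 0;

    explicit host_buffer(size_t n) : data(llama_host_malloc(n)), size(n) {}
    ~host_buffer() { llama_host_free(data); }

    host_buffer(const host_buffer &) = delete;
    host_buffer & operator=(const host_buffer &) = delete;
};

int main() {
    host_buffer buf(1024*1024); // 1 MiB of host (possibly pinned) memory
    return buf.data == nullptr ? 1 : 0;
}
```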
@@ -1037,9 +1082,9 @@ enum e_model {
1037
1082
  MODEL_70B,
1038
1083
  };
1039
1084
 
1040
- static const size_t kB = 1024;
1041
- static const size_t MB = 1024*kB;
1042
- static const size_t GB = 1024*MB;
1085
+ static const size_t kiB = 1024;
1086
+ static const size_t MiB = 1024*kiB;
1087
+ static const size_t GiB = 1024*MiB;
1043
1088
 
1044
1089
  struct llama_hparams {
1045
1090
  bool vocab_only;
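The size constants are renamed to their binary-prefix forms, and the log messages later in the diff switch from "MB" to "MiB" to match: these are powers of 1024, not 1000. A tiny self-contained example of the convention:

```cpp
#include <cstddef>
#include <cstdio>

static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;

int main() {
    const size_t n_bytes = 2*GiB + 123*MiB;
    if (n_bytes < GiB) {
        printf("model size = %.2f MiB\n", n_bytes/1024.0/1024.0);
    } else {
        printf("model size = %.2f GiB\n", n_bytes/1024.0/1024.0/1024.0);
    }
    return 0;
}
```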
@@ -1195,9 +1240,11 @@ struct llama_kv_cache {
1195
1240
  }
1196
1241
 
1197
1242
  #ifdef GGML_USE_CUBLAS
1198
- ggml_cuda_free_data(k);
1199
- ggml_cuda_free_data(v);
1200
- #endif // GGML_USE_CUBLAS
1243
+ if (ggml_cublas_loaded()) {
1244
+ ggml_cuda_free_data(k);
1245
+ ggml_cuda_free_data(v);
1246
+ }
1247
+ #endif
1201
1248
  }
1202
1249
  };
1203
1250
 
@@ -1228,6 +1275,9 @@ struct llama_vocab {
1228
1275
  id special_sep_id = -1;
1229
1276
  id special_pad_id = -1;
1230
1277
 
1278
+ int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
1279
+ int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
1280
+
1231
1281
  id linefeed_id = 13;
1232
1282
  id special_prefix_id = 32007;
1233
1283
  id special_middle_id = 32009;
@@ -1297,11 +1347,15 @@ struct llama_model {
1297
1347
  }
1298
1348
 
1299
1349
  #ifdef GGML_USE_CUBLAS
1300
- for (size_t i = 0; i < tensors_by_name.size(); ++i) {
1301
- ggml_cuda_free_data(tensors_by_name[i].second);
1350
+ if (ggml_cublas_loaded()) {
1351
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
1352
+ ggml_cuda_free_data(tensors_by_name[i].second);
1353
+ }
1354
+ ggml_cuda_free_scratch();
1302
1355
  }
1303
- ggml_cuda_free_scratch();
1304
- #elif defined(GGML_USE_CLBLAST)
1356
+ #endif
1357
+
1358
+ #if defined(GGML_USE_CLBLAST)
1305
1359
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
1306
1360
  ggml_cl_free_data(tensors_by_name[i].second);
1307
1361
  }
@@ -1413,23 +1467,26 @@ static bool llama_kv_cache_init(
1413
1467
  ggml_set_name(cache.v, "cache_v");
1414
1468
 
1415
1469
  (void) n_gpu_layers;
1470
+
1416
1471
  #ifdef GGML_USE_CUBLAS
1417
- size_t vram_kv_cache = 0;
1472
+ if (ggml_cublas_loaded()) {
1473
+ size_t vram_kv_cache = 0;
1418
1474
 
1419
- if (n_gpu_layers > (int)n_layer + 1) {
1420
- ggml_cuda_assign_buffers_no_scratch(cache.v);
1421
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
1422
- vram_kv_cache += ggml_nbytes(cache.v);
1423
- }
1424
- if (n_gpu_layers > (int)n_layer + 2) {
1425
- ggml_cuda_assign_buffers_no_scratch(cache.k);
1426
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
1427
- vram_kv_cache += ggml_nbytes(cache.k);
1428
- }
1429
- if (vram_kv_cache > 0) {
1430
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1475
+ if (n_gpu_layers > (int)n_layer + 1) {
1476
+ ggml_cuda_assign_buffers_no_scratch(cache.v);
1477
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
1478
+ vram_kv_cache += ggml_nbytes(cache.v);
1479
+ }
1480
+ if (n_gpu_layers > (int)n_layer + 2) {
1481
+ ggml_cuda_assign_buffers_no_scratch(cache.k);
1482
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
1483
+ vram_kv_cache += ggml_nbytes(cache.k);
1484
+ }
1485
+ if (vram_kv_cache > 0) {
1486
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1487
+ }
1431
1488
  }
1432
- #endif // GGML_USE_CUBLAS
1489
+ #endif
1433
1490
 
1434
1491
  return true;
1435
1492
  }
@@ -2182,6 +2239,16 @@ static void llm_load_hparams(
2182
2239
  default: model.type = e_model::MODEL_UNKNOWN;
2183
2240
  }
2184
2241
  } break;
2242
+ case LLM_ARCH_STABLELM:
2243
+ {
2244
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2245
+
2246
+ switch (hparams.n_layer) {
2247
+ case 32: model.type = e_model::MODEL_3B; break;
2248
+ default: model.type = e_model::MODEL_UNKNOWN;
2249
+ }
2250
+ } break;
2251
+
2185
2252
  default: (void)0;
2186
2253
  }
2187
2254
 
@@ -2323,6 +2390,23 @@ static void llm_load_vocab(
2323
2390
  __func__, key.c_str(), id, old_id);
2324
2391
  id = old_id;
2325
2392
  }
2393
+
2394
+ }
2395
+
2396
+ // Handle add_bos_token and add_eos_token
2397
+ std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
2398
+ int kid = gguf_find_key(ctx, key.c_str());
2399
+ enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
2400
+ vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
2401
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
2402
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
2403
+ }
2404
+ key = kv(LLM_KV_TOKENIZER_ADD_EOS);
2405
+ kid = gguf_find_key(ctx, key.c_str());
2406
+ ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
2407
+ vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
2408
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
2409
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
2326
2410
  }
2327
2411
  }
2328
2412
 
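The vocabulary loader now also reads the optional tokenizer.ggml.add_bos_token and tokenizer.ggml.add_eos_token booleans, keeping -1 ("unknown") when a key is absent and warning when it has an unexpected type. The same lookup, condensed into a hypothetical helper (assumes the gguf API declared in the vendored ggml.h):

```cpp
#include "ggml.h"

// returns 1 or 0 when the key is present and boolean, -1 when absent;
// a non-boolean value is treated as absent, as the loader's warning path does
static int gguf_get_optional_bool(const struct gguf_context * ctx, const char * key) {
    const int kid = gguf_find_key(ctx, key);
    if (kid < 0) {
        return -1;
    }
    if (gguf_get_kv_type(ctx, kid) != GGUF_TYPE_BOOL) {
        return -1;
    }
    return gguf_get_val_bool(ctx, kid) ? 1 : 0;
}
```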
@@ -2454,8 +2538,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
2454
2538
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
2455
2539
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
2456
2540
  LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
2457
- if (ml.n_bytes < GB) {
2458
- LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
2541
+ if (ml.n_bytes < GiB) {
2542
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
2459
2543
  } else {
2460
2544
  LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
2461
2545
  }
@@ -2493,7 +2577,7 @@ static void llm_load_tensors(
2493
2577
 
2494
2578
  ml.calc_sizes(ctx_size, mmapped_size);
2495
2579
 
2496
- LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
2580
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
2497
2581
 
2498
2582
  // create the ggml context
2499
2583
  {
@@ -2516,18 +2600,22 @@ static void llm_load_tensors(
2516
2600
  }
2517
2601
 
2518
2602
  (void) main_gpu;
2603
+
2604
+ enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
2605
+ enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
2606
+
2519
2607
  #ifdef GGML_USE_CUBLAS
2520
- LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
2521
- ggml_cuda_set_main_device(main_gpu);
2522
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
2523
- #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
2608
+ if (ggml_cublas_loaded()) {
2609
+ LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
2610
+ ggml_cuda_set_main_device(main_gpu);
2611
+
2612
+ llama_backend_offload = GGML_BACKEND_GPU;
2613
+ llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
2614
+ }
2524
2615
  #elif defined(GGML_USE_CLBLAST)
2525
- LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
2526
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
2527
- #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
2528
- #else
2529
- #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
2530
- #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
2616
+ LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
2617
+ llama_backend_offload = GGML_BACKEND_GPU;
2618
+ llama_backend_offload_split = GGML_BACKEND_GPU;
2531
2619
  #endif
2532
2620
 
2533
2621
  // prepare memory for the weights
@@ -2554,12 +2642,12 @@ static void llm_load_tensors(
2554
2642
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2555
2643
  // on Windows however this is detrimental unless everything is on the GPU
2556
2644
  #ifndef _WIN32
2557
- backend_norm = LLAMA_BACKEND_OFFLOAD;
2645
+ backend_norm = llama_backend_offload;
2558
2646
  #else
2559
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2647
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2560
2648
  #endif // _WIN32
2561
2649
 
2562
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2650
+ backend_output = llama_backend_offload_split;
2563
2651
  } else {
2564
2652
  backend_norm = GGML_BACKEND_CPU;
2565
2653
  backend_output = GGML_BACKEND_CPU;
@@ -2583,8 +2671,8 @@ static void llm_load_tensors(
2583
2671
  model.layers.resize(n_layer);
2584
2672
 
2585
2673
  for (uint32_t i = 0; i < n_layer; ++i) {
2586
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2587
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2674
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
2675
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
2588
2676
 
2589
2677
  auto & layer = model.layers[i];
2590
2678
 
@@ -2620,12 +2708,12 @@ static void llm_load_tensors(
2620
2708
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2621
2709
  // on Windows however this is detrimental unless everything is on the GPU
2622
2710
  #ifndef _WIN32
2623
- backend_norm = LLAMA_BACKEND_OFFLOAD;
2711
+ backend_norm = llama_backend_offload;
2624
2712
  #else
2625
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2713
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2626
2714
  #endif // _WIN32
2627
2715
 
2628
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2716
+ backend_output = llama_backend_offload_split;
2629
2717
  } else {
2630
2718
  backend_norm = GGML_BACKEND_CPU;
2631
2719
  backend_output = GGML_BACKEND_CPU;
@@ -2649,8 +2737,8 @@ static void llm_load_tensors(
2649
2737
  model.layers.resize(n_layer);
2650
2738
 
2651
2739
  for (uint32_t i = 0; i < n_layer; ++i) {
2652
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2653
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2740
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
2741
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
2654
2742
 
2655
2743
  auto & layer = model.layers[i];
2656
2744
 
@@ -2690,12 +2778,12 @@ static void llm_load_tensors(
2690
2778
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2691
2779
  // on Windows however this is detrimental unless everything is on the GPU
2692
2780
  #ifndef _WIN32
2693
- backend_norm = LLAMA_BACKEND_OFFLOAD;
2781
+ backend_norm = llama_backend_offload;
2694
2782
  #else
2695
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2783
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2696
2784
  #endif // _WIN32
2697
2785
 
2698
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2786
+ backend_output = llama_backend_offload_split;
2699
2787
  } else {
2700
2788
  backend_norm = GGML_BACKEND_CPU;
2701
2789
  backend_output = GGML_BACKEND_CPU;
@@ -2721,8 +2809,8 @@ static void llm_load_tensors(
2721
2809
  model.layers.resize(n_layer);
2722
2810
 
2723
2811
  for (uint32_t i = 0; i < n_layer; ++i) {
2724
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2725
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2812
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
2813
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
2726
2814
 
2727
2815
  auto & layer = model.layers[i];
2728
2816
 
@@ -2767,12 +2855,12 @@ static void llm_load_tensors(
2767
2855
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2768
2856
  // on Windows however this is detrimental unless everything is on the GPU
2769
2857
  #ifndef _WIN32
2770
- backend_norm = LLAMA_BACKEND_OFFLOAD;
2858
+ backend_norm = llama_backend_offload;
2771
2859
  #else
2772
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2860
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2773
2861
  #endif // _WIN32
2774
2862
 
2775
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2863
+ backend_output = llama_backend_offload_split;
2776
2864
  } else {
2777
2865
  backend_norm = GGML_BACKEND_CPU;
2778
2866
  backend_output = GGML_BACKEND_CPU;
@@ -2798,8 +2886,8 @@ static void llm_load_tensors(
2798
2886
  model.layers.resize(n_layer);
2799
2887
 
2800
2888
  for (uint32_t i = 0; i < n_layer; ++i) {
2801
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2802
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2889
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
2890
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
2803
2891
 
2804
2892
  auto & layer = model.layers[i];
2805
2893
 
@@ -2841,15 +2929,22 @@ static void llm_load_tensors(
2841
2929
  ggml_backend_type backend_output;
2842
2930
 
2843
2931
  if (n_gpu_layers > int(n_layer)) {
2932
+ #ifdef GGML_USE_CUBLAS
2933
+ if (n_gpu_layers > int(n_layer + 1)) {
2934
+ LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
2935
+ __func__, n_layer + 1);
2936
+ throw std::runtime_error("Persimmon CUDA offload failed");
2937
+ }
2938
+ #endif
2844
2939
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2845
2940
  // on Windows however this is detrimental unless everything is on the GPU
2846
2941
  #ifndef _WIN32
2847
- backend_norm = LLAMA_BACKEND_OFFLOAD;
2942
+ backend_norm = llama_backend_offload;
2848
2943
  #else
2849
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2944
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2850
2945
  #endif // _WIN32
2851
2946
 
2852
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2947
+ backend_output = llama_backend_offload_split;
2853
2948
  } else {
2854
2949
  backend_norm = GGML_BACKEND_CPU;
2855
2950
  backend_output = GGML_BACKEND_CPU;
@@ -2872,8 +2967,8 @@ static void llm_load_tensors(
2872
2967
  const int i_gpu_start = n_layer - n_gpu_layers;
2873
2968
  model.layers.resize(n_layer);
2874
2969
  for (uint32_t i = 0; i < n_layer; ++i) {
2875
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2876
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2970
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
2971
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
2877
2972
  auto & layer = model.layers[i];
2878
2973
  layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2879
2974
  layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2910,12 +3005,12 @@ static void llm_load_tensors(
2910
3005
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2911
3006
  // on Windows however this is detrimental unless everything is on the GPU
2912
3007
  #ifndef _WIN32
2913
- backend_norm = LLAMA_BACKEND_OFFLOAD;
3008
+ backend_norm = llama_backend_offload;
2914
3009
  #else
2915
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
3010
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2916
3011
  #endif // _WIN32
2917
3012
 
2918
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
3013
+ backend_output = llama_backend_offload_split;
2919
3014
  } else {
2920
3015
  backend_norm = GGML_BACKEND_CPU;
2921
3016
  backend_output = GGML_BACKEND_CPU;
@@ -2941,8 +3036,8 @@ static void llm_load_tensors(
2941
3036
  model.layers.resize(n_layer);
2942
3037
 
2943
3038
  for (uint32_t i = 0; i < n_layer; ++i) {
2944
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2945
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
3039
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3040
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
2946
3041
 
2947
3042
  auto & layer = model.layers[i];
2948
3043
 
@@ -2988,12 +3083,12 @@ static void llm_load_tensors(
2988
3083
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2989
3084
  // on Windows however this is detrimental unless everything is on the GPU
2990
3085
  #ifndef _WIN32
2991
- backend_norm = LLAMA_BACKEND_OFFLOAD;
3086
+ backend_norm = llama_backend_offload;
2992
3087
  #else
2993
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
3088
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
2994
3089
  #endif // _WIN32
2995
3090
 
2996
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
3091
+ backend_output = llama_backend_offload_split;
2997
3092
  } else {
2998
3093
  backend_norm = GGML_BACKEND_CPU;
2999
3094
  backend_output = GGML_BACKEND_CPU;
@@ -3017,8 +3112,8 @@ static void llm_load_tensors(
3017
3112
  model.layers.resize(n_layer);
3018
3113
 
3019
3114
  for (uint32_t i = 0; i < n_layer; ++i) {
3020
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
3021
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
3115
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3116
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3022
3117
 
3023
3118
  auto & layer = model.layers[i];
3024
3119
 
@@ -3042,6 +3137,81 @@ static void llm_load_tensors(
3042
3137
  }
3043
3138
  }
3044
3139
  } break;
3140
+ case LLM_ARCH_STABLELM:
3141
+ {
3142
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3143
+
3144
+ // output
3145
+ {
3146
+ ggml_backend_type backend_norm;
3147
+ ggml_backend_type backend_output;
3148
+
3149
+ if (n_gpu_layers > int(n_layer)) {
3150
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
3151
+ // on Windows however this is detrimental unless everything is on the GPU
3152
+ #ifndef _WIN32
3153
+ backend_norm = llama_backend_offload;
3154
+ #else
3155
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
3156
+ #endif // _WIN32
3157
+
3158
+ backend_output = llama_backend_offload_split;
3159
+ } else {
3160
+ backend_norm = GGML_BACKEND_CPU;
3161
+ backend_output = GGML_BACKEND_CPU;
3162
+ }
3163
+
3164
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3165
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3166
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3167
+
3168
+ if (backend_norm == GGML_BACKEND_GPU) {
3169
+ vram_weights += ggml_nbytes(model.output_norm);
3170
+ }
3171
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3172
+ vram_weights += ggml_nbytes(model.output);
3173
+ }
3174
+ }
3175
+
3176
+ const uint32_t n_ff = hparams.n_ff;
3177
+
3178
+ const int i_gpu_start = n_layer - n_gpu_layers;
3179
+
3180
+ model.layers.resize(n_layer);
3181
+
3182
+ for (uint32_t i = 0; i < n_layer; ++i) {
3183
+ /*
3184
+ llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
3185
+ */
3186
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3187
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3188
+
3189
+ auto & layer = model.layers[i];
3190
+
3191
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3192
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3193
+
3194
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
3195
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3196
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3197
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3198
+
3199
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3200
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3201
+
3202
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3203
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3204
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3205
+
3206
+ if (backend == GGML_BACKEND_GPU) {
3207
+ vram_weights +=
3208
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
3209
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
3210
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
3211
+ }
3212
+ }
3213
+ } break;
3214
+
3045
3215
  default:
3046
3216
  throw std::runtime_error("unknown architecture");
3047
3217
  }
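The new StableLM branch follows the same offload layout as the other architectures: everything below i_gpu_start = n_layer - n_gpu_layers stays on the CPU, the remaining layers go to the offload backend, and VRAM use is accumulated from ggml_nbytes of the offloaded tensors. A tiny standalone illustration of the split:

```cpp
#include <cstdio>

int main() {
    const int n_layer      = 32;  // StableLM-3B layer count from the hparams hunk above
    const int n_gpu_layers = 20;  // user-requested offload
    const int i_gpu_start  = n_layer - n_gpu_layers;

    int on_gpu = 0;
    for (int i = 0; i < n_layer; ++i) {
        if (i >= i_gpu_start) {
            ++on_gpu; // this layer's tensors would be created on the GPU backend
        }
    }
    printf("offloaded %d/%d layers (first GPU layer: %d)\n", on_gpu, n_layer, i_gpu_start);
    return 0;
}
```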
@@ -3056,7 +3226,7 @@ static void llm_load_tensors(
3056
3226
  ctx_size +
3057
3227
  mmapped_size - vram_weights; // weights in VRAM not in memory
3058
3228
 
3059
- LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
3229
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
3060
3230
 
3061
3231
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
3062
3232
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3075,7 +3245,7 @@ static void llm_load_tensors(
3075
3245
  #endif // GGML_USE_CUBLAS
3076
3246
 
3077
3247
  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
3078
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
3248
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
3079
3249
  #else
3080
3250
  (void) n_gpu_layers;
3081
3251
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3575,7 +3745,7 @@ struct llm_build_context {
3575
3745
  }
3576
3746
 
3577
3747
  struct ggml_cgraph * build_llama() {
3578
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
3748
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
3579
3749
 
3580
3750
  GGML_ASSERT(n_embd_head == hparams.n_rot);
3581
3751
 
@@ -3687,7 +3857,7 @@ struct llm_build_context {
3687
3857
  }
3688
3858
 
3689
3859
  struct ggml_cgraph * build_baichuan() {
3690
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
3860
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
3691
3861
 
3692
3862
  struct ggml_tensor * cur;
3693
3863
  struct ggml_tensor * inpL;
@@ -3807,7 +3977,7 @@ struct llm_build_context {
3807
3977
  }
3808
3978
 
3809
3979
  struct ggml_cgraph * build_falcon() {
3810
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
3980
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
3811
3981
 
3812
3982
  struct ggml_tensor * cur;
3813
3983
  struct ggml_tensor * inpL;
@@ -3929,7 +4099,7 @@ struct llm_build_context {
3929
4099
  }
3930
4100
 
3931
4101
  struct ggml_cgraph * build_starcoder() {
3932
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
4102
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
3933
4103
 
3934
4104
  struct ggml_tensor * cur;
3935
4105
  struct ggml_tensor * pos;
@@ -4028,7 +4198,7 @@ struct llm_build_context {
4028
4198
  }
4029
4199
 
4030
4200
  struct ggml_cgraph * build_persimmon() {
4031
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
4201
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4032
4202
 
4033
4203
  const int64_t n_rot = n_embd_head / 2;
4034
4204
 
@@ -4173,7 +4343,7 @@ struct llm_build_context {
4173
4343
  struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
4174
4344
  cb(Kcur, "Kcur", il);
4175
4345
 
4176
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
4346
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
4177
4347
  cb(Q, "Q", il);
4178
4348
 
4179
4349
  Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
@@ -4238,7 +4408,7 @@ struct llm_build_context {
4238
4408
  }
4239
4409
 
4240
4410
  struct ggml_cgraph * build_refact() {
4241
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
4411
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4242
4412
 
4243
4413
  struct ggml_tensor * cur;
4244
4414
  struct ggml_tensor * inpL;
@@ -4329,7 +4499,7 @@ struct llm_build_context {
4329
4499
  }
4330
4500
 
4331
4501
  struct ggml_cgraph * build_bloom() {
4332
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
4502
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4333
4503
 
4334
4504
  struct ggml_tensor * cur;
4335
4505
  struct ggml_tensor * inpL;
@@ -4423,7 +4593,7 @@ struct llm_build_context {
4423
4593
  }
4424
4594
 
4425
4595
  struct ggml_cgraph * build_mpt() {
4426
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
4596
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4427
4597
 
4428
4598
  struct ggml_tensor * cur;
4429
4599
  struct ggml_tensor * inpL;
@@ -4520,6 +4690,177 @@ struct llm_build_context {
4520
4690
 
4521
4691
  return gf;
4522
4692
  }
4693
+
4694
+ struct ggml_cgraph * build_stablelm() {
4695
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
4696
+
4697
+ struct ggml_tensor * cur;
4698
+ struct ggml_tensor * inpL;
4699
+
4700
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4701
+ cb(inpL, "inp_embd", -1);
4702
+
4703
+ // inp_pos - contains the positions
4704
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4705
+ cb(inp_pos, "inp_pos", -1);
4706
+
4707
+ // KQ_scale
4708
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4709
+ cb(KQ_scale, "KQ_scale", -1);
4710
+
4711
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4712
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4713
+ cb(KQ_mask, "KQ_mask", -1);
4714
+
4715
+ // shift the entire K-cache if needed
4716
+ if (do_rope_shift) {
4717
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
4718
+ }
4719
+
4720
+ for (int il = 0; il < n_layer; ++il) {
4721
+ struct ggml_tensor * inpSA = inpL;
4722
+
4723
+ // norm
4724
+ cur = llm_build_norm(ctx0, inpL, hparams,
4725
+ model.layers[il].attn_norm,
4726
+ model.layers[il].attn_norm_b,
4727
+ LLM_NORM, cb, il);
4728
+ cb(cur, "attn_norm", il);
4729
+
4730
+ // self-attention
4731
+ {
4732
+ // compute Q and K and RoPE them
4733
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
4734
+ cb(tmpq, "tmpq", il);
4735
+
4736
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
4737
+ cb(tmpk, "tmpk", il);
4738
+
4739
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
4740
+ cb(Vcur, "Vcur", il);
4741
+
4742
+ // RoPE the first n_rot of q/k, pass the other half, and concat.
4743
+ struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
4744
+ ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
4745
+ ggml_element_size(tmpq) * n_embd_head,
4746
+ ggml_element_size(tmpq) * n_embd_head * n_head,
4747
+ 0
4748
+ ));
4749
+ cb(qrot, "qrot", il);
4750
+
4751
+ struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
4752
+ ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
4753
+ ggml_element_size(tmpk) * n_embd_head,
4754
+ ggml_element_size(tmpk) * n_embd_head * n_head_kv,
4755
+ 0
4756
+ ));
4757
+ cb(krot, "krot", il);
4758
+
4759
+ // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
4760
+ struct ggml_tensor * qpass = ggml_view_3d(
4761
+ ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
4762
+ ggml_element_size(tmpq) * n_embd_head,
4763
+ ggml_element_size(tmpq) * n_embd_head * n_head,
4764
+ ggml_element_size(tmpq) * hparams.n_rot
4765
+ );
4766
+ cb(qpass, "qpass", il);
4767
+
4768
+ struct ggml_tensor * kpass = ggml_view_3d(
4769
+ ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
4770
+ ggml_element_size(tmpk) * (n_embd_head),
4771
+ ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
4772
+ ggml_element_size(tmpk) * hparams.n_rot
4773
+ );
4774
+ cb(kpass, "kpass", il);
4775
+
4776
+ struct ggml_tensor * qrotated = ggml_rope_custom(
4777
+ ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
4778
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
4779
+ );
4780
+ cb(qrotated, "qrotated", il);
4781
+
4782
+ struct ggml_tensor * krotated = ggml_rope_custom(
4783
+ ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
4784
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
4785
+ );
4786
+ cb(krotated, "krotated", il);
4787
+
4788
+ // ggml currently only supports concatenation on dim=2
4789
+ // so we need to permute qrot, qpass, concat, then permute back.
4790
+ qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
4791
+ cb(qrotated, "qrotated", il);
4792
+
4793
+ krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
4794
+ cb(krotated, "krotated", il);
4795
+
4796
+ qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
4797
+ cb(qpass, "qpass", il);
4798
+
4799
+ kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
4800
+ cb(kpass, "kpass", il);
4801
+
4802
+ struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
4803
+ cb(Qcur, "Qcur", il);
4804
+
4805
+ struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
4806
+ cb(Kcur, "Kcur", il);
4807
+
4808
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
4809
+ cb(Q, "Q", il);
4810
+
4811
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
4812
+ cb(Kcur, "Kcur", il);
4813
+
4814
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4815
+
4816
+ cur = llm_build_kqv(ctx0, hparams, kv_self,
4817
+ model.layers[il].wo, NULL,
4818
+ Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
4819
+ cb(cur, "kqv_out", il);
4820
+ }
4821
+
4822
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
4823
+ cb(ffn_inp, "ffn_inp", il);
4824
+
4825
+ // feed-forward network
4826
+ {
4827
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
4828
+ model.layers[il].ffn_norm,
4829
+ model.layers[il].ffn_norm_b,
4830
+ LLM_NORM, cb, il);
4831
+ cb(cur, "ffn_norm", il);
4832
+
4833
+ cur = llm_build_ffn(ctx0, cur,
4834
+ model.layers[il].ffn_up, NULL,
4835
+ model.layers[il].ffn_gate, NULL,
4836
+ model.layers[il].ffn_down, NULL,
4837
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
4838
+ cb(cur, "ffn_out", il);
4839
+ }
4840
+
4841
+ cur = ggml_add(ctx0, cur, ffn_inp);
4842
+ cb(cur, "l_out", il);
4843
+
4844
+ // input for next layer
4845
+ inpL = cur;
4846
+ }
4847
+
4848
+ cur = inpL;
4849
+
4850
+ cur = llm_build_norm(ctx0, cur, hparams,
4851
+ model.output_norm,
4852
+ model.output_norm_b,
4853
+ LLM_NORM, cb, -1);
4854
+ cb(cur, "result_norm", -1);
4855
+
4856
+ // lm_head
4857
+ cur = ggml_mul_mat(ctx0, model.output, cur);
4858
+ cb(cur, "result_output", -1);
4859
+
4860
+ ggml_build_forward_expand(gf, cur);
4861
+
4862
+ return gf;
4863
+ }
4523
4864
  };
4524
4865
 
4525
4866
  //
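build_stablelm applies RoPE to only the first hparams.n_rot dimensions of each attention head and passes the remaining dimensions through unchanged, concatenating the two halves afterwards (hence the view/permute/concat dance above). A standalone, simplified illustration of that partial rotation; the pairing convention here is the plain adjacent-pair form, not necessarily the exact layout ggml uses:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// rotate only the first n_rot dimensions of one head vector; the rest pass through
static void rope_partial(std::vector<float> & head, int n_rot, int pos, float freq_base) {
    for (int i = 0; i < n_rot/2; ++i) {
        const float theta = pos * std::pow(freq_base, -2.0f*i/n_rot);
        const float x0 = head[2*i + 0];
        const float x1 = head[2*i + 1];
        head[2*i + 0] = x0*std::cos(theta) - x1*std::sin(theta);
        head[2*i + 1] = x0*std::sin(theta) + x1*std::cos(theta);
    }
    // dimensions [n_rot, head.size()) are intentionally untouched
}

int main() {
    std::vector<float> head(64, 1.0f);   // n_embd_head = 64
    rope_partial(head, /*n_rot =*/ 32, /*pos =*/ 5, /*freq_base =*/ 10000.0f);
    printf("dim 0 = %.3f (rotated), dim 63 = %.3f (pass-through)\n", head[0], head[63]);
    return 0;
}
```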
@@ -4989,6 +5330,10 @@ static struct ggml_cgraph * llama_build_graph(
4989
5330
  {
4990
5331
  result = llm.build_mpt();
4991
5332
  } break;
5333
+ case LLM_ARCH_STABLELM:
5334
+ {
5335
+ result = llm.build_stablelm();
5336
+ } break;
4992
5337
  default:
4993
5338
  GGML_ASSERT(false);
4994
5339
  }
@@ -5159,11 +5504,13 @@ static int llama_decode_internal(
5159
5504
 
5160
5505
  // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
5161
5506
  const bool full_offload_supported =
5162
- model.arch == LLM_ARCH_LLAMA ||
5163
- model.arch == LLM_ARCH_BAICHUAN ||
5164
- model.arch == LLM_ARCH_FALCON ||
5165
- model.arch == LLM_ARCH_REFACT ||
5166
- model.arch == LLM_ARCH_MPT;
5507
+ model.arch == LLM_ARCH_LLAMA ||
5508
+ model.arch == LLM_ARCH_BAICHUAN ||
5509
+ model.arch == LLM_ARCH_FALCON ||
5510
+ model.arch == LLM_ARCH_REFACT ||
5511
+ model.arch == LLM_ARCH_MPT ||
5512
+ model.arch == LLM_ARCH_STARCODER ||
5513
+ model.arch == LLM_ARCH_STABLELM;
5167
5514
 
5168
5515
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
5169
5516
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -5955,7 +6302,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
5955
6302
  // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
5956
6303
  // and passing 'add space prefix' as bool argument
5957
6304
  //
5958
- auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
6305
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
6306
+ if (&fragment == &fragment_buffer.front()) {
6307
+ raw_text = " " + raw_text; // prefix with space if the first token is not special
6308
+ }
5959
6309
 
5960
6310
  #ifdef PRETOKENIZERDEBUG
5961
6311
  fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
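The tokenizer change above means the leading space is now added only to the very first text fragment rather than to every non-special fragment, so text that follows a special token is no longer silently prefixed. An illustrative before/after with two text fragments separated by one special token:

```cpp
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const std::vector<std::string> fragments = { "Hello", "world" }; // text around one special token

    std::string old_input, new_input;
    for (size_t i = 0; i < fragments.size(); ++i) {
        old_input += " " + fragments[i];                   // 0.9.1: every text fragment is prefixed
        new_input += (i == 0 ? " " : "") + fragments[i];   // 0.9.3: only the first fragment is
    }
    printf("old: '%s'\nnew: '%s'\n", old_input.c_str(), new_input.c_str());
    return 0;
}
```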
@@ -7607,7 +7957,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
7607
7957
  workers.clear();
7608
7958
  }
7609
7959
 
7610
- LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
7960
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
7611
7961
  int64_t tot_count = 0;
7612
7962
  for (size_t i = 0; i < hist_cur.size(); i++) {
7613
7963
  hist_all[i] += hist_cur[i];
@@ -7977,7 +8327,7 @@ struct llama_context_params llama_context_default_params() {
7977
8327
  /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
7978
8328
  /*.rope_freq_base =*/ 0.0f,
7979
8329
  /*.rope_freq_scale =*/ 0.0f,
7980
- /*.yarn_ext_factor =*/ NAN,
8330
+ /*.yarn_ext_factor =*/ -1.0f,
7981
8331
  /*.yarn_attn_factor =*/ 1.0f,
7982
8332
  /*.yarn_beta_fast =*/ 32.0f,
7983
8333
  /*.yarn_beta_slow =*/ 1.0f,
@@ -8120,7 +8470,7 @@ struct llama_context * llama_new_context_with_model(
8120
8470
  cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
8121
8471
  }
8122
8472
 
8123
- if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set'
8473
+ if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
8124
8474
  cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
8125
8475
  }
8126
8476
 
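The default yarn_ext_factor changes from NAN to -1.0f, and the "not set" test from std::isnan to a simple negative check; a NaN sentinel is easy to lose (builds compiled with -ffast-math, for instance, may assume NaN never occurs), while a negative value survives an ordinary comparison. A compact sketch of the new pattern:

```cpp
#include <cstdio>

struct params_t {
    float yarn_ext_factor = -1.0f; // < 0 means "not set" (was NAN in 0.9.1)
};

int main() {
    params_t p;
    const bool yarn_scaling = true; // stand-in for rope_scaling_type == LLAMA_ROPE_SCALING_YARN
    if (p.yarn_ext_factor < 0.0f) {
        p.yarn_ext_factor = yarn_scaling ? 1.0f : 0.0f;
    }
    printf("yarn_ext_factor = %.1f\n", p.yarn_ext_factor);
    return 0;
}
```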
@@ -8147,7 +8497,7 @@ struct llama_context * llama_new_context_with_model(
8147
8497
 
8148
8498
  {
8149
8499
  const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
8150
- LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
8500
+ LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
8151
8501
  }
8152
8502
 
8153
8503
  // resized during inference
@@ -8164,7 +8514,7 @@ struct llama_context * llama_new_context_with_model(
8164
8514
  {
8165
8515
  static const size_t tensor_alignment = 32;
8166
8516
  // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
8167
- ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
8517
+ ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
8168
8518
 
8169
8519
  // create measure allocator
8170
8520
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
@@ -8192,7 +8542,7 @@ struct llama_context * llama_new_context_with_model(
8192
8542
  // measure memory requirements for the graph
8193
8543
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
8194
8544
 
8195
- LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
8545
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
8196
8546
 
8197
8547
  // recreate allocator with exact memory requirements
8198
8548
  ggml_allocr_free(ctx->alloc);
@@ -8206,7 +8556,7 @@ struct llama_context * llama_new_context_with_model(
8206
8556
  #endif
8207
8557
  #ifdef GGML_USE_CUBLAS
8208
8558
  ggml_cuda_set_scratch_size(alloc_size);
8209
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
8559
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
8210
8560
 
8211
8561
  // calculate total VRAM usage
8212
8562
  auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8226,10 +8576,10 @@ struct llama_context * llama_new_context_with_model(
8226
8576
  size_t ctx_vram_size = alloc_size + kv_vram_size;
8227
8577
  size_t total_vram_size = model_vram_size + ctx_vram_size;
8228
8578
 
8229
- LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
8579
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
8230
8580
  total_vram_size / 1024.0 / 1024.0,
8231
8581
  model_vram_size / 1024.0 / 1024.0,
8232
- ctx_vram_size / 1024.0 / 1024.0);
8582
+ ctx_vram_size / 1024.0 / 1024.0);
8233
8583
  #endif
8234
8584
  }
8235
8585
 
@@ -8250,7 +8600,7 @@ struct llama_context * llama_new_context_with_model(
8250
8600
 
8251
8601
  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
8252
8602
 
8253
- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
8603
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
8254
8604
 
8255
8605
  #define LLAMA_METAL_CHECK_BUF(result) \
8256
8606
  if (!(result)) { \
@@ -8553,8 +8903,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
8553
8903
  if (kv_buf_size) {
8554
8904
  const size_t elt_size = ggml_element_size(kv_self.k);
8555
8905
 
8556
- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
8557
- ggml_cgraph gf{};
8906
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
8907
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
8558
8908
 
8559
8909
  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
8560
8910
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8572,9 +8922,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
8572
8922
  kv_head, n_embd, n_layer,
8573
8923
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
8574
8924
 
8575
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
8576
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
8577
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
8925
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
8926
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
8927
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
8578
8928
 
8579
8929
  ggml_free(cpy_ctx);
8580
8930
 
@@ -8681,8 +9031,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
8681
9031
 
8682
9032
  const size_t elt_size = ggml_element_size(kv_self.k);
8683
9033
 
8684
- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
8685
- ggml_cgraph gf{};
9034
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
9035
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
8686
9036
 
8687
9037
  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
8688
9038
  kin3d->data = (void *) inp;
@@ -8700,9 +9050,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
8700
9050
  kv_head, n_embd, n_layer,
8701
9051
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
8702
9052
 
8703
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
8704
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
8705
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
9053
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
9054
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
9055
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
8706
9056
 
8707
9057
  ggml_free(cpy_ctx);
8708
9058
  }
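Both the state-save and state-load paths now build the k/v copy graph on the heap, in a scratch context sized exactly for what it holds: the four k/v tensors, the two copy nodes, and the graph object, instead of the old hard-coded 4096-byte pool with a stack-allocated ggml_cgraph. A sketch of that sizing (assumes the vendored ggml.h):

```cpp
#include "ggml.h"

// scratch context with room for six tensor headers plus one graph, nothing else
struct ggml_context * make_copy_ctx(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 6*ggml_tensor_overhead() + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data lives in the KV cache, only headers go here
    };
    return ggml_init(params);
}
```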
@@ -8957,6 +9307,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
8957
9307
  return model->vocab.linefeed_id;
8958
9308
  }
8959
9309
 
9310
+ int llama_add_bos_token(const struct llama_model * model) {
9311
+ return model->vocab.special_add_bos;
9312
+ }
9313
+
9314
+ int llama_add_eos_token(const struct llama_model * model) {
9315
+ return model->vocab.special_add_eos;
9316
+ }
9317
+
8960
9318
  llama_token llama_token_prefix(const struct llama_model * model) {
8961
9319
  return model->vocab.special_prefix_id;
8962
9320
  }
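The new llama_add_bos_token and llama_add_eos_token accessors expose the flags read from the GGUF metadata earlier in the diff: 1 means the tokenizer should add the token, 0 means it should not, -1 means the model file did not say. A hedged caller-side sketch; the SPM fallback for the "unknown" case is an assumption about reasonable client behaviour, not something this diff prescribes:

```cpp
#include "llama.h"

static bool should_add_bos(const struct llama_model * model) {
    const int add_bos = llama_add_bos_token(model);
    if (add_bos >= 0) {
        return add_bos != 0; // the model file states it explicitly
    }
    // unspecified: fall back to a vocabulary-type heuristic
    return llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
}
```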