llama_cpp 0.9.1 → 0.9.2

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registries.
@@ -596,19 +596,37 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // llama helpers
 //
 
+inline void * llama_host_malloc(size_t n) {
 #ifdef GGML_USE_CUBLAS
-#   define llama_host_malloc(n)    ggml_cuda_host_malloc(n)
-#   define llama_host_free(data)   ggml_cuda_host_free(data)
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_malloc(n);
+    } else {
+        return malloc(n);
+    }
 #elif GGML_USE_METAL
-#   define llama_host_malloc(n)    ggml_metal_host_malloc(n)
-#   define llama_host_free(data)   ggml_metal_host_free(data)
+    return ggml_metal_host_malloc(n);
 #elif GGML_USE_CPU_HBM
-#   define llama_host_malloc(n)    hbw_malloc(n)
-#   define llama_host_free(data)   if (data != NULL) hbw_free(data)
+    return hbw_malloc(n);
 #else
-#   define llama_host_malloc(n)    malloc(n)
-#   define llama_host_free(data)   free(data)
+    return malloc(n);
 #endif
+}
+
+inline void llama_host_free(void * ptr) {
+#ifdef GGML_USE_CUBLAS
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_free(ptr);
+    } else {
+        return free(ptr);
+    }
+#elif GGML_USE_METAL
+    return ggml_metal_host_free(ptr);
+#elif GGML_USE_CPU_HBM
+    return hbw_free(ptr);
+#else
+    return free(ptr);
+#endif
+}
 
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
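
Note: the hunk above replaces the per-backend llama_host_malloc/llama_host_free macros with inline functions, so a binary built with GGML_USE_CUBLAS falls back to plain malloc/free when ggml_cublas_loaded() reports that no CUDA device was initialized. A minimal call-site sketch (the buffer size is a made-up example, not a value from this diff):

    // Sketch only: how code is expected to use the new inline helpers.
    // When the CUDA backend is live, this becomes a pinned host allocation.
    size_t n = 16u * 1024 * 1024;      // hypothetical 16 MiB buffer
    void * data = llama_host_malloc(n);
    if (data == NULL) {
        // handle allocation failure
    }
    // ... use the buffer ...
    llama_host_free(data);             // releases via whichever allocator was used
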
@@ -1195,9 +1213,11 @@ struct llama_kv_cache {
         }
 
 #ifdef GGML_USE_CUBLAS
-        ggml_cuda_free_data(k);
-        ggml_cuda_free_data(v);
-#endif // GGML_USE_CUBLAS
+        if (ggml_cublas_loaded()) {
+            ggml_cuda_free_data(k);
+            ggml_cuda_free_data(v);
+        }
+#endif
     }
 };
 
@@ -1297,11 +1317,15 @@ struct llama_model {
         }
 
 #ifdef GGML_USE_CUBLAS
-        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
-            ggml_cuda_free_data(tensors_by_name[i].second);
+        if (ggml_cublas_loaded()) {
+            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+                ggml_cuda_free_data(tensors_by_name[i].second);
+            }
+            ggml_cuda_free_scratch();
         }
-        ggml_cuda_free_scratch();
-#elif defined(GGML_USE_CLBLAST)
+#endif
+
+#if defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
         }
@@ -1413,23 +1437,26 @@ static bool llama_kv_cache_init(
     ggml_set_name(cache.v, "cache_v");
 
     (void) n_gpu_layers;
+
 #ifdef GGML_USE_CUBLAS
-    size_t vram_kv_cache = 0;
+    if (ggml_cublas_loaded()) {
+        size_t vram_kv_cache = 0;
 
-    if (n_gpu_layers > (int)n_layer + 1) {
-        ggml_cuda_assign_buffers_no_scratch(cache.v);
-        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.v);
-    }
-    if (n_gpu_layers > (int)n_layer + 2) {
-        ggml_cuda_assign_buffers_no_scratch(cache.k);
-        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.k);
-    }
-    if (vram_kv_cache > 0) {
-        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        if (n_gpu_layers > (int)n_layer + 1) {
+            ggml_cuda_assign_buffers_no_scratch(cache.v);
+            LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.v);
+        }
+        if (n_gpu_layers > (int)n_layer + 2) {
+            ggml_cuda_assign_buffers_no_scratch(cache.k);
+            LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.k);
+        }
+        if (vram_kv_cache > 0) {
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        }
     }
-#endif // GGML_USE_CUBLAS
+#endif
 
     return true;
 }
@@ -2516,18 +2543,22 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
+
+    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
+
 #ifdef GGML_USE_CUBLAS
-    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
-    ggml_cuda_set_main_device(main_gpu);
-#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+    if (ggml_cublas_loaded()) {
+        LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+        ggml_cuda_set_main_device(main_gpu);
+
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
+    }
 #elif defined(GGML_USE_CLBLAST)
-    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
-#else
-#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_CPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
+    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+    llama_backend_offload = GGML_BACKEND_GPU;
+    llama_backend_offload_split = GGML_BACKEND_GPU;
 #endif
 
     // prepare memory for the weights
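
Note: the hunk above swaps the compile-time LLAMA_BACKEND_OFFLOAD / LLAMA_BACKEND_OFFLOAD_SPLIT macros for runtime variables that stay at GGML_BACKEND_CPU unless ggml_cublas_loaded() confirms the CUDA backend actually initialized; the remaining llm_load_tensors hunks below are the mechanical rename at each use site. The selection pattern in isolation (sketch only, surrounding function omitted):

    // Sketch only: runtime backend selection introduced in this release.
    enum ggml_backend_type llama_backend_offload       = GGML_BACKEND_CPU;
    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
#ifdef GGML_USE_CUBLAS
    if (ggml_cublas_loaded()) {        // CUDA compiled in *and* a device came up
        llama_backend_offload       = GGML_BACKEND_GPU;
        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
    }
#endif
    // Tensor placement then reads these variables instead of macros, so a CUDA
    // build degrades to CPU-only execution when no usable GPU is present.
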
@@ -2554,12 +2585,12 @@ static void llm_load_tensors(
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                         } else {
                             backend_norm   = GGML_BACKEND_CPU;
                             backend_output = GGML_BACKEND_CPU;
@@ -2583,8 +2614,8 @@ static void llm_load_tensors(
                     model.layers.resize(n_layer);
 
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                         auto & layer = model.layers[i];
 
@@ -2620,12 +2651,12 @@ static void llm_load_tensors(
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                         } else {
                             backend_norm   = GGML_BACKEND_CPU;
                             backend_output = GGML_BACKEND_CPU;
@@ -2649,8 +2680,8 @@ static void llm_load_tensors(
                     model.layers.resize(n_layer);
 
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                         auto & layer = model.layers[i];
 
@@ -2690,12 +2721,12 @@ static void llm_load_tensors(
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                         } else {
                             backend_norm   = GGML_BACKEND_CPU;
                             backend_output = GGML_BACKEND_CPU;
@@ -2721,8 +2752,8 @@ static void llm_load_tensors(
                     model.layers.resize(n_layer);
 
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                         auto & layer = model.layers[i];
 
@@ -2767,12 +2798,12 @@ static void llm_load_tensors(
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                         } else {
                             backend_norm   = GGML_BACKEND_CPU;
                             backend_output = GGML_BACKEND_CPU;
@@ -2798,8 +2829,8 @@ static void llm_load_tensors(
                     model.layers.resize(n_layer);
 
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                         auto & layer = model.layers[i];
 
@@ -2844,12 +2875,12 @@ static void llm_load_tensors(
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                         } else {
                             backend_norm   = GGML_BACKEND_CPU;
                             backend_output = GGML_BACKEND_CPU;
@@ -2872,8 +2903,8 @@ static void llm_load_tensors(
                     const int i_gpu_start = n_layer - n_gpu_layers;
                     model.layers.resize(n_layer);
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
                         auto & layer = model.layers[i];
                         layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
                         layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2910,12 +2941,12 @@ static void llm_load_tensors(
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                         } else {
                             backend_norm   = GGML_BACKEND_CPU;
                             backend_output = GGML_BACKEND_CPU;
@@ -2941,8 +2972,8 @@ static void llm_load_tensors(
                     model.layers.resize(n_layer);
 
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                         auto & layer = model.layers[i];
 
@@ -2988,12 +3019,12 @@ static void llm_load_tensors(
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                         } else {
                             backend_norm   = GGML_BACKEND_CPU;
                             backend_output = GGML_BACKEND_CPU;
@@ -3017,8 +3048,8 @@ static void llm_load_tensors(
                     model.layers.resize(n_layer);
 
                     for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                         auto & layer = model.layers[i];
 
@@ -5159,11 +5190,12 @@ static int llama_decode_internal(
 
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
    const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA ||
-        model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON ||
-        model.arch == LLM_ARCH_REFACT ||
-        model.arch == LLM_ARCH_MPT;
+        model.arch == LLM_ARCH_LLAMA    ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON   ||
+        model.arch == LLM_ARCH_REFACT   ||
+        model.arch == LLM_ARCH_MPT      ||
+        model.arch == LLM_ARCH_STARCODER;
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -7977,7 +8009,7 @@ struct llama_context_params llama_context_default_params() {
         /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.rope_freq_base              =*/ 0.0f,
         /*.rope_freq_scale             =*/ 0.0f,
-        /*.yarn_ext_factor             =*/ NAN,
+        /*.yarn_ext_factor             =*/ -1.0f,
         /*.yarn_attn_factor            =*/ 1.0f,
         /*.yarn_beta_fast              =*/ 32.0f,
         /*.yarn_beta_slow              =*/ 1.0f,
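
Note: the hunk above changes the 'not set' marker for yarn_ext_factor from NAN to -1.0f, and the hunk that follows updates the matching check in llama_new_context_with_model from std::isnan() to a plain < 0.0f comparison. A sketch of what this means for callers, using only the API shown in this diff:

    // Sketch only: -1.0f (the new default) means "resolve automatically".
    struct llama_context_params cparams = llama_context_default_params();
    // cparams.yarn_ext_factor == -1.0f here, so context creation picks 1.0f when
    // YaRN rope scaling is selected and 0.0f otherwise.
    cparams.yarn_ext_factor = 0.25f;   // any value >= 0.0f overrides that choice
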
@@ -8120,7 +8152,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }
 
-    if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set'
+    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
     }
 
@@ -175,11 +175,11 @@ extern "C" {
     };
 
     struct llama_context_params {
-        uint32_t seed;            // RNG seed, -1 for random
-        uint32_t n_ctx;           // text context, 0 = from model
-        uint32_t n_batch;         // prompt processing maximum batch size
-        uint32_t n_threads;       // number of threads to use for generation
-        uint32_t n_threads_batch; // number of threads to use for batch processing
+        uint32_t seed;              // RNG seed, -1 for random
+        uint32_t n_ctx;             // text context, 0 = from model
+        uint32_t n_batch;           // prompt processing maximum batch size
+        uint32_t n_threads;         // number of threads to use for generation
+        uint32_t n_threads_batch;   // number of threads to use for batch processing
         int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.1'
+  VERSION = '0.9.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1472'
+  LLAMA_CPP_VERSION = 'b1500'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.1
+  version: 0.9.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-11-03 00:00:00.000000000 Z
+date: 2023-11-11 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: