llama_cpp 0.9.1 → 0.9.2

@@ -596,19 +596,37 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // llama helpers
 //
 
+inline void * llama_host_malloc(size_t n) {
 #ifdef GGML_USE_CUBLAS
-#   define llama_host_malloc(n)  ggml_cuda_host_malloc(n)
-#   define llama_host_free(data) ggml_cuda_host_free(data)
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_malloc(n);
+    } else {
+        return malloc(n);
+    }
 #elif GGML_USE_METAL
-#   define llama_host_malloc(n)  ggml_metal_host_malloc(n)
-#   define llama_host_free(data) ggml_metal_host_free(data)
+    return ggml_metal_host_malloc(n);
 #elif GGML_USE_CPU_HBM
-#   define llama_host_malloc(n)  hbw_malloc(n)
-#   define llama_host_free(data) if (data != NULL) hbw_free(data)
+    return hbw_malloc(n);
 #else
-#   define llama_host_malloc(n)  malloc(n)
-#   define llama_host_free(data) free(data)
+    return malloc(n);
 #endif
+}
+
+inline void llama_host_free(void * ptr) {
+#ifdef GGML_USE_CUBLAS
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_free(ptr);
+    } else {
+        return free(ptr);
+    }
+#elif GGML_USE_METAL
+    return ggml_metal_host_free(ptr);
+#elif GGML_USE_CPU_HBM
+    return hbw_free(ptr);
+#else
+    return free(ptr);
+#endif
+}
 
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
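
Note: the hunk above replaces the llama_host_malloc/llama_host_free macros with inline functions so that a binary compiled with GGML_USE_CUBLAS can still run on a machine where the CUDA backend does not actually load; when ggml_cublas_loaded() is false the helpers fall back to plain malloc/free. A minimal sketch of how a caller might pair the two helpers is below; the host_buffer wrapper is hypothetical and is not part of llama.cpp.

    // Illustrative RAII wrapper around the new helpers (hypothetical, not in llama.cpp).
    struct host_buffer {
        void * data = nullptr;
        size_t size = 0;

        explicit host_buffer(size_t n) : data(llama_host_malloc(n)), size(n) {}
        ~host_buffer() { llama_host_free(data); } // freeing a null pointer is harmless on every branch

        host_buffer(const host_buffer &) = delete;            // non-copyable: owns the allocation
        host_buffer & operator=(const host_buffer &) = delete;
    };
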
@@ -1195,9 +1213,11 @@ struct llama_kv_cache {
         }
 
 #ifdef GGML_USE_CUBLAS
-        ggml_cuda_free_data(k);
-        ggml_cuda_free_data(v);
-#endif // GGML_USE_CUBLAS
+        if (ggml_cublas_loaded()) {
+            ggml_cuda_free_data(k);
+            ggml_cuda_free_data(v);
+        }
+#endif
     }
 };
 
@@ -1297,11 +1317,15 @@ struct llama_model {
         }
 
 #ifdef GGML_USE_CUBLAS
-        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
-            ggml_cuda_free_data(tensors_by_name[i].second);
+        if (ggml_cublas_loaded()) {
+            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+                ggml_cuda_free_data(tensors_by_name[i].second);
+            }
+            ggml_cuda_free_scratch();
         }
-        ggml_cuda_free_scratch();
-#elif defined(GGML_USE_CLBLAST)
+#endif
+
+#if defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
         }
@@ -1413,23 +1437,26 @@ static bool llama_kv_cache_init(
     ggml_set_name(cache.v, "cache_v");
 
     (void) n_gpu_layers;
+
 #ifdef GGML_USE_CUBLAS
-    size_t vram_kv_cache = 0;
+    if (ggml_cublas_loaded()) {
+        size_t vram_kv_cache = 0;
 
-    if (n_gpu_layers > (int)n_layer + 1) {
-        ggml_cuda_assign_buffers_no_scratch(cache.v);
-        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.v);
-    }
-    if (n_gpu_layers > (int)n_layer + 2) {
-        ggml_cuda_assign_buffers_no_scratch(cache.k);
-        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.k);
-    }
-    if (vram_kv_cache > 0) {
-        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        if (n_gpu_layers > (int)n_layer + 1) {
+            ggml_cuda_assign_buffers_no_scratch(cache.v);
+            LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.v);
+        }
+        if (n_gpu_layers > (int)n_layer + 2) {
+            ggml_cuda_assign_buffers_no_scratch(cache.k);
+            LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.k);
+        }
+        if (vram_kv_cache > 0) {
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        }
     }
-#endif // GGML_USE_CUBLAS
+#endif
 
     return true;
 }
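
Note: the KV-cache offload logic above is unchanged in substance and is only wrapped in ggml_cublas_loaded(); the v cache is offloaded once n_gpu_layers exceeds n_layer + 1 and the k cache once it exceeds n_layer + 2. A rough worked example of the thresholds, using assumed numbers that are not taken from the source:

    // Sketch of the two offload thresholds, with assumed example values.
    static void kv_offload_example() {
        const int n_layer      = 32;  // assumed model depth
        const int n_gpu_layers = 35;  // assumed user setting

        const bool offload_v = n_gpu_layers > n_layer + 1;  // 35 > 33 -> true
        const bool offload_k = n_gpu_layers > n_layer + 2;  // 35 > 34 -> true
        (void) offload_v; (void) offload_k;
        // with n_gpu_layers = 34 only the v cache would be offloaded;
        // with 33 or fewer, both caches stay in host memory.
    }
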
@@ -2516,18 +2543,22 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
+
+    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
+
 #ifdef GGML_USE_CUBLAS
-    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
-    ggml_cuda_set_main_device(main_gpu);
-#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+    if (ggml_cublas_loaded()) {
+        LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+        ggml_cuda_set_main_device(main_gpu);
+
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
+    }
 #elif defined(GGML_USE_CLBLAST)
-    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
-#else
-#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_CPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
+    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+    llama_backend_offload = GGML_BACKEND_GPU;
+    llama_backend_offload_split = GGML_BACKEND_GPU;
 #endif
 
     // prepare memory for the weights
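
Note: the LLAMA_BACKEND_OFFLOAD / LLAMA_BACKEND_OFFLOAD_SPLIT macros become the llama_backend_offload / llama_backend_offload_split variables, so the offload target is decided at run time instead of at preprocessing time: a CUDA build whose driver fails to load now degrades to GGML_BACKEND_CPU rather than assigning GPU backends unconditionally. A minimal sketch of the pattern in isolation, restated as a free function that is not part of llama.cpp:

    // Hypothetical stand-alone version of the selection logic shown above.
    enum backend_type { BACKEND_CPU, BACKEND_GPU };

    static backend_type pick_offload_backend(bool cuda_loaded) {
        (void) cuda_loaded;      // unused in CPU-only builds
    #ifdef GGML_USE_CUBLAS
        if (cuda_loaded) {       // mirrors the ggml_cublas_loaded() check
            return BACKEND_GPU;
        }
    #endif
        return BACKEND_CPU;      // matches the GGML_BACKEND_CPU initializer above
    }
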
@@ -2554,12 +2585,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2583,8 +2614,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2620,12 +2651,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2649,8 +2680,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2690,12 +2721,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2721,8 +2752,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2767,12 +2798,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2798,8 +2829,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2844,12 +2875,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2872,8 +2903,8 @@ static void llm_load_tensors(
         const int i_gpu_start = n_layer - n_gpu_layers;
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
             auto & layer = model.layers[i];
             layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
             layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2910,12 +2941,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2941,8 +2972,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2988,12 +3019,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -3017,8 +3048,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -5159,11 +5190,12 @@ static int llama_decode_internal(
 
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA ||
-        model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON ||
-        model.arch == LLM_ARCH_REFACT ||
-        model.arch == LLM_ARCH_MPT;
+        model.arch == LLM_ARCH_LLAMA    ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON   ||
+        model.arch == LLM_ARCH_REFACT   ||
+        model.arch == LLM_ARCH_MPT      ||
+        model.arch == LLM_ARCH_STARCODER;
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -7977,7 +8009,7 @@ struct llama_context_params llama_context_default_params() {
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.rope_freq_base    =*/ 0.0f,
         /*.rope_freq_scale   =*/ 0.0f,
-        /*.yarn_ext_factor   =*/ NAN,
+        /*.yarn_ext_factor   =*/ -1.0f,
         /*.yarn_attn_factor  =*/ 1.0f,
         /*.yarn_beta_fast    =*/ 32.0f,
         /*.yarn_beta_slow    =*/ 1.0f,
@@ -8120,7 +8152,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }
 
-    if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set'
+    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
     }
 
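Note: the yarn_ext_factor default changes from NAN to -1.0f and the 'not set' test from std::isnan to a plain negative check, so callers can mark the field as unset without having to produce a NaN. A minimal sketch of the resolve step, restated as a free function that is hypothetical and not part of llama.cpp:

    // Hypothetical restatement of the sentinel handling above.
    static float resolve_yarn_ext_factor(float requested, bool rope_scaling_is_yarn) {
        if (requested < 0.0f) {                        // negative means 'not set' (was NaN before)
            return rope_scaling_is_yarn ? 1.0f : 0.0f;
        }
        return requested;
    }
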
@@ -175,11 +175,11 @@ extern "C" {
     };
 
     struct llama_context_params {
-        uint32_t seed;            // RNG seed, -1 for random
-        uint32_t n_ctx;           // text context, 0 = from model
-        uint32_t n_batch;         // prompt processing maximum batch size
-        uint32_t n_threads;       // number of threads to use for generation
-        uint32_t n_threads_batch; // number of threads to use for batch processing
+        uint32_t seed;              // RNG seed, -1 for random
+        uint32_t n_ctx;             // text context, 0 = from model
+        uint32_t n_batch;           // prompt processing maximum batch size
+        uint32_t n_threads;         // number of threads to use for generation
+        uint32_t n_threads_batch;   // number of threads to use for batch processing
         int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.1'
+  VERSION = '0.9.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1472'
+  LLAMA_CPP_VERSION = 'b1500'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.1
+  version: 0.9.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-11-03 00:00:00.000000000 Z
+date: 2023-11-11 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: