llama_cpp 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +12 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +83 -45
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-metal.m +4 -3
- data/ext/llama_cpp/src/ggml.c +78 -252
- data/ext/llama_cpp/src/ggml.h +5 -0
- data/ext/llama_cpp/src/llama.cpp +113 -81
- data/ext/llama_cpp/src/llama.h +5 -5
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -596,19 +596,37 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // llama helpers
 //
 
+inline void * llama_host_malloc(size_t n) {
 #ifdef GGML_USE_CUBLAS
-
-
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_malloc(n);
+    } else {
+        return malloc(n);
+    }
 #elif GGML_USE_METAL
-
-# define llama_host_free(data) ggml_metal_host_free(data)
+    return ggml_metal_host_malloc(n);
 #elif GGML_USE_CPU_HBM
-
-# define llama_host_free(data) if (data != NULL) hbw_free(data)
+    return hbw_malloc(n);
 #else
-
-# define llama_host_free(data) free(data)
+    return malloc(n);
 #endif
+}
+
+inline void llama_host_free(void * ptr) {
+#ifdef GGML_USE_CUBLAS
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_free(ptr);
+    } else {
+        return free(ptr);
+    }
+#elif GGML_USE_METAL
+    return ggml_metal_host_free(ptr);
+#elif GGML_USE_CPU_HBM
+    return hbw_free(ptr);
+#else
+    return free(ptr);
+#endif
+}
 
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
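The hunk above replaces the llama_host_malloc/llama_host_free preprocessor defines with inline functions, so a binary built with GGML_USE_CUBLAS can still fall back to plain malloc/free at run time when ggml_cublas_loaded() reports that no usable CUDA backend was initialized. A minimal, self-contained sketch of that pattern follows; the fake_-prefixed names are hypothetical stand-ins for the ggml symbols, not part of the package.

// Minimal sketch of the pattern above: compile-time #define dispatch replaced
// by an inline function that consults a runtime flag. The fake_-prefixed
// symbols are hypothetical stand-ins for ggml_cublas_loaded() and the CUDA
// pinned-memory allocators.
#include <cstdio>
#include <cstdlib>

static bool   fake_cublas_loaded = false;                                     // stand-in for ggml_cublas_loaded()
static void * fake_cuda_host_malloc(std::size_t n) { return std::malloc(n); } // placeholder for ggml_cuda_host_malloc
static void   fake_cuda_host_free(void * p)        { std::free(p); }          // placeholder for ggml_cuda_host_free

inline void * host_malloc(std::size_t n) {
    // Decided at run time: a CUDA-enabled build can still fall back to plain
    // malloc when no usable device or driver was found at startup.
    if (fake_cublas_loaded) {
        return fake_cuda_host_malloc(n);
    }
    return std::malloc(n);
}

inline void host_free(void * ptr) {
    if (fake_cublas_loaded) {
        fake_cuda_host_free(ptr);
    } else {
        std::free(ptr);
    }
}

int main() {
    void * buf = host_malloc(1024);
    std::printf("allocated through the %s path\n", fake_cublas_loaded ? "CUDA" : "CPU");
    host_free(buf);
    return 0;
}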
@@ -1195,9 +1213,11 @@ struct llama_kv_cache {
     }
 
 #ifdef GGML_USE_CUBLAS
-
-
-
+        if (ggml_cublas_loaded()) {
+            ggml_cuda_free_data(k);
+            ggml_cuda_free_data(v);
+        }
+#endif
     }
 };
 
@@ -1297,11 +1317,15 @@ struct llama_model {
     }
 
 #ifdef GGML_USE_CUBLAS
-
-
+        if (ggml_cublas_loaded()) {
+            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+                ggml_cuda_free_data(tensors_by_name[i].second);
+            }
+            ggml_cuda_free_scratch();
         }
-
-
+#endif
+
+#if defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
         }
@@ -1413,23 +1437,26 @@ static bool llama_kv_cache_init(
     ggml_set_name(cache.v, "cache_v");
 
     (void) n_gpu_layers;
+
 #ifdef GGML_USE_CUBLAS
-
+    if (ggml_cublas_loaded()) {
+        size_t vram_kv_cache = 0;
 
-
-
-
-
-
-
-
-
-
-
-
+        if (n_gpu_layers > (int)n_layer + 1) {
+            ggml_cuda_assign_buffers_no_scratch(cache.v);
+            LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.v);
+        }
+        if (n_gpu_layers > (int)n_layer + 2) {
+            ggml_cuda_assign_buffers_no_scratch(cache.k);
+            LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.k);
+        }
+        if (vram_kv_cache > 0) {
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        }
     }
-#endif
+#endif
 
     return true;
 }
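For readers of the new KV-cache offload logic above: the V cache is placed in VRAM only when more than n_layer + 1 layers are requested on the GPU, and the K cache only when more than n_layer + 2 are. A small worked example; the 32-layer model and the n_gpu_layers values are made up for illustration.

// Worked example of the KV-cache offload thresholds above.
#include <cstdio>
#include <initializer_list>

int main() {
    const int n_layer = 32; // hypothetical model depth

    for (int n_gpu_layers : {32, 33, 34, 35}) {
        const bool offload_v = n_gpu_layers > n_layer + 1; // same check as for cache.v
        const bool offload_k = n_gpu_layers > n_layer + 2; // same check as for cache.k
        std::printf("n_gpu_layers=%d  v cache on GPU: %-3s  k cache on GPU: %s\n",
                    n_gpu_layers, offload_v ? "yes" : "no", offload_k ? "yes" : "no");
    }
    return 0;
}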
@@ -2516,18 +2543,22 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
+
+    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
+
 #ifdef GGML_USE_CUBLAS
-
-
-
-
+    if (ggml_cublas_loaded()) {
+        LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+        ggml_cuda_set_main_device(main_gpu);
+
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
+    }
 #elif defined(GGML_USE_CLBLAST)
-
-
-
-#else
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
+    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+    llama_backend_offload = GGML_BACKEND_GPU;
+    llama_backend_offload_split = GGML_BACKEND_GPU;
 #endif
 
     // prepare memory for the weights
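The compile-time LLAMA_BACKEND_OFFLOAD / LLAMA_BACKEND_OFFLOAD_SPLIT macros are replaced by ordinary variables that default to the CPU backend and are upgraded only if the CUDA or OpenCL backend is actually usable. A compressed sketch of that control flow, with hypothetical enum and probe names standing in for the ggml types and for ggml_cublas_loaded():

// Compressed sketch of the runtime backend selection above. The enum and the
// cuda_is_loaded() probe are hypothetical; the real code also handles OpenCL.
#include <cstdio>

enum class backend_type { cpu, gpu, gpu_split };

static bool cuda_is_loaded() { return false; } // stand-in for ggml_cublas_loaded()

int main() {
    // Defaults: everything stays on the CPU unless a GPU backend is usable.
    backend_type offload       = backend_type::cpu;
    backend_type offload_split = backend_type::cpu;

    if (cuda_is_loaded()) {
        offload       = backend_type::gpu;
        offload_split = backend_type::gpu_split; // matrices row-split across devices
    }

    std::printf("offload=%d offload_split=%d\n", (int) offload, (int) offload_split);
    return 0;
}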
@@ -2554,12 +2585,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2583,8 +2614,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2620,12 +2651,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2649,8 +2680,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2690,12 +2721,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2721,8 +2752,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2767,12 +2798,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2798,8 +2829,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2844,12 +2875,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2872,8 +2903,8 @@ static void llm_load_tensors(
         const int i_gpu_start = n_layer - n_gpu_layers;
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
             auto & layer = model.layers[i];
             layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
             layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2910,12 +2941,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2941,8 +2972,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2988,12 +3019,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -3017,8 +3048,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
        for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
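The per-layer placement rule in the hunks above is unchanged apart from its target: layers with index below i_gpu_start = n_layer - n_gpu_layers stay on the CPU, and the rest go to whatever llama_backend_offload resolved to at run time. A worked example with hypothetical sizes:

// Worked example of the per-layer placement rule: with hypothetical
// n_layer = 8 and n_gpu_layers = 3, layers 0..4 stay on the CPU and
// layers 5..7 are offloaded.
#include <cstdio>

int main() {
    const int n_layer      = 8; // hypothetical
    const int n_gpu_layers = 3; // hypothetical
    const int i_gpu_start  = n_layer - n_gpu_layers;

    for (int i = 0; i < n_layer; ++i) {
        // Mirrors: int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload
        const bool on_gpu = i >= i_gpu_start;
        std::printf("layer %d -> %s\n", i, on_gpu ? "llama_backend_offload" : "GGML_BACKEND_CPU");
    }
    return 0;
}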
@@ -5159,11 +5190,12 @@ static int llama_decode_internal(
 
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA
-        model.arch == LLM_ARCH_BAICHUAN
-        model.arch == LLM_ARCH_FALCON
-        model.arch == LLM_ARCH_REFACT
-        model.arch == LLM_ARCH_MPT
+        model.arch == LLM_ARCH_LLAMA ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON ||
+        model.arch == LLM_ARCH_REFACT ||
+        model.arch == LLM_ARCH_MPT ||
+        model.arch == LLM_ARCH_STARCODER;
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
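Besides completing the boolean chain, the hunk adds LLM_ARCH_STARCODER to the architectures eligible for the single-thread fast path, which is taken only when the model is also fully offloaded (n_gpu_layers >= n_layer + 3). A sketch of the heuristic with made-up values; the architecture set is rewritten with plain strings purely for illustration:

// Sketch of the single-thread heuristic above. The architecture names, layer
// count, and n_gpu_layers are hypothetical; the thresholds mirror the diff.
#include <cstdio>
#include <set>
#include <string>

int main() {
    const std::set<std::string> full_offload_archs = {
        "llama", "baichuan", "falcon", "refact", "mpt", "starcoder"};

    const std::string arch = "starcoder"; // hypothetical model architecture
    const int n_layer      = 40;          // hypothetical layer count
    const int n_gpu_layers = 43;          // hypothetical; >= n_layer + 3

    const bool full_offload_supported = full_offload_archs.count(arch) > 0;
    const bool fully_offloaded        = n_gpu_layers >= n_layer + 3;

    // When both hold (and CUDA is compiled in), llama_decode_internal uses a
    // single thread, since extra CPU threads would only add overhead.
    std::printf("single-thread decode: %s\n",
                (full_offload_supported && fully_offloaded) ? "yes" : "no");
    return 0;
}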
@@ -7977,7 +8009,7 @@ struct llama_context_params llama_context_default_params() {
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
-        /*.yarn_ext_factor =*/
+        /*.yarn_ext_factor =*/ -1.0f,
         /*.yarn_attn_factor =*/ 1.0f,
         /*.yarn_beta_fast =*/ 32.0f,
         /*.yarn_beta_slow =*/ 1.0f,
@@ -8120,7 +8152,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }
 
-    if (
+    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
     }
 
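yarn_ext_factor now defaults to the sentinel -1.0f, and llama_new_context_with_model resolves any negative value to 1.0f when YaRN rope scaling is selected and to 0.0f otherwise, while an explicit non-negative value is kept. A small sketch of that resolution; the enum is a hypothetical stand-in for llama_rope_scaling_type:

// Sketch of the yarn_ext_factor defaulting above.
#include <cstdio>

enum class rope_scaling { none, linear, yarn };

static float resolve_yarn_ext_factor(float requested, rope_scaling type) {
    if (requested < 0.0f) { // negative sentinel means "not set"
        return type == rope_scaling::yarn ? 1.0f : 0.0f;
    }
    return requested; // an explicit non-negative value is kept as-is
}

int main() {
    std::printf("%.1f\n", resolve_yarn_ext_factor(-1.0f, rope_scaling::yarn));   // 1.0
    std::printf("%.1f\n", resolve_yarn_ext_factor(-1.0f, rope_scaling::linear)); // 0.0
    std::printf("%.1f\n", resolve_yarn_ext_factor( 0.5f, rope_scaling::yarn));   // 0.5
    return 0;
}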
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -175,11 +175,11 @@ extern "C" {
     };
 
     struct llama_context_params {
-        uint32_t seed;
-        uint32_t n_ctx;
-        uint32_t n_batch;
-        uint32_t n_threads;
-        uint32_t n_threads_batch;
+        uint32_t seed;            // RNG seed, -1 for random
+        uint32_t n_ctx;           // text context, 0 = from model
+        uint32_t n_batch;         // prompt processing maximum batch size
+        uint32_t n_threads;       // number of threads to use for generation
+        uint32_t n_threads_batch; // number of threads to use for batch processing
         int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
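The header change only attaches documentation comments to existing fields; the struct layout is unchanged. Callers can keep starting from llama_context_default_params() and overriding fields selectively, roughly as below (the values are illustrative, not recommendations):

// Illustrative use of the documented fields; the values are made up and the
// model-loading calls are only indicated in the trailing comment.
#include "llama.h"

int main() {
    llama_context_params cparams = llama_context_default_params();
    cparams.seed            = 1234; // RNG seed, -1 for random
    cparams.n_ctx           = 2048; // text context, 0 = from model
    cparams.n_batch         = 512;  // prompt processing maximum batch size
    cparams.n_threads       = 8;    // threads used for generation
    cparams.n_threads_batch = 8;    // threads used for batch processing

    // ... load a model with llama_load_model_from_file() and pass cparams to
    // llama_new_context_with_model() ...
    (void) cparams;
    return 0;
}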
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.1'
+  VERSION = '0.9.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1500'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.1
+  version: 0.9.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-11-
+date: 2023-11-11 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: