llama_cpp 0.9.1 → 0.9.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +12 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +83 -45
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-metal.m +4 -3
- data/ext/llama_cpp/src/ggml.c +78 -252
- data/ext/llama_cpp/src/ggml.h +5 -0
- data/ext/llama_cpp/src/llama.cpp +113 -81
- data/ext/llama_cpp/src/llama.h +5 -5
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -596,19 +596,37 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // llama helpers
 //
 
+inline void * llama_host_malloc(size_t n) {
 #ifdef GGML_USE_CUBLAS
-# define llama_host_malloc(n) ggml_cuda_host_malloc(n)
-# define llama_host_free(data) ggml_cuda_host_free(data)
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_malloc(n);
+    } else {
+        return malloc(n);
+    }
 #elif GGML_USE_METAL
-# define llama_host_malloc(n) ggml_metal_host_malloc(n)
-# define llama_host_free(data) ggml_metal_host_free(data)
+    return ggml_metal_host_malloc(n);
 #elif GGML_USE_CPU_HBM
-# define llama_host_malloc(n) hbw_malloc(n)
-# define llama_host_free(data) if (data != NULL) hbw_free(data)
+    return hbw_malloc(n);
 #else
-# define llama_host_malloc(n) malloc(n)
-# define llama_host_free(data) free(data)
+    return malloc(n);
 #endif
+}
+
+inline void llama_host_free(void * ptr) {
+#ifdef GGML_USE_CUBLAS
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_free(ptr);
+    } else {
+        return free(ptr);
+    }
+#elif GGML_USE_METAL
+    return ggml_metal_host_free(ptr);
+#elif GGML_USE_CPU_HBM
+    return hbw_free(ptr);
+#else
+    return free(ptr);
+#endif
+}
 
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
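Note on the hunk above: the old llama_host_malloc/llama_host_free macros picked an allocator at compile time, so a cuBLAS build always took the CUDA path even when no usable device was present at run time. The new inline functions consult ggml_cublas_loaded() and fall back to plain malloc/free. A minimal standalone sketch of the same fallback pattern (cublas_loaded and the cuda_* helpers are stand-ins, not the real ggml-cuda API):

// Sketch only: stand-ins for ggml_cublas_loaded()/ggml_cuda_host_malloc().
#include <cstdio>
#include <cstdlib>

static bool cublas_loaded = false; // would be set when CUDA initializes

static void * cuda_host_malloc(std::size_t n) { return std::malloc(n); } // placeholder for a pinned allocation
static void   cuda_host_free(void * p)        { std::free(p); }

// Prefer pinned (page-locked) memory when a CUDA device really initialized,
// and degrade to the ordinary heap otherwise: the run-time analogue of the
// old compile-time macro choice.
static void * host_malloc(std::size_t n) {
    if (cublas_loaded) {
        return cuda_host_malloc(n);
    }
    return std::malloc(n);
}

static void host_free(void * p) {
    if (cublas_loaded) {
        cuda_host_free(p);
    } else {
        std::free(p);
    }
}

int main() {
    void * buf = host_malloc(1024);
    std::printf("allocated via %s path\n", cublas_loaded ? "CUDA" : "CPU");
    host_free(buf);
    return 0;
}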
@@ -1195,9 +1213,11 @@ struct llama_kv_cache {
         }
 
 #ifdef GGML_USE_CUBLAS
-        ggml_cuda_free_data(k);
-        ggml_cuda_free_data(v);
-#endif // GGML_USE_CUBLAS
+        if (ggml_cublas_loaded()) {
+            ggml_cuda_free_data(k);
+            ggml_cuda_free_data(v);
+        }
+#endif
     }
 };
 
@@ -1297,11 +1317,15 @@ struct llama_model {
         }
 
 #ifdef GGML_USE_CUBLAS
-        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
-            ggml_cuda_free_data(tensors_by_name[i].second);
+        if (ggml_cublas_loaded()) {
+            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+                ggml_cuda_free_data(tensors_by_name[i].second);
+            }
+            ggml_cuda_free_scratch();
         }
-        ggml_cuda_free_scratch();
-#elif defined(GGML_USE_CLBLAST)
+#endif
+
+#if defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
         }
@@ -1413,23 +1437,26 @@ static bool llama_kv_cache_init(
     ggml_set_name(cache.v, "cache_v");
 
     (void) n_gpu_layers;
+
 #ifdef GGML_USE_CUBLAS
-    size_t vram_kv_cache = 0;
+    if (ggml_cublas_loaded()) {
+        size_t vram_kv_cache = 0;
 
-    if (n_gpu_layers > (int)n_layer + 1) {
-        ggml_cuda_assign_buffers_no_scratch(cache.v);
-        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.v);
-    }
-    if (n_gpu_layers > (int)n_layer + 2) {
-        ggml_cuda_assign_buffers_no_scratch(cache.k);
-        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.k);
-    }
-    if (vram_kv_cache > 0) {
-        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        if (n_gpu_layers > (int)n_layer + 1) {
+            ggml_cuda_assign_buffers_no_scratch(cache.v);
+            LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.v);
+        }
+        if (n_gpu_layers > (int)n_layer + 2) {
+            ggml_cuda_assign_buffers_no_scratch(cache.k);
+            LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.k);
+        }
+        if (vram_kv_cache > 0) {
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        }
     }
-#endif
+#endif
 
     return true;
 }
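For scale, the offload conditions above fire late on purpose: the v cache moves to VRAM only when n_gpu_layers > n_layer + 1 and the k cache only when n_gpu_layers > n_layer + 2, i.e. after all repeating layers and the extra output layers are already offloaded. A rough sketch of the resulting accounting (the 32-layer/4096-dim f16 shapes are illustrative assumptions, not values from this diff):

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed 7B-class shapes, for illustration only.
    const int64_t n_layer = 32;
    const int64_t n_embd  = 4096;
    const int64_t n_ctx   = 4096;
    const int64_t f16     = 2; // bytes per element

    // cache.k and cache.v each hold n_ctx * n_embd * n_layer f16 elements.
    const int64_t kv_bytes = n_ctx * n_embd * n_layer * f16;

    for (int n_gpu_layers : {33, 34, 35}) {
        int64_t vram_kv_cache = 0;
        if (n_gpu_layers > (int) n_layer + 1) vram_kv_cache += kv_bytes; // v cache offloaded
        if (n_gpu_layers > (int) n_layer + 2) vram_kv_cache += kv_bytes; // k cache offloaded
        std::printf("n_gpu_layers=%d -> VRAM kv self = %.2f MB\n",
                    n_gpu_layers, vram_kv_cache / 1024.0 / 1024.0);
    }
    return 0;
}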
@@ -2516,18 +2543,22 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
+
+    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
+
 #ifdef GGML_USE_CUBLAS
-    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
-    ggml_cuda_set_main_device(main_gpu);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+    if (ggml_cublas_loaded()) {
+        LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+        ggml_cuda_set_main_device(main_gpu);
+
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
+    }
 #elif defined(GGML_USE_CLBLAST)
-    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
-#else
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
+    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+    llama_backend_offload = GGML_BACKEND_GPU;
+    llama_backend_offload_split = GGML_BACKEND_GPU;
 #endif
 
     // prepare memory for the weights
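The LLAMA_BACKEND_OFFLOAD macros removed above were fixed at preprocessing time, so every cuBLAS binary unconditionally targeted the GPU backend. The new llama_backend_offload variables start at GGML_BACKEND_CPU and are upgraded only when ggml_cublas_loaded() reports a device, letting one binary serve both GPU and CPU-only hosts. A condensed sketch of that compile-time-to-run-time conversion (shortened enum names and a hypothetical gpu_available() in place of the ggml check):

#include <cstdio>

enum backend_type { BACKEND_CPU, BACKEND_GPU, BACKEND_GPU_SPLIT };

// Hypothetical stand-in for ggml_cublas_loaded().
static bool gpu_available() { return false; }

int main() {
    // Defaults are now ordinary variables instead of #define constants...
    backend_type offload       = BACKEND_CPU;
    backend_type offload_split = BACKEND_CPU;

    // ...so the choice can depend on what the process finds at run time.
    if (gpu_available()) {
        offload       = BACKEND_GPU;
        offload_split = BACKEND_GPU_SPLIT;
    }

    std::printf("offload=%d offload_split=%d\n", (int) offload, (int) offload_split);
    return 0;
}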
@@ -2554,12 +2585,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2583,8 +2614,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2620,12 +2651,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2649,8 +2680,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
        for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2690,12 +2721,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2721,8 +2752,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
        for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2767,12 +2798,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2798,8 +2829,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
        for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2844,12 +2875,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2872,8 +2903,8 @@ static void llm_load_tensors(
         const int i_gpu_start = n_layer - n_gpu_layers;
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
             auto & layer = model.layers[i];
             layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
             layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2910,12 +2941,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2941,8 +2972,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
        for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2988,12 +3019,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -3017,8 +3048,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
        for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -5159,11 +5190,12 @@ static int llama_decode_internal(
 
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA ||
-        model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON ||
-        model.arch == LLM_ARCH_REFACT ||
-        model.arch == LLM_ARCH_MPT;
+        model.arch == LLM_ARCH_LLAMA    ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON   ||
+        model.arch == LLM_ARCH_REFACT   ||
+        model.arch == LLM_ARCH_MPT      ||
+        model.arch == LLM_ARCH_STARCODER;
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
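The hunk above adds LLM_ARCH_STARCODER to the architectures eligible for the single-thread fast path: once every layer plus the norm/output/KV extras sits on the GPU (n_gpu_layers >= n_layer + 3), the CPU merely dispatches kernels and extra threads only add overhead. Illustrative arithmetic with an assumed layer count:

#include <cstdio>

int main() {
    // Assumed values for illustration only.
    const int n_layer      = 40; // hypothetical StarCoder-style depth
    const int n_gpu_layers = 43; // all layers plus the three extra "layers"

    const bool fully_offloaded = n_gpu_layers >= n_layer + 3;

    int n_threads = 8;
    if (fully_offloaded) {
        n_threads = 1; // the CPU only feeds the GPU; more threads would hurt
    }
    std::printf("fully_offloaded=%d n_threads=%d\n", (int) fully_offloaded, n_threads);
    return 0;
}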
@@ -7977,7 +8009,7 @@ struct llama_context_params llama_context_default_params() {
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
-        /*.yarn_ext_factor =*/ NAN,
+        /*.yarn_ext_factor =*/ -1.0f,
         /*.yarn_attn_factor =*/ 1.0f,
         /*.yarn_beta_fast =*/ 32.0f,
         /*.yarn_beta_slow =*/ 1.0f,
@@ -8120,7 +8152,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }
 
-    if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set'
+    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
     }
 
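Taken together, the two hunks above swap the yarn_ext_factor sentinel from NAN (the pre-change default, reconstructed above) to -1.0f: the factor is only meaningful at zero or above, so any negative value can mean 'not set', and a plain comparison avoids NaN literals and std::isnan, which are awkward on some compilers. A small sketch of the default resolution:

#include <cstdio>

int main() {
    const bool rope_scaling_yarn = true; // assumed: context configured for YaRN
    float yarn_ext_factor = -1.0f;       // library default: negative = 'not set'

    if (yarn_ext_factor < 0.0f) {
        // YaRN enabled -> full extrapolation mix (1.0f); otherwise disabled (0.0f).
        yarn_ext_factor = rope_scaling_yarn ? 1.0f : 0.0f;
    }
    std::printf("yarn_ext_factor = %.1f\n", yarn_ext_factor);
    return 0;
}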
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -175,11 +175,11 @@ extern "C" {
     };
 
     struct llama_context_params {
-        uint32_t seed;              // RNG seed, -1 for random
-        uint32_t n_ctx;             // text context
-        uint32_t n_batch;           // prompt processing batch size
-        uint32_t n_threads;         // number of threads to use for generation
-        uint32_t n_threads_batch;   // number of threads to use for batch processing
+        uint32_t seed;            // RNG seed, -1 for random
+        uint32_t n_ctx;           // text context, 0 = from model
+        uint32_t n_batch;         // prompt processing maximum batch size
+        uint32_t n_threads;       // number of threads to use for generation
+        uint32_t n_threads_batch; // number of threads to use for batch processing
         int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
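The comments added above document the fields a caller fills in before creating a context; both functions used below are part of this llama.h, while the numeric values are arbitrary examples:

// Sketch only: assumes the bundled llama.h is on the include path and a
// llama_model was already loaded.
#include "llama.h"

llama_context * make_context(llama_model * model) {
    llama_context_params params = llama_context_default_params();
    params.seed            = 1234; // fixed RNG seed for reproducible sampling
    params.n_ctx           = 2048; // text context size
    params.n_batch         = 512;  // maximum prompt-processing batch size
    params.n_threads       = 8;    // generation threads
    params.n_threads_batch = 8;    // batch-processing threads
    return llama_new_context_with_model(model, params);
}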
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.1'
+  VERSION = '0.9.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1472'
+  LLAMA_CPP_VERSION = 'b1500'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.1
+  version: 0.9.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-11-
+date: 2023-11-11 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: