llama_cpp 0.3.0 → 0.3.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +74 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +48 -17
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/llama.cpp +127 -222
- data/ext/llama_cpp/src/llama.h +16 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7a1f299e21bfe5b12d517a4254657cbc5bf9af6d0571285e2a5aff67b9175646
+  data.tar.gz: 62dd6e0d4f0b052a912d87b52cd0cff5bb873ab12378413a3ee0af5671331ef6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b12dc73914e5c7ecdd951fd57b70e01aae1926a2adc88030b5f5310f99c789e129cf552811363ec99525b37b9ca167a708cb756057b94f5cf4dd2a0100b06b6e
+  data.tar.gz: d1d79696b08f89894de02a02fac91f0783c432efa641b21ee59f6987946b045681a60113392db6c85fe97bd0e1fc9860235faa358fb805bb0de21eb85926edd5
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
+## [[0.3.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.0...v0.3.1)] - 2023-07-02
+
+- Bump bundled llama.cpp from master-9d23589 to master-b8c8dda.
+- Use unsigned values for random seed.
+- Add `eval_embd` method to `Context` class.
+
 ## [[0.3.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.2.2...v0.3.0)] - 2023-06-30

 - Add no_k_quants and qkk_64 config options:
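A minimal usage sketch of the 0.3.1 changes listed above, assuming the 0.3.x `LLaMACpp` API (the model path, the `Model`/`Context` constructor keywords, and `Context#n_embd` are assumptions and not part of this diff):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 42     # seeds are now unsigned; negative values raise ArgumentError
# params.seed = -1   # => ArgumentError (see the seed check added to llama_cpp.cpp below)
# leaving the seed at LLAMA_DEFAULT_SEED makes model loading fall back to time(NULL)

# Hypothetical setup; constructor keywords assumed from the 0.3.x bindings.
model   = LLaMACpp::Model.new(model_path: 'path/to/model.bin', params: params)
context = LLaMACpp::Context.new(model: model)

# New in 0.3.1: feed raw embeddings instead of token ids.
embd = Array.new(context.n_embd, 0.0)                  # one embedding vector of Floats
context.eval_embd(embd: embd, n_past: 0, n_tokens: 1)  # n_threads defaults to 1
```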
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -404,6 +404,10 @@ private:
   // seed
   static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    if (NUM2INT(seed) < 0) {
+      rb_raise(rb_eArgError, "seed must be positive");
+      return Qnil;
+    }
     ptr->params.seed = NUM2INT(seed);
     return INT2NUM(ptr->params.seed);
   };
@@ -685,6 +689,10 @@ private:
     LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(kw_values[1]);
     LLaMAModelWrapper* model_ptr = get_llama_model(self);

+    if (prms_ptr->params.seed == LLAMA_DEFAULT_SEED) {
+      prms_ptr->params.seed = time(NULL);
+    }
+
     try {
       model_ptr->model = llama_load_model_from_file(StringValueCStr(filename), prms_ptr->params);
     } catch (const std::runtime_error& e) {
@@ -848,6 +856,7 @@ public:
     rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
     rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
     rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
+    rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
     rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
     rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
@@ -971,6 +980,61 @@ private:
     return Qnil;
   };

+  static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens"), rb_intern("n_threads") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+      rb_raise(rb_eArgError, "tokens must be an Array");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_past must be an integer");
+      return Qnil;
+    }
+    if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "n_tokens must be an integer");
+      return Qnil;
+    }
+    if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "n_threads must be an integer");
+      return Qnil;
+    }
+
+    const size_t tokens_len = RARRAY_LEN(kw_values[0]);
+    std::vector<float> embd(tokens_len);
+    for (size_t i = 0; i < tokens_len; i++) {
+      VALUE el = rb_ary_entry(kw_values[0], i);
+      if (!RB_FLOAT_TYPE_P(el)) {
+        rb_raise(rb_eArgError, "embd must be an array of floats");
+        return Qnil;
+      }
+      embd[i] = NUM2DBL(el);
+    }
+
+    const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
+    const int n_past = NUM2INT(kw_values[1]);
+    const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
+      rb_raise(rb_eRuntimeError, "Failed to evaluate");
+      return Qnil;
+    }
+
+    rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
+    rb_iv_set(self, "@has_evaluated", Qtrue);
+
+    return Qnil;
+  }
+
   static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
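From Ruby, the keyword contract of the binding above looks roughly like this (a sketch; `context` is assumed to be an initialized `LLaMACpp::Context`, `Context#n_embd` is assumed from the gem's API, and the error messages are the ones raised in `_llama_context_eval_embd`):

```ruby
# embd: and n_past: are required; n_tokens: and n_threads: are optional
# (n_tokens defaults to embd.size, n_threads to 1).
context.eval_embd(embd: [0.0] * context.n_embd, n_past: 0, n_tokens: 1, n_threads: 4)

# Argument validation mirrors the C++ checks:
context.eval_embd(embd: 'oops', n_past: 0)    # => ArgumentError, "tokens must be an Array"
context.eval_embd(embd: [1, 2, 3], n_past: 0) # => ArgumentError, "embd must be an array of floats"
context.eval_embd(embd: [0.0], n_past: nil)   # => ArgumentError, "n_past must be an integer"
```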
@@ -1198,7 +1262,11 @@ private:
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
-
+    if (NUM2INT(seed_) < 0) {
+      rb_raise(rb_eArgError, "seed must be a non-negative integer");
+      return Qnil;
+    }
+    const uint32_t seed = NUM2INT(seed_);
     llama_set_rng_seed(ptr->ctx, seed);
     return Qnil;
   };
@@ -1901,6 +1969,11 @@ extern "C" void Init_llama_cpp(void) {
   ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));

+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
 }
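In Ruby, these two hunks surface as a stricter RNG seed setter and a new module constant; a small sketch (the `set_rng_seed` method name is assumed from the gem's existing API, and the constant is exported as a hex string, mirroring `LLAMA_SESSION_MAGIC` above):

```ruby
LLaMACpp::LLAMA_DEFAULT_SEED  # => hex string taken from llama.h (e.g. "0xffffffff")

# context is assumed to be an initialized LLaMACpp::Context
context.set_rng_seed(12_345)  # seeds are treated as unsigned 32-bit values
context.set_rng_seed(-1)      # => ArgumentError, "seed must be a non-negative integer"
```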
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -223,6 +223,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] + y[i];
 }

+static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __hadd(x[i], __float2half(y[i]));
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -1235,7 +1244,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
 }

 static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;

     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1283,9 +1292,9 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl

 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int
+    const int row_stride_x, const int channel_stride_x) {

-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;

     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1328,14 +1337,14 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 }

 static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     float * dsti = (float *) cdsti;

     *dsti = *xi;
 }

 static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     half * dsti = (half *) cdsti;

     *dsti = __float2half(*xi);
@@ -1459,6 +1468,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }

+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -1684,7 +1698,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     const dim3 block_nums(1, nrows_x, nchannels_x);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x,
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
 }

 static void ggml_cpy_f32_f32_cuda(
@@ -1941,7 +1955,7 @@ inline void ggml_cuda_op_add(
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){

-    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);

@@ -1949,7 +1963,13 @@ inline void ggml_cuda_op_add(
     const int64_t i01_diff = i01_high - i01_low;

     // compute
-
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
     CUDA_CHECK(cudaGetLastError());

     (void) src1;
@@ -2547,8 +2567,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 }

 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-
+    // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
+    // Due to flatten_rows == true this does in practice not make a difference however.
+    // Better solution would be nice but right now that would require disproportionate changes.
+    GGML_ASSERT(
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+        src1->type == GGML_TYPE_F32 &&
+        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
 }

 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2801,7 +2827,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }

-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -2810,11 +2836,11 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src0->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
     }

     tensor->backend = GGML_BACKEND_GPU;
@@ -2822,11 +2848,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     memset(extra, 0, sizeof(*extra));

     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
     const size_t size = ggml_nbytes(tensor);

     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
@@ -2865,11 +2892,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
 }

 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true);
+    ggml_cuda_assign_buffers_impl(tensor, true, false);
 }

 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false);
+}
+
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, true);
 }

 void ggml_cuda_set_main_device(int main_device) {
data/ext/llama_cpp/src/ggml-cuda.h
CHANGED
@@ -29,6 +29,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
 void ggml_cuda_free_data(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);