llama_cpp 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +74 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +48 -17
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/llama.cpp +127 -222
- data/ext/llama_cpp/src/llama.h +16 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7a1f299e21bfe5b12d517a4254657cbc5bf9af6d0571285e2a5aff67b9175646
+  data.tar.gz: 62dd6e0d4f0b052a912d87b52cd0cff5bb873ab12378413a3ee0af5671331ef6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b12dc73914e5c7ecdd951fd57b70e01aae1926a2adc88030b5f5310f99c789e129cf552811363ec99525b37b9ca167a708cb756057b94f5cf4dd2a0100b06b6e
+  data.tar.gz: d1d79696b08f89894de02a02fac91f0783c432efa641b21ee59f6987946b045681a60113392db6c85fe97bd0e1fc9860235faa358fb805bb0de21eb85926edd5
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
+## [[0.3.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.0...v0.3.1)] - 2023-07-02
+
+- Bump bundled llama.cpp from master-9d23589 to master-b8c8dda.
+- Use unsigned values for random seed.
+- Add `eval_embd` method to `Context` class.
+
 ## [[0.3.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.2.2...v0.3.0)] - 2023-06-30
 
 - Add no_k_quants and qkk_64 config options:
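The `eval_embd` entry wraps llama.cpp's `llama_eval_embd`, which feeds raw input embeddings to the model instead of token ids; the binding's implementation appears in the llama_cpp.cpp hunks below. A minimal host-side sketch of the underlying C call, assuming an already-initialized `llama_context`; the helper name and the zero-filled placeholder values are not from the gem:

#include <vector>
#include "llama.h"

// Sketch only: evaluates n_tokens embedding rows (n_embd floats each),
// the same C call the new Context#eval_embd binding makes internally.
int eval_embeddings_sketch(llama_context * ctx, int n_past, int n_threads) {
    const int n_tokens = 4;                            // number of positions to evaluate
    const int n_embd   = llama_n_embd(ctx);            // embedding width of the loaded model
    std::vector<float> embd(n_tokens * n_embd, 0.0f);  // placeholder values; supply real embeddings
    return llama_eval_embd(ctx, embd.data(), n_tokens, n_past, n_threads);  // 0 on success
}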
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -404,6 +404,10 @@ private:
   // seed
   static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    if (NUM2INT(seed) < 0) {
+      rb_raise(rb_eArgError, "seed must be positive");
+      return Qnil;
+    }
     ptr->params.seed = NUM2INT(seed);
     return INT2NUM(ptr->params.seed);
   };
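The new guard exists because the bundled llama.cpp now stores the seed as an unsigned 32-bit value, so a negative Ruby integer would silently wrap around when stored. A standalone sketch of that wrap-around; the scenario is illustrative, not code from the gem:

#include <cstdint>
#include <cstdio>

// Illustrative only: what an unchecked negative "random seed" sentinel
// would become when stored into the now-unsigned seed field.
int main() {
    const int user_seed = -1;                     // old signed convention for "randomize"
    const uint32_t stored = (uint32_t) user_seed; // conversion wraps modulo 2^32
    std::printf("%u\n", stored);                  // prints 4294967295 (0xFFFFFFFF)
    return 0;
}

0xFFFFFFFF is also the `LLAMA_DEFAULT_SEED` sentinel exported further down, which is presumably why the binding rejects negative input outright rather than letting it alias the sentinel.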
@@ -685,6 +689,10 @@ private:
     LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(kw_values[1]);
     LLaMAModelWrapper* model_ptr = get_llama_model(self);
 
+    if (prms_ptr->params.seed == LLAMA_DEFAULT_SEED) {
+      prms_ptr->params.seed = time(NULL);
+    }
+
     try {
       model_ptr->model = llama_load_model_from_file(StringValueCStr(filename), prms_ptr->params);
     } catch (const std::runtime_error& e) {
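When the caller leaves the seed at the `LLAMA_DEFAULT_SEED` sentinel, the binding now substitutes a time-based seed before the model is loaded. The same defaulting logic in isolation; the sentinel value is assumed from the bundled llama.h and the function name is illustrative:

#include <cstdint>
#include <ctime>

// Illustrative helper mirroring the hunk above: keep an explicit seed,
// replace the "pick one for me" sentinel with the current time.
constexpr uint32_t kLlamaDefaultSeed = 0xFFFFFFFFu;  // assumed LLAMA_DEFAULT_SEED

uint32_t resolve_seed(uint32_t requested) {
    return requested == kLlamaDefaultSeed ? (uint32_t) time(nullptr) : requested;
}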
@@ -848,6 +856,7 @@ public:
     rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
     rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
     rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
+    rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
     rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
     rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
@@ -971,6 +980,61 @@ private:
     return Qnil;
   };
 
+  static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens"), rb_intern("n_threads") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+      rb_raise(rb_eArgError, "tokens must be an Array");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_past must be an integer");
+      return Qnil;
+    }
+    if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "n_tokens must be an integer");
+      return Qnil;
+    }
+    if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "n_threads must be an integer");
+      return Qnil;
+    }
+
+    const size_t tokens_len = RARRAY_LEN(kw_values[0]);
+    std::vector<float> embd(tokens_len);
+    for (size_t i = 0; i < tokens_len; i++) {
+      VALUE el = rb_ary_entry(kw_values[0], i);
+      if (!RB_FLOAT_TYPE_P(el)) {
+        rb_raise(rb_eArgError, "embd must be an array of floats");
+        return Qnil;
+      }
+      embd[i] = NUM2DBL(el);
+    }
+
+    const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
+    const int n_past = NUM2INT(kw_values[1]);
+    const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
+      rb_raise(rb_eRuntimeError, "Failed to evaluate");
+      return Qnil;
+    }
+
+    rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
+    rb_iv_set(self, "@has_evaluated", Qtrue);
+
+    return Qnil;
+  }
+
   static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -1198,7 +1262,11 @@ private:
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
-    const int seed = NUM2INT(seed_);
+    if (NUM2INT(seed_) < 0) {
+      rb_raise(rb_eArgError, "seed must be a non-negative integer");
+      return Qnil;
+    }
+    const uint32_t seed = NUM2INT(seed_);
     llama_set_rng_seed(ptr->ctx, seed);
     return Qnil;
   };
@@ -1901,6 +1969,11 @@ extern "C" void Init_llama_cpp(void) {
   ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
 
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
 }
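The added constant reuses the `ss_magic` stream that already formatted `LLAMA_SESSION_MAGIC`, which is why the hunk resets it in two steps: `str("")` empties the buffer and `clear(...)` resets the state flags. The idiom in isolation; the values are illustrative:

#include <iostream>
#include <sstream>

int main() {
    std::stringstream ss;
    ss << std::showbase << std::hex << 0x67676A74u;  // first value, e.g. a file magic
    std::cout << ss.str() << '\n';                   // 0x67676a74
    ss.str("");                                      // step 1: drop the buffered text
    ss.clear(std::stringstream::goodbit);            // step 2: reset error/state flags
    ss << std::showbase << std::hex << 0xFFFFFFFFu;  // reuse for the next constant
    std::cout << ss.str() << '\n';                   // 0xffffffff
    return 0;
}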
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -223,6 +223,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     dst[i] = x[i] + y[i];
 }
 
+static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __hadd(x[i], __float2half(y[i]));
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -1235,7 +1244,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
 }
 
 static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1283,9 +1292,9 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int
+    const int row_stride_x, const int channel_stride_x) {
 
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1328,14 +1337,14 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 }
 
 static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     float * dsti = (float *) cdsti;
 
     *dsti = *xi;
 }
 
 static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     half * dsti = (half *) cdsti;
 
     *dsti = __float2half(*xi);
@@ -1459,6 +1468,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
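`add_f16_f32_f16_cuda` sizes its grid with the usual integer ceiling division so that a partial final block is still launched; the kernel's `i >= k` bounds check then discards the surplus threads. The arithmetic in isolation; the block size of 256 is an assumption about `CUDA_ADD_BLOCK_SIZE`:

#include <cassert>

// (k + block - 1) / block rounds up, matching the launcher above.
constexpr int ceil_div(int k, int block) { return (k + block - 1) / block; }

int main() {
    constexpr int block = 256;                 // assumed CUDA_ADD_BLOCK_SIZE
    static_assert(ceil_div(512, block) == 2);  // exact multiple: no extra block
    static_assert(ceil_div(513, block) == 3);  // a remainder of 1 still gets a block
    assert(ceil_div(1, block) == 1);
    return 0;
}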
@@ -1684,7 +1698,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     const dim3 block_nums(1, nrows_x, nchannels_x);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x,
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
 }
 
 static void ggml_cpy_f32_f32_cuda(
@@ -1941,7 +1955,7 @@ inline void ggml_cuda_op_add(
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
 
-    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
@@ -1949,7 +1963,13 @@ inline void ggml_cuda_op_add(
     const int64_t i01_diff = i01_high - i01_low;
 
     // compute
-    add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
     CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
@@ -2547,8 +2567,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 }
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
+    // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
+    // Due to flatten_rows == true this does in practice not make a difference however.
+    // Better solution would be nice but right now that would require disproportionate changes.
+    GGML_ASSERT(
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+        src1->type == GGML_TYPE_F32 &&
+        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2801,7 +2827,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -2810,11 +2836,11 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src0->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
@@ -2822,11 +2848,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     memset(extra, 0, sizeof(*extra));
 
     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
@@ -2865,11 +2892,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
 }
 
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true);
+    ggml_cuda_assign_buffers_impl(tensor, true, false);
 }
 
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false);
+}
+
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, true);
 }
 
 void ggml_cuda_set_main_device(int main_device) {
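The new `force_inplace` flag feeds into the `inplace` decision shown two hunks up: a tensor reuses its source's device buffer if it already aliases `src0`'s data, is a view, or is now explicitly forced. A simplified model of that predicate; the struct and names are illustrative stand-ins, not the real ggml types:

#include <cstdio>

// Simplified stand-in for ggml_tensor; only the fields the predicate reads.
struct Tensor {
    const Tensor * src0;  // source tensor, if any
    const void *   data;  // buffer pointer
    bool           is_view;
};

// Mirrors the updated condition in ggml_cuda_assign_buffers_impl.
bool decide_inplace(const Tensor & t, bool force_inplace) {
    return (t.src0 != nullptr && t.src0->data == t.data) || t.is_view || force_inplace;
}

int main() {
    const Tensor src { nullptr, (const void *) 0x1000, false };
    const Tensor dst { &src, (const void *) 0x2000, false };
    std::printf("%d\n", decide_inplace(dst, true));   // 1: forced in-place
    std::printf("%d\n", decide_inplace(dst, false));  // 0: separate buffer, not a view
    return 0;
}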
data/ext/llama_cpp/src/ggml-cuda.h
CHANGED
@@ -29,6 +29,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
 void ggml_cuda_free_data(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);