llama_cpp 0.3.0 → 0.3.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 9e0152eb9e091932225356614b57fad416c2aa96a83316f8585c9ef2872e1504
-  data.tar.gz: 8ea2f00f11be7dd6524bfe69e3181fc63df7c841ed1e2d91b1b2bcafd99d0b66
+  metadata.gz: 7a1f299e21bfe5b12d517a4254657cbc5bf9af6d0571285e2a5aff67b9175646
+  data.tar.gz: 62dd6e0d4f0b052a912d87b52cd0cff5bb873ab12378413a3ee0af5671331ef6
 SHA512:
-  metadata.gz: a85a4bdd2d1fd575eb406b9bebdf7f388db33dc42f7a2980ba9a7a6b346b539854d9df5515c9b6968727e76f035a23f59d4bc65bc5525df962dfbdf56d8b3b01
-  data.tar.gz: 33641d622102257dbc1358bde0871a03c595928f5d8cedee512e1df414e4aa93433eadfcd082d4db42046320c1ed7f806dfb3aafd7934a1becb33fe275f9435c
+  metadata.gz: b12dc73914e5c7ecdd951fd57b70e01aae1926a2adc88030b5f5310f99c789e129cf552811363ec99525b37b9ca167a708cb756057b94f5cf4dd2a0100b06b6e
+  data.tar.gz: d1d79696b08f89894de02a02fac91f0783c432efa641b21ee59f6987946b045681a60113392db6c85fe97bd0e1fc9860235faa358fb805bb0de21eb85926edd5
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+## [[0.3.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.0...v0.3.1)] - 2023-07-02
+
+- Bump bundled llama.cpp from master-9d23589 to master-b8c8dda.
+- Use unsigned values for random seed.
+- Add `eval_embd` method to `Context` class.
+
 ## [[0.3.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.2.2...v0.3.0)] - 2023-06-30

 - Add no_k_quants and qkk_64 config options:
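
The `eval_embd` entry above adds a Ruby-level way to feed raw embeddings instead of token ids. A minimal usage sketch follows; the keyword names (`embd:`, `n_past:`, `n_tokens:`, `n_threads:`) and their defaults are taken from the extension hunks below, while the `context` setup is assumed to follow the existing llama_cpp.rb API.

```ruby
# Hypothetical usage sketch for the new Context#eval_embd; `context` is assumed
# to be an already initialized LLaMACpp::Context.
n_embd = 4096                            # example embedding width of the model
embeddings = Array.new(n_embd) { 0.0 }   # flat Array of Floats, n_tokens * n_embd values

# embd: and n_past: are required keywords; n_tokens: and n_threads: are optional
# (n_tokens falls back to embd.size, n_threads to 1).
context.eval_embd(embd: embeddings, n_past: 0, n_tokens: 1, n_threads: 4)
```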
@@ -404,6 +404,10 @@ private:
   // seed
   static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    if (NUM2INT(seed) < 0) {
+      rb_raise(rb_eArgError, "seed must be positive");
+      return Qnil;
+    }
     ptr->params.seed = NUM2INT(seed);
     return INT2NUM(ptr->params.seed);
   };
@@ -685,6 +689,10 @@ private:
     LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(kw_values[1]);
     LLaMAModelWrapper* model_ptr = get_llama_model(self);

+    if (prms_ptr->params.seed == LLAMA_DEFAULT_SEED) {
+      prms_ptr->params.seed = time(NULL);
+    }
+
     try {
       model_ptr->model = llama_load_model_from_file(StringValueCStr(filename), prms_ptr->params);
     } catch (const std::runtime_error& e) {
@@ -848,6 +856,7 @@ public:
     rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
     rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
     rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
+    rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
     rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
     rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
@@ -971,6 +980,61 @@ private:
     return Qnil;
   };

+  static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens"), rb_intern("n_threads") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+      rb_raise(rb_eArgError, "tokens must be an Array");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_past must be an integer");
+      return Qnil;
+    }
+    if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "n_tokens must be an integer");
+      return Qnil;
+    }
+    if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "n_threads must be an integer");
+      return Qnil;
+    }
+
+    const size_t tokens_len = RARRAY_LEN(kw_values[0]);
+    std::vector<float> embd(tokens_len);
+    for (size_t i = 0; i < tokens_len; i++) {
+      VALUE el = rb_ary_entry(kw_values[0], i);
+      if (!RB_FLOAT_TYPE_P(el)) {
+        rb_raise(rb_eArgError, "embd must be an array of floats");
+        return Qnil;
+      }
+      embd[i] = NUM2DBL(el);
+    }
+
+    const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
+    const int n_past = NUM2INT(kw_values[1]);
+    const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
+      rb_raise(rb_eRuntimeError, "Failed to evaluate");
+      return Qnil;
+    }
+
+    rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
+    rb_iv_set(self, "@has_evaluated", Qtrue);
+
+    return Qnil;
+  }
+
   static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -1198,7 +1262,11 @@ private:
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
-    const int seed = NUM2INT(seed_);
+    if (NUM2INT(seed_) < 0) {
+      rb_raise(rb_eArgError, "seed must be a non-negative integer");
+      return Qnil;
+    }
+    const uint32_t seed = NUM2INT(seed_);
     llama_set_rng_seed(ptr->ctx, seed);
     return Qnil;
   };
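
Because seeds are now handled as unsigned values, negative seeds are rejected at the extension boundary rather than silently wrapping. Below is a hedged sketch of the resulting Ruby-level behaviour, assuming the context method keeps the name `set_rng_seed` in mirror of llama.cpp's `llama_set_rng_seed`:

```ruby
# Sketch of the new seed validation; `context` is assumed to be an initialized
# LLaMACpp::Context and the Ruby method name is assumed to mirror the C API.
context.set_rng_seed(123_456_789)   # non-negative seeds are accepted as before

begin
  context.set_rng_seed(-1)          # negative seeds now raise instead of wrapping
rescue ArgumentError => e
  warn e.message                    # "seed must be a non-negative integer"
end
```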
@@ -1901,6 +1969,11 @@ extern "C" void Init_llama_cpp(void) {
   ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));

+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
 }
@@ -223,6 +223,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] + y[i];
 }

+static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __hadd(x[i], __float2half(y[i]));
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -1235,7 +1244,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
 }

 static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;

     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1283,9 +1292,9 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl

 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
+    const int row_stride_x, const int channel_stride_x) {

-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;

     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1328,14 +1337,14 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 }

 static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     float * dsti = (float *) cdsti;

     *dsti = *xi;
 }

 static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     half * dsti = (half *) cdsti;

     *dsti = __float2half(*xi);
@@ -1459,6 +1468,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }

+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -1684,7 +1698,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     const dim3 block_nums(1, nrows_x, nchannels_x);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
 }

 static void ggml_cpy_f32_f32_cuda(
@@ -1941,7 +1955,7 @@ inline void ggml_cuda_op_add(
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){

-    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);

@@ -1949,7 +1963,13 @@ inline void ggml_cuda_op_add(
     const int64_t i01_diff = i01_high - i01_low;

     // compute
-    add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
     CUDA_CHECK(cudaGetLastError());

     (void) src1;
@@ -2547,8 +2567,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 }

 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
+    // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
+    // Due to flatten_rows == true this does in practice not make a difference however.
+    // Better solution would be nice but right now that would require disproportionate changes.
+    GGML_ASSERT(
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+        src1->type == GGML_TYPE_F32 &&
+        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
 }

 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2801,7 +2827,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }

-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -2810,11 +2836,11 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src0->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
     }

     tensor->backend = GGML_BACKEND_GPU;
@@ -2822,11 +2848,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     memset(extra, 0, sizeof(*extra));

     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
     const size_t size = ggml_nbytes(tensor);

     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
2865
2892
  }
2866
2893
 
2867
2894
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
2868
- ggml_cuda_assign_buffers_impl(tensor, true);
2895
+ ggml_cuda_assign_buffers_impl(tensor, true, false);
2869
2896
  }
2870
2897
 
2871
2898
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
2872
- ggml_cuda_assign_buffers_impl(tensor, false);
2899
+ ggml_cuda_assign_buffers_impl(tensor, false, false);
2900
+ }
2901
+
2902
+ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
2903
+ ggml_cuda_assign_buffers_impl(tensor, false, true);
2873
2904
  }
2874
2905
 
2875
2906
  void ggml_cuda_set_main_device(int main_device) {
@@ -29,6 +29,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
 void ggml_cuda_free_data(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);