llama_cpp 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 9e0152eb9e091932225356614b57fad416c2aa96a83316f8585c9ef2872e1504
-  data.tar.gz: 8ea2f00f11be7dd6524bfe69e3181fc63df7c841ed1e2d91b1b2bcafd99d0b66
+  metadata.gz: 7a1f299e21bfe5b12d517a4254657cbc5bf9af6d0571285e2a5aff67b9175646
+  data.tar.gz: 62dd6e0d4f0b052a912d87b52cd0cff5bb873ab12378413a3ee0af5671331ef6
 SHA512:
-  metadata.gz: a85a4bdd2d1fd575eb406b9bebdf7f388db33dc42f7a2980ba9a7a6b346b539854d9df5515c9b6968727e76f035a23f59d4bc65bc5525df962dfbdf56d8b3b01
-  data.tar.gz: 33641d622102257dbc1358bde0871a03c595928f5d8cedee512e1df414e4aa93433eadfcd082d4db42046320c1ed7f806dfb3aafd7934a1becb33fe275f9435c
+  metadata.gz: b12dc73914e5c7ecdd951fd57b70e01aae1926a2adc88030b5f5310f99c789e129cf552811363ec99525b37b9ca167a708cb756057b94f5cf4dd2a0100b06b6e
+  data.tar.gz: d1d79696b08f89894de02a02fac91f0783c432efa641b21ee59f6987946b045681a60113392db6c85fe97bd0e1fc9860235faa358fb805bb0de21eb85926edd5
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+## [[0.3.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.0...v0.3.1)] - 2023-07-02
+
+- Bump bundled llama.cpp from master-9d23589 to master-b8c8dda.
+- Use unsigned values for random seed.
+- Add `eval_embd` method to `Context` class.
+
 ## [[0.3.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.2.2...v0.3.0)] - 2023-06-30
 
 - Add no_k_quants and qkk_64 config options:
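In Ruby terms, the 0.3.1 additions look roughly like the sketch below. This is not part of the diff: the constructor and method names used outside the hunks shown here (`ContextParams#seed=`, `Model.new`, `Context.new`, `Context#n_embd`) are assumed from the 0.3.x API and may differ slightly.

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.seed = 42  # seeds are now treated as unsigned; negative values raise ArgumentError

    model = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)
    context = LLaMACpp::Context.new(model: model)

    # New in 0.3.1: evaluate raw embeddings instead of token ids.
    # embd must be a flat Array of Floats holding n_tokens * n_embd values;
    # n_past is required, n_tokens and n_threads are optional keyword arguments.
    embd = Array.new(context.n_embd, 0.0)
    context.eval_embd(embd: embd, n_past: 0, n_tokens: 1, n_threads: 4)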
@@ -404,6 +404,10 @@ private:
   // seed
   static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    if (NUM2INT(seed) < 0) {
+      rb_raise(rb_eArgError, "seed must be positive");
+      return Qnil;
+    }
     ptr->params.seed = NUM2INT(seed);
     return INT2NUM(ptr->params.seed);
   };
@@ -685,6 +689,10 @@ private:
     LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(kw_values[1]);
     LLaMAModelWrapper* model_ptr = get_llama_model(self);
 
+    if (prms_ptr->params.seed == LLAMA_DEFAULT_SEED) {
+      prms_ptr->params.seed = time(NULL);
+    }
+
     try {
       model_ptr->model = llama_load_model_from_file(StringValueCStr(filename), prms_ptr->params);
     } catch (const std::runtime_error& e) {
@@ -848,6 +856,7 @@ public:
     rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
     rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
     rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
+    rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
     rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
     rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
@@ -971,6 +980,61 @@ private:
     return Qnil;
   };
 
+  static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens"), rb_intern("n_threads") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+      rb_raise(rb_eArgError, "tokens must be an Array");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_past must be an integer");
+      return Qnil;
+    }
+    if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "n_tokens must be an integer");
+      return Qnil;
+    }
+    if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "n_threads must be an integer");
+      return Qnil;
+    }
+
+    const size_t tokens_len = RARRAY_LEN(kw_values[0]);
+    std::vector<float> embd(tokens_len);
+    for (size_t i = 0; i < tokens_len; i++) {
+      VALUE el = rb_ary_entry(kw_values[0], i);
+      if (!RB_FLOAT_TYPE_P(el)) {
+        rb_raise(rb_eArgError, "embd must be an array of floats");
+        return Qnil;
+      }
+      embd[i] = NUM2DBL(el);
+    }
+
+    const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
+    const int n_past = NUM2INT(kw_values[1]);
+    const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
+      rb_raise(rb_eRuntimeError, "Failed to evaluate");
+      return Qnil;
+    }
+
+    rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
+    rb_iv_set(self, "@has_evaluated", Qtrue);
+
+    return Qnil;
+  }
+
   static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
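The keyword handling above surfaces as ordinary `ArgumentError`s on the Ruby side; when omitted, `n_tokens` defaults to the length of the `embd` array and `n_threads` to 1. A minimal sketch, assuming `context` is an initialized `LLaMACpp::Context` as in the earlier example:

    context.eval_embd(embd: [0.0] * context.n_embd, n_past: 0, n_tokens: 1)  # one embedding vector of n_embd floats
    context.eval_embd(embd: [1, 2, 3], n_past: 0)  # raises ArgumentError: embd must be an array of floats
    context.eval_embd(embd: [0.0])                 # raises ArgumentError (n_past is a required keyword)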
@@ -1198,7 +1262,11 @@ private:
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
-    const int seed = NUM2INT(seed_);
+    if (NUM2INT(seed_) < 0) {
+      rb_raise(rb_eArgError, "seed must be a non-negative integer");
+      return Qnil;
+    }
+    const uint32_t seed = NUM2INT(seed_);
     llama_set_rng_seed(ptr->ctx, seed);
     return Qnil;
   };
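On the Ruby side this means the RNG seed is now validated before being handed to llama.cpp. A small sketch (the Ruby-visible name of this binding is assumed to be `set_rng_seed`; it is not shown in the hunk):

    context.set_rng_seed(12345)  # stored as an unsigned 32-bit seed
    context.set_rng_seed(-1)     # now raises ArgumentError: seed must be a non-negative integer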
@@ -1901,6 +1969,11 @@ extern "C" void Init_llama_cpp(void) {
   ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
 
+  ss_magic.str("");
+  ss_magic.clear(std::stringstream::goodbit);
+  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
+  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
 }
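As with the `*_MAGIC` constants registered just above, the new constant is exposed to Ruby as a hex string rather than an Integer. A hedged sketch of what that looks like (the printed value depends on how the bundled llama.cpp header defines `LLAMA_DEFAULT_SEED`):

    require 'llama_cpp'

    puts LLaMACpp::LLAMA_DEFAULT_SEED  # e.g. "0xffffffff" if LLAMA_DEFAULT_SEED is 0xFFFFFFFF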
@@ -223,6 +223,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] + y[i];
 }
 
+static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = __hadd(x[i], __float2half(y[i]));
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -1235,7 +1244,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
 }
 
 static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1283,9 +1292,9 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
+    const int row_stride_x, const int channel_stride_x) {
 
-    const half * x = (half *) vx;
+    const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1328,14 +1337,14 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 }
 
 static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     float * dsti = (float *) cdsti;
 
     *dsti = *xi;
 }
 
 static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (float *) cxi;
+    const float * xi = (const float *) cxi;
     half * dsti = (half *) cdsti;
 
     *dsti = __float2half(*xi);
@@ -1459,6 +1468,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -1684,7 +1698,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     const dim3 block_nums(1, nrows_x, nchannels_x);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
 }
 
 static void ggml_cpy_f32_f32_cuda(
@@ -1941,7 +1955,7 @@ inline void ggml_cuda_op_add(
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
 
-    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
@@ -1949,7 +1963,13 @@ inline void ggml_cuda_op_add(
     const int64_t i01_diff = i01_high - i01_low;
 
     // compute
-    add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
     CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
@@ -2547,8 +2567,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 }
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
+    // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
+    // Due to flatten_rows == true this does in practice not make a difference however.
+    // Better solution would be nice but right now that would require disproportionate changes.
+    GGML_ASSERT(
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+        src1->type == GGML_TYPE_F32 &&
+        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2801,7 +2827,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -2810,11 +2836,11 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src0->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
@@ -2822,11 +2848,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     memset(extra, 0, sizeof(*extra));
 
     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
@@ -2865,11 +2892,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
 }
 
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true);
+    ggml_cuda_assign_buffers_impl(tensor, true, false);
 }
 
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false);
+}
+
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, true);
 }
 
 void ggml_cuda_set_main_device(int main_device) {
@@ -29,6 +29,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
 void ggml_cuda_free_data(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);