llama_cpp 0.16.1 → 0.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: '0408c7d579262d0ba34013671a446a76513f6f4564270ef8ba471203fba75c59'
- data.tar.gz: a8085d9678999bb79ba788f7ce856c3f4fe1c6e131af569eaa54aa71fc9ae773
+ metadata.gz: 78a5062740a7262e9b0d1d792a59f32e4962385110509b4433c186e78e58f8bc
+ data.tar.gz: e0d5921d4dba1496cc376919b9166162e11b358218da5aa1bb5d1b06ebcb7f64
  SHA512:
- metadata.gz: 32e8f294a7f88db05abba3d1e11c951a38f366cac83712f89aa68ed95c581d8eaa4df3d5473f1af3cee965d7a66ea2bf5ccb00222337c59d97fca32ba5e9cade
- data.tar.gz: 2c5f66e2902eb1d72e45261e80f1a2599534e02d0e54e83ff432414d355fab67f6bda9eee095f17904a4fb2d3bf5cb4a1509a346fadff13d999c2f057db972c6
+ metadata.gz: dc7e55f458cd7840fc6830fb8e3228dcbc62eb0fcae87c8ef758e6518502aca0992048ef9278585516b263229d0c0a6a1dfe5ca67b6c88765ee51d4f7ec8b516
+ data.tar.gz: 2819430e6ee8dea168ed5448bc51fed7eed66d60954f3c504f96315359be68ea85bde37ceccdc17feb6832207551154b171b8686196af264a3ee982af8c0e348
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
+ ## [[0.16.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.16.1...v0.16.2)] - 2024-06-22
+
+ - Bump llama.cpp from b3151 to b3197.
+ - Add `LLAMA_POOLING_TYPE_LAST` constant.
+ - Add `--with-vulkan-memory-debug` config option.
+ - Add `set_embeddings` method to `Context`.
+
  ## [[0.16.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.16.0...v0.16.1)] - 2024-06-15

  - Bump llama.cpp from b3091 to b3151.
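
Note: the headline user-facing addition in 0.16.2 is `Context#set_embeddings`, which wraps llama.cpp's `llama_set_embeddings` and lets a single context switch between producing logits and producing embeddings at runtime. A minimal sketch of how it might be used follows; the model path is a placeholder, and the `Model`/`Context` constructor keywords follow the gem's usual README-style usage rather than anything shown in this diff.

```ruby
require 'llama_cpp'

model_params   = LLaMACpp::ModelParams.new
model          = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: model_params)
context_params = LLaMACpp::ContextParams.new
context        = LLaMACpp::Context.new(model: model, params: context_params)

# New in 0.16.2: toggle embedding output on an existing context.
context.set_embeddings(true)    # any truthy value enables embedding output; returns nil
# ... decode a batch here, then read a pooled embedding:
# vec = context.embeddings_seq(0)
context.set_embeddings(false)   # switch the same context back to producing logits
```
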
@@ -22,6 +22,7 @@ make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
  make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
  make_envs << ' LLAMA_NO_OPENMP=1' if with_config('no-openmp')
  make_envs << ' LLAMA_NO_LLAMAFILE=1' if with_config('no-llamafile')
+ make_envs << ' LLAMA_VULKAN_MEMORY_DEBUG=1' if with_config('vulkan-memory-debug')

  make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)

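Note: this hunk is the build-config side of the new `--with-vulkan-memory-debug` option listed in the CHANGELOG: the flag is read through mkmf's `with_config` and forwarded to the vendored llama.cpp build as `LLAMA_VULKAN_MEMORY_DEBUG=1`. It presumably only has an effect together with the Vulkan backend, i.e. an install along the lines of `gem install llama_cpp -- --with-vulkan --with-vulkan-memory-debug` (shown as an illustration; the gem's README documents the supported flags).
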
@@ -2133,6 +2133,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
  rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
+ rb_define_method(rb_cLLaMAContext, "set_embeddings", RUBY_METHOD_FUNC(_llama_context_set_embeddings), 1);
  rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
@@ -2357,6 +2358,16 @@ private:
  return output;
  }

+ static VALUE _llama_context_set_embeddings(VALUE self, VALUE embs) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_set_embeddings(ptr->ctx, RTEST(embs) ? true : false);
+ return Qnil;
+ }
+
  static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
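
Note: as the binding above shows, the new Ruby method passes its argument through `RTEST`, always returns `nil`, and raises `ArgumentError` when the wrapped context was never initialized. A small sketch of that contract, reusing the `context` object from the earlier example:

```ruby
context.set_embeddings(true)   # truthy  -> llama_set_embeddings(ctx, true)
context.set_embeddings(nil)    # falsy   -> llama_set_embeddings(ctx, false)
context.set_embeddings(1)      # => nil (the return value carries no information)

# On a context that was never initialized, the call raises:
#   ArgumentError: LLaMA context is not initialized
```
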
@@ -3572,6 +3583,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_LAST", INT2NUM(LLAMA_POOLING_TYPE_LAST));

  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_NONE", INT2NUM(LLAMA_SPLIT_MODE_NONE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_LAYER", INT2NUM(LLAMA_SPLIT_MODE_LAYER));
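
Note: `LLAMA_POOLING_TYPE_LAST` exposes llama.cpp's last-token pooling mode for sequence embeddings. Below is a hedged sketch of selecting it when creating a context; whether `ContextParams` exposes a `pooling_type=` writer is an assumption based on the shape of the gem's other params accessors, not something this diff confirms (`model` is reused from the earlier sketch).

```ruby
context_params = LLaMACpp::ContextParams.new
context_params.pooling_type = LLaMACpp::LLAMA_POOLING_TYPE_LAST  # assumed accessor, illustrative only
context = LLaMACpp::Context.new(model: model, params: context_params)

context.set_embeddings(true)
# After decoding a batch, embeddings_seq(0) would return the last-token-pooled
# embedding for sequence 0 under this pooling mode.
```
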
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.16.1'
+   VERSION = '0.16.2'

    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'b3151'
+   LLAMA_CPP_VERSION = 'b3197'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -99,6 +99,7 @@ module LLaMACpp
  LLAMA_POOLING_TYPE_NONE: Integer
  LLAMA_POOLING_TYPE_MEAN: Integer
  LLAMA_POOLING_TYPE_CLS: Integer
+ LLAMA_POOLING_TYPE_LAST: Integer

  LLAMA_SPLIT_MODE_NONE: Integer
  LLAMA_SPLIT_MODE_LAYER: Integer
@@ -258,6 +259,7 @@ module LLaMACpp
  def embeddings_seq: (Integer) -> Array[Float]
  def decode: (::LLaMACpp::Batch) -> void
  def logits: () -> Array[Float]
+ def set_embeddings: (bool) -> void
  def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
  def n_ctx: () -> Integer
  def n_batch: () -> Integer
@@ -38,6 +38,7 @@ BUILD_TARGETS = \
  llama-tokenize \
  llama-train-text-from-scratch \
  llama-vdot \
+ llama-cvector-generator \
  tests/test-c.o

  # Binaries only useful for tests
@@ -508,7 +509,7 @@ ifdef LLAMA_CUDA
  CUDA_PATH ?= /usr/local/cuda
  endif
  MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
- MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
+ MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
  OBJS += $(OBJS_CUDA_TEMP_INST)
@@ -609,6 +610,10 @@ ifdef LLAMA_VULKAN_DEBUG
  MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
  endif

+ ifdef LLAMA_VULKAN_MEMORY_DEBUG
+ MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
+ endif
+
  ifdef LLAMA_VULKAN_VALIDATE
  MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
  endif
@@ -827,7 +832,6 @@ libllama.so: llama.o ggml.o $(OBJS)
  libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
  ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)

-
  lib: llama.o ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
  ar rcs libllama.a $^
@@ -928,6 +932,10 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1172,7 +1172,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  // check if a backend with higher prio wants to offload the op
  if (src_backend_id == sched->n_backends - 1) {
  for (int b = 0; b < src_backend_id; b++) {
- if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+ if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
  SET_CAUSE(tensor, "1.off");
  return b;
  }
@@ -1706,14 +1706,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
  bool backend_ids_changed = false;
  for (int i = 0; i < sched->graph->n_nodes; i++) {
- if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
+ sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
  backend_ids_changed = true;
  break;
  }
  }
  if (!backend_ids_changed) {
  for (int i = 0; i < sched->graph->n_leafs; i++) {
- if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
+ sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
  backend_ids_changed = true;
  break;
  }
@@ -1977,6 +1979,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
  return sched->n_copies;
  }

+ int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+ return sched->n_backends;
+ }
+
+ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+ GGML_ASSERT(i >= 0 && i < sched->n_backends);
+ return sched->backends[i];
+ }
+
  size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@@ -182,6 +182,9 @@ extern "C" {
  // Initialize backend buffers from a measure graph
  GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);

+ GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
+ GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+
  // Get the number of splits of the last graph
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
  GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
@@ -30,34 +30,34 @@ void ggml_cuda_op_mul_mat_q(

  switch (src0->type) {
  case GGML_TYPE_Q4_0:
- mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
  break;
  case GGML_TYPE_Q4_1:
- mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
  break;
  case GGML_TYPE_Q5_0:
- mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
  break;
  case GGML_TYPE_Q5_1:
- mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
  break;
  case GGML_TYPE_Q8_0:
- mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
  break;
  case GGML_TYPE_Q2_K:
- mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
  break;
  case GGML_TYPE_Q3_K:
- mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
  break;
  case GGML_TYPE_Q4_K:
- mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
  break;
  case GGML_TYPE_Q5_K:
- mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
  break;
  case GGML_TYPE_Q6_K:
- mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
  break;
  default:
  GGML_ASSERT(false);
@@ -117,7 +117,7 @@ static __global__ void mul_mat_vec_q(
  tmp[j][i] = warp_reduce_sum(tmp[j][i]);
  }

- if (threadIdx.x < rows_per_cuda_block) {
+ if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
  dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
  }
  }
@@ -92,6 +92,15 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] * x[i];
  }

+ static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = sqrtf(x[i]);
+ }
+
  static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
  gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -142,6 +151,11 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
  sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
  }

+ static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
+ sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+ }
+
  void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
  const ggml_tensor * src0 = dst->src[0];
  const float * src0_d = (const float *)src0->data;
@@ -284,3 +298,17 @@ void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

  sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
  }
+
+ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const float * src0_d = (const float *)src0->data;
+ float * dst_d = (float *)dst->data;
+ cudaStream_t stream = ctx.stream();
+
+ GGML_ASSERT(ggml_is_contiguous(src0));
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+ }
@@ -17,7 +17,7 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

- #if defined(_WIN32)
+ #if defined(_MSC_VER)

  #define m512bh(p) p
  #define m512i(p) p
@@ -735,6 +735,12 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
  }

  static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
+ for (size_t i = 0, n = 3; i < n; ++i) {
+ if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
+ return false;
+ }
+ }
+
  switch (op->op) {
  case GGML_OP_UNARY:
  switch (ggml_get_unary_op(op)) {