llama_cpp 0.16.1 → 0.16.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: '0408c7d579262d0ba34013671a446a76513f6f4564270ef8ba471203fba75c59'
- data.tar.gz: a8085d9678999bb79ba788f7ce856c3f4fe1c6e131af569eaa54aa71fc9ae773
+ metadata.gz: 78a5062740a7262e9b0d1d792a59f32e4962385110509b4433c186e78e58f8bc
+ data.tar.gz: e0d5921d4dba1496cc376919b9166162e11b358218da5aa1bb5d1b06ebcb7f64
  SHA512:
- metadata.gz: 32e8f294a7f88db05abba3d1e11c951a38f366cac83712f89aa68ed95c581d8eaa4df3d5473f1af3cee965d7a66ea2bf5ccb00222337c59d97fca32ba5e9cade
- data.tar.gz: 2c5f66e2902eb1d72e45261e80f1a2599534e02d0e54e83ff432414d355fab67f6bda9eee095f17904a4fb2d3bf5cb4a1509a346fadff13d999c2f057db972c6
+ metadata.gz: dc7e55f458cd7840fc6830fb8e3228dcbc62eb0fcae87c8ef758e6518502aca0992048ef9278585516b263229d0c0a6a1dfe5ca67b6c88765ee51d4f7ec8b516
+ data.tar.gz: 2819430e6ee8dea168ed5448bc51fed7eed66d60954f3c504f96315359be68ea85bde37ceccdc17feb6832207551154b171b8686196af264a3ee982af8c0e348
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
+ ## [[0.16.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.16.1...v0.16.2)] - 2024-06-22
+
+ - Bump llama.cpp from b3151 to b3197.
+ - Add `LLAMA_POOLING_TYPE_LAST` constant.
+ - Add `--with-vulkan-memory-debug` config option.
+ - Add `set_embeddings` method to `Context`.
+
  ## [[0.16.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.16.0...v0.16.1)] - 2024-06-15

  - Bump llama.cpp from b3091 to b3151.
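The Ruby-visible additions in 0.16.2 are small: one pooling-type constant and one `Context` method. A minimal usage sketch (the model path and surrounding setup are placeholders following the gem's README conventions, not part of this diff):

```ruby
require 'llama_cpp'

# Placeholder setup in the gem's usual style; the model path is hypothetical.
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# New in 0.16.2: toggle embedding output on an existing context
# (wraps llama_set_embeddings from the bundled llama.cpp b3197).
context.set_embeddings(true)

# New in 0.16.2: constant mirroring llama.cpp's LLAMA_POOLING_TYPE_LAST.
puts LLaMACpp::LLAMA_POOLING_TYPE_LAST
```

The `--with-vulkan-memory-debug` option, by contrast, is a build-time flag handled by the gem's extconf (the `with_config` hunk that follows); like the gem's other `--with-*` options it would be passed through when installing the gem rather than used at runtime.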
@@ -22,6 +22,7 @@ make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
  make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
  make_envs << ' LLAMA_NO_OPENMP=1' if with_config('no-openmp')
  make_envs << ' LLAMA_NO_LLAMAFILE=1' if with_config('no-llamafile')
+ make_envs << ' LLAMA_VULKAN_MEMORY_DEBUG=1' if with_config('vulkan-memory-debug')

  make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)

@@ -2133,6 +2133,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
  rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
+ rb_define_method(rb_cLLaMAContext, "set_embeddings", RUBY_METHOD_FUNC(_llama_context_set_embeddings), 1);
  rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
@@ -2357,6 +2358,16 @@ private:
  return output;
  }

+ static VALUE _llama_context_set_embeddings(VALUE self, VALUE embs) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_set_embeddings(ptr->ctx, RTEST(embs) ? true : false);
+ return Qnil;
+ }
+
  static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
@@ -3572,6 +3583,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_LAST", INT2NUM(LLAMA_POOLING_TYPE_LAST));

  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_NONE", INT2NUM(LLAMA_SPLIT_MODE_NONE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_LAYER", INT2NUM(LLAMA_SPLIT_MODE_LAYER));
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.16.1'
+ VERSION = '0.16.2'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b3151'
+ LLAMA_CPP_VERSION = 'b3197'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -99,6 +99,7 @@ module LLaMACpp
  LLAMA_POOLING_TYPE_NONE: Integer
  LLAMA_POOLING_TYPE_MEAN: Integer
  LLAMA_POOLING_TYPE_CLS: Integer
+ LLAMA_POOLING_TYPE_LAST: Integer

  LLAMA_SPLIT_MODE_NONE: Integer
  LLAMA_SPLIT_MODE_LAYER: Integer
@@ -258,6 +259,7 @@ module LLaMACpp
  def embeddings_seq: (Integer) -> Array[Float]
  def decode: (::LLaMACpp::Batch) -> void
  def logits: () -> Array[Float]
+ def set_embeddings: (bool) -> void
  def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
  def n_ctx: () -> Integer
  def n_batch: () -> Integer
@@ -38,6 +38,7 @@ BUILD_TARGETS = \
  llama-tokenize \
  llama-train-text-from-scratch \
  llama-vdot \
+ llama-cvector-generator \
  tests/test-c.o

  # Binaries only useful for tests
@@ -508,7 +509,7 @@ ifdef LLAMA_CUDA
  CUDA_PATH ?= /usr/local/cuda
  endif
  MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
- MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
+ MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
  OBJS += $(OBJS_CUDA_TEMP_INST)
@@ -609,6 +610,10 @@ ifdef LLAMA_VULKAN_DEBUG
  MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
  endif

+ ifdef LLAMA_VULKAN_MEMORY_DEBUG
+ MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
+ endif
+
  ifdef LLAMA_VULKAN_VALIDATE
  MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
  endif
@@ -827,7 +832,6 @@ libllama.so: llama.o ggml.o $(OBJS)
  libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
  ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)

-
  lib: llama.o ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
  ar rcs libllama.a $^
@@ -928,6 +932,10 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1172,7 +1172,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  // check if a backend with higher prio wants to offload the op
  if (src_backend_id == sched->n_backends - 1) {
  for (int b = 0; b < src_backend_id; b++) {
- if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+ if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
  SET_CAUSE(tensor, "1.off");
  return b;
  }
@@ -1706,14 +1706,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
  bool backend_ids_changed = false;
  for (int i = 0; i < sched->graph->n_nodes; i++) {
- if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
+ sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
  backend_ids_changed = true;
  break;
  }
  }
  if (!backend_ids_changed) {
  for (int i = 0; i < sched->graph->n_leafs; i++) {
- if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
+ sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
  backend_ids_changed = true;
  break;
  }
@@ -1977,6 +1979,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
  return sched->n_copies;
  }

+ int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+ return sched->n_backends;
+ }
+
+ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+ GGML_ASSERT(i >= 0 && i < sched->n_backends);
+ return sched->backends[i];
+ }
+
  size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@@ -182,6 +182,9 @@ extern "C" {
  // Initialize backend buffers from a measure graph
  GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);

+ GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
+ GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+
  // Get the number of splits of the last graph
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
  GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
@@ -30,34 +30,34 @@ void ggml_cuda_op_mul_mat_q(

  switch (src0->type) {
  case GGML_TYPE_Q4_0:
- mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
  break;
  case GGML_TYPE_Q4_1:
- mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
  break;
  case GGML_TYPE_Q5_0:
- mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
  break;
  case GGML_TYPE_Q5_1:
- mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
  break;
  case GGML_TYPE_Q8_0:
- mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
  break;
  case GGML_TYPE_Q2_K:
- mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
  break;
  case GGML_TYPE_Q3_K:
- mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
  break;
  case GGML_TYPE_Q4_K:
- mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
  break;
  case GGML_TYPE_Q5_K:
- mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
  break;
  case GGML_TYPE_Q6_K:
- mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
+ mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
  break;
  default:
  GGML_ASSERT(false);
@@ -117,7 +117,7 @@ static __global__ void mul_mat_vec_q(
  tmp[j][i] = warp_reduce_sum(tmp[j][i]);
  }

- if (threadIdx.x < rows_per_cuda_block) {
+ if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
  dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
  }
  }
@@ -92,6 +92,15 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] * x[i];
  }

+ static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = sqrtf(x[i]);
+ }
+
  static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
  gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -142,6 +151,11 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
  sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
  }

+ static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
+ sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+ }
+
  void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
  const ggml_tensor * src0 = dst->src[0];
  const float * src0_d = (const float *)src0->data;
@@ -284,3 +298,17 @@ void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

  sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
  }
+
+ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const float * src0_d = (const float *)src0->data;
+ float * dst_d = (float *)dst->data;
+ cudaStream_t stream = ctx.stream();
+
+ GGML_ASSERT(ggml_is_contiguous(src0));
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+ }
@@ -17,7 +17,7 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

- #if defined(_WIN32)
+ #if defined(_MSC_VER)

  #define m512bh(p) p
  #define m512i(p) p
@@ -735,6 +735,12 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
  }

  static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
+ for (size_t i = 0, n = 3; i < n; ++i) {
+ if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
+ return false;
+ }
+ }
+
  switch (op->op) {
  case GGML_OP_UNARY:
  switch (ggml_get_unary_op(op)) {