llama_cpp 0.16.1 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +10 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +10 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +28 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +6 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +8 -3
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2124 -13202
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +27564 -23876
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +278 -366
- data/vendor/tmp/llama.cpp/ggml.c +67 -150
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +530 -237
- data/vendor/tmp/llama.cpp/llama.h +5 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 78a5062740a7262e9b0d1d792a59f32e4962385110509b4433c186e78e58f8bc
+  data.tar.gz: e0d5921d4dba1496cc376919b9166162e11b358218da5aa1bb5d1b06ebcb7f64
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dc7e55f458cd7840fc6830fb8e3228dcbc62eb0fcae87c8ef758e6518502aca0992048ef9278585516b263229d0c0a6a1dfe5ca67b6c88765ee51d4f7ec8b516
+  data.tar.gz: 2819430e6ee8dea168ed5448bc51fed7eed66d60954f3c504f96315359be68ea85bde37ceccdc17feb6832207551154b171b8686196af264a3ee982af8c0e348
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
+## [[0.16.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.16.1...v0.16.2)] - 2024-06-22
+
+- Bump llama.cpp from b3151 to b3197.
+- Add `LLAMA_POOLING_TYPE_LAST` constant.
+- Add `--with-vulkan-memory-debug` config option.
+- Add `set_embeddings` method to `Context`.
+
 ## [[0.16.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.16.0...v0.16.1)] - 2024-06-15
 
 - Bump llama.cpp from b3091 to b3151.
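Taken together, the Ruby-visible additions in 0.16.2 are the `LLAMA_POOLING_TYPE_LAST` constant and `Context#set_embeddings`. A minimal usage sketch (the model path and the `Model`/`Context` constructor keywords are illustrative assumptions based on the existing 0.16.x API; only the constant and `set_embeddings` come from this diff):

```ruby
require 'llama_cpp'

# New module-level constant mirroring llama.cpp's LLAMA_POOLING_TYPE_LAST.
puts LLaMACpp::LLAMA_POOLING_TYPE_LAST

# Assumed setup following the existing 0.16.x API; adjust the GGUF path.
model   = LLaMACpp::Model.new(model_path: 'model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# New in 0.16.2: toggle embedding output on an existing context at runtime.
context.set_embeddings(true)
```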
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -22,6 +22,7 @@ make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
 make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
 make_envs << ' LLAMA_NO_OPENMP=1' if with_config('no-openmp')
 make_envs << ' LLAMA_NO_LLAMAFILE=1' if with_config('no-llamafile')
+make_envs << ' LLAMA_VULKAN_MEMORY_DEBUG=1' if with_config('vulkan-memory-debug')
 
 make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)
 
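For context, `with_config('vulkan-memory-debug')` picks up a `--with-vulkan-memory-debug` flag passed to extconf.rb at install time and forwards it to the bundled llama.cpp build as `LLAMA_VULKAN_MEMORY_DEBUG=1`. An illustrative install invocation (a sketch, not taken from the diff; flags after `--` are forwarded to extconf.rb, and the memory-debug option only makes sense together with the Vulkan backend):

```ruby
# Illustrative sketch: install the gem with the Vulkan backend and the new
# memory-debug option enabled. Flags after '--' are passed to extconf.rb.
system('gem', 'install', 'llama_cpp', '--',
       '--with-vulkan', '--with-vulkan-memory-debug') or abort('install failed')
```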
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -2133,6 +2133,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
     rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
+    rb_define_method(rb_cLLaMAContext, "set_embeddings", RUBY_METHOD_FUNC(_llama_context_set_embeddings), 1);
     rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
@@ -2357,6 +2358,16 @@ private:
     return output;
   }
 
+  static VALUE _llama_context_set_embeddings(VALUE self, VALUE embs) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eArgError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_set_embeddings(ptr->ctx, RTEST(embs) ? true : false);
+    return Qnil;
+  }
+
   static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
@@ -3572,6 +3583,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_NONE", INT2NUM(LLAMA_POOLING_TYPE_NONE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_MEAN", INT2NUM(LLAMA_POOLING_TYPE_MEAN));
   rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_CLS", INT2NUM(LLAMA_POOLING_TYPE_CLS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_POOLING_TYPE_LAST", INT2NUM(LLAMA_POOLING_TYPE_LAST));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_NONE", INT2NUM(LLAMA_SPLIT_MODE_NONE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_SPLIT_MODE_LAYER", INT2NUM(LLAMA_SPLIT_MODE_LAYER));
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.16.1'
+  VERSION = '0.16.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b3151'
+  LLAMA_CPP_VERSION = 'b3197'
 end
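A quick post-upgrade sanity check against the bumped constants (a sketch; the expected values are the ones set above):

```ruby
require 'llama_cpp'

# Expected to print "0.16.2" and "b3197" for this release.
puts LLaMACpp::VERSION
puts LLaMACpp::LLAMA_CPP_VERSION
```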
data/sig/llama_cpp.rbs
CHANGED
@@ -99,6 +99,7 @@ module LLaMACpp
   LLAMA_POOLING_TYPE_NONE: Integer
   LLAMA_POOLING_TYPE_MEAN: Integer
   LLAMA_POOLING_TYPE_CLS: Integer
+  LLAMA_POOLING_TYPE_LAST: Integer
 
   LLAMA_SPLIT_MODE_NONE: Integer
   LLAMA_SPLIT_MODE_LAYER: Integer
@@ -258,6 +259,7 @@ module LLaMACpp
     def embeddings_seq: (Integer) -> Array[Float]
     def decode: (::LLaMACpp::Batch) -> void
     def logits: () -> Array[Float]
+    def set_embeddings: (bool) -> void
     def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
     def n_ctx: () -> Integer
     def n_batch: () -> Integer
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -38,6 +38,7 @@ BUILD_TARGETS = \
 	llama-tokenize \
 	llama-train-text-from-scratch \
 	llama-vdot \
+	llama-cvector-generator \
 	tests/test-c.o
 
 # Binaries only useful for tests
@@ -508,7 +509,7 @@ ifdef LLAMA_CUDA
 CUDA_PATH ?= /usr/local/cuda
 endif
 MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
+MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 OBJS += ggml-cuda.o
 OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 OBJS += $(OBJS_CUDA_TEMP_INST)
@@ -609,6 +610,10 @@ ifdef LLAMA_VULKAN_DEBUG
 MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
 endif
 
+ifdef LLAMA_VULKAN_MEMORY_DEBUG
+MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
+endif
+
 ifdef LLAMA_VULKAN_VALIDATE
 MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
 endif
@@ -827,7 +832,6 @@ libllama.so: llama.o ggml.o $(OBJS)
 libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
 	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
 
-
 lib: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
 	ar rcs libllama.a $^
@@ -928,6 +932,10 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -1172,7 +1172,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // check if a backend with higher prio wants to offload the op
     if (src_backend_id == sched->n_backends - 1) {
         for (int b = 0; b < src_backend_id; b++) {
-            if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+            if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                 SET_CAUSE(tensor, "1.off");
                 return b;
             }
@@ -1706,14 +1706,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     bool backend_ids_changed = false;
     for (int i = 0; i < sched->graph->n_nodes; i++) {
-        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
+        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
+            sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
             backend_ids_changed = true;
             break;
         }
     }
     if (!backend_ids_changed) {
         for (int i = 0; i < sched->graph->n_leafs; i++) {
-            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
+            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
+                sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
                 backend_ids_changed = true;
                 break;
             }
@@ -1977,6 +1979,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
     return sched->n_copies;
 }
 
+int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+    return sched->n_backends;
+}
+
+ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+    GGML_ASSERT(i >= 0 && i < sched->n_backends);
+    return sched->backends[i];
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED
@@ -182,6 +182,9 @@ extern "C" {
     // Initialize backend buffers from a measure graph
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
 
+    GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
+    GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+
     // Get the number of splits of the last graph
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu
CHANGED
@@ -30,34 +30,34 @@ void ggml_cuda_op_mul_mat_q(
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
             break;
         case GGML_TYPE_Q4_1:
-            mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
             break;
         case GGML_TYPE_Q5_0:
-            mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
             break;
         case GGML_TYPE_Q5_1:
-            mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
             break;
         case GGML_TYPE_Q8_0:
-            mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
             break;
         case GGML_TYPE_Q2_K:
-            mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q3_K:
-            mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q4_K:
-            mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q5_K:
-            mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q6_K:
-            mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
             break;
         default:
             GGML_ASSERT(false);
data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu
CHANGED
@@ -117,7 +117,7 @@ static __global__ void mul_mat_vec_q(
             tmp[j][i] = warp_reduce_sum(tmp[j][i]);
         }
 
-        if (threadIdx.x < rows_per_cuda_block) {
+        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
             dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
         }
     }
data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu
CHANGED
@@ -92,6 +92,15 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] * x[i];
 }
 
+static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sqrtf(x[i]);
+}
+
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
     gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -142,6 +151,11 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
     sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
+    sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
@@ -284,3 +298,17 @@ void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
+
+void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
data/vendor/tmp/llama.cpp/ggml-metal.m
CHANGED
@@ -735,6 +735,12 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
 }
 
 static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
+    for (size_t i = 0, n = 3; i < n; ++i) {
+        if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
+            return false;
+        }
+    }
+
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {