llama_cpp 0.15.2 → 0.15.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d0a9cdf86695522e27b1e8d3ed485dfa6ab3a4fc23d9bd9e44bf8c3cb483c347
+  data.tar.gz: 5d97cec87f9b1df94f85f9e18dc46a1b8a4ec593c17d04e4bee0da3d28c34211
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 71f26009b872db64d0d0d416153b5fbd6afb598617b701cb6342d099542c962f410bccddf80b77928bfd8ab8f017a749fbc1d2ed488139d806ef0e3cf75a0e42
+  data.tar.gz: 808c03f6664af65cadfea23071d0b55d459c119189346762ea9632156f7f35b8d1f0e594b356726fc26abdb1c81a3bce9d697b9ca2d6324c454a31f2a442f0d7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
+## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25
+
+- Bump llama.cpp from b2917 to b2988.
+- Add constants for pre-tokenization types.
+- Add `n_threads` method to `Context`.
+- Add `n_threads_batch` method to `Context`.
+- Add `set_n_threads` method to `Context`.
+
 ## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18
 
 - Bump llama.cpp from b2839 to b2917.
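The new thread-control methods can be exercised from Ruby roughly as follows (a minimal sketch; it assumes `context` is an already initialized `LLaMACpp::Context`, and the thread counts are placeholder values):

    # assumes `context` is an initialized LLaMACpp::Context
    context.set_n_threads(n_threads: 4, n_threads_batch: 8)  # keyword arguments, both required
    context.n_threads        # => 4
    context.n_threads_batch  # => 8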
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -2122,10 +2122,13 @@ public:
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
     rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
+    rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
     rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
     rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
+    rb_define_method(rb_cLLaMAContext, "n_threads", RUBY_METHOD_FUNC(_llama_context_n_threads), 0);
+    rb_define_method(rb_cLLaMAContext, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_n_threads_batch), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2343,6 +2346,33 @@ private:
     return output;
   }
 
+  static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
+    VALUE kw_values[2] = { Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+
+    VALUE n_threads = kw_values[0];
+    if (!RB_INTEGER_TYPE_P(n_threads)) {
+      rb_raise(rb_eArgError, "n_threads must be an integer");
+      return Qnil;
+    }
+    VALUE n_threads_batch = kw_values[1];
+    if (!RB_INTEGER_TYPE_P(n_threads_batch)) {
+      rb_raise(rb_eArgError, "n_threads_batch must be an integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eArgError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_set_n_threads(ptr->ctx, NUM2UINT(n_threads), NUM2UINT(n_threads_batch));
+    return Qnil;
+  }
+
   static VALUE _llama_context_n_ctx(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -2379,6 +2409,24 @@ private:
     return UINT2NUM(llama_n_seq_max(ptr->ctx));
   }
 
+  static VALUE _llama_context_n_threads(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_threads(ptr->ctx));
+  }
+
+  static VALUE _llama_context_n_threads_batch(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_threads_batch(ptr->ctx));
+  }
+
   static VALUE _llama_context_get_timings(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -3430,6 +3478,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STABLELM2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
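On the Ruby side, `set_n_threads` is bound with variable arity (`-1`) and reads its keyword arguments via `rb_get_kwargs`, so both keywords are required and must be Integers; anything else raises `ArgumentError`. A small illustrative sketch (placeholder values, `context` assumed to be an initialized `LLaMACpp::Context`):

    LLaMACpp::LLAMA_VOCAB_PRE_TYPE_STABLELM2                  # newly exposed pre-tokenization constant
    context.set_n_threads(n_threads: 4, n_threads_batch: 8)   # ok, returns nil
    context.set_n_threads(n_threads: '4', n_threads_batch: 8)
    # => ArgumentError: n_threads must be an integer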
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.15.2'
+  VERSION = '0.15.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2917'
+  LLAMA_CPP_VERSION = 'b2988'
 end
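Both constants can be inspected at runtime after upgrading, which is handy when reporting issues that depend on the bundled llama.cpp revision:

    require 'llama_cpp'
    LLaMACpp::VERSION            # => "0.15.3"
    LLaMACpp::LLAMA_CPP_VERSION  # => "b2988"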
data/sig/llama_cpp.rbs CHANGED
@@ -26,6 +26,7 @@ module LLaMACpp
   LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
   LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
   LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+  LLAMA_VOCAB_PRE_TYPE_STABLELM2: Integer
   LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
   LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
   LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
@@ -241,10 +242,13 @@ module LLaMACpp
     def embeddings_seq: (Integer) -> Array[Float]
     def decode: (::LLaMACpp::Batch) -> void
     def logits: () -> Array[Float]
+    def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
     def n_ctx: () -> Integer
     def n_batch: () -> Integer
     def n_ubatch: () -> Integer
     def n_seq_max: () -> Integer
+    def n_threads: () -> Integer
+    def n_threads_batch: () -> Integer
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
data/vendor/tmp/llama.cpp/Makefile CHANGED
@@ -381,15 +381,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
     CUDA_POWER_ARCH = 1
 endif
 
+ifneq ($(filter loongarch64%,$(UNAME_M)),)
+    MK_CFLAGS += -mlasx
+    MK_CXXFLAGS += -mlasx
+endif
+
 else
     MK_CFLAGS += -march=rv64gcv -mabi=lp64d
     MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 
-ifdef LLAMA_QKK_64
-    MK_CPPFLAGS += -DGGML_QKK_64
-endif
-
 ifndef LLAMA_NO_ACCELERATE
     # Mac OS - include Accelerate framework.
     # `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -401,13 +402,6 @@ ifndef LLAMA_NO_ACCELERATE
     endif
 endif # LLAMA_NO_ACCELERATE
 
-ifdef LLAMA_MPI
-    MK_CPPFLAGS += -DGGML_USE_MPI
-    MK_CFLAGS += -Wno-cast-qual
-    MK_CXXFLAGS += -Wno-cast-qual
-    OBJS += ggml-mpi.o
-endif # LLAMA_MPI
-
 ifdef LLAMA_OPENBLAS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -631,11 +625,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL
 
-ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_MPI
-
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -760,7 +749,7 @@ lib: llama.o ggml.o $(OBJS)
 	ar rcs libllama.a $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll
+	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -vrf ggml-cuda/*.o
 
 #
data/vendor/tmp/llama.cpp/ggml-common.h CHANGED
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
 // QK = number of values after dequantization
 // QK_K = super-block size
 
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
 #define QK_K 256
 #define K_SCALE_SIZE 12
-#endif // GGML_QKK_64
 
 #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
 // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
 #define QI4_NL (QK4_NL / (4*QR4_NL))
 #define QR4_NL 2
 
-#if QK_K == 64
-#define QI4_XS QI4_NL
-#define QR4_XS QR4_NL
-#else
 #define QI4_XS (QK_K / (4*QR4_XS))
 #define QR4_XS 8
-#endif
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
 // weight is represented as x = a * q
 // 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4]; // quants - low 2 bits
-    uint8_t scales[2];
-    ggml_half d; // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
-#else
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
     ggml_half d; // super-block scale
 } block_q3_K;
 static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
-#endif
 
 // 4-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d[2]; // super-block scales/mins
-    uint8_t scales[2]; // 4-bit block scales/mins
-    uint8_t qs[QK_K/2]; // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -270,21 +242,11 @@ typedef struct {
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
-#endif
 
 // 5-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d; // super-block scale
-    int8_t scales[QK_K/16]; // 8-bit block scales
-    uint8_t qh[QK_K/8]; // quants, high bit
-    uint8_t qs[QK_K/2]; // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -298,7 +260,6 @@ typedef struct {
     uint8_t qs[QK_K/2]; // quants, low 4 bits
 } block_q5_K;
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
 
 // 6-bit quantization
 // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
 // 3.4375 bpw
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
 #define IQ3S_N_SCALE QK_K/64
-#endif
 typedef struct {
     ggml_half d;
     uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
 typedef struct {
     uint8_t qs[QK_K/8]; // grid index, low 8 bits
     uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
-#if QK_K == 64
-    ggml_half d;
-#endif
     uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
 } block_iq1_m;
-#if QK_K == 64
-static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
-#else
 static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
-#endif
 
 // Used by IQ1_M quants
 typedef union {
@@ -406,9 +356,6 @@ typedef struct {
 } block_iq4_nl;
 static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
 
-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-#else
 typedef struct {
     ggml_half d;
     uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
     uint8_t qs[QK_K/2];
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-#endif
 
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
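With the GGML_QKK_64 variants removed above, only the 256-value super-block layouts remain. As a quick sanity check of the surviving q4_K layout (a small Ruby sketch; it assumes ggml_half is a 2-byte half-precision float, with QK_K and K_SCALE_SIZE taken from the defines in the first hunk):

    qk_k         = 256                                       # QK_K: weights per super-block
    k_scale_size = 12                                         # K_SCALE_SIZE
    half_bytes   = 2                                          # assumed sizeof(ggml_half), fp16
    block_bytes  = 2 * half_bytes + k_scale_size + qk_k / 2   # => 144, matches the static_assert
    block_bytes * 8.0 / qk_k                                  # => 4.5, the "4.5 bits per weight" in the comment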
data/vendor/tmp/llama.cpp/ggml-cuda.cu CHANGED
@@ -43,19 +43,59 @@
 #include <mutex>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
+    GGML_UNUSED(user_data);
+    fprintf(stderr, "%s", msg);
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-
-
-
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR(" %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
@@ -91,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
 
@@ -99,16 +139,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
-
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
@@ -129,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-
+        GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -235,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-
-
+        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
     }
@@ -250,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -499,7 +539,9 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     void * dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-
+        // clear the error
+        cudaGetLastError();
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -1002,8 +1044,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-
-
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -2246,7 +2288,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2300,7 +2342,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
 
@@ -2476,7 +2518,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2523,14 +2565,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }
 
             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }
 
@@ -2539,7 +2581,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }
 
@@ -2567,7 +2609,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2605,7 +2647,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
             bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
             if (!ok) {
-
+                GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
             }
             GGML_ASSERT(ok);
         }
@@ -2624,7 +2666,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2691,7 +2733,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2948,13 +2990,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }
 
@@ -2998,8 +3040,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         // clear the error
         cudaGetLastError();
 
-
-
+        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
        return false;
     }
     return true;
data/vendor/tmp/llama.cpp/ggml-cuda.h CHANGED
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
data/vendor/tmp/llama.cpp/ggml-impl.h CHANGED
@@ -17,6 +17,18 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#if defined(_WIN32)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
 /**
  * Converts brain16 to float32.
  *
@@ -443,6 +455,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #include <riscv_vector.h>
 #endif
 
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+    int32_t i;
+    float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
 #ifdef __F16C__
 
 #ifdef _MSC_VER
data/vendor/tmp/llama.cpp/ggml-kompute.cpp CHANGED
@@ -1677,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             } break;
         case GGML_OP_ROPE:
             {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
+                GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                 GGML_ASSERT(ne10 == ne02);
                 GGML_ASSERT(src0t == dstt);
                 // const int n_past = ((int32_t *) dst->op_params)[0];