llama_cpp 0.15.2 → 0.15.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d0a9cdf86695522e27b1e8d3ed485dfa6ab3a4fc23d9bd9e44bf8c3cb483c347
+  data.tar.gz: 5d97cec87f9b1df94f85f9e18dc46a1b8a4ec593c17d04e4bee0da3d28c34211
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 71f26009b872db64d0d0d416153b5fbd6afb598617b701cb6342d099542c962f410bccddf80b77928bfd8ab8f017a749fbc1d2ed488139d806ef0e3cf75a0e42
+  data.tar.gz: 808c03f6664af65cadfea23071d0b55d459c119189346762ea9632156f7f35b8d1f0e594b356726fc26abdb1c81a3bce9d697b9ca2d6324c454a31f2a442f0d7
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
+## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25
+
+- Bump llama.cpp from b2917 to b2988.
+- Add constants for pre-tokenization types.
+- Add `n_threads` method to `Context`.
+- Add `n_threads_batch` method to `Context`.
+- Add `set_n_threads` method to `Context`.
+
 ## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18
 
 - Bump llama.cpp from b2839 to b2917.
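The three new `Context` methods wrap llama.cpp's `llama_set_n_threads`, `llama_n_threads`, and `llama_n_threads_batch` (see the binding changes in `data/ext/llama_cpp/llama_cpp.cpp` below). A minimal Ruby sketch of the new API, assuming the gem's usual constructor keywords (`model_path:`, `params:`, `model:`) and a local GGUF model file of your choosing:

  require 'llama_cpp'

  model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: LLaMACpp::ModelParams.new)
  context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

  # New in 0.15.3: both keyword arguments are required and must be Integers.
  context.set_n_threads(n_threads: 4, n_threads_batch: 8)

  context.n_threads       # => 4
  context.n_threads_batch # => 8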
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -2122,10 +2122,13 @@ public:
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
     rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
+    rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
     rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
     rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
+    rb_define_method(rb_cLLaMAContext, "n_threads", RUBY_METHOD_FUNC(_llama_context_n_threads), 0);
+    rb_define_method(rb_cLLaMAContext, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_n_threads_batch), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2343,6 +2346,33 @@ private:
     return output;
   }
 
+  static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
+    VALUE kw_values[2] = { Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+
+    VALUE n_threads = kw_values[0];
+    if (!RB_INTEGER_TYPE_P(n_threads)) {
+      rb_raise(rb_eArgError, "n_threads must be an integer");
+      return Qnil;
+    }
+    VALUE n_threads_batch = kw_values[1];
+    if (!RB_INTEGER_TYPE_P(n_threads_batch)) {
+      rb_raise(rb_eArgError, "n_threads_batch must be an integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eArgError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_set_n_threads(ptr->ctx, NUM2UINT(n_threads), NUM2UINT(n_threads_batch));
+    return Qnil;
+  }
+
   static VALUE _llama_context_n_ctx(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -2379,6 +2409,24 @@ private:
     return UINT2NUM(llama_n_seq_max(ptr->ctx));
   }
 
+  static VALUE _llama_context_n_threads(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_threads(ptr->ctx));
+  }
+
+  static VALUE _llama_context_n_threads_batch(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_threads_batch(ptr->ctx));
+  }
+
   static VALUE _llama_context_get_timings(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -3430,6 +3478,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STABLELM2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
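The `LLAMA_VOCAB_PRE_TYPE_STABLELM2` constant registered above is exposed on the module alongside the existing pre-tokenization type constants; a small sketch:

  require 'llama_cpp'

  # Value mirrors the corresponding llama.cpp enum entry.
  puts LLaMACpp::LLAMA_VOCAB_PRE_TYPE_STABLELM2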
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.15.2'
+  VERSION = '0.15.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2917'
+  LLAMA_CPP_VERSION = 'b2988'
 end
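A quick way to confirm which gem release and which bundled llama.cpp build are installed, using the two constants above:

  require 'llama_cpp'

  puts LLaMACpp::VERSION           # => "0.15.3"
  puts LLaMACpp::LLAMA_CPP_VERSION # => "b2988"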
data/sig/llama_cpp.rbs
CHANGED
@@ -26,6 +26,7 @@ module LLaMACpp
   LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
   LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
   LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+  LLAMA_VOCAB_PRE_TYPE_STABLELM2: Integer
   LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
   LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
   LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
@@ -241,10 +242,13 @@ module LLaMACpp
     def embeddings_seq: (Integer) -> Array[Float]
     def decode: (::LLaMACpp::Batch) -> void
     def logits: () -> Array[Float]
+    def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
     def n_ctx: () -> Integer
     def n_batch: () -> Integer
     def n_ubatch: () -> Integer
     def n_seq_max: () -> Integer
+    def n_threads: () -> Integer
+    def n_threads_batch: () -> Integer
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -381,15 +381,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
         CUDA_POWER_ARCH = 1
     endif
 
+    ifneq ($(filter loongarch64%,$(UNAME_M)),)
+        MK_CFLAGS += -mlasx
+        MK_CXXFLAGS += -mlasx
+    endif
+
 else
     MK_CFLAGS += -march=rv64gcv -mabi=lp64d
     MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 
-ifdef LLAMA_QKK_64
-    MK_CPPFLAGS += -DGGML_QKK_64
-endif
-
 ifndef LLAMA_NO_ACCELERATE
     # Mac OS - include Accelerate framework.
     # `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -401,13 +402,6 @@ ifndef LLAMA_NO_ACCELERATE
     endif
 endif # LLAMA_NO_ACCELERATE
 
-ifdef LLAMA_MPI
-    MK_CPPFLAGS += -DGGML_USE_MPI
-    MK_CFLAGS += -Wno-cast-qual
-    MK_CXXFLAGS += -Wno-cast-qual
-    OBJS += ggml-mpi.o
-endif # LLAMA_MPI
-
 ifdef LLAMA_OPENBLAS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -631,11 +625,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL
 
-ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_MPI
-
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -760,7 +749,7 @@ lib: llama.o ggml.o $(OBJS)
 	ar rcs libllama.a $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll
+	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -vrf ggml-cuda/*.o
 
 #
data/vendor/tmp/llama.cpp/ggml-common.h
CHANGED
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
 // QK = number of values after dequantization
 // QK_K = super-block size
 
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
 #define QK_K 256
 #define K_SCALE_SIZE 12
-#endif // GGML_QKK_64
 
 #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
 // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
 #define QI4_NL (QK4_NL / (4*QR4_NL))
 #define QR4_NL 2
 
-#if QK_K == 64
-#define QI4_XS QI4_NL
-#define QR4_XS QR4_NL
-#else
 #define QI4_XS (QK_K / (4*QR4_XS))
 #define QR4_XS 8
-#endif
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
 // weight is represented as x = a * q
 // 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4];    // quants - low 2 bits
-    uint8_t scales[2];
-    ggml_half d;           // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
-#else
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4];    // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
     ggml_half d;           // super-block scale
 } block_q3_K;
 static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
-#endif
 
 // 4-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d[2];     // super-block scales/mins
-    uint8_t scales[2];  // 4-bit block scales/mins
-    uint8_t qs[QK_K/2]; // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -270,21 +242,11 @@ typedef struct {
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
-#endif
 
 // 5-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d;             // super-block scale
-    int8_t scales[QK_K/16];  // 8-bit block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -298,7 +260,6 @@ typedef struct {
     uint8_t qs[QK_K/2]; // quants, low 4 bits
 } block_q5_K;
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
 
 // 6-bit quantization
 // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
 // 3.4375 bpw
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
 #define IQ3S_N_SCALE QK_K/64
-#endif
 typedef struct {
     ggml_half d;
     uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
 typedef struct {
     uint8_t qs[QK_K/8];  // grid index, low 8 bits
     uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
-#if QK_K == 64
-    ggml_half d;
-#endif
     uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
 } block_iq1_m;
-#if QK_K == 64
-static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
-#else
 static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
-#endif
 
 // Used by IQ1_M quants
 typedef union {
@@ -406,9 +356,6 @@ typedef struct {
 } block_iq4_nl;
 static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
 
-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-#else
 typedef struct {
     ggml_half d;
     uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
     uint8_t qs[QK_K/2];
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-#endif
 
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
data/vendor/tmp/llama.cpp/ggml-cuda.cu
CHANGED
@@ -43,19 +43,59 @@
 #include <mutex>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
+    GGML_UNUSED(user_data);
+    fprintf(stderr, "%s", msg);
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-
-
-
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
@@ -91,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
 
@@ -99,16 +139,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
-
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
@@ -129,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-
+        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -235,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-
-
+        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
     }
@@ -250,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -499,7 +539,9 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     void * dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-
+        // clear the error
+        cudaGetLastError();
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -1002,8 +1044,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-
-
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -2246,7 +2288,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2300,7 +2342,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
 
@@ -2476,7 +2518,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2523,14 +2565,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
             use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
         }
 
         if (node->op == GGML_OP_MUL_MAT_ID) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
         }
 
@@ -2539,7 +2581,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             // Changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
         }
 
@@ -2567,7 +2609,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2605,7 +2647,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
         bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
         if (!ok) {
-
+            GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
         }
         GGML_ASSERT(ok);
     }
@@ -2624,7 +2666,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             use_cuda_graph = false;
             cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
         } else {
             graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2691,7 +2733,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2948,13 +2990,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
        return nullptr;
     }
 
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
        return nullptr;
     }
 
@@ -2998,8 +3040,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         // clear the error
         cudaGetLastError();
 
-
-
+        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
     return true;
data/vendor/tmp/llama.cpp/ggml-cuda.h
CHANGED
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
data/vendor/tmp/llama.cpp/ggml-impl.h
CHANGED
@@ -17,6 +17,18 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#if defined(_WIN32)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
 /**
  * Converts brain16 to float32.
  *
@@ -443,6 +455,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #include <riscv_vector.h>
 #endif
 
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+    int32_t i;
+    float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
 #ifdef __F16C__
 
 #ifdef _MSC_VER
data/vendor/tmp/llama.cpp/ggml-kompute.cpp
CHANGED
@@ -1677,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             } break;
         case GGML_OP_ROPE:
             {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
+                GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                 GGML_ASSERT(ne10 == ne02);
                 GGML_ASSERT(src0t == dstt);
                 // const int n_past = ((int32_t *) dst->op_params)[0];