llama_cpp 0.15.2 → 0.15.3

This diff shows the changes between publicly released versions of this package as they appear in their respective public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 30dd4c29b86098faf7c78de5fa8e57021b631bb5eb3d14c93f63f1d186383ab8
- data.tar.gz: b011d891f1cd725f84821428a8db24004b52c9614e785f493f721f7abde71029
+ metadata.gz: d0a9cdf86695522e27b1e8d3ed485dfa6ab3a4fc23d9bd9e44bf8c3cb483c347
+ data.tar.gz: 5d97cec87f9b1df94f85f9e18dc46a1b8a4ec593c17d04e4bee0da3d28c34211
  SHA512:
- metadata.gz: 6c1628f93762747688f802db8593946e8581c869f63c610669b45759f644b3d19b061825b788e328b6b984977112837586ed398b6118a8f8e5f0c7f6fd0eb2dd
- data.tar.gz: 2f8c3d9f1e6c0f6db7e0682995c8d34179d5405d32784bf00f04a3408cb5bf4c95557bfa1692026f8d3dc9e672d6b15dec5d33cbd76ddc1d94e5ec964a9d0409
+ metadata.gz: 71f26009b872db64d0d0d416153b5fbd6afb598617b701cb6342d099542c962f410bccddf80b77928bfd8ab8f017a749fbc1d2ed488139d806ef0e3cf75a0e42
+ data.tar.gz: 808c03f6664af65cadfea23071d0b55d459c119189346762ea9632156f7f35b8d1f0e594b356726fc26abdb1c81a3bce9d697b9ca2d6324c454a31f2a442f0d7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
+ ## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25
+
+ - Bump llama.cpp from b2917 to b2988.
+ - Add constants for pre-tokenization types.
+ - Add `n_threads` method to `Context`.
+ - Add `n_threads_batch` method to `Context`.
+ - Add `set_n_threads` method to `Context`.
+
  ## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18

  - Bump llama.cpp from b2839 to b2917.
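For reference, a minimal sketch of how the thread-control methods added in 0.15.3 might be used, assuming the usual Model/Context setup from the gem's README (the model path and thread counts below are placeholders):

```ruby
require 'llama_cpp'

# Load a model and build a context (path and params are illustrative).
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# New in 0.15.3: set the thread counts used for single-token and batch
# evaluation, then read the current values back.
context.set_n_threads(n_threads: 4, n_threads_batch: 8)
puts context.n_threads        # => 4
puts context.n_threads_batch  # => 8
```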
@@ -2122,10 +2122,13 @@ public:
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
  rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
+ rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
  rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
  rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
+ rb_define_method(rb_cLLaMAContext, "n_threads", RUBY_METHOD_FUNC(_llama_context_n_threads), 0);
+ rb_define_method(rb_cLLaMAContext, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_n_threads_batch), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2343,6 +2346,33 @@ private:
  return output;
  }

+ static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
+ VALUE kw_args = Qnil;
+ ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
+ VALUE kw_values[2] = { Qundef, Qundef };
+ rb_scan_args(argc, argv, ":", &kw_args);
+ rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+
+ VALUE n_threads = kw_values[0];
+ if (!RB_INTEGER_TYPE_P(n_threads)) {
+ rb_raise(rb_eArgError, "n_threads must be an integer");
+ return Qnil;
+ }
+ VALUE n_threads_batch = kw_values[1];
+ if (!RB_INTEGER_TYPE_P(n_threads_batch)) {
+ rb_raise(rb_eArgError, "n_threads_batch must be an integer");
+ return Qnil;
+ }
+
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_set_n_threads(ptr->ctx, NUM2UINT(n_threads), NUM2UINT(n_threads_batch));
+ return Qnil;
+ }
+
  static VALUE _llama_context_n_ctx(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2379,6 +2409,24 @@ private:
  return UINT2NUM(llama_n_seq_max(ptr->ctx));
  }

+ static VALUE _llama_context_n_threads(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_threads(ptr->ctx));
+ }
+
+ static VALUE _llama_context_n_threads_batch(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_threads_batch(ptr->ctx));
+ }
+
  static VALUE _llama_context_get_timings(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -3430,6 +3478,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STABLELM2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.15.2'
+ VERSION = '0.15.3'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2917'
+ LLAMA_CPP_VERSION = 'b2988'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -26,6 +26,7 @@ module LLaMACpp
  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
  LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
  LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+ LLAMA_VOCAB_PRE_TYPE_STABLELM2: Integer
  LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
  LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
  LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
@@ -241,10 +242,13 @@ module LLaMACpp
  def embeddings_seq: (Integer) -> Array[Float]
  def decode: (::LLaMACpp::Batch) -> void
  def logits: () -> Array[Float]
+ def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
  def n_ctx: () -> Integer
  def n_batch: () -> Integer
  def n_ubatch: () -> Integer
  def n_seq_max: () -> Integer
+ def n_threads: () -> Integer
+ def n_threads_batch: () -> Integer
  def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
@@ -381,15 +381,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
  CUDA_POWER_ARCH = 1
  endif

+ ifneq ($(filter loongarch64%,$(UNAME_M)),)
+ MK_CFLAGS += -mlasx
+ MK_CXXFLAGS += -mlasx
+ endif
+
  else
  MK_CFLAGS += -march=rv64gcv -mabi=lp64d
  MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
  endif

- ifdef LLAMA_QKK_64
- MK_CPPFLAGS += -DGGML_QKK_64
- endif
-
  ifndef LLAMA_NO_ACCELERATE
  # Mac OS - include Accelerate framework.
  # `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -401,13 +402,6 @@ ifndef LLAMA_NO_ACCELERATE
  endif
  endif # LLAMA_NO_ACCELERATE

- ifdef LLAMA_MPI
- MK_CPPFLAGS += -DGGML_USE_MPI
- MK_CFLAGS += -Wno-cast-qual
- MK_CXXFLAGS += -Wno-cast-qual
- OBJS += ggml-mpi.o
- endif # LLAMA_MPI
-
  ifdef LLAMA_OPENBLAS
  MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
  MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -631,11 +625,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
  endif
  endif # LLAMA_METAL

- ifdef LLAMA_MPI
- ggml-mpi.o: ggml-mpi.c ggml-mpi.h
- $(CC) $(CFLAGS) -c $< -o $@
- endif # LLAMA_MPI
-
  ifndef LLAMA_NO_LLAMAFILE
  sgemm.o: sgemm.cpp sgemm.h ggml.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -760,7 +749,7 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^

  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
  rm -vrf ggml-cuda/*.o

  #
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
  // QK = number of values after dequantization
  // QK_K = super-block size

- #ifdef GGML_QKK_64
- #define QK_K 64
- #define K_SCALE_SIZE 4
- #else
  #define QK_K 256
  #define K_SCALE_SIZE 12
- #endif // GGML_QKK_64

  #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
  // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
  #define QI4_NL (QK4_NL / (4*QR4_NL))
  #define QR4_NL 2

- #if QK_K == 64
- #define QI4_XS QI4_NL
- #define QR4_XS QR4_NL
- #else
  #define QI4_XS (QK_K / (4*QR4_XS))
  #define QR4_XS 8
- #endif

  #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
  // weight is represented as x = a * q
  // 16 blocks of 16 elements each
  // Effectively 3.4375 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- uint8_t hmask[QK_K/8]; // quants - high bit
- uint8_t qs[QK_K/4]; // quants - low 2 bits
- uint8_t scales[2];
- ggml_half d; // super-block scale
- } block_q3_K;
- static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
- #else
  typedef struct {
  uint8_t hmask[QK_K/8]; // quants - high bit
  uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
  ggml_half d; // super-block scale
  } block_q3_K;
  static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
- #endif

  // 4-bit quantization
  // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 4.5 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- ggml_half d[2]; // super-block scales/mins
- uint8_t scales[2]; // 4-bit block scales/mins
- uint8_t qs[QK_K/2]; // 4--bit quants
- } block_q4_K;
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
- #else
  typedef struct {
  union {
  struct {
@@ -270,21 +242,11 @@ typedef struct {
  uint8_t qs[QK_K/2]; // 4--bit quants
  } block_q4_K;
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
- #endif

  // 5-bit quantization
  // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 5.5 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- ggml_half d; // super-block scale
- int8_t scales[QK_K/16]; // 8-bit block scales
- uint8_t qh[QK_K/8]; // quants, high bit
- uint8_t qs[QK_K/2]; // quants, low 4 bits
- } block_q5_K;
- static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
- #else
  typedef struct {
  union {
  struct {
@@ -298,7 +260,6 @@ typedef struct {
  uint8_t qs[QK_K/2]; // quants, low 4 bits
  } block_q5_K;
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
- #endif

  // 6-bit quantization
  // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
  static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

  // 3.4375 bpw
- #if QK_K == 64
- #define IQ3S_N_SCALE 2
- #else
  #define IQ3S_N_SCALE QK_K/64
- #endif
  typedef struct {
  ggml_half d;
  uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
  typedef struct {
  uint8_t qs[QK_K/8]; // grid index, low 8 bits
  uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
- #if QK_K == 64
- ggml_half d;
- #endif
  uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
  } block_iq1_m;
- #if QK_K == 64
- static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
- #else
  static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
- #endif

  // Used by IQ1_M quants
  typedef union {
@@ -406,9 +356,6 @@ typedef struct {
  } block_iq4_nl;
  static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");

- #if QK_K == 64
- #define block_iq4_xs block_iq4_nl
- #else
  typedef struct {
  ggml_half d;
  uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
  uint8_t qs[QK_K/2];
  } block_iq4_xs;
  static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
- #endif

  #endif // GGML_COMMON_DECL
  #endif // GGML_COMMON_DECL
@@ -43,19 +43,59 @@
  #include <mutex>
  #include <stdint.h>
  #include <stdio.h>
+ #include <stdarg.h>
+ #include <stdlib.h>
  #include <string>
  #include <vector>

  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

+ static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+ GGML_UNUSED(level);
+ GGML_UNUSED(user_data);
+ fprintf(stderr, "%s", msg);
+ }
+
+ ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+ void * ggml_cuda_log_user_data = NULL;
+
+ GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+ ggml_cuda_log_callback = log_callback;
+ ggml_cuda_log_user_data = user_data;
+ }
+
+ #define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+ #define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+ #define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+ GGML_ATTRIBUTE_FORMAT(2, 3)
+ static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+ if (ggml_cuda_log_callback != NULL) {
+ va_list args;
+ va_start(args, format);
+ char buffer[128];
+ int len = vsnprintf(buffer, 128, format, args);
+ if (len < 128) {
+ ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+ } else {
+ std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+ va_end(args);
+ va_start(args, format);
+ vsnprintf(&buffer2[0], buffer2.size(), format, args);
+ ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+ }
+ va_end(args);
+ }
+ }
+
  [[noreturn]]
  void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
  int id = -1; // in case cudaGetDevice fails
  cudaGetDevice(&id);

- fprintf(stderr, "CUDA error: %s\n", msg);
- fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
- fprintf(stderr, " %s\n", stmt);
+ GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+ GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
+ GGML_CUDA_LOG_ERROR(" %s\n", stmt);
  // abort with GGML_ASSERT to get a stack trace
  GGML_ASSERT(!"CUDA error");
  }
@@ -91,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

  cudaError_t err = cudaGetDeviceCount(&info.device_count);
  if (err != cudaSuccess) {
- fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+ GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
  return info;
  }

@@ -99,16 +139,16 @@ static ggml_cuda_device_info ggml_cuda_init() {

  int64_t total_vram = 0;
  #if defined(GGML_CUDA_FORCE_MMQ)
- fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+ GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
  #else
- fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+ GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
  #endif
  #if defined(CUDA_USE_TENSOR_CORES)
- fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+ GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
  #else
- fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+ GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
  #endif
- fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+ GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
  for (int id = 0; id < info.device_count; ++id) {
  int device_vmm = 0;

@@ -129,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
- fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+ GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

  info.default_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;
@@ -235,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
  *actual_size = look_ahead_size;
  pool_size += look_ahead_size;
  #ifdef DEBUG_CUDA_MALLOC
- fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
- (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
+ GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+ (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
  #endif
  return ptr;
  }
@@ -250,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
  return;
  }
  }
- fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+ GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
  ggml_cuda_set_device(device);
  CUDA_CHECK(cudaFree(ptr));
  pool_size -= size;
@@ -499,7 +539,9 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
  void * dev_ptr;
  cudaError_t err = cudaMalloc(&dev_ptr, size);
  if (err != cudaSuccess) {
- fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+ // clear the error
+ cudaGetLastError();
+ GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
  return nullptr;
  }

@@ -1002,8 +1044,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
  if (err != cudaSuccess) {
  // clear the error
  cudaGetLastError();
- fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
- size/1024.0/1024.0, cudaGetErrorString(err));
+ GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+ size / 1024.0 / 1024.0, cudaGetErrorString(err));
  return nullptr;
  }

@@ -2246,7 +2288,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
  break;
  case GGML_OP_MUL_MAT:
  if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+ GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
  return false;
  } else {
  ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2300,7 +2342,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
- fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+ GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
  CUDA_CHECK(err);
  }

@@ -2476,7 +2518,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
  cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
  #endif
  }
  }
@@ -2523,14 +2565,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
  use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
  #endif
  }

  if (node->op == GGML_OP_MUL_MAT_ID) {
  use_cuda_graph = false; // This node type is not supported by CUDA graph capture
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
  #endif
  }

@@ -2539,7 +2581,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  // Changes in batch size or context size can cause changes to the grid size of some kernels.
  use_cuda_graph = false;
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
  #endif
  }

@@ -2567,7 +2609,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
  cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
  #endif
  }
  }
@@ -2605,7 +2647,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t

  bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
  if (!ok) {
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
  }
  GGML_ASSERT(ok);
  }
@@ -2624,7 +2666,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  use_cuda_graph = false;
  cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
  #endif
  } else {
  graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2691,7 +2733,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
  if (stat == cudaErrorGraphExecUpdateFailure) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+ GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
  #endif
  // The pre-existing graph exec cannot be updated due to violated constraints
  // so instead clear error and re-instantiate
@@ -2948,13 +2990,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {

  GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
  if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
- fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+ GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
  return nullptr;
  }

  ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
  if (ctx == nullptr) {
- fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+ GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
  return nullptr;
  }

@@ -2998,8 +3040,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
  // clear the error
  cudaGetLastError();

- fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
- size/1024.0/1024.0, cudaGetErrorString(err));
+ GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+ size / 1024.0 / 1024.0, cudaGetErrorString(err));
  return false;
  }
  return true;
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
  GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
  GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);

+ GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
  #ifdef __cplusplus
  }
  #endif
@@ -17,6 +17,18 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

+ #if defined(_WIN32)
+
+ #define m512bh(p) p
+ #define m512i(p) p
+
+ #else
+
+ #define m512bh(p) (__m512bh)(p)
+ #define m512i(p) (__m512i)(p)
+
+ #endif
+
  /**
  * Converts brain16 to float32.
  *
@@ -443,6 +455,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
  #include <riscv_vector.h>
  #endif

+ #if defined(__loongarch64)
+ #if defined(__loongarch_asx)
+ #include <lasxintrin.h>
+ #endif
+ #if defined(__loongarch_sx)
+ #include <lsxintrin.h>
+ #endif
+ #endif
+
+ #if defined(__loongarch_asx)
+
+ typedef union {
+ int32_t i;
+ float f;
+ } ft_union;
+
+ /* float type data load instructions */
+ static __m128 __lsx_vreplfr2vr_s(float val) {
+ ft_union fi_tmpval = {.f = val};
+ return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+ }
+
+ static __m256 __lasx_xvreplfr2vr_s(float val) {
+ ft_union fi_tmpval = {.f = val};
+ return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+ }
+ #endif
+
  #ifdef __F16C__

  #ifdef _MSC_VER
@@ -1677,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
  } break;
  case GGML_OP_ROPE:
  {
+ #pragma message("TODO: implement phi3 frequency factors support")
+ #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
+ GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
  GGML_ASSERT(ne10 == ne02);
  GGML_ASSERT(src0t == dstt);
  // const int n_past = ((int32_t *) dst->op_params)[0];