llama_cpp 0.15.2 → 0.15.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 30dd4c29b86098faf7c78de5fa8e57021b631bb5eb3d14c93f63f1d186383ab8
- data.tar.gz: b011d891f1cd725f84821428a8db24004b52c9614e785f493f721f7abde71029
+ metadata.gz: d0a9cdf86695522e27b1e8d3ed485dfa6ab3a4fc23d9bd9e44bf8c3cb483c347
+ data.tar.gz: 5d97cec87f9b1df94f85f9e18dc46a1b8a4ec593c17d04e4bee0da3d28c34211
  SHA512:
- metadata.gz: 6c1628f93762747688f802db8593946e8581c869f63c610669b45759f644b3d19b061825b788e328b6b984977112837586ed398b6118a8f8e5f0c7f6fd0eb2dd
- data.tar.gz: 2f8c3d9f1e6c0f6db7e0682995c8d34179d5405d32784bf00f04a3408cb5bf4c95557bfa1692026f8d3dc9e672d6b15dec5d33cbd76ddc1d94e5ec964a9d0409
+ metadata.gz: 71f26009b872db64d0d0d416153b5fbd6afb598617b701cb6342d099542c962f410bccddf80b77928bfd8ab8f017a749fbc1d2ed488139d806ef0e3cf75a0e42
+ data.tar.gz: 808c03f6664af65cadfea23071d0b55d459c119189346762ea9632156f7f35b8d1f0e594b356726fc26abdb1c81a3bce9d697b9ca2d6324c454a31f2a442f0d7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
+ ## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25
+
+ - Bump llama.cpp from b2917 to b2988.
+ - Add constants for pre-tokenization types.
+ - Add `n_threads` method to `Context`.
+ - Add `n_threads_batch` method to `Context`.
+ - Add `set_n_threads` method to `Context`.
+
  ## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18

  - Bump llama.cpp from b2839 to b2917.
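The new `Context` thread methods listed above wrap llama.cpp's `llama_set_n_threads` / `llama_n_threads` / `llama_n_threads_batch` calls (see the C extension and `llama_cpp.rbs` hunks below). A minimal usage sketch follows; the model path and the `Model`/`Context` constructor shapes are illustrative assumptions rather than part of this diff, while the thread methods and their keyword signature come from the declarations added below.

```ruby
require 'llama_cpp'

# Assumed setup (path is a placeholder); only the thread methods below are new in 0.15.3.
model   = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

context.n_threads        # => Integer, threads used for single-token decoding
context.n_threads_batch  # => Integer, threads used for batch/prompt processing

# Both keywords are required Integers; the extension raises ArgumentError otherwise.
context.set_n_threads(n_threads: 4, n_threads_batch: 8)

# The new pre-tokenization constant is a plain Integer mirroring llama.cpp's enum value.
LLaMACpp::LLAMA_VOCAB_PRE_TYPE_STABLELM2
```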
@@ -2122,10 +2122,13 @@ public:
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
  rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
+ rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
  rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
  rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
+ rb_define_method(rb_cLLaMAContext, "n_threads", RUBY_METHOD_FUNC(_llama_context_n_threads), 0);
+ rb_define_method(rb_cLLaMAContext, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_n_threads_batch), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2343,6 +2346,33 @@ private:
  return output;
  }

+ static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
+ VALUE kw_args = Qnil;
+ ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
+ VALUE kw_values[2] = { Qundef, Qundef };
+ rb_scan_args(argc, argv, ":", &kw_args);
+ rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+
+ VALUE n_threads = kw_values[0];
+ if (!RB_INTEGER_TYPE_P(n_threads)) {
+ rb_raise(rb_eArgError, "n_threads must be an integer");
+ return Qnil;
+ }
+ VALUE n_threads_batch = kw_values[1];
+ if (!RB_INTEGER_TYPE_P(n_threads_batch)) {
+ rb_raise(rb_eArgError, "n_threads_batch must be an integer");
+ return Qnil;
+ }
+
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_set_n_threads(ptr->ctx, NUM2UINT(n_threads), NUM2UINT(n_threads_batch));
+ return Qnil;
+ }
+
  static VALUE _llama_context_n_ctx(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -2379,6 +2409,24 @@ private:
  return UINT2NUM(llama_n_seq_max(ptr->ctx));
  }

+ static VALUE _llama_context_n_threads(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_threads(ptr->ctx));
+ }
+
+ static VALUE _llama_context_n_threads_batch(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_threads_batch(ptr->ctx));
+ }
+
  static VALUE _llama_context_get_timings(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -3430,6 +3478,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STABLELM2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.15.2'
+ VERSION = '0.15.3'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2917'
+ LLAMA_CPP_VERSION = 'b2988'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -26,6 +26,7 @@ module LLaMACpp
  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
  LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
  LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+ LLAMA_VOCAB_PRE_TYPE_STABLELM2: Integer
  LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
  LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
  LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
@@ -241,10 +242,13 @@ module LLaMACpp
  def embeddings_seq: (Integer) -> Array[Float]
  def decode: (::LLaMACpp::Batch) -> void
  def logits: () -> Array[Float]
+ def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
  def n_ctx: () -> Integer
  def n_batch: () -> Integer
  def n_ubatch: () -> Integer
  def n_seq_max: () -> Integer
+ def n_threads: () -> Integer
+ def n_threads_batch: () -> Integer
  def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
@@ -381,15 +381,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
  CUDA_POWER_ARCH = 1
  endif

+ ifneq ($(filter loongarch64%,$(UNAME_M)),)
+ MK_CFLAGS += -mlasx
+ MK_CXXFLAGS += -mlasx
+ endif
+
  else
  MK_CFLAGS += -march=rv64gcv -mabi=lp64d
  MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
  endif

- ifdef LLAMA_QKK_64
- MK_CPPFLAGS += -DGGML_QKK_64
- endif
-
  ifndef LLAMA_NO_ACCELERATE
  # Mac OS - include Accelerate framework.
  # `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -401,13 +402,6 @@ ifndef LLAMA_NO_ACCELERATE
  endif
  endif # LLAMA_NO_ACCELERATE

- ifdef LLAMA_MPI
- MK_CPPFLAGS += -DGGML_USE_MPI
- MK_CFLAGS += -Wno-cast-qual
- MK_CXXFLAGS += -Wno-cast-qual
- OBJS += ggml-mpi.o
- endif # LLAMA_MPI
-
  ifdef LLAMA_OPENBLAS
  MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
  MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -631,11 +625,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
  endif
  endif # LLAMA_METAL

- ifdef LLAMA_MPI
- ggml-mpi.o: ggml-mpi.c ggml-mpi.h
- $(CC) $(CFLAGS) -c $< -o $@
- endif # LLAMA_MPI
-
  ifndef LLAMA_NO_LLAMAFILE
  sgemm.o: sgemm.cpp sgemm.h ggml.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -760,7 +749,7 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^

  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
  rm -vrf ggml-cuda/*.o

  #
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
  // QK = number of values after dequantization
  // QK_K = super-block size

- #ifdef GGML_QKK_64
- #define QK_K 64
- #define K_SCALE_SIZE 4
- #else
  #define QK_K 256
  #define K_SCALE_SIZE 12
- #endif // GGML_QKK_64

  #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
  // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
  #define QI4_NL (QK4_NL / (4*QR4_NL))
  #define QR4_NL 2

- #if QK_K == 64
- #define QI4_XS QI4_NL
- #define QR4_XS QR4_NL
- #else
  #define QI4_XS (QK_K / (4*QR4_XS))
  #define QR4_XS 8
- #endif

  #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
  // weight is represented as x = a * q
  // 16 blocks of 16 elements each
  // Effectively 3.4375 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- uint8_t hmask[QK_K/8]; // quants - high bit
- uint8_t qs[QK_K/4]; // quants - low 2 bits
- uint8_t scales[2];
- ggml_half d; // super-block scale
- } block_q3_K;
- static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
- #else
  typedef struct {
  uint8_t hmask[QK_K/8]; // quants - high bit
  uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
  ggml_half d; // super-block scale
  } block_q3_K;
  static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
- #endif

  // 4-bit quantization
  // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 4.5 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- ggml_half d[2]; // super-block scales/mins
- uint8_t scales[2]; // 4-bit block scales/mins
- uint8_t qs[QK_K/2]; // 4--bit quants
- } block_q4_K;
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
- #else
  typedef struct {
  union {
  struct {
@@ -270,21 +242,11 @@ typedef struct {
  uint8_t qs[QK_K/2]; // 4--bit quants
  } block_q4_K;
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
- #endif

  // 5-bit quantization
  // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 5.5 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- ggml_half d; // super-block scale
- int8_t scales[QK_K/16]; // 8-bit block scales
- uint8_t qh[QK_K/8]; // quants, high bit
- uint8_t qs[QK_K/2]; // quants, low 4 bits
- } block_q5_K;
- static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
- #else
  typedef struct {
  union {
  struct {
@@ -298,7 +260,6 @@ typedef struct {
  uint8_t qs[QK_K/2]; // quants, low 4 bits
  } block_q5_K;
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
- #endif

  // 6-bit quantization
  // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
  static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

  // 3.4375 bpw
- #if QK_K == 64
- #define IQ3S_N_SCALE 2
- #else
  #define IQ3S_N_SCALE QK_K/64
- #endif
  typedef struct {
  ggml_half d;
  uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
  typedef struct {
  uint8_t qs[QK_K/8]; // grid index, low 8 bits
  uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
- #if QK_K == 64
- ggml_half d;
- #endif
  uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
  } block_iq1_m;
- #if QK_K == 64
- static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
- #else
  static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
- #endif

  // Used by IQ1_M quants
  typedef union {
@@ -406,9 +356,6 @@ typedef struct {
  } block_iq4_nl;
  static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");

- #if QK_K == 64
- #define block_iq4_xs block_iq4_nl
- #else
  typedef struct {
  ggml_half d;
  uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
  uint8_t qs[QK_K/2];
  } block_iq4_xs;
  static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
- #endif

  #endif // GGML_COMMON_DECL
  #endif // GGML_COMMON_DECL
@@ -43,19 +43,59 @@
  #include <mutex>
  #include <stdint.h>
  #include <stdio.h>
+ #include <stdarg.h>
+ #include <stdlib.h>
  #include <string>
  #include <vector>

  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

+ static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+ GGML_UNUSED(level);
+ GGML_UNUSED(user_data);
+ fprintf(stderr, "%s", msg);
+ }
+
+ ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+ void * ggml_cuda_log_user_data = NULL;
+
+ GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+ ggml_cuda_log_callback = log_callback;
+ ggml_cuda_log_user_data = user_data;
+ }
+
+ #define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+ #define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+ #define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+ GGML_ATTRIBUTE_FORMAT(2, 3)
+ static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+ if (ggml_cuda_log_callback != NULL) {
+ va_list args;
+ va_start(args, format);
+ char buffer[128];
+ int len = vsnprintf(buffer, 128, format, args);
+ if (len < 128) {
+ ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+ } else {
+ std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+ va_end(args);
+ va_start(args, format);
+ vsnprintf(&buffer2[0], buffer2.size(), format, args);
+ ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+ }
+ va_end(args);
+ }
+ }
+
  [[noreturn]]
  void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
  int id = -1; // in case cudaGetDevice fails
  cudaGetDevice(&id);

- fprintf(stderr, "CUDA error: %s\n", msg);
- fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
- fprintf(stderr, " %s\n", stmt);
+ GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+ GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
+ GGML_CUDA_LOG_ERROR(" %s\n", stmt);
  // abort with GGML_ASSERT to get a stack trace
  GGML_ASSERT(!"CUDA error");
  }
@@ -91,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

  cudaError_t err = cudaGetDeviceCount(&info.device_count);
  if (err != cudaSuccess) {
- fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+ GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
  return info;
  }

@@ -99,16 +139,16 @@ static ggml_cuda_device_info ggml_cuda_init() {

  int64_t total_vram = 0;
  #if defined(GGML_CUDA_FORCE_MMQ)
- fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+ GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
  #else
- fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+ GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
  #endif
  #if defined(CUDA_USE_TENSOR_CORES)
- fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+ GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
  #else
- fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+ GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
  #endif
- fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+ GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
  for (int id = 0; id < info.device_count; ++id) {
  int device_vmm = 0;

@@ -129,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
- fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+ GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

  info.default_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;
@@ -235,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
  *actual_size = look_ahead_size;
  pool_size += look_ahead_size;
  #ifdef DEBUG_CUDA_MALLOC
- fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
- (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
+ GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+ (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
  #endif
  return ptr;
  }
@@ -250,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
  return;
  }
  }
- fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+ GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
  ggml_cuda_set_device(device);
  CUDA_CHECK(cudaFree(ptr));
  pool_size -= size;
@@ -499,7 +539,9 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
  void * dev_ptr;
  cudaError_t err = cudaMalloc(&dev_ptr, size);
  if (err != cudaSuccess) {
- fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+ // clear the error
+ cudaGetLastError();
+ GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
  return nullptr;
  }

@@ -1002,8 +1044,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
  if (err != cudaSuccess) {
  // clear the error
  cudaGetLastError();
- fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
- size/1024.0/1024.0, cudaGetErrorString(err));
+ GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+ size / 1024.0 / 1024.0, cudaGetErrorString(err));
  return nullptr;
  }

@@ -2246,7 +2288,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
  break;
  case GGML_OP_MUL_MAT:
  if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+ GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
  return false;
  } else {
  ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2300,7 +2342,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
- fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+ GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
  CUDA_CHECK(err);
  }

@@ -2476,7 +2518,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
  cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
  #endif
  }
  }
@@ -2523,14 +2565,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
  use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
  #endif
  }

  if (node->op == GGML_OP_MUL_MAT_ID) {
  use_cuda_graph = false; // This node type is not supported by CUDA graph capture
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
  #endif
  }

@@ -2539,7 +2581,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  // Changes in batch size or context size can cause changes to the grid size of some kernels.
  use_cuda_graph = false;
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
  #endif
  }

@@ -2567,7 +2609,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
  cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
  #endif
  }
  }
@@ -2605,7 +2647,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t

  bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
  if (!ok) {
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
  }
  GGML_ASSERT(ok);
  }
@@ -2624,7 +2666,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  use_cuda_graph = false;
  cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
  #ifndef NDEBUG
- fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+ GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
  #endif
  } else {
  graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2691,7 +2733,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
  if (stat == cudaErrorGraphExecUpdateFailure) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+ GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
  #endif
  // The pre-existing graph exec cannot be updated due to violated constraints
  // so instead clear error and re-instantiate
@@ -2948,13 +2990,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {

  GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
  if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
- fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+ GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
  return nullptr;
  }

  ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
  if (ctx == nullptr) {
- fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+ GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
  return nullptr;
  }

@@ -2998,8 +3040,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
  // clear the error
  cudaGetLastError();

- fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
- size/1024.0/1024.0, cudaGetErrorString(err));
+ GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+ size / 1024.0 / 1024.0, cudaGetErrorString(err));
  return false;
  }
  return true;
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
  GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
  GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);

+ GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
  #ifdef __cplusplus
  }
  #endif
@@ -17,6 +17,18 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

+ #if defined(_WIN32)
+
+ #define m512bh(p) p
+ #define m512i(p) p
+
+ #else
+
+ #define m512bh(p) (__m512bh)(p)
+ #define m512i(p) (__m512i)(p)
+
+ #endif
+
  /**
  * Converts brain16 to float32.
  *
@@ -443,6 +455,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
  #include <riscv_vector.h>
  #endif

+ #if defined(__loongarch64)
+ #if defined(__loongarch_asx)
+ #include <lasxintrin.h>
+ #endif
+ #if defined(__loongarch_sx)
+ #include <lsxintrin.h>
+ #endif
+ #endif
+
+ #if defined(__loongarch_asx)
+
+ typedef union {
+ int32_t i;
+ float f;
+ } ft_union;
+
+ /* float type data load instructions */
+ static __m128 __lsx_vreplfr2vr_s(float val) {
+ ft_union fi_tmpval = {.f = val};
+ return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+ }
+
+ static __m256 __lasx_xvreplfr2vr_s(float val) {
+ ft_union fi_tmpval = {.f = val};
+ return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+ }
+ #endif
+
  #ifdef __F16C__

  #ifdef _MSC_VER
@@ -1677,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
  } break;
  case GGML_OP_ROPE:
  {
+ #pragma message("TODO: implement phi3 frequency factors support")
+ #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
+ GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
  GGML_ASSERT(ne10 == ne02);
  GGML_ASSERT(src0t == dstt);
  // const int n_past = ((int32_t *) dst->op_params)[0];