llama_cpp 0.5.2 → 0.5.3

This diff shows the changes between the publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 9e38c82f6ce7404a78b3ecdbc9574ae860322e6945499f0c4a905956bcbd2be7
- data.tar.gz: 4a5effb6fcf3182baad091717bc510176eb127ccd660342ce0cc46bf2d392b4a
+ metadata.gz: c45589a61587acfbe88add77ffb135a7949619ba2936178c59126c24c30e23cc
+ data.tar.gz: 5866b5b5f8dab59432cc91beca290f927a8d1bc694f83c8ccbe366c6f636f47c
  SHA512:
- metadata.gz: c471bd6c6afee142945d03da1c4908355fe900a5f0c259583b7b65f97d495d07c5397d1b551da888a5970170944596959ddef73d2df803acf001b8d079d0affb
- data.tar.gz: 99cbb2d978723f9814d8ac7163f03c642a1ac6cabbd6cf09d003f563c629563a920d909ab797729f1e233f30d5776bf9f70f4c473919e5bf101d3e3f5fd6e938
+ metadata.gz: 89714a2a920172c1ddc4fff56a11390ad97db62eb6bd4eefe3ba9376132bd6646eda7569acea49ff3a1ce87486cac0e623cac4fddfcb2b57629c30ee3457d38b
+ data.tar.gz: 2c5494528f55b86c57fccb18658350058210e85f31e9ecb8b3587a4da68a1465a0987a98e86dbae16fa5fea9a9502aed96afec3679e5553a82805f4436ef3020
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## [[0.5.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.2...v0.5.3)] - 2023-09-23
+
+ - Bump bundled llama.cpp from b1 to b1266.
+
  ## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16

  - Bump bundled llama.cpp from b1198 to b1.
@@ -1046,7 +1046,7 @@ private:

  llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
  LLaMAModelWrapper* ptr = get_llama_model(self);
- const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), tokens, n_max_tokens, add_bos);
+ const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);

  if (n_tokens < 0) {
  rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
@@ -1585,7 +1585,7 @@ private:
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
- const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
+ const int n = llama_tokenize(ptr->ctx, text.c_str(), text.size(), tokens.data(), n_max_tokens, add_bos);
  if (n < 0) {
  rb_raise(rb_eRuntimeError, "Failed to tokenize");
  return Qnil;
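Both hunks track an upstream llama.cpp API change: the tokenize calls now pass the text length explicitly (`text.size()`) alongside the C string, and a negative return value still signals that the output buffer was too small, with its magnitude giving the required token count. Below is a minimal C++ sketch of that calling convention; `tokenize_fn_t` is a hypothetical stand-in for the llama.cpp functions, and the resize-and-retry loop is just one way to use the convention (the binding above raises a Ruby error instead).

```cpp
#include <algorithm>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

using token_t = std::int32_t;
// Hypothetical stand-in for a llama_tokenize-style call:
// (text, text_len, out_tokens, capacity) -> token count, or a negative
// value whose magnitude is the number of tokens that would be needed.
using tokenize_fn_t = std::function<int(const char *, int, token_t *, int)>;

std::vector<token_t> tokenize_with_retry(const tokenize_fn_t & tokenize, const std::string & text) {
    std::vector<token_t> tokens(std::max<std::size_t>(text.size(), 8));
    int n = tokenize(text.c_str(), (int) text.size(), tokens.data(), (int) tokens.size());
    if (n < 0) {
        tokens.resize((std::size_t) -n);  // negative return reports the required capacity
        n = tokenize(text.c_str(), (int) text.size(), tokens.data(), (int) tokens.size());
    }
    tokens.resize((std::size_t) std::max(n, 0));
    return tokens;
}
```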
@@ -131,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
  return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
  }

+ static bool ggml_is_view(struct ggml_tensor * t) {
+ return t->view_src != NULL;
+ }
+
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  #ifdef GGML_ALLOCATOR_DEBUG
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
@@ -338,8 +342,8 @@ static void free_vmem(void * base_addr, size_t size) {

  // allocate uncommitted virtual memory to measure the size of the graph
  static void alloc_measure_vmem(void ** base_addr, size_t * size) {
- // 1TB for 64-bit, 1GB for 32-bit
- *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+ // 128GB for 64-bit, 1GB for 32-bit
+ *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
  do {
  *base_addr = alloc_vmem(*size);
  if (*base_addr != NULL) {
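The measurement allocator hunk shrinks the uncommitted 64-bit reservation from `1ULL<<40` to `1ULL<<37` bytes. A small standalone check of what those shifts amount to:

```cpp
#include <cstdio>

int main() {
    const unsigned long long GiB = 1ULL << 30;
    const unsigned long long size_32bit = 1ULL << 30;  // used when sizeof(void *) == 4
    const unsigned long long size_64bit = 1ULL << 37;  // new 64-bit value (was 1ULL << 40)
    std::printf("32-bit reservation: %llu GiB\n", size_32bit / GiB);  // prints 1
    std::printf("64-bit reservation: %llu GiB\n", size_64bit / GiB);  // prints 128 (was 1024)
    return 0;
}
```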
@@ -399,10 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {

  //////////// compute graph allocator

- static bool ggml_is_view(struct ggml_tensor * t) {
- return t->view_src != NULL;
- }
-
  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
  if (a->type != b->type) {
  return false;
@@ -31,6 +31,9 @@
  #define cublasSetStream hipblasSetStream
  #define cublasSgemm hipblasSgemm
  #define cublasStatus_t hipblasStatus_t
+ #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+ #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+ #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
  #define cudaDeviceProp hipDeviceProp_t
  #define cudaDeviceSynchronize hipDeviceSynchronize
  #define cudaError_t hipError_t
@@ -61,7 +64,7 @@
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
  #define cudaStreamNonBlocking hipStreamNonBlocking
  #define cudaStreamSynchronize hipStreamSynchronize
- #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+ #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
  #define cudaStream_t hipStream_t
  #define cudaSuccess hipSuccess
  #else
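The HIP compatibility macro now forwards a flags argument instead of hard-coding 0, matching the three-argument `cudaStreamWaitEvent(stream, event, flags)` form that the mul-mat call sites later in this diff use (they pass 0 explicitly). A minimal host-side sketch of that wait pattern, assuming the stream and event are created elsewhere:

```cpp
#include <cuda_runtime.h>

// Make `consumer` wait until `producer` has reached the recorded point.
// The third argument to cudaStreamWaitEvent is the flags word; 0 is the default.
void wait_on_stream(cudaStream_t producer, cudaStream_t consumer, cudaEvent_t done) {
    cudaEventRecord(done, producer);
    cudaStreamWaitEvent(consumer, done, 0);
}
```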
@@ -190,6 +193,12 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  } while (0)
  #endif // CUDART_VERSION >= 11

+ #if CUDART_VERSION >= 11100
+ #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+ #else
+ #define GGML_CUDA_ASSUME(x)
+ #endif // CUDART_VERSION >= 11100
+
  #ifdef GGML_CUDA_F16
  typedef half dfloat; // dequantize float
  typedef half2 dfloat2;
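GGML_CUDA_ASSUME wraps `__builtin_assume` so the range hints used by the tile-loading kernels further down compile away on toolchains that lack the builtin (here gated on `CUDART_VERSION >= 11100`, i.e. CUDA 11.1). A standalone sketch of the same pattern; the `MY_ASSUME` name and the exact feature tests are illustrative, not the library's:

```cpp
// Emit the optimizer hint where __builtin_assume exists, expand to nothing elsewhere.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))
#define MY_ASSUME(x) __builtin_assume(x)
#elif defined(__clang__)
#define MY_ASSUME(x) __builtin_assume(x)
#else
#define MY_ASSUME(x) ((void) 0)
#endif

int scaled_lane(int k) {
    MY_ASSUME(k >= 0);   // lets the compiler drop sign handling for k
    MY_ASSUME(k < 32);   // mirrors the k < WARP_SIZE hint in the kernels below
    return k * 2;
}
```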
@@ -418,6 +427,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
  #endif

+ #ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+ #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+ #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
  #define MUL_MAT_SRC1_COL_STRIDE 128

  #define MAX_STREAMS 8
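GGML_CUDA_PEER_MAX_BATCH_SIZE sits behind an `#ifndef` guard, so the 128-token default can be overridden at build time instead of by editing the source. A sketch of the pattern, with an assumed compiler invocation shown in the comment:

```cpp
// Default threshold with an escape hatch: a build can override it without
// touching the source, e.g. (assumed invocation)
//   nvcc -DGGML_CUDA_PEER_MAX_BATCH_SIZE=64 ...
#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
#endif

// Mirrors the condition used by ggml_cuda_set_peer_access later in this diff:
// peer access is enabled only for batches up to the threshold.
static inline bool enable_peer_access_for(int n_tokens) {
    return n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
}
```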
@@ -2145,10 +2158,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_0;
  const int kqsx = k % QI4_0;
@@ -2239,10 +2252,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_1;
  const int kqsx = k % QI4_1;
@@ -2331,10 +2344,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_0;
  const int kqsx = k % QI5_0;
@@ -2445,10 +2458,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_1;
  const int kqsx = k % QI5_1;
@@ -2551,10 +2564,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI8_0;
  const int kqsx = k % QI8_0;
@@ -2642,10 +2655,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI2_K;
  const int kqsx = k % QI2_K;
@@ -2763,10 +2776,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI3_K;
  const int kqsx = k % QI3_K;
@@ -2981,10 +2994,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_K; // == 0 if QK_K == 256
  const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3162,10 +3175,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_K; // == 0 if QK_K == 256
  const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3291,10 +3304,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI6_K; // == 0 if QK_K == 256
  const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -6252,6 +6265,43 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
  }
  }

+ void ggml_cuda_set_peer_access(const int n_tokens) {
+ static bool peer_access_enabled = false;
+
+ const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+ if (peer_access_enabled == enable_peer_access) {
+ return;
+ }
+
+ #ifdef NDEBUG
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));
+
+ for (int id_other = 0; id_other < g_device_count; ++id_other) {
+ if (id == id_other) {
+ continue;
+ }
+ if (id != g_main_device && id_other != g_main_device) {
+ continue;
+ }
+
+ int can_access_peer;
+ CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+ if (can_access_peer) {
+ if (enable_peer_access) {
+ CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+ } else {
+ CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+ }
+ }
+ }
+ }
+ #endif // NDEBUG
+
+ peer_access_enabled = enable_peer_access;
+ }
+
  static void ggml_cuda_op_mul_mat(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
  const bool convert_src1_to_q8_1) {
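The new ggml_cuda_set_peer_access walks every device pair that involves the main device and enables or disables peer access depending on the batch size (and only in NDEBUG builds). A self-contained sketch of the same CUDA runtime calls, without the CUDA_CHECK/HIP plumbing and the main-device filtering of the real function:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Toggle peer access between all device pairs that support it.
void set_peer_access(bool enable) {
    int n_devices = 0;
    cudaGetDeviceCount(&n_devices);

    for (int id = 0; id < n_devices; ++id) {
        cudaSetDevice(id);  // peer access is enabled from the current device's side
        for (int other = 0; other < n_devices; ++other) {
            if (other == id) {
                continue;
            }
            int can_access = 0;
            cudaDeviceCanAccessPeer(&can_access, id, other);
            if (!can_access) {
                continue;
            }
            const cudaError_t err = enable ? cudaDeviceEnablePeerAccess(other, 0)
                                           : cudaDeviceDisablePeerAccess(other);
            if (err != cudaSuccess) {
                std::fprintf(stderr, "peer access %d -> %d: %s\n", id, other, cudaGetErrorString(err));
            }
        }
    }
}
```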
@@ -6276,6 +6326,8 @@ static void ggml_cuda_op_mul_mat(
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

+ ggml_cuda_set_peer_access(ne11);
+
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);

@@ -6408,7 +6460,7 @@ static void ggml_cuda_op_mul_mat(

  // wait for main GPU data if necessary
  if (split && (id != g_main_device || is != 0)) {
- CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
  }

  for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
@@ -6530,7 +6582,7 @@ static void ggml_cuda_op_mul_mat(
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  for (int64_t id = 0; id < g_device_count; ++id) {
  for (int64_t is = 0; is < is_max; ++is) {
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
  }
  }
  }
@@ -6964,6 +7016,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
  return;
  }
  if (g_scratch_buffer == nullptr) {
+ ggml_cuda_set_device(g_main_device);
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
  }

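The added ggml_cuda_set_device call pins the scratch-buffer allocation to the main device: cudaMalloc allocates on whichever device is current, so the device must be selected before the first allocation. A minimal sketch of that ordering:

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// cudaMalloc allocates on the current device, so select the main device
// before creating the shared scratch buffer.
void * alloc_scratch_on(int main_device, std::size_t scratch_size) {
    void * buf = nullptr;
    cudaSetDevice(main_device);
    cudaMalloc(&buf, scratch_size);
    return buf;
}
```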
@@ -7003,7 +7056,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
  ggml_cuda_assign_buffers_impl(tensor, false, true, false);
  }

- void ggml_cuda_set_main_device(int main_device) {
+ void ggml_cuda_set_main_device(const int main_device) {
  if (main_device >= g_device_count) {
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
  main_device, g_device_count, g_main_device);
@@ -7017,11 +7070,11 @@ void ggml_cuda_set_main_device(int main_device) {
  }
  }

- void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
  g_mul_mat_q = mul_mat_q;
  }

- void ggml_cuda_set_scratch_size(size_t scratch_size) {
+ void ggml_cuda_set_scratch_size(const size_t scratch_size) {
  g_scratch_size = scratch_size;
  }

@@ -66,6 +66,7 @@ struct ggml_metal_context {
  GGML_METAL_DECL_KERNEL(soft_max_4);
  GGML_METAL_DECL_KERNEL(diag_mask_inf);
  GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
+ GGML_METAL_DECL_KERNEL(get_rows_f32);
  GGML_METAL_DECL_KERNEL(get_rows_f16);
  GGML_METAL_DECL_KERNEL(get_rows_q4_0);
  GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@@ -77,6 +78,7 @@ struct ggml_metal_context {
  GGML_METAL_DECL_KERNEL(get_rows_q6_K);
  GGML_METAL_DECL_KERNEL(rms_norm);
  GGML_METAL_DECL_KERNEL(norm);
+ GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
@@ -88,6 +90,7 @@ struct ggml_metal_context {
  GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
+ GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
  GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
  GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
  GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
@@ -145,7 +148,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  ctx->n_buffers = 0;
  ctx->concur_list_len = 0;

- ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
+ ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

  #ifdef GGML_SWIFT
  // load the default.metallib file
@@ -175,7 +178,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {

  //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
  NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
- NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+ NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
  metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);

  NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
@@ -224,6 +227,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  GGML_METAL_ADD_KERNEL(soft_max_4);
  GGML_METAL_ADD_KERNEL(diag_mask_inf);
  GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
+ GGML_METAL_ADD_KERNEL(get_rows_f32);
  GGML_METAL_ADD_KERNEL(get_rows_f16);
  GGML_METAL_ADD_KERNEL(get_rows_q4_0);
  GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@@ -235,6 +239,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  GGML_METAL_ADD_KERNEL(get_rows_q6_K);
  GGML_METAL_ADD_KERNEL(rms_norm);
  GGML_METAL_ADD_KERNEL(norm);
+ GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
@@ -246,6 +251,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
+ GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
  GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
  GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
  GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
@@ -293,7 +299,9 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
  GGML_METAL_DEL_KERNEL(gelu);
  GGML_METAL_DEL_KERNEL(soft_max);
  GGML_METAL_DEL_KERNEL(soft_max_4);
+ GGML_METAL_DEL_KERNEL(diag_mask_inf);
  GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
+ GGML_METAL_DEL_KERNEL(get_rows_f32);
  GGML_METAL_DEL_KERNEL(get_rows_f16);
  GGML_METAL_DEL_KERNEL(get_rows_q4_0);
  GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@@ -305,6 +313,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
  GGML_METAL_DEL_KERNEL(get_rows_q6_K);
  GGML_METAL_DEL_KERNEL(rms_norm);
  GGML_METAL_DEL_KERNEL(norm);
+ GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
@@ -316,6 +325,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
  GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
+ GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
  GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
  GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
  GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
@@ -386,6 +396,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
  for (int i = 0; i < ctx->n_buffers; ++i) {
  const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;

+ //metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
  if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
  *offs = (size_t) ioffs;

@@ -723,6 +734,7 @@ void ggml_metal_graph_compute(
  case GGML_OP_ADD:
  {
  GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(src1));

  // utilize float4
  GGML_ASSERT(ne00 % 4 == 0);
@@ -730,6 +742,7 @@ void ggml_metal_graph_compute(

  if (ggml_nelements(src1) == ne10) {
  // src1 is a row
+ GGML_ASSERT(ne11 == 1);
  [encoder setComputePipelineState:ctx->pipeline_add_row];
  } else {
  [encoder setComputePipelineState:ctx->pipeline_add];
@@ -746,6 +759,7 @@ void ggml_metal_graph_compute(
  case GGML_OP_MUL:
  {
  GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(src1));

  // utilize float4
  GGML_ASSERT(ne00 % 4 == 0);
@@ -753,6 +767,7 @@ void ggml_metal_graph_compute(

  if (ggml_nelements(src1) == ne10) {
  // src1 is a row
+ GGML_ASSERT(ne11 == 1);
  [encoder setComputePipelineState:ctx->pipeline_mul_row];
  } else {
  [encoder setComputePipelineState:ctx->pipeline_mul];
@@ -768,6 +783,8 @@ void ggml_metal_graph_compute(
  } break;
  case GGML_OP_SCALE:
  {
+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  const float scale = *(const float *) src1->data;

  [encoder setComputePipelineState:ctx->pipeline_scale];
@@ -867,13 +884,14 @@ void ggml_metal_graph_compute(

  // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
  // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
- if (ggml_is_contiguous(src0) &&
- ggml_is_contiguous(src1) &&
+ if (!ggml_is_transposed(src0) &&
+ !ggml_is_transposed(src1) &&
  src1t == GGML_TYPE_F32 &&
  [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
  ne00%32 == 0 &&
  ne11 > 1) {
  switch (src0->type) {
+ case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
  case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
  case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
  case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
@@ -893,9 +911,12 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
  [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
  [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
- [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
- [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
- [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
+ [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8];
+ [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
+ [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11];
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
+ [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
  [encoder setThreadgroupMemoryLength:8192 atIndex:0];
  [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
  } else {
@@ -905,6 +926,11 @@ void ggml_metal_graph_compute(

  // use custom matrix x vector kernel
  switch (src0t) {
+ case GGML_TYPE_F32:
+ {
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
+ nrows = 4;
+ } break;
  case GGML_TYPE_F16:
  {
  nth0 = 32;
@@ -1045,6 +1071,7 @@ void ggml_metal_graph_compute(
  case GGML_OP_GET_ROWS:
  {
  switch (src0->type) {
+ case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_get_rows_f32]; break;
  case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
  case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
  case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
@@ -1060,9 +1087,9 @@ void ggml_metal_graph_compute(
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
- [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
- [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
- [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5];
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
+ [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5];

  const int64_t n = ggml_nelements(src1);