llama_cpp 0.5.2 → 0.5.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 9e38c82f6ce7404a78b3ecdbc9574ae860322e6945499f0c4a905956bcbd2be7
- data.tar.gz: 4a5effb6fcf3182baad091717bc510176eb127ccd660342ce0cc46bf2d392b4a
+ metadata.gz: c45589a61587acfbe88add77ffb135a7949619ba2936178c59126c24c30e23cc
+ data.tar.gz: 5866b5b5f8dab59432cc91beca290f927a8d1bc694f83c8ccbe366c6f636f47c
  SHA512:
- metadata.gz: c471bd6c6afee142945d03da1c4908355fe900a5f0c259583b7b65f97d495d07c5397d1b551da888a5970170944596959ddef73d2df803acf001b8d079d0affb
- data.tar.gz: 99cbb2d978723f9814d8ac7163f03c642a1ac6cabbd6cf09d003f563c629563a920d909ab797729f1e233f30d5776bf9f70f4c473919e5bf101d3e3f5fd6e938
+ metadata.gz: 89714a2a920172c1ddc4fff56a11390ad97db62eb6bd4eefe3ba9376132bd6646eda7569acea49ff3a1ce87486cac0e623cac4fddfcb2b57629c30ee3457d38b
+ data.tar.gz: 2c5494528f55b86c57fccb18658350058210e85f31e9ecb8b3587a4da68a1465a0987a98e86dbae16fa5fea9a9502aed96afec3679e5553a82805f4436ef3020
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## [[0.5.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.2...v0.5.3)] - 2023-09-23
+
+ - Bump bundled llama.cpp from b1 to b1266.
+
  ## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16

  - Bump bundled llama.cpp from b1198 to b1.
@@ -1046,7 +1046,7 @@ private:

  llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
  LLaMAModelWrapper* ptr = get_llama_model(self);
- const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), tokens, n_max_tokens, add_bos);
+ const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);

  if (n_tokens < 0) {
  rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
@@ -1585,7 +1585,7 @@ private:
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
- const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
+ const int n = llama_tokenize(ptr->ctx, text.c_str(), text.size(), tokens.data(), n_max_tokens, add_bos);
  if (n < 0) {
  rb_raise(rb_eRuntimeError, "Failed to tokenize");
  return Qnil;
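
Note on the two tokenization hunks above: bundled llama.cpp b1266 adds an explicit text-length argument to llama_tokenize and llama_tokenize_with_model, and a negative return value still reports (as its negation) how many tokens the buffer would have needed. A minimal C++ sketch of calling the updated API, assuming llama.cpp at roughly this tag and an already-loaded llama_model; the tokenize_text helper name is made up for the example:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Hypothetical helper: tokenize with a retry when the first buffer guess
    // is too small. Mirrors the call shape shown in the diff above.
    std::vector<llama_token> tokenize_text(const llama_model * model,
                                           const std::string & text,
                                           bool add_bos) {
      std::vector<llama_token> tokens(text.size() + 1); // rough upper bound
      int n = llama_tokenize_with_model(model, text.c_str(), (int) text.size(),
                                        tokens.data(), (int) tokens.size(), add_bos);
      if (n < 0) {
        tokens.resize(-n); // -n is the number of tokens actually required
        n = llama_tokenize_with_model(model, text.c_str(), (int) text.size(),
                                      tokens.data(), (int) tokens.size(), add_bos);
      }
      tokens.resize(n);
      return tokens;
    }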
@@ -131,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
  return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
  }

+ static bool ggml_is_view(struct ggml_tensor * t) {
+ return t->view_src != NULL;
+ }
+
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  #ifdef GGML_ALLOCATOR_DEBUG
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
@@ -338,8 +342,8 @@ static void free_vmem(void * base_addr, size_t size) {

  // allocate uncommitted virtual memory to measure the size of the graph
  static void alloc_measure_vmem(void ** base_addr, size_t * size) {
- // 1TB for 64-bit, 1GB for 32-bit
- *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+ // 128GB for 64-bit, 1GB for 32-bit
+ *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
  do {
  *base_addr = alloc_vmem(*size);
  if (*base_addr != NULL) {
@@ -399,10 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {

  //////////// compute graph allocator

- static bool ggml_is_view(struct ggml_tensor * t) {
- return t->view_src != NULL;
- }
-
  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
  if (a->type != b->type) {
  return false;
@@ -31,6 +31,9 @@
  #define cublasSetStream hipblasSetStream
  #define cublasSgemm hipblasSgemm
  #define cublasStatus_t hipblasStatus_t
+ #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+ #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+ #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
  #define cudaDeviceProp hipDeviceProp_t
  #define cudaDeviceSynchronize hipDeviceSynchronize
  #define cudaError_t hipError_t
@@ -61,7 +64,7 @@
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
  #define cudaStreamNonBlocking hipStreamNonBlocking
  #define cudaStreamSynchronize hipStreamSynchronize
- #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+ #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
  #define cudaStream_t hipStream_t
  #define cudaSuccess hipSuccess
  #else
@@ -190,6 +193,12 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  } while (0)
  #endif // CUDART_VERSION >= 11

+ #if CUDART_VERSION >= 11100
+ #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+ #else
+ #define GGML_CUDA_ASSUME(x)
+ #endif // CUDART_VERSION >= 11100
+
  #ifdef GGML_CUDA_F16
  typedef half dfloat; // dequantize float
  typedef half2 dfloat2;
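
Note on the GGML_CUDA_ASSUME hunk above: the macro forwards to __builtin_assume only when CUDART_VERSION >= 11100 and expands to nothing otherwise, so the optimizer hint never breaks builds with older toolkits. A standalone C++ sketch of the same pattern; MY_ASSUME and lane_of are illustrative names, not part of ggml, and the GCC fallback is an assumption rather than the library's code:

    // An "assume" hint that degrades gracefully when the compiler lacks it.
    #if defined(__clang__)
    #define MY_ASSUME(x) __builtin_assume(x)
    #elif defined(__GNUC__)
    #define MY_ASSUME(x) do { if (!(x)) __builtin_unreachable(); } while (0)
    #else
    #define MY_ASSUME(x) ((void) 0)
    #endif

    // With the range hints, the compiler may reduce k % 32 to a simple mask.
    int lane_of(int k) {
      MY_ASSUME(k >= 0);
      MY_ASSUME(k < 32);
      return k % 32;
    }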
@@ -418,6 +427,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
  #endif

+ #ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+ #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+ #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
  #define MUL_MAT_SRC1_COL_STRIDE 128

  #define MAX_STREAMS 8
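
Note on the GGML_CUDA_PEER_MAX_BATCH_SIZE hunk above: the #ifndef guard makes the 128-token default a compile-time tunable, and a later hunk uses it in ggml_cuda_set_peer_access to decide whether peer-to-peer access between GPUs is enabled for the current batch. A small C++ sketch of the override pattern; the PEER_MAX_BATCH_SIZE name and file name are invented for the example:

    // example.cpp -- build as-is for the default, or override the constant:
    //   g++ example.cpp && ./a.out
    //   g++ -DPEER_MAX_BATCH_SIZE=64 example.cpp && ./a.out
    #include <cstdio>

    #ifndef PEER_MAX_BATCH_SIZE
    #define PEER_MAX_BATCH_SIZE 128 // used only when no -D flag overrides it
    #endif

    int main() {
      std::printf("peer access is considered for batches of up to %d tokens\n",
                  PEER_MAX_BATCH_SIZE);
      return 0;
    }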
@@ -2145,10 +2158,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_0;
  const int kqsx = k % QI4_0;
@@ -2239,10 +2252,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_1;
  const int kqsx = k % QI4_1;
@@ -2331,10 +2344,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_0;
  const int kqsx = k % QI5_0;
@@ -2445,10 +2458,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_1;
  const int kqsx = k % QI5_1;
@@ -2551,10 +2564,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI8_0;
  const int kqsx = k % QI8_0;
@@ -2642,10 +2655,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI2_K;
  const int kqsx = k % QI2_K;
@@ -2763,10 +2776,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI3_K;
  const int kqsx = k % QI3_K;
@@ -2981,10 +2994,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_K; // == 0 if QK_K == 256
  const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3162,10 +3175,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_K; // == 0 if QK_K == 256
  const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3291,10 +3304,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI6_K; // == 0 if QK_K == 256
  const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -6252,6 +6265,43 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
  }
  }

+ void ggml_cuda_set_peer_access(const int n_tokens) {
+ static bool peer_access_enabled = false;
+
+ const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+ if (peer_access_enabled == enable_peer_access) {
+ return;
+ }
+
+ #ifdef NDEBUG
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));
+
+ for (int id_other = 0; id_other < g_device_count; ++id_other) {
+ if (id == id_other) {
+ continue;
+ }
+ if (id != g_main_device && id_other != g_main_device) {
+ continue;
+ }
+
+ int can_access_peer;
+ CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+ if (can_access_peer) {
+ if (enable_peer_access) {
+ CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+ } else {
+ CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+ }
+ }
+ }
+ }
+ #endif // NDEBUG
+
+ peer_access_enabled = enable_peer_access;
+ }
+
  static void ggml_cuda_op_mul_mat(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
  const bool convert_src1_to_q8_1) {
@@ -6276,6 +6326,8 @@ static void ggml_cuda_op_mul_mat(
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

+ ggml_cuda_set_peer_access(ne11);
+
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);

@@ -6408,7 +6460,7 @@ static void ggml_cuda_op_mul_mat(

  // wait for main GPU data if necessary
  if (split && (id != g_main_device || is != 0)) {
- CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
  }

  for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
@@ -6530,7 +6582,7 @@ static void ggml_cuda_op_mul_mat(
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  for (int64_t id = 0; id < g_device_count; ++id) {
  for (int64_t is = 0; is < is_max; ++is) {
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
  }
  }
  }
@@ -6964,6 +7016,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
  return;
  }
  if (g_scratch_buffer == nullptr) {
+ ggml_cuda_set_device(g_main_device);
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
  }

@@ -7003,7 +7056,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
  ggml_cuda_assign_buffers_impl(tensor, false, true, false);
  }

- void ggml_cuda_set_main_device(int main_device) {
+ void ggml_cuda_set_main_device(const int main_device) {
  if (main_device >= g_device_count) {
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
  main_device, g_device_count, g_main_device);
@@ -7017,11 +7070,11 @@ void ggml_cuda_set_main_device(int main_device) {
  }
  }

- void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
  g_mul_mat_q = mul_mat_q;
  }

- void ggml_cuda_set_scratch_size(size_t scratch_size) {
+ void ggml_cuda_set_scratch_size(const size_t scratch_size) {
  g_scratch_size = scratch_size;
  }

@@ -66,6 +66,7 @@ struct ggml_metal_context {
  GGML_METAL_DECL_KERNEL(soft_max_4);
  GGML_METAL_DECL_KERNEL(diag_mask_inf);
  GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
+ GGML_METAL_DECL_KERNEL(get_rows_f32);
  GGML_METAL_DECL_KERNEL(get_rows_f16);
  GGML_METAL_DECL_KERNEL(get_rows_q4_0);
  GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@@ -77,6 +78,7 @@ struct ggml_metal_context {
  GGML_METAL_DECL_KERNEL(get_rows_q6_K);
  GGML_METAL_DECL_KERNEL(rms_norm);
  GGML_METAL_DECL_KERNEL(norm);
+ GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
@@ -88,6 +90,7 @@ struct ggml_metal_context {
  GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
  GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
+ GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
  GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
  GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
  GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
@@ -145,7 +148,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  ctx->n_buffers = 0;
  ctx->concur_list_len = 0;

- ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
+ ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

  #ifdef GGML_SWIFT
  // load the default.metallib file
@@ -175,7 +178,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {

  //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
  NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
- NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+ NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
  metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);

  NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
@@ -224,6 +227,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  GGML_METAL_ADD_KERNEL(soft_max_4);
  GGML_METAL_ADD_KERNEL(diag_mask_inf);
  GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
+ GGML_METAL_ADD_KERNEL(get_rows_f32);
  GGML_METAL_ADD_KERNEL(get_rows_f16);
  GGML_METAL_ADD_KERNEL(get_rows_q4_0);
  GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@@ -235,6 +239,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  GGML_METAL_ADD_KERNEL(get_rows_q6_K);
  GGML_METAL_ADD_KERNEL(rms_norm);
  GGML_METAL_ADD_KERNEL(norm);
+ GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
@@ -246,6 +251,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
  GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
+ GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
  GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
  GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
  GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
@@ -293,7 +299,9 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
  GGML_METAL_DEL_KERNEL(gelu);
  GGML_METAL_DEL_KERNEL(soft_max);
  GGML_METAL_DEL_KERNEL(soft_max_4);
+ GGML_METAL_DEL_KERNEL(diag_mask_inf);
  GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
+ GGML_METAL_DEL_KERNEL(get_rows_f32);
  GGML_METAL_DEL_KERNEL(get_rows_f16);
  GGML_METAL_DEL_KERNEL(get_rows_q4_0);
  GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@@ -305,6 +313,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
  GGML_METAL_DEL_KERNEL(get_rows_q6_K);
  GGML_METAL_DEL_KERNEL(rms_norm);
  GGML_METAL_DEL_KERNEL(norm);
+ GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
@@ -316,6 +325,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
  GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
  GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
+ GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
  GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
  GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
  GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
@@ -386,6 +396,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
  for (int i = 0; i < ctx->n_buffers; ++i) {
  const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;

+ //metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
  if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
  *offs = (size_t) ioffs;

@@ -723,6 +734,7 @@ void ggml_metal_graph_compute(
  case GGML_OP_ADD:
  {
  GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(src1));

  // utilize float4
  GGML_ASSERT(ne00 % 4 == 0);
@@ -730,6 +742,7 @@ void ggml_metal_graph_compute(

  if (ggml_nelements(src1) == ne10) {
  // src1 is a row
+ GGML_ASSERT(ne11 == 1);
  [encoder setComputePipelineState:ctx->pipeline_add_row];
  } else {
  [encoder setComputePipelineState:ctx->pipeline_add];
@@ -746,6 +759,7 @@ void ggml_metal_graph_compute(
  case GGML_OP_MUL:
  {
  GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(src1));

  // utilize float4
  GGML_ASSERT(ne00 % 4 == 0);
@@ -753,6 +767,7 @@ void ggml_metal_graph_compute(

  if (ggml_nelements(src1) == ne10) {
  // src1 is a row
+ GGML_ASSERT(ne11 == 1);
  [encoder setComputePipelineState:ctx->pipeline_mul_row];
  } else {
  [encoder setComputePipelineState:ctx->pipeline_mul];
@@ -768,6 +783,8 @@ void ggml_metal_graph_compute(
  } break;
  case GGML_OP_SCALE:
  {
+ GGML_ASSERT(ggml_is_contiguous(src0));
+
  const float scale = *(const float *) src1->data;

  [encoder setComputePipelineState:ctx->pipeline_scale];
@@ -867,13 +884,14 @@ void ggml_metal_graph_compute(

  // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
  // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
- if (ggml_is_contiguous(src0) &&
- ggml_is_contiguous(src1) &&
+ if (!ggml_is_transposed(src0) &&
+ !ggml_is_transposed(src1) &&
  src1t == GGML_TYPE_F32 &&
  [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
  ne00%32 == 0 &&
  ne11 > 1) {
  switch (src0->type) {
+ case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
  case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
  case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
  case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
@@ -893,9 +911,12 @@ void ggml_metal_graph_compute(
  [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
  [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
  [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
- [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
- [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
- [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
+ [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8];
+ [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
+ [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11];
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
+ [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
  [encoder setThreadgroupMemoryLength:8192 atIndex:0];
  [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
  } else {
@@ -905,6 +926,11 @@ void ggml_metal_graph_compute(

  // use custom matrix x vector kernel
  switch (src0t) {
+ case GGML_TYPE_F32:
+ {
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
+ nrows = 4;
+ } break;
  case GGML_TYPE_F16:
  {
  nth0 = 32;
@@ -1045,6 +1071,7 @@ void ggml_metal_graph_compute(
  case GGML_OP_GET_ROWS:
  {
  switch (src0->type) {
+ case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_get_rows_f32]; break;
  case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
  case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
  case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
@@ -1060,9 +1087,9 @@ void ggml_metal_graph_compute(
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
- [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
- [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
- [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5];
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
+ [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5];

  const int64_t n = ggml_nelements(src1);