llama_cpp 0.5.2 → 0.5.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -6
- data/ext/llama_cpp/src/ggml-cuda.cu +99 -46
- data/ext/llama_cpp/src/ggml-metal.m +37 -10
- data/ext/llama_cpp/src/ggml-metal.metal +144 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +68 -40
- data/ext/llama_cpp/src/ggml.h +43 -33
- data/ext/llama_cpp/src/llama.cpp +420 -57
- data/ext/llama_cpp/src/llama.h +5 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c45589a61587acfbe88add77ffb135a7949619ba2936178c59126c24c30e23cc
+  data.tar.gz: 5866b5b5f8dab59432cc91beca290f927a8d1bc694f83c8ccbe366c6f636f47c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 89714a2a920172c1ddc4fff56a11390ad97db62eb6bd4eefe3ba9376132bd6646eda7569acea49ff3a1ce87486cac0e623cac4fddfcb2b57629c30ee3457d38b
+  data.tar.gz: 2c5494528f55b86c57fccb18658350058210e85f31e9ecb8b3587a4da68a1465a0987a98e86dbae16fa5fea9a9502aed96afec3679e5553a82805f4436ef3020
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## [[0.5.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.2...v0.5.3)] - 2023-09-23
+
+- Bump bundled llama.cpp from b1231 to b1266.
+
 ## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16
 
 - Bump bundled llama.cpp from b1198 to b1231.
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1046,7 +1046,7 @@ private:
 
   llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
   LLaMAModelWrapper* ptr = get_llama_model(self);
-  const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), tokens, n_max_tokens, add_bos);
+  const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);
 
   if (n_tokens < 0) {
     rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
@@ -1585,7 +1585,7 @@ private:
     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
     return Qnil;
   }
-  const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
+  const int n = llama_tokenize(ptr->ctx, text.c_str(), text.size(), tokens.data(), n_max_tokens, add_bos);
   if (n < 0) {
     rb_raise(rb_eRuntimeError, "Failed to tokenize");
     return Qnil;
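The two hunks above track an upstream llama.cpp API change: `llama_tokenize` and `llama_tokenize_with_model` now take the byte length of the input text as an explicit argument. Below is a minimal C++ sketch of calling the updated function, assuming the `llama.h` bundled with this release; the buffer sizing and error handling are illustrative and simply mirror the wrapper code shown in the diff.

```cpp
// Hedged sketch: calling the updated tokenizer API that the hunks above adapt to.
// A negative return value means the output buffer was too small, as in the wrapper.
#include <string>
#include <vector>
#include "llama.h"

std::vector<llama_token> tokenize(llama_context * ctx, const std::string & text, bool add_bos) {
    // Reserve a generous upper bound; the exact sizing policy here is illustrative.
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));
    // The bundled llama.cpp now takes the byte length of `text` as an explicit argument,
    // which is the extra `text.size()` parameter added in the hunks above.
    const int n = llama_tokenize(ctx, text.c_str(), text.size(), tokens.data(), tokens.size(), add_bos);
    if (n < 0) {
        tokens.clear();   // caller decides how to report the failure
    } else {
        tokens.resize(n);
    }
    return tokens;
}
```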
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -131,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
     return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
 }
 
+static bool ggml_is_view(struct ggml_tensor * t) {
+    return t->view_src != NULL;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
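For context on the helper that moved: in ggml a tensor that is a view records its parent in `view_src` and borrows that parent's storage, which is why the allocator asserts it never hands such tensors their own buffer. A small C++ stand-in (not the real `ggml_tensor` definition) sketching that invariant:

```cpp
// Hedged sketch of the view check used above; names mirror the diff, the struct is illustrative.
#include <cassert>

struct tensor {
    tensor * view_src = nullptr;  // non-null when this tensor aliases another tensor's data
    void *   data     = nullptr;
};

static bool is_view(const tensor * t) {
    return t->view_src != nullptr;
}

static void allocr_alloc(tensor * t, void * buffer) {
    // Views borrow their parent's storage, so only non-views get a fresh data pointer,
    // which is what GGML_ASSERT(!ggml_is_view(tensor)) in the hunk enforces.
    assert(!is_view(t));
    t->data = buffer;
}
```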
@@ -338,8 +342,8 @@ static void free_vmem(void * base_addr, size_t size) {
 
 // allocate uncommitted virtual memory to measure the size of the graph
 static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    //
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<
+    // 128GB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
     do {
         *base_addr = alloc_vmem(*size);
         if (*base_addr != NULL) {
@@ -399,10 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 
 //////////// compute graph allocator
 
-static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->view_src != NULL;
-}
-
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
     if (a->type != b->type) {
         return false;
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -31,6 +31,9 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
@@ -61,7 +64,7 @@
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event,
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #else
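These defines keep ggml-cuda.cu single-source: the file is written against CUDA names, and a ROCm build remaps them to HIP at the preprocessor level; the fixed `cudaStreamWaitEvent` wrapper now forwards the flags argument instead of dropping it. A hedged C++ sketch of the pattern, assuming either toolkit is installed and using `GGML_USE_HIPBLAS` as the assumed build switch; everything else is illustrative.

```cpp
// Hedged sketch of the single-source CUDA/HIP pattern the defines above implement.
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
// Remap the CUDA spellings used below onto their HIP equivalents.
#define cudaStream_t                 hipStream_t
#define cudaEvent_t                  hipEvent_t
#define cudaStreamWaitEvent(s, e, f) hipStreamWaitEvent(s, e, f)   // forwards the flags argument
#else
#include <cuda_runtime.h>
#endif

// Code below this point is written once, against the CUDA names.
void wait_on_event(cudaStream_t stream, cudaEvent_t event) {
    // With the three-argument macro this compiles unchanged for both backends.
    cudaStreamWaitEvent(stream, event, 0);
}
```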
@@ -190,6 +193,12 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 } while (0)
 #endif // CUDART_VERSION >= 11
 
+#if CUDART_VERSION >= 11100
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
 #ifdef GGML_CUDA_F16
 typedef half dfloat; // dequantize float
 typedef half2 dfloat2;
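`GGML_CUDA_ASSUME` wraps `__builtin_assume`, which nvcc provides from CUDA 11.1 (hence the `CUDART_VERSION >= 11100` guard); on older toolkits it expands to nothing, so it is purely an optimization hint and never changes behavior. A small host-side C++ sketch of the same pattern, using clang's `__builtin_assume`; the macro and function names here are illustrative, not from the diff.

```cpp
// Hedged sketch of the "assume" macro pattern added above.
#include <cstddef>

#if defined(__clang__)
#define ASSUME(x) __builtin_assume(x)
#else
#define ASSUME(x) ((void)0)
#endif

// The caller guarantees 0 <= k < n; stating that lets the optimizer drop range checks
// it might otherwise keep, which is how GGML_CUDA_ASSUME(k < WARP_SIZE) is used in the
// tile-loading kernels in the hunks below.
int read_checked(const int * data, std::size_t n, std::size_t k) {
    ASSUME(k < n);
    return data[k];
}
```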
@@ -418,6 +427,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 
+#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
 #define MAX_STREAMS 8
@@ -2145,10 +2158,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
 
     const int kbx = k / QI4_0;
     const int kqsx = k % QI4_0;
@@ -2239,10 +2252,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
 
     const int kbx = k / QI4_1;
     const int kqsx = k % QI4_1;
@@ -2331,10 +2344,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
 
     const int kbx = k / QI5_0;
     const int kqsx = k % QI5_0;
@@ -2445,10 +2458,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
 
     const int kbx = k / QI5_1;
     const int kqsx = k % QI5_1;
@@ -2551,10 +2564,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
 
     const int kbx = k / QI8_0;
     const int kqsx = k % QI8_0;
@@ -2642,10 +2655,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
 
     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
@@ -2763,10 +2776,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
 
     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
@@ -2981,10 +2994,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
 
     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3162,10 +3175,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
 
     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3291,10 +3304,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
 
     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -6252,6 +6265,43 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     }
 }
 
+void ggml_cuda_set_peer_access(const int n_tokens) {
+    static bool peer_access_enabled = false;
+
+    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+    if (peer_access_enabled == enable_peer_access) {
+        return;
+    }
+
+#ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        for (int id_other = 0; id_other < g_device_count; ++id_other) {
+            if (id == id_other) {
+                continue;
+            }
+            if (id != g_main_device && id_other != g_main_device) {
+                continue;
+            }
+
+            int can_access_peer;
+            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            if (can_access_peer) {
+                if (enable_peer_access) {
+                    CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+                } else {
+                    CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+                }
+            }
+        }
+    }
+#endif // NDEBUG
+
+    peer_access_enabled = enable_peer_access;
+}
+
 static void ggml_cuda_op_mul_mat(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
     const bool convert_src1_to_q8_1) {
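The new `ggml_cuda_set_peer_access` toggles peer access between the main GPU and the others whenever the batch size crosses `GGML_CUDA_PEER_MAX_BATCH_SIZE`. A hedged C++ sketch of what enabling peer access buys, assuming the CUDA toolkit and at least two GPUs; error checking is omitted and all names are illustrative.

```cpp
// Hedged sketch: once cudaDeviceEnablePeerAccess has succeeded, memory on one GPU can be
// copied to (or addressed from) another without staging through the host.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int n_devices = 0;
    cudaGetDeviceCount(&n_devices);
    if (n_devices < 2) {
        std::printf("need two GPUs for a peer-access demo\n");
        return 0;
    }

    int can_access = 0;
    cudaDeviceCanAccessPeer(&can_access, 0, 1);
    if (!can_access) {
        std::printf("devices 0 and 1 cannot access each other\n");
        return 0;
    }

    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0);   // the flags argument must currently be 0

    float *buf0 = nullptr, *buf1 = nullptr;
    const size_t bytes = 1024 * sizeof(float);
    cudaMalloc(&buf0, bytes);
    cudaSetDevice(1);
    cudaMalloc(&buf1, bytes);

    // Device-to-device copy; with peer access enabled this avoids a host bounce.
    cudaMemcpyPeer(buf0, 0, buf1, 1, bytes);

    cudaFree(buf1);
    cudaSetDevice(0);
    cudaFree(buf0);
    return 0;
}
```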
@@ -6276,6 +6326,8 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    ggml_cuda_set_peer_access(ne11);
+
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
 
@@ -6408,7 +6460,7 @@ static void ggml_cuda_op_mul_mat(
 
         // wait for main GPU data if necessary
         if (split && (id != g_main_device || is != 0)) {
-            CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
+            CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
         }
 
         for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
@@ -6530,7 +6582,7 @@ static void ggml_cuda_op_mul_mat(
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
             for (int64_t is = 0; is < is_max; ++is) {
-                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
             }
         }
     }
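The two hunks above add the explicit flags argument (0) that the three-parameter `cudaStreamWaitEvent` expects. A hedged C++ sketch of the record/wait pattern these call sites use to order work across streams without blocking the host; stream and kernel names are illustrative and assume the CUDA toolkit.

```cpp
// Hedged sketch of event-based cross-stream ordering.
#include <cuda_runtime.h>

void chain_streams(cudaStream_t producer, cudaStream_t consumer) {
    cudaEvent_t done;
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

    // ... enqueue producer work on `producer` here ...
    cudaEventRecord(done, producer);

    // The third argument is the flags word the diff now passes explicitly; 0 is the default.
    cudaStreamWaitEvent(consumer, done, 0);

    // ... enqueue consumer work on `consumer` here; it will not start on the GPU
    // until everything recorded before `done` on `producer` has finished ...

    cudaEventDestroy(done);
}
```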
@@ -6964,6 +7016,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
         return;
     }
     if (g_scratch_buffer == nullptr) {
+        ggml_cuda_set_device(g_main_device);
         CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
     }
 
@@ -7003,7 +7056,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
     ggml_cuda_assign_buffers_impl(tensor, false, true, false);
 }
 
-void ggml_cuda_set_main_device(int main_device) {
+void ggml_cuda_set_main_device(const int main_device) {
     if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                 main_device, g_device_count, g_main_device);
@@ -7017,11 +7070,11 @@ void ggml_cuda_set_main_device(int main_device) {
     }
 }
 
-void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
     g_mul_mat_q = mul_mat_q;
 }
 
-void ggml_cuda_set_scratch_size(size_t scratch_size) {
+void ggml_cuda_set_scratch_size(const size_t scratch_size) {
     g_scratch_size = scratch_size;
 }
 
data/ext/llama_cpp/src/ggml-metal.m
CHANGED
@@ -66,6 +66,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(soft_max_4);
     GGML_METAL_DECL_KERNEL(diag_mask_inf);
     GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
+    GGML_METAL_DECL_KERNEL(get_rows_f32);
     GGML_METAL_DECL_KERNEL(get_rows_f16);
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@@ -77,6 +78,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_q6_K);
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(norm);
+    GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
@@ -88,6 +90,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
@@ -145,7 +148,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     ctx->n_buffers = 0;
     ctx->concur_list_len = 0;
 
-    ctx->d_queue = dispatch_queue_create("
+    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
 #ifdef GGML_SWIFT
     // load the default.metallib file
@@ -175,7 +178,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 
     //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
     NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
-    NSString * path
+    NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
     metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
 
     NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
@@ -224,6 +227,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(soft_max_4);
         GGML_METAL_ADD_KERNEL(diag_mask_inf);
         GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
+        GGML_METAL_ADD_KERNEL(get_rows_f32);
         GGML_METAL_ADD_KERNEL(get_rows_f16);
         GGML_METAL_ADD_KERNEL(get_rows_q4_0);
         GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@@ -235,6 +239,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(get_rows_q6_K);
         GGML_METAL_ADD_KERNEL(rms_norm);
         GGML_METAL_ADD_KERNEL(norm);
+        GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
@@ -246,6 +251,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
@@ -293,7 +299,9 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(gelu);
     GGML_METAL_DEL_KERNEL(soft_max);
     GGML_METAL_DEL_KERNEL(soft_max_4);
+    GGML_METAL_DEL_KERNEL(diag_mask_inf);
     GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
+    GGML_METAL_DEL_KERNEL(get_rows_f32);
     GGML_METAL_DEL_KERNEL(get_rows_f16);
     GGML_METAL_DEL_KERNEL(get_rows_q4_0);
     GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@@ -305,6 +313,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(get_rows_q6_K);
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(norm);
+    GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
@@ -316,6 +325,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
@@ -386,6 +396,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
     for (int i = 0; i < ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
 
+        //metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
         if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
             *offs = (size_t) ioffs;
 
@@ -723,6 +734,7 @@ void ggml_metal_graph_compute(
             case GGML_OP_ADD:
                 {
                     GGML_ASSERT(ggml_is_contiguous(src0));
+                    GGML_ASSERT(ggml_is_contiguous(src1));
 
                     // utilize float4
                     GGML_ASSERT(ne00 % 4 == 0);
@@ -730,6 +742,7 @@ void ggml_metal_graph_compute(
 
                     if (ggml_nelements(src1) == ne10) {
                         // src1 is a row
+                        GGML_ASSERT(ne11 == 1);
                         [encoder setComputePipelineState:ctx->pipeline_add_row];
                     } else {
                         [encoder setComputePipelineState:ctx->pipeline_add];
@@ -746,6 +759,7 @@ void ggml_metal_graph_compute(
             case GGML_OP_MUL:
                 {
                     GGML_ASSERT(ggml_is_contiguous(src0));
+                    GGML_ASSERT(ggml_is_contiguous(src1));
 
                     // utilize float4
                     GGML_ASSERT(ne00 % 4 == 0);
@@ -753,6 +767,7 @@ void ggml_metal_graph_compute(
 
                     if (ggml_nelements(src1) == ne10) {
                         // src1 is a row
+                        GGML_ASSERT(ne11 == 1);
                         [encoder setComputePipelineState:ctx->pipeline_mul_row];
                     } else {
                         [encoder setComputePipelineState:ctx->pipeline_mul];
@@ -768,6 +783,8 @@ void ggml_metal_graph_compute(
                 } break;
             case GGML_OP_SCALE:
                 {
+                    GGML_ASSERT(ggml_is_contiguous(src0));
+
                     const float scale = *(const float *) src1->data;
 
                     [encoder setComputePipelineState:ctx->pipeline_scale];
@@ -867,13 +884,14 @@ void ggml_metal_graph_compute(
 
                     // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                     // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                    if (
-
+                    if (!ggml_is_transposed(src0) &&
+                        !ggml_is_transposed(src1) &&
                         src1t == GGML_TYPE_F32 &&
                         [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
                         ne00%32 == 0 &&
                         ne11 > 1) {
                         switch (src0->type) {
+                            case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32];  break;
                             case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32];  break;
                             case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
                             case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
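The hunk above both adds the transposition checks to the condition and registers an F32 pipeline for the matrix-matrix path. A hedged sketch of that selection logic in plain C++, so the conditions are easy to read; the real code dispatches Metal pipelines, and the struct and enum here are illustrative stand-ins with only the condition mirroring the diff.

```cpp
// Hedged sketch of the kernel-selection logic shown in the hunk above.
enum class matmul_kernel { matrix_matrix, matrix_vector };

struct matmul_shape {
    bool src0_transposed;
    bool src1_transposed;
    bool src1_is_f32;
    bool device_supports_apple7;   // the A14+/M1+ family check in the real code
    long ne00;                     // src0 row length
    long ne11;                     // number of src1 columns
};

matmul_kernel choose_kernel(const matmul_shape & s) {
    // The matrix-matrix kernel needs untransposed inputs, f32 src1, a supported GPU,
    // rows that are a multiple of 32, and more than one column in src1; otherwise the
    // matrix-vector kernel is reused, as the comment in the diff explains.
    const bool use_mm = !s.src0_transposed &&
                        !s.src1_transposed &&
                        s.src1_is_f32 &&
                        s.device_supports_apple7 &&
                        s.ne00 % 32 == 0 &&
                        s.ne11 > 1;
    return use_mm ? matmul_kernel::matrix_matrix : matmul_kernel::matrix_vector;
}
```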
@@ -893,9 +911,12 @@ void ggml_metal_graph_compute(
                         [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
                         [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
                         [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
-                        [encoder setBytes:&
-                        [encoder setBytes:&
-                        [encoder setBytes:&
+                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8];
+                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
+                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
+                        [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11];
+                        [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
+                        [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
                         [encoder setThreadgroupMemoryLength:8192 atIndex:0];
                         [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                     } else {
@@ -905,6 +926,11 @@ void ggml_metal_graph_compute(
 
                         // use custom matrix x vector kernel
                         switch (src0t) {
+                            case GGML_TYPE_F32:
+                                {
+                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
+                                    nrows = 4;
+                                } break;
                             case GGML_TYPE_F16:
                                 {
                                     nth0 = 32;
|
             case GGML_OP_GET_ROWS:
                 {
                     switch (src0->type) {
+                        case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f32];  break;
                         case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16];  break;
                         case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                         case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
@@ -1060,9 +1087,9 @@ void ggml_metal_graph_compute(
                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                     [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                     [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
+                    [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
+                    [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
+                    [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5];
 
                     const int64_t n = ggml_nelements(src1);
 