llama_cpp 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -6
- data/ext/llama_cpp/src/ggml-cuda.cu +99 -46
- data/ext/llama_cpp/src/ggml-metal.m +37 -10
- data/ext/llama_cpp/src/ggml-metal.metal +144 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +68 -40
- data/ext/llama_cpp/src/ggml.h +43 -33
- data/ext/llama_cpp/src/llama.cpp +420 -57
- data/ext/llama_cpp/src/llama.h +5 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c45589a61587acfbe88add77ffb135a7949619ba2936178c59126c24c30e23cc
+  data.tar.gz: 5866b5b5f8dab59432cc91beca290f927a8d1bc694f83c8ccbe366c6f636f47c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 89714a2a920172c1ddc4fff56a11390ad97db62eb6bd4eefe3ba9376132bd6646eda7569acea49ff3a1ce87486cac0e623cac4fddfcb2b57629c30ee3457d38b
+  data.tar.gz: 2c5494528f55b86c57fccb18658350058210e85f31e9ecb8b3587a4da68a1465a0987a98e86dbae16fa5fea9a9502aed96afec3679e5553a82805f4436ef3020
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## [[0.5.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.2...v0.5.3)] - 2023-09-23
+
+- Bump bundled llama.cpp from b1226 to b1266.
+
 ## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16

 - Bump bundled llama.cpp from b1198 to b1226.
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1046,7 +1046,7 @@ private:

     llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
     LLaMAModelWrapper* ptr = get_llama_model(self);
-    const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), tokens, n_max_tokens, add_bos);
+    const int n_tokens = llama_tokenize_with_model(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);

     if (n_tokens < 0) {
       rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
@@ -1585,7 +1585,7 @@ private:
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
       return Qnil;
     }
-    const int n = llama_tokenize(ptr->ctx, text.c_str(), tokens.data(), n_max_tokens, add_bos);
+    const int n = llama_tokenize(ptr->ctx, text.c_str(), text.size(), tokens.data(), n_max_tokens, add_bos);
    if (n < 0) {
      rb_raise(rb_eRuntimeError, "Failed to tokenize");
      return Qnil;
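The two hunks above track the upstream llama.cpp b1266 API, where tokenization now takes the text length explicitly instead of relying on a NUL terminator. Below is a minimal C++ sketch of calling the updated C API directly; the helper name and the resize-on-negative-return convention are assumptions for illustration, not code from the gem.

```cpp
// Hypothetical standalone helper; assumes an already-created llama_context *
// and the b1266-era llama_tokenize signature shown in the diff.
#include <string>
#include <vector>
#include "llama.h"

std::vector<llama_token> tokenize_text(llama_context * ctx, const std::string & text, bool add_bos) {
    // One token per byte (plus an optional BOS) is a safe upper bound for the buffer.
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));
    int n = llama_tokenize(ctx, text.c_str(), text.size(), tokens.data(), tokens.size(), add_bos);
    if (n < 0) {
        // A negative result reports how many tokens would actually be needed.
        tokens.resize(-n);
        n = llama_tokenize(ctx, text.c_str(), text.size(), tokens.data(), tokens.size(), add_bos);
    }
    tokens.resize(n < 0 ? 0 : n);
    return tokens;
}
```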
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -131,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
     return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
 }

+static bool ggml_is_view(struct ggml_tensor * t) {
+    return t->view_src != NULL;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
@@ -338,8 +342,8 @@ static void free_vmem(void * base_addr, size_t size) {

 // allocate uncommitted virtual memory to measure the size of the graph
 static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    //
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<
+    // 128GB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
     do {
         *base_addr = alloc_vmem(*size);
         if (*base_addr != NULL) {
@@ -399,10 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {

 //////////// compute graph allocator

-static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->view_src != NULL;
-}
-
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
     if (a->type != b->type) {
         return false;
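For context, the measurement allocator reserves a large span of address space without committing physical memory, which is why raising the 64-bit reservation to 1ULL<<37 (128 GiB) is essentially free until pages are actually used. The POSIX sketch below illustrates that reserve-only pattern; it is an illustration of the idea, not the gem's actual alloc_vmem implementation.

```cpp
// Illustrative reserve-only allocation on POSIX systems: PROT_NONE mappings
// consume address space but no physical memory until they are re-protected and touched.
#include <sys/mman.h>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t size = sizeof(void *) == 4 ? 1ULL << 30   // 1 GiB on 32-bit
                                            : 1ULL << 37;  // 128 GiB on 64-bit
    void * base = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (base == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    printf("reserved %zu bytes of address space at %p\n", size, base);
    munmap(base, size);
    return 0;
}
```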
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -31,6 +31,9 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
@@ -61,7 +64,7 @@
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event,
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #else
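The rewritten macro forwards the flags argument, so call sites that pass an explicit flags value (as later hunks in this file now do) expand correctly for both CUDA and HIP builds. A small hedged sketch of the host-side pattern that argument belongs to, using only standard CUDA runtime calls:

```cpp
// Minimal host-side sketch: stream B does not start its subsequent work until
// the event recorded on stream A fires. Error handling trimmed for brevity.
#include <cuda_runtime.h>

void wait_for_other_stream(cudaStream_t stream_a, cudaStream_t stream_b) {
    cudaEvent_t done;
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
    cudaEventRecord(done, stream_a);          // mark the point stream A reaches
    cudaStreamWaitEvent(stream_b, done, 0);   // stream B waits; 0 = default flags
    cudaEventDestroy(done);
}
```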
@@ -190,6 +193,12 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 } while (0)
 #endif // CUDART_VERSION >= 11

+#if CUDART_VERSION >= 11100
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
 #ifdef GGML_CUDA_F16
 typedef half dfloat; // dequantize float
 typedef half2 dfloat2;
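GGML_CUDA_ASSUME compiles to __builtin_assume on CUDA 11.1 and newer and to nothing on older toolkits, so the optimizer hints used further down do not break older builds. A hedged sketch of the same idiom outside this codebase (the macro and function names here are made up):

```cpp
// Version-gated "assume" idiom: hand the optimizer a fact it cannot prove
// on its own, and expand to nothing when the toolkit predates __builtin_assume.
#include <cuda_runtime.h>

#if defined(CUDART_VERSION) && CUDART_VERSION >= 11100
#define MY_ASSUME(x) __builtin_assume(x)
#else
#define MY_ASSUME(x)
#endif

__device__ void copy_lane_value(const int * src, int * dst, int k) {
    MY_ASSUME(k >= 0);   // k is derived from the lane id, so it is always in range;
    MY_ASSUME(k < 32);   // mirrors GGML_CUDA_ASSUME(k < WARP_SIZE) in the hunks below
    dst[k] = src[k];
}
```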
@@ -418,6 +427,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif

+#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
 #define MUL_MAT_SRC1_COL_STRIDE 128

 #define MAX_STREAMS 8
@@ -2145,10 +2158,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI4_0;
     const int kqsx = k % QI4_0;
@@ -2239,10 +2252,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI4_1;
     const int kqsx = k % QI4_1;
@@ -2331,10 +2344,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI5_0;
     const int kqsx = k % QI5_0;
@@ -2445,10 +2458,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI5_1;
     const int kqsx = k % QI5_1;
@@ -2551,10 +2564,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI8_0;
     const int kqsx = k % QI8_0;
@@ -2642,10 +2655,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
@@ -2763,10 +2776,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
@@ -2981,10 +2994,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3162,10 +3175,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3291,10 +3304,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -6252,6 +6265,43 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     }
 }

+void ggml_cuda_set_peer_access(const int n_tokens) {
+    static bool peer_access_enabled = false;
+
+    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+    if (peer_access_enabled == enable_peer_access) {
+        return;
+    }
+
+#ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        for (int id_other = 0; id_other < g_device_count; ++id_other) {
+            if (id == id_other) {
+                continue;
+            }
+            if (id != g_main_device && id_other != g_main_device) {
+                continue;
+            }
+
+            int can_access_peer;
+            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            if (can_access_peer) {
+                if (enable_peer_access) {
+                    CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+                } else {
+                    CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+                }
+            }
+        }
+    }
+#endif // NDEBUG
+
+    peer_access_enabled = enable_peer_access;
+}
+
 static void ggml_cuda_op_mul_mat(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
     const bool convert_src1_to_q8_1) {
@@ -6276,6 +6326,8 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];

+    ggml_cuda_set_peer_access(ne11);
+
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);

@@ -6408,7 +6460,7 @@ static void ggml_cuda_op_mul_mat(

         // wait for main GPU data if necessary
         if (split && (id != g_main_device || is != 0)) {
-            CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
+            CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
         }

         for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
@@ -6530,7 +6582,7 @@ static void ggml_cuda_op_mul_mat(
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
             for (int64_t is = 0; is < is_max; ++is) {
-                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
             }
         }
     }
@@ -6964,6 +7016,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
         return;
     }
     if (g_scratch_buffer == nullptr) {
+        ggml_cuda_set_device(g_main_device);
         CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
     }

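CUDA allocations land on whichever device is current when cudaMalloc runs, so the added ggml_cuda_set_device call makes sure the scratch buffer is created on the main device even if another device happened to be current. A short hedged sketch of that rule in isolation:

```cpp
// Hedged sketch: cudaMalloc allocates on the current device, so the target
// device is selected explicitly before creating a buffer meant for it.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const int main_device = 0;        // assumption: device 0 plays the "main" role
    cudaSetDevice(main_device);       // make the target device current
    void * scratch = nullptr;
    if (cudaMalloc(&scratch, 1 << 20) != cudaSuccess) {   // 1 MiB scratch buffer
        fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }
    cudaFree(scratch);
    return 0;
}
```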
@@ -7003,7 +7056,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
     ggml_cuda_assign_buffers_impl(tensor, false, true, false);
 }

-void ggml_cuda_set_main_device(int main_device) {
+void ggml_cuda_set_main_device(const int main_device) {
     if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                 main_device, g_device_count, g_main_device);
@@ -7017,11 +7070,11 @@ void ggml_cuda_set_main_device(int main_device) {
     }
 }

-void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
     g_mul_mat_q = mul_mat_q;
 }

-void ggml_cuda_set_scratch_size(size_t scratch_size) {
+void ggml_cuda_set_scratch_size(const size_t scratch_size) {
     g_scratch_size = scratch_size;
 }

data/ext/llama_cpp/src/ggml-metal.m
CHANGED
@@ -66,6 +66,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(soft_max_4);
     GGML_METAL_DECL_KERNEL(diag_mask_inf);
     GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
+    GGML_METAL_DECL_KERNEL(get_rows_f32);
     GGML_METAL_DECL_KERNEL(get_rows_f16);
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@@ -77,6 +78,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_q6_K);
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(norm);
+    GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
@@ -88,6 +90,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
@@ -145,7 +148,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     ctx->n_buffers = 0;
     ctx->concur_list_len = 0;

-    ctx->d_queue = dispatch_queue_create("
+    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

 #ifdef GGML_SWIFT
     // load the default.metallib file
@@ -175,7 +178,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {

         //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
         NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
-        NSString * path
+        NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
         metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);

         NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
@@ -224,6 +227,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(soft_max_4);
         GGML_METAL_ADD_KERNEL(diag_mask_inf);
         GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
+        GGML_METAL_ADD_KERNEL(get_rows_f32);
         GGML_METAL_ADD_KERNEL(get_rows_f16);
         GGML_METAL_ADD_KERNEL(get_rows_q4_0);
         GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@@ -235,6 +239,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(get_rows_q6_K);
         GGML_METAL_ADD_KERNEL(rms_norm);
         GGML_METAL_ADD_KERNEL(norm);
+        GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
@@ -246,6 +251,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
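The new get_rows_f32, mul_mat_f32_f32, and mul_mm_f32_f32 kernels each show up in three lists: declared in the context struct, registered in ggml_metal_init, and (in the next hunks) released in ggml_metal_free, all via the GGML_METAL_DECL/ADD/DEL_KERNEL macros. The plain C++ sketch below illustrates that token-pasting registration pattern in general terms; the types and helper functions are hypothetical, not the actual Objective-C/Metal ones.

```cpp
// Hedged illustration of the DECL/ADD/DEL macro pattern: one macro argument
// (the kernel name) is token-pasted into a struct member, a creation call,
// and a release call, so adding a kernel touches three lists but needs no
// hand-written boilerplate. All names here are made up.
#include <cstdio>

struct pipeline { const char * name; };

static pipeline * make_pipeline(const char * name) { return new pipeline{name}; }
static void release_pipeline(pipeline * p) { delete p; }

struct metal_like_context {
#define DECL_KERNEL(name) pipeline * pipeline_##name
    DECL_KERNEL(get_rows_f32);
    DECL_KERNEL(mul_mm_f32_f32);
#undef DECL_KERNEL
};

void context_init(metal_like_context * ctx) {
#define ADD_KERNEL(name) ctx->pipeline_##name = make_pipeline(#name)
    ADD_KERNEL(get_rows_f32);
    ADD_KERNEL(mul_mm_f32_f32);
#undef ADD_KERNEL
}

void context_free(metal_like_context * ctx) {
#define DEL_KERNEL(name) release_pipeline(ctx->pipeline_##name)
    DEL_KERNEL(get_rows_f32);
    DEL_KERNEL(mul_mm_f32_f32);
#undef DEL_KERNEL
}

int main() {
    metal_like_context ctx;
    context_init(&ctx);
    printf("registered: %s, %s\n", ctx.pipeline_get_rows_f32->name, ctx.pipeline_mul_mm_f32_f32->name);
    context_free(&ctx);
    return 0;
}
```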
@@ -293,7 +299,9 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(gelu);
     GGML_METAL_DEL_KERNEL(soft_max);
     GGML_METAL_DEL_KERNEL(soft_max_4);
+    GGML_METAL_DEL_KERNEL(diag_mask_inf);
     GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
+    GGML_METAL_DEL_KERNEL(get_rows_f32);
     GGML_METAL_DEL_KERNEL(get_rows_f16);
     GGML_METAL_DEL_KERNEL(get_rows_q4_0);
     GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@@ -305,6 +313,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(get_rows_q6_K);
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(norm);
+    GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
@@ -316,6 +325,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
@@ -386,6 +396,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
     for (int i = 0; i < ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;

+        //metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
         if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
             *offs = (size_t) ioffs;

@@ -723,6 +734,7 @@ void ggml_metal_graph_compute(
             case GGML_OP_ADD:
                 {
                     GGML_ASSERT(ggml_is_contiguous(src0));
+                    GGML_ASSERT(ggml_is_contiguous(src1));

                     // utilize float4
                     GGML_ASSERT(ne00 % 4 == 0);
@@ -730,6 +742,7 @@ void ggml_metal_graph_compute(

                     if (ggml_nelements(src1) == ne10) {
                         // src1 is a row
+                        GGML_ASSERT(ne11 == 1);
                         [encoder setComputePipelineState:ctx->pipeline_add_row];
                     } else {
                         [encoder setComputePipelineState:ctx->pipeline_add];
@@ -746,6 +759,7 @@ void ggml_metal_graph_compute(
             case GGML_OP_MUL:
                 {
                     GGML_ASSERT(ggml_is_contiguous(src0));
+                    GGML_ASSERT(ggml_is_contiguous(src1));

                     // utilize float4
                     GGML_ASSERT(ne00 % 4 == 0);
@@ -753,6 +767,7 @@ void ggml_metal_graph_compute(

                     if (ggml_nelements(src1) == ne10) {
                         // src1 is a row
+                        GGML_ASSERT(ne11 == 1);
                         [encoder setComputePipelineState:ctx->pipeline_mul_row];
                     } else {
                         [encoder setComputePipelineState:ctx->pipeline_mul];
@@ -768,6 +783,8 @@ void ggml_metal_graph_compute(
                 } break;
             case GGML_OP_SCALE:
                 {
+                    GGML_ASSERT(ggml_is_contiguous(src0));
+
                     const float scale = *(const float *) src1->data;

                     [encoder setComputePipelineState:ctx->pipeline_scale];
@@ -867,13 +884,14 @@ void ggml_metal_graph_compute(

                     // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                     // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                    if (
-
+                    if (!ggml_is_transposed(src0) &&
+                        !ggml_is_transposed(src1) &&
                         src1t == GGML_TYPE_F32 &&
                         [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
                         ne00%32 == 0 &&
                         ne11 > 1) {
                         switch (src0->type) {
+                            case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
                             case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
                             case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
                             case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
@@ -893,9 +911,12 @@ void ggml_metal_graph_compute(
                         [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
                         [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
                         [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
-                        [encoder setBytes:&
-                        [encoder setBytes:&
-                        [encoder setBytes:&
+                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8];
+                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
+                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
+                        [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11];
+                        [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
+                        [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
                         [encoder setThreadgroupMemoryLength:8192 atIndex:0];
                         [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                     } else {
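The dispatchThreadgroups call in the hunk above sizes a 3-D grid with ceiling division: ne11 is tiled in chunks of 32, ne01 in chunks of 64, and ne12 supplies the third dimension. A small sketch of that arithmetic (the sample sizes are made up; the tile sizes come from the diff):

```cpp
// Ceiling-division grid sizing, as used by the dispatchThreadgroups call above.
#include <cstdio>

static int ceil_div(int n, int tile) { return (n + tile - 1) / tile; }

int main() {
    const int ne11 = 100;  // e.g. 100 rows in src1
    const int ne01 = 4096; // e.g. 4096 rows in src0
    const int ne12 = 1;    // batch dimension

    const int groups_x = ceil_div(ne11, 32); // (100 + 31) / 32 = 4
    const int groups_y = ceil_div(ne01, 64); // (4096 + 63) / 64 = 64
    printf("threadgroups: %d x %d x %d\n", groups_x, groups_y, ne12);
    return 0;
}
```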
@@ -905,6 +926,11 @@ void ggml_metal_graph_compute(

                         // use custom matrix x vector kernel
                         switch (src0t) {
+                            case GGML_TYPE_F32:
+                                {
+                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
+                                    nrows = 4;
+                                } break;
                             case GGML_TYPE_F16:
                                 {
                                     nth0 = 32;
@@ -1045,6 +1071,7 @@ void ggml_metal_graph_compute(
             case GGML_OP_GET_ROWS:
                 {
                     switch (src0->type) {
+                        case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_get_rows_f32]; break;
                         case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
                         case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                         case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
@@ -1060,9 +1087,9 @@ void ggml_metal_graph_compute(
                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                     [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                     [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
+                    [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
+                    [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
+                    [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5];

                     const int64_t n = ggml_nelements(src1);
