llama_cpp 0.15.2 → 0.15.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
@@ -381,10 +381,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
381
381
|
// dictionary of preprocessor macros
|
382
382
|
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
383
383
|
|
384
|
-
#ifdef GGML_QKK_64
|
385
|
-
prep[@"GGML_QKK_64"] = @(1);
|
386
|
-
#endif
|
387
|
-
|
388
384
|
MTLCompileOptions* options = [MTLCompileOptions new];
|
389
385
|
options.preprocessorMacros = prep;
|
390
386
|
|
@@ -927,22 +923,32 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
927
923
|
const int64_t ne10 = src1 ? src1->ne[0] : 0;
|
928
924
|
const int64_t ne11 = src1 ? src1->ne[1] : 0;
|
929
925
|
const int64_t ne12 = src1 ? src1->ne[2] : 0;
|
930
|
-
const int64_t ne13 = src1 ? src1->ne[3] : 0;
|
926
|
+
const int64_t ne13 = src1 ? src1->ne[3] : 0;
|
931
927
|
|
932
928
|
const uint64_t nb10 = src1 ? src1->nb[0] : 0;
|
933
929
|
const uint64_t nb11 = src1 ? src1->nb[1] : 0;
|
934
930
|
const uint64_t nb12 = src1 ? src1->nb[2] : 0;
|
935
|
-
const uint64_t nb13 = src1 ? src1->nb[3] : 0;
|
931
|
+
const uint64_t nb13 = src1 ? src1->nb[3] : 0;
|
932
|
+
|
933
|
+
const int64_t ne20 = src2 ? src2->ne[0] : 0;
|
934
|
+
const int64_t ne21 = src2 ? src2->ne[1] : 0;
|
935
|
+
const int64_t ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22);
|
936
|
+
const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
|
936
937
|
|
937
|
-
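const int64_t ne0 = dst ? dst->ne[0] : 0;
|
938
|
-
const int64_t ne1 = dst ? dst->ne[1] : 0;
|
939
|
-
const int64_t ne2 = dst ? dst->ne[2] : 0;
|
940
|
-
const int64_t ne3 = dst ? dst->ne[3] : 0;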
|
938
|
+
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
|
939
|
+
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
|
940
|
+
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
|
941
|
+
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
|
941
942
|
|
942
|
-
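const uint64_t nb0 = dst ? dst->nb[0] : 0;
|
943
|
-
const uint64_t nb1 = dst ? dst->nb[1] : 0;
|
944
|
-
const uint64_t nb2 = dst ? dst->nb[2] : 0;
|
945
|
-
const uint64_t nb3 = dst ? dst->nb[3] : 0;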
|
943
|
+
const int64_t ne0 = dst ? dst->ne[0] : 0;
|
944
|
+
const int64_t ne1 = dst ? dst->ne[1] : 0;
|
945
|
+
const int64_t ne2 = dst ? dst->ne[2] : 0;
|
946
|
+
const int64_t ne3 = dst ? dst->ne[3] : 0;
|
947
|
+
|
948
|
+
const uint64_t nb0 = dst ? dst->nb[0] : 0;
|
949
|
+
const uint64_t nb1 = dst ? dst->nb[1] : 0;
|
950
|
+
const uint64_t nb2 = dst ? dst->nb[2] : 0;
|
951
|
+
const uint64_t nb3 = dst ? dst->nb[3] : 0;
|
946
952
|
|
947
953
|
const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
|
948
954
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
@@ -1763,11 +1769,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
1763
1769
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1764
1770
|
}
|
1765
1771
|
else if (src0t == GGML_TYPE_Q3_K) {
|
1766
|
-
#ifdef GGML_QKK_64
|
1767
|
-
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1768
|
-
#else
|
1769
1772
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1770
|
-
#endif
|
1771
1773
|
}
|
1772
1774
|
else if (src0t == GGML_TYPE_Q5_K) {
|
1773
1775
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
@@ -1785,16 +1787,6 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
1785
1787
|
const int n_as = src0->ne[2];
|
1786
1788
|
|
1787
1789
|
// src2 = ids
|
1788
|
-
const int64_t ne20 = src2->ne[0];
|
1789
|
-
const int64_t ne21 = src2->ne[1];
|
1790
|
-
const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22);
|
1791
|
-
const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23);
|
1792
|
-
|
1793
|
-
const uint64_t nb20 = src2->nb[0]; GGML_UNUSED(nb20);
|
1794
|
-
const uint64_t nb21 = src2->nb[1];
|
1795
|
-
const uint64_t nb22 = src2->nb[2]; GGML_UNUSED(nb22);
|
1796
|
-
const uint64_t nb23 = src2->nb[3]; GGML_UNUSED(nb23);
|
1797
|
-
|
1798
1790
|
const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
|
1799
1791
|
|
1800
1792
|
GGML_ASSERT(src2t == GGML_TYPE_I32);
|
@@ -2018,12 +2010,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2018
2010
|
{
|
2019
2011
|
nth0 = 4;
|
2020
2012
|
nth1 = 16;
|
2021
|
-
#if QK_K == 64
|
2022
|
-
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
|
2023
|
-
#else
|
2024
2013
|
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
|
2025
|
-
#endif
|
2026
|
-
|
2027
2014
|
} break;
|
2028
2015
|
default:
|
2029
2016
|
{
|
@@ -2088,11 +2075,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2088
2075
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
2089
2076
|
}
|
2090
2077
|
else if (src0t == GGML_TYPE_Q3_K) {
|
2091
|
-
#ifdef GGML_QKK_64
|
2092
|
-
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
2093
|
-
#else
|
2094
2078
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
2095
|
-
#endif
|
2096
2079
|
}
|
2097
2080
|
else if (src0t == GGML_TYPE_Q5_K) {
|
2098
2081
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
@@ -2244,7 +2227,13 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2244
2227
|
// skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
|
2245
2228
|
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
2246
2229
|
|
2247
|
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
2230
|
+
float freq_base;
|
2231
|
+
float freq_scale;
|
2232
|
+
float ext_factor;
|
2233
|
+
float attn_factor;
|
2234
|
+
float beta_fast;
|
2235
|
+
float beta_slow;
|
2236
|
+
|
2248
2237
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
2249
2238
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
2250
2239
|
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
@@ -2252,6 +2241,15 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2252
2241
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
2253
2242
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
2254
2243
|
|
2244
|
+
const bool is_neox = mode & 2;
|
2245
|
+
const bool is_glm = mode & 4;
|
2246
|
+
|
2247
|
+
GGML_ASSERT(!is_glm && "GLM RoPE not implemented in Metal");
|
2248
|
+
|
2249
|
+
if (!is_neox) {
|
2250
|
+
GGML_ASSERT(id_src2 == nil && "TODO: freq_factors not implemented for !is_neox");
|
2251
|
+
}
|
2252
|
+
|
2255
2253
|
id<MTLComputePipelineState> pipeline = nil;
|
2256
2254
|
|
2257
2255
|
switch (src0->type) {
|
@@ -2263,33 +2261,38 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2263
2261
|
[encoder setComputePipelineState:pipeline];
|
2264
2262
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
2265
2263
|
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
2266
|
-
|
2267
|
-
|
2268
|
-
|
2269
|
-
|
2270
|
-
|
2271
|
-
[encoder
|
2272
|
-
[encoder setBytes:&
|
2273
|
-
[encoder setBytes:&
|
2274
|
-
[encoder setBytes:&
|
2275
|
-
[encoder setBytes:&
|
2276
|
-
[encoder setBytes:&
|
2277
|
-
[encoder setBytes:&
|
2278
|
-
[encoder setBytes:&
|
2279
|
-
[encoder setBytes:&
|
2280
|
-
[encoder setBytes:&
|
2281
|
-
[encoder setBytes:&
|
2282
|
-
[encoder setBytes:&
|
2283
|
-
[encoder setBytes:&
|
2284
|
-
[encoder setBytes:&
|
2285
|
-
[encoder setBytes:&
|
2286
|
-
[encoder setBytes:&
|
2287
|
-
[encoder setBytes:&
|
2288
|
-
[encoder setBytes:&
|
2289
|
-
[encoder setBytes:&
|
2290
|
-
[encoder setBytes:&
|
2291
|
-
[encoder setBytes:&
|
2292
|
-
[encoder setBytes:&
|
2264
|
+
if (id_src2 != nil) {
|
2265
|
+
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
|
2266
|
+
} else {
|
2267
|
+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:2];
|
2268
|
+
}
|
2269
|
+
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
2270
|
+
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4];
|
2271
|
+
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
|
2272
|
+
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
|
2273
|
+
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
|
2274
|
+
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8];
|
2275
|
+
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9];
|
2276
|
+
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10];
|
2277
|
+
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11];
|
2278
|
+
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:12];
|
2279
|
+
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13];
|
2280
|
+
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14];
|
2281
|
+
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:15];
|
2282
|
+
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16];
|
2283
|
+
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17];
|
2284
|
+
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18];
|
2285
|
+
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19];
|
2286
|
+
[encoder setBytes:&n_past length:sizeof( int) atIndex:20];
|
2287
|
+
[encoder setBytes:&n_dims length:sizeof( int) atIndex:21];
|
2288
|
+
[encoder setBytes:&mode length:sizeof( int) atIndex:22];
|
2289
|
+
[encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:23];
|
2290
|
+
[encoder setBytes:&freq_base length:sizeof( float) atIndex:24];
|
2291
|
+
[encoder setBytes:&freq_scale length:sizeof( float) atIndex:25];
|
2292
|
+
[encoder setBytes:&ext_factor length:sizeof( float) atIndex:26];
|
2293
|
+
[encoder setBytes:&attn_factor length:sizeof( float) atIndex:27];
|
2294
|
+
[encoder setBytes:&beta_fast length:sizeof( float) atIndex:28];
|
2295
|
+
[encoder setBytes:&beta_slow length:sizeof( float) atIndex:29];
|
2293
2296
|
|
2294
2297
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
2295
2298
|
} break;
|
@@ -2535,11 +2538,6 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2535
2538
|
GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
|
2536
2539
|
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
|
2537
2540
|
|
2538
|
-
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
|
2539
|
-
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
|
2540
|
-
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
|
2541
|
-
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
|
2542
|
-
|
2543
2541
|
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
|
2544
2542
|
//const int64_t ne31 = src3 ? src3->ne[1] : 0;
|
2545
2543
|
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
|