llama_cpp 0.15.2 → 0.15.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
@@ -381,10 +381,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
381
381
|
// dictionary of preprocessor macros
|
382
382
|
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
383
383
|
|
384
|
-
#ifdef GGML_QKK_64
|
385
|
-
prep[@"GGML_QKK_64"] = @(1);
|
386
|
-
#endif
|
387
|
-
|
388
384
|
MTLCompileOptions* options = [MTLCompileOptions new];
|
389
385
|
options.preprocessorMacros = prep;
|
390
386
|
|
@@ -927,22 +923,32 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
927
923
|
const int64_t ne10 = src1 ? src1->ne[0] : 0;
|
928
924
|
const int64_t ne11 = src1 ? src1->ne[1] : 0;
|
929
925
|
const int64_t ne12 = src1 ? src1->ne[2] : 0;
|
930
|
-
const int64_t ne13 = src1 ? src1->ne[3] : 0;
|
926
|
+
const int64_t ne13 = src1 ? src1->ne[3] : 0;
|
931
927
|
|
932
928
|
const uint64_t nb10 = src1 ? src1->nb[0] : 0;
|
933
929
|
const uint64_t nb11 = src1 ? src1->nb[1] : 0;
|
934
930
|
const uint64_t nb12 = src1 ? src1->nb[2] : 0;
|
935
|
-
const uint64_t nb13 = src1 ? src1->nb[3] : 0;
|
931
|
+
const uint64_t nb13 = src1 ? src1->nb[3] : 0;
|
932
|
+
|
933
|
+
const int64_t ne20 = src2 ? src2->ne[0] : 0;
|
934
|
+
const int64_t ne21 = src2 ? src2->ne[1] : 0;
|
935
|
+
const int64_t ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22);
|
936
|
+
const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
|
936
937
|
|
937
|
-
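const int64_t ne0 = dst ? dst->ne[0] : 0;
|
938
|
-
const int64_t ne1 = dst ? dst->ne[1] : 0;
|
939
|
-
const int64_t ne2 = dst ? dst->ne[2] : 0;
|
940
|
-
const int64_t ne3 = dst ? dst->ne[3] : 0;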
|
938
|
+
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
|
939
|
+
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
|
940
|
+
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
|
941
|
+
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
|
941
942
|
|
942
|
-
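const uint64_t nb0 = dst ? dst->nb[0] : 0;
|
943
|
-
const uint64_t nb1 = dst ? dst->nb[1] : 0;
|
944
|
-
const uint64_t nb2 = dst ? dst->nb[2] : 0;
|
945
|
-
const uint64_t nb3 = dst ? dst->nb[3] : 0;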
|
943
|
+
const int64_t ne0 = dst ? dst->ne[0] : 0;
|
944
|
+
const int64_t ne1 = dst ? dst->ne[1] : 0;
|
945
|
+
const int64_t ne2 = dst ? dst->ne[2] : 0;
|
946
|
+
const int64_t ne3 = dst ? dst->ne[3] : 0;
|
947
|
+
|
948
|
+
const uint64_t nb0 = dst ? dst->nb[0] : 0;
|
949
|
+
const uint64_t nb1 = dst ? dst->nb[1] : 0;
|
950
|
+
const uint64_t nb2 = dst ? dst->nb[2] : 0;
|
951
|
+
const uint64_t nb3 = dst ? dst->nb[3] : 0;
|
946
952
|
|
947
953
|
const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
|
948
954
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
@@ -1763,11 +1769,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
1763
1769
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1764
1770
|
}
|
1765
1771
|
else if (src0t == GGML_TYPE_Q3_K) {
|
1766
|
-
#ifdef GGML_QKK_64
|
1767
|
-
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1768
|
-
#else
|
1769
1772
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1770
|
-
#endif
|
1771
1773
|
}
|
1772
1774
|
else if (src0t == GGML_TYPE_Q5_K) {
|
1773
1775
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
@@ -1785,16 +1787,6 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
1785
1787
|
const int n_as = src0->ne[2];
|
1786
1788
|
|
1787
1789
|
// src2 = ids
|
1788
|
-
const int64_t ne20 = src2->ne[0];
|
1789
|
-
const int64_t ne21 = src2->ne[1];
|
1790
|
-
const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22);
|
1791
|
-
const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23);
|
1792
|
-
|
1793
|
-
const uint64_t nb20 = src2->nb[0]; GGML_UNUSED(nb20);
|
1794
|
-
const uint64_t nb21 = src2->nb[1];
|
1795
|
-
const uint64_t nb22 = src2->nb[2]; GGML_UNUSED(nb22);
|
1796
|
-
const uint64_t nb23 = src2->nb[3]; GGML_UNUSED(nb23);
|
1797
|
-
|
1798
1790
|
const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
|
1799
1791
|
|
1800
1792
|
GGML_ASSERT(src2t == GGML_TYPE_I32);
|
@@ -2018,12 +2010,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2018
2010
|
{
|
2019
2011
|
nth0 = 4;
|
2020
2012
|
nth1 = 16;
|
2021
|
-
#if QK_K == 64
|
2022
|
-
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
|
2023
|
-
#else
|
2024
2013
|
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
|
2025
|
-
#endif
|
2026
|
-
|
2027
2014
|
} break;
|
2028
2015
|
default:
|
2029
2016
|
{
|
@@ -2088,11 +2075,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2088
2075
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
2089
2076
|
}
|
2090
2077
|
else if (src0t == GGML_TYPE_Q3_K) {
|
2091
|
-
#ifdef GGML_QKK_64
|
2092
|
-
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
2093
|
-
#else
|
2094
2078
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
2095
|
-
#endif
|
2096
2079
|
}
|
2097
2080
|
else if (src0t == GGML_TYPE_Q5_K) {
|
2098
2081
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
@@ -2244,7 +2227,13 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2244
2227
|
// skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
|
2245
2228
|
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
2246
2229
|
|
2247
|
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
2230
|
+
float freq_base;
|
2231
|
+
float freq_scale;
|
2232
|
+
float ext_factor;
|
2233
|
+
float attn_factor;
|
2234
|
+
float beta_fast;
|
2235
|
+
float beta_slow;
|
2236
|
+
|
2248
2237
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
2249
2238
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
2250
2239
|
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
@@ -2252,6 +2241,15 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2252
2241
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
2253
2242
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
2254
2243
|
|
2244
|
+
const bool is_neox = mode & 2;
|
2245
|
+
const bool is_glm = mode & 4;
|
2246
|
+
|
2247
|
+
GGML_ASSERT(!is_glm && "GLM RoPE not implemented in Metal");
|
2248
|
+
|
2249
|
+
if (!is_neox) {
|
2250
|
+
GGML_ASSERT(id_src2 == nil && "TODO: freq_factors not implemented for !is_neox");
|
2251
|
+
}
|
2252
|
+
|
2255
2253
|
id<MTLComputePipelineState> pipeline = nil;
|
2256
2254
|
|
2257
2255
|
switch (src0->type) {
|
@@ -2263,33 +2261,38 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2263
2261
|
[encoder setComputePipelineState:pipeline];
|
2264
2262
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
2265
2263
|
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
2266
|
-
|
2267
|
-
|
2268
|
-
|
2269
|
-
|
2270
|
-
|
2271
|
-
[encoder
|
2272
|
-
[encoder setBytes:&
|
2273
|
-
[encoder setBytes:&
|
2274
|
-
[encoder setBytes:&
|
2275
|
-
[encoder setBytes:&
|
2276
|
-
[encoder setBytes:&
|
2277
|
-
[encoder setBytes:&
|
2278
|
-
[encoder setBytes:&
|
2279
|
-
[encoder setBytes:&
|
2280
|
-
[encoder setBytes:&
|
2281
|
-
[encoder setBytes:&
|
2282
|
-
[encoder setBytes:&
|
2283
|
-
[encoder setBytes:&
|
2284
|
-
[encoder setBytes:&
|
2285
|
-
[encoder setBytes:&
|
2286
|
-
[encoder setBytes:&
|
2287
|
-
[encoder setBytes:&
|
2288
|
-
[encoder setBytes:&
|
2289
|
-
[encoder setBytes:&
|
2290
|
-
[encoder setBytes:&
|
2291
|
-
[encoder setBytes:&
|
2292
|
-
[encoder setBytes:&
|
2264
|
+
if (id_src2 != nil) {
|
2265
|
+
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
|
2266
|
+
} else {
|
2267
|
+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:2];
|
2268
|
+
}
|
2269
|
+
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
2270
|
+
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4];
|
2271
|
+
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
|
2272
|
+
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
|
2273
|
+
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
|
2274
|
+
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8];
|
2275
|
+
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9];
|
2276
|
+
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10];
|
2277
|
+
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11];
|
2278
|
+
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:12];
|
2279
|
+
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13];
|
2280
|
+
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14];
|
2281
|
+
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:15];
|
2282
|
+
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16];
|
2283
|
+
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17];
|
2284
|
+
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18];
|
2285
|
+
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19];
|
2286
|
+
[encoder setBytes:&n_past length:sizeof( int) atIndex:20];
|
2287
|
+
[encoder setBytes:&n_dims length:sizeof( int) atIndex:21];
|
2288
|
+
[encoder setBytes:&mode length:sizeof( int) atIndex:22];
|
2289
|
+
[encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:23];
|
2290
|
+
[encoder setBytes:&freq_base length:sizeof( float) atIndex:24];
|
2291
|
+
[encoder setBytes:&freq_scale length:sizeof( float) atIndex:25];
|
2292
|
+
[encoder setBytes:&ext_factor length:sizeof( float) atIndex:26];
|
2293
|
+
[encoder setBytes:&attn_factor length:sizeof( float) atIndex:27];
|
2294
|
+
[encoder setBytes:&beta_fast length:sizeof( float) atIndex:28];
|
2295
|
+
[encoder setBytes:&beta_slow length:sizeof( float) atIndex:29];
|
2293
2296
|
|
2294
2297
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
2295
2298
|
} break;
|
@@ -2535,11 +2538,6 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
2535
2538
|
GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
|
2536
2539
|
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
|
2537
2540
|
|
2538
|
-
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
|
2539
|
-
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
|
2540
|
-
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
|
2541
|
-
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
|
2542
|
-
|
2543
2541
|
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
|
2544
2542
|
//const int64_t ne31 = src3 ? src3->ne[1] : 0;
|
2545
2543
|
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
|