llama_cpp 0.15.2 → 0.15.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -381,10 +381,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
381
381
  // dictionary of preprocessor macros
382
382
  NSMutableDictionary * prep = [NSMutableDictionary dictionary];
383
383
 
384
- #ifdef GGML_QKK_64
385
- prep[@"GGML_QKK_64"] = @(1);
386
- #endif
387
-
388
384
  MTLCompileOptions* options = [MTLCompileOptions new];
389
385
  options.preprocessorMacros = prep;
390
386
 
@@ -927,22 +923,32 @@ static enum ggml_status ggml_metal_graph_compute(
927
923
  const int64_t ne10 = src1 ? src1->ne[0] : 0;
928
924
  const int64_t ne11 = src1 ? src1->ne[1] : 0;
929
925
  const int64_t ne12 = src1 ? src1->ne[2] : 0;
930
- const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
926
+ const int64_t ne13 = src1 ? src1->ne[3] : 0;
931
927
 
932
928
  const uint64_t nb10 = src1 ? src1->nb[0] : 0;
933
929
  const uint64_t nb11 = src1 ? src1->nb[1] : 0;
934
930
  const uint64_t nb12 = src1 ? src1->nb[2] : 0;
935
- const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
931
+ const uint64_t nb13 = src1 ? src1->nb[3] : 0;
932
+
933
+ const int64_t ne20 = src2 ? src2->ne[0] : 0;
934
+ const int64_t ne21 = src2 ? src2->ne[1] : 0;
935
+ const int64_t ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22);
936
+ const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
936
937
 
937
- const int64_t ne0 = dst ? dst->ne[0] : 0;
938
- const int64_t ne1 = dst ? dst->ne[1] : 0;
939
- const int64_t ne2 = dst ? dst->ne[2] : 0;
940
- const int64_t ne3 = dst ? dst->ne[3] : 0;
938
+ const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
939
+ const uint64_t nb21 = src2 ? src2->nb[1] : 0;
940
+ const uint64_t nb22 = src2 ? src2->nb[2] : 0;
941
+ const uint64_t nb23 = src2 ? src2->nb[3] : 0;
941
942
 
942
- const uint64_t nb0 = dst ? dst->nb[0] : 0;
943
- const uint64_t nb1 = dst ? dst->nb[1] : 0;
944
- const uint64_t nb2 = dst ? dst->nb[2] : 0;
945
- const uint64_t nb3 = dst ? dst->nb[3] : 0;
943
+ const int64_t ne0 = dst ? dst->ne[0] : 0;
944
+ const int64_t ne1 = dst ? dst->ne[1] : 0;
945
+ const int64_t ne2 = dst ? dst->ne[2] : 0;
946
+ const int64_t ne3 = dst ? dst->ne[3] : 0;
947
+
948
+ const uint64_t nb0 = dst ? dst->nb[0] : 0;
949
+ const uint64_t nb1 = dst ? dst->nb[1] : 0;
950
+ const uint64_t nb2 = dst ? dst->nb[2] : 0;
951
+ const uint64_t nb3 = dst ? dst->nb[3] : 0;
946
952
 
947
953
  const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
948
954
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
@@ -1763,11 +1769,7 @@ static enum ggml_status ggml_metal_graph_compute(
1763
1769
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1764
1770
  }
1765
1771
  else if (src0t == GGML_TYPE_Q3_K) {
1766
- #ifdef GGML_QKK_64
1767
- [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1768
- #else
1769
1772
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1770
- #endif
1771
1773
  }
1772
1774
  else if (src0t == GGML_TYPE_Q5_K) {
1773
1775
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -1785,16 +1787,6 @@ static enum ggml_status ggml_metal_graph_compute(
1785
1787
  const int n_as = src0->ne[2];
1786
1788
 
1787
1789
  // src2 = ids
1788
- const int64_t ne20 = src2->ne[0];
1789
- const int64_t ne21 = src2->ne[1];
1790
- const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22);
1791
- const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23);
1792
-
1793
- const uint64_t nb20 = src2->nb[0]; GGML_UNUSED(nb20);
1794
- const uint64_t nb21 = src2->nb[1];
1795
- const uint64_t nb22 = src2->nb[2]; GGML_UNUSED(nb22);
1796
- const uint64_t nb23 = src2->nb[3]; GGML_UNUSED(nb23);
1797
-
1798
1790
  const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
1799
1791
 
1800
1792
  GGML_ASSERT(src2t == GGML_TYPE_I32);
@@ -2018,12 +2010,7 @@ static enum ggml_status ggml_metal_graph_compute(
2018
2010
  {
2019
2011
  nth0 = 4;
2020
2012
  nth1 = 16;
2021
- #if QK_K == 64
2022
- pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
2023
- #else
2024
2013
  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
2025
- #endif
2026
-
2027
2014
  } break;
2028
2015
  default:
2029
2016
  {
@@ -2088,11 +2075,7 @@ static enum ggml_status ggml_metal_graph_compute(
2088
2075
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
2089
2076
  }
2090
2077
  else if (src0t == GGML_TYPE_Q3_K) {
2091
- #ifdef GGML_QKK_64
2092
- [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
2093
- #else
2094
2078
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
2095
- #endif
2096
2079
  }
2097
2080
  else if (src0t == GGML_TYPE_Q5_K) {
2098
2081
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -2244,7 +2227,13 @@ static enum ggml_status ggml_metal_graph_compute(
2244
2227
  // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
2245
2228
  const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
2246
2229
 
2247
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
2230
+ float freq_base;
2231
+ float freq_scale;
2232
+ float ext_factor;
2233
+ float attn_factor;
2234
+ float beta_fast;
2235
+ float beta_slow;
2236
+
2248
2237
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
2249
2238
  memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
2250
2239
  memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
@@ -2252,6 +2241,15 @@ static enum ggml_status ggml_metal_graph_compute(
2252
2241
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
2253
2242
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
2254
2243
 
2244
+ const bool is_neox = mode & 2;
2245
+ const bool is_glm = mode & 4;
2246
+
2247
+ GGML_ASSERT(!is_glm && "GLM RoPE not implemented in Metal");
2248
+
2249
+ if (!is_neox) {
2250
+ GGML_ASSERT(id_src2 == nil && "TODO: freq_factors not implemented for !is_neox");
2251
+ }
2252
+
2255
2253
  id<MTLComputePipelineState> pipeline = nil;
2256
2254
 
2257
2255
  switch (src0->type) {
@@ -2263,33 +2261,38 @@ static enum ggml_status ggml_metal_graph_compute(
2263
2261
  [encoder setComputePipelineState:pipeline];
2264
2262
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
2265
2263
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
2266
- [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
2267
- [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
2268
- [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
2269
- [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
2270
- [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
2271
- [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
2272
- [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
2273
- [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
2274
- [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
2275
- [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
2276
- [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
2277
- [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
2278
- [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
2279
- [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
2280
- [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
2281
- [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
2282
- [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
2283
- [encoder setBytes:&n_past length:sizeof( int) atIndex:19];
2284
- [encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
2285
- [encoder setBytes:&mode length:sizeof( int) atIndex:21];
2286
- [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22];
2287
- [encoder setBytes:&freq_base length:sizeof( float) atIndex:23];
2288
- [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24];
2289
- [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25];
2290
- [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26];
2291
- [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27];
2292
- [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28];
2264
+ if (id_src2 != nil) {
2265
+ [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
2266
+ } else {
2267
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2];
2268
+ }
2269
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
2270
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4];
2271
+ [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
2272
+ [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
2273
+ [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
2274
+ [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8];
2275
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9];
2276
+ [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10];
2277
+ [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11];
2278
+ [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:12];
2279
+ [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13];
2280
+ [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14];
2281
+ [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:15];
2282
+ [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16];
2283
+ [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17];
2284
+ [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18];
2285
+ [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19];
2286
+ [encoder setBytes:&n_past length:sizeof( int) atIndex:20];
2287
+ [encoder setBytes:&n_dims length:sizeof( int) atIndex:21];
2288
+ [encoder setBytes:&mode length:sizeof( int) atIndex:22];
2289
+ [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:23];
2290
+ [encoder setBytes:&freq_base length:sizeof( float) atIndex:24];
2291
+ [encoder setBytes:&freq_scale length:sizeof( float) atIndex:25];
2292
+ [encoder setBytes:&ext_factor length:sizeof( float) atIndex:26];
2293
+ [encoder setBytes:&attn_factor length:sizeof( float) atIndex:27];
2294
+ [encoder setBytes:&beta_fast length:sizeof( float) atIndex:28];
2295
+ [encoder setBytes:&beta_slow length:sizeof( float) atIndex:29];
2293
2296
 
2294
2297
  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2295
2298
  } break;
@@ -2535,11 +2538,6 @@ static enum ggml_status ggml_metal_graph_compute(
2535
2538
  GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
2536
2539
  "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
2537
2540
 
2538
- const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
2539
- const uint64_t nb21 = src2 ? src2->nb[1] : 0;
2540
- const uint64_t nb22 = src2 ? src2->nb[2] : 0;
2541
- const uint64_t nb23 = src2 ? src2->nb[3] : 0;
2542
-
2543
2541
  const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
2544
2542
  //const int64_t ne31 = src3 ? src3->ne[1] : 0;
2545
2543
  const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);