llama_cpp 0.15.2 → 0.15.3

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -381,10 +381,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
381
381
  // dictionary of preprocessor macros
382
382
  NSMutableDictionary * prep = [NSMutableDictionary dictionary];
383
383
 
384
- #ifdef GGML_QKK_64
385
- prep[@"GGML_QKK_64"] = @(1);
386
- #endif
387
-
388
384
  MTLCompileOptions* options = [MTLCompileOptions new];
389
385
  options.preprocessorMacros = prep;
390
386
 
@@ -927,22 +923,32 @@ static enum ggml_status ggml_metal_graph_compute(
927
923
  const int64_t ne10 = src1 ? src1->ne[0] : 0;
928
924
  const int64_t ne11 = src1 ? src1->ne[1] : 0;
929
925
  const int64_t ne12 = src1 ? src1->ne[2] : 0;
930
- const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
926
+ const int64_t ne13 = src1 ? src1->ne[3] : 0;
931
927
 
932
928
  const uint64_t nb10 = src1 ? src1->nb[0] : 0;
933
929
  const uint64_t nb11 = src1 ? src1->nb[1] : 0;
934
930
  const uint64_t nb12 = src1 ? src1->nb[2] : 0;
935
- const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
931
+ const uint64_t nb13 = src1 ? src1->nb[3] : 0;
932
+
933
+ const int64_t ne20 = src2 ? src2->ne[0] : 0;
934
+ const int64_t ne21 = src2 ? src2->ne[1] : 0;
935
+ const int64_t ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22);
936
+ const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
936
937
 
937
- const int64_t ne0 = dst ? dst->ne[0] : 0;
938
- const int64_t ne1 = dst ? dst->ne[1] : 0;
939
- const int64_t ne2 = dst ? dst->ne[2] : 0;
940
- const int64_t ne3 = dst ? dst->ne[3] : 0;
938
+ const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
939
+ const uint64_t nb21 = src2 ? src2->nb[1] : 0;
940
+ const uint64_t nb22 = src2 ? src2->nb[2] : 0;
941
+ const uint64_t nb23 = src2 ? src2->nb[3] : 0;
941
942
 
942
- const uint64_t nb0 = dst ? dst->nb[0] : 0;
943
- const uint64_t nb1 = dst ? dst->nb[1] : 0;
944
- const uint64_t nb2 = dst ? dst->nb[2] : 0;
945
- const uint64_t nb3 = dst ? dst->nb[3] : 0;
943
+ const int64_t ne0 = dst ? dst->ne[0] : 0;
944
+ const int64_t ne1 = dst ? dst->ne[1] : 0;
945
+ const int64_t ne2 = dst ? dst->ne[2] : 0;
946
+ const int64_t ne3 = dst ? dst->ne[3] : 0;
947
+
948
+ const uint64_t nb0 = dst ? dst->nb[0] : 0;
949
+ const uint64_t nb1 = dst ? dst->nb[1] : 0;
950
+ const uint64_t nb2 = dst ? dst->nb[2] : 0;
951
+ const uint64_t nb3 = dst ? dst->nb[3] : 0;
946
952
 
947
953
  const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
948
954
  const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
@@ -1763,11 +1769,7 @@ static enum ggml_status ggml_metal_graph_compute(
1763
1769
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1764
1770
  }
1765
1771
  else if (src0t == GGML_TYPE_Q3_K) {
1766
- #ifdef GGML_QKK_64
1767
- [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1768
- #else
1769
1772
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1770
- #endif
1771
1773
  }
1772
1774
  else if (src0t == GGML_TYPE_Q5_K) {
1773
1775
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -1785,16 +1787,6 @@ static enum ggml_status ggml_metal_graph_compute(
1785
1787
  const int n_as = src0->ne[2];
1786
1788
 
1787
1789
  // src2 = ids
1788
- const int64_t ne20 = src2->ne[0];
1789
- const int64_t ne21 = src2->ne[1];
1790
- const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22);
1791
- const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23);
1792
-
1793
- const uint64_t nb20 = src2->nb[0]; GGML_UNUSED(nb20);
1794
- const uint64_t nb21 = src2->nb[1];
1795
- const uint64_t nb22 = src2->nb[2]; GGML_UNUSED(nb22);
1796
- const uint64_t nb23 = src2->nb[3]; GGML_UNUSED(nb23);
1797
-
1798
1790
  const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
1799
1791
 
1800
1792
  GGML_ASSERT(src2t == GGML_TYPE_I32);
@@ -2018,12 +2010,7 @@ static enum ggml_status ggml_metal_graph_compute(
2018
2010
  {
2019
2011
  nth0 = 4;
2020
2012
  nth1 = 16;
2021
- #if QK_K == 64
2022
- pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
2023
- #else
2024
2013
  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
2025
- #endif
2026
-
2027
2014
  } break;
2028
2015
  default:
2029
2016
  {
@@ -2088,11 +2075,7 @@ static enum ggml_status ggml_metal_graph_compute(
2088
2075
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
2089
2076
  }
2090
2077
  else if (src0t == GGML_TYPE_Q3_K) {
2091
- #ifdef GGML_QKK_64
2092
- [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
2093
- #else
2094
2078
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
2095
- #endif
2096
2079
  }
2097
2080
  else if (src0t == GGML_TYPE_Q5_K) {
2098
2081
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -2244,7 +2227,13 @@ static enum ggml_status ggml_metal_graph_compute(
2244
2227
  // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
2245
2228
  const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
2246
2229
 
2247
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
2230
+ float freq_base;
2231
+ float freq_scale;
2232
+ float ext_factor;
2233
+ float attn_factor;
2234
+ float beta_fast;
2235
+ float beta_slow;
2236
+
2248
2237
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
2249
2238
  memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
2250
2239
  memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
@@ -2252,6 +2241,15 @@ static enum ggml_status ggml_metal_graph_compute(
2252
2241
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
2253
2242
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
2254
2243
 
2244
+ const bool is_neox = mode & 2;
2245
+ const bool is_glm = mode & 4;
2246
+
2247
+ GGML_ASSERT(!is_glm && "GLM RoPE not implemented in Metal");
2248
+
2249
+ if (!is_neox) {
2250
+ GGML_ASSERT(id_src2 == nil && "TODO: freq_factors not implemented for !is_neox");
2251
+ }
2252
+
2255
2253
  id<MTLComputePipelineState> pipeline = nil;
2256
2254
 
2257
2255
  switch (src0->type) {
@@ -2263,33 +2261,38 @@ static enum ggml_status ggml_metal_graph_compute(
2263
2261
  [encoder setComputePipelineState:pipeline];
2264
2262
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
2265
2263
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
2266
- [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
2267
- [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
2268
- [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
2269
- [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
2270
- [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
2271
- [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
2272
- [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
2273
- [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
2274
- [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
2275
- [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
2276
- [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
2277
- [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
2278
- [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
2279
- [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
2280
- [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
2281
- [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
2282
- [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
2283
- [encoder setBytes:&n_past length:sizeof( int) atIndex:19];
2284
- [encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
2285
- [encoder setBytes:&mode length:sizeof( int) atIndex:21];
2286
- [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22];
2287
- [encoder setBytes:&freq_base length:sizeof( float) atIndex:23];
2288
- [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24];
2289
- [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25];
2290
- [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26];
2291
- [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27];
2292
- [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28];
2264
+ if (id_src2 != nil) {
2265
+ [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
2266
+ } else {
2267
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2];
2268
+ }
2269
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
2270
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4];
2271
+ [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
2272
+ [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
2273
+ [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
2274
+ [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8];
2275
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9];
2276
+ [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10];
2277
+ [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11];
2278
+ [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:12];
2279
+ [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13];
2280
+ [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14];
2281
+ [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:15];
2282
+ [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16];
2283
+ [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17];
2284
+ [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18];
2285
+ [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19];
2286
+ [encoder setBytes:&n_past length:sizeof( int) atIndex:20];
2287
+ [encoder setBytes:&n_dims length:sizeof( int) atIndex:21];
2288
+ [encoder setBytes:&mode length:sizeof( int) atIndex:22];
2289
+ [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:23];
2290
+ [encoder setBytes:&freq_base length:sizeof( float) atIndex:24];
2291
+ [encoder setBytes:&freq_scale length:sizeof( float) atIndex:25];
2292
+ [encoder setBytes:&ext_factor length:sizeof( float) atIndex:26];
2293
+ [encoder setBytes:&attn_factor length:sizeof( float) atIndex:27];
2294
+ [encoder setBytes:&beta_fast length:sizeof( float) atIndex:28];
2295
+ [encoder setBytes:&beta_slow length:sizeof( float) atIndex:29];
2293
2296
 
2294
2297
  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2295
2298
  } break;
@@ -2535,11 +2538,6 @@ static enum ggml_status ggml_metal_graph_compute(
2535
2538
  GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
2536
2539
  "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
2537
2540
 
2538
- const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
2539
- const uint64_t nb21 = src2 ? src2->nb[1] : 0;
2540
- const uint64_t nb22 = src2 ? src2->nb[2] : 0;
2541
- const uint64_t nb23 = src2 ? src2->nb[3] : 0;
2542
-
2543
2541
  const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
2544
2542
  //const int64_t ne31 = src3 ? src3->ne[1] : 0;
2545
2543
  const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);