llama_cpp 0.11.1 → 0.12.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -87,7 +87,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
87
87
 
88
88
  // same as ggml_graph_compute but uses Metal
89
89
  // creates gf->n_threads command buffers in parallel
90
- void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
90
+ bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
91
91
 
92
92
  //
93
93
  // backend API
@@ -88,6 +88,8 @@ struct ggml_metal_context {
88
88
  GGML_METAL_DECL_KERNEL(get_rows_q5_K);
89
89
  GGML_METAL_DECL_KERNEL(get_rows_q6_K);
90
90
  GGML_METAL_DECL_KERNEL(get_rows_i32);
91
+ GGML_METAL_DECL_KERNEL(get_rows_iq2_xxs);
92
+ GGML_METAL_DECL_KERNEL(get_rows_iq2_xs);
91
93
  GGML_METAL_DECL_KERNEL(rms_norm);
92
94
  GGML_METAL_DECL_KERNEL(group_norm);
93
95
  GGML_METAL_DECL_KERNEL(norm);
@@ -106,6 +108,8 @@ struct ggml_metal_context {
106
108
  GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
107
109
  GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
108
110
  GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
111
+ GGML_METAL_DECL_KERNEL(mul_mv_iq2_xxs_f32);
112
+ GGML_METAL_DECL_KERNEL(mul_mv_iq2_xs_f32);
109
113
  GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32);
110
114
  //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16);
111
115
  GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32);
@@ -121,6 +125,8 @@ struct ggml_metal_context {
121
125
  GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32);
122
126
  GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32);
123
127
  GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32);
128
+ GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xxs_f32);
129
+ GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xs_f32);
124
130
  GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
125
131
  GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
126
132
  GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
@@ -133,6 +139,8 @@ struct ggml_metal_context {
133
139
  GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
134
140
  GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
135
141
  GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
142
+ GGML_METAL_DECL_KERNEL(mul_mm_iq2_xxs_f32);
143
+ GGML_METAL_DECL_KERNEL(mul_mm_iq2_xs_f32);
136
144
  GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32);
137
145
  GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32);
138
146
  GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32);
@@ -145,6 +153,8 @@ struct ggml_metal_context {
145
153
  GGML_METAL_DECL_KERNEL(mul_mm_id_q4_K_f32);
146
154
  GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32);
147
155
  GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32);
156
+ GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xxs_f32);
157
+ GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xs_f32);
148
158
  GGML_METAL_DECL_KERNEL(rope_f32);
149
159
  GGML_METAL_DECL_KERNEL(rope_f16);
150
160
  GGML_METAL_DECL_KERNEL(alibi_f32);
@@ -258,14 +268,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
258
268
  bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
259
269
  #endif
260
270
  NSError * error = nil;
261
- NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
271
+ NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
262
272
  if (libPath != nil) {
263
273
  // pre-compiled library found
264
274
  NSURL * libURL = [NSURL fileURLWithPath:libPath];
265
275
  GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
266
276
  ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
267
277
  } else {
268
- GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);
278
+ GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
269
279
 
270
280
  NSString * sourcePath;
271
281
  NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
@@ -295,7 +305,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
295
305
  #endif
296
306
  // try to disable fast-math
297
307
  // NOTE: this seems to have no effect whatsoever
298
- // instead, in order to disable fast-math, we have to build ggml.metallib from the command line
308
+ // instead, in order to disable fast-math, we have to build default.metallib from the command line
299
309
  // using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
300
310
  // and go through the "pre-compiled library found" path above
301
311
  //[options setFastMathEnabled:false];
@@ -379,6 +389,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
379
389
  GGML_METAL_ADD_KERNEL(get_rows_q5_K);
380
390
  GGML_METAL_ADD_KERNEL(get_rows_q6_K);
381
391
  GGML_METAL_ADD_KERNEL(get_rows_i32);
392
+ GGML_METAL_ADD_KERNEL(get_rows_iq2_xxs);
393
+ GGML_METAL_ADD_KERNEL(get_rows_iq2_xs);
382
394
  GGML_METAL_ADD_KERNEL(rms_norm);
383
395
  GGML_METAL_ADD_KERNEL(group_norm);
384
396
  GGML_METAL_ADD_KERNEL(norm);
@@ -397,6 +409,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
397
409
  GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
398
410
  GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
399
411
  GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
412
+ GGML_METAL_ADD_KERNEL(mul_mv_iq2_xxs_f32);
413
+ GGML_METAL_ADD_KERNEL(mul_mv_iq2_xs_f32);
400
414
  GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32);
401
415
  //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16);
402
416
  GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32);
@@ -412,6 +426,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
412
426
  GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32);
413
427
  GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32);
414
428
  GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32);
429
+ GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xxs_f32);
430
+ GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xs_f32);
415
431
  if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
416
432
  GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
417
433
  GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
@@ -425,6 +441,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
425
441
  GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
426
442
  GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
427
443
  GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
444
+ GGML_METAL_ADD_KERNEL(mul_mm_iq2_xxs_f32);
445
+ GGML_METAL_ADD_KERNEL(mul_mm_iq2_xs_f32);
428
446
  GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32);
429
447
  GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32);
430
448
  GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32);
@@ -437,6 +455,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
437
455
  GGML_METAL_ADD_KERNEL(mul_mm_id_q4_K_f32);
438
456
  GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32);
439
457
  GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32);
458
+ GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xxs_f32);
459
+ GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xs_f32);
440
460
  }
441
461
  GGML_METAL_ADD_KERNEL(rope_f32);
442
462
  GGML_METAL_ADD_KERNEL(rope_f16);
@@ -502,6 +522,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
502
522
  GGML_METAL_DEL_KERNEL(get_rows_q5_K);
503
523
  GGML_METAL_DEL_KERNEL(get_rows_q6_K);
504
524
  GGML_METAL_DEL_KERNEL(get_rows_i32);
525
+ GGML_METAL_DEL_KERNEL(get_rows_iq2_xxs);
526
+ GGML_METAL_DEL_KERNEL(get_rows_iq2_xs);
505
527
  GGML_METAL_DEL_KERNEL(rms_norm);
506
528
  GGML_METAL_DEL_KERNEL(group_norm);
507
529
  GGML_METAL_DEL_KERNEL(norm);
@@ -520,6 +542,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
520
542
  GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
521
543
  GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
522
544
  GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
545
+ GGML_METAL_DEL_KERNEL(mul_mv_iq2_xxs_f32);
546
+ GGML_METAL_DEL_KERNEL(mul_mv_iq2_xs_f32);
523
547
  GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32);
524
548
  //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16);
525
549
  GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32);
@@ -535,6 +559,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
535
559
  GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32);
536
560
  GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32);
537
561
  GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32);
562
+ GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xxs_f32);
563
+ GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xs_f32);
538
564
  if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
539
565
  GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
540
566
  GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
@@ -548,6 +574,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
548
574
  GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
549
575
  GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
550
576
  GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
577
+ GGML_METAL_DEL_KERNEL(mul_mm_iq2_xxs_f32);
578
+ GGML_METAL_DEL_KERNEL(mul_mm_iq2_xs_f32);
551
579
  GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32);
552
580
  GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32);
553
581
  GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32);
@@ -560,6 +588,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
560
588
  GGML_METAL_DEL_KERNEL(mul_mm_id_q4_K_f32);
561
589
  GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32);
562
590
  GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32);
591
+ GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xxs_f32);
592
+ GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xs_f32);
563
593
  }
564
594
  GGML_METAL_DEL_KERNEL(rope_f32);
565
595
  GGML_METAL_DEL_KERNEL(rope_f16);
@@ -977,7 +1007,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
977
1007
  return false;
978
1008
  }
979
1009
  }
980
- void ggml_metal_graph_compute(
1010
+ bool ggml_metal_graph_compute(
981
1011
  struct ggml_metal_context * ctx,
982
1012
  struct ggml_cgraph * gf) {
983
1013
  @autoreleasepool {
@@ -1052,6 +1082,10 @@ void ggml_metal_graph_compute(
1052
1082
  GGML_ASSERT(!"unsupported op");
1053
1083
  }
1054
1084
 
1085
+ #ifndef GGML_METAL_NDEBUG
1086
+ [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
1087
+ #endif
1088
+
1055
1089
  const int64_t ne00 = src0 ? src0->ne[0] : 0;
1056
1090
  const int64_t ne01 = src0 ? src0->ne[1] : 0;
1057
1091
  const int64_t ne02 = src0 ? src0->ne[2] : 0;
@@ -1541,6 +1575,8 @@ void ggml_metal_graph_compute(
1541
1575
  case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
1542
1576
  case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
1543
1577
  case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
1578
+ case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xxs_f32]; break;
1579
+ case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xs_f32]; break;
1544
1580
  default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
1545
1581
  }
1546
1582
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1653,6 +1689,18 @@ void ggml_metal_graph_compute(
1653
1689
  nth1 = 32;
1654
1690
  [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
1655
1691
  } break;
1692
+ case GGML_TYPE_IQ2_XXS:
1693
+ {
1694
+ nth0 = 4;
1695
+ nth1 = 16;
1696
+ [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xxs_f32];
1697
+ } break;
1698
+ case GGML_TYPE_IQ2_XS:
1699
+ {
1700
+ nth0 = 4;
1701
+ nth1 = 16;
1702
+ [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xs_f32];
1703
+ } break;
1656
1704
  default:
1657
1705
  {
1658
1706
  GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
@@ -1689,6 +1737,11 @@ void ggml_metal_graph_compute(
1689
1737
  src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
1690
1738
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1691
1739
  }
1740
+ else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
1741
+ const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
1742
+ [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
1743
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1744
+ }
1692
1745
  else if (src0t == GGML_TYPE_Q4_K) {
1693
1746
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1694
1747
  }
@@ -1778,6 +1831,8 @@ void ggml_metal_graph_compute(
1778
1831
  case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_K_f32]; break;
1779
1832
  case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break;
1780
1833
  case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break;
1834
+ case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xxs_f32]; break;
1835
+ case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xs_f32]; break;
1781
1836
  default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
1782
1837
  }
1783
1838
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1893,6 +1948,18 @@ void ggml_metal_graph_compute(
1893
1948
  nth1 = 32;
1894
1949
  [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32];
1895
1950
  } break;
1951
+ case GGML_TYPE_IQ2_XXS:
1952
+ {
1953
+ nth0 = 4;
1954
+ nth1 = 16;
1955
+ [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xxs_f32];
1956
+ } break;
1957
+ case GGML_TYPE_IQ2_XS:
1958
+ {
1959
+ nth0 = 4;
1960
+ nth1 = 16;
1961
+ [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xs_f32];
1962
+ } break;
1896
1963
  default:
1897
1964
  {
1898
1965
  GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
@@ -1945,6 +2012,11 @@ void ggml_metal_graph_compute(
1945
2012
  src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
1946
2013
  [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1947
2014
  }
2015
+ else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
2016
+ const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
2017
+ [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
2018
+ [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
2019
+ }
1948
2020
  else if (src2t == GGML_TYPE_Q4_K) {
1949
2021
  [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1950
2022
  }
@@ -1982,6 +2054,8 @@ void ggml_metal_graph_compute(
1982
2054
  case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
1983
2055
  case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
1984
2056
  case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
2057
+ case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xxs]; break;
2058
+ case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xs]; break;
1985
2059
  default: GGML_ASSERT(false && "not implemented");
1986
2060
  }
1987
2061
 
@@ -2383,6 +2457,10 @@ void ggml_metal_graph_compute(
2383
2457
  GGML_ASSERT(false);
2384
2458
  }
2385
2459
  }
2460
+
2461
+ #ifndef GGML_METAL_NDEBUG
2462
+ [encoder popDebugGroup];
2463
+ #endif
2386
2464
  }
2387
2465
 
2388
2466
  if (encoder != nil) {
@@ -2405,10 +2483,11 @@ void ggml_metal_graph_compute(
2405
2483
  MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
2406
2484
  if (status != MTLCommandBufferStatusCompleted) {
2407
2485
  GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
2408
- GGML_ASSERT(false);
2486
+ return false;
2409
2487
  }
2410
2488
  }
2411
2489
 
2490
+ return true;
2412
2491
  }
2413
2492
  }
2414
2493
 
@@ -2688,10 +2767,10 @@ static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggm
2688
2767
  UNUSED(backend);
2689
2768
  }
2690
2769
 
2691
- static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
2770
+ static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
2692
2771
  struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
2693
2772
 
2694
- ggml_metal_graph_compute(metal_ctx, cgraph);
2773
+ return ggml_metal_graph_compute(metal_ctx, cgraph);
2695
2774
  }
2696
2775
 
2697
2776
  static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {