llama_cpp 0.2.1 → 0.3.0

@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
 // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+//   that it is guaranteed that the tensor will fit in at least one of the views
 //
 bool ggml_metal_add_buffer(
         struct ggml_metal_context * ctx,
         const char * name,
         void * data,
-        size_t size);
+        size_t size,
+        size_t max_size);
 
 // set data from host memory into the device
 void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
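
Note on the new signature: callers now pass both the total number of bytes in the mapped region (size) and the size of the largest tensor stored in it (max_size). A minimal C sketch, with a purely illustrative tensor_sizes array, of how a hypothetical caller might derive the two values before calling ggml_metal_add_buffer:

    #include <stddef.h>
    #include <stdio.h>

    /* hypothetical sizes (in bytes) of the tensors that live in one mapped region */
    static const size_t tensor_sizes[] = { 4096, 131072, 16777216, 524288 };

    int main(void) {
        size_t size     = 0; /* total bytes to map             */
        size_t max_size = 0; /* largest single tensor in bytes */

        for (size_t i = 0; i < sizeof(tensor_sizes)/sizeof(tensor_sizes[0]); ++i) {
            size += tensor_sizes[i];
            if (tensor_sizes[i] > max_size) {
                max_size = tensor_sizes[i];
            }
        }

        /* these two values would be passed as the last arguments of
           ggml_metal_add_buffer(ctx, name, data, size, max_size) */
        printf("size = %zu, max_size = %zu\n", size, max_size);
        return 0;
    }
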
@@ -51,23 +51,26 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_f16);
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);
-    GGML_METAL_DECL_KERNEL(get_rows_q2_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q3_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q4_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q5_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q6_k);
+    GGML_METAL_DECL_KERNEL(get_rows_q2_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q3_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q4_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q5_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
     GGML_METAL_DECL_KERNEL(rms_norm);
+    GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
     GGML_METAL_DECL_KERNEL(rope);
+    GGML_METAL_DECL_KERNEL(alibi_f32);
     GGML_METAL_DECL_KERNEL(cpy_f32_f16);
     GGML_METAL_DECL_KERNEL(cpy_f32_f32);
+    GGML_METAL_DECL_KERNEL(cpy_f16_f16);
 
 #undef GGML_METAL_DECL_KERNEL
 };
@@ -129,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
             exit(1);
         }
 
+#ifdef GGML_QKK_64
+        MTLCompileOptions * options = [MTLCompileOptions new];
+        options.preprocessorMacros = @{ @"QK_K" : @(64) };
+        ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+#else
         ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
+#endif
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
             exit(1);
@@ -156,27 +165,38 @@ struct ggml_metal_context * ggml_metal_init(void) {
         GGML_METAL_ADD_KERNEL(get_rows_f16);
         GGML_METAL_ADD_KERNEL(get_rows_q4_0);
         GGML_METAL_ADD_KERNEL(get_rows_q4_1);
-        GGML_METAL_ADD_KERNEL(get_rows_q2_k);
-        GGML_METAL_ADD_KERNEL(get_rows_q3_k);
-        GGML_METAL_ADD_KERNEL(get_rows_q4_k);
-        GGML_METAL_ADD_KERNEL(get_rows_q5_k);
-        GGML_METAL_ADD_KERNEL(get_rows_q6_k);
+        GGML_METAL_ADD_KERNEL(get_rows_q2_K);
+        GGML_METAL_ADD_KERNEL(get_rows_q3_K);
+        GGML_METAL_ADD_KERNEL(get_rows_q4_K);
+        GGML_METAL_ADD_KERNEL(get_rows_q5_K);
+        GGML_METAL_ADD_KERNEL(get_rows_q6_K);
         GGML_METAL_ADD_KERNEL(rms_norm);
+        GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
         GGML_METAL_ADD_KERNEL(rope);
+        GGML_METAL_ADD_KERNEL(alibi_f32);
         GGML_METAL_ADD_KERNEL(cpy_f32_f16);
         GGML_METAL_ADD_KERNEL(cpy_f32_f32);
+        GGML_METAL_ADD_KERNEL(cpy_f16_f16);
 
 #undef GGML_METAL_ADD_KERNEL
     }
 
+    fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    if (ctx->device.maxTransferRate != 0) {
+        fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+    } else {
+        fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
+    }
+
     return ctx;
 }
 
@@ -193,10 +213,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
     //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
+    const int64_t tsize = ggml_nbytes(t);
+
+    // find the view that contains the tensor fully
     for (int i = 0; i < ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
 
-        if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
             *offs = (size_t) ioffs;
 
             //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
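
The stricter containment check matters once a buffer is split into overlapping views: a tensor that merely starts inside a view may still spill past its end. A standalone C sketch with made-up sizes showing why ioffs + tsize <= size is required where the old ioffs < size test would pass:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* illustrative numbers only: a 1 GiB view and a 16 MiB tensor
           that starts 8 MiB before the end of the view */
        const int64_t view_size = 1024ll * 1024 * 1024;
        const int64_t tsize     =   16ll * 1024 * 1024;
        const int64_t ioffs     = view_size - 8ll * 1024 * 1024;

        /* old check: accepts the view even though the tensor spills past its end */
        printf("old: %d\n", ioffs >= 0 && ioffs < view_size);

        /* new check: rejects this view, so the next (overlapping) view is used instead */
        printf("new: %d\n", ioffs >= 0 && ioffs + tsize <= view_size);

        return 0;
    }
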
@@ -214,7 +237,8 @@ bool ggml_metal_add_buffer(
         struct ggml_metal_context * ctx,
         const char * name,
         void * data,
-        size_t size) {
+        size_t size,
+        size_t max_size) {
     if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
         fprintf(stderr, "%s: too many buffers\n", __func__);
         return false;
@@ -231,30 +255,68 @@ bool ggml_metal_add_buffer(
         }
     }
 
-    size_t page_size = getpagesize();
-    size_t aligned_size = size;
-    if ((aligned_size % page_size) != 0) {
-        aligned_size += (page_size - (aligned_size % page_size));
+    const size_t size_page = getpagesize();
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    ctx->buffers[ctx->n_buffers].name = name;
-    ctx->buffers[ctx->n_buffers].data = data;
-    ctx->buffers[ctx->n_buffers].size = size;
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= ctx->device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].name = name;
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
 
-    if (ctx->device.maxBufferLength < aligned_size) {
-        fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
-        return false;
-    }
-    ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
+        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+            return false;
+        }
 
-    if (ctx->buffers[ctx->n_buffers].metal == nil) {
-        fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
-        return false;
+        fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+
+        ++ctx->n_buffers;
     } else {
-        fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
+        const size_t size_view = ctx->device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].name = name;
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+            if (i + size_step < size) {
+                fprintf(stderr, "\n");
+            }
+
+            ++ctx->n_buffers;
+        }
     }
 
-    ++ctx->n_buffers;
+    fprintf(stderr, ", (%8.2f / %8.2f)",
+            ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
+            ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
+        fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
+    } else {
+        fprintf(stderr, "\n");
+    }
     }
 
     return true;
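
The splitting branch maps the region as a series of views of maxBufferLength bytes that advance by size_step = maxBufferLength - size_ovlp, so consecutive views overlap by at least max_size rounded up to whole pages, and any tensor no larger than max_size is fully contained in at least one view. A standalone C sketch of the same arithmetic with invented sizes (a 40 GiB region, a 32 GiB device limit standing in for ctx->device.maxBufferLength, 16 KiB pages, and a 512 MiB largest tensor; none of these numbers come from the diff):

    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
        const size_t GiB       = 1024ull * 1024 * 1024;
        const size_t size      = 40 * GiB;            /* total mapped region        */
        const size_t max_buf   = 32 * GiB;            /* device buffer length limit */
        const size_t size_page = 16384;               /* page size                  */
        const size_t max_size  = 512 * 1024 * 1024;   /* largest tensor             */

        /* round the overlap up to whole pages, with one extra page of slack */
        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page;
        const size_t size_step = max_buf - size_ovlp;
        const size_t size_view = max_buf;

        for (size_t i = 0; i < size; i += size_step) {
            const size_t len = (i + size_view <= size) ? size_view : (size - i);
            printf("view: offs = %zu, len = %zu\n", i, len);
        }
        return 0;
    }
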
@@ -606,7 +668,7 @@ void ggml_metal_graph_compute(
 
                             nth0 = 4;
                             nth1 = 16;
-                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
                         } break;
                     case GGML_TYPE_Q3_K:
                         {
@@ -615,7 +677,7 @@ void ggml_metal_graph_compute(
 
                             nth0 = 4;
                             nth1 = 16;
-                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
                         } break;
                     case GGML_TYPE_Q4_K:
                         {
@@ -624,7 +686,7 @@ void ggml_metal_graph_compute(
 
                             nth0 = 4;
                             nth1 = 16;
-                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                         } break;
                     case GGML_TYPE_Q5_K:
                         {
@@ -633,7 +695,7 @@ void ggml_metal_graph_compute(
 
                             nth0 = 4;
                             nth1 = 16;
-                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                         } break;
                     case GGML_TYPE_Q6_K:
                         {
@@ -642,7 +704,7 @@ void ggml_metal_graph_compute(
 
                             nth0 = 4;
                             nth1 = 16;
-                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                         } break;
                     default:
                         {
@@ -694,11 +756,11 @@ void ggml_metal_graph_compute(
                         case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16];  break;
                         case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                         case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
-                        case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
-                        case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
-                        case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
-                        case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
-                        case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
+                        case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
+                        case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
+                        case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
+                        case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
+                        case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
                         default: GGML_ASSERT(false && "not implemented");
                     }
 
@@ -735,6 +797,70 @@ void ggml_metal_graph_compute(
 
                         [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                     } break;
+                case GGML_OP_NORM:
+                    {
+                        if (encoder == nil) {
+                            encoder = [command_buffer computeCommandEncoder];
+                        }
+
+                        const float eps = 1e-5f;
+
+                        const int nth = 256;
+
+                        [encoder setComputePipelineState:ctx->pipeline_norm];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
+                        [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
+                        [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+
+                        const int64_t nrows = ggml_nrows(src0);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_ALIBI:
+                    {
+                        if (encoder == nil) {
+                            encoder = [command_buffer computeCommandEncoder];
+                        }
+
+                        GGML_ASSERT((src0t == GGML_TYPE_F32));
+
+                        const int   n_past   = ((int32_t *) src1->data)[0]; UNUSED(n_past);
+                        const int   n_head   = ((int32_t *) src1->data)[1];
+                        const float max_bias = ((float *)   src1->data)[2];
+
+                        if (__builtin_popcount(n_head) != 1) {
+                            GGML_ASSERT(false && "only power-of-two n_head implemented");
+                        }
+
+                        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+                        const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+
+                        [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
+                        [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
+                        [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
+                        [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
+                        [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
+                        [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
+                        [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
+                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
+                        [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
+                        [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
+                        [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];
+                        const int nth = 32;
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
                 case GGML_OP_ROPE:
                     {
                         if (encoder == nil) {
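
The ALiBi case uploads only m0 = 2^(-max_bias / n_heads_log2_floor) to the kernel and asserts a power-of-two n_head. A small C sketch of the slope computation with invented values (8 heads, max_bias = 8.0f), assuming the usual ALiBi scheme in which head h gets slope m0^(h+1):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        /* illustrative values, not taken from the diff */
        const int   n_head   = 8;
        const float max_bias = 8.0f;

        const int   n_heads_log2_floor = 1 << (int) floor(log2(n_head));
        const float m0 = powf(2.0f, -max_bias / n_heads_log2_floor);

        /* per-head slopes, assuming the standard ALiBi assignment for power-of-two head counts */
        for (int h = 0; h < n_head; ++h) {
            printf("head %d: slope = %f\n", h, powf(m0, h + 1));
        }
        return 0;
    }
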
@@ -788,6 +914,14 @@ void ggml_metal_graph_compute(
                             default: GGML_ASSERT(false && "not implemented");
                         };
                     } break;
+                case GGML_TYPE_F16:
+                    {
+                        switch (dstt) {
+                            case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
+                            case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break;
+                            default: GGML_ASSERT(false && "not implemented");
+                        };
+                    } break;
                 default: GGML_ASSERT(false && "not implemented");
             }
 
@@ -831,4 +965,14 @@ void ggml_metal_graph_compute(
     dispatch_barrier_sync(queue, ^{});
 
     [command_buffers[n_cb - 1] waitUntilCompleted];
+
+    // check status of command buffers
+    // needed to detect if the device ran out-of-memory for example (#1881)
+    for (int i = 0; i < n_cb; i++) {
+        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
+        if (status != MTLCommandBufferStatusCompleted) {
+            fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
+            GGML_ASSERT(false);
+        }
+    }
 }