llama_cpp 0.2.1 → 0.3.0

This diff shows the changes between the two published versions of the llama_cpp crate as they appear in the public registry, and is provided for informational purposes only. All hunks below fall in the vendored llama.cpp Metal backend: the first touches the public declaration in ggml-metal.h, the rest the implementation in ggml-metal.m.
@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
 // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+//   that it is guaranteed that the tensor will fit in at least one of the views
 //
 bool ggml_metal_add_buffer(
         struct ggml_metal_context * ctx,
                        const char * name,
                              void * data,
-                             size_t size);
+                             size_t size,
+                             size_t max_size);

 // set data from host memory into the device
 void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
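
This is a breaking change to the public header: every caller of ggml_metal_add_buffer must now also pass the size of the largest tensor stored in the buffer. A minimal caller sketch, with assumed names (the wrapper and the "data" label are illustrative, not from the diff):

    // hypothetical helper: map one host allocation, passing the size of the
    // largest tensor it contains so that buffers larger than the device limit
    // can be split into overlapping views (see ggml_metal_add_buffer below)
    static bool map_model_buffer(struct ggml_metal_context * ctx,
                                 void * data, size_t size, size_t max_tensor_size) {
        // the 0.2.1 signature was ggml_metal_add_buffer(ctx, "data", data, size);
        return ggml_metal_add_buffer(ctx, "data", data, size, max_tensor_size);
    }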
@@ -51,23 +51,26 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_f16);
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);
-    GGML_METAL_DECL_KERNEL(get_rows_q2_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q3_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q4_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q5_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q6_k);
+    GGML_METAL_DECL_KERNEL(get_rows_q2_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q3_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q4_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q5_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
     GGML_METAL_DECL_KERNEL(rms_norm);
+    GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
     GGML_METAL_DECL_KERNEL(rope);
+    GGML_METAL_DECL_KERNEL(alibi_f32);
     GGML_METAL_DECL_KERNEL(cpy_f32_f16);
     GGML_METAL_DECL_KERNEL(cpy_f32_f32);
+    GGML_METAL_DECL_KERNEL(cpy_f16_f16);

 #undef GGML_METAL_DECL_KERNEL
 };
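
For context, GGML_METAL_DECL_KERNEL is a token-pasting macro; in upstream ggml-metal.m it expands each name into a function/pipeline pair along these lines (reproduced from the upstream source, so treat it as reference rather than part of this diff):

    #define GGML_METAL_DECL_KERNEL(name) \
        id<MTLFunction>             function_##name; \
        id<MTLComputePipelineState> pipeline_##name

The matching GGML_METAL_ADD_KERNEL macro in ggml_metal_init looks each function up in the compiled library and builds its pipeline state, which is why every rename here (q2_k → q2_K, etc.) appears again in the init hunk and at each use site further down.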
@@ -129,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
             exit(1);
         }

+#ifdef GGML_QKK_64
+        MTLCompileOptions* options = [MTLCompileOptions new];
+        options.preprocessorMacros = @{ @"QK_K" : @(64) };
+        ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+#else
         ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
+#endif
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
             exit(1);
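
QK_K is the super-block size of the K-quant formats (256 by default, 64 when the host code is built with GGML_QKK_64). Because the Metal shaders are compiled from source at runtime, the host-side compile flag has to be forwarded explicitly, which is what the preprocessorMacros dictionary does. A standalone sketch of the same pattern (assumed helper, not in the diff):

    #import <Metal/Metal.h>

    // inject a host-side constant into runtime-compiled Metal source,
    // mirroring the hunk above; equivalent to passing -DQK_K=<qk_k>
    static id<MTLLibrary> compile_metal_src(id<MTLDevice> device, NSString * src, int qk_k) {
        NSError * error = nil;
        MTLCompileOptions * options = [MTLCompileOptions new];
        options.preprocessorMacros = @{ @"QK_K" : @(qk_k) };
        id<MTLLibrary> lib = [device newLibraryWithSource:src options:options error:&error];
        if (error) {
            NSLog(@"metal compile error: %@", error);
        }
        return lib;
    }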
@@ -156,27 +165,38 @@ struct ggml_metal_context * ggml_metal_init(void) {
         GGML_METAL_ADD_KERNEL(get_rows_f16);
         GGML_METAL_ADD_KERNEL(get_rows_q4_0);
         GGML_METAL_ADD_KERNEL(get_rows_q4_1);
-        GGML_METAL_ADD_KERNEL(get_rows_q2_k);
-        GGML_METAL_ADD_KERNEL(get_rows_q3_k);
-        GGML_METAL_ADD_KERNEL(get_rows_q4_k);
-        GGML_METAL_ADD_KERNEL(get_rows_q5_k);
-        GGML_METAL_ADD_KERNEL(get_rows_q6_k);
+        GGML_METAL_ADD_KERNEL(get_rows_q2_K);
+        GGML_METAL_ADD_KERNEL(get_rows_q3_K);
+        GGML_METAL_ADD_KERNEL(get_rows_q4_K);
+        GGML_METAL_ADD_KERNEL(get_rows_q5_K);
+        GGML_METAL_ADD_KERNEL(get_rows_q6_K);
         GGML_METAL_ADD_KERNEL(rms_norm);
+        GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
         GGML_METAL_ADD_KERNEL(rope);
+        GGML_METAL_ADD_KERNEL(alibi_f32);
         GGML_METAL_ADD_KERNEL(cpy_f32_f16);
         GGML_METAL_ADD_KERNEL(cpy_f32_f32);
+        GGML_METAL_ADD_KERNEL(cpy_f16_f16);

 #undef GGML_METAL_ADD_KERNEL
     }

+    fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    fprintf(stderr, "%s: hasUnifiedMemory             = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    if (ctx->device.maxTransferRate != 0) {
+        fprintf(stderr, "%s: maxTransferRate              = %8.2f MB/s\n",  __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+    } else {
+        fprintf(stderr, "%s: maxTransferRate              = built-in GPU\n", __func__);
+    }
+
     return ctx;
 }

@@ -193,10 +213,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
     //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);

+    const int64_t tsize = ggml_nbytes(t);
+
+    // find the view that contains the tensor fully
     for (int i = 0; i < ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;

-        if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
             *offs = (size_t) ioffs;

             //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
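
The old predicate only checked the tensor's starting offset, so a tensor that begins inside a view but ends past it could be matched to a view it spills out of; the fix requires the whole byte range [ioffs, ioffs + tsize) to fit. A worked example with made-up numbers:

    // buffer view: size = 1024
    // tensor:      ioffs = 1000, tsize = 64
    // old: ioffs < size          -> 1000 < 1024  -> match (wrong: bytes 1024..1063 spill out)
    // new: ioffs + tsize <= size -> 1064 <= 1024 -> no match; keep scanning later views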
@@ -214,7 +237,8 @@ bool ggml_metal_add_buffer(
         struct ggml_metal_context * ctx,
                        const char * name,
                              void * data,
-                             size_t size) {
+                             size_t size,
+                             size_t max_size) {
     if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
         fprintf(stderr, "%s: too many buffers\n", __func__);
         return false;
@@ -231,30 +255,68 @@ bool ggml_metal_add_buffer(
             }
         }

-        size_t page_size = getpagesize();
-        size_t aligned_size = size;
-        if ((aligned_size % page_size) != 0) {
-            aligned_size += (page_size - (aligned_size % page_size));
+        const size_t size_page = getpagesize();
+
+        size_t size_aligned = size;
+        if ((size_aligned % size_page) != 0) {
+            size_aligned += (size_page - (size_aligned % size_page));
         }

-        ctx->buffers[ctx->n_buffers].name = name;
-        ctx->buffers[ctx->n_buffers].data = data;
-        ctx->buffers[ctx->n_buffers].size = size;
+        // the buffer fits into the max buffer size allowed by the device
+        if (size_aligned <= ctx->device.maxBufferLength) {
+            ctx->buffers[ctx->n_buffers].name = name;
+            ctx->buffers[ctx->n_buffers].data = data;
+            ctx->buffers[ctx->n_buffers].size = size;

-        if (ctx->device.maxBufferLength < aligned_size) {
-            fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
-            return false;
-        }
-        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
+            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+                return false;
+            }

-        if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
-            return false;
+            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+
+            ++ctx->n_buffers;
         } else {
-            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+            // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+            // one of the views
+            const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+            const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
+            const size_t size_view = ctx->device.maxBufferLength;
+
+            for (size_t i = 0; i < size; i += size_step) {
+                const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+                ctx->buffers[ctx->n_buffers].name = name;
+                ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+                ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+                ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+                if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                    fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                    return false;
+                }
+
+                fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+                if (i + size_step < size) {
+                    fprintf(stderr, "\n");
+                }
+
+                ++ctx->n_buffers;
+            }
         }

-        ++ctx->n_buffers;
+        fprintf(stderr, ", (%8.2f / %8.2f)",
+                ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
+                ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+        if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
+            fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
+        } else {
+            fprintf(stderr, "\n");
+        }
     }

     return true;
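
The overlap arithmetic is the core of the change: each Metal view spans size_view bytes, but consecutive views start only size_step apart, leaving size_ovlp bytes shared between neighbors, so any tensor of at most max_size bytes lies entirely inside at least one view no matter where it starts. A standalone sketch of the same arithmetic (assumed helper, not part of the library):

    #include <stddef.h>

    // e.g. page = 4096 and max_size = 10000 gives
    //   size_ovlp = ((10000 + 4095) / 4096 + 1) * 4096 = (3 + 1) * 4096 = 16384
    // so neighboring views overlap by at least max_size plus one page of slack
    static size_t view_step(size_t max_size, size_t page, size_t max_buf) {
        const size_t size_ovlp = ((max_size + page - 1) / page + 1) * page;
        return max_buf - size_ovlp; // distance between consecutive view start offsets
    }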
@@ -606,7 +668,7 @@ void ggml_metal_graph_compute(

                            nth0 = 4;
                            nth1 = 16;
-                           [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
+                           [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
                        } break;
                    case GGML_TYPE_Q3_K:
                        {
@@ -615,7 +677,7 @@ void ggml_metal_graph_compute(

                            nth0 = 4;
                            nth1 = 16;
-                           [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
+                           [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
                        } break;
                    case GGML_TYPE_Q4_K:
                        {
@@ -624,7 +686,7 @@ void ggml_metal_graph_compute(

                            nth0 = 4;
                            nth1 = 16;
-                           [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
+                           [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                        } break;
                    case GGML_TYPE_Q5_K:
                        {
@@ -633,7 +695,7 @@ void ggml_metal_graph_compute(

                            nth0 = 4;
                            nth1 = 16;
-                           [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
+                           [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                        } break;
                    case GGML_TYPE_Q6_K:
                        {
@@ -642,7 +704,7 @@ void ggml_metal_graph_compute(

                            nth0 = 4;
                            nth1 = 16;
-                           [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
+                           [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                        } break;
                    default:
                        {
@@ -694,11 +756,11 @@ void ggml_metal_graph_compute(
                        case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16];  break;
                        case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                        case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
-                       case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
-                       case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
-                       case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
-                       case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
-                       case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
+                       case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
+                       case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
+                       case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
+                       case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
+                       case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
                        default: GGML_ASSERT(false && "not implemented");
                    }

@@ -735,6 +797,70 @@ void ggml_metal_graph_compute(

                    [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                } break;
+            case GGML_OP_NORM:
+                {
+                    if (encoder == nil) {
+                        encoder = [command_buffer computeCommandEncoder];
+                    }
+
+                    const float eps = 1e-5f;
+
+                    const int nth = 256;
+
+                    [encoder setComputePipelineState:ctx->pipeline_norm];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                    [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                    [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
+                    [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
+                    [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+
+                    const int64_t nrows = ggml_nrows(src0);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                } break;
+            case GGML_OP_ALIBI:
+                {
+                    if (encoder == nil) {
+                        encoder = [command_buffer computeCommandEncoder];
+                    }
+
+                    GGML_ASSERT((src0t == GGML_TYPE_F32));
+
+                    const int   n_past   = ((int32_t *) src1->data)[0]; UNUSED(n_past);
+                    const int   n_head   = ((int32_t *) src1->data)[1];
+                    const float max_bias = ((float *)   src1->data)[2];
+
+                    if (__builtin_popcount(n_head) != 1) {
+                        GGML_ASSERT(false && "only power-of-two n_head implemented");
+                    }
+
+                    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+                    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+
+                    [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                    [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                    [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
+                    [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
+                    [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
+                    [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
+                    [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
+                    [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
+                    [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
+                    [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
+                    [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
+                    [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
+                    [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
+                    [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
+                    [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
+                    [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
+                    [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
+                    [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];
+                    const int nth = 32;
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                } break;
            case GGML_OP_ROPE:
                {
                    if (encoder == nil) {
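
The ALiBi slope base follows the original paper: for a power-of-two head count, m0 = 2^(-max_bias / n_head), and (in the upstream CPU reference implementation) head k then applies slope m0^(k+1); only m0 is passed to the Metal shader here. A worked example with assumed values:

    #include <math.h>

    // n_head = 8, max_bias = 8.0f  ->  n_heads_log2_floor = 8
    // m0 = powf(2.0f, -8.0f / 8) = 0.5f
    // per-head slopes then decay geometrically: 0.5, 0.25, 0.125, ...
    static float alibi_m0(int n_head, float max_bias) {
        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
        return powf(2.0f, -max_bias / n_heads_log2_floor);
    }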
@@ -788,6 +914,14 @@ void ggml_metal_graph_compute(
                                default: GGML_ASSERT(false && "not implemented");
                            };
                        } break;
+                    case GGML_TYPE_F16:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
+                                case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break;
+                                default: GGML_ASSERT(false && "not implemented");
+                            };
+                        } break;
                    default: GGML_ASSERT(false && "not implemented");
                }

@@ -831,4 +965,14 @@ void ggml_metal_graph_compute(
    dispatch_barrier_sync(queue, ^{});

    [command_buffers[n_cb - 1] waitUntilCompleted];
+
+    // check status of command buffers
+    // needed to detect if the device ran out-of-memory for example (#1881)
+    for (int i = 0; i < n_cb; i++) {
+        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
+        if (status != MTLCommandBufferStatusCompleted) {
+            fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
+            GGML_ASSERT(false);
+        }
+    }
}
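
MTLCommandBufferStatusCompleted is the terminal success state; after waitUntilCompleted, anything else is effectively MTLCommandBufferStatusError, which is how a device out-of-memory condition (the #1881 referenced above) now aborts loudly instead of silently producing garbage. A hedged sketch of richer diagnostics, not part of the diff:

    if (status == MTLCommandBufferStatusError) {
        // a failed command buffer carries the underlying NSError in its error property
        NSLog(@"command buffer %d error: %@", i, [command_buffers[i] error]);
    }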