whisper.rn 0.5.4 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/android/src/main/java/com/rnwhisper/WhisperContext.java +5 -0
  2. package/android/src/main/jni.cpp +13 -0
  3. package/cpp/ggml-alloc.c +78 -26
  4. package/cpp/ggml-alloc.h +9 -0
  5. package/cpp/ggml-backend-impl.h +1 -1
  6. package/cpp/ggml-backend-reg.cpp +19 -3
  7. package/cpp/ggml-backend.cpp +72 -20
  8. package/cpp/ggml-backend.h +2 -1
  9. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  10. package/cpp/ggml-cpu/arch/arm/repack.cpp +1004 -0
  11. package/cpp/ggml-cpu/arch/x86/repack.cpp +6 -6
  12. package/cpp/ggml-cpu/arch-fallback.h +50 -2
  13. package/cpp/ggml-cpu/ggml-cpu-impl.h +1 -1
  14. package/cpp/ggml-cpu/ggml-cpu.c +139 -58
  15. package/cpp/ggml-cpu/ggml-cpu.cpp +4 -0
  16. package/cpp/ggml-cpu/ops.cpp +170 -18
  17. package/cpp/ggml-cpu/ops.h +1 -0
  18. package/cpp/ggml-cpu/repack.cpp +531 -5
  19. package/cpp/ggml-cpu/repack.h +14 -0
  20. package/cpp/ggml-cpu/simd-mappings.h +16 -18
  21. package/cpp/ggml-cpu/vec.cpp +41 -1
  22. package/cpp/ggml-cpu/vec.h +241 -138
  23. package/cpp/ggml-cpu.h +1 -0
  24. package/cpp/ggml-impl.h +0 -4
  25. package/cpp/ggml-metal/ggml-metal-context.m +26 -16
  26. package/cpp/ggml-metal/ggml-metal-device.cpp +452 -371
  27. package/cpp/ggml-metal/ggml-metal-device.h +87 -65
  28. package/cpp/ggml-metal/ggml-metal-device.m +263 -104
  29. package/cpp/ggml-metal/ggml-metal-impl.h +58 -4
  30. package/cpp/ggml-metal/ggml-metal-ops.cpp +415 -98
  31. package/cpp/ggml-metal/ggml-metal-ops.h +4 -0
  32. package/cpp/ggml-metal/ggml-metal.cpp +6 -5
  33. package/cpp/ggml-metal/ggml-metal.metal +404 -34
  34. package/cpp/ggml.c +110 -31
  35. package/cpp/ggml.h +51 -12
  36. package/cpp/jsi/RNWhisperJSI.cpp +1 -0
  37. package/cpp/whisper.cpp +16 -3
  38. package/ios/CMakeLists.txt +21 -1
  39. package/ios/RNWhisperContext.mm +5 -0
  40. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
  41. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
  42. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -1
  43. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
  44. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +0 -4
  45. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +51 -12
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-metal.metal +404 -34
  48. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  49. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
  50. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
  51. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -1
  52. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
  53. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +0 -4
  54. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +51 -12
  55. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  56. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  57. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +404 -34
  58. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  59. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
  60. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
  61. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -1
  62. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
  63. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +0 -4
  64. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +51 -12
  65. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  66. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-metal.metal +404 -34
  67. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  68. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
  70. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -1
  71. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
  72. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +0 -4
  73. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +51 -12
  74. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  75. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  76. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +404 -34
  77. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  78. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  79. package/lib/commonjs/jest-mock.js +2 -0
  80. package/lib/commonjs/jest-mock.js.map +1 -1
  81. package/lib/commonjs/version.json +1 -1
  82. package/lib/module/NativeRNWhisper.js.map +1 -1
  83. package/lib/module/jest-mock.js +2 -0
  84. package/lib/module/jest-mock.js.map +1 -1
  85. package/lib/module/version.json +1 -1
  86. package/lib/typescript/NativeRNWhisper.d.ts +1 -0
  87. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  88. package/package.json +1 -1
  89. package/src/NativeRNWhisper.ts +1 -0
  90. package/src/jest-mock.ts +2 -0
  91. package/src/version.json +1 -1
@@ -1,7 +1,6 @@
1
1
  #import "ggml-metal-device.h"
2
2
 
3
3
  #import "ggml-impl.h"
4
- #import "ggml-threading.h"
5
4
 
6
5
  #include <Foundation/Foundation.h>
7
6
 
@@ -75,14 +74,6 @@ void wsp_ggml_metal_cv_set_bool(wsp_ggml_metal_cv_t cv, bool value, int32_t idx)
75
74
 
76
75
  struct wsp_ggml_metal_pipeline {
77
76
  id<MTLComputePipelineState> obj;
78
-
79
- // suggested dispatch sizes
80
- int nsg;
81
-
82
- int nr0;
83
- int nr1;
84
-
85
- size_t smem;
86
77
  };
87
78
 
88
79
  wsp_ggml_metal_pipeline_t wsp_ggml_metal_pipeline_init(void) {
@@ -90,10 +81,6 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_pipeline_init(void) {
90
81
 
91
82
  *res = (struct wsp_ggml_metal_pipeline) {
92
83
  /*.obj =*/ nil,
93
- /*.nsg =*/ 0,
94
- /*.nr0 =*/ 0,
95
- /*.nr1 =*/ 0,
96
- /*.smem =*/ 0,
97
84
  };
98
85
 
99
86
  return res;
@@ -105,40 +92,8 @@ void wsp_ggml_metal_pipeline_free(wsp_ggml_metal_pipeline_t pipeline) {
105
92
  free(pipeline);
106
93
  }
107
94
 
108
- void wsp_ggml_metal_pipeline_set_nsg(wsp_ggml_metal_pipeline_t pipeline, int nsg) {
109
- pipeline->nsg = nsg;
110
- }
111
-
112
- int wsp_ggml_metal_pipeline_get_nsg(wsp_ggml_metal_pipeline_t pipeline) {
113
- return pipeline->nsg;
114
- }
115
-
116
- void wsp_ggml_metal_pipeline_set_nr0(wsp_ggml_metal_pipeline_t pipeline, int nr0) {
117
- pipeline->nr0 = nr0;
118
- }
119
-
120
- int wsp_ggml_metal_pipeline_get_nr0(wsp_ggml_metal_pipeline_t pipeline) {
121
- return pipeline->nr0;
122
- }
123
-
124
- void wsp_ggml_metal_pipeline_set_nr1(wsp_ggml_metal_pipeline_t pipeline, int nr1) {
125
- pipeline->nr1 = nr1;
126
- }
127
-
128
- int wsp_ggml_metal_pipeline_get_nr1(wsp_ggml_metal_pipeline_t pipeline) {
129
- return pipeline->nr1;
130
- }
131
-
132
- void wsp_ggml_metal_pipeline_set_smem(wsp_ggml_metal_pipeline_t pipeline, size_t smem) {
133
- pipeline->smem = smem;
134
- }
135
-
136
- size_t wsp_ggml_metal_pipeline_get_smem(wsp_ggml_metal_pipeline_t pipeline) {
137
- return pipeline->smem;
138
- }
139
-
140
- int wsp_ggml_metal_pipeline_max_theads_per_threadgroup(wsp_ggml_metal_pipeline_t pipeline) {
141
- return pipeline->obj.maxTotalThreadsPerThreadgroup;
95
+ int wsp_ggml_metal_pipeline_max_theads_per_threadgroup(struct wsp_ggml_metal_pipeline_with_params pipeline) {
96
+ return pipeline.pipeline->obj.maxTotalThreadsPerThreadgroup;
142
97
  }
143
98
 
144
99
  struct wsp_ggml_metal_library {
@@ -146,6 +101,8 @@ struct wsp_ggml_metal_library {
146
101
  id<MTLDevice> device;
147
102
 
148
103
  wsp_ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
104
+
105
+ NSLock * lock;
149
106
  };
150
107
 
151
108
  wsp_ggml_metal_library_t wsp_ggml_metal_library_init(wsp_ggml_metal_device_t dev) {
@@ -296,9 +253,10 @@ wsp_ggml_metal_library_t wsp_ggml_metal_library_init(wsp_ggml_metal_device_t dev
296
253
 
297
254
  wsp_ggml_metal_library_t res = calloc(1, sizeof(struct wsp_ggml_metal_library));
298
255
 
299
- res->obj = library;
300
- res->device = device;
256
+ res->obj = library;
257
+ res->device = device;
301
258
  res->pipelines = wsp_ggml_metal_pipelines_init();
259
+ res->lock = [NSLock new];
302
260
 
303
261
  return res;
304
262
  }
@@ -365,6 +323,7 @@ wsp_ggml_metal_library_t wsp_ggml_metal_library_init_from_source(wsp_ggml_metal_
365
323
  res->obj = library;
366
324
  res->device = device;
367
325
  res->pipelines = wsp_ggml_metal_pipelines_init();
326
+ res->lock = [NSLock new];
368
327
 
369
328
  return res;
370
329
  }
@@ -380,26 +339,47 @@ void wsp_ggml_metal_library_free(wsp_ggml_metal_library_t lib) {
380
339
 
381
340
  wsp_ggml_metal_pipelines_free(lib->pipelines);
382
341
 
342
+ [lib->lock release];
343
+
383
344
  free(lib);
384
345
  }
385
346
 
386
- wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_get_pipeline(wsp_ggml_metal_library_t lib, const char * name) {
387
- return wsp_ggml_metal_pipelines_get(lib->pipelines, name);
347
+ struct wsp_ggml_metal_pipeline_with_params wsp_ggml_metal_library_get_pipeline(wsp_ggml_metal_library_t lib, const char * name) {
348
+ [lib->lock lock];
349
+
350
+ struct wsp_ggml_metal_pipeline_with_params res = {
351
+ /*.pipeline =*/ nil,
352
+ /*.nr0 =*/ 0,
353
+ /*.nr1 =*/ 0,
354
+ /*.nsg =*/ 0,
355
+ /*.smem =*/ 0,
356
+ };
357
+
358
+ res.pipeline = wsp_ggml_metal_pipelines_get(lib->pipelines, name);
359
+
360
+ [lib->lock unlock];
361
+
362
+ return res;
388
363
  }
389
364
 
390
- wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_compile_pipeline(wsp_ggml_metal_library_t lib, const char * base, const char * name, wsp_ggml_metal_cv_t cv) {
391
- // note: the pipelines are cached in the library per device, so they are shared across all metal contexts
392
- wsp_ggml_critical_section_start();
365
+ struct wsp_ggml_metal_pipeline_with_params wsp_ggml_metal_library_compile_pipeline(wsp_ggml_metal_library_t lib, const char * base, const char * name, wsp_ggml_metal_cv_t cv) {
366
+ struct wsp_ggml_metal_pipeline_with_params res = {
367
+ /*.pipeline =*/ nil,
368
+ /*.nr0 =*/ 0,
369
+ /*.nr1 =*/ 0,
370
+ /*.nsg =*/ 0,
371
+ /*.smem =*/ 0,
372
+ };
373
+
374
+ [lib->lock lock];
393
375
 
394
- wsp_ggml_metal_pipeline_t res = wsp_ggml_metal_library_get_pipeline(lib, name);
395
- if (res) {
396
- wsp_ggml_critical_section_end();
376
+ res.pipeline = wsp_ggml_metal_pipelines_get(lib->pipelines, name);
377
+ if (res.pipeline) {
378
+ [lib->lock unlock];
397
379
 
398
380
  return res;
399
381
  }
400
382
 
401
- res = wsp_ggml_metal_pipeline_init();
402
-
403
383
  @autoreleasepool {
404
384
  NSError * error = nil;
405
385
 
@@ -414,36 +394,53 @@ wsp_ggml_metal_pipeline_t wsp_ggml_metal_library_compile_pipeline(wsp_ggml_metal
414
394
  mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error];
415
395
  }
416
396
  if (!mtl_function) {
417
- wsp_ggml_critical_section_end();
397
+ [lib->lock unlock];
418
398
 
419
399
  WSP_GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
420
400
  if (error) {
421
401
  WSP_GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
422
402
  }
423
403
 
424
- return nil;
404
+ return res;
425
405
  }
426
406
 
427
- res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
407
+ id<MTLComputePipelineState> obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
428
408
 
429
409
  [mtl_function release];
430
410
 
431
- WSP_GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
432
- (int) res->obj.maxTotalThreadsPerThreadgroup,
433
- (int) res->obj.threadExecutionWidth);
411
+ if (!obj) {
412
+ [lib->lock unlock];
413
+
414
+ WSP_GGML_LOG_ERROR("%s: failed to create pipeline state: base = '%s', name = '%s'\n", __func__, base, name);
415
+ if (error) {
416
+ WSP_GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
417
+ }
418
+
419
+ return res;
420
+ }
421
+
422
+ WSP_GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name,
423
+ (void *) obj,
424
+ (int) obj.maxTotalThreadsPerThreadgroup,
425
+ (int) obj.threadExecutionWidth);
434
426
 
435
- if (res->obj.maxTotalThreadsPerThreadgroup == 0 || res->obj.threadExecutionWidth == 0) {
436
- wsp_ggml_critical_section_end();
427
+ if (obj.maxTotalThreadsPerThreadgroup == 0 || obj.threadExecutionWidth == 0) {
428
+ [obj release];
429
+
430
+ [lib->lock unlock];
437
431
 
438
432
  WSP_GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
439
433
 
440
- return nil;
434
+ return res;
441
435
  }
442
436
 
443
- wsp_ggml_metal_pipelines_add(lib->pipelines, name, res);
437
+ res.pipeline = wsp_ggml_metal_pipeline_init();
438
+ res.pipeline->obj = obj;
439
+
440
+ wsp_ggml_metal_pipelines_add(lib->pipelines, name, res.pipeline);
444
441
  }
445
442
 
446
- wsp_ggml_critical_section_end();
443
+ [lib->lock unlock];
447
444
 
448
445
  return res;
449
446
  }
@@ -485,8 +482,8 @@ void wsp_ggml_metal_encoder_debug_group_pop (wsp_ggml_metal_encoder_t encoder) {
485
482
  [encoder->obj popDebugGroup];
486
483
  }
487
484
 
488
- void wsp_ggml_metal_encoder_set_pipeline(wsp_ggml_metal_encoder_t encoder, wsp_ggml_metal_pipeline_t pipeline) {
489
- [encoder->obj setComputePipelineState:pipeline->obj];
485
+ void wsp_ggml_metal_encoder_set_pipeline(wsp_ggml_metal_encoder_t encoder, struct wsp_ggml_metal_pipeline_with_params pipeline) {
486
+ [encoder->obj setComputePipelineState:pipeline.pipeline->obj];
490
487
  }
491
488
 
492
489
  void wsp_ggml_metal_encoder_set_bytes(wsp_ggml_metal_encoder_t encoder, void * data, size_t size, int idx) {
@@ -521,11 +518,106 @@ struct wsp_ggml_metal_device {
521
518
  // ref: https://github.com/ggml-org/llama.cpp/pull/15906
522
519
  id<MTLCommandQueue> mtl_queue;
523
520
 
521
+ wsp_ggml_metal_rsets_t rsets;
522
+
524
523
  wsp_ggml_metal_library_t library;
525
524
 
526
525
  struct wsp_ggml_metal_device_props props;
527
526
  };
528
527
 
528
+ //
529
+ // MTLResidencySet wrapper
530
+ //
531
+
// State of a residency-set collection together with the background heartbeat
// that periodically re-requests residency for every set stored in it.
struct wsp_ggml_metal_rsets {
    // serializes access to `data` between the heartbeat and the add/remove calls
    NSLock * lock;

    // the residency sets currently registered in the collection
    NSMutableArray * data;

    // keep-alive window, in seconds: how long after the last graph computation the
    // residency sets are kept wired, to avoid being collected by the OS
    int keep_alive_s;

    // heartbeat control:
    //   d_stop - set to true to make the heartbeat loop exit
    //   d_loop - remaining heartbeat iterations that still request residency
    atomic_bool d_stop;
    atomic_int  d_loop;

    // dispatch group the heartbeat block is submitted to; waited on at teardown
    dispatch_group_t d_group;
};
547
+
// Creates a residency-set collection and starts its background heartbeat.
//
// The keep-alive window defaults to 3 minutes and can be overridden with the
// WSP_GGML_METAL_RESIDENCY_KEEP_ALIVE_S environment variable (in seconds). Values that are
// zero, negative or non-numeric fall back to the default. Very large values are capped so
// that the heartbeat counter, which is initialized to twice the window, cannot overflow.
//
// Returns the newly allocated collection; it is torn down by wsp_ggml_metal_rsets_free().
wsp_ggml_metal_rsets_t wsp_ggml_metal_rsets_init(void) {
    // by default keep the memory wired for 3 minutes
    const int keep_alive_default_s = 3*60;

    // the heartbeat counter stores 2*keep_alive_s in an int, so cap the window at the
    // largest value whose doubling still fits in a 32-bit int
    const long keep_alive_max_s = 0x3FFFFFFF;

    wsp_ggml_metal_rsets_t res = calloc(1, sizeof(struct wsp_ggml_metal_rsets));

    res->lock = [[NSLock alloc] init];
    res->data = [[NSMutableArray alloc] init];

    res->keep_alive_s = keep_alive_default_s;

    const char * WSP_GGML_METAL_RESIDENCY_KEEP_ALIVE_S = getenv("WSP_GGML_METAL_RESIDENCY_KEEP_ALIVE_S");
    if (WSP_GGML_METAL_RESIDENCY_KEEP_ALIVE_S) {
        // strtol saturates on out-of-range input, whereas atoi has undefined behavior there
        const long requested_s = strtol(WSP_GGML_METAL_RESIDENCY_KEEP_ALIVE_S, NULL, 10);

        if (requested_s > keep_alive_max_s) {
            res->keep_alive_s = (int) keep_alive_max_s;
        } else if (requested_s > 0) {
            res->keep_alive_s = (int) requested_s;
        }
        // otherwise (zero, negative, non-numeric): keep the default
    }

    WSP_GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s);

    // the heartbeat sleeps for half a second per iteration, hence 2 iterations per second
    atomic_store_explicit(&res->d_stop, false, memory_order_relaxed);
    atomic_store_explicit(&res->d_loop, 2*res->keep_alive_s, memory_order_relaxed);

    res->d_group = dispatch_group_create();

    // start a background thread that periodically requests residency for all the currently active sets in the collection
    // the requests stop after a certain amount of time (keep_alive_s) of inactivity
    dispatch_queue_t d_queue = dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0);
    dispatch_group_async(res->d_group, d_queue, ^{
#if defined(WSP_GGML_METAL_HAS_RESIDENCY_SETS)
        if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
            while (!atomic_load_explicit(&res->d_stop, memory_order_relaxed)) {
                if (atomic_load_explicit(&res->d_loop, memory_order_relaxed) > 0) {
                    // hold the lock while walking the array so that add/remove cannot mutate it concurrently
                    [res->lock lock];

                    for (int i = 0; i < (int) res->data.count; ++i) {
                        [res->data[i] requestResidency];
                    }

                    // one iteration consumed; the counter is reset by wsp_ggml_metal_device_rsets_keep_alive()
                    atomic_fetch_sub_explicit(&res->d_loop, 1, memory_order_relaxed);

                    [res->lock unlock];
                }

                // half a second
                usleep(500 * 1000);
            }
        }
#endif
    });

    return res;
}
601
+
// Stops the heartbeat and destroys a residency-set collection.
//
// Passing NULL is a no-op. The collection must already be empty: every residency set
// has to be removed before the collection is freed.
void wsp_ggml_metal_rsets_free(wsp_ggml_metal_rsets_t rsets) {
    if (!rsets) {
        return;
    }

    // note: if you hit this assert, most likely you haven't deallocated all Metal resources before exiting
    WSP_GGML_ASSERT([rsets->data count] == 0);

    // ask the heartbeat loop to exit, then block until its block has finished running
    atomic_store_explicit(&rsets->d_stop, true, memory_order_relaxed);

    dispatch_group_t heartbeat = rsets->d_group;
    dispatch_group_wait(heartbeat, DISPATCH_TIME_FOREVER);
    dispatch_release(heartbeat);

    // the heartbeat is gone, so nothing else touches these objects anymore
    [rsets->lock release];
    [rsets->data release];

    free(rsets);
}
620
+
529
621
  wsp_ggml_metal_device_t wsp_ggml_metal_device_init(void) {
530
622
  wsp_ggml_metal_device_t dev = calloc(1, sizeof(struct wsp_ggml_metal_device));
531
623
 
@@ -611,8 +703,8 @@ wsp_ggml_metal_device_t wsp_ggml_metal_device_init(void) {
611
703
  WSP_GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
612
704
  dev->props.has_tensor = false;
613
705
  } else {
614
- wsp_ggml_metal_pipeline_t ppl = wsp_ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
615
- if (!ppl) {
706
+ struct wsp_ggml_metal_pipeline_with_params ppl = wsp_ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
707
+ if (!ppl.pipeline) {
616
708
  WSP_GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
617
709
  dev->props.has_tensor = false;
618
710
  }
@@ -661,8 +753,8 @@ wsp_ggml_metal_device_t wsp_ggml_metal_device_init(void) {
661
753
  WSP_GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
662
754
  dev->props.has_bfloat = false;
663
755
  } else {
664
- wsp_ggml_metal_pipeline_t ppl = wsp_ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
665
- if (!ppl) {
756
+ struct wsp_ggml_metal_pipeline_with_params ppl = wsp_ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
757
+ if (!ppl.pipeline) {
666
758
  WSP_GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
667
759
  dev->props.has_bfloat = false;
668
760
  }
@@ -677,12 +769,21 @@ wsp_ggml_metal_device_t wsp_ggml_metal_device_init(void) {
677
769
  #endif
678
770
 
679
771
  dev->props.use_shared_buffers = dev->props.has_unified_memory;
772
+ #if TARGET_OS_OSX
773
+ // In case of eGPU, shared memory may be preferable.
774
+ dev->props.use_shared_buffers |= [dev->mtl_device location] == MTLDeviceLocationExternal;
775
+ #endif
680
776
  if (getenv("WSP_GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
681
777
  dev->props.use_shared_buffers = false;
682
778
  }
779
+ if (getenv("WSP_GGML_METAL_SHARED_BUFFERS_ENABLE") != NULL) {
780
+ dev->props.use_shared_buffers = true;
781
+ }
683
782
 
684
783
  dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
685
784
 
785
+ dev->props.op_offload_min_batch_size = getenv("WSP_GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("WSP_GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
786
+
686
787
  dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
687
788
  dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
688
789
  dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
@@ -694,7 +795,11 @@ wsp_ggml_metal_device_t wsp_ggml_metal_device_init(void) {
694
795
  WSP_GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
695
796
  }
696
797
 
697
- // --------------------------------------------------
798
+ if (dev->props.use_residency_sets) {
799
+ dev->rsets = wsp_ggml_metal_rsets_init();
800
+ } else {
801
+ dev->rsets = nil;
802
+ }
698
803
 
699
804
  // print MTL GPU family:
700
805
  WSP_GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
@@ -747,6 +852,8 @@ wsp_ggml_metal_device_t wsp_ggml_metal_device_init(void) {
747
852
  void wsp_ggml_metal_device_free(wsp_ggml_metal_device_t dev) {
748
853
  assert(dev != NULL);
749
854
 
855
+ wsp_ggml_metal_rsets_free(dev->rsets);
856
+
750
857
  wsp_ggml_metal_library_free(dev->library);
751
858
  dev->library = NULL;
752
859
 
@@ -775,6 +882,42 @@ wsp_ggml_metal_library_t wsp_ggml_metal_device_get_library(wsp_ggml_metal_device
775
882
  return dev->library;
776
883
  }
777
884
 
// Registers a residency set with the device's collection.
//
// A nil `rset` is ignored. For a non-nil `rset` the device must have a collection
// (asserted). The insertion happens under the collection lock.
void wsp_ggml_metal_device_rsets_add(wsp_ggml_metal_device_t dev, wsp_ggml_metal_rset_t rset) {
    if (!rset) {
        return;
    }

    WSP_GGML_ASSERT(dev->rsets);

    wsp_ggml_metal_rsets_t rsets = dev->rsets;

    [rsets->lock lock];
    [rsets->data addObject:rset];
    [rsets->lock unlock];
}
898
+
// Unregisters a residency set from the device's collection.
//
// A nil `rset` is ignored. For a non-nil `rset` the device must have a collection
// (asserted). The removal happens under the collection lock.
void wsp_ggml_metal_device_rsets_rm(wsp_ggml_metal_device_t dev, wsp_ggml_metal_rset_t rset) {
    if (!rset) {
        return;
    }

    WSP_GGML_ASSERT(dev->rsets);

    wsp_ggml_metal_rsets_t rsets = dev->rsets;

    [rsets->lock lock];
    [rsets->data removeObject:rset];
    [rsets->lock unlock];
}
912
+
// Re-arms the residency heartbeat of `dev`.
//
// Resets the remaining-iterations counter to twice the keep-alive window (in seconds).
// Does nothing when the device has no residency-set collection.
void wsp_ggml_metal_device_rsets_keep_alive(wsp_ggml_metal_device_t dev) {
    wsp_ggml_metal_rsets_t rsets = dev->rsets;
    if (!rsets) {
        return;
    }

    const int n_iters = 2*rsets->keep_alive_s;

    atomic_store_explicit(&rsets->d_loop, n_iters, memory_order_relaxed);
}
920
+
778
921
  void wsp_ggml_metal_device_get_memory(wsp_ggml_metal_device_t dev, size_t * free, size_t * total) {
779
922
  if (@available(macOS 10.12, iOS 16.0, *)) {
780
923
  *total = dev->mtl_device.recommendedMaxWorkingSetSize;
@@ -820,6 +963,8 @@ bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct
820
963
  case WSP_GGML_UNARY_OP_HARDSWISH:
821
964
  case WSP_GGML_UNARY_OP_HARDSIGMOID:
822
965
  case WSP_GGML_UNARY_OP_EXP:
966
+ case WSP_GGML_UNARY_OP_SOFTPLUS:
967
+ case WSP_GGML_UNARY_OP_EXPM1:
823
968
  return wsp_ggml_is_contiguous(op->src[0]) && op->src[0]->type == WSP_GGML_TYPE_F32;
824
969
  default:
825
970
  return false;
@@ -852,6 +997,7 @@ bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct
852
997
  case WSP_GGML_OP_ACC:
853
998
  case WSP_GGML_OP_REPEAT:
854
999
  case WSP_GGML_OP_SCALE:
1000
+ case WSP_GGML_OP_FILL:
855
1001
  case WSP_GGML_OP_CONV_TRANSPOSE_1D:
856
1002
  return true;
857
1003
  case WSP_GGML_OP_CONV_TRANSPOSE_2D:
@@ -869,6 +1015,8 @@ bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct
869
1015
  return wsp_ggml_is_contiguous(op->src[0]) && op->src[0]->type == WSP_GGML_TYPE_F32;
870
1016
  case WSP_GGML_OP_SUM:
871
1017
  return has_simdgroup_reduction && wsp_ggml_is_contiguous(op->src[0]);
1018
+ case WSP_GGML_OP_TRI:
1019
+ return wsp_ggml_is_contiguous_rows(op->src[0]);
872
1020
  case WSP_GGML_OP_SUM_ROWS:
873
1021
  case WSP_GGML_OP_CUMSUM:
874
1022
  case WSP_GGML_OP_MEAN:
@@ -877,6 +1025,11 @@ bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct
877
1025
  return has_simdgroup_reduction && wsp_ggml_is_contiguous_rows(op->src[0]);
878
1026
  case WSP_GGML_OP_L2_NORM:
879
1027
  return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && wsp_ggml_is_contiguous_1(op->src[0]));
1028
+ case WSP_GGML_OP_COUNT_EQUAL:
1029
+ return has_simdgroup_reduction &&
1030
+ op->src[0]->type == WSP_GGML_TYPE_I32 &&
1031
+ op->src[1]->type == WSP_GGML_TYPE_I32 &&
1032
+ op->type == WSP_GGML_TYPE_I64;
880
1033
  case WSP_GGML_OP_ARGMAX:
881
1034
  return has_simdgroup_reduction;
882
1035
  case WSP_GGML_OP_NORM:
@@ -894,10 +1047,15 @@ bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct
894
1047
  case WSP_GGML_OP_POOL_1D:
895
1048
  return false;
896
1049
  case WSP_GGML_OP_UPSCALE:
897
- return op->src[0]->type == WSP_GGML_TYPE_F32 && op->op_params[0] == WSP_GGML_SCALE_MODE_NEAREST;
1050
+ return op->src[0]->type == WSP_GGML_TYPE_F32 && op->op_params[0] == WSP_GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & WSP_GGML_SCALE_FLAG_ANTIALIAS);
898
1051
  case WSP_GGML_OP_POOL_2D:
899
1052
  return op->src[0]->type == WSP_GGML_TYPE_F32;
900
1053
  case WSP_GGML_OP_PAD:
1054
+ // TODO: add circular padding support for metal, see https://github.com/ggml-org/llama.cpp/pull/16985
1055
+ if (wsp_ggml_get_op_params_i32(op, 8) != 0) {
1056
+ return false;
1057
+ }
1058
+
901
1059
  return (wsp_ggml_get_op_params_i32(op, 0) == 0) && (wsp_ggml_get_op_params_i32(op, 2) == 0) &&
902
1060
  (wsp_ggml_get_op_params_i32(op, 4) == 0) && (wsp_ggml_get_op_params_i32(op, 6) == 0);
903
1061
  case WSP_GGML_OP_PAD_REFLECT_1D:
@@ -905,12 +1063,14 @@ bool wsp_ggml_metal_device_supports_op(wsp_ggml_metal_device_t dev, const struct
905
1063
  case WSP_GGML_OP_LEAKY_RELU:
906
1064
  return op->src[0]->type == WSP_GGML_TYPE_F32;
907
1065
  case WSP_GGML_OP_ARGSORT:
1066
+ case WSP_GGML_OP_TOP_K:
908
1067
  case WSP_GGML_OP_ARANGE:
909
1068
  return true;
910
1069
  case WSP_GGML_OP_FLASH_ATTN_EXT:
911
1070
  // for new head sizes, add checks here
912
1071
  if (op->src[0]->ne[0] != 32 &&
913
1072
  op->src[0]->ne[0] != 40 &&
1073
+ op->src[0]->ne[0] != 48 &&
914
1074
  op->src[0]->ne[0] != 64 &&
915
1075
  op->src[0]->ne[0] != 72 &&
916
1076
  op->src[0]->ne[0] != 80 &&
@@ -1061,9 +1221,8 @@ struct wsp_ggml_metal_buffer {
1061
1221
  // note: cannot explicitly use "id<MTLResidencySet>" here because it is not available on certain OSes
1062
1222
  id rset;
1063
1223
 
1064
- // pointers to global device objects
1065
- id<MTLDevice> device;
1066
- id<MTLCommandQueue> queue;
1224
+ // pointers to global device
1225
+ wsp_ggml_metal_device_t dev;
1067
1226
  };
1068
1227
 
1069
1228
  static void wsp_ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -1106,7 +1265,7 @@ static bool wsp_ggml_metal_buffer_rset_init(wsp_ggml_metal_buffer_t buf) {
1106
1265
  desc.initialCapacity = buf->n_buffers;
1107
1266
 
1108
1267
  NSError * error;
1109
- buf->rset = [buf->device newResidencySetWithDescriptor:desc error:&error];
1268
+ buf->rset = [buf->dev->mtl_device newResidencySetWithDescriptor:desc error:&error];
1110
1269
  if (error) {
1111
1270
  WSP_GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
1112
1271
  [desc release];
@@ -1167,6 +1326,8 @@ static void * wsp_ggml_metal_host_malloc(size_t n) {
1167
1326
  wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_init(wsp_ggml_metal_device_t dev, size_t size, bool shared) {
1168
1327
  wsp_ggml_metal_buffer_t res = calloc(1, sizeof(struct wsp_ggml_metal_buffer));
1169
1328
 
1329
+ res->dev = dev;
1330
+
1170
1331
  const size_t size_page = sysconf(_SC_PAGESIZE);
1171
1332
 
1172
1333
  size_t size_aligned = size;
@@ -1191,9 +1352,6 @@ wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_init(wsp_ggml_metal_device_t dev,
1191
1352
 
1192
1353
  res->owned = true;
1193
1354
 
1194
- res->device = wsp_ggml_metal_device_get_obj(dev);
1195
- res->queue = wsp_ggml_metal_device_get_queue(dev);
1196
-
1197
1355
  res->n_buffers = 1;
1198
1356
 
1199
1357
  if (res->all_data != NULL) {
@@ -1202,12 +1360,12 @@ wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_init(wsp_ggml_metal_device_t dev,
1202
1360
 
1203
1361
  if (size_aligned > 0) {
1204
1362
  if (props_dev->use_shared_buffers && shared) {
1205
- res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
1363
+ res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data
1206
1364
  length:size_aligned
1207
1365
  options:MTLResourceStorageModeShared
1208
1366
  deallocator:nil];
1209
1367
  } else {
1210
- res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
1368
+ res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
1211
1369
  }
1212
1370
  }
1213
1371
 
@@ -1228,6 +1386,8 @@ wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_init(wsp_ggml_metal_device_t dev,
1228
1386
  return NULL;
1229
1387
  }
1230
1388
 
1389
+ wsp_ggml_metal_device_rsets_add(dev, res->rset);
1390
+
1231
1391
  //wsp_ggml_metal_log_allocated_size(device, size_aligned);
1232
1392
 
1233
1393
  return res;
@@ -1236,6 +1396,8 @@ wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_init(wsp_ggml_metal_device_t dev,
1236
1396
  wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_map(wsp_ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1237
1397
  wsp_ggml_metal_buffer_t res = calloc(1, sizeof(struct wsp_ggml_metal_buffer));
1238
1398
 
1399
+ res->dev = dev;
1400
+
1239
1401
  res->all_data = ptr;
1240
1402
  res->all_size = size;
1241
1403
 
@@ -1258,9 +1420,6 @@ wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_map(wsp_ggml_metal_device_t dev, v
1258
1420
  size_aligned += (size_page - (size_aligned % size_page));
1259
1421
  }
1260
1422
 
1261
- res->device = wsp_ggml_metal_device_get_obj(dev);
1262
- res->queue = wsp_ggml_metal_device_get_queue(dev);
1263
-
1264
1423
  const struct wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(dev);
1265
1424
 
1266
1425
  // the buffer fits into the max buffer size allowed by the device
@@ -1270,7 +1429,7 @@ wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_map(wsp_ggml_metal_device_t dev, v
1270
1429
  res->buffers[res->n_buffers].metal = nil;
1271
1430
 
1272
1431
  if (size_aligned > 0) {
1273
- res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
1432
+ res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
1274
1433
 
1275
1434
  if (res->buffers[res->n_buffers].metal == nil) {
1276
1435
  WSP_GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
@@ -1279,7 +1438,7 @@ wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_map(wsp_ggml_metal_device_t dev, v
1279
1438
  }
1280
1439
  }
1281
1440
 
1282
- wsp_ggml_metal_log_allocated_size(res->device, size_aligned);
1441
+ wsp_ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned);
1283
1442
 
1284
1443
  ++res->n_buffers;
1285
1444
  } else {
@@ -1297,7 +1456,7 @@ wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_map(wsp_ggml_metal_device_t dev, v
1297
1456
  res->buffers[res->n_buffers].metal = nil;
1298
1457
 
1299
1458
  if (size_step_aligned > 0) {
1300
- res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
1459
+ res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
1301
1460
 
1302
1461
  if (res->buffers[res->n_buffers].metal == nil) {
1303
1462
  WSP_GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
@@ -1306,7 +1465,7 @@ wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_map(wsp_ggml_metal_device_t dev, v
1306
1465
  }
1307
1466
  }
1308
1467
 
1309
- wsp_ggml_metal_log_allocated_size(res->device, size_step_aligned);
1468
+ wsp_ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned);
1310
1469
 
1311
1470
  if (i + size_step < size) {
1312
1471
  WSP_GGML_LOG_INFO("\n");
@@ -1324,10 +1483,14 @@ wsp_ggml_metal_buffer_t wsp_ggml_metal_buffer_map(wsp_ggml_metal_device_t dev, v
1324
1483
  return NULL;
1325
1484
  }
1326
1485
 
1486
+ wsp_ggml_metal_device_rsets_add(dev, res->rset);
1487
+
1327
1488
  return res;
1328
1489
  }
1329
1490
 
1330
1491
  void wsp_ggml_metal_buffer_free(wsp_ggml_metal_buffer_t buf) {
1492
+ wsp_ggml_metal_device_rsets_rm(buf->dev, buf->rset);
1493
+
1331
1494
  for (int i = 0; i < buf->n_buffers; i++) {
1332
1495
  [buf->buffers[i].metal release];
1333
1496
  }
@@ -1364,8 +1527,7 @@ void wsp_ggml_metal_buffer_memset_tensor(wsp_ggml_metal_buffer_t buf, struct wsp
1364
1527
  struct wsp_ggml_metal_buffer_id bid_dst = wsp_ggml_metal_buffer_get_id(buf, tensor);
1365
1528
  bid_dst.offs += offset;
1366
1529
 
1367
- id<MTLCommandQueue> queue = buf->queue;
1368
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
1530
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
1369
1531
 
1370
1532
  {
1371
1533
  id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1391,7 +1553,7 @@ void wsp_ggml_metal_buffer_set_tensor(wsp_ggml_metal_buffer_t buf, struct wsp_gg
1391
1553
  @autoreleasepool {
1392
1554
  // src
1393
1555
  void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
1394
- id<MTLBuffer> buf_src = [buf->device newBufferWithBytesNoCopy:data_ptr
1556
+ id<MTLBuffer> buf_src = [buf->dev->mtl_device newBufferWithBytesNoCopy:data_ptr
1395
1557
  length:size
1396
1558
  options:MTLResourceStorageModeShared
1397
1559
  deallocator:nil];
@@ -1406,8 +1568,7 @@ void wsp_ggml_metal_buffer_set_tensor(wsp_ggml_metal_buffer_t buf, struct wsp_gg
1406
1568
  // this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference
1407
1569
  dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);
1408
1570
 
1409
- id<MTLCommandQueue> queue = buf->queue;
1410
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
1571
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
1411
1572
 
1412
1573
  {
1413
1574
  id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1449,15 +1610,14 @@ void wsp_ggml_metal_buffer_get_tensor(wsp_ggml_metal_buffer_t buf, const struct
1449
1610
  bid_src.offs += offset;
1450
1611
 
1451
1612
  // dst
1452
- id<MTLBuffer> buf_dst = [buf->device newBufferWithBytesNoCopy:data
1613
+ id<MTLBuffer> buf_dst = [buf->dev->mtl_device newBufferWithBytesNoCopy:data
1453
1614
  length:size
1454
1615
  options:MTLResourceStorageModeShared
1455
1616
  deallocator:nil];
1456
1617
 
1457
1618
  WSP_GGML_ASSERT(buf_dst);
1458
1619
 
1459
- id<MTLCommandQueue> queue = buf->queue;
1460
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
1620
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
1461
1621
 
1462
1622
  {
1463
1623
  id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1483,8 +1643,7 @@ void wsp_ggml_metal_buffer_clear(wsp_ggml_metal_buffer_t buf, uint8_t value) {
1483
1643
  }
1484
1644
 
1485
1645
  @autoreleasepool {
1486
- id<MTLCommandQueue> queue = buf->queue;
1487
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
1646
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
1488
1647
 
1489
1648
  {
1490
1649
  id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];