whisper.rn 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/cpp/ggml-alloc.c +264 -126
  4. package/cpp/ggml-backend-impl.h +4 -1
  5. package/cpp/ggml-backend-reg.cpp +13 -5
  6. package/cpp/ggml-backend.cpp +207 -17
  7. package/cpp/ggml-backend.h +17 -1
  8. package/cpp/ggml-cpu/amx/amx.cpp +4 -2
  9. package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
  10. package/cpp/ggml-cpu/arch-fallback.h +0 -4
  11. package/cpp/ggml-cpu/common.h +14 -0
  12. package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
  13. package/cpp/ggml-cpu/ggml-cpu.c +48 -41
  14. package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
  15. package/cpp/ggml-cpu/ops.cpp +518 -767
  16. package/cpp/ggml-cpu/ops.h +2 -0
  17. package/cpp/ggml-cpu/simd-mappings.h +88 -59
  18. package/cpp/ggml-cpu/vec.cpp +161 -20
  19. package/cpp/ggml-cpu/vec.h +400 -51
  20. package/cpp/ggml-cpu.h +1 -1
  21. package/cpp/ggml-impl.h +43 -10
  22. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  23. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  24. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  25. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  26. package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
  27. package/cpp/ggml-metal/ggml-metal-device.h +226 -0
  28. package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
  29. package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
  30. package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
  31. package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
  32. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  33. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  34. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  35. package/cpp/ggml-metal-impl.h +40 -40
  36. package/cpp/ggml-metal.h +1 -6
  37. package/cpp/ggml-quants.c +1 -0
  38. package/cpp/ggml.c +175 -13
  39. package/cpp/ggml.h +84 -5
  40. package/cpp/jsi/RNWhisperJSI.cpp +2 -0
  41. package/cpp/jsi/ThreadPool.h +3 -3
  42. package/cpp/whisper.cpp +85 -70
  43. package/cpp/whisper.h +1 -0
  44. package/ios/CMakeLists.txt +6 -1
  45. package/ios/RNWhisperVadContext.mm +14 -13
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  48. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  49. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  50. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  51. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  57. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  58. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  59. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  60. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  61. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  62. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  68. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  70. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  71. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  72. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  73. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  74. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  75. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  76. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  77. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  78. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  79. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  80. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  81. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  82. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  83. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  84. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  85. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  86. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  87. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  88. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  89. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  90. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  91. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  92. package/lib/commonjs/version.json +1 -1
  93. package/lib/module/version.json +1 -1
  94. package/package.json +1 -1
  95. package/src/version.json +1 -1
  96. package/whisper-rn.podspec +8 -9
  97. package/cpp/ggml-metal.m +0 -6779
  98. package/cpp/ggml-whisper-sim.metallib +0 -0
  99. package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-metal/ggml-metal-ops.cpp
@@ -0,0 +1,3158 @@
+ #include "ggml-metal-ops.h"
+
+ #include "ggml.h"
+ #include "ggml-impl.h"
+ #include "ggml-backend-impl.h"
+
+ #include "ggml-metal-impl.h"
+ #include "ggml-metal-common.h"
+ #include "ggml-metal-device.h"
+
+ #include <cassert>
+ #include <algorithm>
+
+ static wsp_ggml_metal_buffer_id wsp_ggml_metal_get_buffer_id(const wsp_ggml_tensor * t) {
+     if (!t) {
+         return { nullptr, 0 };
+     }
+
+     wsp_ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
+
+     wsp_ggml_metal_buffer_t ctx = (wsp_ggml_metal_buffer_t) buffer->context;
+
+     return wsp_ggml_metal_buffer_get_id(ctx, t);
+ }
+
+ struct wsp_ggml_metal_op {
+     wsp_ggml_metal_op(
+             wsp_ggml_metal_device_t dev,
+             wsp_ggml_metal_cmd_buf_t cmd_buf,
+             wsp_ggml_cgraph * gf,
+             int  idx_start,
+             int  idx_end,
+             bool use_fusion,
+             bool use_concurrency,
+             bool use_capture,
+             int  debug_graph,
+             int  debug_fusion) {
+         this->dev             = dev;
+         this->lib             = wsp_ggml_metal_device_get_library(dev);
+         this->enc             = wsp_ggml_metal_encoder_init(cmd_buf, use_concurrency);
+         this->mem_ranges      = wsp_ggml_mem_ranges_init(debug_graph);
+         this->idx_start       = idx_start;
+         this->idx_end         = idx_end;
+         this->use_fusion      = use_fusion;
+         this->use_concurrency = use_concurrency;
+         this->use_capture     = use_capture;
+         this->debug_graph     = debug_graph;
+         this->debug_fusion    = debug_fusion;
+         this->gf              = gf;
+
+         idxs.reserve(gf->n_nodes);
+
+         // filter empty nodes
+         // TODO: this can be removed when the allocator starts filtering them earlier
+         //       https://github.com/ggml-org/llama.cpp/pull/16130#issuecomment-3327905830
+         for (int i = idx_start; i < idx_end; i++) {
+             if (!wsp_ggml_op_is_empty(gf->nodes[i]->op) && !wsp_ggml_is_empty(gf->nodes[i])) {
+                 idxs.push_back(i);
+             }
+         }
+     }
+
+     ~wsp_ggml_metal_op() {
+         wsp_ggml_metal_encoder_end_encoding(this->enc);
+         wsp_ggml_metal_encoder_free(this->enc);
+         wsp_ggml_mem_ranges_free(this->mem_ranges);
+     }
+
+     int n_nodes() const {
+         return idxs.size();
+     }
+
+     wsp_ggml_tensor * node(int i) const {
+         assert(i >= 0 && i < (int) idxs.size());
+         return wsp_ggml_graph_node(gf, idxs[i]);
+     }
+
+     bool can_fuse(int i0, const wsp_ggml_op * ops, int n_ops) const {
+         assert(use_fusion);
+         assert(i0 >= 0 && i0 < n_nodes());
+
+         if (i0 + n_ops > n_nodes()) {
+             return false;
+         }
+
+         return wsp_ggml_can_fuse_ext(gf, idxs.data() + i0, ops, n_ops);
+     }
+
+     wsp_ggml_metal_device_t  dev;
+     wsp_ggml_metal_library_t lib;
+     wsp_ggml_metal_encoder_t enc;
+     wsp_ggml_mem_ranges_t    mem_ranges;
+
+     bool use_fusion;
+     bool use_concurrency;
+     bool use_capture;
+
+     int debug_graph;
+     int debug_fusion;
+
+ private:
+     wsp_ggml_cgraph * gf;
+
+     int idx_start;
+     int idx_end;
+
+     // non-empty node indices
+     std::vector<int> idxs;
+ };
+
+ wsp_ggml_metal_op_t wsp_ggml_metal_op_init(
+         wsp_ggml_metal_device_t dev,
+         wsp_ggml_metal_cmd_buf_t cmd_buf,
+         wsp_ggml_cgraph * gf,
+         int  idx_start,
+         int  idx_end,
+         bool use_fusion,
+         bool use_concurrency,
+         bool use_capture,
+         int  debug_graph,
+         int  debug_fusion) {
+     wsp_ggml_metal_op_t res = new wsp_ggml_metal_op(
+         dev,
+         cmd_buf,
+         gf,
+         idx_start,
+         idx_end,
+         use_fusion,
+         use_concurrency,
+         use_capture,
+         debug_graph,
+         debug_fusion);
+
+     return res;
+ }
+
+ void wsp_ggml_metal_op_free(wsp_ggml_metal_op_t ctx) {
+     delete ctx;
+ }
+
+ int wsp_ggml_metal_op_n_nodes(wsp_ggml_metal_op_t ctx) {
+     return ctx->n_nodes();
+ }
+
+ static bool wsp_ggml_metal_op_concurrency_reset(wsp_ggml_metal_op_t ctx) {
+     if (!ctx->mem_ranges) {
+         return true;
+     }
+
+     wsp_ggml_metal_encoder_memory_barrier(ctx->enc);
+
+     wsp_ggml_mem_ranges_reset(ctx->mem_ranges);
+
+     return true;
+ }
+
+ static bool wsp_ggml_metal_op_concurrency_check(wsp_ggml_metal_op_t ctx, const wsp_ggml_tensor * node) {
+     if (!ctx->mem_ranges) {
+         return false;
+     }
+
+     return wsp_ggml_mem_ranges_check(ctx->mem_ranges, node);
+ }
+
+ static bool wsp_ggml_metal_op_concurrency_add(wsp_ggml_metal_op_t ctx, const wsp_ggml_tensor * node) {
+     if (!ctx->mem_ranges) {
+         return true;
+     }
+
+     return wsp_ggml_mem_ranges_add(ctx->mem_ranges, node);
+ }
+
+ static int wsp_ggml_metal_op_encode_impl(wsp_ggml_metal_op_t ctx, int idx) {
+     struct wsp_ggml_tensor * node = ctx->node(idx);
+
+     //WSP_GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, wsp_ggml_op_name(node->op));
+
+     if (wsp_ggml_is_empty(node)) {
+         return 1;
+     }
+
+     switch (node->op) {
+         case WSP_GGML_OP_NONE:
+         case WSP_GGML_OP_RESHAPE:
+         case WSP_GGML_OP_VIEW:
+         case WSP_GGML_OP_TRANSPOSE:
+         case WSP_GGML_OP_PERMUTE:
+             {
+                 // noop -> next node
+                 if (ctx->debug_graph > 0) {
+                     WSP_GGML_LOG_DEBUG("%s: node[%5d] - %-12s %s\n", __func__, idx, wsp_ggml_op_name(node->op), "(noop)");
+                 }
+             } return 1;
+         default:
+             {
+             } break;
+     }
+
+     if (!wsp_ggml_metal_device_supports_op(ctx->dev, node)) {
+         WSP_GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, wsp_ggml_op_desc(node));
+         WSP_GGML_ABORT("unsupported op");
+     }
+
+     int n_fuse = 1;
+
+     // check if the current node can run concurrently with other nodes before it
+     // the condition is that:
+     //   - the current node cannot write to any previous src or dst ranges
+     //   - the current node cannot read from any previous dst ranges
+     //
+     // if the condition is not satisfied, we put a memory barrier and clear all ranges
+     // otherwise, we add the new ranges to the encoding context and process the node concurrently
+     //
+     {
+         const bool is_concurrent = wsp_ggml_metal_op_concurrency_check(ctx, node);
+
+         if (!is_concurrent) {
+             wsp_ggml_metal_op_concurrency_reset(ctx);
+         }
+
+         if (ctx->debug_graph > 0) {
+             WSP_GGML_LOG_DEBUG("%s: node[%5d] - %-12s %s\n", __func__, idx, wsp_ggml_op_name(node->op), is_concurrent ? "(concurrent)" : "");
+         }
+         if (ctx->debug_graph > 1) {
+             WSP_GGML_TENSOR_LOCALS( int64_t, ne0, node->src[0], ne);
+             WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, node->src[0], nb);
+             WSP_GGML_TENSOR_LOCALS( int64_t, ne1, node->src[1], ne);
+             WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, node->src[1], nb);
+             WSP_GGML_TENSOR_LOCALS( int64_t, ne,  node, ne);
+             WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  node, nb);
+
+             if (node->src[0]) {
+                 WSP_GGML_LOG_DEBUG("%s: src0 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, wsp_ggml_type_name(node->src[0]->type), ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03,
+                         wsp_ggml_is_contiguous(node->src[0]), node->src[0]->name);
+             }
+             if (node->src[1]) {
+                 WSP_GGML_LOG_DEBUG("%s: src1 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, wsp_ggml_type_name(node->src[1]->type), ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
+                         wsp_ggml_is_contiguous(node->src[1]), node->src[1]->name);
+             }
+             if (node) {
+                 WSP_GGML_LOG_DEBUG("%s: node - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], 1, %s\n", __func__, wsp_ggml_type_name(node->type), ne0, ne1, ne2, ne3, nb0, nb1, nb2, nb3,
+                         node->name);
+             }
+         }
+     }
+
+     switch (node->op) {
+         case WSP_GGML_OP_CONCAT:
+             {
+                 n_fuse = wsp_ggml_metal_op_concat(ctx, idx);
+             } break;
+         case WSP_GGML_OP_ADD:
+         case WSP_GGML_OP_SUB:
+         case WSP_GGML_OP_MUL:
+         case WSP_GGML_OP_DIV:
+             {
+                 n_fuse = wsp_ggml_metal_op_bin(ctx, idx);
+             } break;
+         case WSP_GGML_OP_ADD_ID:
+             {
+                 n_fuse = wsp_ggml_metal_op_add_id(ctx, idx);
+             } break;
+         case WSP_GGML_OP_REPEAT:
+             {
+                 n_fuse = wsp_ggml_metal_op_repeat(ctx, idx);
+             } break;
+         case WSP_GGML_OP_ACC:
+             {
+                 n_fuse = wsp_ggml_metal_op_acc(ctx, idx);
+             } break;
+         case WSP_GGML_OP_SCALE:
+             {
+                 n_fuse = wsp_ggml_metal_op_scale(ctx, idx);
+             } break;
+         case WSP_GGML_OP_CLAMP:
+             {
+                 n_fuse = wsp_ggml_metal_op_clamp(ctx, idx);
+             } break;
+         case WSP_GGML_OP_SQR:
+         case WSP_GGML_OP_SQRT:
+         case WSP_GGML_OP_SIN:
+         case WSP_GGML_OP_COS:
+         case WSP_GGML_OP_LOG:
+         case WSP_GGML_OP_UNARY:
+             {
+                 n_fuse = wsp_ggml_metal_op_unary(ctx, idx);
+             } break;
+         case WSP_GGML_OP_GLU:
+             {
+                 n_fuse = wsp_ggml_metal_op_glu(ctx, idx);
+             } break;
+         case WSP_GGML_OP_SUM_ROWS:
+         case WSP_GGML_OP_MEAN:
+             {
+                 n_fuse = wsp_ggml_metal_op_sum_rows(ctx, idx);
+             } break;
+         case WSP_GGML_OP_SOFT_MAX:
+             {
+                 n_fuse = wsp_ggml_metal_op_soft_max(ctx, idx);
+             } break;
+         case WSP_GGML_OP_SSM_CONV:
+             {
+                 n_fuse = wsp_ggml_metal_op_ssm_conv(ctx, idx);
+             } break;
+         case WSP_GGML_OP_SSM_SCAN:
+             {
+                 n_fuse = wsp_ggml_metal_op_ssm_scan(ctx, idx);
+             } break;
+         case WSP_GGML_OP_RWKV_WKV6:
+         case WSP_GGML_OP_RWKV_WKV7:
+             {
+                 n_fuse = wsp_ggml_metal_op_rwkv(ctx, idx);
+             } break;
+         case WSP_GGML_OP_MUL_MAT:
+             {
+                 n_fuse = wsp_ggml_metal_op_mul_mat(ctx, idx);
+             } break;
+         case WSP_GGML_OP_MUL_MAT_ID:
+             {
+                 n_fuse = wsp_ggml_metal_op_mul_mat_id(ctx, idx);
+             } break;
+         case WSP_GGML_OP_GET_ROWS:
+             {
+                 n_fuse = wsp_ggml_metal_op_get_rows(ctx, idx);
+             } break;
+         case WSP_GGML_OP_SET_ROWS:
+             {
+                 n_fuse = wsp_ggml_metal_op_set_rows(ctx, idx);
+             } break;
+         case WSP_GGML_OP_L2_NORM:
+             {
+                 n_fuse = wsp_ggml_metal_op_l2_norm(ctx, idx);
+             } break;
+         case WSP_GGML_OP_GROUP_NORM:
+             {
+                 n_fuse = wsp_ggml_metal_op_group_norm(ctx, idx);
+             } break;
+         case WSP_GGML_OP_NORM:
+         case WSP_GGML_OP_RMS_NORM:
+             {
+                 n_fuse = wsp_ggml_metal_op_norm(ctx, idx);
+             } break;
+         case WSP_GGML_OP_ROPE:
+             {
+                 n_fuse = wsp_ggml_metal_op_rope(ctx, idx);
+             } break;
+         case WSP_GGML_OP_IM2COL:
+             {
+                 n_fuse = wsp_ggml_metal_op_im2col(ctx, idx);
+             } break;
+         case WSP_GGML_OP_CONV_TRANSPOSE_1D:
+             {
+                 n_fuse = wsp_ggml_metal_op_conv_transpose_1d(ctx, idx);
+             } break;
+         case WSP_GGML_OP_UPSCALE:
+             {
+                 n_fuse = wsp_ggml_metal_op_upscale(ctx, idx);
+             } break;
+         case WSP_GGML_OP_PAD:
+             {
+                 n_fuse = wsp_ggml_metal_op_pad(ctx, idx);
+             } break;
+         case WSP_GGML_OP_PAD_REFLECT_1D:
+             {
+                 n_fuse = wsp_ggml_metal_op_pad_reflect_1d(ctx, idx);
+             } break;
+         case WSP_GGML_OP_ARANGE:
+             {
+                 n_fuse = wsp_ggml_metal_op_arange(ctx, idx);
+             } break;
+         case WSP_GGML_OP_TIMESTEP_EMBEDDING:
+             {
+                 n_fuse = wsp_ggml_metal_op_timestep_embedding(ctx, idx);
+             } break;
+         case WSP_GGML_OP_ARGSORT:
+             {
+                 n_fuse = wsp_ggml_metal_op_argsort(ctx, idx);
+             } break;
+         case WSP_GGML_OP_LEAKY_RELU:
+             {
+                 n_fuse = wsp_ggml_metal_op_leaky_relu(ctx, idx);
+             } break;
+         case WSP_GGML_OP_FLASH_ATTN_EXT:
+             {
+                 n_fuse = wsp_ggml_metal_op_flash_attn_ext(ctx, idx);
+             } break;
+         case WSP_GGML_OP_DUP:
+         case WSP_GGML_OP_CPY:
+         case WSP_GGML_OP_CONT:
+             {
+                 n_fuse = wsp_ggml_metal_op_cpy(ctx, idx);
+             } break;
+         case WSP_GGML_OP_POOL_2D:
+             {
+                 n_fuse = wsp_ggml_metal_op_pool_2d(ctx, idx);
+             } break;
+         case WSP_GGML_OP_ARGMAX:
+             {
+                 n_fuse = wsp_ggml_metal_op_argmax(ctx, idx);
+             } break;
+         default:
+             {
+                 WSP_GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, wsp_ggml_op_name(node->op));
+                 WSP_GGML_ABORT("fatal error");
+             }
+     }
+
+     if (ctx->debug_graph > 0) {
+         if (n_fuse > 1) {
+             WSP_GGML_LOG_DEBUG("%s: fuse %d ops\n", __func__, n_fuse);
+         }
+     }
+
+     // update the mem ranges in the encoding context
+     for (int i = 0; i < n_fuse; ++i) {
+         if (!wsp_ggml_metal_op_concurrency_add(ctx, ctx->node(idx + i))) {
+             wsp_ggml_metal_op_concurrency_reset(ctx);
+         }
+     }
+
+     return n_fuse;
+ }
+
+ int wsp_ggml_metal_op_encode(wsp_ggml_metal_op_t ctx, int idx) {
+     if (ctx->use_capture) {
+         wsp_ggml_metal_encoder_debug_group_push(ctx->enc, wsp_ggml_op_desc(ctx->node(idx)));
+     }
+
+     int res = wsp_ggml_metal_op_encode_impl(ctx, idx);
+     if (idx + res > ctx->n_nodes()) {
+         WSP_GGML_ABORT("fusion error: nodes spanning multiple encoders have been fused. this indicates a bug in the fusion logic %s",
+                 "https://github.com/ggml-org/llama.cpp/pull/14849");
+     }
+
+     if (ctx->use_capture) {
+         wsp_ggml_metal_encoder_debug_group_pop(ctx->enc);
+     }
+
+     return res;
+ }
+
+ int wsp_ggml_metal_op_concat(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op, nb);
+
+     const int32_t dim = ((const int32_t *) op->op_params)[0];
+
+     wsp_ggml_metal_kargs_concat args = {
+         /*.ne00 =*/ ne00,
+         /*.ne01 =*/ ne01,
+         /*.ne02 =*/ ne02,
+         /*.ne03 =*/ ne03,
+         /*.nb00 =*/ nb00,
+         /*.nb01 =*/ nb01,
+         /*.nb02 =*/ nb02,
+         /*.nb03 =*/ nb03,
+         /*.ne10 =*/ ne10,
+         /*.ne11 =*/ ne11,
+         /*.ne12 =*/ ne12,
+         /*.ne13 =*/ ne13,
+         /*.nb10 =*/ nb10,
+         /*.nb11 =*/ nb11,
+         /*.nb12 =*/ nb12,
+         /*.nb13 =*/ nb13,
+         /*.ne0  =*/ ne0,
+         /*.ne1  =*/ ne1,
+         /*.ne2  =*/ ne2,
+         /*.ne3  =*/ ne3,
+         /*.nb0  =*/ nb0,
+         /*.nb1  =*/ nb1,
+         /*.nb2  =*/ nb2,
+         /*.nb3  =*/ nb3,
+         /*.dim  =*/ dim,
+     };
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_base(lib, WSP_GGML_OP_CONCAT);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 3);
+
+     const int nth = std::min(1024, ne0);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_repeat(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_repeat(lib, op->type);
+
+     wsp_ggml_metal_kargs_repeat args = {
+         /*.ne00 =*/ ne00,
+         /*.ne01 =*/ ne01,
+         /*.ne02 =*/ ne02,
+         /*.ne03 =*/ ne03,
+         /*.nb00 =*/ nb00,
+         /*.nb01 =*/ nb01,
+         /*.nb02 =*/ nb02,
+         /*.nb03 =*/ nb03,
+         /*.ne0  =*/ ne0,
+         /*.ne1  =*/ ne1,
+         /*.ne2  =*/ ne2,
+         /*.ne3  =*/ ne3,
+         /*.nb0  =*/ nb0,
+         /*.nb1  =*/ nb1,
+         /*.nb2  =*/ nb2,
+         /*.nb3  =*/ nb3,
+     };
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+     const int nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_acc(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     WSP_GGML_ASSERT(op->src[0]->type == WSP_GGML_TYPE_F32);
+     WSP_GGML_ASSERT(op->src[1]->type == WSP_GGML_TYPE_F32);
+     WSP_GGML_ASSERT(op->type         == WSP_GGML_TYPE_F32);
+
+     WSP_GGML_ASSERT(wsp_ggml_is_contiguous(op->src[0]));
+     WSP_GGML_ASSERT(wsp_ggml_is_contiguous(op->src[1]));
+
+     const size_t pnb1 = ((const int32_t *) op->op_params)[0];
+     const size_t pnb2 = ((const int32_t *) op->op_params)[1];
+     const size_t pnb3 = ((const int32_t *) op->op_params)[2];
+     const size_t offs = ((const int32_t *) op->op_params)[3];
+
+     const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
+
+     if (!inplace) {
+         // run a separate kernel to cpy src->dst
+         // not sure how to avoid this
+         // TODO: make a simpler cpy_bytes kernel
+
+         //const id<MTLComputePipelineState> pipeline = ctx->pipelines[WSP_GGML_METAL_PIPELINE_TYPE_CPY_F32_F32].obj;
+         wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
+
+         wsp_ggml_metal_kargs_cpy args = {
+             /*.ne00 =*/ ne00,
+             /*.ne01 =*/ ne01,
+             /*.ne02 =*/ ne02,
+             /*.ne03 =*/ ne03,
+             /*.nb00 =*/ nb00,
+             /*.nb01 =*/ nb01,
+             /*.nb02 =*/ nb02,
+             /*.nb03 =*/ nb03,
+             /*.ne0  =*/ ne0,
+             /*.ne1  =*/ ne1,
+             /*.ne2  =*/ ne2,
+             /*.ne3  =*/ ne3,
+             /*.nb0  =*/ nb0,
+             /*.nb1  =*/ nb1,
+             /*.nb2  =*/ nb2,
+             /*.nb3  =*/ nb3,
+         };
+
+         wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+         wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+         const int nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00);
+
+         wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+         wsp_ggml_metal_op_concurrency_reset(ctx);
+     }
+
+     wsp_ggml_metal_kargs_bin args = {
+         /*.ne00 =*/ ne00,
+         /*.ne01 =*/ ne01,
+         /*.ne02 =*/ ne02,
+         /*.ne03 =*/ ne03,
+         /*.nb00 =*/ nb00,
+         /*.nb01 =*/ pnb1,
+         /*.nb02 =*/ pnb2,
+         /*.nb03 =*/ pnb3,
+         /*.ne10 =*/ ne10,
+         /*.ne11 =*/ ne11,
+         /*.ne12 =*/ ne12,
+         /*.ne13 =*/ ne13,
+         /*.nb10 =*/ nb10,
+         /*.nb11 =*/ nb11,
+         /*.nb12 =*/ nb12,
+         /*.nb13 =*/ nb13,
+         /*.ne0  =*/ ne0,
+         /*.ne1  =*/ ne1,
+         /*.ne2  =*/ ne2,
+         /*.ne3  =*/ ne3,
+         /*.nb0  =*/ nb0,
+         /*.nb1  =*/ pnb1,
+         /*.nb2  =*/ pnb2,
+         /*.nb3  =*/ pnb3,
+         /*.offs =*/ offs,
+         /*.o1   =*/ { 0 },
+     };
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_bin(lib, WSP_GGML_OP_ADD, 1, false);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 3);
+
+     const int nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne11, ne12, ne13, nth, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_scale(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     float scale;
+     float bias;
+     memcpy(&scale, ((const int32_t *) op->op_params) + 0, sizeof(float));
+     memcpy(&bias,  ((const int32_t *) op->op_params) + 1, sizeof(float));
+
+     wsp_ggml_metal_kargs_scale args = {
+         /*.scale =*/ scale,
+         /*.bias  =*/ bias,
+     };
+
+     int64_t n = wsp_ggml_nelements(op);
+
+     if (n % 4 == 0) {
+         n /= 4;
+     }
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_clamp(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     float min;
+     float max;
+     memcpy(&min, ((const int32_t *) op->op_params) + 0, sizeof(float));
+     memcpy(&max, ((const int32_t *) op->op_params) + 1, sizeof(float));
+
+     wsp_ggml_metal_kargs_clamp args = {
+         /*.min =*/ min,
+         /*.max =*/ max,
+     };
+
+     int64_t n = wsp_ggml_nelements(op);
+
+     if (n % 4 == 0) {
+         n /= 4;
+     }
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_unary(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     int64_t n = wsp_ggml_nelements(op);
+
+     if (n % 4 == 0) {
+         n /= 4;
+     }
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 1);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_glu(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     if (op->src[1]) {
+         WSP_GGML_ASSERT(wsp_ggml_are_same_shape(op->src[0], op->src[1]));
+     }
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_glu(lib, op);
+
+     const int32_t swp   = wsp_ggml_get_op_params_i32(op, 1);
+     const float   alpha = wsp_ggml_get_op_params_f32(op, 2);
+     const float   limit = wsp_ggml_get_op_params_f32(op, 3);
+
+     const int32_t i00 = swp ? ne0 : 0;
+     const int32_t i10 = swp ? 0 : ne0;
+
+     wsp_ggml_metal_kargs_glu args = {
+         /*.ne00 =*/ ne00,
+         /*.nb01 =*/ nb01,
+         /*.ne10 =*/ op->src[1] ? ne10 : ne00,
+         /*.nb11 =*/ op->src[1] ? nb11 : nb01,
+         /*.ne0  =*/ ne0,
+         /*.nb1  =*/ nb1,
+         /*.i00  =*/ op->src[1] ? 0 : i00,
+         /*.i10  =*/ op->src[1] ? 0 : i10,
+         /*.alpha=*/ alpha,
+         /*.limit=*/ limit
+     };
+
+     const int64_t nrows = wsp_ggml_nrows(op->src[0]);
+
+     const int32_t nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00/2);
+
+     //[encoder setComputePipelineState:pipeline];
+     //[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+     //if (src1) {
+     //    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+     //} else {
+     //    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+     //}
+     //[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+     //[encoder setBytes:&args length:sizeof(args) atIndex:3];
+
+     //[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     if (op->src[1]) {
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+     } else {
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 2);
+     }
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 3);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_sum_rows(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     wsp_ggml_metal_kargs_sum_rows args = {
+         /*.ne00 =*/ ne00,
+         /*.ne01 =*/ ne01,
+         /*.ne02 =*/ ne02,
+         /*.ne03 =*/ ne03,
+         /*.nb00 =*/ nb00,
+         /*.nb01 =*/ nb01,
+         /*.nb02 =*/ nb02,
+         /*.nb03 =*/ nb03,
+         /*.ne0  =*/ ne0,
+         /*.ne1  =*/ ne1,
+         /*.ne2  =*/ ne2,
+         /*.ne3  =*/ ne3,
+         /*.nb0  =*/ nb0,
+         /*.nb1  =*/ nb1,
+         /*.nb2  =*/ nb2,
+         /*.nb3  =*/ nb3,
+     };
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_sum_rows(lib, op);
+
+     int nth = 32; // SIMD width
+
+     while (nth < ne00 && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+         nth *= 2;
+     }
+
+     nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+     nth = std::min(nth, ne00);
+
+     const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+
+     //[encoder setComputePipelineState:pipeline];
+     //[encoder setBytes:&args length:sizeof(args) atIndex:0];
+     //[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+     //[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+     //[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+     //[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+     wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_get_rows(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
+
+     wsp_ggml_metal_kargs_get_rows args = {
+         /*.ne00 =*/ ne00,
+         /*.nb01 =*/ nb01,
+         /*.nb02 =*/ nb02,
+         /*.ne10 =*/ ne10,
+         /*.nb10 =*/ nb10,
+         /*.nb11 =*/ nb11,
+         /*.nb1  =*/ nb1,
+         /*.nb2  =*/ nb2,
+     };
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 3);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne10, ne11, ne12, 32, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_set_rows(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
+
+     const int32_t nk0 = ne0/wsp_ggml_blck_size(op->type);
+
+     int nth = 32; // SIMD width
+
+     while (nth < nk0 && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+         nth *= 2;
+     }
+
+     int nrptg = 1;
+     if (nth > nk0) {
+         nrptg = (nth + nk0 - 1)/nk0;
+         nth   = nk0;
+
+         if (nrptg*nth > wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+             nrptg--;
+         }
+     }
+
+     nth = std::min(nth, nk0);
+
+     wsp_ggml_metal_kargs_set_rows args = {
+         /*.nk0  =*/ nk0,
+         /*.ne01 =*/ ne01,
+         /*.nb01 =*/ nb01,
+         /*.nb02 =*/ nb02,
+         /*.nb03 =*/ nb03,
+         /*.ne11 =*/ ne11,
+         /*.ne12 =*/ ne12,
+         /*.nb10 =*/ nb10,
+         /*.nb11 =*/ nb11,
+         /*.nb12 =*/ nb12,
+         /*.nb1  =*/ nb1,
+         /*.nb2  =*/ nb2,
+         /*.nb3  =*/ nb3,
+     };
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 3);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nrptg - 1)/nrptg, ne02, ne03, nth, nrptg, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_soft_max(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     float scale;
+     float max_bias;
+
+     memcpy(&scale,    ((const int32_t *) op->op_params) + 0, sizeof(scale));
+     memcpy(&max_bias, ((const int32_t *) op->op_params) + 1, sizeof(max_bias));
+
+     const uint32_t n_head      = op->src[0]->ne[2];
+     const  int32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+     // softmax
+
+     wsp_ggml_metal_kargs_soft_max args = {
+         /*.ne00        =*/ ne00,
+         /*.ne01        =*/ ne01,
+         /*.ne02        =*/ ne02,
+         /*.nb01        =*/ nb01,
+         /*.nb02        =*/ nb02,
+         /*.nb03        =*/ nb03,
+         /*.ne11        =*/ ne11,
+         /*.ne12        =*/ ne12,
+         /*.ne13        =*/ ne13,
+         /*.nb11        =*/ nb11,
+         /*.nb12        =*/ nb12,
+         /*.nb13        =*/ nb13,
+         /*.nb1         =*/ nb1,
+         /*.nb2         =*/ nb2,
+         /*.nb3         =*/ nb3,
+         /*.scale       =*/ scale,
+         /*.max_bias    =*/ max_bias,
+         /*.m0          =*/ m0,
+         /*.m1          =*/ m1,
+         /*.n_head_log2 =*/ n_head_log2,
+     };
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_soft_max(lib, op);
+
+     int nth = 32; // SIMD width
+
+     if (ne00%4 == 0) {
+         while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
+             nth *= 2;
+         }
+     } else {
+         while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
+             nth *= 2;
+         }
+     }
+
+     const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     if (op->src[1]) {
+         wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+     } else {
+         wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 2);
+     }
+     if (op->src[2]) {
+         wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[2]), 3);
+     } else {
+         wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 3);
+     }
+     wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op), 4);
+
+     wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_ssm_conv(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     wsp_ggml_metal_kargs_ssm_conv args = {
+         /*.ne00 =*/ ne00,
+         /*.ne01 =*/ ne01,
+         /*.ne02 =*/ ne02,
+         /*.nb00 =*/ nb00,
+         /*.nb01 =*/ nb01,
+         /*.nb02 =*/ nb02,
+         /*.ne10 =*/ ne10,
+         /*.ne11 =*/ ne11,
+         /*.nb10 =*/ nb10,
+         /*.nb11 =*/ nb11,
+         /*.ne0  =*/ ne0,
+         /*.ne1  =*/ ne1,
+         /*.ne2  =*/ ne2,
+         /*.nb0  =*/ nb0,
+         /*.nb1  =*/ nb1,
+         /*.nb2  =*/ nb2,
+     };
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_ssm_conv(lib, op);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+     wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op), 3);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_ssm_scan(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne4, op->src[4], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb4, op->src[4], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne5, op->src[5], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb5, op->src[5], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne6, op->src[6], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb6, op->src[6], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     const wsp_ggml_tensor * src3 = op->src[3];
+     const wsp_ggml_tensor * src4 = op->src[4];
+     const wsp_ggml_tensor * src5 = op->src[5];
+     const wsp_ggml_tensor * src6 = op->src[6];
+
+     WSP_GGML_ASSERT(src3);
+     WSP_GGML_ASSERT(src4);
+     WSP_GGML_ASSERT(src5);
+     WSP_GGML_ASSERT(src6);
+
+     const int64_t d_state      = ne00;
+     const int64_t d_inner      = ne01;
+     const int64_t n_head       = ne02;
+     const int64_t n_group      = ne41;
+     const int64_t n_seq_tokens = ne12;
+     const int64_t n_seqs       = ne13;
+
+     wsp_ggml_metal_kargs_ssm_scan args = {
+         /*.d_state      =*/ d_state,
+         /*.d_inner      =*/ d_inner,
+         /*.n_head       =*/ n_head,
+         /*.n_group      =*/ n_group,
+         /*.n_seq_tokens =*/ n_seq_tokens,
+         /*.n_seqs       =*/ n_seqs,
+         /*.s_off        =*/ wsp_ggml_nelements(op->src[1]) * sizeof(float),
+         /*.nb01         =*/ nb01,
+         /*.nb02         =*/ nb02,
+         /*.nb03         =*/ nb03,
+         /*.nb11         =*/ nb11,
+         /*.nb12         =*/ nb12,
+         /*.nb13         =*/ nb13,
+         /*.nb21         =*/ nb21,
+         /*.nb22         =*/ nb22,
+         /*.nb31         =*/ nb31,
+         /*.nb41         =*/ nb41,
+         /*.nb42         =*/ nb42,
+         /*.nb43         =*/ nb43,
+         /*.nb51         =*/ nb51,
+         /*.nb52         =*/ nb52,
+         /*.nb53         =*/ nb53,
+     };
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_ssm_scan(lib, op);
+
+     const size_t sms = wsp_ggml_metal_pipeline_get_smem(pipeline);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[2]), 3);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[3]), 4);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[4]), 5);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[5]), 6);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[6]), 7);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 8);
+
+     wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, sms, 0);
+
+     if (ne30 == 1) {
+         // Mamba-2
+         wsp_ggml_metal_encoder_dispatch_threadgroups(enc, d_inner, n_head, n_seqs, d_state, 1, 1);
+     } else {
+         WSP_GGML_ASSERT(d_inner == 1);
+         wsp_ggml_metal_encoder_dispatch_threadgroups(enc, n_head, n_seqs, 1, d_state, 1, 1);
+     }
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_rwkv(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     const int64_t B = op->op == WSP_GGML_OP_RWKV_WKV6 ? op->src[5]->ne[1] : op->src[6]->ne[1];
+     const int64_t T = op->src[0]->ne[2];
+     const int64_t C = op->ne[0];
+     const int64_t H = op->src[0]->ne[1];
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_rwkv(lib, op);
+
+     int ida = 0;
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), ida++);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), ida++);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[2]), ida++);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[3]), ida++);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[4]), ida++);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[5]), ida++);
+     if (op->op == WSP_GGML_OP_RWKV_WKV7) {
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[6]), ida++);
+     }
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), ida++);
+     wsp_ggml_metal_encoder_set_bytes   (enc, (void *) &B, sizeof(B), ida++);
+     wsp_ggml_metal_encoder_set_bytes   (enc, (void *) &T, sizeof(T), ida++);
+     wsp_ggml_metal_encoder_set_bytes   (enc, (void *) &C, sizeof(C), ida++);
+     wsp_ggml_metal_encoder_set_bytes   (enc, (void *) &H, sizeof(H), ida++);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, B * H, 1, 1, C/H, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_cpy(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
+
+     WSP_GGML_ASSERT(ne00 % wsp_ggml_blck_size(op->src[0]->type) == 0);
+
+     // TODO: support
+     //const int32_t nk00 = ne00/wsp_ggml_blck_size(op->type);
+     const int32_t nk00 = ne00;
+
+     int nth = 32; // SIMD width
+
+     while (nth < nk00 && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+         nth *= 2;
+     }
+
+     nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+
+     // when rows are small, we can batch them together in a single threadgroup
+     int nrptg = 1;
+
+     // TODO: relax this constraint in the future
+     if (wsp_ggml_blck_size(op->src[0]->type) == 1 && wsp_ggml_blck_size(op->type) == 1) {
+         if (nth > nk00) {
+             nrptg = (nth + nk00 - 1)/nk00;
+             nth   = nk00;
+
+             if (nrptg*nth > wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+                 nrptg--;
+             }
+         }
+     }
+
+     nth = std::min(nth, nk00);
+
+     wsp_ggml_metal_kargs_cpy args = {
+         /*.ne00 =*/ nk00,
+         /*.ne01 =*/ ne01,
+         /*.ne02 =*/ ne02,
+         /*.ne03 =*/ ne03,
+         /*.nb00 =*/ nb00,
+         /*.nb01 =*/ nb01,
+         /*.nb02 =*/ nb02,
+         /*.nb03 =*/ nb03,
+         /*.ne0  =*/ ne0,
+         /*.ne1  =*/ ne1,
+         /*.ne2  =*/ ne2,
+         /*.ne3  =*/ ne3,
+         /*.nb0  =*/ nb0,
+         /*.nb1  =*/ nb1,
+         /*.nb2  =*/ nb2,
+         /*.nb3  =*/ nb3,
+     };
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, nrptg, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_pool_2d(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     const int32_t * opts = op->op_params;
+     wsp_ggml_op_pool op_pool = (wsp_ggml_op_pool) opts[0];
+
+     const int32_t k0 = opts[1];
+     const int32_t k1 = opts[2];
+     const int32_t s0 = opts[3];
+     const int32_t s1 = opts[4];
+     const int32_t p0 = opts[5];
+     const int32_t p1 = opts[6];
+
+     const int64_t IH = op->src[0]->ne[1];
+     const int64_t IW = op->src[0]->ne[0];
+
+     const int64_t N  = op->ne[3];
+     const int64_t OC = op->ne[2];
+     const int64_t OH = op->ne[1];
+     const int64_t OW = op->ne[0];
+
+     const int64_t np = N * OC * OH * OW;
+
+     wsp_ggml_metal_kargs_pool_2d args_pool_2d = {
+         /* .k0 = */ k0,
+         /* .k1 = */ k1,
+         /* .s0 = */ s0,
+         /* .s1 = */ s1,
+         /* .p0 = */ p0,
+         /* .p1 = */ p1,
+         /* .IH = */ IH,
+         /* .IW = */ IW,
+         /* .OH = */ OH,
+         /* .OW = */ OW,
+         /* .np = */ np
+     };
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_pool_2d(lib, op, op_pool);
+
+     const int nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), (int) np);
+     const int ntg = (np + nth - 1) / nth;
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args_pool_2d, sizeof(args_pool_2d), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ntg, 1, 1, nth, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_mul_mat(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     const wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(ctx->dev);
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op, ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op, nb);
+
+     WSP_GGML_ASSERT(ne00 == ne10);
+
+     WSP_GGML_ASSERT(ne12 % ne02 == 0);
+     WSP_GGML_ASSERT(ne13 % ne03 == 0);
+
+     const int16_t r2 = ne12/ne02;
+     const int16_t r3 = ne13/ne03;
+
+     // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+     // to the matrix-vector kernel
+     const int ne11_mm_min = 8;
+
+     // first try to use small-batch mat-mv kernels
+     // these should be efficient for BS [2, ~8]
+     if (op->src[1]->type == WSP_GGML_TYPE_F32 && (ne00%128 == 0) &&
+         (
+          (
+           (
+            op->src[0]->type == WSP_GGML_TYPE_F32    || // TODO: helper function
+            op->src[0]->type == WSP_GGML_TYPE_F16    ||
+            op->src[0]->type == WSP_GGML_TYPE_Q4_0   ||
+            op->src[0]->type == WSP_GGML_TYPE_Q4_1   ||
+            op->src[0]->type == WSP_GGML_TYPE_Q5_0   ||
+            op->src[0]->type == WSP_GGML_TYPE_Q5_1   ||
+            op->src[0]->type == WSP_GGML_TYPE_Q8_0   ||
+            op->src[0]->type == WSP_GGML_TYPE_MXFP4  ||
+            op->src[0]->type == WSP_GGML_TYPE_IQ4_NL ||
+            false) && (ne11 >= 2 && ne11 <= 8)
+          ) ||
+          (
+           (
+            op->src[0]->type == WSP_GGML_TYPE_Q4_K ||
+            op->src[0]->type == WSP_GGML_TYPE_Q5_K ||
+            op->src[0]->type == WSP_GGML_TYPE_Q6_K ||
+            false) && (ne11 >= 4 && ne11 <= 8)
+          )
+         )
+        ) {
+         // TODO: determine the optimal parameters based on grid utilization
+         //       I still don't know why we should not always use the maximum available threads:
+         //
+         //       nsg = pipeline.maxTotalThreadsPerThreadgroup / 32
+         //
+         //       my current hypothesis is that the work grid is not evenly divisible for different nsg
+         //       values and there can be some tail effects when nsg is high. need to confirm this
+         //
+         const int nsg = 2; // num simdgroups per threadgroup
+
+         // num threads along row per simdgroup
+         int16_t nxpsg = 0;
+         if (ne00 % 256 == 0 && ne11 < 3) {
+             nxpsg = 16;
+         } else if (ne00 % 128 == 0) {
+             nxpsg = 8;
+         } else {
+             nxpsg = 4;
+         }
+
+         const int16_t nypsg = 32/nxpsg; // num threads along col per simdgroup (i.e. a simdgroup processes that many src0 rows at a time)
+         const int16_t r0ptg = nypsg*nsg; // num src0 rows per threadgroup
+         int16_t r1ptg = 4;               // num src1 rows per threadgroup
+
+         // note: not sure how optimal these are across all different hardware. there might be something cleverer
+         switch (ne11) {
+             case 2:
+                 r1ptg = 2; break;
+             case 3:
+             case 6:
+                 r1ptg = 3; break;
+             case 4:
+             case 7:
+             case 8:
+                 r1ptg = 4; break;
+             case 5:
+                 r1ptg = 5; break;
+             default:
+                 WSP_GGML_ABORT("unsupported ne11");
+         };
+
+         wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
+
+         wsp_ggml_metal_kargs_mul_mv_ext args = {
+             /*.ne00 =*/ ne00,
+             /*.ne01 =*/ ne01,
+             /*.ne02 =*/ ne02,
+             /*.nb00 =*/ nb00,
+             /*.nb01 =*/ nb01,
+             /*.nb02 =*/ nb02,
+             /*.nb03 =*/ nb03,
+             /*.ne10 =*/ ne10,
+             /*.ne11 =*/ ne11,
+             /*.ne12 =*/ ne12,
+             /*.nb10 =*/ nb10,
+             /*.nb11 =*/ nb11,
+             /*.nb12 =*/ nb12,
+             /*.nb13 =*/ nb13,
+             /*.ne0  =*/ ne0,
+             /*.ne1  =*/ ne1,
1507
+ /*.r2 =*/ r2,
1508
+ /*.r3 =*/ r3,
1509
+ };
1510
+
1511
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
1512
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1513
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
1514
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
1515
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 3);
1516
+
1517
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + r0ptg - 1)/r0ptg), ((ne11 + r1ptg - 1)/r1ptg), ne12*ne13, 32, nsg, 1);
1518
+ } else if (
1519
+ !wsp_ggml_is_transposed(op->src[0]) &&
1520
+ !wsp_ggml_is_transposed(op->src[1]) &&
1521
+ // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
1522
+ // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
1523
+ props_dev->has_simdgroup_mm && ne00 >= 64 &&
1524
+ (ne11 > ne11_mm_min || (wsp_ggml_is_quantized(op->src[0]->type) && ne12 > 1))) {
1525
+ //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
1526
+
1527
+ // some Metal matrix data types require aligned pointers
1528
+ // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
1529
+ //switch (op->src[0]->type) {
1530
+ // case WSP_GGML_TYPE_F32: WSP_GGML_ASSERT(nb01 % 16 == 0); break;
1531
+ // case WSP_GGML_TYPE_F16: WSP_GGML_ASSERT(nb01 % 8 == 0); break;
1532
+ // case WSP_GGML_TYPE_BF16: WSP_GGML_ASSERT(nb01 % 8 == 0); break;
1533
+ // default: break;
1534
+ //}
1535
+
1536
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mm(lib, op);
1537
+
1538
+ wsp_ggml_metal_kargs_mul_mm args = {
1539
+ /*.ne00 =*/ ne00,
1540
+ /*.ne02 =*/ ne02,
1541
+ /*.nb01 =*/ nb01,
1542
+ /*.nb02 =*/ nb02,
1543
+ /*.nb03 =*/ nb03,
1544
+ /*.ne12 =*/ ne12,
1545
+ /*.nb10 =*/ nb10,
1546
+ /*.nb11 =*/ nb11,
1547
+ /*.nb12 =*/ nb12,
1548
+ /*.nb13 =*/ nb13,
1549
+ /*.ne0 =*/ ne0,
1550
+ /*.ne1 =*/ ne1,
1551
+ /*.r2 =*/ r2,
1552
+ /*.r3 =*/ r3,
1553
+ };
1554
+
1555
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
1556
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1557
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
1558
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
1559
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 3);
1560
+
1561
+ const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
1562
+
1563
+ wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
1564
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
1565
+ } else {
1566
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mv(lib, op);
1567
+
1568
+ const int nr0 = wsp_ggml_metal_pipeline_get_nr0(pipeline);
1569
+ const int nr1 = wsp_ggml_metal_pipeline_get_nr1(pipeline);
1570
+ const int nsg = wsp_ggml_metal_pipeline_get_nsg(pipeline);
1571
+
1572
+ const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
1573
+
1574
+ wsp_ggml_metal_kargs_mul_mv args = {
1575
+ /*.ne00 =*/ ne00,
1576
+ /*.ne01 =*/ ne01,
1577
+ /*.ne02 =*/ ne02,
1578
+ /*.nb00 =*/ nb00,
1579
+ /*.nb01 =*/ nb01,
1580
+ /*.nb02 =*/ nb02,
1581
+ /*.nb03 =*/ nb03,
1582
+ /*.ne10 =*/ ne10,
1583
+ /*.ne11 =*/ ne11,
1584
+ /*.ne12 =*/ ne12,
1585
+ /*.nb10 =*/ nb10,
1586
+ /*.nb11 =*/ nb11,
1587
+ /*.nb12 =*/ nb12,
1588
+ /*.nb13 =*/ nb13,
1589
+ /*.ne0 =*/ ne0,
1590
+ /*.ne1 =*/ ne1,
1591
+ /*.nr0 =*/ nr0,
1592
+ /*.r2 =*/ r2,
1593
+ /*.r3 =*/ r3,
1594
+ };
1595
+
1596
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
1597
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1598
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
1599
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
1600
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 3);
1601
+
1602
+ wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
1603
+
1604
+ if (op->src[0]->type == WSP_GGML_TYPE_F32 ||
1605
+ op->src[0]->type == WSP_GGML_TYPE_F16 ||
1606
+ op->src[0]->type == WSP_GGML_TYPE_BF16 ||
1607
+ op->src[0]->type == WSP_GGML_TYPE_Q8_0) {
1608
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + nr0 - 1)/(nr0)), ((ne11 + nr1 - 1)/nr1), ne12*ne13, 32, nsg, 1);
1609
+ } else {
1610
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + nr0*nsg - 1)/(nr0*nsg)), ((ne11 + nr1 - 1)/nr1), ne12*ne13, 32, nsg, 1);
1611
+ }
1612
+ }
1613
+
1614
+ return 1;
1615
+ }
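
The r2/r3 factors above implement batch broadcasting: src1 may carry r2 times more batches in dim 2 (and r3 in dim 3) than src0, and each src1 batch i12 reads src0 batch i12/r2. A CPU-side illustration with hypothetical sizes (plain C++, not part of the diff):

    #include <cstdio>

    int main() {
        // src0 has ne02 batches, src1 has ne12 = r2*ne02 (the asserts above guarantee divisibility)
        const int ne02 = 2, ne12 = 6;
        const int r2 = ne12/ne02; // broadcast factor, as computed in the diff
        for (int i12 = 0; i12 < ne12; ++i12) {
            printf("src1 batch %d reads src0 batch %d\n", i12, i12/r2);
        }
        return 0;
    }
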
+
+ size_t wsp_ggml_metal_op_mul_mat_id_extra_tpe(const wsp_ggml_tensor * op) {
+     assert(op->op == WSP_GGML_OP_MUL_MAT_ID);
+
+     const int64_t ne02 = op->src[0]->ne[2]; // n_expert
+
+     return wsp_ggml_type_size(WSP_GGML_TYPE_I32)*ne02;
+ }
+
+ size_t wsp_ggml_metal_op_mul_mat_id_extra_ids(const wsp_ggml_tensor * op) {
+     assert(op->op == WSP_GGML_OP_MUL_MAT_ID);
+
+     const int64_t ne02 = op->src[0]->ne[2]; // n_expert
+     const int64_t ne21 = op->src[2]->ne[1]; // n_token
+
+     return wsp_ggml_type_size(WSP_GGML_TYPE_I32)*ne02*ne21;
+ }
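
The two helpers above size scratch regions that the MUL_MAT_ID path appends directly after the dst tensor's own bytes: first the per-expert token counts (tpe), then the token-id map (ids). A sketch of the offset arithmetic with made-up sizes (plain C++, not part of the diff):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // all sizes here are hypothetical; only the offset arithmetic mirrors the diff
        const size_t nbytes_dst = 4096;              // wsp_ggml_nbytes(op)
        const size_t sz_tpe     = sizeof(int32_t)*8; // extra_tpe: one counter per expert (8 experts)
        const size_t off_tpe    = nbytes_dst;        // tpe region starts right after dst
        const size_t off_ids    = off_tpe + sz_tpe;  // ids region follows tpe
        printf("tpe at dst+%zu, ids at dst+%zu\n", off_tpe, off_ids);
        return 0;
    }
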
+
+ int wsp_ggml_metal_op_mul_mat_id(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     const wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(ctx->dev);
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+
+     // src2 = ids
+     WSP_GGML_ASSERT(op->src[2]->type == WSP_GGML_TYPE_I32);
+
+     WSP_GGML_ASSERT(!wsp_ggml_is_transposed(op->src[0]));
+     WSP_GGML_ASSERT(!wsp_ggml_is_transposed(op->src[1]));
+
+     WSP_GGML_ASSERT(ne03 == 1);
+     WSP_GGML_ASSERT(ne13 == 1);
+
+     wsp_ggml_metal_buffer_id bid_src0 = wsp_ggml_metal_get_buffer_id(op->src[0]);
+     wsp_ggml_metal_buffer_id bid_src1 = wsp_ggml_metal_get_buffer_id(op->src[1]);
+     wsp_ggml_metal_buffer_id bid_src2 = wsp_ggml_metal_get_buffer_id(op->src[2]);
+     wsp_ggml_metal_buffer_id bid_dst  = wsp_ggml_metal_get_buffer_id(op);
+
+     const uint32_t r2 = 1;
+     const uint32_t r3 = 1;
+
+     // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+     // to the matrix-vector kernel
+     // ne20 = n_used_experts
+     // ne21 = n_rows (batch size)
+     const int ne21_mm_id_min = 32;
+
+     if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min)) {
+         // some Metal matrix data types require aligned pointers
+         // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
+         //switch (op->src[0]->type) {
+         //    case WSP_GGML_TYPE_F32:  WSP_GGML_ASSERT(nb01 % 16 == 0); break;
+         //    case WSP_GGML_TYPE_F16:  WSP_GGML_ASSERT(nb01 % 8  == 0); break;
+         //    case WSP_GGML_TYPE_BF16: WSP_GGML_ASSERT(nb01 % 8  == 0); break;
+         //    default: break;
+         //}
+
+         // extra buffers for intermediate id mapping
+         wsp_ggml_metal_buffer_id bid_tpe = bid_dst;
+         bid_tpe.offs += wsp_ggml_nbytes(op);
+
+         wsp_ggml_metal_buffer_id bid_ids = bid_tpe;
+         bid_ids.offs += wsp_ggml_metal_op_mul_mat_id_extra_tpe(op);
+
+         {
+             wsp_ggml_metal_kargs_mul_mm_id_map0 args = {
+                 ne02,
+                 ne10,
+                 ne11, // n_expert_used (bcast)
+                 nb11,
+                 nb12,
+                 ne21, // n_tokens
+                 ne20, // n_expert_used
+                 nb21,
+             };
+
+             wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mm_id_map0(lib, ne02, ne20);
+
+             const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+
+             WSP_GGML_ASSERT(ne02 <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+
+             WSP_GGML_ASSERT(smem <= props_dev->max_theadgroup_memory_size);
+
+             wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+             wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+             wsp_ggml_metal_encoder_set_buffer  (enc, bid_src2, 1);
+             wsp_ggml_metal_encoder_set_buffer  (enc, bid_tpe, 2);
+             wsp_ggml_metal_encoder_set_buffer  (enc, bid_ids, 3);
+
+             wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+             wsp_ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, ne02, 1, 1);
+         }
+
+         // this barrier is always needed because the next kernel has to wait for the id maps to be computed
+         wsp_ggml_metal_op_concurrency_reset(ctx);
+
+         {
+             wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
+
+             wsp_ggml_metal_kargs_mul_mm_id args = {
+                 /*.ne00 =*/ ne00,
+                 /*.ne02 =*/ ne02,
+                 /*.nb01 =*/ nb01,
+                 /*.nb02 =*/ nb02,
+                 /*.nb03 =*/ nb03,
+                 /*.ne11 =*/ ne11, // n_expert_used (bcast)
+                 /*.nb10 =*/ nb10,
+                 /*.nb11 =*/ nb11,
+                 /*.nb12 =*/ nb12,
+                 /*.nb13 =*/ nb13,
+                 /*.ne20 =*/ ne20, // n_expert_used
+                 /*.ne21 =*/ ne21, // n_tokens
+                 /*.ne0  =*/ ne0,
+                 /*.ne1  =*/ ne1,
+                 /*.r2   =*/ r2,
+                 /*.r3   =*/ r3,
+             };
+
+             wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+             wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+             wsp_ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+             wsp_ggml_metal_encoder_set_buffer  (enc, bid_src1, 2);
+             wsp_ggml_metal_encoder_set_buffer  (enc, bid_tpe, 3);
+             wsp_ggml_metal_encoder_set_buffer  (enc, bid_ids, 4);
+             wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst, 5);
+
+             const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+
+             wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+             wsp_ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
+         }
+     } else {
+         wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
+
+         const int nr0 = wsp_ggml_metal_pipeline_get_nr0(pipeline);
+         const int nr1 = wsp_ggml_metal_pipeline_get_nr1(pipeline);
+         const int nsg = wsp_ggml_metal_pipeline_get_nsg(pipeline);
+
+         const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+
+         wsp_ggml_metal_kargs_mul_mv_id args = {
+             /*.nei0 =*/ ne20,
+             /*.nei1 =*/ ne21,
+             /*.nbi1 =*/ nb21,
+             /*.ne00 =*/ ne00,
+             /*.ne01 =*/ ne01,
+             /*.ne02 =*/ ne02,
+             /*.nb00 =*/ nb00,
+             /*.nb01 =*/ nb01,
+             /*.nb02 =*/ nb02,
+             /*.ne10 =*/ ne10,
+             /*.ne11 =*/ ne11,
+             /*.ne12 =*/ ne12,
+             /*.ne13 =*/ ne13,
+             /*.nb10 =*/ nb10,
+             /*.nb11 =*/ nb11,
+             /*.nb12 =*/ nb12,
+             /*.ne0  =*/ ne0,
+             /*.ne1  =*/ ne1,
+             /*.nb1  =*/ nb1,
+             /*.nr0  =*/ nr0,
+         };
+
+         if (wsp_ggml_is_quantized(op->src[0]->type)) {
+             WSP_GGML_ASSERT(ne00 >= nsg*nr0);
+         }
+
+         wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+         wsp_ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+         wsp_ggml_metal_encoder_set_buffer(enc, bid_src0, 1);
+         wsp_ggml_metal_encoder_set_buffer(enc, bid_src1, 2);
+         wsp_ggml_metal_encoder_set_buffer(enc, bid_dst, 3);
+         wsp_ggml_metal_encoder_set_buffer(enc, bid_src2, 4);
+
+         const int64_t _ne1 = 1;
+         const int64_t ne123 = ne20*ne21;
+
+         wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+         if (op->src[0]->type == WSP_GGML_TYPE_F32 ||
+             op->src[0]->type == WSP_GGML_TYPE_F16 ||
+             op->src[0]->type == WSP_GGML_TYPE_BF16 ||
+             op->src[0]->type == WSP_GGML_TYPE_Q8_0) {
+             wsp_ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nr0 - 1)/(nr0), (_ne1 + nr1 - 1)/nr1, ne123, 32, nsg, 1);
+         } else {
+             wsp_ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nr0*nsg - 1)/(nr0*nsg), (_ne1 + nr1 - 1)/nr1, ne123, 32, nsg, 1);
+         }
+     }
+
+     return 1;
+ }
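
The map0 kernel dispatched above groups tokens by the expert that src2 routes them to, so the subsequent mul_mm_id kernel can process one expert per z-slice of the grid. A toy CPU model of that grouping (hypothetical routing table; a simplified sketch, not the kernel's actual layout, and not part of the diff):

    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical routing: 3 tokens, 4 experts, 2 experts used per token (src2 contents)
        const int n_expert = 4, n_tokens = 3, n_expert_used = 2;
        const int ids[3][2] = { {0, 2}, {2, 3}, {0, 3} };

        std::vector<int> tpe(n_expert, 0);            // tokens per expert (the "tpe" scratch region)
        std::vector<std::vector<int>> rows(n_expert); // token list per expert (the "ids" scratch region)
        for (int t = 0; t < n_tokens; ++t) {
            for (int k = 0; k < n_expert_used; ++k) {
                const int e = ids[t][k];
                rows[e].push_back(t);
                tpe[e]++;
            }
        }
        for (int e = 0; e < n_expert; ++e) {
            printf("expert %d: %d token(s)\n", e, tpe[e]);
        }
        return 0;
    }
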
+
+ int wsp_ggml_metal_op_add_id(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+
+     WSP_GGML_ASSERT(op->src[0]->type == WSP_GGML_TYPE_F32);
+     WSP_GGML_ASSERT(op->src[1]->type == WSP_GGML_TYPE_F32);
+     WSP_GGML_ASSERT(op->src[2]->type == WSP_GGML_TYPE_I32);
+     WSP_GGML_ASSERT(op->type         == WSP_GGML_TYPE_F32);
+
+     WSP_GGML_ASSERT(wsp_ggml_is_contiguous_rows(op->src[0]));
+
+     wsp_ggml_metal_kargs_add_id args = {
+         /*.ne0  =*/ ne0,
+         /*.ne1  =*/ ne1,
+         /*.nb01 =*/ nb01,
+         /*.nb02 =*/ nb02,
+         /*.nb11 =*/ nb11,
+         /*.nb21 =*/ nb21,
+     };
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_base(lib, WSP_GGML_OP_ADD_ID);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[2]), 3);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 4);
+
+     const int nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, 1, nth, 1, 1);
+
+     return 1;
+ }
+
+ bool wsp_ggml_metal_op_flash_attn_ext_use_vec(const wsp_ggml_tensor * op) {
+     assert(op->op == WSP_GGML_OP_FLASH_ATTN_EXT);
+
+     const int64_t ne00 = op->src[0]->ne[0]; // head size
+     const int64_t ne01 = op->src[0]->ne[1]; // batch size
+
+     // use vec kernel if the batch size is small and if the head size is supported
+     return (ne01 < 20) && (ne00 % 32 == 0);
+ }
+
+ size_t wsp_ggml_metal_op_flash_attn_ext_extra_tmp(const wsp_ggml_tensor * op) {
+     assert(op->op == WSP_GGML_OP_FLASH_ATTN_EXT);
+
+     const int64_t nwg = 32;
+
+     const int64_t ne01 = op->src[0]->ne[1];
+     const int64_t ne02 = op->src[0]->ne[2];
+     const int64_t ne03 = op->src[0]->ne[3];
+     const int64_t ne20 = op->src[2]->ne[0];
+
+     // temp buffer for writing the results from each workgroup
+     // - ne20: the size of the Value head
+     // -  + 2: the S and M values for each intermediate result
+     return wsp_ggml_type_size(WSP_GGML_TYPE_F32)*(ne01*ne02*ne03*nwg*(ne20 + 2));
+ }
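
For reference, the temp-buffer size above works out to one partial output vector of ne20 floats plus the two online-softmax scalars (S and M) per workgroup and per row. A standalone check with made-up dimensions (plain C++, not part of the diff):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t nwg  = 32;                     // workgroups, fixed in the diff
        const int64_t ne01 = 1, ne02 = 8, ne03 = 1;  // hypothetical: queries, heads, batch
        const int64_t ne20 = 64;                     // V head size
        // per row and per workgroup: ne20 partial outputs + the S and M softmax scalars
        const size_t sz = sizeof(float)*(size_t)(ne01*ne02*ne03*nwg*(ne20 + 2));
        printf("tmp buffer: %zu bytes\n", sz);
        return 0;
    }
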
+
+ int wsp_ggml_metal_op_flash_attn_ext(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     const wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(ctx->dev);
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+     WSP_GGML_TENSOR_LOCALS( int32_t, nb,  op,         nb);
+
+     WSP_GGML_ASSERT(ne00 % 4  == 0);
+     WSP_GGML_ASSERT(ne11 % 32 == 0);
+
+     WSP_GGML_ASSERT(op->src[0]->type == WSP_GGML_TYPE_F32);
+     WSP_GGML_ASSERT(op->src[1]->type == op->src[2]->type);
+
+     //WSP_GGML_ASSERT(wsp_ggml_are_same_shape (src1, src2));
+     WSP_GGML_ASSERT(ne11 == ne21);
+     WSP_GGML_ASSERT(ne12 == ne22);
+
+     WSP_GGML_ASSERT(!op->src[3] || op->src[3]->type == WSP_GGML_TYPE_F16);
+     WSP_GGML_ASSERT(!op->src[3] || op->src[3]->ne[1] >= WSP_GGML_PAD(op->src[0]->ne[1], 8) &&
+             "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
+
+     float scale;
+     float max_bias;
+     float logit_softcap;
+
+     memcpy(&scale,         ((const int32_t *) op->op_params) + 0, sizeof(scale));
+     memcpy(&max_bias,      ((const int32_t *) op->op_params) + 1, sizeof(max_bias));
+     memcpy(&logit_softcap, ((const int32_t *) op->op_params) + 2, sizeof(logit_softcap));
+
+     if (logit_softcap != 0.0f) {
+         scale /= logit_softcap;
+     }
+
+     const bool has_mask  = op->src[3] != NULL;
+     const bool has_sinks = op->src[4] != NULL;
+     const bool has_bias  = max_bias != 0.0f;
+     const bool has_scap  = logit_softcap != 0.0f;
+
+     const uint32_t n_head      = op->src[0]->ne[2];
+     const  int32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+     WSP_GGML_ASSERT(ne01 < 65536);
+
+     if (!wsp_ggml_metal_op_flash_attn_ext_use_vec(op)) {
+         // half8x8 kernel
+         const int64_t nqptg = 8;  // queries per threadgroup    !! sync with kernel template arguments !!
+         const int64_t ncpsg = 64; // cache values per simdgroup !! sync with kernel template arguments !!
+
+         WSP_GGML_ASSERT(nqptg <= 32);
+         WSP_GGML_ASSERT(nqptg  % 8  == 0);
+         WSP_GGML_ASSERT(ncpsg  % 32 == 0);
+
+         const int is_q = wsp_ggml_is_quantized(op->src[1]->type) ? 1 : 0;
+
+         // 2*(2*ncpsg)
+         //     ncpsg soft_max values + ncpsg mask values
+         //
+         // 16*32*(nsg)
+         //     the shared memory needed for the simdgroups to load the KV cache
+         //     each thread loads (dequantizes) 16 head elements, there are 32 threads in the SG
+         //
+ #define FATTN_SMEM(nsg) (WSP_GGML_PAD((nqptg*(ne00 + 2*WSP_GGML_PAD(ne20, 64) + 2*(2*ncpsg)) + is_q*(16*32*(nsg)))*(sizeof(float)/2), 16))
+
+         //int64_t nsgmax = 4;
+         //
+         //if (is_q) {
+         //    nsgmax = 2;
+         //    while (true) {
+         //        const size_t smem = FATTN_SMEM(nsgmax);
+         //        if (smem > props_dev->max_theadgroup_memory_size) {
+         //            break;
+         //        }
+         //        nsgmax *= 2;
+         //    }
+         //    nsgmax /= 2;
+         //}
+
+         // simdgroups per threadgroup (a.k.a. warps)
+         //nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
+         int32_t nsg = 4;
+
+         const size_t smem = FATTN_SMEM(nsg);
+
+         wsp_ggml_metal_kargs_flash_attn_ext args = {
+             /*.ne01          =*/ ne01,
+             /*.ne02          =*/ ne02,
+             /*.ne03          =*/ ne03,
+             /*.nb01          =*/ nb01,
+             /*.nb02          =*/ nb02,
+             /*.nb03          =*/ nb03,
+             /*.ne11          =*/ ne11,
+             /*.ne_12_2       =*/ ne12,
+             /*.ne_12_3       =*/ ne13,
+             /*.ns10          =*/ int32_t(nb11/nb10),
+             /*.nb11          =*/ nb11,
+             /*.nb12          =*/ nb12,
+             /*.nb13          =*/ nb13,
+             /*.ns20          =*/ int32_t(nb21/nb20),
+             /*.nb21          =*/ nb21,
+             /*.nb22          =*/ nb22,
+             /*.nb23          =*/ nb23,
+             /*.ne32          =*/ ne32,
+             /*.ne33          =*/ ne33,
+             /*.nb31          =*/ nb31,
+             /*.nb32          =*/ nb32,
+             /*.nb33          =*/ nb33,
+             /*.ne1           =*/ ne1,
+             /*.ne2           =*/ ne2,
+             /*.ne3           =*/ ne3,
+             /*.scale         =*/ scale,
+             /*.max_bias      =*/ max_bias,
+             /*.m0            =*/ m0,
+             /*.m1            =*/ m1,
+             /*.n_head_log2   =*/ n_head_log2,
+             /*.logit_softcap =*/ logit_softcap,
+         };
+
+         wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, nsg);
+
+         wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+         wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[2]), 3);
+         if (op->src[3]) {
+             wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[3]), 4);
+         } else {
+             wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 4);
+         }
+         if (op->src[4]) {
+             wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[4]), 5);
+         } else {
+             wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 5);
+         }
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 6);
+
+         wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+         wsp_ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03, 32, nsg, 1);
+ #undef FATTN_SMEM
+     } else {
+         // half4x4 kernel
+         const int64_t nqptg = 1;  // queries per threadgroup    !! sync with kernel template arguments !!
+         const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
+         const int64_t nkpsg = 1*ncpsg;
+
+         WSP_GGML_ASSERT(nqptg <= 32);
+         WSP_GGML_ASSERT(nqptg  % 1  == 0);
+         WSP_GGML_ASSERT(ncpsg  % 32 == 0);
+
+         // ne00 + 2*ncpsg*(nsg)
+         //     for each query, we load it as f16 in shared memory (ne00)
+         //     and store the soft_max values and the mask
+         //
+         // ne20*(nsg)
+         //     each simdgroup has a full f32 head vector in shared mem to accumulate results
+         //
+ #define FATTN_SMEM(nsg) (WSP_GGML_PAD((nqptg*(WSP_GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*WSP_GGML_PAD(ne20, 128)*(nsg))*(sizeof(float)/2), 16))
+
+         int64_t nsgmax = 2;
+         while (true) {
+             const size_t smem = FATTN_SMEM(nsgmax);
+             // avoid using more than half of the threadgroup memory - can cause slowdowns especially for large head sizes
+             if (smem > props_dev->max_theadgroup_memory_size/2) {
+                 break;
+             }
+             nsgmax *= 2;
+         }
+         nsgmax /= 2;
+
+         // simdgroups per threadgroup (a.k.a. warps)
+         //const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)));
+         const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) 1024/32)));
+
+         int64_t nsg = 1;
+         while (nsg <= nsgt) {
+             nsg *= 2;
+         }
+         nsg /= 2;
+
+         // workgroups
+         // each workgroup handles nsg*nkpsg cache values
+         int32_t nwg = 1;
+         if (false) {
+             // for small KV caches, we could launch a single workgroup and write the results directly to dst
+             // however, this does not lead to significant improvement, so disabled
+             nwg = 1;
+             nsg = 4;
+         } else {
+             nwg = 32;
+             nsg = 1;
+             while (2*nwg*nsg*nkpsg < ne11 && nsg < 4) {
+                 nsg *= 2;
+             }
+         }
+
+         wsp_ggml_metal_kargs_flash_attn_ext_vec args = {
+             /*.ne01          =*/ ne01,
+             /*.ne02          =*/ ne02,
+             /*.ne03          =*/ ne03,
+             /*.nb01          =*/ nb01,
+             /*.nb02          =*/ nb02,
+             /*.nb03          =*/ nb03,
+             /*.ne11          =*/ ne11,
+             /*.ne_12_2       =*/ ne12,
+             /*.ne_12_3       =*/ ne13,
+             /*.ns10          =*/ int32_t(nb11/nb10),
+             /*.nb11          =*/ nb11,
+             /*.nb12          =*/ nb12,
+             /*.nb13          =*/ nb13,
+             /*.ns20          =*/ int32_t(nb21/nb20),
+             /*.nb21          =*/ nb21,
+             /*.nb22          =*/ nb22,
+             /*.nb23          =*/ nb23,
+             /*.ne32          =*/ ne32,
+             /*.ne33          =*/ ne33,
+             /*.nb31          =*/ nb31,
+             /*.nb32          =*/ nb32,
+             /*.nb33          =*/ nb33,
+             /*.ne1           =*/ ne1,
+             /*.ne2           =*/ ne2,
+             /*.ne3           =*/ ne3,
+             /*.scale         =*/ scale,
+             /*.max_bias      =*/ max_bias,
+             /*.m0            =*/ m0,
+             /*.m1            =*/ m1,
+             /*.n_head_log2   =*/ n_head_log2,
+             /*.logit_softcap =*/ logit_softcap,
+         };
+
+         wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, nsg, nwg);
+
+         WSP_GGML_ASSERT(nsg*32 <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+
+         wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+         wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+         wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[2]), 3);
+         if (op->src[3]) {
+             wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[3]), 4);
+         } else {
+             wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 4);
+         }
+         if (op->src[4]) {
+             wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[4]), 5);
+         } else {
+             wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 5);
+         }
+
+         const size_t smem = FATTN_SMEM(nsg);
+
+         //printf("smem: %zu, max: %zu, nsg = %d, nsgmax = %d\n", smem, props_dev->max_theadgroup_memory_size, (int) nsg, (int) nsgmax);
+         WSP_GGML_ASSERT(smem <= props_dev->max_theadgroup_memory_size);
+
+         if (nwg == 1) {
+             // using 1 workgroup -> write the result directly into dst
+             wsp_ggml_metal_encoder_set_buffer(enc, wsp_ggml_metal_get_buffer_id(op), 6);
+
+             wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+             wsp_ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
+         } else {
+             // sanity checks
+             WSP_GGML_ASSERT(ne01*ne02*ne03 == ne1*ne2*ne3);
+             WSP_GGML_ASSERT((uint64_t)ne1*ne2*ne3 <= (1u << 31));
+
+             wsp_ggml_metal_buffer_id bid_dst = wsp_ggml_metal_get_buffer_id(op);
+
+             // write the results from each workgroup into a temp buffer
+             wsp_ggml_metal_buffer_id bid_tmp = bid_dst;
+             bid_tmp.offs += wsp_ggml_nbytes(op);
+             wsp_ggml_metal_encoder_set_buffer(enc, bid_tmp, 6);
+
+             wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+             wsp_ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
+
+             // sync the 2 kernels
+             wsp_ggml_metal_op_concurrency_reset(ctx);
+
+             // reduce the results from the workgroups
+             {
+                 const int32_t nrows = ne1*ne2*ne3;
+
+                 wsp_ggml_metal_kargs_flash_attn_ext_vec_reduce args0 = {
+                     nrows,
+                 };
+
+                 wsp_ggml_metal_pipeline_t pipeline0 = wsp_ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(lib, op, ne20, nwg);
+
+                 wsp_ggml_metal_encoder_set_pipeline(enc, pipeline0);
+                 wsp_ggml_metal_encoder_set_bytes   (enc, &args0, sizeof(args0), 0);
+                 wsp_ggml_metal_encoder_set_buffer  (enc, bid_tmp, 1);
+                 wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst, 2);
+
+                 wsp_ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, 32*nwg, 1, 1);
+             }
+         }
+ #undef FATTN_SMEM
+     }
+
+     return 1;
+ }
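
The m0/m1/n_head_log2 values passed above parameterize the per-head attention-bias slopes; the kernel derives each head's slope from them. A sketch assuming the standard ALiBi slope scheme (plain C++, not part of the diff; the max_bias and n_head values are made up):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const float    max_bias    = 8.0f;  // hypothetical
        const uint32_t n_head      = 12;
        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
        const float m0 = powf(2.0f, -(max_bias       )/n_head_log2);
        const float m1 = powf(2.0f, -(max_bias/2.0f)/n_head_log2);
        for (uint32_t h = 0; h < n_head; ++h) {
            // heads below the largest power of two use m0, the rest interpolate with m1
            const float slope = h < n_head_log2 ? powf(m0, (float)(h + 1))
                                                : powf(m1, (float)(2*(h - n_head_log2) + 1));
            printf("head %2u: slope %.6f\n", h, slope);
        }
        return 0;
    }
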
+
+ int wsp_ggml_metal_op_bin(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     const bool use_fusion = ctx->use_fusion;
+
+     const int debug_fusion = ctx->debug_fusion;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+
+     WSP_GGML_ASSERT(op->src[0]->type == WSP_GGML_TYPE_F32);
+     WSP_GGML_ASSERT(op->src[1]->type == WSP_GGML_TYPE_F32);
+
+     WSP_GGML_ASSERT(wsp_ggml_is_contiguous_rows(op->src[0]));
+     WSP_GGML_ASSERT(wsp_ggml_is_contiguous_rows(op->src[1]));
+
+     bool bcast_row = false;
+
+     wsp_ggml_metal_buffer_id bid_src0 = wsp_ggml_metal_get_buffer_id(op->src[0]);
+     wsp_ggml_metal_buffer_id bid_src1 = wsp_ggml_metal_get_buffer_id(op->src[1]);
+     wsp_ggml_metal_buffer_id bid_dst  = wsp_ggml_metal_get_buffer_id(op);
+
+     wsp_ggml_metal_kargs_bin args = {
+         /*.ne00 =*/ ne00,
+         /*.ne01 =*/ ne01,
+         /*.ne02 =*/ ne02,
+         /*.ne03 =*/ ne03,
+         /*.nb00 =*/ nb00,
+         /*.nb01 =*/ nb01,
+         /*.nb02 =*/ nb02,
+         /*.nb03 =*/ nb03,
+         /*.ne10 =*/ ne10,
+         /*.ne11 =*/ ne11,
+         /*.ne12 =*/ ne12,
+         /*.ne13 =*/ ne13,
+         /*.nb10 =*/ nb10,
+         /*.nb11 =*/ nb11,
+         /*.nb12 =*/ nb12,
+         /*.nb13 =*/ nb13,
+         /*.ne0  =*/ ne0,
+         /*.ne1  =*/ ne1,
+         /*.ne2  =*/ ne2,
+         /*.ne3  =*/ ne3,
+         /*.nb0  =*/ nb0,
+         /*.nb1  =*/ nb1,
+         /*.nb2  =*/ nb2,
+         /*.nb3  =*/ nb3,
+         /*.offs =*/ 0,
+         /*.o1   =*/ { bid_src1.offs },
+     };
+
+     wsp_ggml_op fops[8];
+
+     int n_fuse = 1;
+
+     // c[0] = add(a,    b[0])
+     // c[1] = add(c[0], b[1])
+     // c[2] = add(c[1], b[2])
+     // ...
+     if (use_fusion) {
+         fops[0] = WSP_GGML_OP_ADD;
+         fops[1] = WSP_GGML_OP_ADD;
+         fops[2] = WSP_GGML_OP_ADD;
+         fops[3] = WSP_GGML_OP_ADD;
+         fops[4] = WSP_GGML_OP_ADD;
+         fops[5] = WSP_GGML_OP_ADD;
+         fops[6] = WSP_GGML_OP_ADD;
+         fops[7] = WSP_GGML_OP_ADD;
+
+         // note: in metal, we sometimes encode the graph in parallel so we have to avoid fusing ops
+         //       across splits. idx_end indicates the last node in the current split
+         for (n_fuse = 0; n_fuse <= 6; ++n_fuse) {
+             if (!ctx->can_fuse(idx + n_fuse, fops + n_fuse, 2)) {
+                 break;
+             }
+
+             wsp_ggml_tensor * f0 = ctx->node(idx + n_fuse);
+             wsp_ggml_tensor * f1 = ctx->node(idx + n_fuse + 1);
+
+             if (f0 != f1->src[0]) {
+                 break;
+             }
+
+             // b[0] === b[1] === ...
+             if (!wsp_ggml_are_same_layout(f0->src[1], f1->src[1])) {
+                 break;
+             }
+
+             // only fuse ops if src1 is in the same Metal buffer
+             wsp_ggml_metal_buffer_id bid_fuse = wsp_ggml_metal_get_buffer_id(f1->src[1]);
+             if (bid_fuse.metal != bid_src1.metal) {
+                 break;
+             }
+
+             //ctx->fuse_cnt[ops[n_fuse + 1]->op]++;
+
+             args.o1[n_fuse + 1] = bid_fuse.offs;
+         }
+
+         ++n_fuse;
+
+         if (debug_fusion > 1 && n_fuse > 1) {
+             WSP_GGML_LOG_DEBUG("%s: fuse: ADD x %d\n", __func__, n_fuse);
+         }
+     }
+
+     // the offsets of src1 and all fused buffers are relative to the start of the src1 buffer
+     bid_src1.offs = 0;
+
+     wsp_ggml_metal_pipeline_t pipeline = nullptr;
+
+     if (wsp_ggml_nelements(op->src[1]) == ne10 && wsp_ggml_is_contiguous(op->src[1]) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+         WSP_GGML_ASSERT(wsp_ggml_is_contiguous(op->src[0]));
+
+         // src1 is a row
+         WSP_GGML_ASSERT(ne11 == 1);
+
+         pipeline = wsp_ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, true);
+
+         bcast_row = true;
+     } else {
+         pipeline = wsp_ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, false);
+     }
+
+     if (n_fuse > 1) {
+         bid_dst = wsp_ggml_metal_get_buffer_id(ctx->node(idx + n_fuse - 1));
+
+         for (int i = 1; i < n_fuse; ++i) {
+             if (!wsp_ggml_metal_op_concurrency_check(ctx, ctx->node(idx + i))) {
+                 wsp_ggml_metal_op_concurrency_reset(ctx);
+
+                 break;
+             }
+         }
+     }
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, bid_src1, 2);
+     wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst, 3);
+
+     if (bcast_row) {
+         const int64_t n = wsp_ggml_nelements(op)/4;
+
+         wsp_ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
+     } else {
+         int nth = 32;
+
+         while (16*nth < ne0 && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+             nth *= 2;
+         }
+
+         wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+     }
+
+     return n_fuse;
+ }
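
The fusion scan above walks consecutive ADD nodes as long as each node feeds the next one's src[0] (and the layouts and Metal buffers match), then encodes the whole chain as one kernel; n_fuse counts the fused ops including the first. A toy CPU model of the chain walk (simplified stand-in types, not the real can_fuse, and not part of the diff):

    #include <cstdio>

    // toy stand-in for wsp_ggml_tensor: just an op tag and a src[0] link
    struct node { int op; const node * src0; };

    int main() {
        node a  = { 0, nullptr }; // some producer
        node c0 = { 1, &a  };     // ADD
        node c1 = { 1, &c0 };     // ADD, src0 = previous result
        node c2 = { 1, &c1 };     // ADD, src0 = previous result
        const node * chain[3] = { &c0, &c1, &c2 };

        int n_fuse = 0;
        for (; n_fuse + 1 < 3; ++n_fuse) {
            // the real check also compares op types, layouts and Metal buffers
            if (chain[n_fuse + 1]->src0 != chain[n_fuse]) {
                break;
            }
        }
        ++n_fuse; // count includes the first op, as in the diff
        printf("fused %d ADDs into one kernel\n", n_fuse); // -> 3
        return 0;
    }
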
+
+ int wsp_ggml_metal_op_l2_norm(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+
+     float eps;
+     memcpy(&eps, op->op_params, sizeof(float));
+
+     int nth = 32; // SIMD width
+
+     wsp_ggml_metal_kargs_l2_norm args = {
+         /*.ne00   =*/ ne00,
+         /*.ne00_4 =*/ ne00/4,
+         /*.nb01   =*/ nb01,
+         /*.eps    =*/ eps,
+     };
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_l2_norm(lib, op);
+
+     while (nth < ne00/4 && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+         nth *= 2;
+     }
+
+     nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+     nth = std::min(nth, ne00/4);
+
+     const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+
+     const int64_t nrows = wsp_ggml_nrows(op->src[0]);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+     wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1);
+
+     return 1;
+ }
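
The nth selection above is another recurring idiom in this file: start at the SIMD width, double while there is more work per row, then clamp to both the pipeline limit and the row length. A standalone sketch with made-up limits (plain C++, not part of the diff):

    #include <algorithm>
    #include <cstdio>

    int main() {
        const int ne00_4 = 192;   // hypothetical row length in float4 units
        const int max_tg = 1024;  // hypothetical pipeline thread limit
        int nth = 32;             // SIMD width
        while (nth < ne00_4 && nth < max_tg) {
            nth *= 2;             // grow in powers of two while there is work
        }
        nth = std::min(nth, max_tg);
        nth = std::min(nth, ne00_4); // never exceed the work available per row
        printf("nth = %d\n", nth);   // -> 192
        return 0;
    }
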
+
+ int wsp_ggml_metal_op_group_norm(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+
+     const int32_t ngrp = ((const int32_t *) op->op_params)[0];
+
+     float eps;
+     memcpy(&eps, op->op_params + 1, sizeof(float));
+
+     wsp_ggml_metal_kargs_group_norm args = {
+         /*.ne00 =*/ ne00,
+         /*.ne01 =*/ ne01,
+         /*.ne02 =*/ ne02,
+         /*.nb00 =*/ nb00,
+         /*.nb01 =*/ nb01,
+         /*.nb02 =*/ nb02,
+         /*.ngrp =*/ ngrp,
+         /*.eps  =*/ eps,
+     };
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_group_norm(lib, op);
+
+     int nth = 32; // SIMD width
+     //while (nth < ne00/4 && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+     //    nth *= 2;
+     //}
+
+     //nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+     //nth = std::min(nth, ne00/4);
+
+     const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+     wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ngrp, 1, 1, nth, 1, 1);
+
+     return 1;
+ }
+
+ int wsp_ggml_metal_op_norm(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     const bool use_fusion = ctx->use_fusion;
+
+     const int debug_fusion = ctx->debug_fusion;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+
+     float eps;
+     memcpy(&eps, op->op_params, sizeof(float));
+
+     wsp_ggml_metal_buffer_id bid_src0 = wsp_ggml_metal_get_buffer_id(op->src[0]);
+     wsp_ggml_metal_buffer_id bid_dst  = wsp_ggml_metal_get_buffer_id(op);
+
+     wsp_ggml_metal_kargs_norm args = {
+         /*.ne00   =*/ ne00,
+         /*.ne00_t =*/ ne00 % 4 == 0 ? ne00/4 : ne00,
+         /*.nb1    =*/ nb1,
+         /*.nb2    =*/ nb2,
+         /*.nb3    =*/ nb3,
+         /*.eps    =*/ eps,
+         /*.nef1   =*/ { ne01 },
+         /*.nef2   =*/ { ne02 },
+         /*.nef3   =*/ { ne03 },
+         /*.nbf1   =*/ { nb01 },
+         /*.nbf2   =*/ { nb02 },
+         /*.nbf3   =*/ { nb03 },
+     };
+
+     wsp_ggml_op fops[8];
+
+     int n_fuse = 1;
+
+     wsp_ggml_metal_buffer_id bid_fuse[2] = { bid_src0, bid_src0 };
+
+     // d[0] = norm(a)
+     // d[1] = mul(d[0], b)
+     // d[2] = add(d[1], c)
+     if (use_fusion) {
+         fops[0] = op->op;
+         fops[1] = WSP_GGML_OP_MUL;
+         fops[2] = WSP_GGML_OP_ADD;
+
+         for (n_fuse = 0; n_fuse <= 1; ++n_fuse) {
+             if (!ctx->can_fuse(idx + n_fuse, fops + n_fuse, 2)) {
+                 break;
+             }
+
+             wsp_ggml_tensor * f0 = ctx->node(idx + n_fuse);
+             wsp_ggml_tensor * f1 = ctx->node(idx + n_fuse + 1);
+
+             if (f0 != f1->src[0]) {
+                 break;
+             }
+
+             if (f1->src[1]->ne[0] != op->ne[0]) {
+                 break;
+             }
+
+             if (!wsp_ggml_is_contiguous_rows(f1->src[1])) {
+                 break;
+             }
+
+             if (f1->type != WSP_GGML_TYPE_F32) {
+                 break;
+             }
+
+             //ctx->fuse_cnt[f1->op]++;
+
+             bid_fuse[n_fuse] = wsp_ggml_metal_get_buffer_id(f1->src[1]);
+
+             args.nef1[n_fuse + 1] = f1->src[1]->ne[1];
+             args.nef2[n_fuse + 1] = f1->src[1]->ne[2];
+             args.nef3[n_fuse + 1] = f1->src[1]->ne[3];
+
+             args.nbf1[n_fuse + 1] = f1->src[1]->nb[1];
+             args.nbf2[n_fuse + 1] = f1->src[1]->nb[2];
+             args.nbf3[n_fuse + 1] = f1->src[1]->nb[3];
+         }
+
+         ++n_fuse;
+
+         if (debug_fusion > 1 && n_fuse > 1) {
+             if (n_fuse == 2) {
+                 WSP_GGML_LOG_DEBUG("%s: fuse: %s + MUL\n", __func__, wsp_ggml_op_name(op->op));
+             }
+             if (n_fuse == 3) {
+                 WSP_GGML_LOG_DEBUG("%s: fuse: %s + MUL + ADD\n", __func__, wsp_ggml_op_name(op->op));
+             }
+         }
+     }
+
+     if (n_fuse > 1) {
+         bid_dst = wsp_ggml_metal_get_buffer_id(ctx->node(idx + n_fuse - 1));
+
+         for (int i = 1; i < n_fuse; ++i) {
+             if (!wsp_ggml_metal_op_concurrency_check(ctx, ctx->node(idx + i))) {
+                 wsp_ggml_metal_op_concurrency_reset(ctx);
+
+                 break;
+             }
+         }
+     }
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
+
+     int nth = 32; // SIMD width
+
+     while (nth < args.ne00_t && nth < wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+         nth *= 2;
+     }
+
+     nth = std::min(nth, wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+     nth = std::min(nth, args.ne00_t);
+
+     const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, bid_fuse[0], 2);
+     wsp_ggml_metal_encoder_set_buffer  (enc, bid_fuse[1], 3);
+     wsp_ggml_metal_encoder_set_buffer  (enc, bid_dst, 4);
+
+     wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+     return n_fuse;
+ }
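
For reference, the fused NORM + MUL + ADD path computes, per row, d = norm(a)*b + c in a single kernel. A scalar CPU reference of that math for WSP_GGML_OP_NORM (RMS_NORM would skip the mean subtraction); plain C++ with made-up inputs, not part of the diff:

    #include <cmath>
    #include <cstdio>

    int main() {
        // one row; w and b play the roles of the fused MUL and ADD operands
        const int n = 4;
        const float x[4] = {1, 2, 3, 4}, w[4] = {0.5f, 0.5f, 0.5f, 0.5f}, b[4] = {1, 1, 1, 1};
        const float eps = 1e-5f;

        float mean = 0.0f;
        for (int i = 0; i < n; ++i) mean += x[i];
        mean /= n;

        float var = 0.0f;
        for (int i = 0; i < n; ++i) var += (x[i] - mean)*(x[i] - mean);
        var /= n;

        const float s = 1.0f/sqrtf(var + eps);
        for (int i = 0; i < n; ++i) {
            printf("%f\n", (x[i] - mean)*s*w[i] + b[i]); // d = add(mul(norm(a), b), c)
        }
        return 0;
    }
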
+
+ int wsp_ggml_metal_op_rope(wsp_ggml_metal_op_t ctx, int idx) {
+     wsp_ggml_tensor * op = ctx->node(idx);
+
+     wsp_ggml_metal_library_t lib = ctx->lib;
+     wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+     WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+     WSP_GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+     WSP_GGML_TENSOR_LOCALS(uint32_t, nb,  op,         nb);
+
+     // make sure we have one or more position ids (ne10) per token (ne02)
+     WSP_GGML_ASSERT(ne10 % ne02 == 0);
+     WSP_GGML_ASSERT(ne10 >= ne02);
+
+     const int nth = std::min(1024, ne00);
+
+     const int n_past     = ((const int32_t *) op->op_params)[0];
+     const int n_dims     = ((const int32_t *) op->op_params)[1];
+     //const int mode     = ((const int32_t *) op->op_params)[2];
+     // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+     const int n_ctx_orig = ((const int32_t *) op->op_params)[4];
+
+     float freq_base;
+     float freq_scale;
+     float ext_factor;
+     float attn_factor;
+     float beta_fast;
+     float beta_slow;
+
+     memcpy(&freq_base,   (const int32_t *) op->op_params +  5, sizeof(float));
+     memcpy(&freq_scale,  (const int32_t *) op->op_params +  6, sizeof(float));
+     memcpy(&ext_factor,  (const int32_t *) op->op_params +  7, sizeof(float));
+     memcpy(&attn_factor, (const int32_t *) op->op_params +  8, sizeof(float));
+     memcpy(&beta_fast,   (const int32_t *) op->op_params +  9, sizeof(float));
+     memcpy(&beta_slow,   (const int32_t *) op->op_params + 10, sizeof(float));
+
+     // mrope
+     const int sect_0 = ((const int32_t *) op->op_params)[11];
+     const int sect_1 = ((const int32_t *) op->op_params)[12];
+     const int sect_2 = ((const int32_t *) op->op_params)[13];
+     const int sect_3 = ((const int32_t *) op->op_params)[14];
+
+     wsp_ggml_metal_kargs_rope args = {
+         /*.ne00        =*/ ne00,
+         /*.ne01        =*/ ne01,
+         /*.ne02        =*/ ne02,
+         /*.ne03        =*/ ne03,
+         /*.nb00        =*/ nb00,
+         /*.nb01        =*/ nb01,
+         /*.nb02        =*/ nb02,
+         /*.nb03        =*/ nb03,
+         /*.ne0         =*/ ne0,
+         /*.ne1         =*/ ne1,
+         /*.ne2         =*/ ne2,
+         /*.ne3         =*/ ne3,
+         /*.nb0         =*/ nb0,
+         /*.nb1         =*/ nb1,
+         /*.nb2         =*/ nb2,
+         /*.nb3         =*/ nb3,
+         /*.n_past      =*/ n_past,
+         /*.n_dims      =*/ n_dims,
+         /*.n_ctx_orig  =*/ n_ctx_orig,
+         /*.freq_base   =*/ freq_base,
+         /*.freq_scale  =*/ freq_scale,
+         /*.ext_factor  =*/ ext_factor,
+         /*.attn_factor =*/ attn_factor,
+         /*.beta_fast   =*/ beta_fast,
+         /*.beta_slow   =*/ beta_slow,
+         /* sect_0      =*/ sect_0,
+         /* sect_1      =*/ sect_1,
+         /* sect_2      =*/ sect_2,
+         /* sect_3      =*/ sect_3,
+     };
+
+     wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_rope(lib, op);
+
+     wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+     wsp_ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
+     if (op->src[2]) {
+         wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[2]), 3);
+     } else {
+         wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 3);
+     }
+     wsp_ggml_metal_encoder_set_buffer  (enc, wsp_ggml_metal_get_buffer_id(op), 4);
+
+     wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+     return 1;
+ }
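
op_params is declared as an int32 array, but several slots carry raw float bits, which is why the code above reads them with memcpy rather than a cast (well-defined type punning). A minimal round-trip sketch (plain C++, not part of the diff; slot 5 / freq_base chosen to match the usage above):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        int32_t op_params[16] = {0};
        const float freq_base = 10000.0f;
        memcpy(op_params + 5, &freq_base, sizeof(float)); // producer stores raw float bits
        float out;
        memcpy(&out, op_params + 5, sizeof(float));       // consumer reads them back, as in the diff
        printf("freq_base = %.1f\n", out);
        return 0;
    }
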
2710
+
2711
+ int wsp_ggml_metal_op_im2col(wsp_ggml_metal_op_t ctx, int idx) {
2712
+ wsp_ggml_tensor * op = ctx->node(idx);
2713
+
2714
+ wsp_ggml_metal_library_t lib = ctx->lib;
2715
+ wsp_ggml_metal_encoder_t enc = ctx->enc;
2716
+
2717
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2718
+ WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2719
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2720
+ WSP_GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2721
+
2722
+ const int32_t s0 = ((const int32_t *)(op->op_params))[0];
2723
+ const int32_t s1 = ((const int32_t *)(op->op_params))[1];
2724
+ const int32_t p0 = ((const int32_t *)(op->op_params))[2];
2725
+ const int32_t p1 = ((const int32_t *)(op->op_params))[3];
2726
+ const int32_t d0 = ((const int32_t *)(op->op_params))[4];
2727
+ const int32_t d1 = ((const int32_t *)(op->op_params))[5];
2728
+
2729
+ const bool is_2D = ((const int32_t *)(op->op_params))[6] == 1;
2730
+
2731
+ const int32_t N = op->src[1]->ne[is_2D ? 3 : 2];
2732
+ const int32_t IC = op->src[1]->ne[is_2D ? 2 : 1];
2733
+ const int32_t IH = is_2D ? op->src[1]->ne[1] : 1;
2734
+ const int32_t IW = op->src[1]->ne[0];
2735
+
2736
+ const int32_t KH = is_2D ? op->src[0]->ne[1] : 1;
2737
+ const int32_t KW = op->src[0]->ne[0];
2738
+
2739
+ const int32_t OH = is_2D ? op->ne[2] : 1;
2740
+ const int32_t OW = op->ne[1];
2741
+
2742
+ const int32_t CHW = IC * KH * KW;
2743
+
2744
+ const uint64_t ofs0 = op->src[1]->nb[is_2D ? 3 : 2] / 4;
2745
+ const uint64_t ofs1 = op->src[1]->nb[is_2D ? 2 : 1] / 4;
2746
+
2747
+ wsp_ggml_metal_kargs_im2col args = {
2748
+ /*.ofs0 =*/ ofs0,
2749
+ /*.ofs1 =*/ ofs1,
2750
+ /*.IW =*/ IW,
2751
+ /*.IH =*/ IH,
2752
+ /*.CHW =*/ CHW,
2753
+ /*.s0 =*/ s0,
2754
+ /*.s1 =*/ s1,
2755
+ /*.p0 =*/ p0,
2756
+ /*.p1 =*/ p1,
2757
+ /*.d0 =*/ d0,
2758
+ /*.d1 =*/ d1,
2759
+ /*.N =*/ N,
2760
+ /*.KH =*/ KH,
2761
+ /*.KW =*/ KW,
2762
+ /*.KHW =*/ KH * KW,
2763
+ };
2764
+
2765
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_im2col(lib, op);
2766
+
2767
+ WSP_GGML_ASSERT(KH*KW <= wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
2768
+
2769
+ const uint64_t ntptg0 = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N);
2770
+
2771
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
2772
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2773
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 1);
2774
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 2);
2775
+
2776
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW);
2777
+
2778
+ return 1;
2779
+ }
2780
+
2781
+ int wsp_ggml_metal_op_conv_transpose_1d(wsp_ggml_metal_op_t ctx, int idx) {
2782
+ wsp_ggml_tensor * op = ctx->node(idx);
2783
+
2784
+ wsp_ggml_metal_library_t lib = ctx->lib;
2785
+ wsp_ggml_metal_encoder_t enc = ctx->enc;
2786
+
2787
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2788
+ WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2789
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
2790
+ WSP_GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
2791
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2792
+ WSP_GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2793
+
2794
+ const int32_t s0 = ((const int32_t *)(op->op_params))[0];
2795
+
2796
+ const int32_t IC = op->src[1]->ne[1];
2797
+ const int32_t IL = op->src[1]->ne[0];
2798
+
2799
+ const int32_t K = op->src[0]->ne[0];
2800
+
2801
+ const int32_t OL = op->ne[0];
2802
+ const int32_t OC = op->ne[1];
2803
+
2804
+ wsp_ggml_metal_kargs_conv_transpose_1d args = {
2805
+ /*.IC =*/ IC,
2806
+ /*.IL =*/ IL,
2807
+ /*.K =*/ K,
2808
+ /*.s0 =*/ s0,
2809
+ /*.nb0 =*/ nb0,
2810
+ /*.nb1 =*/ nb1,
2811
+ };
2812
+
2813
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_conv_transpose_1d(lib, op);
2814
+
2815
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
2816
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2817
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
2818
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[1]), 2);
2819
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 3);
2820
+
2821
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, OL, OC, 1, 1, 1, 1);
2822
+
2823
+ return 1;
2824
+ }
2825
+
2826
+ int wsp_ggml_metal_op_upscale(wsp_ggml_metal_op_t ctx, int idx) {
2827
+ wsp_ggml_tensor * op = ctx->node(idx);
2828
+
2829
+ wsp_ggml_metal_library_t lib = ctx->lib;
2830
+ wsp_ggml_metal_encoder_t enc = ctx->enc;
2831
+
2832
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2833
+ WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2834
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2835
+ WSP_GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2836
+
2837
+ const float sf0 = (float)ne0/op->src[0]->ne[0];
2838
+ const float sf1 = (float)ne1/op->src[0]->ne[1];
2839
+ const float sf2 = (float)ne2/op->src[0]->ne[2];
2840
+ const float sf3 = (float)ne3/op->src[0]->ne[3];
2841
+
2842
+ wsp_ggml_metal_kargs_upscale args = {
2843
+ /*.ne00 =*/ ne00,
2844
+ /*.ne01 =*/ ne01,
2845
+ /*.ne02 =*/ ne02,
2846
+ /*.ne03 =*/ ne03,
2847
+ /*.nb00 =*/ nb00,
2848
+ /*.nb01 =*/ nb01,
2849
+ /*.nb02 =*/ nb02,
2850
+ /*.nb03 =*/ nb03,
2851
+ /*.ne0 =*/ ne0,
2852
+ /*.ne1 =*/ ne1,
2853
+ /*.ne2 =*/ ne2,
2854
+ /*.ne3 =*/ ne3,
2855
+ /*.nb0 =*/ nb0,
2856
+ /*.nb1 =*/ nb1,
2857
+ /*.nb2 =*/ nb2,
2858
+ /*.nb3 =*/ nb3,
2859
+ /*.sf0 =*/ sf0,
2860
+ /*.sf1 =*/ sf1,
2861
+ /*.sf2 =*/ sf2,
2862
+ /*.sf3 =*/ sf3
2863
+ };
2864
+
2865
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_upscale(lib, op);
2866
+
2867
+ const int nth = std::min(wsp_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);
2868
+
2869
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
2870
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2871
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
2872
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 2);
2873
+
2874
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
2875
+
2876
+ return 1;
2877
+ }
2878
+
2879
+ int wsp_ggml_metal_op_pad(wsp_ggml_metal_op_t ctx, int idx) {
+ wsp_ggml_tensor * op = ctx->node(idx);
+
+ wsp_ggml_metal_library_t lib = ctx->lib;
+ wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ WSP_GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ wsp_ggml_metal_kargs_pad args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.ne03 =*/ ne03,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.ne2 =*/ ne2,
+ /*.ne3 =*/ ne3,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ /*.nb3 =*/ nb3
+ };
+
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_pad(lib, op);
+
+ const int nth = std::min(1024, ne0);
+
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
+
+ return 1;
+ }
+
+ int wsp_ggml_metal_op_pad_reflect_1d(wsp_ggml_metal_op_t ctx, int idx) {
+ wsp_ggml_tensor * op = ctx->node(idx);
+
+ wsp_ggml_metal_library_t lib = ctx->lib;
+ wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ WSP_GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ wsp_ggml_metal_kargs_pad_reflect_1d args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.ne03 =*/ ne03,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.ne2 =*/ ne2,
+ /*.ne3 =*/ ne3,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ /*.nb3 =*/ nb3,
+ /*.p0 =*/ ((const int32_t *)(op->op_params))[0],
+ /*.p1 =*/ ((const int32_t *)(op->op_params))[1]
+ };
+
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_pad_reflect_1d(lib, op);
+
+ const int nth = std::min(1024, ne0);
+
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
+
+ return 1;
+ }
+
+ int wsp_ggml_metal_op_arange(wsp_ggml_metal_op_t ctx, int idx) {
+ wsp_ggml_tensor * op = ctx->node(idx);
+
+ wsp_ggml_metal_library_t lib = ctx->lib;
+ wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ WSP_GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ float start;
+ float step;
+
+ memcpy(&start, ((const int32_t *) op->op_params) + 0, sizeof(float));
+ memcpy(&step, ((const int32_t *) op->op_params) + 2, sizeof(float));
+
+ wsp_ggml_metal_kargs_arange args = {
+ /*.ne0 =*/ ne0,
+ /*.start =*/ start,
+ /*.step =*/ step
+ };
+
+ const int nth = std::min(1024, ne0);
+
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_arange(lib, op);
+
+ //[encoder setComputePipelineState:pipeline];
+ //[encoder setBuffer:id_dst offset:offs_dst atIndex:0];
+ //[encoder setBytes:&args length:sizeof(args) atIndex:1];
+
+ //[encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 1);
+
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, nth, 1, 1);
+
+ return 1;
+ }
+
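Note how start and step are recovered from op_params: ggml stores these floats inside an int32_t parameter block (start, stop, step at words 0, 1, 2), and memcpy is the portable way to type-pun them back without violating strict aliasing; stop is skipped here because ne0 already fixes the element count. A small illustrative helper (names are mine, not the package's):

    #include <cstdint>
    #include <cstring>

    // Illustrative: read the float stored at a given 32-bit word of an
    // op_params block.
    float read_param_f32(const int32_t * params, int word) {
        float v;
        std::memcpy(&v, params + word, sizeof(float));
        return v;
    }

    // e.g. start = read_param_f32(params, 0); step = read_param_f32(params, 2);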
+ int wsp_ggml_metal_op_timestep_embedding(wsp_ggml_metal_op_t ctx, int idx) {
+ wsp_ggml_tensor * op = ctx->node(idx);
+
+ wsp_ggml_metal_library_t lib = ctx->lib;
+ wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ WSP_GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ const int dim = op->op_params[0];
+ const int max_period = op->op_params[1];
+
+ wsp_ggml_metal_kargs_timestep_embedding args = {
+ /*.nb1 =*/ nb1,
+ /*.dim =*/ dim,
+ /*.max_period =*/ max_period,
+ };
+
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_timestep_embedding(lib, op);
+
+ const int nth = std::max(1, std::min(1024, dim/2));
+
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, ne00, 1, 1, nth, 1, 1);
+
+ return 1;
+ }
+
+ int wsp_ggml_metal_op_argmax(wsp_ggml_metal_op_t ctx, int idx) {
+ wsp_ggml_tensor * op = ctx->node(idx);
+
+ wsp_ggml_metal_library_t lib = ctx->lib;
+ wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ WSP_GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ wsp_ggml_metal_kargs_argmax args = {
+ /*.ne00 = */ ne00,
+ /*.nb01 = */ nb01,
+ };
+
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_argmax(lib, op);
+
+ const int64_t nrows = wsp_ggml_nrows(op->src[0]);
+
+ int nth = 32; // SIMD width
+ while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
+ nth *= 2;
+ }
+
+ const size_t smem = wsp_ggml_metal_pipeline_get_smem(pipeline);
+
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+ wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1);
+
+ return 1;
+ }
+
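The threadgroup-size heuristic here starts at the SIMD width and doubles while the row is longer than the current width, keeping the product with the outer dimensions under 256. A standalone restatement of that sizing logic (a sketch using the same constants, not a package API):

    // Begin at SIMD width 32 and double until the row (ne00) is covered
    // or nth * ne01 * ne02 * ne03 would reach 256.
    int pick_nth(int ne00, int ne01, int ne02, int ne03) {
        int nth = 32;
        while (nth < ne00 && nth * ne01 * ne02 * ne03 < 256) {
            nth *= 2;
        }
        return nth;
    }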
+ int wsp_ggml_metal_op_argsort(wsp_ggml_metal_op_t ctx, int idx) {
+ wsp_ggml_tensor * op = ctx->node(idx);
+
+ wsp_ggml_metal_library_t lib = ctx->lib;
+ wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ WSP_GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ // bitonic sort requires the number of elements to be power of 2
+ int64_t ne00_padded = 1;
+ while (ne00_padded < ne00) {
+ ne00_padded *= 2;
+ }
+
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_argsort(lib, op);
+
+ const int64_t nrows = wsp_ggml_nrows(op->src[0]);
+
+ // Metal kernels require the buffer size to be multiple of 16 bytes
+ // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
+ const size_t smem = WSP_GGML_PAD(ne00_padded*sizeof(int32_t), 16);
+
+ wsp_ggml_metal_kargs_argsort args = {
+ /*.ncols =*/ ne00,
+ /*.ncols_pad =*/ ne00_padded
+ };
+
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+ wsp_ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, 1, nrows, 1, ne00_padded, 1, 1);
+
+ return 1;
+ }
+
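Two constraints shape this dispatch: the bitonic sort kernel needs a power-of-two column count, and Metal's setThreadgroupMemoryLength requires the length to be a multiple of 16 bytes. Equivalent standalone helpers (illustrative, mirroring the padding loop and WSP_GGML_PAD above):

    #include <cstddef>
    #include <cstdint>

    // Round a column count up to the next power of two, as the bitonic
    // sort kernel requires.
    int64_t next_pow2(int64_t n) {
        int64_t p = 1;
        while (p < n) p *= 2;
        return p;
    }

    // Round a byte length up to a multiple of 16; same effect as
    // WSP_GGML_PAD(n, 16).
    size_t pad16(size_t n) {
        return (n + 15) / 16 * 16;
    }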
+ int wsp_ggml_metal_op_leaky_relu(wsp_ggml_metal_op_t ctx, int idx) {
+ wsp_ggml_tensor * op = ctx->node(idx);
+
+ wsp_ggml_metal_library_t lib = ctx->lib;
+ wsp_ggml_metal_encoder_t enc = ctx->enc;
+
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ WSP_GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ WSP_GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ WSP_GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ float slope;
+ memcpy(&slope, op->op_params, sizeof(float));
+
+ wsp_ggml_metal_kargs_leaky_relu args = {
+ /*.slope =*/ slope
+ };
+
+ wsp_ggml_metal_pipeline_t pipeline = wsp_ggml_metal_library_get_pipeline_unary(lib, op);
+
+ int64_t n = wsp_ggml_nelements(op);
+
+ if (n % 4 == 0) {
+ n /= 4;
+ }
+
+ wsp_ggml_metal_encoder_set_pipeline(enc, pipeline);
+ wsp_ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op->src[0]), 1);
+ wsp_ggml_metal_encoder_set_buffer (enc, wsp_ggml_metal_get_buffer_id(op), 2);
+
+ wsp_ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
+
+ return 1;
+ }
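The n /= 4 branch presumably targets a vectorized (float4) variant of the unary pipeline, with one thread handling four elements; the kernel source is outside this hunk, so that reading is inferred rather than confirmed. For reference, the scalar operation being dispatched is simply:

    // leaky_relu: identity for positive inputs, scaled by slope otherwise.
    float leaky_relu(float x, float slope) {
        return x > 0.0f ? x : slope * x;
    }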