@elizaos/app-core 2.0.0-beta.1 → 2.0.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/package.json +2 -2
  2. package/platforms/electrobun/native/macos/window-effects.mm +103 -0
  3. package/platforms/electrobun/package.json +9 -0
  4. package/platforms/electrobun/src/__stubs__/bun-ffi.ts +16 -0
  5. package/platforms/electrobun/src/libMacWindowEffects.dylib +0 -0
  6. package/platforms/electrobun/src/native/agent.ts +74 -3
  7. package/platforms/electrobun/src/native/desktop.ts +39 -6
  8. package/platforms/electrobun/src/native/mac-window-effects.ts +61 -1
  9. package/platforms/electrobun/src/native/permissions-shared.ts +3 -2
  10. package/platforms/electrobun/src/native/permissions.ts +11 -6
  11. package/platforms/electrobun/src/rpc-handlers.ts +7 -0
  12. package/platforms/electrobun/src/rpc-schema.ts +39 -4
  13. package/platforms/electrobun/src/runtime-permissions.ts +7 -1
  14. package/runtime/ensure-local-inference-handler.d.ts +1 -0
  15. package/runtime/ensure-local-inference-handler.d.ts.map +1 -1
  16. package/runtime/ensure-local-inference-handler.js +9 -0
  17. package/runtime/mode/remote-forwarder.d.ts.map +1 -1
  18. package/runtime/mode/remote-forwarder.js +1 -1
  19. package/runtime/mode/runtime-mode.d.ts +20 -2
  20. package/runtime/mode/runtime-mode.d.ts.map +1 -1
  21. package/runtime/mode/runtime-mode.js +69 -1
  22. package/scripts/aosp/stage-default-models.mjs +2 -2
  23. package/scripts/build-llama-cpp-dflash.mjs +75 -40
  24. package/scripts/kernel-patches/metal-kernels.mjs +357 -337
  25. package/scripts/lib/read-app-identity.mjs +5 -1
  26. package/services/local-inference/catalog.d.ts +2 -1
  27. package/services/local-inference/catalog.d.ts.map +1 -1
  28. package/services/local-inference/catalog.js +131 -12
  29. package/services/local-inference/downloader.d.ts +2 -0
  30. package/services/local-inference/downloader.d.ts.map +1 -1
  31. package/services/local-inference/downloader.js +300 -1
  32. package/services/local-inference/manifest/validator.d.ts.map +1 -1
  33. package/services/local-inference/manifest/validator.js +48 -0
  34. package/services/local-inference/providers.d.ts +1 -1
  35. package/services/local-inference/providers.js +6 -6
  36. package/services/local-inference/registry.d.ts.map +1 -1
  37. package/services/local-inference/registry.js +10 -1
  38. package/services/local-inference/types.d.ts +6 -0
  39. package/services/local-inference/types.d.ts.map +1 -1
  40. package/test/helpers/real-runtime.ts +21 -20
  41. package/platforms/electrobun/src/native/permissions-darwin.ts +0 -342
  42. package/platforms/electrobun/src/native/permissions-linux.ts +0 -34
  43. package/platforms/electrobun/src/native/permissions-win32.ts +0 -56
@@ -9,11 +9,10 @@
9
9
  // self-contained TUs (only #include <metal_stdlib>; their own structs,
10
10
  // constants, kernel symbols), so they compile as independent .air files.
11
11
  //
12
- // 2. Patches ggml/src/ggml-metal/CMakeLists.txt so the non-EMBED_LIBRARY
13
- // branch (the one used by darwin host metal builds) builds each standalone
14
- // shader into its own .air via `xcrun metal -c` and merges all
15
- // .air files (the original ggml-metal.air plus the five milady .air files)
16
- // into default.metallib via a single `xcrun metallib` invocation.
12
+ // 2. Patches ggml/src/ggml-metal/CMakeLists.txt so both Metal packaging
13
+ // branches build each standalone shader into its own .air via
14
+ // `xcrun metal -c` and merge all .air files (the original ggml-metal.air
15
+ // plus the five milady .air files) into one default.metallib.
17
16
  //
18
17
  // The original CMake snippet pipes `xcrun metal | xcrun metallib`. We
19
18
  // replace that with explicit per-source compilation + a final merge step,
@@ -35,11 +34,10 @@
35
34
  // not yet selected by the runtime — the symbol-presence audit (`nm`,
36
35
  // `strings default.metallib`) passes, the dispatch audit does not.
37
36
  //
38
- // * EMBED_LIBRARY path used by iOS targets. iOS builds compile a single
39
- // concatenated .metal via `.incbin`, which would require stripping the
40
- // duplicate decls (`block_qjl1_256`, `block_q4_polar`, `QK_QJL`,
41
- // `QK_POLAR`, `QJL_RESIDUAL_BYTES` already in ggml-common.h). That is a
42
- // separate patch and is documented as a deferred gap.
37
+ // * Convert the EMBED_LIBRARY path used by iOS targets to embed compiled
38
+ // metallib bytes rather than concatenated Metal source. This avoids
39
+ // duplicate declarations between ggml-metal.metal + standalones and lets
40
+ // iOS load the same multi-TU kernel set as desktop.
43
41
 
44
42
  import fs from "node:fs";
45
43
  import path from "node:path";
@@ -70,6 +68,9 @@ export const METAL_KERNEL_FILES = [
70
68
  ];
71
69
 
72
70
  const SENTINEL = "# MILADY-KERNEL-PATCH-V1";
71
+ const SENTINEL_EMBED = "# MILADY-KERNEL-EMBED-PATCH-V1";
72
+ const SENTINEL_EMBED_LOADER = "// MILADY-EMBEDDED-METALLIB-LOADER-V1";
73
+ const SENTINEL_QJL_ATTN = "// MILADY-QJL-ATTN-DISPATCH-V1";
73
74
 
74
75
  function inForkRelpath(name) {
75
76
  return path.posix.join("ggml", "src", "ggml-metal", "milady-shipped", name);
@@ -137,13 +138,10 @@ function copyStandalonesIntoFork(cacheDir, { dryRun }) {
137
138
  return copied;
138
139
  }
139
140
 
140
- // Patch ggml/src/ggml-metal/CMakeLists.txt: replace the single
141
- // `xcrun metal -c X | xcrun metallib - -o Y`
142
- // pipe with a multi-source compile + merge that includes our shipped kernels.
143
- //
144
- // We anchor on the `add_custom_command(OUTPUT ${...}/default.metallib` line
145
- // in the non-EMBED_LIBRARY branch; that is the only metallib build the
146
- // darwin host metal target uses. Idempotent via SENTINEL.
141
+ // Patch ggml/src/ggml-metal/CMakeLists.txt so desktop and iOS both compile
142
+ // ggml-metal.metal + every standalone into separate .air files and merge them
143
+ // into one default.metallib. iOS then embeds that binary metallib into the
144
+ // static archive instead of embedding concatenated source.
147
145
  function patchMetalCMakeLists(cacheDir, { dryRun }) {
148
146
  const cmakePath = path.join(
149
147
  cacheDir,
@@ -158,14 +156,95 @@ function patchMetalCMakeLists(cacheDir, { dryRun }) {
158
156
  );
159
157
  }
160
158
  const original = fs.readFileSync(cmakePath, "utf8");
161
- if (original.includes(SENTINEL)) {
162
- return { changed: false, path: cmakePath };
159
+ let patched = original;
160
+ let changed = false;
161
+
162
+ const miladyAirLinesForSdk = (sdkExpr) =>
163
+ METAL_KERNEL_FILES.map((name) => {
164
+ const stem = name.replace(/\.metal$/, "");
165
+ return ` COMMAND xcrun -sdk ${sdkExpr} metal \${XC_FLAGS} -c \${CMAKE_CURRENT_SOURCE_DIR}/milady-shipped/${name} -o \${CMAKE_CURRENT_BINARY_DIR}/${stem}.air`;
166
+ }).join("\n");
167
+ const miladyAirInputs = METAL_KERNEL_FILES.map((name) => {
168
+ const stem = name.replace(/\.metal$/, "");
169
+ return `\${CMAKE_CURRENT_BINARY_DIR}/${stem}.air`;
170
+ }).join(" ");
171
+ const miladyDepends = METAL_KERNEL_FILES.map(
172
+ (name) => `\${CMAKE_CURRENT_SOURCE_DIR}/milady-shipped/${name}`,
173
+ ).join(" ");
174
+
175
+ if (!patched.includes(SENTINEL_EMBED)) {
176
+ const embedStart = patched.indexOf(
177
+ " # merge ggml-common.h and ggml-metal.metal into a single file",
178
+ );
179
+ const embedEnd =
180
+ embedStart === -1
181
+ ? -1
182
+ : patched.indexOf(
183
+ "\n\n target_sources(ggml-metal PRIVATE \"${METALLIB_EMBED_ASM}\")",
184
+ embedStart,
185
+ );
186
+ if (embedStart === -1 || embedEnd === -1) {
187
+ throw new Error(
188
+ `[metal-kernels] embedded Metal CMake anchor not found at ${cmakePath}; ` +
189
+ `the fork's GGML_METAL_EMBED_LIBRARY branch changed shape and the patch must be revisited.`,
190
+ );
191
+ }
192
+ const embedAirLines = miladyAirLinesForSdk("${METAL_SDK}");
193
+ const embedReplacement = ` # ${SENTINEL_EMBED}
194
+ # Build a compiled default.metallib for embedded-library targets (iOS).
195
+ # The upstream path embedded concatenated Metal source and JIT-compiled it
196
+ # at runtime. That cannot include the milady standalones because the source
197
+ # TUs intentionally redeclare block_* structs/constants that already exist
198
+ # in ggml-common.h. Compile each TU separately, merge into one metallib,
199
+ # and embed the binary metallib bytes instead.
200
+ set(METALLIB_EMBED_ASM "\${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
201
+ set(METALLIB_SOURCE_EMBED "\${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
202
+ set(METALLIB_SOURCE_EMBED_TMP "\${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
203
+ set(METALLIB_EMBED_BINARY "\${CMAKE_CURRENT_BINARY_DIR}/autogenerated/default.metallib")
204
+ set(METALLIB_EMBED_AIR "\${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.air")
205
+ set(METAL_SDK "\${CMAKE_OSX_SYSROOT}")
206
+ if (NOT METAL_SDK)
207
+ set(METAL_SDK macosx)
208
+ endif()
209
+ if (GGML_METAL_SHADER_DEBUG)
210
+ set(XC_FLAGS -fno-fast-math -fno-inline)
211
+ else()
212
+ set(XC_FLAGS -O3)
213
+ endif()
214
+ if (GGML_METAL_STD)
215
+ list(APPEND XC_FLAGS -std=\${GGML_METAL_STD})
216
+ endif()
217
+
218
+ add_custom_command(
219
+ OUTPUT "\${METALLIB_EMBED_ASM}"
220
+ COMMAND echo "Embedding Metal library (compiled metallib + milady-shipped kernels)"
221
+ COMMAND sed -e "/__embed_ggml-common.h__/r \${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "\${METALLIB_SOURCE}" > "\${METALLIB_SOURCE_EMBED_TMP}"
222
+ COMMAND sed -e "/\\#include \\"ggml-metal-impl.h\\"/r \${METALLIB_IMPL}" -e "/\\#include \\"ggml-metal-impl.h\\"/d" < "\${METALLIB_SOURCE_EMBED_TMP}" > "\${METALLIB_SOURCE_EMBED}"
223
+ COMMAND xcrun -sdk \${METAL_SDK} metal \${XC_FLAGS} -DGGML_METAL_EMBED_LIBRARY=1 -c "\${METALLIB_SOURCE_EMBED}" -o "\${METALLIB_EMBED_AIR}"
224
+ ${embedAirLines}
225
+ COMMAND xcrun -sdk \${METAL_SDK} metallib "\${METALLIB_EMBED_AIR}" ${miladyAirInputs} -o "\${METALLIB_EMBED_BINARY}"
226
+ COMMAND echo ".section __DATA,__ggml_metallib" > "\${METALLIB_EMBED_ASM}"
227
+ COMMAND echo ".globl _ggml_metallib_start" >> "\${METALLIB_EMBED_ASM}"
228
+ COMMAND echo "_ggml_metallib_start:" >> "\${METALLIB_EMBED_ASM}"
229
+ COMMAND echo .incbin "\\"\${METALLIB_EMBED_BINARY}\\"" >> "\${METALLIB_EMBED_ASM}"
230
+ COMMAND echo ".globl _ggml_metallib_end" >> "\${METALLIB_EMBED_ASM}"
231
+ COMMAND echo "_ggml_metallib_end:" >> "\${METALLIB_EMBED_ASM}"
232
+ DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h ${miladyDepends}
233
+ COMMENT "Generate assembly for embedded compiled Metal library"
234
+ VERBATIM
235
+ )`;
236
+ patched =
237
+ patched.slice(0, embedStart) +
238
+ embedReplacement +
239
+ patched.slice(embedEnd);
240
+ changed = true;
163
241
  }
164
242
 
165
243
  // The exact block we replace. This pipe pattern has been stable in the
166
244
  // milady-ai/llama.cpp fork for the entire v0.4.x line; if the upstream
167
245
  // ever rewrites it we want to fail loudly rather than silently no-op.
168
- const anchor = ` add_custom_command(
246
+ if (!patched.includes(SENTINEL)) {
247
+ const anchor = ` add_custom_command(
169
248
  OUTPUT \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
170
249
  COMMAND xcrun -sdk macosx metal \${XC_FLAGS} -c \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o - |
171
250
  xcrun -sdk macosx metallib - -o \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
@@ -174,29 +253,16 @@ function patchMetalCMakeLists(cacheDir, { dryRun }) {
174
253
  DEPENDS ggml-metal.metal \${METALLIB_COMMON}
175
254
  COMMENT "Compiling Metal kernels"
176
255
  )`;
177
- if (!original.includes(anchor)) {
178
- throw new Error(
179
- `[metal-kernels] CMakeLists.txt anchor not found at ${cmakePath}; ` +
180
- `the fork's metallib build snippet has changed shape and the patch ` +
181
- `must be revisited. Inspect the file's add_custom_command for default.metallib.`,
182
- );
183
- }
184
-
185
- // Replacement: compile ggml-metal.metal AND each shipped standalone into
186
- // its own .air file, then merge them all into default.metallib.
187
- const milady_air_lines = METAL_KERNEL_FILES.map((name) => {
188
- const stem = name.replace(/\.metal$/, "");
189
- return ` COMMAND xcrun -sdk macosx metal \${XC_FLAGS} -c \${CMAKE_CURRENT_SOURCE_DIR}/milady-shipped/${name} -o \${CMAKE_CURRENT_BINARY_DIR}/${stem}.air`;
190
- }).join("\n");
191
- const milady_air_inputs = METAL_KERNEL_FILES.map((name) => {
192
- const stem = name.replace(/\.metal$/, "");
193
- return `\${CMAKE_CURRENT_BINARY_DIR}/${stem}.air`;
194
- }).join(" ");
195
- const milady_depends = METAL_KERNEL_FILES.map(
196
- (name) => `\${CMAKE_CURRENT_SOURCE_DIR}/milady-shipped/${name}`,
197
- ).join(" ");
256
+ if (!patched.includes(anchor)) {
257
+ throw new Error(
258
+ `[metal-kernels] CMakeLists.txt anchor not found at ${cmakePath}; ` +
259
+ `the fork's metallib build snippet has changed shape and the patch ` +
260
+ `must be revisited. Inspect the file's add_custom_command for default.metallib.`,
261
+ );
262
+ }
198
263
 
199
- const replacement = ` # ${SENTINEL}
264
+ const miladyAirLines = miladyAirLinesForSdk("macosx");
265
+ const replacement = ` # ${SENTINEL}
200
266
  # Build ggml-metal.metal AND each milady standalone shader into its own
201
267
  # .air file, then merge all .air files into a single default.metallib.
202
268
  # The standalones are self-contained TUs (only #include <metal_stdlib>;
@@ -205,23 +271,23 @@ function patchMetalCMakeLists(cacheDir, { dryRun }) {
205
271
  add_custom_command(
206
272
  OUTPUT \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
207
273
  COMMAND xcrun -sdk macosx metal \${XC_FLAGS} -c \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o \${CMAKE_CURRENT_BINARY_DIR}/ggml-metal.air
208
- ${milady_air_lines}
209
- COMMAND xcrun -sdk macosx metallib \${CMAKE_CURRENT_BINARY_DIR}/ggml-metal.air ${milady_air_inputs} -o \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
274
+ ${miladyAirLines}
275
+ COMMAND xcrun -sdk macosx metallib \${CMAKE_CURRENT_BINARY_DIR}/ggml-metal.air ${miladyAirInputs} -o \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
210
276
  COMMAND rm -f \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
211
277
  COMMAND rm -f \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
212
- DEPENDS ggml-metal.metal \${METALLIB_COMMON} ${milady_depends}
278
+ DEPENDS ggml-metal.metal \${METALLIB_COMMON} ${miladyDepends}
213
279
  COMMENT "Compiling Metal kernels (ggml-metal + milady-shipped: ${METAL_KERNEL_FILES.join(", ")})"
214
280
  )`;
281
+ patched = patched.replace(anchor, replacement);
282
+ changed = true;
283
+ }
215
284
 
216
- const patched = original.replace(anchor, replacement);
217
285
  if (patched === original) {
218
- throw new Error(
219
- `[metal-kernels] anchor matched but replacement did not change ${cmakePath}; this is a bug`,
220
- );
286
+ return { changed: false, path: cmakePath };
221
287
  }
222
288
  if (dryRun) {
223
289
  console.log(
224
- `[metal-kernels] (dry-run) would patch ${cmakePath} (anchor matched, replacement size ${replacement.length} chars, includes ${METAL_KERNEL_FILES.length} shipped kernels)`,
290
+ `[metal-kernels] (dry-run) would patch ${cmakePath} (changed=${changed}, includes ${METAL_KERNEL_FILES.length} shipped kernels)`,
225
291
  );
226
292
  return { changed: false, path: cmakePath };
227
293
  }
@@ -229,86 +295,120 @@ ${milady_air_lines}
229
295
  return { changed: true, path: cmakePath };
230
296
  }
231
297
 
232
- const SENTINEL_DISPATCH = "// MILADY-DISPATCH-V1";
298
+ function patchEmbeddedMetallibLoader(cacheDir, { dryRun }) {
299
+ const deviceMPath = path.join(
300
+ cacheDir,
301
+ "ggml",
302
+ "src",
303
+ "ggml-metal",
304
+ "ggml-metal-device.m",
305
+ );
306
+ if (!fs.existsSync(deviceMPath)) {
307
+ throw new Error(
308
+ `[metal-kernels] expected ${deviceMPath} to exist on the fork; cannot wire embedded metallib loader`,
309
+ );
310
+ }
311
+ const original = fs.readFileSync(deviceMPath, "utf8");
312
+ if (original.includes(SENTINEL_EMBED_LOADER)) {
313
+ return { changed: false, path: deviceMPath };
314
+ }
315
+ const anchor = `#if GGML_METAL_EMBED_LIBRARY
316
+ GGML_LOG_INFO("%s: using embedded metal library\\n", __func__);
317
+
318
+ extern const char ggml_metallib_start[];
319
+ extern const char ggml_metallib_end[];
320
+
321
+ src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
322
+ #else`;
323
+ if (!original.includes(anchor)) {
324
+ throw new Error(
325
+ `[metal-kernels] embedded Metal loader anchor not found at ${deviceMPath}; ` +
326
+ `the fork's GGML_METAL_EMBED_LIBRARY loader changed shape and the patch must be revisited.`,
327
+ );
328
+ }
329
+ const replacement = `#if GGML_METAL_EMBED_LIBRARY
330
+ GGML_LOG_INFO("%s: using embedded compiled metal library\\n", __func__);
331
+
332
+ extern const char ggml_metallib_start[];
333
+ extern const char ggml_metallib_end[];
334
+
335
+ // ${SENTINEL_EMBED_LOADER}
336
+ // The build patch embeds compiled default.metallib bytes here, not
337
+ // Metal source. Loading with newLibraryWithData keeps iOS on the same
338
+ // multi-TU kernel set as desktop and avoids duplicate declarations
339
+ // between ggml-metal.metal and the milady standalone shaders.
340
+ const NSUInteger metallib_len = (NSUInteger)(ggml_metallib_end - ggml_metallib_start);
341
+ dispatch_data_t metallib_data = dispatch_data_create(ggml_metallib_start, metallib_len, nil, DISPATCH_DATA_DESTRUCTOR_DEFAULT);
342
+ library = [device newLibraryWithData:metallib_data error:&error];
343
+ if (error) {
344
+ GGML_LOG_ERROR("%s: error: %s\\n", __func__, [[error description] UTF8String]);
345
+ return nil;
346
+ }
347
+ #else`;
348
+ const patched = original.replace(anchor, replacement);
349
+ if (patched === original) {
350
+ throw new Error("[metal-kernels] embedded loader replace produced no change");
351
+ }
352
+ if (!dryRun) fs.writeFileSync(deviceMPath, patched, "utf8");
353
+ return { changed: !dryRun, path: deviceMPath };
354
+ }
233
355
 
234
- // All milady ggml_type values (must match ggml/include/ggml.h).
235
- // Used by both the type-detection helper and the milady pipeline lookup.
236
- // Note: TBQ3_0=43, TBQ4_0=44, QJL1_256=46, Q4_POLAR=47, TBQ3_TCQ=48.
237
- const MILADY_QUANT_TYPES = ["TBQ3_0", "TBQ4_0", "QJL1_256", "Q4_POLAR", "TBQ3_TCQ"];
356
+ const SENTINEL_DISPATCH = "// MILADY-DISPATCH-V1";
238
357
 
239
- function patchMetalDispatchHeader(cacheDir, { dryRun }) {
358
+ function patchMetalQjlAttnHeader(cacheDir, { dryRun }) {
240
359
  const headerPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-device.h");
241
360
  const original = fs.readFileSync(headerPath, "utf8");
242
- if (original.includes(SENTINEL_DISPATCH)) {
361
+ if (original.includes(SENTINEL_QJL_ATTN)) {
243
362
  return { changed: false, path: headerPath };
244
363
  }
245
- const anchor = `struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv (ggml_metal_library_t lib, const struct ggml_tensor * op);`;
364
+ const anchor = `struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext(
365
+ ggml_metal_library_t lib,
366
+ const struct ggml_tensor * op,
367
+ bool has_mask,
368
+ bool has_sinks,
369
+ bool has_bias,
370
+ bool has_scap,
371
+ bool has_kvpad,
372
+ int32_t nsg);`;
246
373
  if (!original.includes(anchor)) {
247
374
  throw new Error(
248
- `[metal-dispatch] header anchor not found at ${headerPath}; the fork's get_pipeline_mul_mv decl has moved. Inspect ggml-metal-device.h.`,
375
+ `[metal-qjl-attn] device.h anchor not found at ${headerPath}; inspect flash-attn pipeline declarations.`,
249
376
  );
250
377
  }
251
378
  const insert = `${anchor}
252
- ${SENTINEL_DISPATCH}
253
- struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_milady_mul_mv (ggml_metal_library_t lib, enum ggml_type tsrc0);
254
- struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_milady_get_rows(ggml_metal_library_t lib, enum ggml_type tsrc0);`;
379
+
380
+ ${SENTINEL_QJL_ATTN}
381
+ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_attn_score_qjl(
382
+ ggml_metal_library_t lib);`;
255
383
  const patched = original.replace(anchor, insert);
256
- if (patched === original) {
257
- throw new Error(`[metal-dispatch] header replace produced no change`);
258
- }
259
384
  if (!dryRun) fs.writeFileSync(headerPath, patched, "utf8");
260
385
  return { changed: !dryRun, path: headerPath };
261
386
  }
262
387
 
263
- function patchMetalDispatchDeviceCpp(cacheDir, { dryRun }) {
388
+ function patchMetalQjlAttnDeviceCpp(cacheDir, { dryRun }) {
264
389
  const cppPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-device.cpp");
265
390
  const original = fs.readFileSync(cppPath, "utf8");
266
- if (original.includes(SENTINEL_DISPATCH)) {
391
+ if (original.includes(SENTINEL_QJL_ATTN)) {
267
392
  return { changed: false, path: cppPath };
268
393
  }
269
-
270
- // (1) Insert milady pipeline lookup helpers right after
271
- // ggml_metal_library_get_pipeline_get_rows. They build the explicit
272
- // standalone symbol names (kernel_mul_mv_qjl1_256_f32 etc.) and bypass
273
- // ggml_metal_library_compile_pipeline (which would re-enter the
274
- // metallib compiler and fail because the standalones don't declare
275
- // the `nsg` function constant). Pure name lookup against the already-
276
- // loaded library — fails fast with GGML_ABORT if the symbol is not
277
- // present in default.metallib (which would mean the kernel-shipment
278
- // patch above silently regressed).
279
- const helpersAnchor = `ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows(ggml_metal_library_t lib, ggml_type tidx, ggml_type tdst) {`;
280
- if (!original.includes(helpersAnchor)) {
281
- throw new Error(`[metal-dispatch] device.cpp helpers anchor not found at ${cppPath}`);
394
+ const anchor = `ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(ggml_metal_library_t lib, const ggml_tensor * op, int32_t n_fuse) {`;
395
+ if (!original.includes(anchor)) {
396
+ throw new Error(
397
+ `[metal-qjl-attn] device.cpp anchor not found at ${cppPath}; inspect pipeline helper layout.`,
398
+ );
282
399
  }
283
- const helpers = `${SENTINEL_DISPATCH}
284
- // Milady-quant pipeline lookups. These kernels were built by the kernel
285
- // shipment patch into default.metallib but use CUSTOM arg structs
286
- // (qjl_score_args / qjl_mv_args / qjl_dequant_args / polar_mv_args /
287
- // polar_dequant_args) that do NOT match ggml_metal_kargs_mul_mv. The
288
- // standard get_pipeline_mul_mv helper sets a 'nsg' function constant
289
- // the standalones do not declare; calling it crashes the metallib
290
- // compiler. We keep this lookup constant-free.
291
- ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_milady_mul_mv(ggml_metal_library_t lib, ggml_type tsrc0) {
292
- char name[256];
293
- switch (tsrc0) {
294
- case GGML_TYPE_QJL1_256: snprintf(name, 256, "kernel_mul_mv_qjl1_256_f32"); break;
295
- case GGML_TYPE_Q4_POLAR: snprintf(name, 256, "kernel_mul_mv_q4_polar_f32"); break;
296
- default:
297
- GGML_LOG_ERROR("milady_mul_mv: type %s (%d) has no mul_mv standalone (only attention-score)\\n",
298
- ggml_type_name(tsrc0), (int) tsrc0);
299
- GGML_ABORT("milady_mul_mv: unsupported milady-quant type for MUL_MAT");
300
- }
400
+ const helper = `${SENTINEL_QJL_ATTN}
401
+ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_attn_score_qjl(ggml_metal_library_t lib) {
402
+ const char * name = "kernel_attn_score_qjl1_256";
301
403
  ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
302
404
  if (!res.pipeline) {
303
- // Cache miss compile the pipeline by direct symbol name without
304
- // any function constants. The standard get_pipeline_mul_mv helper
305
- // would set an 'nsg' function constant which the standalones do not
306
- // declare; we explicitly pass nullptr to bypass that.
405
+ // Standalone shipped shader: it declares no Metal function constants,
406
+ // so compile by direct symbol name with a null constants table.
307
407
  res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
308
408
  }
309
409
  if (!res.pipeline) {
310
- GGML_LOG_ERROR("milady_mul_mv: kernel '%s' could not be compiled from default.metallib\\n", name);
311
- GGML_ABORT("milady_mul_mv: kernel pipeline compile failed");
410
+ GGML_LOG_ERROR("attn_score_qjl: kernel '%s' missing from default.metallib\\n", name);
411
+ GGML_ABORT("attn_score_qjl: pipeline compile failed");
312
412
  }
313
413
  res.nr0 = 1;
314
414
  res.nr1 = 1;
@@ -317,269 +417,181 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_milady_mul_mv(gg
317
417
  return res;
318
418
  }
319
419
 
320
- ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_milady_get_rows(ggml_metal_library_t lib, ggml_type tsrc0) {
321
- char name[256];
322
- switch (tsrc0) {
323
- case GGML_TYPE_QJL1_256: snprintf(name, 256, "kernel_get_rows_qjl1_256"); break;
324
- case GGML_TYPE_Q4_POLAR: snprintf(name, 256, "kernel_get_rows_q4_polar"); break;
325
- default:
326
- GGML_LOG_ERROR("milady_get_rows: type %s (%d) has no get_rows standalone\\n",
327
- ggml_type_name(tsrc0), (int) tsrc0);
328
- GGML_ABORT("milady_get_rows: unsupported milady-quant type for GET_ROWS");
329
- }
330
- ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
331
- if (!res.pipeline) {
332
- res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
333
- }
334
- if (!res.pipeline) {
335
- GGML_LOG_ERROR("milady_get_rows: kernel '%s' could not be compiled from default.metallib\\n", name);
336
- GGML_ABORT("milady_get_rows: kernel pipeline compile failed");
337
- }
338
- res.nr0 = 1; res.nr1 = 1; res.nsg = 1; res.smem = 0;
339
- return res;
420
+ `;
421
+ const patched = original.replace(anchor, helper + anchor);
422
+ if (!dryRun) fs.writeFileSync(cppPath, patched, "utf8");
423
+ return { changed: !dryRun, path: cppPath };
340
424
  }
341
425
 
342
- `;
343
- let patched = original.replace(helpersAnchor, helpers + helpersAnchor);
344
-
345
- // (2) Add a guard at the top of ggml_metal_library_get_pipeline_mul_mv()
346
- // so any caller that didn't go through the milady early-out gets a
347
- // clean structured abort instead of crashing in the metallib compiler
348
- // when the `nsg` function constant has no matching declaration.
349
- const mvSwitchAnchor = ` // use custom matrix x vector kernel
350
- switch (tsrc0) {`;
351
- if (!patched.includes(mvSwitchAnchor)) {
352
- throw new Error(`[metal-dispatch] device.cpp mul_mv switch anchor not found`);
353
- }
354
- const mvGuard = ` // ${SENTINEL_DISPATCH}
355
- // Defence-in-depth: milady-quant types should be diverted by the op-side
356
- // early-out in ggml_metal_op_mul_mat. If we got here, the dispatch
357
- // routing has regressed.
358
- if (tsrc0 == GGML_TYPE_QJL1_256 || tsrc0 == GGML_TYPE_Q4_POLAR ||
359
- tsrc0 == GGML_TYPE_TBQ3_0 || tsrc0 == GGML_TYPE_TBQ4_0 ||
360
- tsrc0 == GGML_TYPE_TBQ3_TCQ) {
361
- GGML_LOG_ERROR("get_pipeline_mul_mv: type %s reached standard helper (op-side dispatch regression)\\n",
362
- ggml_type_name(tsrc0));
363
- GGML_ABORT("get_pipeline_mul_mv: milady-quant type leaked into standard pipeline path");
364
- }
365
- // use custom matrix x vector kernel
366
- switch (tsrc0) {`;
367
- patched = patched.replace(mvSwitchAnchor, mvGuard);
368
-
369
- // (3) Same defence at the top of ggml_metal_library_get_pipeline_get_rows.
370
- // This helper auto-builds `kernel_get_rows_<typename>`, which for the
371
- // milady types yields the right symbol — but it would still try to
372
- // compile a fresh pipeline if the lookup misses, and the compile path
373
- // hits ggml-common.h struct redefinition. Hard-fail instead.
374
- const grAnchor = `ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows(ggml_metal_library_t lib, ggml_type tsrc) {
375
- char base[256];`;
376
- if (!patched.includes(grAnchor)) {
377
- throw new Error(`[metal-dispatch] device.cpp get_rows anchor not found`);
426
+ function patchMetalQjlAttnOpsHeader(cacheDir, { dryRun }) {
427
+ const headerPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-ops.h");
428
+ const original = fs.readFileSync(headerPath, "utf8");
429
+ if (original.includes(SENTINEL_QJL_ATTN)) {
430
+ return { changed: false, path: headerPath };
378
431
  }
379
- const grReplace = `ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows(ggml_metal_library_t lib, ggml_type tsrc) {
380
- // ${SENTINEL_DISPATCH}
381
- if (tsrc == GGML_TYPE_QJL1_256 || tsrc == GGML_TYPE_Q4_POLAR ||
382
- tsrc == GGML_TYPE_TBQ3_0 || tsrc == GGML_TYPE_TBQ4_0 ||
383
- tsrc == GGML_TYPE_TBQ3_TCQ) {
384
- GGML_LOG_ERROR("get_pipeline_get_rows: type %s reached standard helper (op-side dispatch regression)\\n",
385
- ggml_type_name(tsrc));
386
- GGML_ABORT("get_pipeline_get_rows: milady-quant type leaked into standard pipeline path");
387
- }
388
- char base[256];`;
389
- patched = patched.replace(grAnchor, grReplace);
390
-
391
- if (patched === original) {
392
- throw new Error(`[metal-dispatch] device.cpp replace produced no change`);
432
+ const anchor = `int ggml_metal_op_flash_attn_ext (ggml_metal_op_t ctx, int idx);`;
433
+ if (!original.includes(anchor)) {
434
+ throw new Error(
435
+ `[metal-qjl-attn] ops.h anchor not found at ${headerPath}; inspect op declarations.`,
436
+ );
393
437
  }
394
- if (!dryRun) fs.writeFileSync(cppPath, patched, "utf8");
395
- return { changed: !dryRun, path: cppPath };
438
+ const insert = `${anchor}
439
+ ${SENTINEL_QJL_ATTN}
440
+ int ggml_metal_op_attn_score_qjl (ggml_metal_op_t ctx, int idx);`;
441
+ const patched = original.replace(anchor, insert);
442
+ if (!dryRun) fs.writeFileSync(headerPath, patched, "utf8");
443
+ return { changed: !dryRun, path: headerPath };
396
444
  }
397
445
 
398
- function patchMetalDispatchOpsCpp(cacheDir, { dryRun }) {
446
+ function patchMetalQjlAttnOpsCpp(cacheDir, { dryRun }) {
399
447
  const opsPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-ops.cpp");
400
448
  const original = fs.readFileSync(opsPath, "utf8");
401
- if (original.includes(SENTINEL_DISPATCH)) {
449
+ if (original.includes(SENTINEL_QJL_ATTN)) {
402
450
  return { changed: false, path: opsPath };
403
451
  }
404
452
 
405
- // (1) Insert helper functions just before ggml_metal_op_mul_mat. These
406
- // own the milady-quant dispatch shape: they pull op tensor metadata,
407
- // build the standalone arg struct, set buffers, and dispatch with the
408
- // correct threadgroup shape (32 threads per row, one threadgroup per
409
- // row of src0). The functions hard-abort on unsupported types so a
410
- // mis-routed call surfaces immediately.
411
- // Anchor on int ggml_metal_op_get_rows since it appears earlier in the file
412
- // than ggml_metal_op_mul_mat — both functions reference the milady helpers
413
- // through the early-out so the helpers must be visible to BOTH.
414
- const muMatAnchor = `int ggml_metal_op_get_rows(ggml_metal_op_t ctx, int idx) {`;
415
- if (!original.includes(muMatAnchor)) {
416
- throw new Error(`[metal-dispatch] ops.cpp get_rows anchor not found at ${opsPath}`);
453
+ const funcAnchor = `static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {`;
454
+ if (!original.includes(funcAnchor)) {
455
+ throw new Error(
456
+ `[metal-qjl-attn] ops.cpp function anchor not found at ${opsPath}; inspect encode layout.`,
457
+ );
417
458
  }
418
- const helpers = `${SENTINEL_DISPATCH}
419
- // Milady-quant arg structs. Layout-matched bit-for-bit to the standalone
420
- // declarations in milady-shipped/{qjl,polar}.metal — keep these in sync.
421
- struct milady_qjl_mv_args { uint32_t n_rows; uint32_t proj_dim; };
422
- struct milady_qjl_dequant_args { uint32_t head_dim; uint32_t proj_dim; };
423
- struct milady_polar_mv_args { uint32_t n_rows; uint32_t head_dim; uint32_t use_qjl; };
424
- struct milady_polar_dequant_args { uint32_t head_dim; uint32_t use_qjl; };
425
-
426
- static inline bool milady_is_quant_mul_mv_supported(ggml_type t) {
427
- return t == GGML_TYPE_QJL1_256 || t == GGML_TYPE_Q4_POLAR;
428
- }
429
- static inline bool milady_is_quant_get_rows_supported(ggml_type t) {
430
- return t == GGML_TYPE_QJL1_256 || t == GGML_TYPE_Q4_POLAR;
431
- }
432
- // TBQ3_0 / TBQ4_0 / TBQ3_TCQ — standalones expose only attention-score
433
- // kernels (kernel_turbo3_dot etc.). MUL_MAT against these types in a
434
- // generic graph is not yet supported; we surface a clear abort instead
435
- // of silently routing through a path that crashes in the metallib
436
- // compiler. See AGENTS.md "TBQ* attention bridge" follow-up.
437
- static inline bool milady_is_quant_tbq_attn_only(ggml_type t) {
438
- return t == GGML_TYPE_TBQ3_0 || t == GGML_TYPE_TBQ4_0 || t == GGML_TYPE_TBQ3_TCQ;
459
+ const opFunc = `${SENTINEL_QJL_ATTN}
460
+ struct milady_qjl_score_args {
461
+ uint32_t n_heads;
462
+ uint32_t n_kv_heads;
463
+ uint32_t n_tokens;
464
+ uint32_t proj_dim;
465
+ };
466
+
467
+ static inline ggml_metal_buffer_id milady_metal_buffer_offset(ggml_metal_buffer_id id, size_t extra) {
468
+ id.offs += extra;
469
+ return id;
439
470
  }
440
471
 
441
- static int ggml_metal_op_mul_mv_milady_quant(ggml_metal_op_t ctx, int idx) {
472
+ int ggml_metal_op_attn_score_qjl(ggml_metal_op_t ctx, int idx) {
442
473
  ggml_tensor * op = ctx->node(idx);
443
- ggml_metal_library_t lib = ctx->lib;
444
- ggml_metal_encoder_t enc = ctx->enc;
445
474
 
446
- GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
447
- GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
448
-
449
- const ggml_type tsrc0 = op->src[0]->type;
450
-
451
- if (milady_is_quant_tbq_attn_only(tsrc0)) {
452
- GGML_LOG_ERROR("milady_quant mul_mv: type %s exposes only attention-score kernels in the standalones; MUL_MAT requires an ATTN_SCORE op (Wave-7 work)\\n",
453
- ggml_type_name(tsrc0));
454
- GGML_ABORT("milady_quant: tbq* MUL_MAT not yet wired");
455
- }
456
- if (!milady_is_quant_mul_mv_supported(tsrc0)) {
457
- GGML_LOG_ERROR("milady_quant mul_mv: type %s not a milady-quant type\\n", ggml_type_name(tsrc0));
458
- GGML_ABORT("milady_quant mul_mv: unsupported type");
459
- }
460
- GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32 && "milady_quant mul_mv expects fp32 activation");
461
-
462
- auto pipeline = ggml_metal_library_get_pipeline_milady_mul_mv(lib, tsrc0);
463
-
464
- const int32_t n_rows = ne01;
465
-
466
- ggml_metal_encoder_set_pipeline(enc, pipeline);
467
- ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 0);
468
- ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
469
- ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
470
-
471
- if (tsrc0 == GGML_TYPE_QJL1_256) {
472
- milady_qjl_mv_args args = {
473
- /* n_rows = */ (uint32_t) n_rows,
474
- /* proj_dim = */ 256u,
475
- };
476
- ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 3);
477
- } else { // GGML_TYPE_Q4_POLAR
478
- milady_polar_mv_args args = {
479
- /* n_rows = */ (uint32_t) n_rows,
480
- /* head_dim = */ 128u,
481
- /* use_qjl = */ 0u,
482
- };
483
- ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 3);
484
- }
485
-
486
- // 32 threads per row, one threadgroup per row. Matches the standalone
487
- // dispatch shape verified by metal_verify (8/8 PASS).
488
- ggml_metal_encoder_dispatch_threadgroups(enc, n_rows, 1, 1, 32, 1, 1);
489
- return 1;
490
- }
491
-
492
- static int ggml_metal_op_get_rows_milady_quant(ggml_metal_op_t ctx, int idx) {
493
- ggml_tensor * op = ctx->node(idx);
494
475
  ggml_metal_library_t lib = ctx->lib;
495
476
  ggml_metal_encoder_t enc = ctx->enc;
496
477
 
497
- const ggml_type tsrc0 = op->src[0]->type;
498
- if (!milady_is_quant_get_rows_supported(tsrc0)) {
499
- GGML_LOG_ERROR("milady_quant get_rows: type %s not supported (tbq* lacks get_rows kernel)\\n",
500
- ggml_type_name(tsrc0));
501
- GGML_ABORT("milady_quant get_rows: unsupported type");
502
- }
503
-
504
- auto pipeline = ggml_metal_library_get_pipeline_milady_get_rows(lib, tsrc0);
478
+ const ggml_tensor * q = op->src[0];
479
+ const ggml_tensor * pk = op->src[1];
480
+
481
+ GGML_ASSERT(q != nullptr);
482
+ GGML_ASSERT(pk != nullptr);
483
+ GGML_ASSERT(q->type == GGML_TYPE_F32);
484
+ GGML_ASSERT(pk->type == GGML_TYPE_QJL1_256);
485
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
486
+ GGML_ASSERT(q->ne[0] == 256);
487
+ GGML_ASSERT(pk->ne[0] == 128);
488
+
489
+ const uint32_t n_heads = (uint32_t) q->ne[1];
490
+ const uint32_t n_kv_heads = (uint32_t) ((const int32_t *) op->op_params)[0];
491
+ const uint32_t n_tokens = (uint32_t) pk->ne[1];
492
+ const int64_t n_batch = q->ne[2];
493
+ const int64_t ne3 = q->ne[3];
494
+
495
+ GGML_ASSERT(n_kv_heads > 0);
496
+ GGML_ASSERT((n_heads % n_kv_heads) == 0);
497
+ GGML_ASSERT(pk->ne[2] == (int64_t) n_kv_heads);
498
+ GGML_ASSERT(pk->ne[3] == ne3);
499
+ GGML_ASSERT(op->ne[0] == (int64_t) n_tokens);
500
+ GGML_ASSERT(op->ne[1] == (int64_t) n_heads);
501
+ GGML_ASSERT(op->ne[2] == n_batch);
502
+ GGML_ASSERT(op->ne[3] == ne3);
503
+ GGML_ASSERT(pk->nb[1] == ggml_row_size(GGML_TYPE_QJL1_256, 128));
504
+ GGML_ASSERT(pk->nb[2] == (size_t) n_tokens * pk->nb[1]);
505
+
506
+ milady_qjl_score_args args = {
507
+ /* n_heads = */ n_heads,
508
+ /* n_kv_heads = */ n_kv_heads,
509
+ /* n_tokens = */ n_tokens,
510
+ /* proj_dim = */ 256u,
511
+ };
512
+
513
+ auto pipeline = ggml_metal_library_get_pipeline_attn_score_qjl(lib);
514
+
515
+ const ggml_metal_buffer_id q_base = ggml_metal_get_buffer_id(q);
516
+ const ggml_metal_buffer_id pk_base = ggml_metal_get_buffer_id(pk);
517
+ const ggml_metal_buffer_id dst_base = ggml_metal_get_buffer_id(op);
505
518
 
506
519
  ggml_metal_encoder_set_pipeline(enc, pipeline);
507
- ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 0);
508
- ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
509
- ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
510
-
511
- if (tsrc0 == GGML_TYPE_QJL1_256) {
512
- milady_qjl_dequant_args args = { /* head_dim = */ 128u, /* proj_dim = */ 256u };
513
- ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 3);
514
- } else { // GGML_TYPE_Q4_POLAR
515
- milady_polar_dequant_args args = { /* head_dim = */ 128u, /* use_qjl = */ 0u };
516
- ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 3);
520
+ ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 3);
521
+
522
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
523
+ const size_t q_i3 = (size_t) i3 * q->nb[3];
524
+ const size_t pk_i3 = (size_t) i3 * pk->nb[3];
525
+ const size_t dst_i3 = (size_t) i3 * op->nb[3];
526
+ for (int64_t ib = 0; ib < n_batch; ++ib) {
527
+ ggml_metal_encoder_set_buffer(enc, milady_metal_buffer_offset(q_base, q_i3 + (size_t) ib * q->nb[2]), 0);
528
+ ggml_metal_encoder_set_buffer(enc, milady_metal_buffer_offset(pk_base, pk_i3), 1);
529
+ ggml_metal_encoder_set_buffer(enc, milady_metal_buffer_offset(dst_base, dst_i3 + (size_t) ib * op->nb[2]), 2);
530
+ ggml_metal_encoder_dispatch_threadgroups(enc, (int) n_heads, (int) n_tokens, 1, 32, 1, 1);
531
+ }
517
532
  }
518
533
 
519
- // Single threadgroup, 32 threads, processes one block.
520
- ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, 32, 1, 1);
521
534
  return 1;
522
535
  }
523
536
 
524
537
  `;
525
- let patched = original.replace(muMatAnchor, helpers + muMatAnchor);
538
+ let patched = original.replace(funcAnchor, opFunc + funcAnchor);
526
539
 
527
- // (2) Early-out at the top of ggml_metal_op_mul_mat() — divert milady
528
- // types BEFORE any of the kernel-selection logic that depends on
529
- // ggml_metal_library_get_pipeline_mul_mv.
530
- const muMatBodyAnchor = `int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
531
- ggml_tensor * op = ctx->node(idx);
532
-
533
- ggml_metal_library_t lib = ctx->lib;
534
- ggml_metal_encoder_t enc = ctx->enc;`;
535
- if (!patched.includes(muMatBodyAnchor)) {
536
- throw new Error(`[metal-dispatch] ops.cpp mul_mat body anchor not found`);
540
+ const switchAnchor = ` case GGML_OP_FLASH_ATTN_EXT:
541
+ {
542
+ n_fuse = ggml_metal_op_flash_attn_ext(ctx, idx);
543
+ } break;`;
544
+ if (!patched.includes(switchAnchor)) {
545
+ throw new Error(
546
+ `[metal-qjl-attn] ops.cpp switch anchor not found at ${opsPath}; inspect encode switch.`,
547
+ );
537
548
  }
538
- const muMatEarly = `${muMatBodyAnchor}
539
-
540
- // ${SENTINEL_DISPATCH}
541
- {
542
- const ggml_type tsrc0 = op->src[0]->type;
543
- if (tsrc0 == GGML_TYPE_QJL1_256 || tsrc0 == GGML_TYPE_Q4_POLAR ||
544
- tsrc0 == GGML_TYPE_TBQ3_0 || tsrc0 == GGML_TYPE_TBQ4_0 ||
545
- tsrc0 == GGML_TYPE_TBQ3_TCQ) {
546
- return ggml_metal_op_mul_mv_milady_quant(ctx, idx);
547
- }
548
- (void) lib; (void) enc;
549
- }`;
550
- patched = patched.replace(muMatBodyAnchor, muMatEarly);
551
-
552
- // (3) Early-out at the top of ggml_metal_op_get_rows().
553
- const grBodyAnchor = `int ggml_metal_op_get_rows(ggml_metal_op_t ctx, int idx) {
554
- ggml_tensor * op = ctx->node(idx);
549
+ const switchInsert = `${switchAnchor}
550
+ case GGML_OP_ATTN_SCORE_QJL:
551
+ {
552
+ n_fuse = ggml_metal_op_attn_score_qjl(ctx, idx);
553
+ } break;`;
554
+ patched = patched.replace(switchAnchor, switchInsert);
555
+ if (!dryRun) fs.writeFileSync(opsPath, patched, "utf8");
556
+ return { changed: !dryRun, path: opsPath };
557
+ }
555
558
 
556
- ggml_metal_library_t lib = ctx->lib;
557
- ggml_metal_encoder_t enc = ctx->enc;`;
558
- if (!patched.includes(grBodyAnchor)) {
559
- throw new Error(`[metal-dispatch] ops.cpp get_rows body anchor not found`);
559
+ function patchMetalQjlAttnSupportsOp(cacheDir, { dryRun }) {
560
+ const deviceMPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-device.m");
561
+ const original = fs.readFileSync(deviceMPath, "utf8");
562
+ if (original.includes(SENTINEL_QJL_ATTN)) {
563
+ return { changed: false, path: deviceMPath };
560
564
  }
561
- const grEarly = `${grBodyAnchor}
562
-
563
- // ${SENTINEL_DISPATCH}
564
- {
565
- const ggml_type tsrc0 = op->src[0]->type;
566
- if (tsrc0 == GGML_TYPE_QJL1_256 || tsrc0 == GGML_TYPE_Q4_POLAR) {
567
- return ggml_metal_op_get_rows_milady_quant(ctx, idx);
568
- }
569
- if (tsrc0 == GGML_TYPE_TBQ3_0 || tsrc0 == GGML_TYPE_TBQ4_0 || tsrc0 == GGML_TYPE_TBQ3_TCQ) {
570
- GGML_LOG_ERROR("get_rows: type %s has no standalone get_rows kernel (tbq* attention-only)\\n",
571
- ggml_type_name(tsrc0));
572
- GGML_ABORT("get_rows: tbq* not wired");
573
- }
574
- (void) lib; (void) enc;
575
- }`;
576
- patched = patched.replace(grBodyAnchor, grEarly);
577
-
578
- if (patched === original) {
579
- throw new Error(`[metal-dispatch] ops.cpp replace produced no change`);
565
+ const anchor = ` case GGML_OP_FLASH_ATTN_EXT:
566
+ // for new head sizes, add checks here`;
567
+ if (!original.includes(anchor)) {
568
+ throw new Error(
569
+ `[metal-qjl-attn] supports_op anchor not found at ${deviceMPath}; inspect GGML_OP_FLASH_ATTN_EXT branch.`,
570
+ );
580
571
  }
581
- if (!dryRun) fs.writeFileSync(opsPath, patched, "utf8");
582
- return { changed: !dryRun, path: opsPath };
572
+ const insert = ` case GGML_OP_ATTN_SCORE_QJL:
573
+ // ${SENTINEL_QJL_ATTN}
574
+ return has_simdgroup_reduction &&
575
+ op->type == GGML_TYPE_F32 &&
576
+ op->src[0] != NULL &&
577
+ op->src[1] != NULL &&
578
+ op->src[0]->type == GGML_TYPE_F32 &&
579
+ op->src[1]->type == GGML_TYPE_QJL1_256 &&
580
+ op->src[0]->ne[0] == 256 &&
581
+ op->src[1]->ne[0] == 128;
582
+ ${anchor}`;
583
+ const patched = original.replace(anchor, insert);
584
+ if (!dryRun) fs.writeFileSync(deviceMPath, patched, "utf8");
585
+ return { changed: !dryRun, path: deviceMPath };
586
+ }
587
+
588
+ function patchMetalQjlAttnDispatch(cacheDir, { dryRun }) {
589
+ const header = patchMetalQjlAttnHeader(cacheDir, { dryRun });
590
+ const deviceCpp = patchMetalQjlAttnDeviceCpp(cacheDir, { dryRun });
591
+ const opsHeader = patchMetalQjlAttnOpsHeader(cacheDir, { dryRun });
592
+ const opsCpp = patchMetalQjlAttnOpsCpp(cacheDir, { dryRun });
593
+ const supportsOp = patchMetalQjlAttnSupportsOp(cacheDir, { dryRun });
594
+ return { header, deviceCpp, opsHeader, opsCpp, supportsOp };
583
595
  }
584
596
 
585
597
  export function patchMetalDispatch(cacheDir, { dryRun = false } = {}) {
@@ -613,7 +625,11 @@ export function patchMetalDispatch(cacheDir, { dryRun = false } = {}) {
613
625
  } else {
614
626
  console.log(`${dryRun ? "(dry-run) " : ""}${message}`);
615
627
  }
616
- return { status: "not-applied", unsafePatchPresent: patchedFiles };
628
+ const qjlAttn = patchMetalQjlAttnDispatch(cacheDir, { dryRun });
629
+ console.log(
630
+ `[metal-dispatch] ${dryRun ? "(dry-run) " : ""}wired dedicated GGML_OP_ATTN_SCORE_QJL dispatch via kernel_attn_score_qjl1_256`,
631
+ );
632
+ return { status: "qjl-attn-only", unsafePatchPresent: patchedFiles, qjlAttn };
617
633
  }
618
634
 
619
635
  // Public entry point used by build-llama-cpp-dflash.mjs.
@@ -625,6 +641,7 @@ export function patchMetalKernels(cacheDir, { dryRun = false } = {}) {
625
641
  assertStandalonesPresent();
626
642
  const copied = copyStandalonesIntoFork(cacheDir, { dryRun });
627
643
  const cmake = patchMetalCMakeLists(cacheDir, { dryRun });
644
+ const embeddedLoader = patchEmbeddedMetallibLoader(cacheDir, { dryRun });
628
645
  const dispatch = patchMetalDispatch(cacheDir, { dryRun });
629
646
  console.log(
630
647
  `[metal-kernels] ${dryRun ? "(dry-run) " : ""}wired ${copied.length} shipped Metal kernels: ${METAL_KERNEL_FILES.join(", ")}`,
@@ -632,5 +649,8 @@ export function patchMetalKernels(cacheDir, { dryRun = false } = {}) {
632
649
  console.log(
633
650
  `[metal-kernels] ${dryRun ? "(dry-run) " : ""}CMakeLists.txt: ${cmake.changed ? "patched" : "already-patched"} (${cmake.path})`,
634
651
  );
635
- return { copied, cmake, dispatch };
652
+ console.log(
653
+ `[metal-kernels] ${dryRun ? "(dry-run) " : ""}embedded loader: ${embeddedLoader.changed ? "patched" : "already-patched"} (${embeddedLoader.path})`,
654
+ );
655
+ return { copied, cmake, embeddedLoader, dispatch };
636
656
  }