@elizaos/app-core 2.0.0-beta.1 → 2.0.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/platforms/electrobun/native/macos/window-effects.mm +103 -0
- package/platforms/electrobun/package.json +9 -0
- package/platforms/electrobun/src/__stubs__/bun-ffi.ts +16 -0
- package/platforms/electrobun/src/libMacWindowEffects.dylib +0 -0
- package/platforms/electrobun/src/native/agent.ts +74 -3
- package/platforms/electrobun/src/native/desktop.ts +39 -6
- package/platforms/electrobun/src/native/mac-window-effects.ts +61 -1
- package/platforms/electrobun/src/native/permissions-shared.ts +3 -2
- package/platforms/electrobun/src/native/permissions.ts +11 -6
- package/platforms/electrobun/src/rpc-handlers.ts +7 -0
- package/platforms/electrobun/src/rpc-schema.ts +39 -4
- package/platforms/electrobun/src/runtime-permissions.ts +7 -1
- package/runtime/ensure-local-inference-handler.d.ts +1 -0
- package/runtime/ensure-local-inference-handler.d.ts.map +1 -1
- package/runtime/ensure-local-inference-handler.js +9 -0
- package/runtime/mode/remote-forwarder.d.ts.map +1 -1
- package/runtime/mode/remote-forwarder.js +1 -1
- package/runtime/mode/runtime-mode.d.ts +20 -2
- package/runtime/mode/runtime-mode.d.ts.map +1 -1
- package/runtime/mode/runtime-mode.js +69 -1
- package/scripts/aosp/stage-default-models.mjs +2 -2
- package/scripts/build-llama-cpp-dflash.mjs +75 -40
- package/scripts/kernel-patches/metal-kernels.mjs +357 -337
- package/scripts/lib/read-app-identity.mjs +5 -1
- package/services/local-inference/catalog.d.ts +2 -1
- package/services/local-inference/catalog.d.ts.map +1 -1
- package/services/local-inference/catalog.js +131 -12
- package/services/local-inference/downloader.d.ts +2 -0
- package/services/local-inference/downloader.d.ts.map +1 -1
- package/services/local-inference/downloader.js +300 -1
- package/services/local-inference/manifest/validator.d.ts.map +1 -1
- package/services/local-inference/manifest/validator.js +48 -0
- package/services/local-inference/providers.d.ts +1 -1
- package/services/local-inference/providers.js +6 -6
- package/services/local-inference/registry.d.ts.map +1 -1
- package/services/local-inference/registry.js +10 -1
- package/services/local-inference/types.d.ts +6 -0
- package/services/local-inference/types.d.ts.map +1 -1
- package/test/helpers/real-runtime.ts +21 -20
- package/platforms/electrobun/src/native/permissions-darwin.ts +0 -342
- package/platforms/electrobun/src/native/permissions-linux.ts +0 -34
- package/platforms/electrobun/src/native/permissions-win32.ts +0 -56
|
@@ -9,11 +9,10 @@
|
|
|
9
9
|
// self-contained TUs (only #include <metal_stdlib>; their own structs,
|
|
10
10
|
// constants, kernel symbols), so they compile as independent .air files.
|
|
11
11
|
//
|
|
12
|
-
// 2. Patches ggml/src/ggml-metal/CMakeLists.txt so
|
|
13
|
-
//
|
|
14
|
-
//
|
|
15
|
-
//
|
|
16
|
-
// into default.metallib via a single `xcrun metallib` invocation.
|
|
12
|
+
// 2. Patches ggml/src/ggml-metal/CMakeLists.txt so both Metal packaging
|
|
13
|
+
// branches build each standalone shader into its own .air via
|
|
14
|
+
// `xcrun metal -c` and merge all .air files (the original ggml-metal.air
|
|
15
|
+
// plus the five milady .air files) into one default.metallib.
|
|
17
16
|
//
|
|
18
17
|
// The original CMake snippet pipes `xcrun metal | xcrun metallib`. We
|
|
19
18
|
// replace that with explicit per-source compilation + a final merge step,
|
|
@@ -35,11 +34,10 @@
|
|
|
35
34
|
// not yet selected by the runtime — the symbol-presence audit (`nm`,
|
|
36
35
|
// `strings default.metallib`) passes, the dispatch audit does not.
|
|
37
36
|
//
|
|
38
|
-
// * EMBED_LIBRARY path used by iOS targets
|
|
39
|
-
//
|
|
40
|
-
// duplicate
|
|
41
|
-
//
|
|
42
|
-
// separate patch and is documented as a deferred gap.
|
|
37
|
+
// * Convert the EMBED_LIBRARY path used by iOS targets to embed compiled
|
|
38
|
+
// metallib bytes rather than concatenated Metal source. This avoids
|
|
39
|
+
// duplicate declarations between ggml-metal.metal + standalones and lets
|
|
40
|
+
// iOS load the same multi-TU kernel set as desktop.
|
|
43
41
|
|
|
44
42
|
import fs from "node:fs";
|
|
45
43
|
import path from "node:path";
|
|
@@ -70,6 +68,9 @@ export const METAL_KERNEL_FILES = [
|
|
|
70
68
|
];
|
|
71
69
|
|
|
72
70
|
const SENTINEL = "# MILADY-KERNEL-PATCH-V1";
|
|
71
|
+
const SENTINEL_EMBED = "# MILADY-KERNEL-EMBED-PATCH-V1";
|
|
72
|
+
const SENTINEL_EMBED_LOADER = "// MILADY-EMBEDDED-METALLIB-LOADER-V1";
|
|
73
|
+
const SENTINEL_QJL_ATTN = "// MILADY-QJL-ATTN-DISPATCH-V1";
|
|
73
74
|
|
|
74
75
|
function inForkRelpath(name) {
|
|
75
76
|
return path.posix.join("ggml", "src", "ggml-metal", "milady-shipped", name);
|
|
@@ -137,13 +138,10 @@ function copyStandalonesIntoFork(cacheDir, { dryRun }) {
|
|
|
137
138
|
return copied;
|
|
138
139
|
}
|
|
139
140
|
|
|
140
|
-
// Patch ggml/src/ggml-metal/CMakeLists.txt
|
|
141
|
-
//
|
|
142
|
-
//
|
|
143
|
-
//
|
|
144
|
-
// We anchor on the `add_custom_command(OUTPUT ${...}/default.metallib` line
|
|
145
|
-
// in the non-EMBED_LIBRARY branch; that is the only metallib build the
|
|
146
|
-
// darwin host metal target uses. Idempotent via SENTINEL.
|
|
141
|
+
// Patch ggml/src/ggml-metal/CMakeLists.txt so desktop and iOS both compile
|
|
142
|
+
// ggml-metal.metal + every standalone into separate .air files and merge them
|
|
143
|
+
// into one default.metallib. iOS then embeds that binary metallib into the
|
|
144
|
+
// static archive instead of embedding concatenated source.
|
|
147
145
|
function patchMetalCMakeLists(cacheDir, { dryRun }) {
|
|
148
146
|
const cmakePath = path.join(
|
|
149
147
|
cacheDir,
|
|
@@ -158,14 +156,95 @@ function patchMetalCMakeLists(cacheDir, { dryRun }) {
|
|
|
158
156
|
);
|
|
159
157
|
}
|
|
160
158
|
const original = fs.readFileSync(cmakePath, "utf8");
|
|
161
|
-
|
|
162
|
-
|
|
159
|
+
let patched = original;
|
|
160
|
+
let changed = false;
|
|
161
|
+
|
|
162
|
+
const miladyAirLinesForSdk = (sdkExpr) =>
|
|
163
|
+
METAL_KERNEL_FILES.map((name) => {
|
|
164
|
+
const stem = name.replace(/\.metal$/, "");
|
|
165
|
+
return ` COMMAND xcrun -sdk ${sdkExpr} metal \${XC_FLAGS} -c \${CMAKE_CURRENT_SOURCE_DIR}/milady-shipped/${name} -o \${CMAKE_CURRENT_BINARY_DIR}/${stem}.air`;
|
|
166
|
+
}).join("\n");
|
|
167
|
+
const miladyAirInputs = METAL_KERNEL_FILES.map((name) => {
|
|
168
|
+
const stem = name.replace(/\.metal$/, "");
|
|
169
|
+
return `\${CMAKE_CURRENT_BINARY_DIR}/${stem}.air`;
|
|
170
|
+
}).join(" ");
|
|
171
|
+
const miladyDepends = METAL_KERNEL_FILES.map(
|
|
172
|
+
(name) => `\${CMAKE_CURRENT_SOURCE_DIR}/milady-shipped/${name}`,
|
|
173
|
+
).join(" ");
|
|
174
|
+
|
|
175
|
+
if (!patched.includes(SENTINEL_EMBED)) {
|
|
176
|
+
const embedStart = patched.indexOf(
|
|
177
|
+
" # merge ggml-common.h and ggml-metal.metal into a single file",
|
|
178
|
+
);
|
|
179
|
+
const embedEnd =
|
|
180
|
+
embedStart === -1
|
|
181
|
+
? -1
|
|
182
|
+
: patched.indexOf(
|
|
183
|
+
"\n\n target_sources(ggml-metal PRIVATE \"${METALLIB_EMBED_ASM}\")",
|
|
184
|
+
embedStart,
|
|
185
|
+
);
|
|
186
|
+
if (embedStart === -1 || embedEnd === -1) {
|
|
187
|
+
throw new Error(
|
|
188
|
+
`[metal-kernels] embedded Metal CMake anchor not found at ${cmakePath}; ` +
|
|
189
|
+
`the fork's GGML_METAL_EMBED_LIBRARY branch changed shape and the patch must be revisited.`,
|
|
190
|
+
);
|
|
191
|
+
}
|
|
192
|
+
const embedAirLines = miladyAirLinesForSdk("${METAL_SDK}");
|
|
193
|
+
const embedReplacement = ` # ${SENTINEL_EMBED}
|
|
194
|
+
# Build a compiled default.metallib for embedded-library targets (iOS).
|
|
195
|
+
# The upstream path embedded concatenated Metal source and JIT-compiled it
|
|
196
|
+
# at runtime. That cannot include the milady standalones because the source
|
|
197
|
+
# TUs intentionally redeclare block_* structs/constants that already exist
|
|
198
|
+
# in ggml-common.h. Compile each TU separately, merge into one metallib,
|
|
199
|
+
# and embed the binary metallib bytes instead.
|
|
200
|
+
set(METALLIB_EMBED_ASM "\${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
|
|
201
|
+
set(METALLIB_SOURCE_EMBED "\${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
|
|
202
|
+
set(METALLIB_SOURCE_EMBED_TMP "\${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
|
|
203
|
+
set(METALLIB_EMBED_BINARY "\${CMAKE_CURRENT_BINARY_DIR}/autogenerated/default.metallib")
|
|
204
|
+
set(METALLIB_EMBED_AIR "\${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.air")
|
|
205
|
+
set(METAL_SDK "\${CMAKE_OSX_SYSROOT}")
|
|
206
|
+
if (NOT METAL_SDK)
|
|
207
|
+
set(METAL_SDK macosx)
|
|
208
|
+
endif()
|
|
209
|
+
if (GGML_METAL_SHADER_DEBUG)
|
|
210
|
+
set(XC_FLAGS -fno-fast-math -fno-inline)
|
|
211
|
+
else()
|
|
212
|
+
set(XC_FLAGS -O3)
|
|
213
|
+
endif()
|
|
214
|
+
if (GGML_METAL_STD)
|
|
215
|
+
list(APPEND XC_FLAGS -std=\${GGML_METAL_STD})
|
|
216
|
+
endif()
|
|
217
|
+
|
|
218
|
+
add_custom_command(
|
|
219
|
+
OUTPUT "\${METALLIB_EMBED_ASM}"
|
|
220
|
+
COMMAND echo "Embedding Metal library (compiled metallib + milady-shipped kernels)"
|
|
221
|
+
COMMAND sed -e "/__embed_ggml-common.h__/r \${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "\${METALLIB_SOURCE}" > "\${METALLIB_SOURCE_EMBED_TMP}"
|
|
222
|
+
COMMAND sed -e "/\\#include \\"ggml-metal-impl.h\\"/r \${METALLIB_IMPL}" -e "/\\#include \\"ggml-metal-impl.h\\"/d" < "\${METALLIB_SOURCE_EMBED_TMP}" > "\${METALLIB_SOURCE_EMBED}"
|
|
223
|
+
COMMAND xcrun -sdk \${METAL_SDK} metal \${XC_FLAGS} -DGGML_METAL_EMBED_LIBRARY=1 -c "\${METALLIB_SOURCE_EMBED}" -o "\${METALLIB_EMBED_AIR}"
|
|
224
|
+
${embedAirLines}
|
|
225
|
+
COMMAND xcrun -sdk \${METAL_SDK} metallib "\${METALLIB_EMBED_AIR}" ${miladyAirInputs} -o "\${METALLIB_EMBED_BINARY}"
|
|
226
|
+
COMMAND echo ".section __DATA,__ggml_metallib" > "\${METALLIB_EMBED_ASM}"
|
|
227
|
+
COMMAND echo ".globl _ggml_metallib_start" >> "\${METALLIB_EMBED_ASM}"
|
|
228
|
+
COMMAND echo "_ggml_metallib_start:" >> "\${METALLIB_EMBED_ASM}"
|
|
229
|
+
COMMAND echo .incbin "\\"\${METALLIB_EMBED_BINARY}\\"" >> "\${METALLIB_EMBED_ASM}"
|
|
230
|
+
COMMAND echo ".globl _ggml_metallib_end" >> "\${METALLIB_EMBED_ASM}"
|
|
231
|
+
COMMAND echo "_ggml_metallib_end:" >> "\${METALLIB_EMBED_ASM}"
|
|
232
|
+
DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h ${miladyDepends}
|
|
233
|
+
COMMENT "Generate assembly for embedded compiled Metal library"
|
|
234
|
+
VERBATIM
|
|
235
|
+
)`;
|
|
236
|
+
patched =
|
|
237
|
+
patched.slice(0, embedStart) +
|
|
238
|
+
embedReplacement +
|
|
239
|
+
patched.slice(embedEnd);
|
|
240
|
+
changed = true;
|
|
163
241
|
}
|
|
164
242
|
|
|
165
243
|
// The exact block we replace. This pipe pattern has been stable in the
|
|
166
244
|
// milady-ai/llama.cpp fork for the entire v0.4.x line; if the upstream
|
|
167
245
|
// ever rewrites it we want to fail loudly rather than silently no-op.
|
|
168
|
-
|
|
246
|
+
if (!patched.includes(SENTINEL)) {
|
|
247
|
+
const anchor = ` add_custom_command(
|
|
169
248
|
OUTPUT \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
|
170
249
|
COMMAND xcrun -sdk macosx metal \${XC_FLAGS} -c \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o - |
|
|
171
250
|
xcrun -sdk macosx metallib - -o \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
|
@@ -174,29 +253,16 @@ function patchMetalCMakeLists(cacheDir, { dryRun }) {
|
|
|
174
253
|
DEPENDS ggml-metal.metal \${METALLIB_COMMON}
|
|
175
254
|
COMMENT "Compiling Metal kernels"
|
|
176
255
|
)`;
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
// Replacement: compile ggml-metal.metal AND each shipped standalone into
|
|
186
|
-
// its own .air file, then merge them all into default.metallib.
|
|
187
|
-
const milady_air_lines = METAL_KERNEL_FILES.map((name) => {
|
|
188
|
-
const stem = name.replace(/\.metal$/, "");
|
|
189
|
-
return ` COMMAND xcrun -sdk macosx metal \${XC_FLAGS} -c \${CMAKE_CURRENT_SOURCE_DIR}/milady-shipped/${name} -o \${CMAKE_CURRENT_BINARY_DIR}/${stem}.air`;
|
|
190
|
-
}).join("\n");
|
|
191
|
-
const milady_air_inputs = METAL_KERNEL_FILES.map((name) => {
|
|
192
|
-
const stem = name.replace(/\.metal$/, "");
|
|
193
|
-
return `\${CMAKE_CURRENT_BINARY_DIR}/${stem}.air`;
|
|
194
|
-
}).join(" ");
|
|
195
|
-
const milady_depends = METAL_KERNEL_FILES.map(
|
|
196
|
-
(name) => `\${CMAKE_CURRENT_SOURCE_DIR}/milady-shipped/${name}`,
|
|
197
|
-
).join(" ");
|
|
256
|
+
if (!patched.includes(anchor)) {
|
|
257
|
+
throw new Error(
|
|
258
|
+
`[metal-kernels] CMakeLists.txt anchor not found at ${cmakePath}; ` +
|
|
259
|
+
`the fork's metallib build snippet has changed shape and the patch ` +
|
|
260
|
+
`must be revisited. Inspect the file's add_custom_command for default.metallib.`,
|
|
261
|
+
);
|
|
262
|
+
}
|
|
198
263
|
|
|
199
|
-
|
|
264
|
+
const miladyAirLines = miladyAirLinesForSdk("macosx");
|
|
265
|
+
const replacement = ` # ${SENTINEL}
|
|
200
266
|
# Build ggml-metal.metal AND each milady standalone shader into its own
|
|
201
267
|
# .air file, then merge all .air files into a single default.metallib.
|
|
202
268
|
# The standalones are self-contained TUs (only #include <metal_stdlib>;
|
|
@@ -205,23 +271,23 @@ function patchMetalCMakeLists(cacheDir, { dryRun }) {
|
|
|
205
271
|
add_custom_command(
|
|
206
272
|
OUTPUT \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
|
207
273
|
COMMAND xcrun -sdk macosx metal \${XC_FLAGS} -c \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o \${CMAKE_CURRENT_BINARY_DIR}/ggml-metal.air
|
|
208
|
-
${
|
|
209
|
-
COMMAND xcrun -sdk macosx metallib \${CMAKE_CURRENT_BINARY_DIR}/ggml-metal.air ${
|
|
274
|
+
${miladyAirLines}
|
|
275
|
+
COMMAND xcrun -sdk macosx metallib \${CMAKE_CURRENT_BINARY_DIR}/ggml-metal.air ${miladyAirInputs} -o \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
|
210
276
|
COMMAND rm -f \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
|
|
211
277
|
COMMAND rm -f \${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
|
|
212
|
-
DEPENDS ggml-metal.metal \${METALLIB_COMMON} ${
|
|
278
|
+
DEPENDS ggml-metal.metal \${METALLIB_COMMON} ${miladyDepends}
|
|
213
279
|
COMMENT "Compiling Metal kernels (ggml-metal + milady-shipped: ${METAL_KERNEL_FILES.join(", ")})"
|
|
214
280
|
)`;
|
|
281
|
+
patched = patched.replace(anchor, replacement);
|
|
282
|
+
changed = true;
|
|
283
|
+
}
|
|
215
284
|
|
|
216
|
-
const patched = original.replace(anchor, replacement);
|
|
217
285
|
if (patched === original) {
|
|
218
|
-
|
|
219
|
-
`[metal-kernels] anchor matched but replacement did not change ${cmakePath}; this is a bug`,
|
|
220
|
-
);
|
|
286
|
+
return { changed: false, path: cmakePath };
|
|
221
287
|
}
|
|
222
288
|
if (dryRun) {
|
|
223
289
|
console.log(
|
|
224
|
-
`[metal-kernels] (dry-run) would patch ${cmakePath} (
|
|
290
|
+
`[metal-kernels] (dry-run) would patch ${cmakePath} (changed=${changed}, includes ${METAL_KERNEL_FILES.length} shipped kernels)`,
|
|
225
291
|
);
|
|
226
292
|
return { changed: false, path: cmakePath };
|
|
227
293
|
}
|
|
@@ -229,86 +295,120 @@ ${milady_air_lines}
|
|
|
229
295
|
return { changed: true, path: cmakePath };
|
|
230
296
|
}
|
|
231
297
|
|
|
232
|
-
|
|
298
|
+
function patchEmbeddedMetallibLoader(cacheDir, { dryRun }) {
|
|
299
|
+
const deviceMPath = path.join(
|
|
300
|
+
cacheDir,
|
|
301
|
+
"ggml",
|
|
302
|
+
"src",
|
|
303
|
+
"ggml-metal",
|
|
304
|
+
"ggml-metal-device.m",
|
|
305
|
+
);
|
|
306
|
+
if (!fs.existsSync(deviceMPath)) {
|
|
307
|
+
throw new Error(
|
|
308
|
+
`[metal-kernels] expected ${deviceMPath} to exist on the fork; cannot wire embedded metallib loader`,
|
|
309
|
+
);
|
|
310
|
+
}
|
|
311
|
+
const original = fs.readFileSync(deviceMPath, "utf8");
|
|
312
|
+
if (original.includes(SENTINEL_EMBED_LOADER)) {
|
|
313
|
+
return { changed: false, path: deviceMPath };
|
|
314
|
+
}
|
|
315
|
+
const anchor = `#if GGML_METAL_EMBED_LIBRARY
|
|
316
|
+
GGML_LOG_INFO("%s: using embedded metal library\\n", __func__);
|
|
317
|
+
|
|
318
|
+
extern const char ggml_metallib_start[];
|
|
319
|
+
extern const char ggml_metallib_end[];
|
|
320
|
+
|
|
321
|
+
src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
|
322
|
+
#else`;
|
|
323
|
+
if (!original.includes(anchor)) {
|
|
324
|
+
throw new Error(
|
|
325
|
+
`[metal-kernels] embedded Metal loader anchor not found at ${deviceMPath}; ` +
|
|
326
|
+
`the fork's GGML_METAL_EMBED_LIBRARY loader changed shape and the patch must be revisited.`,
|
|
327
|
+
);
|
|
328
|
+
}
|
|
329
|
+
const replacement = `#if GGML_METAL_EMBED_LIBRARY
|
|
330
|
+
GGML_LOG_INFO("%s: using embedded compiled metal library\\n", __func__);
|
|
331
|
+
|
|
332
|
+
extern const char ggml_metallib_start[];
|
|
333
|
+
extern const char ggml_metallib_end[];
|
|
334
|
+
|
|
335
|
+
// ${SENTINEL_EMBED_LOADER}
|
|
336
|
+
// The build patch embeds compiled default.metallib bytes here, not
|
|
337
|
+
// Metal source. Loading with newLibraryWithData keeps iOS on the same
|
|
338
|
+
// multi-TU kernel set as desktop and avoids duplicate declarations
|
|
339
|
+
// between ggml-metal.metal and the milady standalone shaders.
|
|
340
|
+
const NSUInteger metallib_len = (NSUInteger)(ggml_metallib_end - ggml_metallib_start);
|
|
341
|
+
dispatch_data_t metallib_data = dispatch_data_create(ggml_metallib_start, metallib_len, nil, DISPATCH_DATA_DESTRUCTOR_DEFAULT);
|
|
342
|
+
library = [device newLibraryWithData:metallib_data error:&error];
|
|
343
|
+
if (error) {
|
|
344
|
+
GGML_LOG_ERROR("%s: error: %s\\n", __func__, [[error description] UTF8String]);
|
|
345
|
+
return nil;
|
|
346
|
+
}
|
|
347
|
+
#else`;
|
|
348
|
+
const patched = original.replace(anchor, replacement);
|
|
349
|
+
if (patched === original) {
|
|
350
|
+
throw new Error("[metal-kernels] embedded loader replace produced no change");
|
|
351
|
+
}
|
|
352
|
+
if (!dryRun) fs.writeFileSync(deviceMPath, patched, "utf8");
|
|
353
|
+
return { changed: !dryRun, path: deviceMPath };
|
|
354
|
+
}
|
|
233
355
|
|
|
234
|
-
|
|
235
|
-
// Used by both the type-detection helper and the milady pipeline lookup.
|
|
236
|
-
// Note: TBQ3_0=43, TBQ4_0=44, QJL1_256=46, Q4_POLAR=47, TBQ3_TCQ=48.
|
|
237
|
-
const MILADY_QUANT_TYPES = ["TBQ3_0", "TBQ4_0", "QJL1_256", "Q4_POLAR", "TBQ3_TCQ"];
|
|
356
|
+
const SENTINEL_DISPATCH = "// MILADY-DISPATCH-V1";
|
|
238
357
|
|
|
239
|
-
function
|
|
358
|
+
function patchMetalQjlAttnHeader(cacheDir, { dryRun }) {
|
|
240
359
|
const headerPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-device.h");
|
|
241
360
|
const original = fs.readFileSync(headerPath, "utf8");
|
|
242
|
-
if (original.includes(
|
|
361
|
+
if (original.includes(SENTINEL_QJL_ATTN)) {
|
|
243
362
|
return { changed: false, path: headerPath };
|
|
244
363
|
}
|
|
245
|
-
const anchor = `struct ggml_metal_pipeline_with_params
|
|
364
|
+
const anchor = `struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext(
|
|
365
|
+
ggml_metal_library_t lib,
|
|
366
|
+
const struct ggml_tensor * op,
|
|
367
|
+
bool has_mask,
|
|
368
|
+
bool has_sinks,
|
|
369
|
+
bool has_bias,
|
|
370
|
+
bool has_scap,
|
|
371
|
+
bool has_kvpad,
|
|
372
|
+
int32_t nsg);`;
|
|
246
373
|
if (!original.includes(anchor)) {
|
|
247
374
|
throw new Error(
|
|
248
|
-
`[metal-
|
|
375
|
+
`[metal-qjl-attn] device.h anchor not found at ${headerPath}; inspect flash-attn pipeline declarations.`,
|
|
249
376
|
);
|
|
250
377
|
}
|
|
251
378
|
const insert = `${anchor}
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
struct ggml_metal_pipeline_with_params
|
|
379
|
+
|
|
380
|
+
${SENTINEL_QJL_ATTN}
|
|
381
|
+
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_attn_score_qjl(
|
|
382
|
+
ggml_metal_library_t lib);`;
|
|
255
383
|
const patched = original.replace(anchor, insert);
|
|
256
|
-
if (patched === original) {
|
|
257
|
-
throw new Error(`[metal-dispatch] header replace produced no change`);
|
|
258
|
-
}
|
|
259
384
|
if (!dryRun) fs.writeFileSync(headerPath, patched, "utf8");
|
|
260
385
|
return { changed: !dryRun, path: headerPath };
|
|
261
386
|
}
|
|
262
387
|
|
|
263
|
-
function
|
|
388
|
+
function patchMetalQjlAttnDeviceCpp(cacheDir, { dryRun }) {
|
|
264
389
|
const cppPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-device.cpp");
|
|
265
390
|
const original = fs.readFileSync(cppPath, "utf8");
|
|
266
|
-
if (original.includes(
|
|
391
|
+
if (original.includes(SENTINEL_QJL_ATTN)) {
|
|
267
392
|
return { changed: false, path: cppPath };
|
|
268
393
|
}
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
// metallib compiler and fail because the standalones don't declare
|
|
275
|
-
// the `nsg` function constant). Pure name lookup against the already-
|
|
276
|
-
// loaded library — fails fast with GGML_ABORT if the symbol is not
|
|
277
|
-
// present in default.metallib (which would mean the kernel-shipment
|
|
278
|
-
// patch above silently regressed).
|
|
279
|
-
const helpersAnchor = `ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows(ggml_metal_library_t lib, ggml_type tidx, ggml_type tdst) {`;
|
|
280
|
-
if (!original.includes(helpersAnchor)) {
|
|
281
|
-
throw new Error(`[metal-dispatch] device.cpp helpers anchor not found at ${cppPath}`);
|
|
394
|
+
const anchor = `ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(ggml_metal_library_t lib, const ggml_tensor * op, int32_t n_fuse) {`;
|
|
395
|
+
if (!original.includes(anchor)) {
|
|
396
|
+
throw new Error(
|
|
397
|
+
`[metal-qjl-attn] device.cpp anchor not found at ${cppPath}; inspect pipeline helper layout.`,
|
|
398
|
+
);
|
|
282
399
|
}
|
|
283
|
-
const
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
// (qjl_score_args / qjl_mv_args / qjl_dequant_args / polar_mv_args /
|
|
287
|
-
// polar_dequant_args) that do NOT match ggml_metal_kargs_mul_mv. The
|
|
288
|
-
// standard get_pipeline_mul_mv helper sets a 'nsg' function constant
|
|
289
|
-
// the standalones do not declare; calling it crashes the metallib
|
|
290
|
-
// compiler. We keep this lookup constant-free.
|
|
291
|
-
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_milady_mul_mv(ggml_metal_library_t lib, ggml_type tsrc0) {
|
|
292
|
-
char name[256];
|
|
293
|
-
switch (tsrc0) {
|
|
294
|
-
case GGML_TYPE_QJL1_256: snprintf(name, 256, "kernel_mul_mv_qjl1_256_f32"); break;
|
|
295
|
-
case GGML_TYPE_Q4_POLAR: snprintf(name, 256, "kernel_mul_mv_q4_polar_f32"); break;
|
|
296
|
-
default:
|
|
297
|
-
GGML_LOG_ERROR("milady_mul_mv: type %s (%d) has no mul_mv standalone (only attention-score)\\n",
|
|
298
|
-
ggml_type_name(tsrc0), (int) tsrc0);
|
|
299
|
-
GGML_ABORT("milady_mul_mv: unsupported milady-quant type for MUL_MAT");
|
|
300
|
-
}
|
|
400
|
+
const helper = `${SENTINEL_QJL_ATTN}
|
|
401
|
+
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_attn_score_qjl(ggml_metal_library_t lib) {
|
|
402
|
+
const char * name = "kernel_attn_score_qjl1_256";
|
|
301
403
|
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
|
302
404
|
if (!res.pipeline) {
|
|
303
|
-
//
|
|
304
|
-
//
|
|
305
|
-
// would set an 'nsg' function constant which the standalones do not
|
|
306
|
-
// declare; we explicitly pass nullptr to bypass that.
|
|
405
|
+
// Standalone shipped shader: it declares no Metal function constants,
|
|
406
|
+
// so compile by direct symbol name with a null constants table.
|
|
307
407
|
res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
|
|
308
408
|
}
|
|
309
409
|
if (!res.pipeline) {
|
|
310
|
-
GGML_LOG_ERROR("
|
|
311
|
-
GGML_ABORT("
|
|
410
|
+
GGML_LOG_ERROR("attn_score_qjl: kernel '%s' missing from default.metallib\\n", name);
|
|
411
|
+
GGML_ABORT("attn_score_qjl: pipeline compile failed");
|
|
312
412
|
}
|
|
313
413
|
res.nr0 = 1;
|
|
314
414
|
res.nr1 = 1;
|
|
@@ -317,269 +417,181 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_milady_mul_mv(gg
|
|
|
317
417
|
return res;
|
|
318
418
|
}
|
|
319
419
|
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
case GGML_TYPE_Q4_POLAR: snprintf(name, 256, "kernel_get_rows_q4_polar"); break;
|
|
325
|
-
default:
|
|
326
|
-
GGML_LOG_ERROR("milady_get_rows: type %s (%d) has no get_rows standalone\\n",
|
|
327
|
-
ggml_type_name(tsrc0), (int) tsrc0);
|
|
328
|
-
GGML_ABORT("milady_get_rows: unsupported milady-quant type for GET_ROWS");
|
|
329
|
-
}
|
|
330
|
-
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
|
331
|
-
if (!res.pipeline) {
|
|
332
|
-
res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
|
|
333
|
-
}
|
|
334
|
-
if (!res.pipeline) {
|
|
335
|
-
GGML_LOG_ERROR("milady_get_rows: kernel '%s' could not be compiled from default.metallib\\n", name);
|
|
336
|
-
GGML_ABORT("milady_get_rows: kernel pipeline compile failed");
|
|
337
|
-
}
|
|
338
|
-
res.nr0 = 1; res.nr1 = 1; res.nsg = 1; res.smem = 0;
|
|
339
|
-
return res;
|
|
420
|
+
`;
|
|
421
|
+
const patched = original.replace(anchor, helper + anchor);
|
|
422
|
+
if (!dryRun) fs.writeFileSync(cppPath, patched, "utf8");
|
|
423
|
+
return { changed: !dryRun, path: cppPath };
|
|
340
424
|
}
|
|
341
425
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
// clean structured abort instead of crashing in the metallib compiler
|
|
348
|
-
// when the `nsg` function constant has no matching declaration.
|
|
349
|
-
const mvSwitchAnchor = ` // use custom matrix x vector kernel
|
|
350
|
-
switch (tsrc0) {`;
|
|
351
|
-
if (!patched.includes(mvSwitchAnchor)) {
|
|
352
|
-
throw new Error(`[metal-dispatch] device.cpp mul_mv switch anchor not found`);
|
|
353
|
-
}
|
|
354
|
-
const mvGuard = ` // ${SENTINEL_DISPATCH}
|
|
355
|
-
// Defence-in-depth: milady-quant types should be diverted by the op-side
|
|
356
|
-
// early-out in ggml_metal_op_mul_mat. If we got here, the dispatch
|
|
357
|
-
// routing has regressed.
|
|
358
|
-
if (tsrc0 == GGML_TYPE_QJL1_256 || tsrc0 == GGML_TYPE_Q4_POLAR ||
|
|
359
|
-
tsrc0 == GGML_TYPE_TBQ3_0 || tsrc0 == GGML_TYPE_TBQ4_0 ||
|
|
360
|
-
tsrc0 == GGML_TYPE_TBQ3_TCQ) {
|
|
361
|
-
GGML_LOG_ERROR("get_pipeline_mul_mv: type %s reached standard helper (op-side dispatch regression)\\n",
|
|
362
|
-
ggml_type_name(tsrc0));
|
|
363
|
-
GGML_ABORT("get_pipeline_mul_mv: milady-quant type leaked into standard pipeline path");
|
|
364
|
-
}
|
|
365
|
-
// use custom matrix x vector kernel
|
|
366
|
-
switch (tsrc0) {`;
|
|
367
|
-
patched = patched.replace(mvSwitchAnchor, mvGuard);
|
|
368
|
-
|
|
369
|
-
// (3) Same defence at the top of ggml_metal_library_get_pipeline_get_rows.
|
|
370
|
-
// This helper auto-builds `kernel_get_rows_<typename>`, which for the
|
|
371
|
-
// milady types yields the right symbol — but it would still try to
|
|
372
|
-
// compile a fresh pipeline if the lookup misses, and the compile path
|
|
373
|
-
// hits ggml-common.h struct redefinition. Hard-fail instead.
|
|
374
|
-
const grAnchor = `ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows(ggml_metal_library_t lib, ggml_type tsrc) {
|
|
375
|
-
char base[256];`;
|
|
376
|
-
if (!patched.includes(grAnchor)) {
|
|
377
|
-
throw new Error(`[metal-dispatch] device.cpp get_rows anchor not found`);
|
|
426
|
+
function patchMetalQjlAttnOpsHeader(cacheDir, { dryRun }) {
|
|
427
|
+
const headerPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-ops.h");
|
|
428
|
+
const original = fs.readFileSync(headerPath, "utf8");
|
|
429
|
+
if (original.includes(SENTINEL_QJL_ATTN)) {
|
|
430
|
+
return { changed: false, path: headerPath };
|
|
378
431
|
}
|
|
379
|
-
const
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
GGML_LOG_ERROR("get_pipeline_get_rows: type %s reached standard helper (op-side dispatch regression)\\n",
|
|
385
|
-
ggml_type_name(tsrc));
|
|
386
|
-
GGML_ABORT("get_pipeline_get_rows: milady-quant type leaked into standard pipeline path");
|
|
387
|
-
}
|
|
388
|
-
char base[256];`;
|
|
389
|
-
patched = patched.replace(grAnchor, grReplace);
|
|
390
|
-
|
|
391
|
-
if (patched === original) {
|
|
392
|
-
throw new Error(`[metal-dispatch] device.cpp replace produced no change`);
|
|
432
|
+
const anchor = `int ggml_metal_op_flash_attn_ext (ggml_metal_op_t ctx, int idx);`;
|
|
433
|
+
if (!original.includes(anchor)) {
|
|
434
|
+
throw new Error(
|
|
435
|
+
`[metal-qjl-attn] ops.h anchor not found at ${headerPath}; inspect op declarations.`,
|
|
436
|
+
);
|
|
393
437
|
}
|
|
394
|
-
|
|
395
|
-
|
|
438
|
+
const insert = `${anchor}
|
|
439
|
+
${SENTINEL_QJL_ATTN}
|
|
440
|
+
int ggml_metal_op_attn_score_qjl (ggml_metal_op_t ctx, int idx);`;
|
|
441
|
+
const patched = original.replace(anchor, insert);
|
|
442
|
+
if (!dryRun) fs.writeFileSync(headerPath, patched, "utf8");
|
|
443
|
+
return { changed: !dryRun, path: headerPath };
|
|
396
444
|
}
|
|
397
445
|
|
|
398
|
-
function
|
|
446
|
+
function patchMetalQjlAttnOpsCpp(cacheDir, { dryRun }) {
|
|
399
447
|
const opsPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-ops.cpp");
|
|
400
448
|
const original = fs.readFileSync(opsPath, "utf8");
|
|
401
|
-
if (original.includes(
|
|
449
|
+
if (original.includes(SENTINEL_QJL_ATTN)) {
|
|
402
450
|
return { changed: false, path: opsPath };
|
|
403
451
|
}
|
|
404
452
|
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
// mis-routed call surfaces immediately.
|
|
411
|
-
// Anchor on int ggml_metal_op_get_rows since it appears earlier in the file
|
|
412
|
-
// than ggml_metal_op_mul_mat — both functions reference the milady helpers
|
|
413
|
-
// through the early-out so the helpers must be visible to BOTH.
|
|
414
|
-
const muMatAnchor = `int ggml_metal_op_get_rows(ggml_metal_op_t ctx, int idx) {`;
|
|
415
|
-
if (!original.includes(muMatAnchor)) {
|
|
416
|
-
throw new Error(`[metal-dispatch] ops.cpp get_rows anchor not found at ${opsPath}`);
|
|
453
|
+
const funcAnchor = `static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {`;
|
|
454
|
+
if (!original.includes(funcAnchor)) {
|
|
455
|
+
throw new Error(
|
|
456
|
+
`[metal-qjl-attn] ops.cpp function anchor not found at ${opsPath}; inspect encode layout.`,
|
|
457
|
+
);
|
|
417
458
|
}
|
|
418
|
-
const
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
static inline
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
static inline bool milady_is_quant_get_rows_supported(ggml_type t) {
|
|
430
|
-
return t == GGML_TYPE_QJL1_256 || t == GGML_TYPE_Q4_POLAR;
|
|
431
|
-
}
|
|
432
|
-
// TBQ3_0 / TBQ4_0 / TBQ3_TCQ — standalones expose only attention-score
|
|
433
|
-
// kernels (kernel_turbo3_dot etc.). MUL_MAT against these types in a
|
|
434
|
-
// generic graph is not yet supported; we surface a clear abort instead
|
|
435
|
-
// of silently routing through a path that crashes in the metallib
|
|
436
|
-
// compiler. See AGENTS.md "TBQ* attention bridge" follow-up.
|
|
437
|
-
static inline bool milady_is_quant_tbq_attn_only(ggml_type t) {
|
|
438
|
-
return t == GGML_TYPE_TBQ3_0 || t == GGML_TYPE_TBQ4_0 || t == GGML_TYPE_TBQ3_TCQ;
|
|
459
|
+
const opFunc = `${SENTINEL_QJL_ATTN}
|
|
460
|
+
struct milady_qjl_score_args {
|
|
461
|
+
uint32_t n_heads;
|
|
462
|
+
uint32_t n_kv_heads;
|
|
463
|
+
uint32_t n_tokens;
|
|
464
|
+
uint32_t proj_dim;
|
|
465
|
+
};
|
|
466
|
+
|
|
467
|
+
static inline ggml_metal_buffer_id milady_metal_buffer_offset(ggml_metal_buffer_id id, size_t extra) {
|
|
468
|
+
id.offs += extra;
|
|
469
|
+
return id;
|
|
439
470
|
}
|
|
440
471
|
|
|
441
|
-
|
|
472
|
+
int ggml_metal_op_attn_score_qjl(ggml_metal_op_t ctx, int idx) {
|
|
442
473
|
ggml_tensor * op = ctx->node(idx);
|
|
443
|
-
ggml_metal_library_t lib = ctx->lib;
|
|
444
|
-
ggml_metal_encoder_t enc = ctx->enc;
|
|
445
474
|
|
|
446
|
-
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
|
|
447
|
-
GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
|
|
448
|
-
|
|
449
|
-
const ggml_type tsrc0 = op->src[0]->type;
|
|
450
|
-
|
|
451
|
-
if (milady_is_quant_tbq_attn_only(tsrc0)) {
|
|
452
|
-
GGML_LOG_ERROR("milady_quant mul_mv: type %s exposes only attention-score kernels in the standalones; MUL_MAT requires an ATTN_SCORE op (Wave-7 work)\\n",
|
|
453
|
-
ggml_type_name(tsrc0));
|
|
454
|
-
GGML_ABORT("milady_quant: tbq* MUL_MAT not yet wired");
|
|
455
|
-
}
|
|
456
|
-
if (!milady_is_quant_mul_mv_supported(tsrc0)) {
|
|
457
|
-
GGML_LOG_ERROR("milady_quant mul_mv: type %s not a milady-quant type\\n", ggml_type_name(tsrc0));
|
|
458
|
-
GGML_ABORT("milady_quant mul_mv: unsupported type");
|
|
459
|
-
}
|
|
460
|
-
GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32 && "milady_quant mul_mv expects fp32 activation");
|
|
461
|
-
|
|
462
|
-
auto pipeline = ggml_metal_library_get_pipeline_milady_mul_mv(lib, tsrc0);
|
|
463
|
-
|
|
464
|
-
const int32_t n_rows = ne01;
|
|
465
|
-
|
|
466
|
-
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
|
467
|
-
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 0);
|
|
468
|
-
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
|
|
469
|
-
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
|
|
470
|
-
|
|
471
|
-
if (tsrc0 == GGML_TYPE_QJL1_256) {
|
|
472
|
-
milady_qjl_mv_args args = {
|
|
473
|
-
/* n_rows = */ (uint32_t) n_rows,
|
|
474
|
-
/* proj_dim = */ 256u,
|
|
475
|
-
};
|
|
476
|
-
ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 3);
|
|
477
|
-
} else { // GGML_TYPE_Q4_POLAR
|
|
478
|
-
milady_polar_mv_args args = {
|
|
479
|
-
/* n_rows = */ (uint32_t) n_rows,
|
|
480
|
-
/* head_dim = */ 128u,
|
|
481
|
-
/* use_qjl = */ 0u,
|
|
482
|
-
};
|
|
483
|
-
ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 3);
|
|
484
|
-
}
|
|
485
|
-
|
|
486
|
-
// 32 threads per row, one threadgroup per row. Matches the standalone
|
|
487
|
-
// dispatch shape verified by metal_verify (8/8 PASS).
|
|
488
|
-
ggml_metal_encoder_dispatch_threadgroups(enc, n_rows, 1, 1, 32, 1, 1);
|
|
489
|
-
return 1;
|
|
490
|
-
}
|
|
491
|
-
|
|
492
|
-
static int ggml_metal_op_get_rows_milady_quant(ggml_metal_op_t ctx, int idx) {
|
|
493
|
-
ggml_tensor * op = ctx->node(idx);
|
|
494
475
|
ggml_metal_library_t lib = ctx->lib;
|
|
495
476
|
ggml_metal_encoder_t enc = ctx->enc;
|
|
496
477
|
|
|
497
|
-
const
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
478
|
+
const ggml_tensor * q = op->src[0];
|
|
479
|
+
const ggml_tensor * pk = op->src[1];
|
|
480
|
+
|
|
481
|
+
GGML_ASSERT(q != nullptr);
|
|
482
|
+
GGML_ASSERT(pk != nullptr);
|
|
483
|
+
GGML_ASSERT(q->type == GGML_TYPE_F32);
|
|
484
|
+
GGML_ASSERT(pk->type == GGML_TYPE_QJL1_256);
|
|
485
|
+
GGML_ASSERT(op->type == GGML_TYPE_F32);
|
|
486
|
+
GGML_ASSERT(q->ne[0] == 256);
|
|
487
|
+
GGML_ASSERT(pk->ne[0] == 128);
|
|
488
|
+
|
|
489
|
+
const uint32_t n_heads = (uint32_t) q->ne[1];
|
|
490
|
+
const uint32_t n_kv_heads = (uint32_t) ((const int32_t *) op->op_params)[0];
|
|
491
|
+
const uint32_t n_tokens = (uint32_t) pk->ne[1];
|
|
492
|
+
const int64_t n_batch = q->ne[2];
|
|
493
|
+
const int64_t ne3 = q->ne[3];
|
|
494
|
+
|
|
495
|
+
GGML_ASSERT(n_kv_heads > 0);
|
|
496
|
+
GGML_ASSERT((n_heads % n_kv_heads) == 0);
|
|
497
|
+
GGML_ASSERT(pk->ne[2] == (int64_t) n_kv_heads);
|
|
498
|
+
GGML_ASSERT(pk->ne[3] == ne3);
|
|
499
|
+
GGML_ASSERT(op->ne[0] == (int64_t) n_tokens);
|
|
500
|
+
GGML_ASSERT(op->ne[1] == (int64_t) n_heads);
|
|
501
|
+
GGML_ASSERT(op->ne[2] == n_batch);
|
|
502
|
+
GGML_ASSERT(op->ne[3] == ne3);
|
|
503
|
+
GGML_ASSERT(pk->nb[1] == ggml_row_size(GGML_TYPE_QJL1_256, 128));
|
|
504
|
+
GGML_ASSERT(pk->nb[2] == (size_t) n_tokens * pk->nb[1]);
|
|
505
|
+
|
|
506
|
+
milady_qjl_score_args args = {
|
|
507
|
+
/* n_heads = */ n_heads,
|
|
508
|
+
/* n_kv_heads = */ n_kv_heads,
|
|
509
|
+
/* n_tokens = */ n_tokens,
|
|
510
|
+
/* proj_dim = */ 256u,
|
|
511
|
+
};
|
|
512
|
+
|
|
513
|
+
auto pipeline = ggml_metal_library_get_pipeline_attn_score_qjl(lib);
|
|
514
|
+
|
|
515
|
+
const ggml_metal_buffer_id q_base = ggml_metal_get_buffer_id(q);
|
|
516
|
+
const ggml_metal_buffer_id pk_base = ggml_metal_get_buffer_id(pk);
|
|
517
|
+
const ggml_metal_buffer_id dst_base = ggml_metal_get_buffer_id(op);
|
|
505
518
|
|
|
506
519
|
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
520
|
+
ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 3);
|
|
521
|
+
|
|
522
|
+
for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
|
523
|
+
const size_t q_i3 = (size_t) i3 * q->nb[3];
|
|
524
|
+
const size_t pk_i3 = (size_t) i3 * pk->nb[3];
|
|
525
|
+
const size_t dst_i3 = (size_t) i3 * op->nb[3];
|
|
526
|
+
for (int64_t ib = 0; ib < n_batch; ++ib) {
|
|
527
|
+
ggml_metal_encoder_set_buffer(enc, milady_metal_buffer_offset(q_base, q_i3 + (size_t) ib * q->nb[2]), 0);
|
|
528
|
+
ggml_metal_encoder_set_buffer(enc, milady_metal_buffer_offset(pk_base, pk_i3), 1);
|
|
529
|
+
ggml_metal_encoder_set_buffer(enc, milady_metal_buffer_offset(dst_base, dst_i3 + (size_t) ib * op->nb[2]), 2);
|
|
530
|
+
ggml_metal_encoder_dispatch_threadgroups(enc, (int) n_heads, (int) n_tokens, 1, 32, 1, 1);
|
|
531
|
+
}
|
|
517
532
|
}
|
|
518
533
|
|
|
519
|
-
// Single threadgroup, 32 threads, processes one block.
|
|
520
|
-
ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, 32, 1, 1);
|
|
521
534
|
return 1;
|
|
522
535
|
}
|
|
523
536
|
|
|
524
537
|
`;
|
|
525
|
-
let patched = original.replace(
|
|
538
|
+
let patched = original.replace(funcAnchor, opFunc + funcAnchor);
|
|
526
539
|
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
if (!patched.includes(muMatBodyAnchor)) {
|
|
536
|
-
throw new Error(`[metal-dispatch] ops.cpp mul_mat body anchor not found`);
|
|
540
|
+
const switchAnchor = ` case GGML_OP_FLASH_ATTN_EXT:
|
|
541
|
+
{
|
|
542
|
+
n_fuse = ggml_metal_op_flash_attn_ext(ctx, idx);
|
|
543
|
+
} break;`;
|
|
544
|
+
if (!patched.includes(switchAnchor)) {
|
|
545
|
+
throw new Error(
|
|
546
|
+
`[metal-qjl-attn] ops.cpp switch anchor not found at ${opsPath}; inspect encode switch.`,
|
|
547
|
+
);
|
|
537
548
|
}
|
|
538
|
-
const
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
}
|
|
548
|
-
(void) lib; (void) enc;
|
|
549
|
-
}`;
|
|
550
|
-
patched = patched.replace(muMatBodyAnchor, muMatEarly);
|
|
551
|
-
|
|
552
|
-
// (3) Early-out at the top of ggml_metal_op_get_rows().
|
|
553
|
-
const grBodyAnchor = `int ggml_metal_op_get_rows(ggml_metal_op_t ctx, int idx) {
|
|
554
|
-
ggml_tensor * op = ctx->node(idx);
|
|
549
|
+
const switchInsert = `${switchAnchor}
|
|
550
|
+
case GGML_OP_ATTN_SCORE_QJL:
|
|
551
|
+
{
|
|
552
|
+
n_fuse = ggml_metal_op_attn_score_qjl(ctx, idx);
|
|
553
|
+
} break;`;
|
|
554
|
+
patched = patched.replace(switchAnchor, switchInsert);
|
|
555
|
+
if (!dryRun) fs.writeFileSync(opsPath, patched, "utf8");
|
|
556
|
+
return { changed: !dryRun, path: opsPath };
|
|
557
|
+
}
|
|
555
558
|
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
559
|
+
function patchMetalQjlAttnSupportsOp(cacheDir, { dryRun }) {
|
|
560
|
+
const deviceMPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-device.m");
|
|
561
|
+
const original = fs.readFileSync(deviceMPath, "utf8");
|
|
562
|
+
if (original.includes(SENTINEL_QJL_ATTN)) {
|
|
563
|
+
return { changed: false, path: deviceMPath };
|
|
560
564
|
}
|
|
561
|
-
const
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
return ggml_metal_op_get_rows_milady_quant(ctx, idx);
|
|
568
|
-
}
|
|
569
|
-
if (tsrc0 == GGML_TYPE_TBQ3_0 || tsrc0 == GGML_TYPE_TBQ4_0 || tsrc0 == GGML_TYPE_TBQ3_TCQ) {
|
|
570
|
-
GGML_LOG_ERROR("get_rows: type %s has no standalone get_rows kernel (tbq* attention-only)\\n",
|
|
571
|
-
ggml_type_name(tsrc0));
|
|
572
|
-
GGML_ABORT("get_rows: tbq* not wired");
|
|
573
|
-
}
|
|
574
|
-
(void) lib; (void) enc;
|
|
575
|
-
}`;
|
|
576
|
-
patched = patched.replace(grBodyAnchor, grEarly);
|
|
577
|
-
|
|
578
|
-
if (patched === original) {
|
|
579
|
-
throw new Error(`[metal-dispatch] ops.cpp replace produced no change`);
|
|
565
|
+
const anchor = ` case GGML_OP_FLASH_ATTN_EXT:
|
|
566
|
+
// for new head sizes, add checks here`;
|
|
567
|
+
if (!original.includes(anchor)) {
|
|
568
|
+
throw new Error(
|
|
569
|
+
`[metal-qjl-attn] supports_op anchor not found at ${deviceMPath}; inspect GGML_OP_FLASH_ATTN_EXT branch.`,
|
|
570
|
+
);
|
|
580
571
|
}
|
|
581
|
-
|
|
582
|
-
|
|
572
|
+
const insert = ` case GGML_OP_ATTN_SCORE_QJL:
|
|
573
|
+
// ${SENTINEL_QJL_ATTN}
|
|
574
|
+
return has_simdgroup_reduction &&
|
|
575
|
+
op->type == GGML_TYPE_F32 &&
|
|
576
|
+
op->src[0] != NULL &&
|
|
577
|
+
op->src[1] != NULL &&
|
|
578
|
+
op->src[0]->type == GGML_TYPE_F32 &&
|
|
579
|
+
op->src[1]->type == GGML_TYPE_QJL1_256 &&
|
|
580
|
+
op->src[0]->ne[0] == 256 &&
|
|
581
|
+
op->src[1]->ne[0] == 128;
|
|
582
|
+
${anchor}`;
|
|
583
|
+
const patched = original.replace(anchor, insert);
|
|
584
|
+
if (!dryRun) fs.writeFileSync(deviceMPath, patched, "utf8");
|
|
585
|
+
return { changed: !dryRun, path: deviceMPath };
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
function patchMetalQjlAttnDispatch(cacheDir, { dryRun }) {
|
|
589
|
+
const header = patchMetalQjlAttnHeader(cacheDir, { dryRun });
|
|
590
|
+
const deviceCpp = patchMetalQjlAttnDeviceCpp(cacheDir, { dryRun });
|
|
591
|
+
const opsHeader = patchMetalQjlAttnOpsHeader(cacheDir, { dryRun });
|
|
592
|
+
const opsCpp = patchMetalQjlAttnOpsCpp(cacheDir, { dryRun });
|
|
593
|
+
const supportsOp = patchMetalQjlAttnSupportsOp(cacheDir, { dryRun });
|
|
594
|
+
return { header, deviceCpp, opsHeader, opsCpp, supportsOp };
|
|
583
595
|
}
|
|
584
596
|
|
|
585
597
|
export function patchMetalDispatch(cacheDir, { dryRun = false } = {}) {
|
|
@@ -613,7 +625,11 @@ export function patchMetalDispatch(cacheDir, { dryRun = false } = {}) {
|
|
|
613
625
|
} else {
|
|
614
626
|
console.log(`${dryRun ? "(dry-run) " : ""}${message}`);
|
|
615
627
|
}
|
|
616
|
-
|
|
628
|
+
const qjlAttn = patchMetalQjlAttnDispatch(cacheDir, { dryRun });
|
|
629
|
+
console.log(
|
|
630
|
+
`[metal-dispatch] ${dryRun ? "(dry-run) " : ""}wired dedicated GGML_OP_ATTN_SCORE_QJL dispatch via kernel_attn_score_qjl1_256`,
|
|
631
|
+
);
|
|
632
|
+
return { status: "qjl-attn-only", unsafePatchPresent: patchedFiles, qjlAttn };
|
|
617
633
|
}
|
|
618
634
|
|
|
619
635
|
// Public entry point used by build-llama-cpp-dflash.mjs.
|
|
@@ -625,6 +641,7 @@ export function patchMetalKernels(cacheDir, { dryRun = false } = {}) {
|
|
|
625
641
|
assertStandalonesPresent();
|
|
626
642
|
const copied = copyStandalonesIntoFork(cacheDir, { dryRun });
|
|
627
643
|
const cmake = patchMetalCMakeLists(cacheDir, { dryRun });
|
|
644
|
+
const embeddedLoader = patchEmbeddedMetallibLoader(cacheDir, { dryRun });
|
|
628
645
|
const dispatch = patchMetalDispatch(cacheDir, { dryRun });
|
|
629
646
|
console.log(
|
|
630
647
|
`[metal-kernels] ${dryRun ? "(dry-run) " : ""}wired ${copied.length} shipped Metal kernels: ${METAL_KERNEL_FILES.join(", ")}`,
|
|
@@ -632,5 +649,8 @@ export function patchMetalKernels(cacheDir, { dryRun = false } = {}) {
|
|
|
632
649
|
console.log(
|
|
633
650
|
`[metal-kernels] ${dryRun ? "(dry-run) " : ""}CMakeLists.txt: ${cmake.changed ? "patched" : "already-patched"} (${cmake.path})`,
|
|
634
651
|
);
|
|
635
|
-
|
|
652
|
+
console.log(
|
|
653
|
+
`[metal-kernels] ${dryRun ? "(dry-run) " : ""}embedded loader: ${embeddedLoader.changed ? "patched" : "already-patched"} (${embeddedLoader.path})`,
|
|
654
|
+
);
|
|
655
|
+
return { copied, cmake, embeddedLoader, dispatch };
|
|
636
656
|
}
|