npm - @elizaos/capacitor-llama - Versions diffs - 0.1.0 → 2.0.0-beta.1 - Mend

@elizaos/capacitor-llama 0.1.0 → 2.0.0-beta.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (17)

package/README.md +1 -1
package/dist/esm/capacitor-llama-adapter.js +274 -35
package/dist/esm/definitions.d.ts +69 -0
package/dist/esm/device-bridge-client.js +69 -8
package/dist/esm/kv-cache-resolver.d.ts +57 -0
package/dist/esm/kv-cache-resolver.js +74 -0
package/dist/esm/load-capacitor-llama.d.ts +1 -1
package/dist/esm/load-capacitor-llama.js +1 -1
package/dist/plugin.cjs.js +344 -44
package/dist/plugin.cjs.js.map +1 -1
package/dist/plugin.js +344 -44
package/dist/plugin.js.map +1 -1
package/package.json +7 -6
package/dist/esm/index.test.d.ts +0 -1
package/dist/esm/index.test.js +0 -264
package/dist/esm/web.d.ts +0 -11
package/dist/esm/web.js +0 -10

package/README.md CHANGED Viewed

@@ -43,7 +43,7 @@ transparently.
    registerCapacitorLlamaLoader(runtime);
    ```
-3. Run `npx cap sync` in `apps/app` to pick up the native plugin. iOS and
+3. Run `bunx cap sync` in `apps/app` to pick up the native plugin. iOS and
    Android builds will pull in `llama-cpp-capacitor`'s prebuilt native
    libraries automatically.

package/dist/esm/capacitor-llama-adapter.js CHANGED Viewed

@@ -1,4 +1,22 @@
 const CONTEXT_ID = 1;
+const DEFAULT_MAX_TOKENS = 256;
+/**
+ * Mobile-side parallel slot count. Mirrors `DEFAULT_CACHE_PARALLEL` in
+ * `cache-bridge.ts`; on devices with constrained KV memory we keep a small
+ * fixed pool so distinct cacheKey values still get prefix reuse without
+ * blowing memory.
+ */
+const MOBILE_PARALLEL = 4;
+/** FNV-1a 32-bit, deterministic across platforms — matches the agent side. */
+function deriveCacheSlotId(key) {
+    let hash = 0x811c9dc5;
+    for (let i = 0; i < key.length; i += 1) {
+        hash ^= key.charCodeAt(i);
+        hash = Math.imul(hash, 0x01000193);
+    }
+    return Math.abs(hash | 0) % MOBILE_PARALLEL;
+}
+const MOBILE_MAX_TOKENS_CAP = 256;
 function isObject(value) {
     return typeof value === "object" && value !== null;
 }
@@ -7,7 +25,8 @@ function isLlamaCppPluginLike(value) {
         typeof value.initContext === "function" &&
         typeof value.releaseContext === "function" &&
         typeof value.releaseAllContexts === "function" &&
-        typeof value.generateText === "function" &&
+        (typeof value.completion === "function" ||
+            typeof value.generateText === "function") &&
         typeof value.stopCompletion === "function" &&
         typeof value.addListener === "function");
 }
@@ -23,6 +42,42 @@ function resolveLlamaCppPlugin(mod) {
     }
     return null;
 }
+function toPlainLlamaCppPlugin(plugin) {
+    return {
+        initContext: (options) => plugin.initContext(options),
+        releaseContext: (options) => plugin.releaseContext(options),
+        releaseAllContexts: () => plugin.releaseAllContexts(),
+        getHardwareInfo: typeof plugin.getHardwareInfo === "function"
+            ? () => { var _a; return (_a = plugin.getHardwareInfo) === null || _a === void 0 ? void 0 : _a.call(plugin); }
+            : undefined,
+        completion: typeof plugin.completion === "function"
+            ? (options) => { var _a; return (_a = plugin.completion) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        generateText: typeof plugin.generateText === "function"
+            ? (options) => { var _a; return (_a = plugin.generateText) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        stopCompletion: (options) => plugin.stopCompletion(options),
+        embedding: typeof plugin.embedding === "function"
+            ? (options) => { var _a; return (_a = plugin.embedding) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        tokenize: typeof plugin.tokenize === "function"
+            ? (options) => { var _a; return (_a = plugin.tokenize) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        setCacheType: typeof plugin.setCacheType === "function"
+            ? (options) => { var _a; return (_a = plugin.setCacheType) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        setSpecType: typeof plugin.setSpecType === "function"
+            ? (options) => { var _a; return (_a = plugin.setSpecType) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        getNativeKernels: typeof plugin.getNativeKernels === "function"
+            ? () => {
+                var _a;
+                return (_a = plugin.getNativeKernels) === null || _a === void 0 ? void 0 : _a.call(plugin);
+            }
+            : undefined,
+        addListener: (event, listener) => plugin.addListener(event, listener),
+    };
+}
 function isCapacitorNative() {
     var _a;
     const cap = globalThis.Capacitor;
@@ -38,6 +93,110 @@ function detectPlatform() {
         return "android";
     return "web";
 }
+function resolveMobileMaxTokens(requested) {
+    if (!Number.isFinite(requested) || requested == null || requested <= 0) {
+        return DEFAULT_MAX_TOKENS;
+    }
+    return Math.min(Math.floor(requested), MOBILE_MAX_TOKENS_CAP);
+}
+function numberFromUnknown(value) {
+    if (typeof value !== "number" || !Number.isFinite(value))
+        return null;
+    return value;
+}
+function booleanFromUnknown(value) {
+    return typeof value === "boolean" ? value : undefined;
+}
+function stringFromUnknown(value) {
+    return typeof value === "string" && value.trim().length > 0
+        ? value.trim()
+        : undefined;
+}
+function fallbackHardwareInfo(platform = detectPlatform(), reason = "native hardware probe unavailable") {
+    var _a, _b;
+    const nav = globalThis.navigator;
+    const totalRamGb = (_a = numberFromUnknown(nav === null || nav === void 0 ? void 0 : nav.deviceMemory)) !== null && _a !== void 0 ? _a : 0;
+    const gpu = platform === "ios"
+        ? { backend: "metal", available: true }
+        : platform === "android"
+            ? { backend: "vulkan", available: true }
+            : null;
+    return {
+        platform,
+        deviceModel: platform,
+        totalRamGb,
+        availableRamGb: null,
+        cpuCores: (_b = nav === null || nav === void 0 ? void 0 : nav.hardwareConcurrency) !== null && _b !== void 0 ? _b : 0,
+        gpu,
+        gpuSupported: platform !== "web",
+        dflashSupported: false,
+        dflashReason: reason,
+        source: "adapter-fallback",
+        nativeKernels: [],
+        forkVariant: null,
+    };
+}
+function normalizeForkVariant(value) {
+    if (value === "buun-llama-cpp" || value === "stock-llama-cpp")
+        return value;
+    if (value === null)
+        return null;
+    return undefined;
+}
+function stringArrayFromUnknown(value) {
+    if (!Array.isArray(value))
+        return undefined;
+    const out = [];
+    for (const entry of value) {
+        if (typeof entry === "string" && entry.length > 0)
+            out.push(entry);
+    }
+    return out;
+}
+function normalizeHardwareInfo(value, platform = detectPlatform()) {
+    var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k;
+    const fallback = fallbackHardwareInfo(platform);
+    if (!value)
+        return fallback;
+    const totalRamGb = (_a = numberFromUnknown(value.totalRamGb)) !== null && _a !== void 0 ? _a : fallback.totalRamGb;
+    const availableRamGb = value.availableRamGb === null
+        ? null
+        : ((_b = numberFromUnknown(value.availableRamGb)) !== null && _b !== void 0 ? _b : fallback.availableRamGb);
+    const gpu = value.gpu && isObject(value.gpu)
+        ? {
+            backend: value.gpu.backend === "metal" ||
+                value.gpu.backend === "vulkan" ||
+                value.gpu.backend === "gpu-delegate"
+                ? value.gpu.backend
+                : ((_d = (_c = fallback.gpu) === null || _c === void 0 ? void 0 : _c.backend) !== null && _d !== void 0 ? _d : "gpu-delegate"),
+            available: Boolean(value.gpu.available),
+        }
+        : fallback.gpu;
+    return Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({ platform: value.platform === "ios" ||
+            value.platform === "android" ||
+            value.platform === "web"
+            ? value.platform
+            : platform, deviceModel: (_e = stringFromUnknown(value.deviceModel)) !== null && _e !== void 0 ? _e : fallback.deviceModel }, (stringFromUnknown(value.machineId)
+        ? { machineId: stringFromUnknown(value.machineId) }
+        : {})), (stringFromUnknown(value.osVersion)
+        ? { osVersion: stringFromUnknown(value.osVersion) }
+        : {})), (typeof value.isSimulator === "boolean"
+        ? { isSimulator: value.isSimulator }
+        : {})), { totalRamGb,
+        availableRamGb }), (numberFromUnknown(value.freeStorageGb) !== null
+        ? { freeStorageGb: numberFromUnknown(value.freeStorageGb) }
+        : {})), { cpuCores: (_f = numberFromUnknown(value.cpuCores)) !== null && _f !== void 0 ? _f : fallback.cpuCores, gpu, gpuSupported: (_g = booleanFromUnknown(value.gpuSupported)) !== null && _g !== void 0 ? _g : fallback.gpuSupported }), (typeof value.lowPowerMode === "boolean"
+        ? { lowPowerMode: value.lowPowerMode }
+        : {})), (value.thermalState === "nominal" ||
+        value.thermalState === "fair" ||
+        value.thermalState === "serious" ||
+        value.thermalState === "critical" ||
+        value.thermalState === "unknown"
+        ? { thermalState: value.thermalState }
+        : {})), { dflashSupported: Boolean(value.dflashSupported), dflashReason: (_h = stringFromUnknown(value.dflashReason)) !== null && _h !== void 0 ? _h : (value.dflashSupported
+            ? undefined
+            : "native plugin did not report DFlash support"), source: value.source === "native" ? "native" : "adapter-fallback", nativeKernels: (_j = stringArrayFromUnknown(value.nativeKernels)) !== null && _j !== void 0 ? _j : [], forkVariant: (_k = normalizeForkVariant(value.forkVariant)) !== null && _k !== void 0 ? _k : null });
+}
 class CapacitorLlamaAdapter {
     constructor() {
         this.plugin = null;
@@ -54,10 +213,11 @@ class CapacitorLlamaAdapter {
         if (this.pluginLoadPromise)
             return this.pluginLoadPromise;
         this.pluginLoadPromise = (async () => {
-            const plugin = resolveLlamaCppPlugin(await import("llama-cpp-capacitor"));
-            if (!plugin) {
-                throw new Error("llama-cpp-capacitor did not expose an initContext method");
+            const nativePlugin = resolveLlamaCppPlugin(await import("llama-cpp-capacitor"));
+            if (!nativePlugin) {
+                throw new Error("llama-cpp-capacitor did not expose the native LlamaCpp methods");
             }
+            const plugin = toPlainLlamaCppPlugin(nativePlugin);
             const tokenListenerHandle = await plugin.addListener("@LlamaCpp_onToken", (data) => {
                 var _a, _b;
                 const token = (_b = (_a = data.tokenResult) === null || _a === void 0 ? void 0 : _a.token) !== null && _b !== void 0 ? _b : data.token;
@@ -86,19 +246,73 @@ class CapacitorLlamaAdapter {
         }
     }
     async getHardwareInfo() {
-        var _a;
+        var _a, _b, _c;
         const platform = detectPlatform();
-        const nav = globalThis
-            .navigator;
-        return {
-            platform,
-            deviceModel: platform,
-            totalRamGb: 0,
-            availableRamGb: null,
-            cpuCores: (_a = nav === null || nav === void 0 ? void 0 : nav.hardwareConcurrency) !== null && _a !== void 0 ? _a : 0,
-            gpu: null,
-            gpuSupported: platform !== "web",
-        };
+        if (!isCapacitorNative())
+            return fallbackHardwareInfo(platform);
+        try {
+            const plugin = await this.loadPlugin();
+            const baseInfo = normalizeHardwareInfo(await ((_a = plugin.getHardwareInfo) === null || _a === void 0 ? void 0 : _a.call(plugin)), platform);
+            // Probe fork-specific kernels through the optional bridge method.
+            // Stock builds and older fork builds without the bridge fall back
+            // to the empty list + "stock-llama-cpp" variant marker.
+            let nativeKernels = (_b = baseInfo.nativeKernels) !== null && _b !== void 0 ? _b : [];
+            let forkVariant = (_c = baseInfo.forkVariant) !== null && _c !== void 0 ? _c : "stock-llama-cpp";
+            if (typeof plugin.getNativeKernels === "function") {
+                try {
+                    const probe = await plugin.getNativeKernels();
+                    const kernels = stringArrayFromUnknown(probe === null || probe === void 0 ? void 0 : probe.kernels);
+                    if (kernels)
+                        nativeKernels = kernels;
+                    const variant = normalizeForkVariant(probe === null || probe === void 0 ? void 0 : probe.variant);
+                    if (variant !== undefined)
+                        forkVariant = variant;
+                    else if (nativeKernels.length > 0)
+                        forkVariant = "buun-llama-cpp";
+                }
+                catch (err) {
+                    const message = err instanceof Error ? err.message : String(err);
+                    console.debug("[capacitor-llama] getNativeKernels probe failed", {
+                        error: message,
+                    });
+                }
+            }
+            return Object.assign(Object.assign({}, baseInfo), { nativeKernels,
+                forkVariant });
+        }
+        catch (error) {
+            return fallbackHardwareInfo(platform, error instanceof Error ? error.message : "native hardware probe failed");
+        }
+    }
+    async setCacheType(typeK, typeV) {
+        if (!isCapacitorNative()) {
+            console.warn("[capacitor-llama] setCacheType called on non-native platform; ignoring");
+            return;
+        }
+        const plugin = await this.loadPlugin();
+        if (typeof plugin.setCacheType !== "function") {
+            console.warn("[capacitor-llama] underlying plugin does not expose setCacheType (likely stock build); cache types must be passed via load() params instead");
+            return;
+        }
+        await plugin.setCacheType({ cacheTypeK: typeK, cacheTypeV: typeV });
+    }
+    async setSpecType(args) {
+        if (!isCapacitorNative()) {
+            console.warn("[capacitor-llama] setSpecType called on non-native platform; ignoring");
+            return;
+        }
+        const plugin = await this.loadPlugin();
+        if (typeof plugin.setSpecType !== "function") {
+            console.warn("[capacitor-llama] underlying plugin does not expose setSpecType (likely stock build); pass draft_model + draft_min/max via load() instead");
+            return;
+        }
+        await plugin.setSpecType({
+            target: args.target,
+            drafter: args.drafter,
+            specType: args.specType,
+            draftMin: args.draftMin,
+            draftMax: args.draftMax,
+        });
     }
     async isLoaded() {
         return {
@@ -110,7 +324,7 @@ class CapacitorLlamaAdapter {
         return this.loadedPath;
     }
     async load(options) {
-        var _a, _b;
+        var _a, _b, _c, _d, _e, _f;
         if (!isCapacitorNative()) {
             throw new Error("capacitor-llama is only available on iOS and Android builds");
         }
@@ -119,15 +333,21 @@ class CapacitorLlamaAdapter {
             await plugin.releaseAllContexts();
             this.loadedPath = null;
         }
+        const speculativeSamples = options.mobileSpeculative
+            ? Math.min((_b = (_a = options.speculativeSamples) !== null && _a !== void 0 ? _a : options.draftMax) !== null && _b !== void 0 ? _b : 3, 4)
+            : ((_c = options.speculativeSamples) !== null && _c !== void 0 ? _c : 3);
+        const params = Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({ model: options.modelPath, n_ctx: (_d = options.contextSize) !== null && _d !== void 0 ? _d : 4096, n_gpu_layers: options.useGpu === false ? 0 : 99, n_threads: (_e = options.maxThreads) !== null && _e !== void 0 ? _e : 0, use_mmap: true, flash_attn: options.useGpu !== false, n_batch: options.mobileSpeculative ? 128 : 512, n_ubatch: options.mobileSpeculative ? 64 : 512 }, (options.draftModelPath
+            ? {
+                draft_model: options.draftModelPath,
+                speculative_samples: speculativeSamples,
+                mobile_speculative: (_f = options.mobileSpeculative) !== null && _f !== void 0 ? _f : true,
+            }
+            : {})), (options.draftContextSize
+            ? { n_ctx_draft: options.draftContextSize }
+            : {})), (options.draftMin ? { draft_min: options.draftMin } : {})), (options.draftMax ? { draft_max: options.draftMax } : {})), (options.cacheTypeK ? { cache_type_k: options.cacheTypeK } : {})), (options.cacheTypeV ? { cache_type_v: options.cacheTypeV } : {})), (options.disableThinking ? { reasoning: false } : {}));
         await plugin.initContext({
             contextId: CONTEXT_ID,
-            params: {
-                model: options.modelPath,
-                n_ctx: (_a = options.contextSize) !== null && _a !== void 0 ? _a : 4096,
-                n_gpu_layers: options.useGpu === false ? 0 : 99,
-                n_threads: (_b = options.maxThreads) !== null && _b !== void 0 ? _b : 0,
-                use_mmap: true,
-            },
+            params,
         });
         this.loadedPath = options.modelPath;
     }
@@ -143,15 +363,15 @@ class CapacitorLlamaAdapter {
         this.loadedPath = null;
     }
     async generate(options) {
-        var _a, _b, _c, _d;
+        var _a, _b, _c, _d, _e;
         if (!this.plugin || !this.loadedPath) {
             throw new Error("No model loaded. Call load() first.");
         }
         this.tokenIndex = 0;
         const params = {
-            n_predict: (_a = options.maxTokens) !== null && _a !== void 0 ? _a : 2048,
-            temperature: (_b = options.temperature) !== null && _b !== void 0 ? _b : 0.7,
-            top_p: (_c = options.topP) !== null && _c !== void 0 ? _c : 0.9,
+            n_predict: resolveMobileMaxTokens(options.maxTokens),
+            temperature: (_a = options.temperature) !== null && _a !== void 0 ? _a : 0.7,
+            top_p: (_b = options.topP) !== null && _b !== void 0 ? _b : 0.9,
         };
         if (options.stopSequences && options.stopSequences.length > 0) {
             params.stop = options.stopSequences;
@@ -159,13 +379,32 @@ class CapacitorLlamaAdapter {
         if (options.stream) {
             params.emit_partial_completion = true;
         }
+        // Cache key threading: surface the slot id derived from
+        // ProviderCachePlan.promptCacheKey to the native side. Stock
+        // llama-cpp-capacitor builds ignore the field; the patched fork build
+        // reads it via setCacheType / completion params and pins KV slots.
+        if (options.cacheKey) {
+            const slotId = deriveCacheSlotId(options.cacheKey);
+            params.cache_prompt =
+                true;
+            params.slot_id =
+                slotId;
+        }
         const started = Date.now();
-        const result = await this.plugin.generateText({
-            contextId: CONTEXT_ID,
-            prompt: options.prompt,
-            params,
-        });
-        const duration = ((_d = result.timings) === null || _d === void 0 ? void 0 : _d.predicted_ms) != null
+        const result = typeof this.plugin.completion === "function"
+            ? await this.plugin.completion({
+                contextId: CONTEXT_ID,
+                params: Object.assign({ prompt: options.prompt, emit_partial_completion: Boolean(params.emit_partial_completion) }, params),
+            })
+            : await ((_d = (_c = this.plugin).generateText) === null || _d === void 0 ? void 0 : _d.call(_c, {
+                contextId: CONTEXT_ID,
+                prompt: options.prompt,
+                params,
+            }));
+        if (!result) {
+            throw new Error("llama-cpp-capacitor did not expose completion() or generateText()");
+        }
+        const duration = ((_e = result.timings) === null || _e === void 0 ? void 0 : _e.predicted_ms) != null
             ? Math.round(result.timings.predicted_ms)
             : Date.now() - started;
         return {
@@ -238,7 +477,7 @@ export function registerCapacitorLlamaLoader(runtime) {
         return;
     runtime.registerService("localInferenceLoader", {
         async loadModel(args) {
-            await capacitorLlama.load({ modelPath: args.modelPath });
+            await capacitorLlama.load(args);
         },
         async unloadModel() {
             await capacitorLlama.unload();

package/dist/esm/definitions.d.ts CHANGED Viewed

@@ -20,6 +20,22 @@ export interface LoadOptions {
     useGpu?: boolean;
     /** Cap on native thread count; native layer picks a reasonable default otherwise. */
     maxThreads?: number;
+    /** Optional draft GGUF for native speculative decoding builds. */
+    draftModelPath?: string;
+    /** Context window for the draft model when supported by the native build. */
+    draftContextSize?: number;
+    /** Lower/upper speculative draft bounds for fork builds that expose them. */
+    draftMin?: number;
+    draftMax?: number;
+    /** Number of draft tokens/samples when the native runtime supports it. */
+    speculativeSamples?: number;
+    /** Mobile runtimes may enable a lower-memory speculative path. */
+    mobileSpeculative?: boolean;
+    /** Optional KV cache types for fork builds such as TurboQuant. */
+    cacheTypeK?: string;
+    cacheTypeV?: string;
+    /** Eliza-1 DFlash drafters are trained for non-thinking outputs. */
+    disableThinking?: boolean;
 }
 export interface GenerateOptions {
     prompt: string;
@@ -29,6 +45,13 @@ export interface GenerateOptions {
     stopSequences?: string[];
     /** When true, token events fire on the "token" listener. */
     stream?: boolean;
+    /**
+     * Forwarded promptCacheKey from `ProviderCachePlan`. Native plugins
+     * that support prefix caching should derive a slot id from this and
+     * keep KV warm for repeated calls with the same key. Plugins without
+     * cache support ignore the field; behavior is unchanged.
+     */
+    cacheKey?: string;
 }
 export interface GenerateResult {
     text: string;
@@ -40,8 +63,13 @@ export interface HardwareInfo {
     platform: "ios" | "android" | "web";
     /** Human-readable device model when the OS exposes one. */
     deviceModel: string;
+    /** Stable OS machine identifier when available, e.g. iPhone16,2. */
+    machineId?: string;
+    osVersion?: string;
+    isSimulator?: boolean;
     totalRamGb: number;
     availableRamGb: number | null;
+    freeStorageGb?: number | null;
     cpuCores: number;
     gpu: {
         backend: "metal" | "vulkan" | "gpu-delegate";
@@ -49,6 +77,25 @@ export interface HardwareInfo {
     } | null;
     /** True when the underlying llama.cpp build has GPU support compiled in. */
     gpuSupported: boolean;
+    lowPowerMode?: boolean;
+    thermalState?: "nominal" | "fair" | "serious" | "critical" | "unknown";
+    /** True only when the native build can load a drafter and run DFlash/spec decode. */
+    dflashSupported?: boolean;
+    dflashReason?: string;
+    source?: "native" | "adapter-fallback";
+    /**
+     * Names of fork-specific kernels compiled into the loaded native library
+     * (e.g. "turbo3", "turbo4", "turbo3_tcq", "dflash", "qjl_full"). Empty
+     * when the loaded build is stock llama.cpp or when no native lib is loaded.
+     * Surfaced from the native bridge via a `kernels.json` manifest shipped
+     * alongside the .so.
+     */
+    nativeKernels?: string[];
+    /**
+     * Which native llama.cpp variant is loaded. `null` when the plugin
+     * isn't loaded at all (web fallback or native lib failed to load).
+     */
+    forkVariant?: "buun-llama-cpp" | "stock-llama-cpp" | null;
 }
 export interface EmbedOptions {
     /** Raw text to embed. The adapter forwards this verbatim to the native plugin. */
@@ -70,6 +117,16 @@ export interface EmbedResult {
      */
     tokens: number;
 }
+export interface SetSpecTypeArgs {
+    /** Path to the target (large) GGUF. */
+    target: string;
+    /** Path to the drafter (small) GGUF. */
+    drafter: string;
+    /** Currently only "dflash" is honoured by the buun fork. */
+    specType: "dflash";
+    draftMin: number;
+    draftMax: number;
+}
 export interface LlamaAdapter {
     getHardwareInfo(): Promise<HardwareInfo>;
     isLoaded(): Promise<{
@@ -89,4 +146,16 @@ export interface LlamaAdapter {
      * does not expose an embedding method on the active platform.
      */
     embed(options: EmbedOptions): Promise<EmbedResult>;
+    /**
+     * Configure the KV cache types used by the next loaded context. Only
+     * the buun-llama-cpp fork honours TurboQuant cache types like
+     * `q4_tq3` / `q4_tq4`. Stock builds will warn-and-no-op when the
+     * underlying plugin doesn't expose the bridge method.
+     */
+    setCacheType?(typeK: string, typeV: string): Promise<void>;
+    /**
+     * Configure DFlash speculative decoding for the next loaded context.
+     * Stock builds without speculative bridge methods warn-and-no-op.
+     */
+    setSpecType?(args: SetSpecTypeArgs): Promise<void>;
 }

package/dist/esm/device-bridge-client.js CHANGED Viewed

@@ -14,6 +14,7 @@
 import { loadCapacitorLlama } from "./load-capacitor-llama";
 const INITIAL_BACKOFF_MS = 1000;
 const MAX_BACKOFF_MS = 30000;
+const CONNECT_TIMEOUT_MS = 5000;
 export class DeviceBridgeClient {
     constructor(config) {
         this.socket = null;
@@ -58,7 +59,27 @@ export class DeviceBridgeClient {
             return;
         }
         this.socket = ws;
+        let timedOut = false;
+        const connectTimeout = setTimeout(() => {
+            var _a, _b;
+            if (this.stopped ||
+                this.socket !== ws ||
+                ws.readyState !== WebSocket.CONNECTING) {
+                return;
+            }
+            timedOut = true;
+            this.socket = null;
+            (_b = (_a = this.config).onStateChange) === null || _b === void 0 ? void 0 : _b.call(_a, "error", "websocket connect timeout");
+            try {
+                ws.close();
+            }
+            catch (_c) {
+                /* best effort */
+            }
+            this.scheduleReconnect();
+        }, CONNECT_TIMEOUT_MS);
         ws.onopen = () => {
+            clearTimeout(connectTimeout);
             this.reconnectAttempt = 0;
             void this.sendRegister(ws);
         };
@@ -78,8 +99,12 @@ export class DeviceBridgeClient {
         };
         ws.onclose = () => {
             var _a, _b;
-            this.socket = null;
+            clearTimeout(connectTimeout);
+            if (this.socket === ws)
+                this.socket = null;
             (_b = (_a = this.config).onStateChange) === null || _b === void 0 ? void 0 : _b.call(_a, "disconnected");
+            if (timedOut)
+                return;
             this.scheduleReconnect();
         };
     }
@@ -107,13 +132,17 @@ export class DeviceBridgeClient {
             payload: {
                 deviceId: this.config.deviceId,
                 pairingToken: this.config.pairingToken,
-                capabilities: {
-                    platform: hardware.platform,
-                    deviceModel: hardware.deviceModel,
-                    totalRamGb: hardware.totalRamGb,
-                    cpuCores: hardware.cpuCores,
-                    gpu: hardware.gpu,
-                },
+                capabilities: Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({ platform: hardware.platform, deviceModel: hardware.deviceModel }, (hardware.machineId ? { machineId: hardware.machineId } : {})), (hardware.osVersion ? { osVersion: hardware.osVersion } : {})), (typeof hardware.isSimulator === "boolean"
+                    ? { isSimulator: hardware.isSimulator }
+                    : {})), { totalRamGb: hardware.totalRamGb, availableRamGb: hardware.availableRamGb }), (typeof hardware.freeStorageGb === "number"
+                    ? { freeStorageGb: hardware.freeStorageGb }
+                    : {})), { cpuCores: hardware.cpuCores, gpu: hardware.gpu, gpuSupported: hardware.gpuSupported }), (typeof hardware.lowPowerMode === "boolean"
+                    ? { lowPowerMode: hardware.lowPowerMode }
+                    : {})), (hardware.thermalState
+                    ? { thermalState: hardware.thermalState }
+                    : {})), { dflashSupported: hardware.dflashSupported }), (hardware.dflashReason
+                    ? { dflashReason: hardware.dflashReason }
+                    : {})),
                 loadedPath: loaded.modelPath,
             },
         };
@@ -137,6 +166,16 @@ export class DeviceBridgeClient {
                     modelPath: msg.modelPath,
                     contextSize: msg.contextSize,
                     useGpu: msg.useGpu,
+                    maxThreads: msg.maxThreads,
+                    draftModelPath: msg.draftModelPath,
+                    draftContextSize: msg.draftContextSize,
+                    draftMin: msg.draftMin,
+                    draftMax: msg.draftMax,
+                    speculativeSamples: msg.speculativeSamples,
+                    mobileSpeculative: msg.mobileSpeculative,
+                    cacheTypeK: msg.cacheTypeK,
+                    cacheTypeV: msg.cacheTypeV,
+                    disableThinking: msg.disableThinking,
                 });
                 this.send(ws, {
                     type: "loadResult",
@@ -204,6 +243,28 @@ export class DeviceBridgeClient {
             }
             return;
         }
+        if (msg.type === "embed") {
+            try {
+                const capacitorLlama = await loadCapacitorLlama();
+                const result = await capacitorLlama.embed({ input: msg.input });
+                this.send(ws, {
+                    type: "embedResult",
+                    correlationId: msg.correlationId,
+                    ok: true,
+                    embedding: result.embedding,
+                    tokens: result.tokens,
+                });
+            }
+            catch (err) {
+                this.send(ws, {
+                    type: "embedResult",
+                    correlationId: msg.correlationId,
+                    ok: false,
+                    error: err instanceof Error ? err.message : String(err),
+                });
+            }
+            return;
+        }
     }
 }
 /**