npm - @elizaos/capacitor-llama - Versions diffs - 0.1.0 → 2.0.3-beta.2 - Mend

@elizaos/capacitor-llama 0.1.0 → 2.0.3-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/LICENSE +21 -0
package/README.md +64 -43
package/dist/esm/capacitor-llama-adapter.d.ts +92 -1
package/dist/esm/capacitor-llama-adapter.js +705 -64
package/dist/esm/definitions.d.ts +214 -0
package/dist/esm/device-bridge-client.d.ts +17 -0
package/dist/esm/device-bridge-client.js +210 -15
package/dist/esm/index.d.ts +3 -2
package/dist/esm/index.js +3 -2
package/dist/esm/kv-cache-resolver.d.ts +57 -0
package/dist/esm/kv-cache-resolver.js +74 -0
package/dist/esm/load-capacitor-llama.d.ts +1 -1
package/dist/esm/load-capacitor-llama.js +1 -1
package/dist/esm/token-tree-codec.d.ts +51 -0
package/dist/esm/token-tree-codec.js +217 -0
package/dist/plugin.cjs.js +1136 -79
package/dist/plugin.cjs.js.map +1 -1
package/dist/plugin.js +1136 -79
package/dist/plugin.js.map +1 -1
package/package.json +15 -10
package/dist/esm/index.test.d.ts +0 -1
package/dist/esm/index.test.js +0 -264
package/dist/esm/web.d.ts +0 -11
package/dist/esm/web.js +0 -10

package/dist/esm/capacitor-llama-adapter.js CHANGED Viewed

@@ -1,4 +1,46 @@
-const CONTEXT_ID = 1;
+var __asyncValues = (this && this.__asyncValues) || function (o) {
+    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
+    var m = o[Symbol.asyncIterator], i;
+    return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
+    function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
+    function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
+};
+var __await = (this && this.__await) || function (v) { return this instanceof __await ? (this.v = v, this) : new __await(v); }
+var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) {
+    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
+    var g = generator.apply(thisArg, _arguments || []), i, q = [];
+    return i = Object.create((typeof AsyncIterator === "function" ? AsyncIterator : Object).prototype), verb("next"), verb("throw"), verb("return", awaitReturn), i[Symbol.asyncIterator] = function () { return this; }, i;
+    function awaitReturn(f) { return function (v) { return Promise.resolve(v).then(f, reject); }; }
+    function verb(n, f) { if (g[n]) { i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; if (f) i[n] = f(i[n]); } }
+    function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }
+    function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }
+    function fulfill(value) { resume("next", value); }
+    function reject(value) { resume("throw", value); }
+    function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }
+};
+// completion(contextId=X) must run against the model that was initContext'd
+// with X — every adapter instance owns its own monotonically-allocated id so
+// the chat LLM and the embedding model never collide on the same native
+// context.
+let nextContextId = 1;
+const DEFAULT_MAX_TOKENS = 256;
+/**
+ * Mobile-side parallel slot count. Mirrors `DEFAULT_CACHE_PARALLEL` in
+ * `cache-bridge.ts`; on devices with constrained KV memory we keep a small
+ * fixed pool so distinct cacheKey values still get prefix reuse without
+ * blowing memory.
+ */
+const MOBILE_PARALLEL = 4;
+/** FNV-1a 32-bit, deterministic across platforms — matches the agent side. */
+function deriveCacheSlotId(key) {
+    let hash = 0x811c9dc5;
+    for (let i = 0; i < key.length; i += 1) {
+        hash ^= key.charCodeAt(i);
+        hash = Math.imul(hash, 0x01000193);
+    }
+    return Math.abs(hash | 0) % MOBILE_PARALLEL;
+}
+const MOBILE_MAX_TOKENS_CAP = 256;
 function isObject(value) {
     return typeof value === "object" && value !== null;
 }
@@ -7,7 +49,8 @@ function isLlamaCppPluginLike(value) {
         typeof value.initContext === "function" &&
         typeof value.releaseContext === "function" &&
         typeof value.releaseAllContexts === "function" &&
-        typeof value.generateText === "function" &&
+        (typeof value.completion === "function" ||
+            typeof value.generateText === "function") &&
         typeof value.stopCompletion === "function" &&
         typeof value.addListener === "function");
 }
@@ -23,6 +66,42 @@ function resolveLlamaCppPlugin(mod) {
     }
     return null;
 }
+function toPlainLlamaCppPlugin(plugin) {
+    return {
+        initContext: (options) => plugin.initContext(options),
+        releaseContext: (options) => plugin.releaseContext(options),
+        releaseAllContexts: () => plugin.releaseAllContexts(),
+        getHardwareInfo: typeof plugin.getHardwareInfo === "function"
+            ? () => { var _a; return (_a = plugin.getHardwareInfo) === null || _a === void 0 ? void 0 : _a.call(plugin); }
+            : undefined,
+        completion: typeof plugin.completion === "function"
+            ? (options) => { var _a; return (_a = plugin.completion) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        generateText: typeof plugin.generateText === "function"
+            ? (options) => { var _a; return (_a = plugin.generateText) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        stopCompletion: (options) => plugin.stopCompletion(options),
+        embedding: typeof plugin.embedding === "function"
+            ? (options) => { var _a; return (_a = plugin.embedding) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        tokenize: typeof plugin.tokenize === "function"
+            ? (options) => { var _a; return (_a = plugin.tokenize) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        setCacheType: typeof plugin.setCacheType === "function"
+            ? (options) => { var _a; return (_a = plugin.setCacheType) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        setSpecType: typeof plugin.setSpecType === "function"
+            ? (options) => { var _a; return (_a = plugin.setSpecType) === null || _a === void 0 ? void 0 : _a.call(plugin, options); }
+            : undefined,
+        getNativeKernels: typeof plugin.getNativeKernels === "function"
+            ? () => {
+                var _a;
+                return (_a = plugin.getNativeKernels) === null || _a === void 0 ? void 0 : _a.call(plugin);
+            }
+            : undefined,
+        addListener: (event, listener) => plugin.addListener(event, listener),
+    };
+}
 function isCapacitorNative() {
     var _a;
     const cap = globalThis.Capacitor;
@@ -38,15 +117,145 @@ function detectPlatform() {
         return "android";
     return "web";
 }
-class CapacitorLlamaAdapter {
+function resolveMobileMaxTokens(requested) {
+    if (!Number.isFinite(requested) || requested == null || requested <= 0) {
+        return DEFAULT_MAX_TOKENS;
+    }
+    return Math.min(Math.floor(requested), MOBILE_MAX_TOKENS_CAP);
+}
+function numberFromUnknown(value) {
+    if (typeof value !== "number" || !Number.isFinite(value))
+        return null;
+    return value;
+}
+function booleanFromUnknown(value) {
+    return typeof value === "boolean" ? value : undefined;
+}
+function stringFromUnknown(value) {
+    return typeof value === "string" && value.trim().length > 0
+        ? value.trim()
+        : undefined;
+}
+function fallbackHardwareInfo(platform = detectPlatform(), reason = "native hardware probe unavailable") {
+    var _a, _b;
+    const nav = globalThis.navigator;
+    const totalRamGb = (_a = numberFromUnknown(nav === null || nav === void 0 ? void 0 : nav.deviceMemory)) !== null && _a !== void 0 ? _a : 0;
+    const gpu = platform === "ios"
+        ? { backend: "metal", available: true }
+        : null;
+    return {
+        platform,
+        deviceModel: platform,
+        totalRamGb,
+        availableRamGb: null,
+        cpuCores: (_b = nav === null || nav === void 0 ? void 0 : nav.hardwareConcurrency) !== null && _b !== void 0 ? _b : 0,
+        gpu,
+        gpuSupported: platform === "ios",
+        mtpSupported: false,
+        mtpReason: reason,
+        source: "adapter-fallback",
+        nativeKernels: [],
+        forkVariant: null,
+    };
+}
+function defaultNativeGpuEnabled(platform = detectPlatform()) {
+    // iOS builds use the Metal-capable native path by default. Android's current
+    // Capacitor wrapper is CPU-only unless a forked Vulkan bridge explicitly opts
+    // in, so the safe production default is CPU.
+    return platform === "ios";
+}
+function resolveNativeGpuEnabled(useGpu) {
+    return typeof useGpu === "boolean" ? useGpu : defaultNativeGpuEnabled();
+}
+function normalizeForkVariant(value) {
+    if (value === "buun-llama-cpp" || value === "stock-llama-cpp")
+        return value;
+    if (value === null)
+        return null;
+    return undefined;
+}
+function stringArrayFromUnknown(value) {
+    if (!Array.isArray(value))
+        return undefined;
+    const out = [];
+    for (const entry of value) {
+        if (typeof entry === "string" && entry.length > 0)
+            out.push(entry);
+    }
+    return out;
+}
+function normalizeHardwareInfo(value, platform = detectPlatform()) {
+    var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k;
+    const fallback = fallbackHardwareInfo(platform);
+    if (!value)
+        return fallback;
+    const totalRamGb = (_a = numberFromUnknown(value.totalRamGb)) !== null && _a !== void 0 ? _a : fallback.totalRamGb;
+    const availableRamGb = value.availableRamGb === null
+        ? null
+        : ((_b = numberFromUnknown(value.availableRamGb)) !== null && _b !== void 0 ? _b : fallback.availableRamGb);
+    const gpu = value.gpu && isObject(value.gpu)
+        ? {
+            backend: value.gpu.backend === "metal" ||
+                value.gpu.backend === "vulkan" ||
+                value.gpu.backend === "gpu-delegate"
+                ? value.gpu.backend
+                : ((_d = (_c = fallback.gpu) === null || _c === void 0 ? void 0 : _c.backend) !== null && _d !== void 0 ? _d : "gpu-delegate"),
+            available: Boolean(value.gpu.available),
+        }
+        : fallback.gpu;
+    return Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({ platform: value.platform === "ios" ||
+            value.platform === "android" ||
+            value.platform === "web"
+            ? value.platform
+            : platform, deviceModel: (_e = stringFromUnknown(value.deviceModel)) !== null && _e !== void 0 ? _e : fallback.deviceModel }, (stringFromUnknown(value.machineId)
+        ? { machineId: stringFromUnknown(value.machineId) }
+        : {})), (stringFromUnknown(value.osVersion)
+        ? { osVersion: stringFromUnknown(value.osVersion) }
+        : {})), (typeof value.isSimulator === "boolean"
+        ? { isSimulator: value.isSimulator }
+        : {})), { totalRamGb,
+        availableRamGb }), (numberFromUnknown(value.freeStorageGb) !== null
+        ? { freeStorageGb: numberFromUnknown(value.freeStorageGb) }
+        : {})), { cpuCores: (_f = numberFromUnknown(value.cpuCores)) !== null && _f !== void 0 ? _f : fallback.cpuCores, gpu, gpuSupported: (_g = booleanFromUnknown(value.gpuSupported)) !== null && _g !== void 0 ? _g : fallback.gpuSupported }), (typeof value.lowPowerMode === "boolean"
+        ? { lowPowerMode: value.lowPowerMode }
+        : {})), (value.thermalState === "nominal" ||
+        value.thermalState === "fair" ||
+        value.thermalState === "serious" ||
+        value.thermalState === "critical" ||
+        value.thermalState === "unknown"
+        ? { thermalState: value.thermalState }
+        : {})), { mtpSupported: Boolean(value.mtpSupported), mtpReason: (_h = stringFromUnknown(value.mtpReason)) !== null && _h !== void 0 ? _h : (value.mtpSupported
+            ? undefined
+            : "native plugin did not report MTP support"), source: value.source === "native" ? "native" : "adapter-fallback", nativeKernels: (_j = stringArrayFromUnknown(value.nativeKernels)) !== null && _j !== void 0 ? _j : [], forkVariant: (_k = normalizeForkVariant(value.forkVariant)) !== null && _k !== void 0 ? _k : null });
+}
+export class CapacitorLlamaAdapter {
     constructor() {
         this.plugin = null;
         /** Cached loader promise so concurrent `load()` calls don't race to register duplicate listeners. */
         this.pluginLoadPromise = null;
         this.loadedPath = null;
+        /**
+         * Native context id this adapter owns. Allocated lazily on first `load()`
+         * from the process-wide `nextContextId` counter so distinct adapter
+         * instances never share a context — see the module-level invariant comment.
+         */
+        this.contextId = null;
         this.tokenIndex = 0;
         this.tokenListeners = new Set();
         this.pluginListenerHandle = null;
+        /**
+         * Latest native completion stats captured by `generateStream`. Read by
+         * the `generate()` wrapper to populate `GenerateResult` without
+         * re-issuing the native call. Cleared at the start of every
+         * `generateStream` invocation.
+         */
+        this.lastCompletionStats = null;
+    }
+    requireContextId() {
+        if (this.contextId === null) {
+            throw new Error("No model loaded. Call load() first.");
+        }
+        return this.contextId;
     }
     async loadPlugin() {
         if (this.plugin)
@@ -54,10 +263,11 @@ class CapacitorLlamaAdapter {
         if (this.pluginLoadPromise)
             return this.pluginLoadPromise;
         this.pluginLoadPromise = (async () => {
-            const plugin = resolveLlamaCppPlugin(await import("llama-cpp-capacitor"));
-            if (!plugin) {
-                throw new Error("llama-cpp-capacitor did not expose an initContext method");
+            const nativePlugin = resolveLlamaCppPlugin(await import("llama-cpp-capacitor"));
+            if (!nativePlugin) {
+                throw new Error("llama-cpp-capacitor did not expose the native LlamaCpp methods");
             }
+            const plugin = toPlainLlamaCppPlugin(nativePlugin);
             const tokenListenerHandle = await plugin.addListener("@LlamaCpp_onToken", (data) => {
                 var _a, _b;
                 const token = (_b = (_a = data.tokenResult) === null || _a === void 0 ? void 0 : _a.token) !== null && _b !== void 0 ? _b : data.token;
@@ -86,19 +296,73 @@ class CapacitorLlamaAdapter {
         }
     }
     async getHardwareInfo() {
-        var _a;
+        var _a, _b, _c;
         const platform = detectPlatform();
-        const nav = globalThis
-            .navigator;
-        return {
-            platform,
-            deviceModel: platform,
-            totalRamGb: 0,
-            availableRamGb: null,
-            cpuCores: (_a = nav === null || nav === void 0 ? void 0 : nav.hardwareConcurrency) !== null && _a !== void 0 ? _a : 0,
-            gpu: null,
-            gpuSupported: platform !== "web",
-        };
+        if (!isCapacitorNative())
+            return fallbackHardwareInfo(platform);
+        try {
+            const plugin = await this.loadPlugin();
+            const baseInfo = normalizeHardwareInfo(await ((_a = plugin.getHardwareInfo) === null || _a === void 0 ? void 0 : _a.call(plugin)), platform);
+            // Probe fork-specific kernels through the optional bridge method.
+            // Stock builds and older fork builds without the bridge fall back
+            // to the empty list + "stock-llama-cpp" variant marker.
+            let nativeKernels = (_b = baseInfo.nativeKernels) !== null && _b !== void 0 ? _b : [];
+            let forkVariant = (_c = baseInfo.forkVariant) !== null && _c !== void 0 ? _c : "stock-llama-cpp";
+            if (typeof plugin.getNativeKernels === "function") {
+                try {
+                    const probe = await plugin.getNativeKernels();
+                    const kernels = stringArrayFromUnknown(probe === null || probe === void 0 ? void 0 : probe.kernels);
+                    if (kernels)
+                        nativeKernels = kernels;
+                    const variant = normalizeForkVariant(probe === null || probe === void 0 ? void 0 : probe.variant);
+                    if (variant !== undefined)
+                        forkVariant = variant;
+                    else if (nativeKernels.length > 0)
+                        forkVariant = "buun-llama-cpp";
+                }
+                catch (err) {
+                    const message = err instanceof Error ? err.message : String(err);
+                    console.debug("[capacitor-llama] getNativeKernels probe failed", {
+                        error: message,
+                    });
+                }
+            }
+            return Object.assign(Object.assign({}, baseInfo), { nativeKernels,
+                forkVariant });
+        }
+        catch (error) {
+            return fallbackHardwareInfo(platform, error instanceof Error ? error.message : "native hardware probe failed");
+        }
+    }
+    async setCacheType(typeK, typeV) {
+        if (!isCapacitorNative()) {
+            console.warn("[capacitor-llama] setCacheType called on non-native platform; ignoring");
+            return;
+        }
+        const plugin = await this.loadPlugin();
+        if (typeof plugin.setCacheType !== "function") {
+            console.warn("[capacitor-llama] underlying plugin does not expose setCacheType (likely stock build); cache types must be passed via load() params instead");
+            return;
+        }
+        await plugin.setCacheType({ cacheTypeK: typeK, cacheTypeV: typeV });
+    }
+    async setSpecType(args) {
+        if (!isCapacitorNative()) {
+            console.warn("[capacitor-llama] setSpecType called on non-native platform; ignoring");
+            return;
+        }
+        const plugin = await this.loadPlugin();
+        if (typeof plugin.setSpecType !== "function") {
+            console.warn("[capacitor-llama] underlying plugin does not expose setSpecType (likely stock build); pass draft_model + draft_min/max via load() instead");
+            return;
+        }
+        await plugin.setSpecType({
+            target: args.target,
+            drafter: args.drafter,
+            specType: args.specType,
+            draftMin: args.draftMin,
+            draftMax: args.draftMax,
+        });
     }
     async isLoaded() {
         return {
@@ -110,48 +374,109 @@ class CapacitorLlamaAdapter {
         return this.loadedPath;
     }
     async load(options) {
-        var _a, _b;
+        var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k;
         if (!isCapacitorNative()) {
             throw new Error("capacitor-llama is only available on iOS and Android builds");
         }
         const plugin = await this.loadPlugin();
-        if (this.loadedPath && this.loadedPath !== options.modelPath) {
-            await plugin.releaseAllContexts();
-            this.loadedPath = null;
+        // Release this adapter's own prior context (if any) before reusing the
+        // context id for a new model. We do NOT call `releaseAllContexts` here
+        // — that would destroy contexts owned by sibling adapter instances
+        // (e.g. tear down the embedding model when the chat model reloads).
+        if (this.contextId !== null && this.loadedPath !== null) {
+            try {
+                await plugin.releaseContext({ contextId: this.contextId });
+            }
+            catch (_l) {
+                // The native side may have already cleared this context; safe to
+                // proceed to reinit on the same id.
+            }
+        }
+        this.loadedPath = null;
+        if (this.contextId === null) {
+            this.contextId = nextContextId++;
         }
+        const speculativeSamples = options.mobileSpeculative
+            ? Math.min((_b = (_a = options.speculativeSamples) !== null && _a !== void 0 ? _a : options.draftMax) !== null && _b !== void 0 ? _b : 3, 4)
+            : ((_c = options.speculativeSamples) !== null && _c !== void 0 ? _c : 3);
+        const nativeGpuEnabled = resolveNativeGpuEnabled(options.useGpu);
+        const params = Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({ model: options.modelPath, n_ctx: (_d = options.contextSize) !== null && _d !== void 0 ? _d : 4096, n_gpu_layers: nativeGpuEnabled ? 99 : 0, n_threads: (_e = options.maxThreads) !== null && _e !== void 0 ? _e : 0, use_mmap: true, flash_attn: nativeGpuEnabled, embedding: looksLikeEmbeddingModelPath(options.modelPath), n_batch: options.mobileSpeculative ? 128 : 512, n_ubatch: options.mobileSpeculative ? 64 : 512 }, (options.draftModelPath
+            ? {
+                draft_model: options.draftModelPath,
+                speculative_samples: speculativeSamples,
+                mobile_speculative: (_f = options.mobileSpeculative) !== null && _f !== void 0 ? _f : true,
+            }
+            : {})), (options.draftContextSize
+            ? { n_ctx_draft: options.draftContextSize }
+            : {})), (options.draftMin ? { draft_min: options.draftMin } : {})), (options.draftMax ? { draft_max: options.draftMax } : {})), (options.cacheTypeK ? { cache_type_k: options.cacheTypeK } : {})), (options.cacheTypeV ? { cache_type_v: options.cacheTypeV } : {})), (options.disableThinking ? { reasoning: false } : {}));
         await plugin.initContext({
-            contextId: CONTEXT_ID,
-            params: {
-                model: options.modelPath,
-                n_ctx: (_a = options.contextSize) !== null && _a !== void 0 ? _a : 4096,
-                n_gpu_layers: options.useGpu === false ? 0 : 99,
-                n_threads: (_b = options.maxThreads) !== null && _b !== void 0 ? _b : 0,
-                use_mmap: true,
-            },
+            contextId: this.contextId,
+            params,
         });
+        // Fork builds expose a separate `setSpecType` bridge that configures
+        // the MTP drafter after the main context is up. Stock builds lack
+        // the method and the setter warns and skips it. We auto-call here so
+        // callers only need to pass `draftModelPath` once via load() — the
+        // adapter then handles both the params-bag path (stock fallback) and
+        // the explicit setSpecType path (fork build) in one shot.
+        if (options.draftModelPath && typeof plugin.setSpecType === "function") {
+            try {
+                await plugin.setSpecType({
+                    target: options.modelPath,
+                    drafter: options.draftModelPath,
+                    specType: "mtp",
+                    draftMin: (_g = options.draftMin) !== null && _g !== void 0 ? _g : 1,
+                    draftMax: (_h = options.draftMax) !== null && _h !== void 0 ? _h : 3,
+                });
+            }
+            catch (err) {
+                const message = err instanceof Error ? err.message : String(err);
+                console.warn("[capacitor-llama] setSpecType failed; spec decode disabled", { error: message });
+            }
+        }
+        // Same pattern for cache_type_k/v: fork builds may surface a separate
+        // setCacheType bridge; stock builds rely on the params bag only.
+        if ((options.cacheTypeK || options.cacheTypeV) &&
+            typeof plugin.setCacheType === "function") {
+            try {
+                await plugin.setCacheType({
+                    cacheTypeK: (_j = options.cacheTypeK) !== null && _j !== void 0 ? _j : "f16",
+                    cacheTypeV: (_k = options.cacheTypeV) !== null && _k !== void 0 ? _k : "f16",
+                });
+            }
+            catch (err) {
+                const message = err instanceof Error ? err.message : String(err);
+                console.warn("[capacitor-llama] setCacheType failed; cache types may be unchanged", { error: message });
+            }
+        }
         this.loadedPath = options.modelPath;
     }
     async unload() {
-        if (!this.plugin || !this.loadedPath)
+        if (!this.plugin || !this.loadedPath || this.contextId === null)
             return;
         try {
-            await this.plugin.releaseContext({ contextId: CONTEXT_ID });
+            await this.plugin.releaseContext({ contextId: this.contextId });
         }
         catch (_a) {
+            // Fall back to a targeted release-all only when the per-context
+            // release fails; this used to be the always-path but it now risks
+            // tearing down sibling adapter instances and is reserved for the
+            // pathological case where the native side has lost track of our id.
             await this.plugin.releaseAllContexts();
         }
         this.loadedPath = null;
     }
-    async generate(options) {
-        var _a, _b, _c, _d;
-        if (!this.plugin || !this.loadedPath) {
-            throw new Error("No model loaded. Call load() first.");
-        }
-        this.tokenIndex = 0;
+    /**
+     * Build the params object for the native completion call. Shared between
+     * the legacy `generate()` path and the new `generateStream()` path so the
+     * cache-key + stop-sequence wiring lives in one place.
+     */
+    buildNativeParams(options) {
+        var _a, _b;
         const params = {
-            n_predict: (_a = options.maxTokens) !== null && _a !== void 0 ? _a : 2048,
-            temperature: (_b = options.temperature) !== null && _b !== void 0 ? _b : 0.7,
-            top_p: (_c = options.topP) !== null && _c !== void 0 ? _c : 0.9,
+            n_predict: resolveMobileMaxTokens(options.maxTokens),
+            temperature: (_a = options.temperature) !== null && _a !== void 0 ? _a : 0.7,
+            top_p: (_b = options.topP) !== null && _b !== void 0 ? _b : 0.9,
         };
         if (options.stopSequences && options.stopSequences.length > 0) {
             params.stop = options.stopSequences;
@@ -159,26 +484,294 @@ class CapacitorLlamaAdapter {
         if (options.stream) {
             params.emit_partial_completion = true;
         }
-        const started = Date.now();
-        const result = await this.plugin.generateText({
-            contextId: CONTEXT_ID,
-            prompt: options.prompt,
-            params,
+        // Cache key threading: surface the slot id derived from
+        // ProviderCachePlan.promptCacheKey to the native side. Stock
+        // llama-cpp-capacitor builds ignore the field; the patched fork build
+        // reads it via setCacheType / completion params and pins KV slots.
+        if (options.cacheKey) {
+            const slotId = deriveCacheSlotId(options.cacheKey);
+            params.cache_prompt = true;
+            params.slot_id = slotId;
+        }
+        return params;
+    }
+    /**
+     * Invoke the native completion (or generateText) entry point with a
+     * pre-built params bag. Returns the raw native result; callers map this
+     * to `GenerateResult` or to a `done` event.
+     *
+     * `plugin.completion()` is preferred whenever it is a function; the prompt
+     * is then passed inside `params`. Otherwise `plugin.generateText()` is
+     * called (if it exists) with the prompt as a top-level field.
+     *
+     * @param options Generation options; only `options.prompt` is read here.
+     * @param params Native params bag. It is the last `Object.assign` source on
+     *   the `completion()` path, so its own keys win over the `prompt` /
+     *   `emit_partial_completion` defaults placed in the target object.
+     * @returns The raw result object resolved by the native call.
+     * @throws Error when `this.plugin` is not set, or when the native call
+     *   yields a falsy result (which includes the case where neither entry
+     *   point exists). NOTE(review): `requireContextId()` presumably throws when
+     *   no context id is assigned — confirm against its definition.
+     */
+    async runNativeCompletion(options, params) {
+        var _a;
+        const plugin = this.plugin;
+        if (!plugin) {
+            throw new Error("No model loaded. Call load() first.");
+        }
+        const contextId = this.requireContextId();
+        const result = typeof plugin.completion === "function"
+            ? await plugin.completion({
+                contextId,
+                params: Object.assign({ prompt: options.prompt, emit_partial_completion: Boolean(params.emit_partial_completion) }, params),
+            })
+            : await ((_a = plugin.generateText) === null || _a === void 0 ? void 0 : _a.call(plugin, {
+                contextId,
+                prompt: options.prompt,
+                params,
+            }));
+        if (!result) {
+            throw new Error("llama-cpp-capacitor did not expose completion() or generateText()");
+        }
+        return result;
+    }
+    /**
+     * Native bridges currently don't honour per-generation sampler-stage
+     * injection — the Swift / Kotlin side needs separate wiring. Until that
+     * lands we emit one `console.debug` line per stage on every call (there
+     * is no de-duplication across calls) and otherwise pass through. The
+     * stages remain in the options object so downstream observers (telemetry,
+     * tests) can still see them.
+     *
+     * @param stages Optional array of sampler stages; each entry's `kind` is
+     *   interpolated into the log line. A missing or empty array is a no-op.
+     */
+    logUnwiredSamplerStages(stages) {
+        if (!stages || stages.length === 0)
+            return;
+        for (const stage of stages) {
+            console.debug(`[capacitor-llama] sampler stage "${stage.kind}" received but not yet wired in native bridge`);
+        }
+    }
+    async generate(options) {
+        var _a, e_1, _b, _c;
+        // Wrapper over `generateStream` so the cache-key, stop-sequence, and
+        // native-call wiring lives in exactly one place. Drains the stream
+        // into the legacy `GenerateResult` shape; per-token events surface to
+        // any `onToken` listener via the native event bridge (unchanged).
+        let text = "";
+        let promptTokens = 0;
+        let outputTokens = 0;
+        let durationMs = 0;
+        let lastError = null;
+        // Wall-clock time-to-first-token: from the call start to the first decoded
+        // token event. This is the on-device prefill wall-clock, which the
+        // resource workbench uses to split throughput into prefill vs decode.
+        // Stays undefined when the generation yields no tokens.
+        const startedAt = Date.now();
+        let ttftMs;
+        try {
+            for (var _d = true, _e = __asyncValues(this.generateStream(options)), _f; _f = await _e.next(), _a = _f.done, !_a; _d = true) {
+                _c = _f.value;
+                _d = false;
+                const event = _c;
+                if (event.kind === "token") {
+                    if (ttftMs === undefined)
+                        ttftMs = Date.now() - startedAt;
+                    text += event.text;
+                }
+                else if (event.kind === "telemetry") {
+                    // Native bridge currently emits no telemetry events; ignored here because
+                    // the authoritative totals come from `this.lastCompletionStats` after the loop.
+                }
+                else if (event.kind === "error") {
+                    lastError = event.message;
+                }
+                else if (event.kind === "done") {
+                    // Nothing is read from the `done` event itself: the authoritative
+                    // totals are taken from `this.lastCompletionStats` after the loop.
+                }
+            }
+        }
+        catch (e_1_1) { e_1 = { error: e_1_1 }; }
+        finally {
+            try {
+                if (!_d && !_a && (_b = _e.return)) await _b.call(_e);
+            }
+            finally { if (e_1) throw e_1.error; }
+        }
+        if (lastError)
+            throw new Error(lastError);
+        // Re-read native counters from the cached completion result. `generateStream`
+        // stores them on `this.lastCompletionStats` once the native call resolves.
+        const stats = this.lastCompletionStats;
+        if (stats) {
+            promptTokens = stats.promptTokens;
+            outputTokens = stats.outputTokens;
+            durationMs = stats.durationMs;
+            if (stats.text) {
+                // The native call's authoritative text. Use it instead of the
+                // token-event-assembled string so callers see exactly what the
+                // bridge produced (some bridges only emit tokens, others emit
+                // partial+final; assembled text isn't always equal).
+                text = stats.text;
+            }
+        }
+        return Object.assign({ text,
+            promptTokens,
+            outputTokens,
+            durationMs }, (ttftMs !== undefined ? { ttftMs } : {}));
+    }
+    /**
+     * Streaming generation. Subscribes to the native token event bridge,
+     * starts the completion call, and yields typed `GenerationEvent`s as
+     * tokens arrive. The stream ends with exactly one `done` event once the
+     * native call settles; on failure an `error` event is yielded just before it.
+     *
+     * Sampler-stage injection (`samplerStages`) and the per-generation
+     * spec-decode toggle (`specDecode`) are accepted but currently pass
+     * through unchanged on the JS side — the Swift / Kotlin bridge wiring is tracked
+     * separately. They flow through as part of the options bag so the
+     * native side can pick them up without an interface change.
+     */
+    generateStream(options) {
+        return __asyncGenerator(this, arguments, function* generateStream_1() {
+            var _a;
+            if (!this.plugin || !this.loadedPath) {
+                throw new Error("No model loaded. Call load() first.");
+            }
+            this.tokenIndex = 0;
+            this.lastCompletionStats = null;
+            this.logUnwiredSamplerStages(options.samplerStages);
+            const queue = [];
+            let waiter = null;
+            const wake = () => {
+                if (waiter) {
+                    const w = waiter;
+                    waiter = null;
+                    w();
+                }
+            };
+            const push = (event) => {
+                queue.push(event);
+                wake();
+            };
+            // Subscribe to per-token events. The native bridge fires
+            // `@LlamaCpp_onToken`; our existing class-level listener forwards into
+            // every `onToken(listener)` consumer. We register one more listener
+            // here, scoped to this stream, that converts strings into `token`
+            // events.
+            const unsubscribe = this.onToken((tokenText, index) => {
+                push({ kind: "token", text: tokenText, index });
+            });
+            const params = this.buildNativeParams(Object.assign(Object.assign({}, options), {
+                // generateStream implies streaming — force on so the bridge emits
+                // partial completions even when the caller didn't set `stream: true`
+                // on the legacy options bag.
+                stream: true }));
+            const started = Date.now();
+            let completionPromise;
+            try {
+                completionPromise = this.runNativeCompletion(options, params);
+            }
+            catch (err) {
+                unsubscribe();
+                const message = err instanceof Error ? err.message : String(err);
+                yield yield __await({ kind: "error", message, recoverable: false });
+                yield yield __await({ kind: "done", finishReason: "error" });
+                return yield __await(void 0);
+            }
+            // Wrapped in an object so TS's control-flow analysis doesn't widen the
+            // closed-over assignments back to `null`/`never` when we read them
+            // after the loop. (Plain `let` with `null` init narrows badly after
+            // an async assignment.)
+            const completionState = { result: null, error: null, done: false };
+            completionPromise
+                .then((result) => {
+                completionState.result = result;
+            })
+                .catch((err) => {
+                completionState.error =
+                    err instanceof Error ? err : { message: String(err) };
+            })
+                .finally(() => {
+                completionState.done = true;
+                wake();
+            });
+            try {
+                while (true) {
+                    if (queue.length > 0) {
+                        yield yield __await(queue.shift());
+                        continue;
+                    }
+                    if (completionState.done)
+                        break;
+                    yield __await(new Promise((resolve) => {
+                        waiter = resolve;
+                    }));
+                }
+            }
+            finally {
+                unsubscribe();
+            }
+            if (completionState.error) {
+                yield yield __await({
+                    kind: "error",
+                    message: completionState.error.message,
+                    recoverable: false,
+                });
+                yield yield __await({ kind: "done", finishReason: "error" });
+                return yield __await(void 0);
+            }
+            if (completionState.result) {
+                const r = completionState.result;
+                const duration = ((_a = r.timings) === null || _a === void 0 ? void 0 : _a.predicted_ms) != null
+                    ? Math.round(r.timings.predicted_ms)
+                    : Date.now() - started;
+                this.lastCompletionStats = {
+                    text: r.text,
+                    promptTokens: r.tokens_evaluated,
+                    outputTokens: r.tokens_predicted,
+                    durationMs: duration,
+                };
+                // Reason heuristic: native fork doesn't expose a finish-reason
+                // enum yet. "stop" is the dominant case; "length" when tokens_predicted
+                // reaches or exceeds the requested n_predict ceiling. "error" is emitted by the
+                // explicit paths above; tool/cancel reasons are never produced by this method.
+                const requested = resolveMobileMaxTokens(options.maxTokens);
+                const finishReason = r.tokens_predicted >= requested ? "length" : "stop";
+                yield yield __await({ kind: "done", finishReason });
+                return yield __await(void 0);
+            }
+            // Native call resolved with no payload and no error — defensive
+            // terminal event so the consumer's `for await` always ends cleanly.
+            yield yield __await({ kind: "done", finishReason: "stop" });
         });
-        const duration = ((_d = result.timings) === null || _d === void 0 ? void 0 : _d.predicted_ms) != null
-            ? Math.round(result.timings.predicted_ms)
-            : Date.now() - started;
-        return {
-            text: result.text,
-            promptTokens: result.tokens_evaluated,
-            outputTokens: result.tokens_predicted,
-            durationMs: duration,
-        };
+    }
+    async setDrafter(drafterPath) {
+        // The native bridge has no live-swap entry point yet; the drafter is
+        // bound at `load()` time via `LoadOptions.draftModelPath`. Warn so the call-site is
+        // observable (a nullish path is printed as "null") and leave the loaded context unchanged.
+        console.warn(`[capacitor-llama] setDrafter(${drafterPath !== null && drafterPath !== void 0 ? drafterPath : "null"}) not yet supported by native bridge; pass draftModelPath to load() instead`);
+    }
+    async trimMemory(level) {
+        // No native hook yet — log so the runtime's pressure plumbing can see
+        // the adapter received the signal. "major" also clears `tokenListeners` to drop orphaned callbacks.
+        // NOTE(review): if `onToken()` registers into `tokenListeners`, this also drops an in-flight stream's listener — confirm.
+        if (level === "major") {
+            this.tokenListeners.clear();
+        }
+        console.debug(`[capacitor-llama] trimMemory(${level}) — bridge hook unavailable`);
     }
     async cancelGenerate() {
-        if (!this.plugin)
+        if (!this.plugin || this.contextId === null)
             return;
-        await this.plugin.stopCompletion({ contextId: CONTEXT_ID });
+        await this.plugin.stopCompletion({ contextId: this.contextId });
+    }
+    /**
+     * Round-trip to the loaded GGUF's native chat template via
+     * `LlamaCpp.getFormattedChat`. `messages` is serialized to a JSON string
+     * here (`JSON.stringify`) before it crosses the bridge, and the call is
+     * made with `params: { jinja: true }`. Per the original note, the native
+     * side then invokes `cap_format_chat()` → `llama_chat_apply_template()`.
+     *
+     * @param messages Chat messages; must be JSON-serializable.
+     * @returns The rendered prompt, or null when the plugin does not expose
+     *   `getFormattedChat` or the result carries no `prompt` (presumably the
+     *   case when the GGUF has no template metadata — confirm on the native side).
+     * @throws Error when no plugin is set or no model path is loaded.
+     */
+    async formatChat(messages) {
+        var _a;
+        if (!this.plugin || !this.loadedPath) {
+            throw new Error("No model loaded. Call load() first.");
+        }
+        if (typeof this.plugin.getFormattedChat !== "function") {
+            return null;
+        }
+        const result = await this.plugin.getFormattedChat({
+            contextId: this.requireContextId(),
+            messages: JSON.stringify(messages),
+            params: { jinja: true },
+        });
+        return (_a = result.prompt) !== null && _a !== void 0 ? _a : null;
     }
     async embed(options) {
         var _a;
@@ -191,8 +784,9 @@ class CapacitorLlamaAdapter {
         const params = {
             embd_normalize: (_a = options.embdNormalize) !== null && _a !== void 0 ? _a : 0,
         };
+        const contextId = this.requireContextId();
         const result = await this.plugin.embedding({
-            contextId: CONTEXT_ID,
+            contextId,
             text: options.input,
             params,
         });
@@ -200,7 +794,7 @@ class CapacitorLlamaAdapter {
         if (typeof this.plugin.tokenize === "function") {
             try {
                 const tokenized = await this.plugin.tokenize({
-                    contextId: CONTEXT_ID,
+                    contextId,
                     text: options.input,
                 });
                 tokenCount = tokenized.tokens.length;
@@ -232,22 +826,69 @@ class CapacitorLlamaAdapter {
         this.pluginLoadPromise = null;
     }
 }
+/**
+ * Default singleton kept for back-compat with device-bridge-client and
+ * hardware-probe callers that don't distinguish chat vs embedding roles.
+ * The runtime's `localInferenceLoader` service uses per-role instances
+ * instead — see `registerCapacitorLlamaLoader`.
+ */
 export const capacitorLlama = new CapacitorLlamaAdapter();
+/**
+ * Lightweight heuristic for routing a `loadModel(modelPath)` call to either
+ * the chat adapter or the embedding adapter. Embedding GGUFs the runtime
+ * ships or that users typically install for `TEXT_EMBEDDING` carry one of
+ * these markers in the filename. Anything else is assumed to be a
+ * generative chat model.
+ *
+ * The check is a case-insensitive substring match against the whole path
+ * string, so a marker appearing in a directory name matches too; only the
+ * `embedding.gguf` marker is anchored to the end of the path.
+ *
+ * @param modelPath Model path string (must be a string; it is lower-cased).
+ * @returns true when any embedding marker is found, false otherwise.
+ */
+function looksLikeEmbeddingModelPath(modelPath) {
+    const lowered = modelPath.toLowerCase();
+    return (lowered.includes("bge-") ||
+        lowered.includes("bge_") ||
+        lowered.includes("nomic-embed") ||
+        lowered.includes("all-minilm") ||
+        lowered.includes("gte-") ||
+        lowered.includes("e5-") ||
+        lowered.includes("/embedding/") ||
+        lowered.endsWith("embedding.gguf"));
+}
 export function registerCapacitorLlamaLoader(runtime) {
     if (typeof runtime.registerService !== "function")
         return;
+    // Two distinct adapter instances so the chat LLM and embedding model
+    // each allocate their own native context id. This is the fix for
+    // elizaOS/eliza#7681 — the previous single-adapter design routed every
+    // operation through CONTEXT_ID=1, and a `completion(contextId=1)` call
+    // would resolve to whichever model registered against id 1 last
+    // (typically the bge-small embedding model on Android), emitting
+    // `[unused{N}]` / `[PAD]` reserved tokens.
+    const chatAdapter = new CapacitorLlamaAdapter();
+    const embeddingAdapter = new CapacitorLlamaAdapter();
+    function adapterFor(modelPath) {
+        return looksLikeEmbeddingModelPath(modelPath)
+            ? embeddingAdapter
+            : chatAdapter;
+    }
     runtime.registerService("localInferenceLoader", {
         async loadModel(args) {
-            await capacitorLlama.load({ modelPath: args.modelPath });
+            await adapterFor(args.modelPath).load(args);
         },
         async unloadModel() {
-            await capacitorLlama.unload();
+            // Each adapter manages its own context lifecycle inside
+            // `load()` (releasing the prior context before reinitializing on the
+            // same id). Tearing down both adapters here would defeat the
+            // per-instance routing — `ensureAssignedModelLoaded` calls
+            // `unloadModel()` before every `loadModel()` on the assumption of
+            // single-model behaviour, and we must not let that unconditionally
+            // kill the embedding adapter when only the chat model is swapping.
         },
         currentModelPath() {
-            return capacitorLlama.currentModelPath();
+            var _a;
+            // The chat path is the primary "active" model from the runtime's
+            // perspective; embedding is treated as a sidecar.
+            return ((_a = chatAdapter.currentModelPath()) !== null && _a !== void 0 ? _a : embeddingAdapter.currentModelPath());
         },
         async generate(args) {
-            const result = await capacitorLlama.generate({
+            const result = await chatAdapter.generate({
                 prompt: args.prompt,
                 stopSequences: args.stopSequences,
                 maxTokens: args.maxTokens,
@@ -256,7 +897,7 @@ export function registerCapacitorLlamaLoader(runtime) {
             return result.text;
         },
         async embed(args) {
-            return capacitorLlama.embed({ input: args.input });
+            return embeddingAdapter.embed({ input: args.input });
         },
     });
 }