@elizaos/capacitor-llama 0.1.0 → 2.0.3-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,46 @@
1
- const CONTEXT_ID = 1;
1
// __asyncValues(o): compiler-emitted helper (it looks like the standard TypeScript
// down-level helper for `for await...of` — confirm against the tslib version in use).
// Returns o's own async iterator when o[Symbol.asyncIterator] exists; otherwise it
// takes o's sync iterator (via a global __values helper if one is defined) and wraps
// next/throw/return so each call resolves to { value, done } with `value` awaited.
// Throws TypeError when the runtime has no Symbol.asyncIterator.
+ var __asyncValues = (this && this.__asyncValues) || function (o) {
2
+ if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
3
+ var m = o[Symbol.asyncIterator], i;
4
+ return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
5
// verb(n): exposes i[n] only when the underlying iterator has method n.
+ function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
6
// settle: awaits the step's value before resolving the { value, done } record.
+ function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
7
+ };
8
// __await(v): marker wrapper used by __asyncGenerator. Called with or without `new`
// it yields an object whose `.v` holds the awaited operand, so the generator driver
// can tell an `await` (value instanceof __await) apart from a real `yield`.
+ var __await = (this && this.__await) || function (v) { return this instanceof __await ? (this.v = v, this) : new __await(v); }
9
// __asyncGenerator(thisArg, _arguments, generator): compiler-emitted driver that runs
// a *sync* generator as an async generator. Requests made through next/throw/return
// are queued in `q` as [method, value, resolve, reject]; only the head of the queue
// is in flight. Values wrapped in __await are awaited and fed back into the
// generator; any other step result settles the head request and starts the next one.
// Throws TypeError when the runtime has no Symbol.asyncIterator.
+ var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) {
10
+ if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
11
+ var g = generator.apply(thisArg, _arguments || []), i, q = [];
12
+ return i = Object.create((typeof AsyncIterator === "function" ? AsyncIterator : Object).prototype), verb("next"), verb("throw"), verb("return", awaitReturn), i[Symbol.asyncIterator] = function () { return this; }, i;
13
// awaitReturn: return(v) first awaits v, then forwards it (a rejection is thrown into the generator).
+ function awaitReturn(f) { return function (v) { return Promise.resolve(v).then(f, reject); }; }
14
// verb: installs i[n] only when the generator has method n; a request runs immediately
// only if it is the sole queue entry (push() returns the new length).
+ function verb(n, f) { if (g[n]) { i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; if (f) i[n] = f(i[n]); } }
15
+ function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }
16
+ function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }
17
+ function fulfill(value) { resume("next", value); }
18
+ function reject(value) { resume("throw", value); }
19
// settle: completes the head request, drops it, and resumes the next queued request if any.
+ function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }
20
+ };
21
+ // completion(contextId=X) must run against the model that was initContext'd
22
+ // with X — every adapter instance owns its own monotonically-allocated id so
23
+ // the chat LLM and the embedding model never collide on the same native
24
+ // context.
25
+ let nextContextId = 1;
26
+ const DEFAULT_MAX_TOKENS = 256;
27
+ /**
28
+ * Mobile-side parallel slot count. Mirrors `DEFAULT_CACHE_PARALLEL` in
29
+ * `cache-bridge.ts`; on devices with constrained KV memory we keep a small
30
+ * fixed pool so distinct cacheKey values still get prefix reuse without
31
+ * blowing memory.
32
+ */
33
+ const MOBILE_PARALLEL = 4;
34
/**
 * Map a cache key onto one of the `MOBILE_PARALLEL` slot ids.
 *
 * The key's UTF-16 code units are folded with the 32-bit FNV-1a recurrence
 * (offset basis 0x811c9dc5, prime 0x01000193). The signed 32-bit digest is
 * then reduced to `[0, MOBILE_PARALLEL)` via absolute value and modulo, so
 * the same key always yields the same slot.
 *
 * NOTE(review): the original comment says this matches the agent-side
 * derivation; that counterpart is not visible here — confirm before changing
 * the reduction step.
 *
 * @param {string} key Cache key to hash.
 * @returns {number} Slot id in `[0, MOBILE_PARALLEL)`.
 */
function deriveCacheSlotId(key) {
    const FNV_OFFSET_BASIS = 0x811c9dc5;
    const FNV_PRIME = 0x01000193;
    const codeUnits = Array.from({ length: key.length }, (_unused, position) => key.charCodeAt(position));
    const digest = codeUnits.reduce((running, unit) => Math.imul(running ^ unit, FNV_PRIME), FNV_OFFSET_BASIS);
    return Math.abs(digest | 0) % MOBILE_PARALLEL;
}
43
+ const MOBILE_MAX_TOKENS_CAP = 256;
2
44
  function isObject(value) {
3
45
  return typeof value === "object" && value !== null;
4
46
  }
@@ -7,7 +49,8 @@ function isLlamaCppPluginLike(value) {
7
49
  typeof value.initContext === "function" &&
8
50
  typeof value.releaseContext === "function" &&
9
51
  typeof value.releaseAllContexts === "function" &&
10
- typeof value.generateText === "function" &&
52
+ (typeof value.completion === "function" ||
53
+ typeof value.generateText === "function") &&
11
54
  typeof value.stopCompletion === "function" &&
12
55
  typeof value.addListener === "function");
13
56
  }
@@ -23,6 +66,42 @@ function resolveLlamaCppPlugin(mod) {
23
66
  }
24
67
  return null;
25
68
  }
69
/**
 * Build a plain object that forwards to `plugin`'s methods.
 *
 * Required methods (`initContext`, `releaseContext`, `releaseAllContexts`,
 * `stopCompletion`, `addListener`) are always wrapped. Optional methods are
 * wrapped only when `plugin` exposes them as functions at wrap time;
 * otherwise the property is `undefined`. Every wrapper looks the method up
 * on `plugin` again at call time and invokes it with `plugin` as `this`.
 *
 * NOTE(review): presumably this exists so callers hold an ordinary object
 * rather than the native plugin proxy — confirm the motivation.
 *
 * @param {object} plugin Native LlamaCpp plugin (or proxy).
 * @returns {object} Plain forwarding facade.
 */
function toPlainLlamaCppPlugin(plugin) {
    // Optional zero-argument method: resolves to undefined if the method has
    // disappeared by the time the wrapper is invoked.
    const forwardNullary = (name) => {
        if (typeof plugin[name] !== "function") {
            return undefined;
        }
        return () => {
            const method = plugin[name];
            return method === null || method === undefined ? undefined : method.call(plugin);
        };
    };
    // Optional single-argument method with the same late lookup.
    const forwardUnary = (name) => {
        if (typeof plugin[name] !== "function") {
            return undefined;
        }
        return (options) => {
            const method = plugin[name];
            return method === null || method === undefined ? undefined : method.call(plugin, options);
        };
    };
    return {
        initContext: (options) => plugin.initContext(options),
        releaseContext: (options) => plugin.releaseContext(options),
        releaseAllContexts: () => plugin.releaseAllContexts(),
        getHardwareInfo: forwardNullary("getHardwareInfo"),
        completion: forwardUnary("completion"),
        generateText: forwardUnary("generateText"),
        stopCompletion: (options) => plugin.stopCompletion(options),
        embedding: forwardUnary("embedding"),
        tokenize: forwardUnary("tokenize"),
        setCacheType: forwardUnary("setCacheType"),
        setSpecType: forwardUnary("setSpecType"),
        getNativeKernels: forwardNullary("getNativeKernels"),
        addListener: (event, listener) => plugin.addListener(event, listener),
    };
}
26
105
  function isCapacitorNative() {
27
106
  var _a;
28
107
  const cap = globalThis.Capacitor;
@@ -38,15 +117,145 @@ function detectPlatform() {
38
117
  return "android";
39
118
  return "web";
40
119
  }
41
- class CapacitorLlamaAdapter {
120
/**
 * Resolve the token budget for a native completion call.
 *
 * Non-numeric, non-finite, or non-positive requests fall back to
 * `DEFAULT_MAX_TOKENS`. Valid requests are floored to a whole number and
 * capped at `MOBILE_MAX_TOKENS_CAP`.
 *
 * The positivity check runs on the floored value: a fractional request such
 * as `0.5` would otherwise pass a `> 0` guard and then floor to `0`.
 *
 * @param {number | null | undefined} requested Caller-requested maximum.
 * @returns {number} Whole token count in `[1, MOBILE_MAX_TOKENS_CAP]`, or the default.
 */
function resolveMobileMaxTokens(requested) {
    // Number.isFinite is false for null, undefined, strings, NaN and ±Infinity.
    if (!Number.isFinite(requested)) {
        return DEFAULT_MAX_TOKENS;
    }
    const wholeTokens = Math.floor(requested);
    if (wholeTokens < 1) {
        return DEFAULT_MAX_TOKENS;
    }
    return Math.min(wholeTokens, MOBILE_MAX_TOKENS_CAP);
}
126
/**
 * Narrow an arbitrary value to a finite number.
 *
 * @param {unknown} value Candidate value.
 * @returns {number | null} `value` when it is a finite number, otherwise `null`.
 */
function numberFromUnknown(value) {
    const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
    return isFiniteNumber ? value : null;
}
131
/**
 * Narrow an arbitrary value to a boolean.
 *
 * @param {unknown} value Candidate value.
 * @returns {boolean | undefined} `value` when it is `true`/`false`, otherwise `undefined`.
 */
function booleanFromUnknown(value) {
    if (typeof value === "boolean") {
        return value;
    }
    return undefined;
}
134
/**
 * Narrow an arbitrary value to a non-blank, trimmed string.
 *
 * @param {unknown} value Candidate value.
 * @returns {string | undefined} The trimmed string, or `undefined` when
 *   `value` is not a string or is empty after trimming.
 */
function stringFromUnknown(value) {
    if (typeof value !== "string") {
        return undefined;
    }
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : undefined;
}
139
/**
 * Build a hardware-info record without consulting the native plugin.
 *
 * RAM and core counts come from `globalThis.navigator` when present
 * (`deviceMemory`, `hardwareConcurrency`). Both are validated with
 * `numberFromUnknown` and default to `0`, so a missing or non-numeric
 * navigator field never leaks into the record. On iOS the record reports an
 * available Metal GPU; on every other platform it reports no GPU.
 *
 * @param {string} [platform] Platform id; defaults to `detectPlatform()`.
 * @param {string} [reason] Text stored in `mtpReason` explaining why the
 *   fallback was used.
 * @returns {object} Record with `source: "adapter-fallback"`, no native
 *   kernels, `forkVariant: null` and `mtpSupported: false`.
 */
function fallbackHardwareInfo(platform = detectPlatform(), reason = "native hardware probe unavailable") {
    const nav = globalThis.navigator;
    // Finite number from an optional navigator field, or 0 when unusable.
    const readCount = (field) => {
        const parsed = numberFromUnknown(nav === null || nav === undefined ? undefined : nav[field]);
        return parsed === null ? 0 : parsed;
    };
    const isIos = platform === "ios";
    return {
        platform,
        deviceModel: platform,
        totalRamGb: readCount("deviceMemory"),
        availableRamGb: null,
        cpuCores: readCount("hardwareConcurrency"),
        gpu: isIos ? { backend: "metal", available: true } : null,
        gpuSupported: isIos,
        mtpSupported: false,
        mtpReason: reason,
        source: "adapter-fallback",
        nativeKernels: [],
        forkVariant: null,
    };
}
161
/**
 * Report whether native GPU offload is enabled by default on a platform.
 *
 * Per the original rationale: iOS builds use the Metal-capable native path
 * by default, while the current Android Capacitor wrapper is CPU-only unless
 * a forked Vulkan bridge opts in, so CPU is the safe production default
 * everywhere except iOS.
 *
 * @param {string} [platform] Platform id; defaults to `detectPlatform()`.
 * @returns {boolean} `true` only for `"ios"`.
 */
function defaultNativeGpuEnabled(platform = detectPlatform()) {
    const gpuByDefault = ["ios"];
    return gpuByDefault.includes(platform);
}
167
/**
 * Decide whether to enable the native GPU path.
 *
 * @param {boolean | undefined} useGpu Explicit caller preference.
 * @returns {boolean} `useGpu` when it is a boolean; otherwise the platform
 *   default from `defaultNativeGpuEnabled()`.
 */
function resolveNativeGpuEnabled(useGpu) {
    if (useGpu === true || useGpu === false) {
        return useGpu;
    }
    return defaultNativeGpuEnabled();
}
170
/**
 * Validate a reported llama.cpp fork variant.
 *
 * @param {unknown} value Raw variant from the native side.
 * @returns {"buun-llama-cpp" | "stock-llama-cpp" | null | undefined}
 *   The value itself when it is one of the two known variant names or an
 *   explicit `null`; `undefined` for anything else.
 */
function normalizeForkVariant(value) {
    switch (value) {
        case "buun-llama-cpp":
        case "stock-llama-cpp":
        case null:
            return value;
        default:
            return undefined;
    }
}
177
/**
 * Extract the non-empty strings from an arbitrary value.
 *
 * @param {unknown} value Candidate array.
 * @returns {string[] | undefined} A new array holding only the non-empty
 *   string entries (possibly empty), or `undefined` when `value` is not an
 *   array.
 */
function stringArrayFromUnknown(value) {
    if (!Array.isArray(value)) {
        return undefined;
    }
    return value.filter((entry) => typeof entry === "string" && entry.length > 0);
}
187
/**
 * Coerce a raw hardware-info payload into the adapter's record shape.
 *
 * A falsy `value` yields `fallbackHardwareInfo(platform)` unchanged.
 * Otherwise each field is validated on its own and replaced by the fallback
 * record's value when missing or malformed. Optional fields (`machineId`,
 * `osVersion`, `isSimulator`, `freeStorageGb`, `lowPowerMode`,
 * `thermalState`) appear in the result only when the payload carries a
 * valid value. Keys are written in a fixed order.
 *
 * @param {object | null | undefined} value Raw payload to normalise.
 * @param {string} [platform] Platform id used when the payload does not
 *   report a recognised one; defaults to `detectPlatform()`.
 * @returns {object} Normalised hardware-info record.
 */
function normalizeHardwareInfo(value, platform = detectPlatform()) {
    const fallback = fallbackHardwareInfo(platform);
    if (!value) {
        return fallback;
    }
    // `candidate` unless it is null/undefined (nullish-coalescing equivalent).
    const orElse = (candidate, alternative) => (candidate === null || candidate === undefined ? alternative : candidate);

    const info = {};

    info.platform = ["ios", "android", "web"].includes(value.platform) ? value.platform : platform;
    info.deviceModel = orElse(stringFromUnknown(value.deviceModel), fallback.deviceModel);

    const machineId = stringFromUnknown(value.machineId);
    if (machineId) {
        info.machineId = machineId;
    }
    const osVersion = stringFromUnknown(value.osVersion);
    if (osVersion) {
        info.osVersion = osVersion;
    }
    if (typeof value.isSimulator === "boolean") {
        info.isSimulator = value.isSimulator;
    }

    info.totalRamGb = orElse(numberFromUnknown(value.totalRamGb), fallback.totalRamGb);
    // An explicit null means "unknown" and is kept as-is rather than replaced.
    info.availableRamGb = value.availableRamGb === null
        ? null
        : orElse(numberFromUnknown(value.availableRamGb), fallback.availableRamGb);

    const freeStorageGb = numberFromUnknown(value.freeStorageGb);
    if (freeStorageGb !== null) {
        info.freeStorageGb = freeStorageGb;
    }

    info.cpuCores = orElse(numberFromUnknown(value.cpuCores), fallback.cpuCores);

    let gpu = fallback.gpu;
    if (value.gpu && isObject(value.gpu)) {
        const fallbackBackend = fallback.gpu === null || fallback.gpu === undefined
            ? undefined
            : fallback.gpu.backend;
        const reportedBackend = value.gpu.backend;
        gpu = {
            backend: ["metal", "vulkan", "gpu-delegate"].includes(reportedBackend)
                ? reportedBackend
                : orElse(fallbackBackend, "gpu-delegate"),
            available: Boolean(value.gpu.available),
        };
    }
    info.gpu = gpu;
    info.gpuSupported = orElse(booleanFromUnknown(value.gpuSupported), fallback.gpuSupported);

    if (typeof value.lowPowerMode === "boolean") {
        info.lowPowerMode = value.lowPowerMode;
    }
    if (["nominal", "fair", "serious", "critical", "unknown"].includes(value.thermalState)) {
        info.thermalState = value.thermalState;
    }

    info.mtpSupported = Boolean(value.mtpSupported);
    // With no reported reason, explain the absence of MTP only when it is unsupported.
    info.mtpReason = orElse(stringFromUnknown(value.mtpReason), value.mtpSupported ? undefined : "native plugin did not report MTP support");
    info.source = value.source === "native" ? "native" : "adapter-fallback";
    info.nativeKernels = orElse(stringArrayFromUnknown(value.nativeKernels), []);
    info.forkVariant = orElse(normalizeForkVariant(value.forkVariant), null);
    return info;
}
231
+ export class CapacitorLlamaAdapter {
42
232
  constructor() {
43
233
  this.plugin = null;
44
234
  /** Cached loader promise so concurrent `load()` calls don't race to register duplicate listeners. */
45
235
  this.pluginLoadPromise = null;
46
236
  this.loadedPath = null;
237
+ /**
238
+ * Native context id this adapter owns. Allocated lazily on first `load()`
239
+ * from the process-wide `nextContextId` counter so distinct adapter
240
+ * instances never share a context — see the module-level invariant comment.
241
+ */
242
+ this.contextId = null;
47
243
  this.tokenIndex = 0;
48
244
  this.tokenListeners = new Set();
49
245
  this.pluginListenerHandle = null;
246
+ /**
247
+ * Latest native completion stats captured by `generateStream`. Read by
248
+ * the `generate()` wrapper to populate `GenerateResult` without
249
+ * re-issuing the native call. Cleared at the start of every
250
+ * `generateStream` invocation.
251
+ */
252
+ this.lastCompletionStats = null;
253
+ }
254
+ requireContextId() {
255
+ if (this.contextId === null) {
256
+ throw new Error("No model loaded. Call load() first.");
257
+ }
258
+ return this.contextId;
50
259
  }
51
260
  async loadPlugin() {
52
261
  if (this.plugin)
@@ -54,10 +263,11 @@ class CapacitorLlamaAdapter {
54
263
  if (this.pluginLoadPromise)
55
264
  return this.pluginLoadPromise;
56
265
  this.pluginLoadPromise = (async () => {
57
- const plugin = resolveLlamaCppPlugin(await import("llama-cpp-capacitor"));
58
- if (!plugin) {
59
- throw new Error("llama-cpp-capacitor did not expose an initContext method");
266
+ const nativePlugin = resolveLlamaCppPlugin(await import("llama-cpp-capacitor"));
267
+ if (!nativePlugin) {
268
+ throw new Error("llama-cpp-capacitor did not expose the native LlamaCpp methods");
60
269
  }
270
+ const plugin = toPlainLlamaCppPlugin(nativePlugin);
61
271
  const tokenListenerHandle = await plugin.addListener("@LlamaCpp_onToken", (data) => {
62
272
  var _a, _b;
63
273
  const token = (_b = (_a = data.tokenResult) === null || _a === void 0 ? void 0 : _a.token) !== null && _b !== void 0 ? _b : data.token;
@@ -86,19 +296,73 @@ class CapacitorLlamaAdapter {
86
296
  }
87
297
  }
88
298
  async getHardwareInfo() {
89
- var _a;
299
+ var _a, _b, _c;
90
300
  const platform = detectPlatform();
91
- const nav = globalThis
92
- .navigator;
93
- return {
94
- platform,
95
- deviceModel: platform,
96
- totalRamGb: 0,
97
- availableRamGb: null,
98
- cpuCores: (_a = nav === null || nav === void 0 ? void 0 : nav.hardwareConcurrency) !== null && _a !== void 0 ? _a : 0,
99
- gpu: null,
100
- gpuSupported: platform !== "web",
101
- };
301
+ if (!isCapacitorNative())
302
+ return fallbackHardwareInfo(platform);
303
+ try {
304
+ const plugin = await this.loadPlugin();
305
+ const baseInfo = normalizeHardwareInfo(await ((_a = plugin.getHardwareInfo) === null || _a === void 0 ? void 0 : _a.call(plugin)), platform);
306
+ // Probe fork-specific kernels through the optional bridge method.
307
+ // Stock builds and older fork builds without the bridge fall back
308
+ // to the empty list + "stock-llama-cpp" variant marker.
309
+ let nativeKernels = (_b = baseInfo.nativeKernels) !== null && _b !== void 0 ? _b : [];
310
+ let forkVariant = (_c = baseInfo.forkVariant) !== null && _c !== void 0 ? _c : "stock-llama-cpp";
311
+ if (typeof plugin.getNativeKernels === "function") {
312
+ try {
313
+ const probe = await plugin.getNativeKernels();
314
+ const kernels = stringArrayFromUnknown(probe === null || probe === void 0 ? void 0 : probe.kernels);
315
+ if (kernels)
316
+ nativeKernels = kernels;
317
+ const variant = normalizeForkVariant(probe === null || probe === void 0 ? void 0 : probe.variant);
318
+ if (variant !== undefined)
319
+ forkVariant = variant;
320
+ else if (nativeKernels.length > 0)
321
+ forkVariant = "buun-llama-cpp";
322
+ }
323
+ catch (err) {
324
+ const message = err instanceof Error ? err.message : String(err);
325
+ console.debug("[capacitor-llama] getNativeKernels probe failed", {
326
+ error: message,
327
+ });
328
+ }
329
+ }
330
+ return Object.assign(Object.assign({}, baseInfo), { nativeKernels,
331
+ forkVariant });
332
+ }
333
+ catch (error) {
334
+ return fallbackHardwareInfo(platform, error instanceof Error ? error.message : "native hardware probe failed");
335
+ }
336
+ }
337
+ async setCacheType(typeK, typeV) {
338
+ if (!isCapacitorNative()) {
339
+ console.warn("[capacitor-llama] setCacheType called on non-native platform; ignoring");
340
+ return;
341
+ }
342
+ const plugin = await this.loadPlugin();
343
+ if (typeof plugin.setCacheType !== "function") {
344
+ console.warn("[capacitor-llama] underlying plugin does not expose setCacheType (likely stock build); cache types must be passed via load() params instead");
345
+ return;
346
+ }
347
+ await plugin.setCacheType({ cacheTypeK: typeK, cacheTypeV: typeV });
348
+ }
349
+ async setSpecType(args) {
350
+ if (!isCapacitorNative()) {
351
+ console.warn("[capacitor-llama] setSpecType called on non-native platform; ignoring");
352
+ return;
353
+ }
354
+ const plugin = await this.loadPlugin();
355
+ if (typeof plugin.setSpecType !== "function") {
356
+ console.warn("[capacitor-llama] underlying plugin does not expose setSpecType (likely stock build); pass draft_model + draft_min/max via load() instead");
357
+ return;
358
+ }
359
+ await plugin.setSpecType({
360
+ target: args.target,
361
+ drafter: args.drafter,
362
+ specType: args.specType,
363
+ draftMin: args.draftMin,
364
+ draftMax: args.draftMax,
365
+ });
102
366
  }
103
367
  async isLoaded() {
104
368
  return {
@@ -110,48 +374,109 @@ class CapacitorLlamaAdapter {
110
374
  return this.loadedPath;
111
375
  }
112
376
  async load(options) {
113
- var _a, _b;
377
+ var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k;
114
378
  if (!isCapacitorNative()) {
115
379
  throw new Error("capacitor-llama is only available on iOS and Android builds");
116
380
  }
117
381
  const plugin = await this.loadPlugin();
118
- if (this.loadedPath && this.loadedPath !== options.modelPath) {
119
- await plugin.releaseAllContexts();
120
- this.loadedPath = null;
382
+ // Release this adapter's own prior context (if any) before reusing the
383
+ // context id for a new model. We do NOT call `releaseAllContexts` here
384
+ // because that would destroy contexts owned by sibling adapter instances
385
+ // (e.g. tear down the embedding model when the chat model reloads).
386
+ if (this.contextId !== null && this.loadedPath !== null) {
387
+ try {
388
+ await plugin.releaseContext({ contextId: this.contextId });
389
+ }
390
+ catch (_l) {
391
+ // The native side may have already cleared this context; safe to
392
+ // proceed to reinit on the same id.
393
+ }
394
+ }
395
+ this.loadedPath = null;
396
+ if (this.contextId === null) {
397
+ this.contextId = nextContextId++;
121
398
  }
399
+ const speculativeSamples = options.mobileSpeculative
400
+ ? Math.min((_b = (_a = options.speculativeSamples) !== null && _a !== void 0 ? _a : options.draftMax) !== null && _b !== void 0 ? _b : 3, 4)
401
+ : ((_c = options.speculativeSamples) !== null && _c !== void 0 ? _c : 3);
402
+ const nativeGpuEnabled = resolveNativeGpuEnabled(options.useGpu);
403
+ const params = Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({ model: options.modelPath, n_ctx: (_d = options.contextSize) !== null && _d !== void 0 ? _d : 4096, n_gpu_layers: nativeGpuEnabled ? 99 : 0, n_threads: (_e = options.maxThreads) !== null && _e !== void 0 ? _e : 0, use_mmap: true, flash_attn: nativeGpuEnabled, embedding: looksLikeEmbeddingModelPath(options.modelPath), n_batch: options.mobileSpeculative ? 128 : 512, n_ubatch: options.mobileSpeculative ? 64 : 512 }, (options.draftModelPath
404
+ ? {
405
+ draft_model: options.draftModelPath,
406
+ speculative_samples: speculativeSamples,
407
+ mobile_speculative: (_f = options.mobileSpeculative) !== null && _f !== void 0 ? _f : true,
408
+ }
409
+ : {})), (options.draftContextSize
410
+ ? { n_ctx_draft: options.draftContextSize }
411
+ : {})), (options.draftMin ? { draft_min: options.draftMin } : {})), (options.draftMax ? { draft_max: options.draftMax } : {})), (options.cacheTypeK ? { cache_type_k: options.cacheTypeK } : {})), (options.cacheTypeV ? { cache_type_v: options.cacheTypeV } : {})), (options.disableThinking ? { reasoning: false } : {}));
122
412
  await plugin.initContext({
123
- contextId: CONTEXT_ID,
124
- params: {
125
- model: options.modelPath,
126
- n_ctx: (_a = options.contextSize) !== null && _a !== void 0 ? _a : 4096,
127
- n_gpu_layers: options.useGpu === false ? 0 : 99,
128
- n_threads: (_b = options.maxThreads) !== null && _b !== void 0 ? _b : 0,
129
- use_mmap: true,
130
- },
413
+ contextId: this.contextId,
414
+ params,
131
415
  });
416
+ // Fork builds expose a separate `setSpecType` bridge that configures
417
+ // the MTP drafter after the main context is up. Stock builds lack
418
+ // the method and the setter warns and skips it. We auto-call here so
419
+ // callers only need to pass `draftModelPath` once via load() — the
420
+ // adapter then handles both the params-bag path (stock fallback) and
421
+ // the explicit setSpecType path (fork build) in one shot.
422
+ if (options.draftModelPath && typeof plugin.setSpecType === "function") {
423
+ try {
424
+ await plugin.setSpecType({
425
+ target: options.modelPath,
426
+ drafter: options.draftModelPath,
427
+ specType: "mtp",
428
+ draftMin: (_g = options.draftMin) !== null && _g !== void 0 ? _g : 1,
429
+ draftMax: (_h = options.draftMax) !== null && _h !== void 0 ? _h : 3,
430
+ });
431
+ }
432
+ catch (err) {
433
+ const message = err instanceof Error ? err.message : String(err);
434
+ console.warn("[capacitor-llama] setSpecType failed; spec decode disabled", { error: message });
435
+ }
436
+ }
437
+ // Same pattern for cache_type_k/v: fork builds may surface a separate
438
+ // setCacheType bridge; stock builds rely on the params bag only.
439
+ if ((options.cacheTypeK || options.cacheTypeV) &&
440
+ typeof plugin.setCacheType === "function") {
441
+ try {
442
+ await plugin.setCacheType({
443
+ cacheTypeK: (_j = options.cacheTypeK) !== null && _j !== void 0 ? _j : "f16",
444
+ cacheTypeV: (_k = options.cacheTypeV) !== null && _k !== void 0 ? _k : "f16",
445
+ });
446
+ }
447
+ catch (err) {
448
+ const message = err instanceof Error ? err.message : String(err);
449
+ console.warn("[capacitor-llama] setCacheType failed; cache types may be unchanged", { error: message });
450
+ }
451
+ }
132
452
  this.loadedPath = options.modelPath;
133
453
  }
134
454
  async unload() {
135
- if (!this.plugin || !this.loadedPath)
455
+ if (!this.plugin || !this.loadedPath || this.contextId === null)
136
456
  return;
137
457
  try {
138
- await this.plugin.releaseContext({ contextId: CONTEXT_ID });
458
+ await this.plugin.releaseContext({ contextId: this.contextId });
139
459
  }
140
460
  catch (_a) {
461
+ // Fall back to releasing all contexts only when the per-context
462
+ // release fails; this used to be the always-path but it now risks
463
+ // tearing down sibling adapter instances and is reserved for the
464
+ // pathological case where the native side has lost track of our id.
141
465
  await this.plugin.releaseAllContexts();
142
466
  }
143
467
  this.loadedPath = null;
144
468
  }
145
- async generate(options) {
146
- var _a, _b, _c, _d;
147
- if (!this.plugin || !this.loadedPath) {
148
- throw new Error("No model loaded. Call load() first.");
149
- }
150
- this.tokenIndex = 0;
469
+ /**
470
+ * Build the params object for the native completion call. Shared between
471
+ * the legacy `generate()` path and the new `generateStream()` path so the
472
+ * cache-key + stop-sequence wiring lives in one place.
473
+ */
474
+ buildNativeParams(options) {
475
+ var _a, _b;
151
476
  const params = {
152
- n_predict: (_a = options.maxTokens) !== null && _a !== void 0 ? _a : 2048,
153
- temperature: (_b = options.temperature) !== null && _b !== void 0 ? _b : 0.7,
154
- top_p: (_c = options.topP) !== null && _c !== void 0 ? _c : 0.9,
477
+ n_predict: resolveMobileMaxTokens(options.maxTokens),
478
+ temperature: (_a = options.temperature) !== null && _a !== void 0 ? _a : 0.7,
479
+ top_p: (_b = options.topP) !== null && _b !== void 0 ? _b : 0.9,
155
480
  };
156
481
  if (options.stopSequences && options.stopSequences.length > 0) {
157
482
  params.stop = options.stopSequences;
@@ -159,26 +484,294 @@ class CapacitorLlamaAdapter {
159
484
  if (options.stream) {
160
485
  params.emit_partial_completion = true;
161
486
  }
162
- const started = Date.now();
163
- const result = await this.plugin.generateText({
164
- contextId: CONTEXT_ID,
165
- prompt: options.prompt,
166
- params,
487
+ // Cache key threading: surface the slot id derived from
488
+ // ProviderCachePlan.promptCacheKey to the native side. Stock
489
+ // llama-cpp-capacitor builds ignore the field; the patched fork build
490
+ // reads it via setCacheType / completion params and pins KV slots.
491
+ if (options.cacheKey) {
492
+ const slotId = deriveCacheSlotId(options.cacheKey);
493
+ params.cache_prompt = true;
494
+ params.slot_id = slotId;
495
+ }
496
+ return params;
497
+ }
498
+ /**
499
+ * Invoke the native completion (or generateText) entry point with a
500
+ * pre-built params bag. Returns the raw native result; callers map this
501
+ * to `GenerateResult` or to a `done` event.
502
+ */
503
+ async runNativeCompletion(options, params) {
504
+ var _a;
505
+ const plugin = this.plugin;
506
+ if (!plugin) {
507
+ throw new Error("No model loaded. Call load() first.");
508
+ }
509
+ const contextId = this.requireContextId();
510
+ const result = typeof plugin.completion === "function"
511
+ ? await plugin.completion({
512
+ contextId,
513
+ params: Object.assign({ prompt: options.prompt, emit_partial_completion: Boolean(params.emit_partial_completion) }, params),
514
+ })
515
+ : await ((_a = plugin.generateText) === null || _a === void 0 ? void 0 : _a.call(plugin, {
516
+ contextId,
517
+ prompt: options.prompt,
518
+ params,
519
+ }));
520
+ if (!result) {
521
+ throw new Error("llama-cpp-capacitor did not expose completion() or generateText()");
522
+ }
523
+ return result;
524
+ }
525
+ /**
526
+ * Native bridges currently don't honour per-generation sampler-stage
527
+ * injection — the Swift / Kotlin side needs separate wiring. Until that
528
+ * lands we log once per stage and otherwise pass through. The stages
529
+ * remain in the options object so downstream observers (telemetry,
530
+ * tests) can still see them.
531
+ */
532
+ logUnwiredSamplerStages(stages) {
533
+ if (!stages || stages.length === 0)
534
+ return;
535
+ for (const stage of stages) {
536
+ console.debug(`[capacitor-llama] sampler stage "${stage.kind}" received but not yet wired in native bridge`);
537
+ }
538
+ }
539
// generate(options): drains generateStream(options) and resolves to
// { text, promptTokens, outputTokens, durationMs } plus ttftMs when at least one
// token event was seen.
// - Rejects with Error(message) if the stream yielded an "error" event (the last
//   one wins); anything thrown while iterating is re-thrown after cleanup.
// - Counters and the final text come from this.lastCompletionStats; when that is
//   null the counters stay 0 and text is the concatenation of token events.
// NOTE(review): lastCompletionStats is instance state, so overlapping generate()
// calls on one adapter would presumably overwrite each other's stats — confirm
// that callers serialize generations.
+ async generate(options) {
540
+ var _a, e_1, _b, _c;
541
+ // Wrapper over `generateStream` so the cache-key, stop-sequence, and
542
+ // native-call wiring lives in exactly one place. Drains the stream
543
+ // into the legacy `GenerateResult` shape; per-token events surface to
544
+ // any `onToken` listener via the native event bridge (unchanged).
545
+ let text = "";
546
+ let promptTokens = 0;
547
+ let outputTokens = 0;
548
+ let durationMs = 0;
549
+ let lastError = null;
550
+ // Wall-clock time-to-first-token: from the call start to the first decoded
551
+ // token event. This is the on-device prefill wall-clock the resource
552
+ // workbench differences into prefill vs decode throughput. Stays undefined
553
+ // when the generation yields no tokens.
554
+ const startedAt = Date.now();
555
+ let ttftMs;
556
+ try {
// Down-levelled `for await (const event of this.generateStream(options))`.
// _a records iterator completion and _d is false only while the loop body runs;
// the finally block uses both to decide whether the iterator's return() must be called.
557
+ for (var _d = true, _e = __asyncValues(this.generateStream(options)), _f; _f = await _e.next(), _a = _f.done, !_a; _d = true) {
558
+ _c = _f.value;
559
+ _d = false;
560
+ const event = _c;
561
+ if (event.kind === "token") {
562
+ if (ttftMs === undefined)
563
+ ttftMs = Date.now() - startedAt;
564
+ text += event.text;
565
+ }
566
+ else if (event.kind === "telemetry") {
567
+ // Native bridge currently emits no telemetry events; ignored here
568
+ // because the final `done` event carries the authoritative totals.
569
+ }
570
+ else if (event.kind === "error") {
571
+ lastError = event.message;
572
+ }
573
+ else if (event.kind === "done") {
574
+ // The done payload's authoritative fields come from the
575
+ // closed-over scope below — set when the native call returns.
576
+ }
577
+ }
578
+ }
579
+ catch (e_1_1) { e_1 = { error: e_1_1 }; }
580
+ finally {
581
+ try {
582
+ if (!_d && !_a && (_b = _e.return)) await _b.call(_e);
583
+ }
584
+ finally { if (e_1) throw e_1.error; }
585
+ }
586
+ if (lastError)
587
+ throw new Error(lastError);
588
+ // Re-read native counters from the cached completion result. We stored
589
+ // them on `this.lastCompletionStats` inside the stream's lifecycle.
590
+ const stats = this.lastCompletionStats;
591
+ if (stats) {
592
+ promptTokens = stats.promptTokens;
593
+ outputTokens = stats.outputTokens;
594
+ durationMs = stats.durationMs;
595
+ if (stats.text) {
596
+ // The native call's authoritative text. Use it instead of the
597
+ // token-event-assembled string so callers see exactly what the
598
+ // bridge produced (some bridges only emit tokens, others emit
599
+ // partial+final; assembled text isn't always equal).
600
+ text = stats.text;
601
+ }
602
+ }
// ttftMs is added to the result only when a token event was observed.
603
+ return Object.assign({ text,
604
+ promptTokens,
605
+ outputTokens,
606
+ durationMs }, (ttftMs !== undefined ? { ttftMs } : {}));
607
+ }
608
+ /**
609
+ * Streaming generation. Subscribes to the native token event bridge,
610
+ * starts the completion call, and yields typed `GenerationEvent`s as
611
+ * tokens arrive. The stream ends with exactly one `done` event (or one
612
+ * terminal `error`) once the native call resolves.
613
+ *
614
+ * Sampler-stage injection (`samplerStages`) and the per-generation
615
+ * spec-decode toggle (`specDecode`) are accepted but currently pass
616
+ * through unchanged on the JS side — the Swift / Kotlin bridge wiring is tracked
617
+ * separately. They flow through as part of the options bag so the
618
+ * native side can pick them up without an interface change.
619
+ */
620
+ generateStream(options) {
621
+ return __asyncGenerator(this, arguments, function* generateStream_1() {
622
+ var _a;
623
+ if (!this.plugin || !this.loadedPath) {
624
+ throw new Error("No model loaded. Call load() first.");
625
+ }
626
+ this.tokenIndex = 0;
627
+ this.lastCompletionStats = null;
628
+ this.logUnwiredSamplerStages(options.samplerStages);
629
+ const queue = [];
630
+ let waiter = null;
631
+ const wake = () => {
632
+ if (waiter) {
633
+ const w = waiter;
634
+ waiter = null;
635
+ w();
636
+ }
637
+ };
638
+ const push = (event) => {
639
+ queue.push(event);
640
+ wake();
641
+ };
642
+ // Subscribe to per-token events. The native bridge fires
643
+ // `@LlamaCpp_onToken`; our existing class-level listener forwards into
644
+ // every `onToken(listener)` consumer. We register one more listener
645
+ // here, scoped to this stream, that converts strings into `token`
646
+ // events.
647
+ const unsubscribe = this.onToken((tokenText, index) => {
648
+ push({ kind: "token", text: tokenText, index });
649
+ });
650
+ const params = this.buildNativeParams(Object.assign(Object.assign({}, options), {
651
+ // generateStream implies streaming — force on so the bridge emits
652
+ // partial completions even when the caller didn't set `stream: true`
653
+ // on the legacy options bag.
654
+ stream: true }));
655
+ const started = Date.now();
656
+ let completionPromise;
657
+ try {
658
+ completionPromise = this.runNativeCompletion(options, params);
659
+ }
660
+ catch (err) {
661
+ unsubscribe();
662
+ const message = err instanceof Error ? err.message : String(err);
663
+ yield yield __await({ kind: "error", message, recoverable: false });
664
+ yield yield __await({ kind: "done", finishReason: "error" });
665
+ return yield __await(void 0);
666
+ }
667
+ // Wrapped in an object so TS's control-flow analysis doesn't widen the
668
+ // closed-over assignments back to `null`/`never` when we read them
669
+ // after the loop. (Plain `let` with `null` init narrows badly after
670
+ // an async assignment.)
671
+ const completionState = { result: null, error: null, done: false };
672
+ completionPromise
673
+ .then((result) => {
674
+ completionState.result = result;
675
+ })
676
+ .catch((err) => {
677
+ completionState.error =
678
+ err instanceof Error ? err : { message: String(err) };
679
+ })
680
+ .finally(() => {
681
+ completionState.done = true;
682
+ wake();
683
+ });
684
+ try {
685
+ while (true) {
686
+ if (queue.length > 0) {
687
+ yield yield __await(queue.shift());
688
+ continue;
689
+ }
690
+ if (completionState.done)
691
+ break;
692
+ yield __await(new Promise((resolve) => {
693
+ waiter = resolve;
694
+ }));
695
+ }
696
+ }
697
+ finally {
698
+ unsubscribe();
699
+ }
700
+ if (completionState.error) {
701
+ yield yield __await({
702
+ kind: "error",
703
+ message: completionState.error.message,
704
+ recoverable: false,
705
+ });
706
+ yield yield __await({ kind: "done", finishReason: "error" });
707
+ return yield __await(void 0);
708
+ }
709
+ if (completionState.result) {
710
+ const r = completionState.result;
711
+ const duration = ((_a = r.timings) === null || _a === void 0 ? void 0 : _a.predicted_ms) != null
712
+ ? Math.round(r.timings.predicted_ms)
713
+ : Date.now() - started;
714
+ this.lastCompletionStats = {
715
+ text: r.text,
716
+ promptTokens: r.tokens_evaluated,
717
+ outputTokens: r.tokens_predicted,
718
+ durationMs: duration,
719
+ };
720
+ // Reason heuristic: native fork doesn't expose a finish-reason
721
+ // enum yet. "stop" is the dominant case; "length" when we hit the
722
+ // requested n_predict ceiling exactly. Tool/cancel/error are
723
+ // emitted by the explicit paths above and aren't reachable here.
724
+ const requested = resolveMobileMaxTokens(options.maxTokens);
725
+ const finishReason = r.tokens_predicted >= requested ? "length" : "stop";
726
+ yield yield __await({ kind: "done", finishReason });
727
+ return yield __await(void 0);
728
+ }
729
+ // Native call resolved with no payload and no error — defensive
730
+ // terminal event so the consumer's `for await` always ends cleanly.
731
+ yield yield __await({ kind: "done", finishReason: "stop" });
167
732
  });
168
- const duration = ((_d = result.timings) === null || _d === void 0 ? void 0 : _d.predicted_ms) != null
169
- ? Math.round(result.timings.predicted_ms)
170
- : Date.now() - started;
171
- return {
172
- text: result.text,
173
- promptTokens: result.tokens_evaluated,
174
- outputTokens: result.tokens_predicted,
175
- durationMs: duration,
176
- };
733
+ }
734
+ async setDrafter(drafterPath) {
735
+ // The native bridge has no live-swap entry point yet; the drafter is
736
+ // bound at `load()` time via `LoadOptions.draftModelPath`. Log so the
737
+ // call-site is observable, and leave the loaded context unchanged.
738
+ console.warn(`[capacitor-llama] setDrafter(${drafterPath !== null && drafterPath !== void 0 ? drafterPath : "null"}) not yet supported by native bridge; pass draftModelPath to load() instead`);
739
+ }
740
+ async trimMemory(level) {
741
+ // No native hook yet — log so the runtime's pressure plumbing can see
742
+ // the adapter received the signal. Major pressure also clears the
743
+ // token-listener bookkeeping to drop any orphaned callbacks.
744
+ if (level === "major") {
745
+ this.tokenListeners.clear();
746
+ }
747
+ console.debug(`[capacitor-llama] trimMemory(${level}) — bridge hook unavailable`);
177
748
  }
178
749
  async cancelGenerate() {
179
- if (!this.plugin)
750
+ if (!this.plugin || this.contextId === null)
180
751
  return;
181
- await this.plugin.stopCompletion({ contextId: CONTEXT_ID });
752
+ await this.plugin.stopCompletion({ contextId: this.contextId });
753
+ }
754
+ /**
755
+ * Round-trip to the loaded GGUF's native chat template via
756
+ * `LlamaCpp.getFormattedChat`. The plugin's Java side serializes
757
+ * `messages` as a JSON string and invokes
758
+ * `cap_format_chat()` → `llama_chat_apply_template()`. Returns the
759
+ * rendered prompt (or null when the GGUF has no template metadata).
760
+ */
761
+ async formatChat(messages) {
762
+ var _a;
763
+ if (!this.plugin || !this.loadedPath) {
764
+ throw new Error("No model loaded. Call load() first.");
765
+ }
766
+ if (typeof this.plugin.getFormattedChat !== "function") {
767
+ return null;
768
+ }
769
+ const result = await this.plugin.getFormattedChat({
770
+ contextId: this.requireContextId(),
771
+ messages: JSON.stringify(messages),
772
+ params: { jinja: true },
773
+ });
774
+ return (_a = result.prompt) !== null && _a !== void 0 ? _a : null;
182
775
  }
183
776
  async embed(options) {
184
777
  var _a;
@@ -191,8 +784,9 @@ class CapacitorLlamaAdapter {
191
784
  const params = {
192
785
  embd_normalize: (_a = options.embdNormalize) !== null && _a !== void 0 ? _a : 0,
193
786
  };
787
+ const contextId = this.requireContextId();
194
788
  const result = await this.plugin.embedding({
195
- contextId: CONTEXT_ID,
789
+ contextId,
196
790
  text: options.input,
197
791
  params,
198
792
  });
@@ -200,7 +794,7 @@ class CapacitorLlamaAdapter {
200
794
  if (typeof this.plugin.tokenize === "function") {
201
795
  try {
202
796
  const tokenized = await this.plugin.tokenize({
203
- contextId: CONTEXT_ID,
797
+ contextId,
204
798
  text: options.input,
205
799
  });
206
800
  tokenCount = tokenized.tokens.length;
@@ -232,22 +826,69 @@ class CapacitorLlamaAdapter {
232
826
  this.pluginLoadPromise = null;
233
827
  }
234
828
  }
829
+ /**
830
+ * Default singleton kept for back-compat with device-bridge-client and
831
+ * hardware-probe callers that don't distinguish chat vs embedding roles.
832
+ * The runtime's `localInferenceLoader` service uses per-role instances
833
+ * instead — see `registerCapacitorLlamaLoader`.
834
+ */
235
835
  export const capacitorLlama = new CapacitorLlamaAdapter();
836
/**
 * Lightweight heuristic for routing a `loadModel(modelPath)` call to either
 * the chat adapter or the embedding adapter. Embedding GGUFs the runtime
 * ships or that users typically install for `TEXT_EMBEDDING` carry one of
 * these markers in the path. Anything else is assumed to be a generative
 * chat model.
 *
 * Matching is case-insensitive. Model-family markers (`bge-`, `bge_`,
 * `nomic-embed`, `all-minilm`, `gte-`, `e5-`) must begin a name token, i.e.
 * sit at the start of the path or directly after a non-alphanumeric
 * character, so a chat model such as `granite5-2b` is not misrouted by the
 * `e5-` marker. The `/embedding/` directory marker and the
 * `embedding.gguf` suffix are matched as plain substrings/suffixes.
 *
 * @param {string} modelPath path (or file name) of the GGUF being loaded.
 * @returns {boolean} true when the path looks like an embedding model;
 *   false otherwise, including for missing or non-string input so the
 *   adapter's own `load()` validation can report the problem.
 */
function looksLikeEmbeddingModelPath(modelPath) {
    if (typeof modelPath !== "string" || modelPath.length === 0) {
        return false;
    }
    const lowered = modelPath.toLowerCase();
    if (lowered.includes("/embedding/") || lowered.endsWith("embedding.gguf")) {
        return true;
    }
    // No `g`/`y` flag, so `.test()` carries no `lastIndex` state between calls.
    const familyMarker = /(?:^|[^a-z0-9])(?:bge[-_]|nomic-embed|all-minilm|gte-|e5-)/;
    return familyMarker.test(lowered);
}
236
854
  export function registerCapacitorLlamaLoader(runtime) {
237
855
  if (typeof runtime.registerService !== "function")
238
856
  return;
857
+ // Two distinct adapter instances so the chat LLM and embedding model
858
+ // each allocate their own native context id. This is the fix for
859
+ // elizaOS/eliza#7681 — the previous single-adapter design routed every
860
+ // operation through CONTEXT_ID=1, and a `completion(contextId=1)` call
861
+ // would resolve to whichever model registered against id 1 last
862
+ // (typically the bge-small embedding model on Android), emitting
863
+ // `[unused{N}]` / `[PAD]` reserved tokens.
864
+ const chatAdapter = new CapacitorLlamaAdapter();
865
+ const embeddingAdapter = new CapacitorLlamaAdapter();
866
+ function adapterFor(modelPath) {
867
+ return looksLikeEmbeddingModelPath(modelPath)
868
+ ? embeddingAdapter
869
+ : chatAdapter;
870
+ }
239
871
  runtime.registerService("localInferenceLoader", {
240
872
  async loadModel(args) {
241
- await capacitorLlama.load({ modelPath: args.modelPath });
873
+ await adapterFor(args.modelPath).load(args);
242
874
  },
243
875
  async unloadModel() {
244
- await capacitorLlama.unload();
876
+ // Each adapter manages its own context lifecycle inside
877
+ // `load()` (releasing the prior context before reinitializing on the
878
+ // same id). Tearing down both adapters here would defeat the
879
+ // per-instance routing — `ensureAssignedModelLoaded` calls
880
+ // `unloadModel()` before every `loadModel()` on the assumption of
881
+ // single-model behaviour, and we must not let that unconditionally
882
+ // kill the embedding adapter when only the chat model is swapping.
245
883
  },
246
884
  currentModelPath() {
247
- return capacitorLlama.currentModelPath();
885
+ var _a;
886
+ // The chat path is the primary "active" model from the runtime's
887
+ // perspective; embedding is treated as a sidecar.
888
+ return ((_a = chatAdapter.currentModelPath()) !== null && _a !== void 0 ? _a : embeddingAdapter.currentModelPath());
248
889
  },
249
890
  async generate(args) {
250
- const result = await capacitorLlama.generate({
891
+ const result = await chatAdapter.generate({
251
892
  prompt: args.prompt,
252
893
  stopSequences: args.stopSequences,
253
894
  maxTokens: args.maxTokens,
@@ -256,7 +897,7 @@ export function registerCapacitorLlamaLoader(runtime) {
256
897
  return result.text;
257
898
  },
258
899
  async embed(args) {
259
- return capacitorLlama.embed({ input: args.input });
900
+ return embeddingAdapter.embed({ input: args.input });
260
901
  },
261
902
  });
262
903
  }