@elizaos/capacitor-llama 0.1.0 → 2.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -43,7 +43,7 @@ transparently.
43
43
  registerCapacitorLlamaLoader(runtime);
44
44
  ```
45
45
 
46
- 3. Run `npx cap sync` in `apps/app` to pick up the native plugin. iOS and
46
+ 3. Run `bunx cap sync` in `apps/app` to pick up the native plugin. iOS and
47
47
  Android builds will pull in `llama-cpp-capacitor`'s prebuilt native
48
48
  libraries automatically.
49
49
 
@@ -1,4 +1,22 @@
1
1
  const CONTEXT_ID = 1;
2
+ const DEFAULT_MAX_TOKENS = 256;
3
+ /**
4
+ * Mobile-side parallel slot count. Mirrors `DEFAULT_CACHE_PARALLEL` in
5
+ * `cache-bridge.ts`; on devices with constrained KV memory we keep a small
6
+ * fixed pool so distinct cacheKey values still get prefix reuse without
7
+ * blowing memory.
8
+ */
9
+ const MOBILE_PARALLEL = 4;
10
/**
 * Maps a prompt cache key onto one of the `MOBILE_PARALLEL` KV slots.
 *
 * The key's UTF-16 code units are hashed with 32-bit FNV-1a, which is
 * deterministic across platforms; the original note states this matches the
 * agent side. The signed 32-bit digest is folded into a slot index by taking
 * its absolute value modulo the pool size.
 *
 * @param {string} key - Cache key to hash.
 * @returns {number} Slot index in the range [0, MOBILE_PARALLEL).
 */
function deriveCacheSlotId(key) {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  let digest = FNV_OFFSET_BASIS;
  let position = 0;
  while (position < key.length) {
    // XOR the code unit in first, then multiply (FNV-1a order).
    digest = Math.imul(digest ^ key.charCodeAt(position), FNV_PRIME);
    position += 1;
  }
  const signedDigest = digest | 0;
  return Math.abs(signedDigest) % MOBILE_PARALLEL;
}
19
+ const MOBILE_MAX_TOKENS_CAP = 256;
2
20
  function isObject(value) {
3
21
  return typeof value === "object" && value !== null;
4
22
  }
@@ -7,7 +25,8 @@ function isLlamaCppPluginLike(value) {
7
25
  typeof value.initContext === "function" &&
8
26
  typeof value.releaseContext === "function" &&
9
27
  typeof value.releaseAllContexts === "function" &&
10
- typeof value.generateText === "function" &&
28
+ (typeof value.completion === "function" ||
29
+ typeof value.generateText === "function") &&
11
30
  typeof value.stopCompletion === "function" &&
12
31
  typeof value.addListener === "function");
13
32
  }
@@ -23,6 +42,42 @@ function resolveLlamaCppPlugin(mod) {
23
42
  }
24
43
  return null;
25
44
  }
45
/**
 * Wraps a native LlamaCpp plugin object in a plain object of forwarding
 * closures.
 *
 * Required methods are always forwarded. Optional methods are exposed only
 * when the native plugin has a function under that name at wrap time;
 * otherwise the property is present with the value `undefined`. Every
 * forwarder invokes the native method with the native plugin as `this`.
 *
 * @param {object} plugin - Native plugin object to wrap.
 * @returns {object} Plain object with the same method names.
 */
function toPlainLlamaCppPlugin(plugin) {
  // Forwarder for optional methods that take no arguments.
  const forwardNoArgs = (method) => {
    if (typeof plugin[method] !== "function") {
      return undefined;
    }
    return () => {
      // Re-read at call time; yields undefined if the method has gone away.
      const impl = plugin[method];
      return impl === null || impl === undefined ? undefined : impl.call(plugin);
    };
  };
  // Forwarder for optional methods that take a single options argument.
  const forwardOptions = (method) => {
    if (typeof plugin[method] !== "function") {
      return undefined;
    }
    return (options) => {
      const impl = plugin[method];
      return impl === null || impl === undefined
        ? undefined
        : impl.call(plugin, options);
    };
  };
  return {
    initContext: (options) => plugin.initContext(options),
    releaseContext: (options) => plugin.releaseContext(options),
    releaseAllContexts: () => plugin.releaseAllContexts(),
    getHardwareInfo: forwardNoArgs("getHardwareInfo"),
    completion: forwardOptions("completion"),
    generateText: forwardOptions("generateText"),
    stopCompletion: (options) => plugin.stopCompletion(options),
    embedding: forwardOptions("embedding"),
    tokenize: forwardOptions("tokenize"),
    setCacheType: forwardOptions("setCacheType"),
    setSpecType: forwardOptions("setSpecType"),
    getNativeKernels: forwardNoArgs("getNativeKernels"),
    addListener: (event, listener) => plugin.addListener(event, listener),
  };
}
26
81
  function isCapacitorNative() {
27
82
  var _a;
28
83
  const cap = globalThis.Capacitor;
@@ -38,6 +93,110 @@ function detectPlatform() {
38
93
  return "android";
39
94
  return "web";
40
95
  }
96
/**
 * Resolves the token budget for a mobile generation request.
 *
 * The request is floored to a whole number and capped at
 * `MOBILE_MAX_TOKENS_CAP`. Anything that does not yield at least one whole
 * token falls back to `DEFAULT_MAX_TOKENS`: missing values, non-numbers,
 * NaN/Infinity, zero, negatives, and fractions below 1. The last case used to
 * be floored to 0 and returned as a zero-token budget.
 *
 * @param {number|null|undefined} requested - Caller-supplied max token count.
 * @returns {number} A positive integer no larger than `MOBILE_MAX_TOKENS_CAP`,
 *   assuming `DEFAULT_MAX_TOKENS` is itself within the cap.
 */
function resolveMobileMaxTokens(requested) {
  // Number.isFinite is false for null, undefined and every non-number.
  if (!Number.isFinite(requested)) {
    return DEFAULT_MAX_TOKENS;
  }
  const wholeTokens = Math.floor(requested);
  // Check after flooring so values such as 0.5 do not become a 0 budget.
  if (wholeTokens < 1) {
    return DEFAULT_MAX_TOKENS;
  }
  return Math.min(wholeTokens, MOBILE_MAX_TOKENS_CAP);
}
102
/**
 * Narrows an untyped value to a finite number.
 *
 * @param {unknown} value - Value of unknown type.
 * @returns {number|null} The value when it is a finite number, else `null`.
 */
function numberFromUnknown(value) {
  const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
  return isFiniteNumber ? value : null;
}
107
/**
 * Narrows an untyped value to a boolean.
 *
 * @param {unknown} value - Value of unknown type.
 * @returns {boolean|undefined} The value when it is a primitive boolean,
 *   else `undefined`.
 */
function booleanFromUnknown(value) {
  if (value === true || value === false) {
    return value;
  }
  return undefined;
}
110
/**
 * Narrows an untyped value to a trimmed, non-empty string.
 *
 * @param {unknown} value - Value of unknown type.
 * @returns {string|undefined} The trimmed string, or `undefined` when the
 *   value is not a string or is empty after trimming.
 */
function stringFromUnknown(value) {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
115
/**
 * Builds a best-effort hardware description without calling the native plugin.
 *
 * RAM and core counts are read from `globalThis.navigator` when present and
 * default to 0. The GPU backend is assumed from the platform: "metal" on ios,
 * "vulkan" on android, none otherwise. DFlash is always reported as
 * unsupported, with `reason` recorded as the explanation.
 *
 * @param {string} [platform] - Platform id; defaults to `detectPlatform()`.
 * @param {string} [reason] - Text stored in `dflashReason`.
 * @returns {object} Hardware info marked with `source: "adapter-fallback"`.
 */
function fallbackHardwareInfo(platform = detectPlatform(), reason = "native hardware probe unavailable") {
  const nav = globalThis.navigator;
  const hasNavigator = nav !== null && nav !== undefined;

  const reportedRamGb = numberFromUnknown(hasNavigator ? nav.deviceMemory : undefined);
  const totalRamGb = reportedRamGb !== null && reportedRamGb !== undefined ? reportedRamGb : 0;

  const reportedCores = hasNavigator ? nav.hardwareConcurrency : undefined;
  const cpuCores = reportedCores !== null && reportedCores !== undefined ? reportedCores : 0;

  let gpu = null;
  if (platform === "ios") {
    gpu = { backend: "metal", available: true };
  }
  else if (platform === "android") {
    gpu = { backend: "vulkan", available: true };
  }

  return {
    platform,
    deviceModel: platform,
    totalRamGb,
    availableRamGb: null,
    cpuCores,
    gpu,
    gpuSupported: platform !== "web",
    dflashSupported: false,
    dflashReason: reason,
    source: "adapter-fallback",
    nativeKernels: [],
    forkVariant: null,
  };
}
139
/**
 * Validates a reported llama.cpp fork variant.
 *
 * @param {unknown} value - Variant marker of unknown type.
 * @returns {"buun-llama-cpp"|"stock-llama-cpp"|null|undefined} The value when
 *   it is a known variant name or an explicit `null`; `undefined` otherwise.
 */
function normalizeForkVariant(value) {
  switch (value) {
    case "buun-llama-cpp":
    case "stock-llama-cpp":
      return value;
    case null:
      return null;
    default:
      return undefined;
  }
}
146
/**
 * Narrows an untyped value to an array of non-empty strings.
 *
 * @param {unknown} value - Value of unknown type.
 * @returns {string[]|undefined} A new array holding only the non-empty string
 *   entries, or `undefined` when the value is not an array.
 */
function stringArrayFromUnknown(value) {
  if (!Array.isArray(value)) {
    return undefined;
  }
  return value.filter((entry) => typeof entry === "string" && entry.length > 0);
}
156
/**
 * Validates a hardware report from the native plugin, field by field.
 *
 * A falsy report yields the adapter fallback unchanged. Otherwise each field
 * is kept only when it has the expected type or one of the expected literal
 * values. Required fields fall back to the matching fallback value, and
 * optional fields are left off the result when absent or invalid.
 *
 * @param {object|null|undefined} value - Raw report from the native plugin.
 * @param {string} [platform] - Platform id; defaults to `detectPlatform()`.
 * @returns {object} Normalized hardware info.
 */
function normalizeHardwareInfo(value, platform = detectPlatform()) {
  const fallback = fallbackHardwareInfo(platform);
  if (!value) {
    return fallback;
  }

  // Returns `candidate` unless it is null or undefined.
  const orElse = (candidate, alternative) => candidate !== null && candidate !== undefined ? candidate : alternative;
  const PLATFORMS = ["ios", "android", "web"];
  const GPU_BACKENDS = ["metal", "vulkan", "gpu-delegate"];
  const THERMAL_STATES = ["nominal", "fair", "serious", "critical", "unknown"];

  let gpu = fallback.gpu;
  if (value.gpu && isObject(value.gpu)) {
    const fallbackBackend = fallback.gpu === null || fallback.gpu === undefined
      ? undefined
      : fallback.gpu.backend;
    gpu = {
      backend: GPU_BACKENDS.includes(value.gpu.backend)
        ? value.gpu.backend
        : orElse(fallbackBackend, "gpu-delegate"),
      available: Boolean(value.gpu.available),
    };
  }

  // Properties are assigned in a fixed order so key order stays stable.
  const info = {};
  info.platform = PLATFORMS.includes(value.platform) ? value.platform : platform;
  info.deviceModel = orElse(stringFromUnknown(value.deviceModel), fallback.deviceModel);

  const machineId = stringFromUnknown(value.machineId);
  if (machineId) {
    info.machineId = machineId;
  }
  const osVersion = stringFromUnknown(value.osVersion);
  if (osVersion) {
    info.osVersion = osVersion;
  }
  if (typeof value.isSimulator === "boolean") {
    info.isSimulator = value.isSimulator;
  }

  info.totalRamGb = orElse(numberFromUnknown(value.totalRamGb), fallback.totalRamGb);
  // An explicit null is kept as null rather than replaced by the fallback.
  info.availableRamGb = value.availableRamGb === null
    ? null
    : orElse(numberFromUnknown(value.availableRamGb), fallback.availableRamGb);

  const freeStorageGb = numberFromUnknown(value.freeStorageGb);
  if (freeStorageGb !== null) {
    info.freeStorageGb = freeStorageGb;
  }

  info.cpuCores = orElse(numberFromUnknown(value.cpuCores), fallback.cpuCores);
  info.gpu = gpu;
  info.gpuSupported = orElse(booleanFromUnknown(value.gpuSupported), fallback.gpuSupported);

  if (typeof value.lowPowerMode === "boolean") {
    info.lowPowerMode = value.lowPowerMode;
  }
  if (THERMAL_STATES.includes(value.thermalState)) {
    info.thermalState = value.thermalState;
  }

  info.dflashSupported = Boolean(value.dflashSupported);
  // A reason is supplied only when DFlash is unsupported and none was given.
  info.dflashReason = orElse(stringFromUnknown(value.dflashReason), value.dflashSupported
    ? undefined
    : "native plugin did not report DFlash support");
  info.source = value.source === "native" ? "native" : "adapter-fallback";
  info.nativeKernels = orElse(stringArrayFromUnknown(value.nativeKernels), []);
  info.forkVariant = orElse(normalizeForkVariant(value.forkVariant), null);
  return info;
}
41
200
  class CapacitorLlamaAdapter {
42
201
  constructor() {
43
202
  this.plugin = null;
@@ -54,10 +213,11 @@ class CapacitorLlamaAdapter {
54
213
  if (this.pluginLoadPromise)
55
214
  return this.pluginLoadPromise;
56
215
  this.pluginLoadPromise = (async () => {
57
- const plugin = resolveLlamaCppPlugin(await import("llama-cpp-capacitor"));
58
- if (!plugin) {
59
- throw new Error("llama-cpp-capacitor did not expose an initContext method");
216
+ const nativePlugin = resolveLlamaCppPlugin(await import("llama-cpp-capacitor"));
217
+ if (!nativePlugin) {
218
+ throw new Error("llama-cpp-capacitor did not expose the native LlamaCpp methods");
60
219
  }
220
+ const plugin = toPlainLlamaCppPlugin(nativePlugin);
61
221
  const tokenListenerHandle = await plugin.addListener("@LlamaCpp_onToken", (data) => {
62
222
  var _a, _b;
63
223
  const token = (_b = (_a = data.tokenResult) === null || _a === void 0 ? void 0 : _a.token) !== null && _b !== void 0 ? _b : data.token;
@@ -86,19 +246,73 @@ class CapacitorLlamaAdapter {
86
246
  }
87
247
  }
88
248
  async getHardwareInfo() {
89
- var _a;
249
+ var _a, _b, _c;
90
250
  const platform = detectPlatform();
91
- const nav = globalThis
92
- .navigator;
93
- return {
94
- platform,
95
- deviceModel: platform,
96
- totalRamGb: 0,
97
- availableRamGb: null,
98
- cpuCores: (_a = nav === null || nav === void 0 ? void 0 : nav.hardwareConcurrency) !== null && _a !== void 0 ? _a : 0,
99
- gpu: null,
100
- gpuSupported: platform !== "web",
101
- };
251
+ if (!isCapacitorNative())
252
+ return fallbackHardwareInfo(platform);
253
+ try {
254
+ const plugin = await this.loadPlugin();
255
+ const baseInfo = normalizeHardwareInfo(await ((_a = plugin.getHardwareInfo) === null || _a === void 0 ? void 0 : _a.call(plugin)), platform);
256
+ // Probe fork-specific kernels through the optional bridge method.
257
+ // Stock builds and older fork builds without the bridge fall back
258
+ // to the empty list + "stock-llama-cpp" variant marker.
259
+ let nativeKernels = (_b = baseInfo.nativeKernels) !== null && _b !== void 0 ? _b : [];
260
+ let forkVariant = (_c = baseInfo.forkVariant) !== null && _c !== void 0 ? _c : "stock-llama-cpp";
261
+ if (typeof plugin.getNativeKernels === "function") {
262
+ try {
263
+ const probe = await plugin.getNativeKernels();
264
+ const kernels = stringArrayFromUnknown(probe === null || probe === void 0 ? void 0 : probe.kernels);
265
+ if (kernels)
266
+ nativeKernels = kernels;
267
+ const variant = normalizeForkVariant(probe === null || probe === void 0 ? void 0 : probe.variant);
268
+ if (variant !== undefined)
269
+ forkVariant = variant;
270
+ else if (nativeKernels.length > 0)
271
+ forkVariant = "buun-llama-cpp";
272
+ }
273
+ catch (err) {
274
+ const message = err instanceof Error ? err.message : String(err);
275
+ console.debug("[capacitor-llama] getNativeKernels probe failed", {
276
+ error: message,
277
+ });
278
+ }
279
+ }
280
+ return Object.assign(Object.assign({}, baseInfo), { nativeKernels,
281
+ forkVariant });
282
+ }
283
+ catch (error) {
284
+ return fallbackHardwareInfo(platform, error instanceof Error ? error.message : "native hardware probe failed");
285
+ }
286
+ }
287
+ async setCacheType(typeK, typeV) {
288
+ if (!isCapacitorNative()) {
289
+ console.warn("[capacitor-llama] setCacheType called on non-native platform; ignoring");
290
+ return;
291
+ }
292
+ const plugin = await this.loadPlugin();
293
+ if (typeof plugin.setCacheType !== "function") {
294
+ console.warn("[capacitor-llama] underlying plugin does not expose setCacheType (likely stock build); cache types must be passed via load() params instead");
295
+ return;
296
+ }
297
+ await plugin.setCacheType({ cacheTypeK: typeK, cacheTypeV: typeV });
298
+ }
299
+ async setSpecType(args) {
300
+ if (!isCapacitorNative()) {
301
+ console.warn("[capacitor-llama] setSpecType called on non-native platform; ignoring");
302
+ return;
303
+ }
304
+ const plugin = await this.loadPlugin();
305
+ if (typeof plugin.setSpecType !== "function") {
306
+ console.warn("[capacitor-llama] underlying plugin does not expose setSpecType (likely stock build); pass draft_model + draft_min/max via load() instead");
307
+ return;
308
+ }
309
+ await plugin.setSpecType({
310
+ target: args.target,
311
+ drafter: args.drafter,
312
+ specType: args.specType,
313
+ draftMin: args.draftMin,
314
+ draftMax: args.draftMax,
315
+ });
102
316
  }
103
317
  async isLoaded() {
104
318
  return {
@@ -110,7 +324,7 @@ class CapacitorLlamaAdapter {
110
324
  return this.loadedPath;
111
325
  }
112
326
  async load(options) {
113
- var _a, _b;
327
+ var _a, _b, _c, _d, _e, _f;
114
328
  if (!isCapacitorNative()) {
115
329
  throw new Error("capacitor-llama is only available on iOS and Android builds");
116
330
  }
@@ -119,15 +333,21 @@ class CapacitorLlamaAdapter {
119
333
  await plugin.releaseAllContexts();
120
334
  this.loadedPath = null;
121
335
  }
336
+ const speculativeSamples = options.mobileSpeculative
337
+ ? Math.min((_b = (_a = options.speculativeSamples) !== null && _a !== void 0 ? _a : options.draftMax) !== null && _b !== void 0 ? _b : 3, 4)
338
+ : ((_c = options.speculativeSamples) !== null && _c !== void 0 ? _c : 3);
339
+ const params = Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({ model: options.modelPath, n_ctx: (_d = options.contextSize) !== null && _d !== void 0 ? _d : 4096, n_gpu_layers: options.useGpu === false ? 0 : 99, n_threads: (_e = options.maxThreads) !== null && _e !== void 0 ? _e : 0, use_mmap: true, flash_attn: options.useGpu !== false, n_batch: options.mobileSpeculative ? 128 : 512, n_ubatch: options.mobileSpeculative ? 64 : 512 }, (options.draftModelPath
340
+ ? {
341
+ draft_model: options.draftModelPath,
342
+ speculative_samples: speculativeSamples,
343
+ mobile_speculative: (_f = options.mobileSpeculative) !== null && _f !== void 0 ? _f : true,
344
+ }
345
+ : {})), (options.draftContextSize
346
+ ? { n_ctx_draft: options.draftContextSize }
347
+ : {})), (options.draftMin ? { draft_min: options.draftMin } : {})), (options.draftMax ? { draft_max: options.draftMax } : {})), (options.cacheTypeK ? { cache_type_k: options.cacheTypeK } : {})), (options.cacheTypeV ? { cache_type_v: options.cacheTypeV } : {})), (options.disableThinking ? { reasoning: false } : {}));
122
348
  await plugin.initContext({
123
349
  contextId: CONTEXT_ID,
124
- params: {
125
- model: options.modelPath,
126
- n_ctx: (_a = options.contextSize) !== null && _a !== void 0 ? _a : 4096,
127
- n_gpu_layers: options.useGpu === false ? 0 : 99,
128
- n_threads: (_b = options.maxThreads) !== null && _b !== void 0 ? _b : 0,
129
- use_mmap: true,
130
- },
350
+ params,
131
351
  });
132
352
  this.loadedPath = options.modelPath;
133
353
  }
@@ -143,15 +363,15 @@ class CapacitorLlamaAdapter {
143
363
  this.loadedPath = null;
144
364
  }
145
365
  async generate(options) {
146
- var _a, _b, _c, _d;
366
+ var _a, _b, _c, _d, _e;
147
367
  if (!this.plugin || !this.loadedPath) {
148
368
  throw new Error("No model loaded. Call load() first.");
149
369
  }
150
370
  this.tokenIndex = 0;
151
371
  const params = {
152
- n_predict: (_a = options.maxTokens) !== null && _a !== void 0 ? _a : 2048,
153
- temperature: (_b = options.temperature) !== null && _b !== void 0 ? _b : 0.7,
154
- top_p: (_c = options.topP) !== null && _c !== void 0 ? _c : 0.9,
372
+ n_predict: resolveMobileMaxTokens(options.maxTokens),
373
+ temperature: (_a = options.temperature) !== null && _a !== void 0 ? _a : 0.7,
374
+ top_p: (_b = options.topP) !== null && _b !== void 0 ? _b : 0.9,
155
375
  };
156
376
  if (options.stopSequences && options.stopSequences.length > 0) {
157
377
  params.stop = options.stopSequences;
@@ -159,13 +379,32 @@ class CapacitorLlamaAdapter {
159
379
  if (options.stream) {
160
380
  params.emit_partial_completion = true;
161
381
  }
382
+ // Cache key threading: surface the slot id derived from
383
+ // ProviderCachePlan.promptCacheKey to the native side. Stock
384
+ // llama-cpp-capacitor builds ignore the field; the patched fork build
385
+ // reads it via setCacheType / completion params and pins KV slots.
386
+ if (options.cacheKey) {
387
+ const slotId = deriveCacheSlotId(options.cacheKey);
388
+ params.cache_prompt =
389
+ true;
390
+ params.slot_id =
391
+ slotId;
392
+ }
162
393
  const started = Date.now();
163
- const result = await this.plugin.generateText({
164
- contextId: CONTEXT_ID,
165
- prompt: options.prompt,
166
- params,
167
- });
168
- const duration = ((_d = result.timings) === null || _d === void 0 ? void 0 : _d.predicted_ms) != null
394
+ const result = typeof this.plugin.completion === "function"
395
+ ? await this.plugin.completion({
396
+ contextId: CONTEXT_ID,
397
+ params: Object.assign({ prompt: options.prompt, emit_partial_completion: Boolean(params.emit_partial_completion) }, params),
398
+ })
399
+ : await ((_d = (_c = this.plugin).generateText) === null || _d === void 0 ? void 0 : _d.call(_c, {
400
+ contextId: CONTEXT_ID,
401
+ prompt: options.prompt,
402
+ params,
403
+ }));
404
+ if (!result) {
405
+ throw new Error("llama-cpp-capacitor did not expose completion() or generateText()");
406
+ }
407
+ const duration = ((_e = result.timings) === null || _e === void 0 ? void 0 : _e.predicted_ms) != null
169
408
  ? Math.round(result.timings.predicted_ms)
170
409
  : Date.now() - started;
171
410
  return {
@@ -238,7 +477,7 @@ export function registerCapacitorLlamaLoader(runtime) {
238
477
  return;
239
478
  runtime.registerService("localInferenceLoader", {
240
479
  async loadModel(args) {
241
- await capacitorLlama.load({ modelPath: args.modelPath });
480
+ await capacitorLlama.load(args);
242
481
  },
243
482
  async unloadModel() {
244
483
  await capacitorLlama.unload();
@@ -20,6 +20,22 @@ export interface LoadOptions {
20
20
  useGpu?: boolean;
21
21
  /** Cap on native thread count; native layer picks a reasonable default otherwise. */
22
22
  maxThreads?: number;
23
+ /** Optional draft GGUF for native speculative decoding builds. */
24
+ draftModelPath?: string;
25
+ /** Context window for the draft model when supported by the native build. */
26
+ draftContextSize?: number;
27
+ /** Lower/upper speculative draft bounds for fork builds that expose them. */
28
+ draftMin?: number;
29
+ draftMax?: number;
30
+ /** Number of draft tokens/samples when the native runtime supports it. */
31
+ speculativeSamples?: number;
32
+ /** Mobile runtimes may enable a lower-memory speculative path. */
33
+ mobileSpeculative?: boolean;
34
+ /** Optional KV cache types for fork builds such as TurboQuant. */
35
+ cacheTypeK?: string;
36
+ cacheTypeV?: string;
37
+ /** Eliza-1 DFlash drafters are trained for non-thinking outputs. */
38
+ disableThinking?: boolean;
23
39
  }
24
40
  export interface GenerateOptions {
25
41
  prompt: string;
@@ -29,6 +45,13 @@ export interface GenerateOptions {
29
45
  stopSequences?: string[];
30
46
  /** When true, token events fire on the "token" listener. */
31
47
  stream?: boolean;
48
+ /**
49
+ * Forwarded promptCacheKey from `ProviderCachePlan`. Native plugins
50
+ * that support prefix caching should derive a slot id from this and
51
+ * keep KV warm for repeated calls with the same key. Plugins without
52
+ * cache support ignore the field; behavior is unchanged.
53
+ */
54
+ cacheKey?: string;
32
55
  }
33
56
  export interface GenerateResult {
34
57
  text: string;
@@ -40,8 +63,13 @@ export interface HardwareInfo {
40
63
  platform: "ios" | "android" | "web";
41
64
  /** Human-readable device model when the OS exposes one. */
42
65
  deviceModel: string;
66
+ /** Stable OS machine identifier when available, e.g. iPhone16,2. */
67
+ machineId?: string;
68
+ osVersion?: string;
69
+ isSimulator?: boolean;
43
70
  totalRamGb: number;
44
71
  availableRamGb: number | null;
72
+ freeStorageGb?: number | null;
45
73
  cpuCores: number;
46
74
  gpu: {
47
75
  backend: "metal" | "vulkan" | "gpu-delegate";
@@ -49,6 +77,25 @@ export interface HardwareInfo {
49
77
  } | null;
50
78
  /** True when the underlying llama.cpp build has GPU support compiled in. */
51
79
  gpuSupported: boolean;
80
+ lowPowerMode?: boolean;
81
+ thermalState?: "nominal" | "fair" | "serious" | "critical" | "unknown";
82
+ /** True only when the native build can load a drafter and run DFlash/spec decode. */
83
+ dflashSupported?: boolean;
84
+ dflashReason?: string;
85
+ source?: "native" | "adapter-fallback";
86
+ /**
87
+ * Names of fork-specific kernels compiled into the loaded native library
88
+ * (e.g. "turbo3", "turbo4", "turbo3_tcq", "dflash", "qjl_full"). Empty
89
+ * when the loaded build is stock llama.cpp or when no native lib is loaded.
90
+ * Surfaced from the native bridge via a `kernels.json` manifest shipped
91
+ * alongside the .so.
92
+ */
93
+ nativeKernels?: string[];
94
+ /**
95
+ * Which native llama.cpp variant is loaded. `null` when the plugin
96
+ * isn't loaded at all (web fallback or native lib failed to load).
97
+ */
98
+ forkVariant?: "buun-llama-cpp" | "stock-llama-cpp" | null;
52
99
  }
53
100
  export interface EmbedOptions {
54
101
  /** Raw text to embed. The adapter forwards this verbatim to the native plugin. */
@@ -70,6 +117,16 @@ export interface EmbedResult {
70
117
  */
71
118
  tokens: number;
72
119
  }
120
+ export interface SetSpecTypeArgs {
121
+ /** Path to the target (large) GGUF. */
122
+ target: string;
123
+ /** Path to the drafter (small) GGUF. */
124
+ drafter: string;
125
+ /** Currently only "dflash" is honoured by the buun fork. */
126
+ specType: "dflash";
127
+ draftMin: number;
128
+ draftMax: number;
129
+ }
73
130
  export interface LlamaAdapter {
74
131
  getHardwareInfo(): Promise<HardwareInfo>;
75
132
  isLoaded(): Promise<{
@@ -89,4 +146,16 @@ export interface LlamaAdapter {
89
146
  * does not expose an embedding method on the active platform.
90
147
  */
91
148
  embed(options: EmbedOptions): Promise<EmbedResult>;
149
+ /**
150
+ * Configure the KV cache types used by the next loaded context. Only
151
+ * the buun-llama-cpp fork honours TurboQuant cache types like
152
+ * `q4_tq3` / `q4_tq4`. Stock builds will warn-and-no-op when the
153
+ * underlying plugin doesn't expose the bridge method.
154
+ */
155
+ setCacheType?(typeK: string, typeV: string): Promise<void>;
156
+ /**
157
+ * Configure DFlash speculative decoding for the next loaded context.
158
+ * Stock builds without speculative bridge methods warn-and-no-op.
159
+ */
160
+ setSpecType?(args: SetSpecTypeArgs): Promise<void>;
92
161
  }
@@ -14,6 +14,7 @@
14
14
  import { loadCapacitorLlama } from "./load-capacitor-llama";
15
15
  const INITIAL_BACKOFF_MS = 1000;
16
16
  const MAX_BACKOFF_MS = 30000;
17
+ const CONNECT_TIMEOUT_MS = 5000;
17
18
  export class DeviceBridgeClient {
18
19
  constructor(config) {
19
20
  this.socket = null;
@@ -58,7 +59,27 @@ export class DeviceBridgeClient {
58
59
  return;
59
60
  }
60
61
  this.socket = ws;
62
+ let timedOut = false;
63
+ const connectTimeout = setTimeout(() => {
64
+ var _a, _b;
65
+ if (this.stopped ||
66
+ this.socket !== ws ||
67
+ ws.readyState !== WebSocket.CONNECTING) {
68
+ return;
69
+ }
70
+ timedOut = true;
71
+ this.socket = null;
72
+ (_b = (_a = this.config).onStateChange) === null || _b === void 0 ? void 0 : _b.call(_a, "error", "websocket connect timeout");
73
+ try {
74
+ ws.close();
75
+ }
76
+ catch (_c) {
77
+ /* best effort */
78
+ }
79
+ this.scheduleReconnect();
80
+ }, CONNECT_TIMEOUT_MS);
61
81
  ws.onopen = () => {
82
+ clearTimeout(connectTimeout);
62
83
  this.reconnectAttempt = 0;
63
84
  void this.sendRegister(ws);
64
85
  };
@@ -78,8 +99,12 @@ export class DeviceBridgeClient {
78
99
  };
79
100
  ws.onclose = () => {
80
101
  var _a, _b;
81
- this.socket = null;
102
+ clearTimeout(connectTimeout);
103
+ if (this.socket === ws)
104
+ this.socket = null;
82
105
  (_b = (_a = this.config).onStateChange) === null || _b === void 0 ? void 0 : _b.call(_a, "disconnected");
106
+ if (timedOut)
107
+ return;
83
108
  this.scheduleReconnect();
84
109
  };
85
110
  }
@@ -107,13 +132,17 @@ export class DeviceBridgeClient {
107
132
  payload: {
108
133
  deviceId: this.config.deviceId,
109
134
  pairingToken: this.config.pairingToken,
110
- capabilities: {
111
- platform: hardware.platform,
112
- deviceModel: hardware.deviceModel,
113
- totalRamGb: hardware.totalRamGb,
114
- cpuCores: hardware.cpuCores,
115
- gpu: hardware.gpu,
116
- },
135
+ capabilities: Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({ platform: hardware.platform, deviceModel: hardware.deviceModel }, (hardware.machineId ? { machineId: hardware.machineId } : {})), (hardware.osVersion ? { osVersion: hardware.osVersion } : {})), (typeof hardware.isSimulator === "boolean"
136
+ ? { isSimulator: hardware.isSimulator }
137
+ : {})), { totalRamGb: hardware.totalRamGb, availableRamGb: hardware.availableRamGb }), (typeof hardware.freeStorageGb === "number"
138
+ ? { freeStorageGb: hardware.freeStorageGb }
139
+ : {})), { cpuCores: hardware.cpuCores, gpu: hardware.gpu, gpuSupported: hardware.gpuSupported }), (typeof hardware.lowPowerMode === "boolean"
140
+ ? { lowPowerMode: hardware.lowPowerMode }
141
+ : {})), (hardware.thermalState
142
+ ? { thermalState: hardware.thermalState }
143
+ : {})), { dflashSupported: hardware.dflashSupported }), (hardware.dflashReason
144
+ ? { dflashReason: hardware.dflashReason }
145
+ : {})),
117
146
  loadedPath: loaded.modelPath,
118
147
  },
119
148
  };
@@ -137,6 +166,16 @@ export class DeviceBridgeClient {
137
166
  modelPath: msg.modelPath,
138
167
  contextSize: msg.contextSize,
139
168
  useGpu: msg.useGpu,
169
+ maxThreads: msg.maxThreads,
170
+ draftModelPath: msg.draftModelPath,
171
+ draftContextSize: msg.draftContextSize,
172
+ draftMin: msg.draftMin,
173
+ draftMax: msg.draftMax,
174
+ speculativeSamples: msg.speculativeSamples,
175
+ mobileSpeculative: msg.mobileSpeculative,
176
+ cacheTypeK: msg.cacheTypeK,
177
+ cacheTypeV: msg.cacheTypeV,
178
+ disableThinking: msg.disableThinking,
140
179
  });
141
180
  this.send(ws, {
142
181
  type: "loadResult",
@@ -204,6 +243,28 @@ export class DeviceBridgeClient {
204
243
  }
205
244
  return;
206
245
  }
246
+ if (msg.type === "embed") {
247
+ try {
248
+ const capacitorLlama = await loadCapacitorLlama();
249
+ const result = await capacitorLlama.embed({ input: msg.input });
250
+ this.send(ws, {
251
+ type: "embedResult",
252
+ correlationId: msg.correlationId,
253
+ ok: true,
254
+ embedding: result.embedding,
255
+ tokens: result.tokens,
256
+ });
257
+ }
258
+ catch (err) {
259
+ this.send(ws, {
260
+ type: "embedResult",
261
+ correlationId: msg.correlationId,
262
+ ok: false,
263
+ error: err instanceof Error ? err.message : String(err),
264
+ });
265
+ }
266
+ return;
267
+ }
207
268
  }
208
269
  }
209
270
  /**