@elizaos/capacitor-llama 0.1.0 → 2.0.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/esm/capacitor-llama-adapter.js +274 -35
- package/dist/esm/definitions.d.ts +69 -0
- package/dist/esm/device-bridge-client.js +69 -8
- package/dist/esm/kv-cache-resolver.d.ts +57 -0
- package/dist/esm/kv-cache-resolver.js +74 -0
- package/dist/esm/load-capacitor-llama.d.ts +1 -1
- package/dist/esm/load-capacitor-llama.js +1 -1
- package/dist/plugin.cjs.js +344 -44
- package/dist/plugin.cjs.js.map +1 -1
- package/dist/plugin.js +344 -44
- package/dist/plugin.js.map +1 -1
- package/package.json +7 -6
- package/dist/esm/index.test.d.ts +0 -1
- package/dist/esm/index.test.js +0 -264
- package/dist/esm/web.d.ts +0 -11
- package/dist/esm/web.js +0 -10
package/README.md
CHANGED
|
@@ -43,7 +43,7 @@ transparently.
|
|
|
43
43
|
registerCapacitorLlamaLoader(runtime);
|
|
44
44
|
```
|
|
45
45
|
|
|
46
|
-
3. Run `
|
|
46
|
+
3. Run `bunx cap sync` in `apps/app` to pick up the native plugin. iOS and
|
|
47
47
|
Android builds will pull in `llama-cpp-capacitor`'s prebuilt native
|
|
48
48
|
libraries automatically.
|
|
49
49
|
|
|
@@ -1,4 +1,22 @@
|
|
|
1
1
|
const CONTEXT_ID = 1;
|
|
2
|
+
const DEFAULT_MAX_TOKENS = 256;
|
|
3
|
+
/**
|
|
4
|
+
* Mobile-side parallel slot count. Mirrors `DEFAULT_CACHE_PARALLEL` in
|
|
5
|
+
* `cache-bridge.ts`; on devices with constrained KV memory we keep a small
|
|
6
|
+
* fixed pool so distinct cacheKey values still get prefix reuse without
|
|
7
|
+
* blowing memory.
|
|
8
|
+
*/
|
|
9
|
+
const MOBILE_PARALLEL = 4;
|
|
10
|
+
/**
 * Maps a cache key onto one of the `MOBILE_PARALLEL` slot indices.
 *
 * The key is folded with 32-bit FNV-1a over its UTF-16 code units; the signed
 * 32-bit result is made non-negative and reduced modulo `MOBILE_PARALLEL`.
 * The computation uses only integer operations, so the same key always yields
 * the same slot. NOTE(review): the upstream comment states this matches the
 * agent-side derivation — that cannot be confirmed from this file.
 *
 * @param {string} key Cache key to hash.
 * @returns {number} Slot index in `[0, MOBILE_PARALLEL)`.
 */
function deriveCacheSlotId(key) {
    const FNV_OFFSET_BASIS = 0x811c9dc5;
    const FNV_PRIME = 0x01000193;
    let digest = FNV_OFFSET_BASIS;
    let cursor = 0;
    while (cursor < key.length) {
        // XOR first, multiply second: that ordering is what makes this FNV-1a.
        digest = Math.imul(digest ^ key.charCodeAt(cursor), FNV_PRIME);
        cursor += 1;
    }
    // Coerce to a signed 32-bit integer, then take its magnitude.
    const signed = digest | 0;
    const magnitude = signed < 0 ? -signed : signed;
    return magnitude % MOBILE_PARALLEL;
}
|
|
19
|
+
const MOBILE_MAX_TOKENS_CAP = 256;
|
|
2
20
|
/**
 * Reports whether `value` is a non-null value whose `typeof` is "object".
 * Arrays qualify; functions and primitives do not.
 */
function isObject(value) {
    if (value === null) {
        return false;
    }
    return typeof value === "object";
}
|
|
@@ -7,7 +25,8 @@ function isLlamaCppPluginLike(value) {
|
|
|
7
25
|
typeof value.initContext === "function" &&
|
|
8
26
|
typeof value.releaseContext === "function" &&
|
|
9
27
|
typeof value.releaseAllContexts === "function" &&
|
|
10
|
-
typeof value.
|
|
28
|
+
(typeof value.completion === "function" ||
|
|
29
|
+
typeof value.generateText === "function") &&
|
|
11
30
|
typeof value.stopCompletion === "function" &&
|
|
12
31
|
typeof value.addListener === "function");
|
|
13
32
|
}
|
|
@@ -23,6 +42,42 @@ function resolveLlamaCppPlugin(mod) {
|
|
|
23
42
|
}
|
|
24
43
|
return null;
|
|
25
44
|
}
|
|
45
|
+
/**
 * Builds a plain object of forwarding functions around a native LlamaCpp plugin.
 *
 * - `initContext`, `releaseContext`, `releaseAllContexts`, `stopCompletion` and
 *   `addListener` are always forwarded.
 * - Every other method is forwarded only if the plugin exposes it as a function
 *   at wrap time; otherwise the property is present with the value `undefined`.
 * - Optional forwarders look the method up again on each call and return
 *   `undefined` if it has since become null/undefined.
 *
 * All forwarders invoke the method on `plugin`, so `this` is the original plugin.
 *
 * @param {object} plugin Native plugin object.
 * @returns {object} Plain forwarding facade with a fixed key order.
 */
function toPlainLlamaCppPlugin(plugin) {
    const isCallable = (name) => typeof plugin[name] === "function";

    // Forwarder for optional methods that take a single options argument.
    const optionalWithOptions = (name) => {
        if (!isCallable(name)) {
            return undefined;
        }
        return (options) => {
            const method = plugin[name];
            if (method === null || method === undefined) {
                return undefined;
            }
            return method.call(plugin, options);
        };
    };

    // Forwarder for optional methods that take no arguments.
    const optionalWithoutArgs = (name) => {
        if (!isCallable(name)) {
            return undefined;
        }
        return () => {
            const method = plugin[name];
            if (method === null || method === undefined) {
                return undefined;
            }
            return method.call(plugin);
        };
    };

    return {
        initContext: (options) => plugin.initContext(options),
        releaseContext: (options) => plugin.releaseContext(options),
        releaseAllContexts: () => plugin.releaseAllContexts(),
        getHardwareInfo: optionalWithoutArgs("getHardwareInfo"),
        completion: optionalWithOptions("completion"),
        generateText: optionalWithOptions("generateText"),
        stopCompletion: (options) => plugin.stopCompletion(options),
        embedding: optionalWithOptions("embedding"),
        tokenize: optionalWithOptions("tokenize"),
        setCacheType: optionalWithOptions("setCacheType"),
        setSpecType: optionalWithOptions("setSpecType"),
        getNativeKernels: optionalWithoutArgs("getNativeKernels"),
        addListener: (event, listener) => plugin.addListener(event, listener),
    };
}
|
|
26
81
|
function isCapacitorNative() {
|
|
27
82
|
var _a;
|
|
28
83
|
const cap = globalThis.Capacitor;
|
|
@@ -38,6 +93,110 @@ function detectPlatform() {
|
|
|
38
93
|
return "android";
|
|
39
94
|
return "web";
|
|
40
95
|
}
|
|
96
|
+
/**
 * Resolves the token budget for a mobile generation request.
 *
 * @param {number|null|undefined} requested Caller-supplied maximum token count.
 * @returns {number} `DEFAULT_MAX_TOKENS` when `requested` is missing, not a
 *   finite number, or floors to less than 1; otherwise the floored request
 *   capped at `MOBILE_MAX_TOKENS_CAP`.
 */
function resolveMobileMaxTokens(requested) {
    // Number.isFinite is false for null, undefined, NaN, ±Infinity and every
    // non-number, so no separate null check is needed.
    if (!Number.isFinite(requested)) {
        return DEFAULT_MAX_TOKENS;
    }
    const wholeTokens = Math.floor(requested);
    // Validate after flooring: a request in (0, 1) would otherwise pass the
    // positivity check and then floor to a zero-token budget.
    if (wholeTokens < 1) {
        return DEFAULT_MAX_TOKENS;
    }
    return Math.min(wholeTokens, MOBILE_MAX_TOKENS_CAP);
}
|
|
102
|
+
/**
 * Returns `value` when it is a finite number, otherwise `null`.
 * `Number.isFinite` does not coerce, so non-number inputs yield `null`.
 */
function numberFromUnknown(value) {
    return Number.isFinite(value) ? value : null;
}
|
|
107
|
+
/**
 * Returns `value` when it is a primitive boolean, otherwise `undefined`.
 */
function booleanFromUnknown(value) {
    if (value === true || value === false) {
        return value;
    }
    return undefined;
}
|
|
110
|
+
/**
 * Returns the trimmed form of `value` when it is a string with non-whitespace
 * content, otherwise `undefined`.
 */
function stringFromUnknown(value) {
    if (typeof value !== "string") {
        return undefined;
    }
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : undefined;
}
|
|
115
|
+
/**
 * Builds a hardware description without consulting the native plugin.
 *
 * RAM and core counts are read from `globalThis.navigator` when present and
 * default to 0. The GPU entry is chosen purely from `platform` ("ios" → metal,
 * "android" → vulkan, anything else → null); it is not probed. DFlash is
 * always reported as unsupported, with `reason` as the explanation.
 *
 * @param {string} [platform] Defaults to `detectPlatform()`.
 * @param {string} [reason] Stored as `dflashReason`.
 * @returns {object} Record with `source: "adapter-fallback"`.
 */
function fallbackHardwareInfo(platform = detectPlatform(), reason = "native hardware probe unavailable") {
    const nav = globalThis.navigator;
    const hasNavigator = nav !== null && nav !== undefined;

    const reportedRam = numberFromUnknown(hasNavigator ? nav.deviceMemory : undefined);
    const totalRamGb = reportedRam !== null && reportedRam !== undefined ? reportedRam : 0;

    let gpu = null;
    if (platform === "ios") {
        gpu = { backend: "metal", available: true };
    } else if (platform === "android") {
        gpu = { backend: "vulkan", available: true };
    }

    const reportedCores = hasNavigator ? nav.hardwareConcurrency : undefined;
    const cpuCores = reportedCores !== null && reportedCores !== undefined ? reportedCores : 0;

    return {
        platform,
        deviceModel: platform,
        totalRamGb,
        availableRamGb: null,
        cpuCores,
        gpu,
        gpuSupported: platform !== "web",
        dflashSupported: false,
        dflashReason: reason,
        source: "adapter-fallback",
        nativeKernels: [],
        forkVariant: null,
    };
}
|
|
139
|
+
/**
 * Validates a fork-variant marker.
 *
 * @param {*} value Candidate marker.
 * @returns {"buun-llama-cpp"|"stock-llama-cpp"|null|undefined} The value itself
 *   when it is one of the two known variant names or `null`; `undefined` for
 *   anything else.
 */
function normalizeForkVariant(value) {
    switch (value) {
        case "buun-llama-cpp":
        case "stock-llama-cpp":
        case null:
            return value;
        default:
            return undefined;
    }
}
|
|
146
|
+
/**
 * Extracts the non-empty string entries of an array.
 *
 * @param {*} value Candidate array.
 * @returns {string[]|undefined} A new array holding only the entries that are
 *   non-empty strings (possibly empty), or `undefined` when `value` is not an
 *   array.
 */
function stringArrayFromUnknown(value) {
    if (!Array.isArray(value)) {
        return undefined;
    }
    // Array.from walks the iterator and yields a plain Array before filtering.
    return Array.from(value).filter((entry) => typeof entry === "string" && entry.length > 0);
}
|
|
156
|
+
/**
 * Sanitises a hardware report returned by the native plugin.
 *
 * Each field is accepted only when it has the expected primitive type or is one
 * of the known enum strings; otherwise the matching field from
 * `fallbackHardwareInfo(platform)` is used, or the optional field is omitted.
 * A falsy `value` returns the fallback record unchanged.
 *
 * @param {*} value Raw report from the plugin.
 * @param {string} [platform] Defaults to `detectPlatform()`.
 * @returns {object} Normalised record; keys are emitted in a fixed order.
 */
function normalizeHardwareInfo(value, platform = detectPlatform()) {
    const fallback = fallbackHardwareInfo(platform);
    if (!value) {
        return fallback;
    }
    const isPresent = (candidate) => candidate !== null && candidate !== undefined;

    const reportedTotalRam = numberFromUnknown(value.totalRamGb);
    const totalRamGb = isPresent(reportedTotalRam) ? reportedTotalRam : fallback.totalRamGb;

    // An explicit null from native is kept as null ("unknown").
    let availableRamGb = null;
    if (value.availableRamGb !== null) {
        const reportedAvailableRam = numberFromUnknown(value.availableRamGb);
        availableRamGb = isPresent(reportedAvailableRam) ? reportedAvailableRam : fallback.availableRamGb;
    }

    // NOTE(review): an explicit `gpu: null` from native is replaced by the
    // platform fallback here, unlike availableRamGb — confirm this is intended.
    let gpu = fallback.gpu;
    if (value.gpu && isObject(value.gpu)) {
        const reportedBackend = value.gpu.backend;
        let backend;
        if (["metal", "vulkan", "gpu-delegate"].includes(reportedBackend)) {
            backend = reportedBackend;
        } else {
            const fallbackBackend = isPresent(fallback.gpu) ? fallback.gpu.backend : undefined;
            backend = isPresent(fallbackBackend) ? fallbackBackend : "gpu-delegate";
        }
        gpu = { backend, available: Boolean(value.gpu.available) };
    }

    const info = {};

    info.platform = ["ios", "android", "web"].includes(value.platform) ? value.platform : platform;
    const deviceModel = stringFromUnknown(value.deviceModel);
    info.deviceModel = isPresent(deviceModel) ? deviceModel : fallback.deviceModel;

    const machineId = stringFromUnknown(value.machineId);
    if (machineId) {
        info.machineId = machineId;
    }
    const osVersion = stringFromUnknown(value.osVersion);
    if (osVersion) {
        info.osVersion = osVersion;
    }
    if (typeof value.isSimulator === "boolean") {
        info.isSimulator = value.isSimulator;
    }

    info.totalRamGb = totalRamGb;
    info.availableRamGb = availableRamGb;

    const freeStorageGb = numberFromUnknown(value.freeStorageGb);
    if (freeStorageGb !== null) {
        info.freeStorageGb = freeStorageGb;
    }

    const cpuCores = numberFromUnknown(value.cpuCores);
    info.cpuCores = isPresent(cpuCores) ? cpuCores : fallback.cpuCores;
    info.gpu = gpu;
    const gpuSupported = booleanFromUnknown(value.gpuSupported);
    info.gpuSupported = isPresent(gpuSupported) ? gpuSupported : fallback.gpuSupported;

    if (typeof value.lowPowerMode === "boolean") {
        info.lowPowerMode = value.lowPowerMode;
    }
    if (["nominal", "fair", "serious", "critical", "unknown"].includes(value.thermalState)) {
        info.thermalState = value.thermalState;
    }

    info.dflashSupported = Boolean(value.dflashSupported);
    // The key is always written; it is undefined only when DFlash is reported
    // as supported and no reason string was supplied.
    const dflashReason = stringFromUnknown(value.dflashReason);
    if (isPresent(dflashReason)) {
        info.dflashReason = dflashReason;
    } else {
        info.dflashReason = value.dflashSupported
            ? undefined
            : "native plugin did not report DFlash support";
    }

    info.source = value.source === "native" ? "native" : "adapter-fallback";
    const nativeKernels = stringArrayFromUnknown(value.nativeKernels);
    info.nativeKernels = isPresent(nativeKernels) ? nativeKernels : [];
    const forkVariant = normalizeForkVariant(value.forkVariant);
    info.forkVariant = isPresent(forkVariant) ? forkVariant : null;

    return info;
}
|
|
41
200
|
class CapacitorLlamaAdapter {
|
|
42
201
|
constructor() {
|
|
43
202
|
this.plugin = null;
|
|
@@ -54,10 +213,11 @@ class CapacitorLlamaAdapter {
|
|
|
54
213
|
if (this.pluginLoadPromise)
|
|
55
214
|
return this.pluginLoadPromise;
|
|
56
215
|
this.pluginLoadPromise = (async () => {
|
|
57
|
-
const
|
|
58
|
-
if (!
|
|
59
|
-
throw new Error("llama-cpp-capacitor did not expose
|
|
216
|
+
const nativePlugin = resolveLlamaCppPlugin(await import("llama-cpp-capacitor"));
|
|
217
|
+
if (!nativePlugin) {
|
|
218
|
+
throw new Error("llama-cpp-capacitor did not expose the native LlamaCpp methods");
|
|
60
219
|
}
|
|
220
|
+
const plugin = toPlainLlamaCppPlugin(nativePlugin);
|
|
61
221
|
const tokenListenerHandle = await plugin.addListener("@LlamaCpp_onToken", (data) => {
|
|
62
222
|
var _a, _b;
|
|
63
223
|
const token = (_b = (_a = data.tokenResult) === null || _a === void 0 ? void 0 : _a.token) !== null && _b !== void 0 ? _b : data.token;
|
|
@@ -86,19 +246,73 @@ class CapacitorLlamaAdapter {
|
|
|
86
246
|
}
|
|
87
247
|
}
|
|
88
248
|
async getHardwareInfo() {
|
|
89
|
-
var _a;
|
|
249
|
+
var _a, _b, _c;
|
|
90
250
|
const platform = detectPlatform();
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
251
|
+
if (!isCapacitorNative())
|
|
252
|
+
return fallbackHardwareInfo(platform);
|
|
253
|
+
try {
|
|
254
|
+
const plugin = await this.loadPlugin();
|
|
255
|
+
const baseInfo = normalizeHardwareInfo(await ((_a = plugin.getHardwareInfo) === null || _a === void 0 ? void 0 : _a.call(plugin)), platform);
|
|
256
|
+
// Probe fork-specific kernels through the optional bridge method.
|
|
257
|
+
// Stock builds and older fork builds without the bridge fall back
|
|
258
|
+
// to the empty list + "stock-llama-cpp" variant marker.
|
|
259
|
+
let nativeKernels = (_b = baseInfo.nativeKernels) !== null && _b !== void 0 ? _b : [];
|
|
260
|
+
let forkVariant = (_c = baseInfo.forkVariant) !== null && _c !== void 0 ? _c : "stock-llama-cpp";
|
|
261
|
+
if (typeof plugin.getNativeKernels === "function") {
|
|
262
|
+
try {
|
|
263
|
+
const probe = await plugin.getNativeKernels();
|
|
264
|
+
const kernels = stringArrayFromUnknown(probe === null || probe === void 0 ? void 0 : probe.kernels);
|
|
265
|
+
if (kernels)
|
|
266
|
+
nativeKernels = kernels;
|
|
267
|
+
const variant = normalizeForkVariant(probe === null || probe === void 0 ? void 0 : probe.variant);
|
|
268
|
+
if (variant !== undefined)
|
|
269
|
+
forkVariant = variant;
|
|
270
|
+
else if (nativeKernels.length > 0)
|
|
271
|
+
forkVariant = "buun-llama-cpp";
|
|
272
|
+
}
|
|
273
|
+
catch (err) {
|
|
274
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
275
|
+
console.debug("[capacitor-llama] getNativeKernels probe failed", {
|
|
276
|
+
error: message,
|
|
277
|
+
});
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
return Object.assign(Object.assign({}, baseInfo), { nativeKernels,
|
|
281
|
+
forkVariant });
|
|
282
|
+
}
|
|
283
|
+
catch (error) {
|
|
284
|
+
return fallbackHardwareInfo(platform, error instanceof Error ? error.message : "native hardware probe failed");
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
async setCacheType(typeK, typeV) {
|
|
288
|
+
if (!isCapacitorNative()) {
|
|
289
|
+
console.warn("[capacitor-llama] setCacheType called on non-native platform; ignoring");
|
|
290
|
+
return;
|
|
291
|
+
}
|
|
292
|
+
const plugin = await this.loadPlugin();
|
|
293
|
+
if (typeof plugin.setCacheType !== "function") {
|
|
294
|
+
console.warn("[capacitor-llama] underlying plugin does not expose setCacheType (likely stock build); cache types must be passed via load() params instead");
|
|
295
|
+
return;
|
|
296
|
+
}
|
|
297
|
+
await plugin.setCacheType({ cacheTypeK: typeK, cacheTypeV: typeV });
|
|
298
|
+
}
|
|
299
|
+
async setSpecType(args) {
|
|
300
|
+
if (!isCapacitorNative()) {
|
|
301
|
+
console.warn("[capacitor-llama] setSpecType called on non-native platform; ignoring");
|
|
302
|
+
return;
|
|
303
|
+
}
|
|
304
|
+
const plugin = await this.loadPlugin();
|
|
305
|
+
if (typeof plugin.setSpecType !== "function") {
|
|
306
|
+
console.warn("[capacitor-llama] underlying plugin does not expose setSpecType (likely stock build); pass draft_model + draft_min/max via load() instead");
|
|
307
|
+
return;
|
|
308
|
+
}
|
|
309
|
+
await plugin.setSpecType({
|
|
310
|
+
target: args.target,
|
|
311
|
+
drafter: args.drafter,
|
|
312
|
+
specType: args.specType,
|
|
313
|
+
draftMin: args.draftMin,
|
|
314
|
+
draftMax: args.draftMax,
|
|
315
|
+
});
|
|
102
316
|
}
|
|
103
317
|
async isLoaded() {
|
|
104
318
|
return {
|
|
@@ -110,7 +324,7 @@ class CapacitorLlamaAdapter {
|
|
|
110
324
|
return this.loadedPath;
|
|
111
325
|
}
|
|
112
326
|
async load(options) {
|
|
113
|
-
var _a, _b;
|
|
327
|
+
var _a, _b, _c, _d, _e, _f;
|
|
114
328
|
if (!isCapacitorNative()) {
|
|
115
329
|
throw new Error("capacitor-llama is only available on iOS and Android builds");
|
|
116
330
|
}
|
|
@@ -119,15 +333,21 @@ class CapacitorLlamaAdapter {
|
|
|
119
333
|
await plugin.releaseAllContexts();
|
|
120
334
|
this.loadedPath = null;
|
|
121
335
|
}
|
|
336
|
+
const speculativeSamples = options.mobileSpeculative
|
|
337
|
+
? Math.min((_b = (_a = options.speculativeSamples) !== null && _a !== void 0 ? _a : options.draftMax) !== null && _b !== void 0 ? _b : 3, 4)
|
|
338
|
+
: ((_c = options.speculativeSamples) !== null && _c !== void 0 ? _c : 3);
|
|
339
|
+
const params = Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({ model: options.modelPath, n_ctx: (_d = options.contextSize) !== null && _d !== void 0 ? _d : 4096, n_gpu_layers: options.useGpu === false ? 0 : 99, n_threads: (_e = options.maxThreads) !== null && _e !== void 0 ? _e : 0, use_mmap: true, flash_attn: options.useGpu !== false, n_batch: options.mobileSpeculative ? 128 : 512, n_ubatch: options.mobileSpeculative ? 64 : 512 }, (options.draftModelPath
|
|
340
|
+
? {
|
|
341
|
+
draft_model: options.draftModelPath,
|
|
342
|
+
speculative_samples: speculativeSamples,
|
|
343
|
+
mobile_speculative: (_f = options.mobileSpeculative) !== null && _f !== void 0 ? _f : true,
|
|
344
|
+
}
|
|
345
|
+
: {})), (options.draftContextSize
|
|
346
|
+
? { n_ctx_draft: options.draftContextSize }
|
|
347
|
+
: {})), (options.draftMin ? { draft_min: options.draftMin } : {})), (options.draftMax ? { draft_max: options.draftMax } : {})), (options.cacheTypeK ? { cache_type_k: options.cacheTypeK } : {})), (options.cacheTypeV ? { cache_type_v: options.cacheTypeV } : {})), (options.disableThinking ? { reasoning: false } : {}));
|
|
122
348
|
await plugin.initContext({
|
|
123
349
|
contextId: CONTEXT_ID,
|
|
124
|
-
params
|
|
125
|
-
model: options.modelPath,
|
|
126
|
-
n_ctx: (_a = options.contextSize) !== null && _a !== void 0 ? _a : 4096,
|
|
127
|
-
n_gpu_layers: options.useGpu === false ? 0 : 99,
|
|
128
|
-
n_threads: (_b = options.maxThreads) !== null && _b !== void 0 ? _b : 0,
|
|
129
|
-
use_mmap: true,
|
|
130
|
-
},
|
|
350
|
+
params,
|
|
131
351
|
});
|
|
132
352
|
this.loadedPath = options.modelPath;
|
|
133
353
|
}
|
|
@@ -143,15 +363,15 @@ class CapacitorLlamaAdapter {
|
|
|
143
363
|
this.loadedPath = null;
|
|
144
364
|
}
|
|
145
365
|
async generate(options) {
|
|
146
|
-
var _a, _b, _c, _d;
|
|
366
|
+
var _a, _b, _c, _d, _e;
|
|
147
367
|
if (!this.plugin || !this.loadedPath) {
|
|
148
368
|
throw new Error("No model loaded. Call load() first.");
|
|
149
369
|
}
|
|
150
370
|
this.tokenIndex = 0;
|
|
151
371
|
const params = {
|
|
152
|
-
n_predict: (
|
|
153
|
-
temperature: (
|
|
154
|
-
top_p: (
|
|
372
|
+
n_predict: resolveMobileMaxTokens(options.maxTokens),
|
|
373
|
+
temperature: (_a = options.temperature) !== null && _a !== void 0 ? _a : 0.7,
|
|
374
|
+
top_p: (_b = options.topP) !== null && _b !== void 0 ? _b : 0.9,
|
|
155
375
|
};
|
|
156
376
|
if (options.stopSequences && options.stopSequences.length > 0) {
|
|
157
377
|
params.stop = options.stopSequences;
|
|
@@ -159,13 +379,32 @@ class CapacitorLlamaAdapter {
|
|
|
159
379
|
if (options.stream) {
|
|
160
380
|
params.emit_partial_completion = true;
|
|
161
381
|
}
|
|
382
|
+
// Cache key threading: surface the slot id derived from
|
|
383
|
+
// ProviderCachePlan.promptCacheKey to the native side. Stock
|
|
384
|
+
// llama-cpp-capacitor builds ignore the field; the patched fork build
|
|
385
|
+
// reads it via setCacheType / completion params and pins KV slots.
|
|
386
|
+
if (options.cacheKey) {
|
|
387
|
+
const slotId = deriveCacheSlotId(options.cacheKey);
|
|
388
|
+
params.cache_prompt =
|
|
389
|
+
true;
|
|
390
|
+
params.slot_id =
|
|
391
|
+
slotId;
|
|
392
|
+
}
|
|
162
393
|
const started = Date.now();
|
|
163
|
-
const result =
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
394
|
+
const result = typeof this.plugin.completion === "function"
|
|
395
|
+
? await this.plugin.completion({
|
|
396
|
+
contextId: CONTEXT_ID,
|
|
397
|
+
params: Object.assign({ prompt: options.prompt, emit_partial_completion: Boolean(params.emit_partial_completion) }, params),
|
|
398
|
+
})
|
|
399
|
+
: await ((_d = (_c = this.plugin).generateText) === null || _d === void 0 ? void 0 : _d.call(_c, {
|
|
400
|
+
contextId: CONTEXT_ID,
|
|
401
|
+
prompt: options.prompt,
|
|
402
|
+
params,
|
|
403
|
+
}));
|
|
404
|
+
if (!result) {
|
|
405
|
+
throw new Error("llama-cpp-capacitor did not expose completion() or generateText()");
|
|
406
|
+
}
|
|
407
|
+
const duration = ((_e = result.timings) === null || _e === void 0 ? void 0 : _e.predicted_ms) != null
|
|
169
408
|
? Math.round(result.timings.predicted_ms)
|
|
170
409
|
: Date.now() - started;
|
|
171
410
|
return {
|
|
@@ -238,7 +477,7 @@ export function registerCapacitorLlamaLoader(runtime) {
|
|
|
238
477
|
return;
|
|
239
478
|
runtime.registerService("localInferenceLoader", {
|
|
240
479
|
async loadModel(args) {
|
|
241
|
-
await capacitorLlama.load(
|
|
480
|
+
await capacitorLlama.load(args);
|
|
242
481
|
},
|
|
243
482
|
async unloadModel() {
|
|
244
483
|
await capacitorLlama.unload();
|
|
@@ -20,6 +20,22 @@ export interface LoadOptions {
|
|
|
20
20
|
useGpu?: boolean;
|
|
21
21
|
/** Cap on native thread count; native layer picks a reasonable default otherwise. */
|
|
22
22
|
maxThreads?: number;
|
|
23
|
+
/** Optional draft GGUF for native speculative decoding builds. */
|
|
24
|
+
draftModelPath?: string;
|
|
25
|
+
/** Context window for the draft model when supported by the native build. */
|
|
26
|
+
draftContextSize?: number;
|
|
27
|
+
/** Lower/upper speculative draft bounds for fork builds that expose them. */
|
|
28
|
+
draftMin?: number;
|
|
29
|
+
draftMax?: number;
|
|
30
|
+
/** Number of draft tokens/samples when the native runtime supports it. */
|
|
31
|
+
speculativeSamples?: number;
|
|
32
|
+
/** Mobile runtimes may enable a lower-memory speculative path. */
|
|
33
|
+
mobileSpeculative?: boolean;
|
|
34
|
+
/** Optional KV cache types for fork builds such as TurboQuant. */
|
|
35
|
+
cacheTypeK?: string;
|
|
36
|
+
cacheTypeV?: string;
|
|
37
|
+
/** Eliza-1 DFlash drafters are trained for non-thinking outputs. */
|
|
38
|
+
disableThinking?: boolean;
|
|
23
39
|
}
|
|
24
40
|
export interface GenerateOptions {
|
|
25
41
|
prompt: string;
|
|
@@ -29,6 +45,13 @@ export interface GenerateOptions {
|
|
|
29
45
|
stopSequences?: string[];
|
|
30
46
|
/** When true, token events fire on the "token" listener. */
|
|
31
47
|
stream?: boolean;
|
|
48
|
+
/**
|
|
49
|
+
* Forwarded promptCacheKey from `ProviderCachePlan`. Native plugins
|
|
50
|
+
* that support prefix caching should derive a slot id from this and
|
|
51
|
+
* keep KV warm for repeated calls with the same key. Plugins without
|
|
52
|
+
* cache support ignore the field; behavior is unchanged.
|
|
53
|
+
*/
|
|
54
|
+
cacheKey?: string;
|
|
32
55
|
}
|
|
33
56
|
export interface GenerateResult {
|
|
34
57
|
text: string;
|
|
@@ -40,8 +63,13 @@ export interface HardwareInfo {
|
|
|
40
63
|
platform: "ios" | "android" | "web";
|
|
41
64
|
/** Human-readable device model when the OS exposes one. */
|
|
42
65
|
deviceModel: string;
|
|
66
|
+
/** Stable OS machine identifier when available, e.g. iPhone16,2. */
|
|
67
|
+
machineId?: string;
|
|
68
|
+
osVersion?: string;
|
|
69
|
+
isSimulator?: boolean;
|
|
43
70
|
totalRamGb: number;
|
|
44
71
|
availableRamGb: number | null;
|
|
72
|
+
freeStorageGb?: number | null;
|
|
45
73
|
cpuCores: number;
|
|
46
74
|
gpu: {
|
|
47
75
|
backend: "metal" | "vulkan" | "gpu-delegate";
|
|
@@ -49,6 +77,25 @@ export interface HardwareInfo {
|
|
|
49
77
|
} | null;
|
|
50
78
|
/** True when the underlying llama.cpp build has GPU support compiled in. */
|
|
51
79
|
gpuSupported: boolean;
|
|
80
|
+
lowPowerMode?: boolean;
|
|
81
|
+
thermalState?: "nominal" | "fair" | "serious" | "critical" | "unknown";
|
|
82
|
+
/** True only when the native build can load a drafter and run DFlash/spec decode. */
|
|
83
|
+
dflashSupported?: boolean;
|
|
84
|
+
dflashReason?: string;
|
|
85
|
+
source?: "native" | "adapter-fallback";
|
|
86
|
+
/**
|
|
87
|
+
* Names of fork-specific kernels compiled into the loaded native library
|
|
88
|
+
* (e.g. "turbo3", "turbo4", "turbo3_tcq", "dflash", "qjl_full"). Empty
|
|
89
|
+
* when the loaded build is stock llama.cpp or when no native lib is loaded.
|
|
90
|
+
* Surfaced from the native bridge via a `kernels.json` manifest shipped
|
|
91
|
+
* alongside the .so.
|
|
92
|
+
*/
|
|
93
|
+
nativeKernels?: string[];
|
|
94
|
+
/**
|
|
95
|
+
* Which native llama.cpp variant is loaded. `null` when the plugin
|
|
96
|
+
* isn't loaded at all (web fallback or native lib failed to load).
|
|
97
|
+
*/
|
|
98
|
+
forkVariant?: "buun-llama-cpp" | "stock-llama-cpp" | null;
|
|
52
99
|
}
|
|
53
100
|
export interface EmbedOptions {
|
|
54
101
|
/** Raw text to embed. The adapter forwards this verbatim to the native plugin. */
|
|
@@ -70,6 +117,16 @@ export interface EmbedResult {
|
|
|
70
117
|
*/
|
|
71
118
|
tokens: number;
|
|
72
119
|
}
|
|
120
|
+
/** Arguments accepted by `LlamaAdapter.setSpecType`. */
export interface SetSpecTypeArgs {
    /** Path to the target (large) GGUF. */
    target: string;
    /** Path to the drafter (small) GGUF. */
    drafter: string;
    /** Currently only "dflash" is honoured by the buun fork. */
    specType: "dflash";
    /** Lower speculative draft bound. */
    draftMin: number;
    /** Upper speculative draft bound. */
    draftMax: number;
}
|
|
73
130
|
export interface LlamaAdapter {
|
|
74
131
|
getHardwareInfo(): Promise<HardwareInfo>;
|
|
75
132
|
isLoaded(): Promise<{
|
|
@@ -89,4 +146,16 @@ export interface LlamaAdapter {
|
|
|
89
146
|
* does not expose an embedding method on the active platform.
|
|
90
147
|
*/
|
|
91
148
|
embed(options: EmbedOptions): Promise<EmbedResult>;
|
|
149
|
+
/**
|
|
150
|
+
* Configure the KV cache types used by the next loaded context. Only
|
|
151
|
+
* the buun-llama-cpp fork honours TurboQuant cache types like
|
|
152
|
+
* `q4_tq3` / `q4_tq4`. Stock builds will warn-and-no-op when the
|
|
153
|
+
* underlying plugin doesn't expose the bridge method.
|
|
154
|
+
*/
|
|
155
|
+
setCacheType?(typeK: string, typeV: string): Promise<void>;
|
|
156
|
+
/**
|
|
157
|
+
* Configure DFlash speculative decoding for the next loaded context.
|
|
158
|
+
* Stock builds without speculative bridge methods warn-and-no-op.
|
|
159
|
+
*/
|
|
160
|
+
setSpecType?(args: SetSpecTypeArgs): Promise<void>;
|
|
92
161
|
}
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
import { loadCapacitorLlama } from "./load-capacitor-llama";
|
|
15
15
|
const INITIAL_BACKOFF_MS = 1000;
|
|
16
16
|
const MAX_BACKOFF_MS = 30000;
|
|
17
|
+
const CONNECT_TIMEOUT_MS = 5000;
|
|
17
18
|
export class DeviceBridgeClient {
|
|
18
19
|
constructor(config) {
|
|
19
20
|
this.socket = null;
|
|
@@ -58,7 +59,27 @@ export class DeviceBridgeClient {
|
|
|
58
59
|
return;
|
|
59
60
|
}
|
|
60
61
|
this.socket = ws;
|
|
62
|
+
let timedOut = false;
|
|
63
|
+
const connectTimeout = setTimeout(() => {
|
|
64
|
+
var _a, _b;
|
|
65
|
+
if (this.stopped ||
|
|
66
|
+
this.socket !== ws ||
|
|
67
|
+
ws.readyState !== WebSocket.CONNECTING) {
|
|
68
|
+
return;
|
|
69
|
+
}
|
|
70
|
+
timedOut = true;
|
|
71
|
+
this.socket = null;
|
|
72
|
+
(_b = (_a = this.config).onStateChange) === null || _b === void 0 ? void 0 : _b.call(_a, "error", "websocket connect timeout");
|
|
73
|
+
try {
|
|
74
|
+
ws.close();
|
|
75
|
+
}
|
|
76
|
+
catch (_c) {
|
|
77
|
+
/* best effort */
|
|
78
|
+
}
|
|
79
|
+
this.scheduleReconnect();
|
|
80
|
+
}, CONNECT_TIMEOUT_MS);
|
|
61
81
|
ws.onopen = () => {
|
|
82
|
+
clearTimeout(connectTimeout);
|
|
62
83
|
this.reconnectAttempt = 0;
|
|
63
84
|
void this.sendRegister(ws);
|
|
64
85
|
};
|
|
@@ -78,8 +99,12 @@ export class DeviceBridgeClient {
|
|
|
78
99
|
};
|
|
79
100
|
ws.onclose = () => {
|
|
80
101
|
var _a, _b;
|
|
81
|
-
|
|
102
|
+
clearTimeout(connectTimeout);
|
|
103
|
+
if (this.socket === ws)
|
|
104
|
+
this.socket = null;
|
|
82
105
|
(_b = (_a = this.config).onStateChange) === null || _b === void 0 ? void 0 : _b.call(_a, "disconnected");
|
|
106
|
+
if (timedOut)
|
|
107
|
+
return;
|
|
83
108
|
this.scheduleReconnect();
|
|
84
109
|
};
|
|
85
110
|
}
|
|
@@ -107,13 +132,17 @@ export class DeviceBridgeClient {
|
|
|
107
132
|
payload: {
|
|
108
133
|
deviceId: this.config.deviceId,
|
|
109
134
|
pairingToken: this.config.pairingToken,
|
|
110
|
-
capabilities: {
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
cpuCores: hardware.cpuCores,
|
|
115
|
-
|
|
116
|
-
|
|
135
|
+
capabilities: Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({ platform: hardware.platform, deviceModel: hardware.deviceModel }, (hardware.machineId ? { machineId: hardware.machineId } : {})), (hardware.osVersion ? { osVersion: hardware.osVersion } : {})), (typeof hardware.isSimulator === "boolean"
|
|
136
|
+
? { isSimulator: hardware.isSimulator }
|
|
137
|
+
: {})), { totalRamGb: hardware.totalRamGb, availableRamGb: hardware.availableRamGb }), (typeof hardware.freeStorageGb === "number"
|
|
138
|
+
? { freeStorageGb: hardware.freeStorageGb }
|
|
139
|
+
: {})), { cpuCores: hardware.cpuCores, gpu: hardware.gpu, gpuSupported: hardware.gpuSupported }), (typeof hardware.lowPowerMode === "boolean"
|
|
140
|
+
? { lowPowerMode: hardware.lowPowerMode }
|
|
141
|
+
: {})), (hardware.thermalState
|
|
142
|
+
? { thermalState: hardware.thermalState }
|
|
143
|
+
: {})), { dflashSupported: hardware.dflashSupported }), (hardware.dflashReason
|
|
144
|
+
? { dflashReason: hardware.dflashReason }
|
|
145
|
+
: {})),
|
|
117
146
|
loadedPath: loaded.modelPath,
|
|
118
147
|
},
|
|
119
148
|
};
|
|
@@ -137,6 +166,16 @@ export class DeviceBridgeClient {
|
|
|
137
166
|
modelPath: msg.modelPath,
|
|
138
167
|
contextSize: msg.contextSize,
|
|
139
168
|
useGpu: msg.useGpu,
|
|
169
|
+
maxThreads: msg.maxThreads,
|
|
170
|
+
draftModelPath: msg.draftModelPath,
|
|
171
|
+
draftContextSize: msg.draftContextSize,
|
|
172
|
+
draftMin: msg.draftMin,
|
|
173
|
+
draftMax: msg.draftMax,
|
|
174
|
+
speculativeSamples: msg.speculativeSamples,
|
|
175
|
+
mobileSpeculative: msg.mobileSpeculative,
|
|
176
|
+
cacheTypeK: msg.cacheTypeK,
|
|
177
|
+
cacheTypeV: msg.cacheTypeV,
|
|
178
|
+
disableThinking: msg.disableThinking,
|
|
140
179
|
});
|
|
141
180
|
this.send(ws, {
|
|
142
181
|
type: "loadResult",
|
|
@@ -204,6 +243,28 @@ export class DeviceBridgeClient {
|
|
|
204
243
|
}
|
|
205
244
|
return;
|
|
206
245
|
}
|
|
246
|
+
if (msg.type === "embed") {
|
|
247
|
+
try {
|
|
248
|
+
const capacitorLlama = await loadCapacitorLlama();
|
|
249
|
+
const result = await capacitorLlama.embed({ input: msg.input });
|
|
250
|
+
this.send(ws, {
|
|
251
|
+
type: "embedResult",
|
|
252
|
+
correlationId: msg.correlationId,
|
|
253
|
+
ok: true,
|
|
254
|
+
embedding: result.embedding,
|
|
255
|
+
tokens: result.tokens,
|
|
256
|
+
});
|
|
257
|
+
}
|
|
258
|
+
catch (err) {
|
|
259
|
+
this.send(ws, {
|
|
260
|
+
type: "embedResult",
|
|
261
|
+
correlationId: msg.correlationId,
|
|
262
|
+
ok: false,
|
|
263
|
+
error: err instanceof Error ? err.message : String(err),
|
|
264
|
+
});
|
|
265
|
+
}
|
|
266
|
+
return;
|
|
267
|
+
}
|
|
207
268
|
}
|
|
208
269
|
}
|
|
209
270
|
/**
|