@elizaos/capacitor-llama 0.1.0 → 2.0.3-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +64 -43
- package/dist/esm/capacitor-llama-adapter.d.ts +92 -1
- package/dist/esm/capacitor-llama-adapter.js +705 -64
- package/dist/esm/definitions.d.ts +214 -0
- package/dist/esm/device-bridge-client.d.ts +17 -0
- package/dist/esm/device-bridge-client.js +210 -15
- package/dist/esm/index.d.ts +3 -2
- package/dist/esm/index.js +3 -2
- package/dist/esm/kv-cache-resolver.d.ts +57 -0
- package/dist/esm/kv-cache-resolver.js +74 -0
- package/dist/esm/load-capacitor-llama.d.ts +1 -1
- package/dist/esm/load-capacitor-llama.js +1 -1
- package/dist/esm/token-tree-codec.d.ts +51 -0
- package/dist/esm/token-tree-codec.js +217 -0
- package/dist/plugin.cjs.js +1136 -79
- package/dist/plugin.cjs.js.map +1 -1
- package/dist/plugin.js +1136 -79
- package/dist/plugin.js.map +1 -1
- package/package.json +15 -10
- package/dist/esm/index.test.d.ts +0 -1
- package/dist/esm/index.test.js +0 -264
- package/dist/esm/web.d.ts +0 -11
- package/dist/esm/web.js +0 -10
|
@@ -1,4 +1,46 @@
|
|
|
1
|
-
|
|
1
|
+
// Async-iteration helper used by `generate()` to walk the stream returned by
// `generateStream()`. Reuses `this.__asyncValues` when one is already defined.
// Given `o`, it returns `o[Symbol.asyncIterator]()` when that method exists;
// otherwise it takes the sync iterator (`__values(o)` if that helper is
// defined, else `o[Symbol.iterator]()`) and exposes whichever of `next`,
// `throw` and `return` it has as promise-returning methods that resolve to
// `{ value, done }` with `value` awaited. Throws a TypeError when
// `Symbol.asyncIterator` is unavailable.
// NOTE(review): this has the shape of a compiler-emitted down-level helper —
// presumably generated rather than hand-written; confirm before editing.
var __asyncValues = (this && this.__asyncValues) || function (o) {
|
|
2
|
+
if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
|
|
3
|
+
var m = o[Symbol.asyncIterator], i;
|
|
4
|
+
return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
|
|
5
|
+
function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
|
|
6
|
+
function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
|
|
7
|
+
};
|
|
8
|
+
// Marker wrapper for awaited values inside `__asyncGenerator`: stores the
// operand on `this.v`. Works with or without `new` (a plain call re-invokes
// itself as a constructor). `__asyncGenerator`'s `step` detects these
// instances with `instanceof __await`.
var __await = (this && this.__await) || function (v) { return this instanceof __await ? (this.v = v, this) : new __await(v); }
|
|
9
|
+
// Drives a generator function as an async iterator. `generator` is applied to
// `thisArg` / `_arguments`, and the returned object exposes `next`, `throw`
// and `return` (when the underlying generator has them) as promise-returning
// methods. Requests are queued in `q` as [method, value, resolve, reject] and
// resumed one at a time: `settle` shifts the head entry and resumes the next.
// When the generator yields a `__await` instance, its wrapped value is awaited
// and fed back in via `next` (or `throw` on rejection); any other result
// resolves the request at the head of the queue. Throws a TypeError when
// `Symbol.asyncIterator` is unavailable.
// NOTE(review): looks like a compiler-emitted down-level helper — confirm
// before hand-editing.
var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) {
|
|
10
|
+
if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
|
|
11
|
+
var g = generator.apply(thisArg, _arguments || []), i, q = [];
|
|
12
|
+
return i = Object.create((typeof AsyncIterator === "function" ? AsyncIterator : Object).prototype), verb("next"), verb("throw"), verb("return", awaitReturn), i[Symbol.asyncIterator] = function () { return this; }, i;
|
|
13
|
+
function awaitReturn(f) { return function (v) { return Promise.resolve(v).then(f, reject); }; }
|
|
14
|
+
function verb(n, f) { if (g[n]) { i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; if (f) i[n] = f(i[n]); } }
|
|
15
|
+
function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }
|
|
16
|
+
function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }
|
|
17
|
+
function fulfill(value) { resume("next", value); }
|
|
18
|
+
function reject(value) { resume("throw", value); }
|
|
19
|
+
function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }
|
|
20
|
+
};
|
|
21
|
+
// completion(contextId=X) must run against the model that was initContext'd
|
|
22
|
+
// with X — every adapter instance owns its own monotonically-allocated id so
|
|
23
|
+
// the chat LLM and the embedding model never collide on the same native
|
|
24
|
+
// context.
|
|
25
|
+
// Next native context id to hand out. `load()` assigns the current value to an
// adapter that has no id yet and then increments it.
let nextContextId = 1;
|
|
26
|
+
// Token budget that `resolveMobileMaxTokens` returns when the requested value
// is not a positive finite number.
const DEFAULT_MAX_TOKENS = 256;
|
|
27
|
+
/**
 * Number of parallel KV slots used on mobile. Kept equal to
 * `DEFAULT_CACHE_PARALLEL` in `cache-bridge.ts`. Devices with constrained KV
 * memory get a small fixed pool, so distinct cacheKey values can still reuse
 * prefixes without blowing memory.
 */
const MOBILE_PARALLEL = 4;
/**
 * Maps a cache key onto a slot index in `[0, MOBILE_PARALLEL)`.
 *
 * Runs 32-bit FNV-1a over the key's UTF-16 code units, then folds the signed
 * 32-bit result with `Math.abs` before taking the modulus. The result is
 * deterministic across platforms and matches the agent side.
 *
 * @param {string} key cache key to hash
 * @returns {number} slot index
 */
function deriveCacheSlotId(key) {
    const FNV_OFFSET_BASIS = 0x811c9dc5;
    const FNV_PRIME = 0x01000193;
    let digest = FNV_OFFSET_BASIS;
    let position = 0;
    while (position < key.length) {
        // XOR the code unit in, then multiply by the prime in 32-bit space.
        digest = Math.imul(digest ^ key.charCodeAt(position), FNV_PRIME);
        position += 1;
    }
    const signedDigest = digest | 0;
    return Math.abs(signedDigest) % MOBILE_PARALLEL;
}
|
|
43
|
+
// Upper bound that `resolveMobileMaxTokens` applies to any requested token count.
const MOBILE_MAX_TOKENS_CAP = 256;
|
|
2
44
|
/**
 * Reports whether `value` is a non-null value whose `typeof` is "object"
 * (arrays included, functions excluded).
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isObject(value) {
    if (value === null) {
        return false;
    }
    return typeof value === "object";
}
|
|
@@ -7,7 +49,8 @@ function isLlamaCppPluginLike(value) {
|
|
|
7
49
|
typeof value.initContext === "function" &&
|
|
8
50
|
typeof value.releaseContext === "function" &&
|
|
9
51
|
typeof value.releaseAllContexts === "function" &&
|
|
10
|
-
typeof value.
|
|
52
|
+
(typeof value.completion === "function" ||
|
|
53
|
+
typeof value.generateText === "function") &&
|
|
11
54
|
typeof value.stopCompletion === "function" &&
|
|
12
55
|
typeof value.addListener === "function");
|
|
13
56
|
}
|
|
@@ -23,6 +66,42 @@ function resolveLlamaCppPlugin(mod) {
|
|
|
23
66
|
}
|
|
24
67
|
return null;
|
|
25
68
|
}
|
|
69
|
+
/**
 * Builds a plain object facade over `plugin`.
 *
 * The five required methods (`initContext`, `releaseContext`,
 * `releaseAllContexts`, `stopCompletion`, `addListener`) are always forwarded.
 * Every optional method is exposed as a forwarding wrapper only if `plugin`
 * has it as a function at the time of this call; otherwise the key holds
 * `undefined`. Wrappers look the method up again on each invocation and call
 * it with `plugin` as `this`.
 *
 * @param {object} plugin native plugin handle
 * @returns {object} facade with a fixed key set and order
 */
function toPlainLlamaCppPlugin(plugin) {
    // Forwarding wrapper for an optional method, or undefined when absent.
    // `passOptions` selects between the one-argument and zero-argument call shapes.
    const forwardOptional = (name, passOptions) => {
        if (typeof plugin[name] !== "function") {
            return undefined;
        }
        if (passOptions) {
            return (options) => {
                const method = plugin[name];
                if (method === null || method === undefined) {
                    return undefined;
                }
                return method.call(plugin, options);
            };
        }
        return () => {
            const method = plugin[name];
            if (method === null || method === undefined) {
                return undefined;
            }
            return method.call(plugin);
        };
    };
    return {
        initContext: (options) => plugin.initContext(options),
        releaseContext: (options) => plugin.releaseContext(options),
        releaseAllContexts: () => plugin.releaseAllContexts(),
        getHardwareInfo: forwardOptional("getHardwareInfo", false),
        completion: forwardOptional("completion", true),
        generateText: forwardOptional("generateText", true),
        stopCompletion: (options) => plugin.stopCompletion(options),
        embedding: forwardOptional("embedding", true),
        tokenize: forwardOptional("tokenize", true),
        setCacheType: forwardOptional("setCacheType", true),
        setSpecType: forwardOptional("setSpecType", true),
        getNativeKernels: forwardOptional("getNativeKernels", false),
        addListener: (event, listener) => plugin.addListener(event, listener),
    };
}
|
|
26
105
|
function isCapacitorNative() {
|
|
27
106
|
var _a;
|
|
28
107
|
const cap = globalThis.Capacitor;
|
|
@@ -38,15 +117,145 @@ function detectPlatform() {
|
|
|
38
117
|
return "android";
|
|
39
118
|
return "web";
|
|
40
119
|
}
|
|
41
|
-
|
|
120
|
+
/**
 * Resolves the token budget for a mobile completion request.
 *
 * The requested value is floored to a whole number first. Anything that is
 * not a finite number, or that floors to less than one token, falls back to
 * `DEFAULT_MAX_TOKENS`. Flooring before the range check keeps fractional
 * requests such as 0.5 from resolving to a zero-token budget.
 *
 * @param {number|null|undefined} requested caller-supplied max token count
 * @returns {number} whole number in `[1, MOBILE_MAX_TOKENS_CAP]`, or
 *   `DEFAULT_MAX_TOKENS` when `requested` is unusable
 */
function resolveMobileMaxTokens(requested) {
    // Number.isFinite does not coerce, so null, undefined and strings land here too.
    if (!Number.isFinite(requested)) {
        return DEFAULT_MAX_TOKENS;
    }
    const wholeTokens = Math.floor(requested);
    if (wholeTokens < 1) {
        return DEFAULT_MAX_TOKENS;
    }
    return Math.min(wholeTokens, MOBILE_MAX_TOKENS_CAP);
}
|
|
126
|
+
/**
 * Returns `value` when it is a finite number, otherwise `null`.
 *
 * @param {unknown} value
 * @returns {number|null}
 */
function numberFromUnknown(value) {
    const isUsable = typeof value === "number" && Number.isFinite(value);
    return isUsable ? value : null;
}
|
|
131
|
+
/**
 * Returns `value` when it is a boolean, otherwise `undefined`.
 *
 * @param {unknown} value
 * @returns {boolean|undefined}
 */
function booleanFromUnknown(value) {
    if (typeof value === "boolean") {
        return value;
    }
    return undefined;
}
|
|
134
|
+
/**
 * Returns the trimmed form of `value` when it is a string with non-whitespace
 * content, otherwise `undefined`.
 *
 * @param {unknown} value
 * @returns {string|undefined}
 */
function stringFromUnknown(value) {
    if (typeof value !== "string") {
        return undefined;
    }
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : undefined;
}
|
|
139
|
+
/**
 * Builds the hardware description used when the native probe cannot be
 * consulted.
 *
 * RAM and core counts are read from `globalThis.navigator` when present and
 * default to 0. A Metal GPU is reported for "ios" only. The result is always
 * tagged `source: "adapter-fallback"` and reports no MTP support.
 *
 * @param {string} [platform] platform label; defaults to `detectPlatform()`
 * @param {string} [reason] text stored in `mtpReason`
 * @returns {object} hardware info record
 */
function fallbackHardwareInfo(platform = detectPlatform(), reason = "native hardware probe unavailable") {
    const nav = globalThis.navigator;
    const hasNavigator = nav !== null && nav !== undefined;
    const reportedRamGb = numberFromUnknown(hasNavigator ? nav.deviceMemory : undefined);
    const reportedCores = hasNavigator ? nav.hardwareConcurrency : undefined;
    const isIos = platform === "ios";
    return {
        platform,
        deviceModel: platform,
        totalRamGb: reportedRamGb !== null && reportedRamGb !== undefined ? reportedRamGb : 0,
        availableRamGb: null,
        cpuCores: reportedCores !== null && reportedCores !== undefined ? reportedCores : 0,
        gpu: isIos ? { backend: "metal", available: true } : null,
        gpuSupported: isIos,
        mtpSupported: false,
        mtpReason: reason,
        source: "adapter-fallback",
        nativeKernels: [],
        forkVariant: null,
    };
}
|
|
161
|
+
/**
 * Default GPU setting for a platform when the caller states no preference.
 *
 * Only "ios" defaults to GPU: iOS builds use the Metal-capable native path.
 * Android's current Capacitor wrapper is CPU-only unless a forked Vulkan
 * bridge explicitly opts in, so CPU is the safe production default elsewhere.
 *
 * @param {string} [platform] platform label; defaults to `detectPlatform()`
 * @returns {boolean}
 */
function defaultNativeGpuEnabled(platform = detectPlatform()) {
    switch (platform) {
        case "ios":
            return true;
        default:
            return false;
    }
}
|
|
167
|
+
/**
 * Resolves the effective GPU flag: an explicit boolean wins, anything else
 * defers to `defaultNativeGpuEnabled()`.
 *
 * @param {boolean|undefined} useGpu caller preference
 * @returns {boolean}
 */
function resolveNativeGpuEnabled(useGpu) {
    if (useGpu === true || useGpu === false) {
        return useGpu;
    }
    return defaultNativeGpuEnabled();
}
|
|
170
|
+
/**
 * Validates a fork-variant marker.
 *
 * @param {unknown} value
 * @returns {"buun-llama-cpp"|"stock-llama-cpp"|null|undefined} the value
 *   itself when it is one of the two known variants or `null`; `undefined`
 *   for everything else
 */
function normalizeForkVariant(value) {
    switch (value) {
        case "buun-llama-cpp":
        case "stock-llama-cpp":
        case null:
            return value;
        default:
            return undefined;
    }
}
|
|
177
|
+
/**
 * Extracts the non-empty string entries of an array.
 *
 * @param {unknown} value
 * @returns {string[]|undefined} a new array of the non-empty strings in
 *   `value`, or `undefined` when `value` is not an array
 */
function stringArrayFromUnknown(value) {
    return Array.isArray(value)
        ? value.filter((entry) => typeof entry === "string" && entry.length > 0)
        : undefined;
}
|
|
187
|
+
/**
 * Sanitises a hardware-info payload into the adapter's record shape.
 *
 * Each field is validated on its own. Invalid or missing required fields fall
 * back to the matching `fallbackHardwareInfo(platform)` value. Optional fields
 * (`machineId`, `osVersion`, `isSimulator`, `freeStorageGb`, `lowPowerMode`,
 * `thermalState`) are included only when valid. A falsy `value` returns the
 * fallback record unchanged.
 *
 * @param {object|null|undefined} value raw payload
 * @param {string} [platform] platform label; defaults to `detectPlatform()`
 * @returns {object} normalised hardware info record
 */
function normalizeHardwareInfo(value, platform = detectPlatform()) {
    const fallback = fallbackHardwareInfo(platform);
    if (!value) {
        return fallback;
    }
    const KNOWN_PLATFORMS = ["ios", "android", "web"];
    const KNOWN_BACKENDS = ["metal", "vulkan", "gpu-delegate"];
    const KNOWN_THERMAL_STATES = ["nominal", "fair", "serious", "critical", "unknown"];
    const isMissing = (candidate) => candidate === null || candidate === undefined;

    const parsedTotalRam = numberFromUnknown(value.totalRamGb);
    const totalRamGb = isMissing(parsedTotalRam) ? fallback.totalRamGb : parsedTotalRam;

    // An explicit null is kept as "unknown" rather than replaced by the fallback.
    let availableRamGb = null;
    if (value.availableRamGb !== null) {
        const parsedAvailableRam = numberFromUnknown(value.availableRamGb);
        availableRamGb = isMissing(parsedAvailableRam) ? fallback.availableRamGb : parsedAvailableRam;
    }

    let gpu = fallback.gpu;
    if (value.gpu && isObject(value.gpu)) {
        let backend;
        if (KNOWN_BACKENDS.includes(value.gpu.backend)) {
            backend = value.gpu.backend;
        }
        else if (!isMissing(fallback.gpu) && !isMissing(fallback.gpu.backend)) {
            backend = fallback.gpu.backend;
        }
        else {
            backend = "gpu-delegate";
        }
        gpu = { backend, available: Boolean(value.gpu.available) };
    }

    // Keys are added in a fixed order so the record's key order stays stable.
    const info = {};
    info.platform = KNOWN_PLATFORMS.includes(value.platform) ? value.platform : platform;
    const deviceModel = stringFromUnknown(value.deviceModel);
    info.deviceModel = isMissing(deviceModel) ? fallback.deviceModel : deviceModel;
    const machineId = stringFromUnknown(value.machineId);
    if (machineId) {
        info.machineId = machineId;
    }
    const osVersion = stringFromUnknown(value.osVersion);
    if (osVersion) {
        info.osVersion = osVersion;
    }
    if (typeof value.isSimulator === "boolean") {
        info.isSimulator = value.isSimulator;
    }
    info.totalRamGb = totalRamGb;
    info.availableRamGb = availableRamGb;
    const freeStorageGb = numberFromUnknown(value.freeStorageGb);
    if (freeStorageGb !== null) {
        info.freeStorageGb = freeStorageGb;
    }
    const cpuCores = numberFromUnknown(value.cpuCores);
    info.cpuCores = isMissing(cpuCores) ? fallback.cpuCores : cpuCores;
    info.gpu = gpu;
    const gpuSupported = booleanFromUnknown(value.gpuSupported);
    info.gpuSupported = isMissing(gpuSupported) ? fallback.gpuSupported : gpuSupported;
    if (typeof value.lowPowerMode === "boolean") {
        info.lowPowerMode = value.lowPowerMode;
    }
    if (KNOWN_THERMAL_STATES.includes(value.thermalState)) {
        info.thermalState = value.thermalState;
    }
    info.mtpSupported = Boolean(value.mtpSupported);
    const statedReason = stringFromUnknown(value.mtpReason);
    if (!isMissing(statedReason)) {
        info.mtpReason = statedReason;
    }
    else if (value.mtpSupported) {
        // Supported with no stated reason: the key is present but undefined.
        info.mtpReason = undefined;
    }
    else {
        info.mtpReason = "native plugin did not report MTP support";
    }
    info.source = value.source === "native" ? "native" : "adapter-fallback";
    const nativeKernels = stringArrayFromUnknown(value.nativeKernels);
    info.nativeKernels = isMissing(nativeKernels) ? [] : nativeKernels;
    const forkVariant = normalizeForkVariant(value.forkVariant);
    info.forkVariant = isMissing(forkVariant) ? null : forkVariant;
    return info;
}
|
|
231
|
+
export class CapacitorLlamaAdapter {
|
|
42
232
|
constructor() {
|
|
43
233
|
this.plugin = null;
|
|
44
234
|
/** Cached loader promise so concurrent `load()` calls don't race to register duplicate listeners. */
|
|
45
235
|
this.pluginLoadPromise = null;
|
|
46
236
|
this.loadedPath = null;
|
|
237
|
+
/**
|
|
238
|
+
* Native context id this adapter owns. Allocated lazily on first `load()`
|
|
239
|
+
* from the process-wide `nextContextId` counter so distinct adapter
|
|
240
|
+
* instances never share a context — see the module-level invariant comment.
|
|
241
|
+
*/
|
|
242
|
+
this.contextId = null;
|
|
47
243
|
this.tokenIndex = 0;
|
|
48
244
|
this.tokenListeners = new Set();
|
|
49
245
|
this.pluginListenerHandle = null;
|
|
246
|
+
/**
|
|
247
|
+
* Latest native completion stats captured by `generateStream`. Read by
|
|
248
|
+
* the `generate()` wrapper to populate `GenerateResult` without
|
|
249
|
+
* re-issuing the native call. Cleared at the start of every
|
|
250
|
+
* `generateStream` invocation.
|
|
251
|
+
*/
|
|
252
|
+
this.lastCompletionStats = null;
|
|
253
|
+
}
|
|
254
|
+
requireContextId() {
|
|
255
|
+
if (this.contextId === null) {
|
|
256
|
+
throw new Error("No model loaded. Call load() first.");
|
|
257
|
+
}
|
|
258
|
+
return this.contextId;
|
|
50
259
|
}
|
|
51
260
|
async loadPlugin() {
|
|
52
261
|
if (this.plugin)
|
|
@@ -54,10 +263,11 @@ class CapacitorLlamaAdapter {
|
|
|
54
263
|
if (this.pluginLoadPromise)
|
|
55
264
|
return this.pluginLoadPromise;
|
|
56
265
|
this.pluginLoadPromise = (async () => {
|
|
57
|
-
const
|
|
58
|
-
if (!
|
|
59
|
-
throw new Error("llama-cpp-capacitor did not expose
|
|
266
|
+
const nativePlugin = resolveLlamaCppPlugin(await import("llama-cpp-capacitor"));
|
|
267
|
+
if (!nativePlugin) {
|
|
268
|
+
throw new Error("llama-cpp-capacitor did not expose the native LlamaCpp methods");
|
|
60
269
|
}
|
|
270
|
+
const plugin = toPlainLlamaCppPlugin(nativePlugin);
|
|
61
271
|
const tokenListenerHandle = await plugin.addListener("@LlamaCpp_onToken", (data) => {
|
|
62
272
|
var _a, _b;
|
|
63
273
|
const token = (_b = (_a = data.tokenResult) === null || _a === void 0 ? void 0 : _a.token) !== null && _b !== void 0 ? _b : data.token;
|
|
@@ -86,19 +296,73 @@ class CapacitorLlamaAdapter {
|
|
|
86
296
|
}
|
|
87
297
|
}
|
|
88
298
|
async getHardwareInfo() {
|
|
89
|
-
var _a;
|
|
299
|
+
var _a, _b, _c;
|
|
90
300
|
const platform = detectPlatform();
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
301
|
+
if (!isCapacitorNative())
|
|
302
|
+
return fallbackHardwareInfo(platform);
|
|
303
|
+
try {
|
|
304
|
+
const plugin = await this.loadPlugin();
|
|
305
|
+
const baseInfo = normalizeHardwareInfo(await ((_a = plugin.getHardwareInfo) === null || _a === void 0 ? void 0 : _a.call(plugin)), platform);
|
|
306
|
+
// Probe fork-specific kernels through the optional bridge method.
|
|
307
|
+
// Stock builds and older fork builds without the bridge fall back
|
|
308
|
+
// to the empty list + "stock-llama-cpp" variant marker.
|
|
309
|
+
let nativeKernels = (_b = baseInfo.nativeKernels) !== null && _b !== void 0 ? _b : [];
|
|
310
|
+
let forkVariant = (_c = baseInfo.forkVariant) !== null && _c !== void 0 ? _c : "stock-llama-cpp";
|
|
311
|
+
if (typeof plugin.getNativeKernels === "function") {
|
|
312
|
+
try {
|
|
313
|
+
const probe = await plugin.getNativeKernels();
|
|
314
|
+
const kernels = stringArrayFromUnknown(probe === null || probe === void 0 ? void 0 : probe.kernels);
|
|
315
|
+
if (kernels)
|
|
316
|
+
nativeKernels = kernels;
|
|
317
|
+
const variant = normalizeForkVariant(probe === null || probe === void 0 ? void 0 : probe.variant);
|
|
318
|
+
if (variant !== undefined)
|
|
319
|
+
forkVariant = variant;
|
|
320
|
+
else if (nativeKernels.length > 0)
|
|
321
|
+
forkVariant = "buun-llama-cpp";
|
|
322
|
+
}
|
|
323
|
+
catch (err) {
|
|
324
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
325
|
+
console.debug("[capacitor-llama] getNativeKernels probe failed", {
|
|
326
|
+
error: message,
|
|
327
|
+
});
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
return Object.assign(Object.assign({}, baseInfo), { nativeKernels,
|
|
331
|
+
forkVariant });
|
|
332
|
+
}
|
|
333
|
+
catch (error) {
|
|
334
|
+
return fallbackHardwareInfo(platform, error instanceof Error ? error.message : "native hardware probe failed");
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
async setCacheType(typeK, typeV) {
|
|
338
|
+
if (!isCapacitorNative()) {
|
|
339
|
+
console.warn("[capacitor-llama] setCacheType called on non-native platform; ignoring");
|
|
340
|
+
return;
|
|
341
|
+
}
|
|
342
|
+
const plugin = await this.loadPlugin();
|
|
343
|
+
if (typeof plugin.setCacheType !== "function") {
|
|
344
|
+
console.warn("[capacitor-llama] underlying plugin does not expose setCacheType (likely stock build); cache types must be passed via load() params instead");
|
|
345
|
+
return;
|
|
346
|
+
}
|
|
347
|
+
await plugin.setCacheType({ cacheTypeK: typeK, cacheTypeV: typeV });
|
|
348
|
+
}
|
|
349
|
+
async setSpecType(args) {
|
|
350
|
+
if (!isCapacitorNative()) {
|
|
351
|
+
console.warn("[capacitor-llama] setSpecType called on non-native platform; ignoring");
|
|
352
|
+
return;
|
|
353
|
+
}
|
|
354
|
+
const plugin = await this.loadPlugin();
|
|
355
|
+
if (typeof plugin.setSpecType !== "function") {
|
|
356
|
+
console.warn("[capacitor-llama] underlying plugin does not expose setSpecType (likely stock build); pass draft_model + draft_min/max via load() instead");
|
|
357
|
+
return;
|
|
358
|
+
}
|
|
359
|
+
await plugin.setSpecType({
|
|
360
|
+
target: args.target,
|
|
361
|
+
drafter: args.drafter,
|
|
362
|
+
specType: args.specType,
|
|
363
|
+
draftMin: args.draftMin,
|
|
364
|
+
draftMax: args.draftMax,
|
|
365
|
+
});
|
|
102
366
|
}
|
|
103
367
|
async isLoaded() {
|
|
104
368
|
return {
|
|
@@ -110,48 +374,109 @@ class CapacitorLlamaAdapter {
|
|
|
110
374
|
return this.loadedPath;
|
|
111
375
|
}
|
|
112
376
|
async load(options) {
|
|
113
|
-
var _a, _b;
|
|
377
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k;
|
|
114
378
|
if (!isCapacitorNative()) {
|
|
115
379
|
throw new Error("capacitor-llama is only available on iOS and Android builds");
|
|
116
380
|
}
|
|
117
381
|
const plugin = await this.loadPlugin();
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
382
|
+
// Release this adapter's own prior context (if any) before reusing the
|
|
383
|
+
// context id for a new model. We do NOT call `releaseAllContexts` here
|
|
384
|
+
// — that would destroy contexts owned by sibling adapter instances
|
|
385
|
+
// (e.g. tear down the embedding model when the chat model reloads).
|
|
386
|
+
if (this.contextId !== null && this.loadedPath !== null) {
|
|
387
|
+
try {
|
|
388
|
+
await plugin.releaseContext({ contextId: this.contextId });
|
|
389
|
+
}
|
|
390
|
+
catch (_l) {
|
|
391
|
+
// The native side may have already cleared this context; safe to
|
|
392
|
+
// proceed to reinit on the same id.
|
|
393
|
+
}
|
|
394
|
+
}
|
|
395
|
+
this.loadedPath = null;
|
|
396
|
+
if (this.contextId === null) {
|
|
397
|
+
this.contextId = nextContextId++;
|
|
121
398
|
}
|
|
399
|
+
const speculativeSamples = options.mobileSpeculative
|
|
400
|
+
? Math.min((_b = (_a = options.speculativeSamples) !== null && _a !== void 0 ? _a : options.draftMax) !== null && _b !== void 0 ? _b : 3, 4)
|
|
401
|
+
: ((_c = options.speculativeSamples) !== null && _c !== void 0 ? _c : 3);
|
|
402
|
+
const nativeGpuEnabled = resolveNativeGpuEnabled(options.useGpu);
|
|
403
|
+
const params = Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({ model: options.modelPath, n_ctx: (_d = options.contextSize) !== null && _d !== void 0 ? _d : 4096, n_gpu_layers: nativeGpuEnabled ? 99 : 0, n_threads: (_e = options.maxThreads) !== null && _e !== void 0 ? _e : 0, use_mmap: true, flash_attn: nativeGpuEnabled, embedding: looksLikeEmbeddingModelPath(options.modelPath), n_batch: options.mobileSpeculative ? 128 : 512, n_ubatch: options.mobileSpeculative ? 64 : 512 }, (options.draftModelPath
|
|
404
|
+
? {
|
|
405
|
+
draft_model: options.draftModelPath,
|
|
406
|
+
speculative_samples: speculativeSamples,
|
|
407
|
+
mobile_speculative: (_f = options.mobileSpeculative) !== null && _f !== void 0 ? _f : true,
|
|
408
|
+
}
|
|
409
|
+
: {})), (options.draftContextSize
|
|
410
|
+
? { n_ctx_draft: options.draftContextSize }
|
|
411
|
+
: {})), (options.draftMin ? { draft_min: options.draftMin } : {})), (options.draftMax ? { draft_max: options.draftMax } : {})), (options.cacheTypeK ? { cache_type_k: options.cacheTypeK } : {})), (options.cacheTypeV ? { cache_type_v: options.cacheTypeV } : {})), (options.disableThinking ? { reasoning: false } : {}));
|
|
122
412
|
await plugin.initContext({
|
|
123
|
-
contextId:
|
|
124
|
-
params
|
|
125
|
-
model: options.modelPath,
|
|
126
|
-
n_ctx: (_a = options.contextSize) !== null && _a !== void 0 ? _a : 4096,
|
|
127
|
-
n_gpu_layers: options.useGpu === false ? 0 : 99,
|
|
128
|
-
n_threads: (_b = options.maxThreads) !== null && _b !== void 0 ? _b : 0,
|
|
129
|
-
use_mmap: true,
|
|
130
|
-
},
|
|
413
|
+
contextId: this.contextId,
|
|
414
|
+
params,
|
|
131
415
|
});
|
|
416
|
+
// Fork builds expose a separate `setSpecType` bridge that configures
|
|
417
|
+
// the MTP drafter after the main context is up. Stock builds lack
|
|
418
|
+
// the method and the setter warns and skips it. We auto-call here so
|
|
419
|
+
// callers only need to pass `draftModelPath` once via load() — the
|
|
420
|
+
// adapter then handles both the params-bag path (stock fallback) and
|
|
421
|
+
// the explicit setSpecType path (fork build) in one shot.
|
|
422
|
+
if (options.draftModelPath && typeof plugin.setSpecType === "function") {
|
|
423
|
+
try {
|
|
424
|
+
await plugin.setSpecType({
|
|
425
|
+
target: options.modelPath,
|
|
426
|
+
drafter: options.draftModelPath,
|
|
427
|
+
specType: "mtp",
|
|
428
|
+
draftMin: (_g = options.draftMin) !== null && _g !== void 0 ? _g : 1,
|
|
429
|
+
draftMax: (_h = options.draftMax) !== null && _h !== void 0 ? _h : 3,
|
|
430
|
+
});
|
|
431
|
+
}
|
|
432
|
+
catch (err) {
|
|
433
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
434
|
+
console.warn("[capacitor-llama] setSpecType failed; spec decode disabled", { error: message });
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
// Same pattern for cache_type_k/v: fork builds may surface a separate
|
|
438
|
+
// setCacheType bridge; stock builds rely on the params bag only.
|
|
439
|
+
if ((options.cacheTypeK || options.cacheTypeV) &&
|
|
440
|
+
typeof plugin.setCacheType === "function") {
|
|
441
|
+
try {
|
|
442
|
+
await plugin.setCacheType({
|
|
443
|
+
cacheTypeK: (_j = options.cacheTypeK) !== null && _j !== void 0 ? _j : "f16",
|
|
444
|
+
cacheTypeV: (_k = options.cacheTypeV) !== null && _k !== void 0 ? _k : "f16",
|
|
445
|
+
});
|
|
446
|
+
}
|
|
447
|
+
catch (err) {
|
|
448
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
449
|
+
console.warn("[capacitor-llama] setCacheType failed; cache types may be unchanged", { error: message });
|
|
450
|
+
}
|
|
451
|
+
}
|
|
132
452
|
this.loadedPath = options.modelPath;
|
|
133
453
|
}
|
|
134
454
|
async unload() {
|
|
135
|
-
if (!this.plugin || !this.loadedPath)
|
|
455
|
+
if (!this.plugin || !this.loadedPath || this.contextId === null)
|
|
136
456
|
return;
|
|
137
457
|
try {
|
|
138
|
-
await this.plugin.releaseContext({ contextId:
|
|
458
|
+
await this.plugin.releaseContext({ contextId: this.contextId });
|
|
139
459
|
}
|
|
140
460
|
catch (_a) {
|
|
461
|
+
// Fall back to a targeted release-all only when the per-context
|
|
462
|
+
// release fails; this used to be the always-path but it now risks
|
|
463
|
+
// tearing down sibling adapter instances and is reserved for the
|
|
464
|
+
// pathological case where the native side has lost track of our id.
|
|
141
465
|
await this.plugin.releaseAllContexts();
|
|
142
466
|
}
|
|
143
467
|
this.loadedPath = null;
|
|
144
468
|
}
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
469
|
+
/**
|
|
470
|
+
* Build the params object for the native completion call. Shared between
|
|
471
|
+
* the legacy `generate()` path and the new `generateStream()` path so the
|
|
472
|
+
* cache-key + stop-sequence wiring lives in one place.
|
|
473
|
+
*/
|
|
474
|
+
buildNativeParams(options) {
|
|
475
|
+
var _a, _b;
|
|
151
476
|
const params = {
|
|
152
|
-
n_predict: (
|
|
153
|
-
temperature: (
|
|
154
|
-
top_p: (
|
|
477
|
+
n_predict: resolveMobileMaxTokens(options.maxTokens),
|
|
478
|
+
temperature: (_a = options.temperature) !== null && _a !== void 0 ? _a : 0.7,
|
|
479
|
+
top_p: (_b = options.topP) !== null && _b !== void 0 ? _b : 0.9,
|
|
155
480
|
};
|
|
156
481
|
if (options.stopSequences && options.stopSequences.length > 0) {
|
|
157
482
|
params.stop = options.stopSequences;
|
|
@@ -159,26 +484,294 @@ class CapacitorLlamaAdapter {
|
|
|
159
484
|
if (options.stream) {
|
|
160
485
|
params.emit_partial_completion = true;
|
|
161
486
|
}
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
487
|
+
// Cache key threading: surface the slot id derived from
|
|
488
|
+
// ProviderCachePlan.promptCacheKey to the native side. Stock
|
|
489
|
+
// llama-cpp-capacitor builds ignore the field; the patched fork build
|
|
490
|
+
// reads it via setCacheType / completion params and pins KV slots.
|
|
491
|
+
if (options.cacheKey) {
|
|
492
|
+
const slotId = deriveCacheSlotId(options.cacheKey);
|
|
493
|
+
params.cache_prompt = true;
|
|
494
|
+
params.slot_id = slotId;
|
|
495
|
+
}
|
|
496
|
+
return params;
|
|
497
|
+
}
|
|
498
|
+
/**
|
|
499
|
+
* Invoke the native completion (or generateText) entry point with a
|
|
500
|
+
* pre-built params bag. Returns the raw native result; callers map this
|
|
501
|
+
* to `GenerateResult` or to a `done` event.
|
|
502
|
+
*/
|
|
503
|
+
async runNativeCompletion(options, params) {
|
|
504
|
+
var _a;
|
|
505
|
+
const plugin = this.plugin;
|
|
506
|
+
if (!plugin) {
|
|
507
|
+
throw new Error("No model loaded. Call load() first.");
|
|
508
|
+
}
|
|
509
|
+
const contextId = this.requireContextId();
|
|
510
|
+
const result = typeof plugin.completion === "function"
|
|
511
|
+
? await plugin.completion({
|
|
512
|
+
contextId,
|
|
513
|
+
params: Object.assign({ prompt: options.prompt, emit_partial_completion: Boolean(params.emit_partial_completion) }, params),
|
|
514
|
+
})
|
|
515
|
+
: await ((_a = plugin.generateText) === null || _a === void 0 ? void 0 : _a.call(plugin, {
|
|
516
|
+
contextId,
|
|
517
|
+
prompt: options.prompt,
|
|
518
|
+
params,
|
|
519
|
+
}));
|
|
520
|
+
if (!result) {
|
|
521
|
+
throw new Error("llama-cpp-capacitor did not expose completion() or generateText()");
|
|
522
|
+
}
|
|
523
|
+
return result;
|
|
524
|
+
}
|
|
525
|
+
/**
|
|
526
|
+
* Native bridges currently don't honour per-generation sampler-stage
|
|
527
|
+
* injection — the Swift / Kotlin side needs separate wiring. Until that
|
|
528
|
+
* lands we log once per stage and otherwise pass through. The stages
|
|
529
|
+
* remain in the options object so downstream observers (telemetry,
|
|
530
|
+
* tests) can still see them.
|
|
531
|
+
*/
|
|
532
|
+
logUnwiredSamplerStages(stages) {
|
|
533
|
+
if (!stages || stages.length === 0)
|
|
534
|
+
return;
|
|
535
|
+
for (const stage of stages) {
|
|
536
|
+
console.debug(`[capacitor-llama] sampler stage "${stage.kind}" received but not yet wired in native bridge`);
|
|
537
|
+
}
|
|
538
|
+
}
|
|
539
|
+
async generate(options) {
|
|
540
|
+
var _a, e_1, _b, _c;
|
|
541
|
+
// Wrapper over `generateStream` so the cache-key, stop-sequence, and
|
|
542
|
+
// native-call wiring lives in exactly one place. Drains the stream
|
|
543
|
+
// into the legacy `GenerateResult` shape; per-token events surface to
|
|
544
|
+
// any `onToken` listener via the native event bridge (unchanged).
|
|
545
|
+
let text = "";
|
|
546
|
+
let promptTokens = 0;
|
|
547
|
+
let outputTokens = 0;
|
|
548
|
+
let durationMs = 0;
|
|
549
|
+
let lastError = null;
|
|
550
|
+
// Wall-clock time-to-first-token: from the call start to the first decoded
|
|
551
|
+
// token event. This is the on-device prefill wall-clock the resource
|
|
552
|
+
// workbench differences into prefill vs decode throughput. Stays undefined
|
|
553
|
+
// when the generation yields no tokens.
|
|
554
|
+
const startedAt = Date.now();
|
|
555
|
+
let ttftMs;
|
|
556
|
+
try {
|
|
557
|
+
for (var _d = true, _e = __asyncValues(this.generateStream(options)), _f; _f = await _e.next(), _a = _f.done, !_a; _d = true) {
|
|
558
|
+
_c = _f.value;
|
|
559
|
+
_d = false;
|
|
560
|
+
const event = _c;
|
|
561
|
+
if (event.kind === "token") {
|
|
562
|
+
if (ttftMs === undefined)
|
|
563
|
+
ttftMs = Date.now() - startedAt;
|
|
564
|
+
text += event.text;
|
|
565
|
+
}
|
|
566
|
+
else if (event.kind === "telemetry") {
|
|
567
|
+
// Native bridge currently emits no telemetry events; ignored here
|
|
568
|
+
// because the final `done` event carries the authoritative totals.
|
|
569
|
+
}
|
|
570
|
+
else if (event.kind === "error") {
|
|
571
|
+
lastError = event.message;
|
|
572
|
+
}
|
|
573
|
+
else if (event.kind === "done") {
|
|
574
|
+
// The done payload's authoritative fields come from the
|
|
575
|
+
// closed-over scope below — set when the native call returns.
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
}
|
|
579
|
+
catch (e_1_1) { e_1 = { error: e_1_1 }; }
|
|
580
|
+
finally {
|
|
581
|
+
try {
|
|
582
|
+
if (!_d && !_a && (_b = _e.return)) await _b.call(_e);
|
|
583
|
+
}
|
|
584
|
+
finally { if (e_1) throw e_1.error; }
|
|
585
|
+
}
|
|
586
|
+
if (lastError)
|
|
587
|
+
throw new Error(lastError);
|
|
588
|
+
// Re-read native counters from the cached completion result. We stored
|
|
589
|
+
// them on `this.lastCompletionStats` inside the stream's lifecycle.
|
|
590
|
+
const stats = this.lastCompletionStats;
|
|
591
|
+
if (stats) {
|
|
592
|
+
promptTokens = stats.promptTokens;
|
|
593
|
+
outputTokens = stats.outputTokens;
|
|
594
|
+
durationMs = stats.durationMs;
|
|
595
|
+
if (stats.text) {
|
|
596
|
+
// The native call's authoritative text. Use it instead of the
|
|
597
|
+
// token-event-assembled string so callers see exactly what the
|
|
598
|
+
// bridge produced (some bridges only emit tokens, others emit
|
|
599
|
+
// partial+final; assembled text isn't always equal).
|
|
600
|
+
text = stats.text;
|
|
601
|
+
}
|
|
602
|
+
}
|
|
603
|
+
return Object.assign({ text,
|
|
604
|
+
promptTokens,
|
|
605
|
+
outputTokens,
|
|
606
|
+
durationMs }, (ttftMs !== undefined ? { ttftMs } : {}));
|
|
607
|
+
}
|
|
608
|
+
/**
|
|
609
|
+
* Streaming generation. Subscribes to the native token event bridge,
|
|
610
|
+
* starts the completion call, and yields typed `GenerationEvent`s as
|
|
611
|
+
* tokens arrive. The stream ends with exactly one `done` event (or one
|
|
612
|
+
* terminal `error`) once the native call resolves.
|
|
613
|
+
*
|
|
614
|
+
* Sampler-stage injection (`samplerStages`) and the per-generation
|
|
615
|
+
* spec-decode toggle (`specDecode`) are accepted but currently pass
|
|
616
|
+
* through unchanged on the JS side — the Swift / Kotlin bridge wiring is tracked
|
|
617
|
+
* separately. They flow through as part of the options bag so the
|
|
618
|
+
* native side can pick them up without an interface change.
|
|
619
|
+
*/
|
|
620
|
+
generateStream(options) {
|
|
621
|
+
return __asyncGenerator(this, arguments, function* generateStream_1() {
|
|
622
|
+
var _a;
|
|
623
|
+
if (!this.plugin || !this.loadedPath) {
|
|
624
|
+
throw new Error("No model loaded. Call load() first.");
|
|
625
|
+
}
|
|
626
|
+
this.tokenIndex = 0;
|
|
627
|
+
this.lastCompletionStats = null;
|
|
628
|
+
this.logUnwiredSamplerStages(options.samplerStages);
|
|
629
|
+
const queue = [];
|
|
630
|
+
let waiter = null;
|
|
631
|
+
const wake = () => {
|
|
632
|
+
if (waiter) {
|
|
633
|
+
const w = waiter;
|
|
634
|
+
waiter = null;
|
|
635
|
+
w();
|
|
636
|
+
}
|
|
637
|
+
};
|
|
638
|
+
const push = (event) => {
|
|
639
|
+
queue.push(event);
|
|
640
|
+
wake();
|
|
641
|
+
};
|
|
642
|
+
// Subscribe to per-token events. The native bridge fires
|
|
643
|
+
// `@LlamaCpp_onToken`; our existing class-level listener forwards into
|
|
644
|
+
// every `onToken(listener)` consumer. We register one more listener
|
|
645
|
+
// here, scoped to this stream, that converts strings into `token`
|
|
646
|
+
// events.
|
|
647
|
+
const unsubscribe = this.onToken((tokenText, index) => {
|
|
648
|
+
push({ kind: "token", text: tokenText, index });
|
|
649
|
+
});
|
|
650
|
+
const params = this.buildNativeParams(Object.assign(Object.assign({}, options), {
|
|
651
|
+
// generateStream implies streaming — force on so the bridge emits
|
|
652
|
+
// partial completions even when the caller didn't set `stream: true`
|
|
653
|
+
// on the legacy options bag.
|
|
654
|
+
stream: true }));
|
|
655
|
+
const started = Date.now();
|
|
656
|
+
let completionPromise;
|
|
657
|
+
try {
|
|
658
|
+
completionPromise = this.runNativeCompletion(options, params);
|
|
659
|
+
}
|
|
660
|
+
catch (err) {
|
|
661
|
+
unsubscribe();
|
|
662
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
663
|
+
yield yield __await({ kind: "error", message, recoverable: false });
|
|
664
|
+
yield yield __await({ kind: "done", finishReason: "error" });
|
|
665
|
+
return yield __await(void 0);
|
|
666
|
+
}
|
|
667
|
+
// Wrapped in an object so TS's control-flow analysis doesn't widen the
|
|
668
|
+
// closed-over assignments back to `null`/`never` when we read them
|
|
669
|
+
// after the loop. (Plain `let` with `null` init narrows badly after
|
|
670
|
+
// an async assignment.)
|
|
671
|
+
const completionState = { result: null, error: null, done: false };
|
|
672
|
+
completionPromise
|
|
673
|
+
.then((result) => {
|
|
674
|
+
completionState.result = result;
|
|
675
|
+
})
|
|
676
|
+
.catch((err) => {
|
|
677
|
+
completionState.error =
|
|
678
|
+
err instanceof Error ? err : { message: String(err) };
|
|
679
|
+
})
|
|
680
|
+
.finally(() => {
|
|
681
|
+
completionState.done = true;
|
|
682
|
+
wake();
|
|
683
|
+
});
|
|
684
|
+
try {
|
|
685
|
+
while (true) {
|
|
686
|
+
if (queue.length > 0) {
|
|
687
|
+
yield yield __await(queue.shift());
|
|
688
|
+
continue;
|
|
689
|
+
}
|
|
690
|
+
if (completionState.done)
|
|
691
|
+
break;
|
|
692
|
+
yield __await(new Promise((resolve) => {
|
|
693
|
+
waiter = resolve;
|
|
694
|
+
}));
|
|
695
|
+
}
|
|
696
|
+
}
|
|
697
|
+
finally {
|
|
698
|
+
unsubscribe();
|
|
699
|
+
}
|
|
700
|
+
if (completionState.error) {
|
|
701
|
+
yield yield __await({
|
|
702
|
+
kind: "error",
|
|
703
|
+
message: completionState.error.message,
|
|
704
|
+
recoverable: false,
|
|
705
|
+
});
|
|
706
|
+
yield yield __await({ kind: "done", finishReason: "error" });
|
|
707
|
+
return yield __await(void 0);
|
|
708
|
+
}
|
|
709
|
+
if (completionState.result) {
|
|
710
|
+
const r = completionState.result;
|
|
711
|
+
const duration = ((_a = r.timings) === null || _a === void 0 ? void 0 : _a.predicted_ms) != null
|
|
712
|
+
? Math.round(r.timings.predicted_ms)
|
|
713
|
+
: Date.now() - started;
|
|
714
|
+
this.lastCompletionStats = {
|
|
715
|
+
text: r.text,
|
|
716
|
+
promptTokens: r.tokens_evaluated,
|
|
717
|
+
outputTokens: r.tokens_predicted,
|
|
718
|
+
durationMs: duration,
|
|
719
|
+
};
|
|
720
|
+
// Reason heuristic: native fork doesn't expose a finish-reason
|
|
721
|
+
// enum yet. "stop" is the dominant case; "length" when we hit the
|
|
722
|
+
// requested n_predict ceiling exactly. Tool/cancel/error are
|
|
723
|
+
// emitted by the explicit paths above and aren't reachable here.
|
|
724
|
+
const requested = resolveMobileMaxTokens(options.maxTokens);
|
|
725
|
+
const finishReason = r.tokens_predicted >= requested ? "length" : "stop";
|
|
726
|
+
yield yield __await({ kind: "done", finishReason });
|
|
727
|
+
return yield __await(void 0);
|
|
728
|
+
}
|
|
729
|
+
// Native call resolved with no payload and no error — defensive
|
|
730
|
+
// terminal event so the consumer's `for await` always ends cleanly.
|
|
731
|
+
yield yield __await({ kind: "done", finishReason: "stop" });
|
|
167
732
|
});
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
733
|
+
}
|
|
734
|
+
async setDrafter(drafterPath) {
|
|
735
|
+
// The native bridge has no live-swap entry point yet; the drafter is
|
|
736
|
+
// bound at `load()` time via `LoadOptions.draftModelPath`. Log so the
|
|
737
|
+
// call-site is observable, and leave the loaded context unchanged.
|
|
738
|
+
console.warn(`[capacitor-llama] setDrafter(${drafterPath !== null && drafterPath !== void 0 ? drafterPath : "null"}) not yet supported by native bridge; pass draftModelPath to load() instead`);
|
|
739
|
+
}
|
|
740
|
+
async trimMemory(level) {
|
|
741
|
+
// No native hook yet — log so the runtime's pressure plumbing can see
|
|
742
|
+
// the adapter received the signal. Major pressure also clears the
|
|
743
|
+
// token-listener bookkeeping to drop any orphaned callbacks.
|
|
744
|
+
if (level === "major") {
|
|
745
|
+
this.tokenListeners.clear();
|
|
746
|
+
}
|
|
747
|
+
console.debug(`[capacitor-llama] trimMemory(${level}) — bridge hook unavailable`);
|
|
177
748
|
}
|
|
178
749
|
async cancelGenerate() {
|
|
179
|
-
if (!this.plugin)
|
|
750
|
+
if (!this.plugin || this.contextId === null)
|
|
180
751
|
return;
|
|
181
|
-
await this.plugin.stopCompletion({ contextId:
|
|
752
|
+
await this.plugin.stopCompletion({ contextId: this.contextId });
|
|
753
|
+
}
|
|
754
|
+
/**
|
|
755
|
+
* Round-trip to the loaded GGUF's native chat template via
|
|
756
|
+
* `LlamaCpp.getFormattedChat`. The plugin's Java side serializes
|
|
757
|
+
* `messages` as a JSON string and invokes
|
|
758
|
+
* `cap_format_chat()` → `llama_chat_apply_template()`. Returns the
|
|
759
|
+
* rendered prompt (or null when the GGUF has no template metadata).
|
|
760
|
+
*/
|
|
761
|
+
async formatChat(messages) {
|
|
762
|
+
var _a;
|
|
763
|
+
if (!this.plugin || !this.loadedPath) {
|
|
764
|
+
throw new Error("No model loaded. Call load() first.");
|
|
765
|
+
}
|
|
766
|
+
if (typeof this.plugin.getFormattedChat !== "function") {
|
|
767
|
+
return null;
|
|
768
|
+
}
|
|
769
|
+
const result = await this.plugin.getFormattedChat({
|
|
770
|
+
contextId: this.requireContextId(),
|
|
771
|
+
messages: JSON.stringify(messages),
|
|
772
|
+
params: { jinja: true },
|
|
773
|
+
});
|
|
774
|
+
return (_a = result.prompt) !== null && _a !== void 0 ? _a : null;
|
|
182
775
|
}
|
|
183
776
|
async embed(options) {
|
|
184
777
|
var _a;
|
|
@@ -191,8 +784,9 @@ class CapacitorLlamaAdapter {
|
|
|
191
784
|
const params = {
|
|
192
785
|
embd_normalize: (_a = options.embdNormalize) !== null && _a !== void 0 ? _a : 0,
|
|
193
786
|
};
|
|
787
|
+
const contextId = this.requireContextId();
|
|
194
788
|
const result = await this.plugin.embedding({
|
|
195
|
-
contextId
|
|
789
|
+
contextId,
|
|
196
790
|
text: options.input,
|
|
197
791
|
params,
|
|
198
792
|
});
|
|
@@ -200,7 +794,7 @@ class CapacitorLlamaAdapter {
|
|
|
200
794
|
if (typeof this.plugin.tokenize === "function") {
|
|
201
795
|
try {
|
|
202
796
|
const tokenized = await this.plugin.tokenize({
|
|
203
|
-
contextId
|
|
797
|
+
contextId,
|
|
204
798
|
text: options.input,
|
|
205
799
|
});
|
|
206
800
|
tokenCount = tokenized.tokens.length;
|
|
@@ -232,22 +826,69 @@ class CapacitorLlamaAdapter {
|
|
|
232
826
|
this.pluginLoadPromise = null;
|
|
233
827
|
}
|
|
234
828
|
}
|
|
829
|
+
/**
|
|
830
|
+
* Default singleton kept for back-compat with device-bridge-client and
|
|
831
|
+
* hardware-probe callers that don't distinguish chat vs embedding roles.
|
|
832
|
+
* The runtime's `localInferenceLoader` service uses per-role instances
|
|
833
|
+
* instead — see `registerCapacitorLlamaLoader`.
|
|
834
|
+
*/
|
|
235
835
|
// Constructed eagerly when this module is evaluated; every importer of
// `capacitorLlama` shares this one adapter instance.
export const capacitorLlama = new CapacitorLlamaAdapter();
|
|
836
|
+
/**
|
|
837
|
+
* Lightweight heuristic for routing a `loadModel(modelPath)` call to either
|
|
838
|
+
* the chat adapter or the embedding adapter. Embedding GGUFs the runtime
|
|
839
|
+
* ships or that users typically install for `TEXT_EMBEDDING` carry one of
|
|
840
|
+
* these markers in the filename. Anything else is assumed to be a
|
|
841
|
+
* generative chat model.
|
|
842
|
+
*/
|
|
843
|
+
function looksLikeEmbeddingModelPath(modelPath) {
    // A missing or non-string path cannot carry an embedding marker. Report
    // "not an embedding model" instead of throwing on `.toLowerCase()`.
    if (typeof modelPath !== "string") {
        return false;
    }
    // Matching is case-insensitive: the path is lower-cased once and compared
    // against lower-case fragments.
    const lowered = modelPath.toLowerCase();
    // Fragments that may appear anywhere in the path.
    const embeddingMarkers = [
        "bge-",
        "bge_",
        "nomic-embed",
        "all-minilm",
        "gte-",
        "e5-",
        "/embedding/",
    ];
    // The last rule is a suffix check rather than a substring check.
    return (embeddingMarkers.some((marker) => lowered.includes(marker)) ||
        lowered.endsWith("embedding.gguf"));
}
|
|
236
854
|
export function registerCapacitorLlamaLoader(runtime) {
|
|
237
855
|
if (typeof runtime.registerService !== "function")
|
|
238
856
|
return;
|
|
857
|
+
// Two distinct adapter instances so the chat LLM and embedding model
|
|
858
|
+
// each allocate their own native context id. This is the fix for
|
|
859
|
+
// elizaOS/eliza#7681 — the previous single-adapter design routed every
|
|
860
|
+
// operation through CONTEXT_ID=1, and a `completion(contextId=1)` call
|
|
861
|
+
// would resolve to whichever model registered against id 1 last
|
|
862
|
+
// (typically the bge-small embedding model on Android), emitting
|
|
863
|
+
// `[unused{N}]` / `[PAD]` reserved tokens.
|
|
864
|
+
const chatAdapter = new CapacitorLlamaAdapter();
|
|
865
|
+
const embeddingAdapter = new CapacitorLlamaAdapter();
|
|
866
|
+
function adapterFor(modelPath) {
|
|
867
|
+
return looksLikeEmbeddingModelPath(modelPath)
|
|
868
|
+
? embeddingAdapter
|
|
869
|
+
: chatAdapter;
|
|
870
|
+
}
|
|
239
871
|
runtime.registerService("localInferenceLoader", {
|
|
240
872
|
async loadModel(args) {
|
|
241
|
-
await
|
|
873
|
+
await adapterFor(args.modelPath).load(args);
|
|
242
874
|
},
|
|
243
875
|
async unloadModel() {
|
|
244
|
-
|
|
876
|
+
// Each adapter manages its own context lifecycle inside
|
|
877
|
+
// `load()` (releasing the prior context before reinitializing on the
|
|
878
|
+
// same id). Tearing down both adapters here would defeat the
|
|
879
|
+
// per-instance routing — `ensureAssignedModelLoaded` calls
|
|
880
|
+
// `unloadModel()` before every `loadModel()` on the assumption of
|
|
881
|
+
// single-model behaviour, and we must not let that unconditionally
|
|
882
|
+
// kill the embedding adapter when only the chat model is swapping.
|
|
245
883
|
},
|
|
246
884
|
currentModelPath() {
|
|
247
|
-
|
|
885
|
+
var _a;
|
|
886
|
+
// The chat path is the primary "active" model from the runtime's
|
|
887
|
+
// perspective; embedding is treated as a sidecar.
|
|
888
|
+
return ((_a = chatAdapter.currentModelPath()) !== null && _a !== void 0 ? _a : embeddingAdapter.currentModelPath());
|
|
248
889
|
},
|
|
249
890
|
async generate(args) {
|
|
250
|
-
const result = await
|
|
891
|
+
const result = await chatAdapter.generate({
|
|
251
892
|
prompt: args.prompt,
|
|
252
893
|
stopSequences: args.stopSequences,
|
|
253
894
|
maxTokens: args.maxTokens,
|
|
@@ -256,7 +897,7 @@ export function registerCapacitorLlamaLoader(runtime) {
|
|
|
256
897
|
return result.text;
|
|
257
898
|
},
|
|
258
899
|
async embed(args) {
|
|
259
|
-
return
|
|
900
|
+
return embeddingAdapter.embed({ input: args.input });
|
|
260
901
|
},
|
|
261
902
|
});
|
|
262
903
|
}
|