@elizaos/capacitor-llama 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,68 @@
1
+ # @elizaos/capacitor-llama
2
+
3
+ Mobile llama.cpp adapter for Eliza. A **thin wrapper** over
4
+ [`llama-cpp-capacitor`](https://github.com/arusatech/annadata-llama-cpp) that
5
+ maps its contextId-based API onto Eliza's `LocalInferenceLoader` contract,
6
+ so the standard `ActiveModelCoordinator` in `@elizaos/app-core` can switch
7
+ between the desktop (node-llama-cpp) engine and mobile native inference
8
+ transparently.
9
+
10
+ ## What it does
11
+
12
+ - Registers as the runtime's `localInferenceLoader` service during the
13
+ Capacitor bootstrap.
14
+ - Maps `loadModel({ modelPath })` → `initContext`.
15
+ - Maps `unloadModel()` → `releaseContext` / `releaseAllContexts`.
16
+ - Exposes a `generate()` surface matching the desktop engine.
17
+ - Fans the native `@LlamaCpp_onToken` stream out to Eliza's token listeners.
18
+
19
+ ## What it does not do
20
+
21
+ - It does not ship llama.cpp native binaries — `llama-cpp-capacitor`
22
+ handles iOS (arm64 + x86_64 with Metal) and Android (arm64-v8a,
23
+ armeabi-v7a, x86, x86_64) itself.
24
+ - It does not run on web. On Electrobun / Vite we fall back to the
25
+ standalone `node-llama-cpp` engine in `@elizaos/app-core`.
26
+
27
+ ## Setup in apps/app
28
+
29
+ 1. Install the dependency (already declared here):
30
+
31
+ ```bash
32
+ bun install
33
+ ```
34
+
35
+ 2. Register the loader during Capacitor bootstrap. In `apps/app`'s
36
+ Capacitor init path (currently in `src/capacitor-shell.ts` or the
37
+ runtime bootstrap that owns the mobile `AgentRuntime`):
38
+
39
+ ```ts
40
+ import { registerCapacitorLlamaLoader } from "@elizaos/capacitor-llama";
41
+
42
+ // After runtime boot, before the Model Hub is mounted:
43
+ registerCapacitorLlamaLoader(runtime);
44
+ ```
45
+
46
+ 3. Run `npx cap sync` in `apps/app` to pick up the native plugin. iOS and
47
+ Android builds will pull in `llama-cpp-capacitor`'s prebuilt native
48
+ libraries automatically.
49
+
50
+ ## Scope notes
51
+
52
+ - Only **one model is loaded at a time**. `load()` disposes the previous
53
+ context first so we never double-allocate VRAM on device.
54
+ - GGUF files are downloaded to the app sandbox by the
55
+ `@elizaos/app-core` downloader (shared with desktop). The mobile UI
56
+ filters the catalog to small/tiny bucket models only, since anything
57
+ larger won't realistically run on a phone.
58
+ - Streaming tokens flow over Capacitor's native event bus
59
+ (`@LlamaCpp_onToken`). Subscribe via `capacitorLlama.onToken(listener)` (it returns an unsubscribe function) and request them with `generate({ stream: true })`.
60
+ - For the rest of the desktop-level feature set (reranking, chat
61
+ templates, tool calling), read the upstream
62
+ [`llama-cpp-capacitor` README](https://github.com/arusatech/annadata-llama-cpp).
63
+ This adapter only wires the minimal slice needed for Eliza's agent
64
+ runtime (text generation, token streaming, and `embed()`); extend it as the mobile product grows.
65
+
66
+ ## Licensing
67
+
68
+ MIT — matches `llama-cpp-capacitor` and llama.cpp upstream.
@@ -0,0 +1,5 @@
1
+ import type { LlamaAdapter } from "./definitions";
2
/** Shared adapter instance exported by this package; it implements the `LlamaAdapter` contract. */
export declare const capacitorLlama: LlamaAdapter;
3
/**
 * Registers the Capacitor llama loader with a runtime.
 *
 * @param runtime Object that may expose `registerService(name, impl)`;
 *   the member is optional in this signature.
 */
export declare function registerCapacitorLlamaLoader(runtime: {
    registerService?: (name: string, impl: unknown) => unknown;
}): void;
@@ -0,0 +1,262 @@
1
/** Context id passed to every native plugin call made by this module. */
const CONTEXT_ID = 1;
2
/**
 * Reports whether `value` is a non-null value whose `typeof` is "object".
 * Arrays qualify; functions and primitives do not.
 */
function isObject(value) {
    if (value === null) {
        return false;
    }
    return typeof value === "object";
}
5
/**
 * Duck-type check for the native plugin: `value` must be a non-null object
 * exposing every method this module calls unconditionally.
 */
function isLlamaCppPluginLike(value) {
    if (typeof value !== "object" || value === null) {
        return false;
    }
    // Checked in this order; `every` stops at the first missing method.
    const requiredMethods = [
        "initContext",
        "releaseContext",
        "releaseAllContexts",
        "generateText",
        "stopCompletion",
        "addListener",
    ];
    return requiredMethods.every((method) => typeof value[method] === "function");
}
14
/**
 * Locates the plugin object inside an imported module namespace.
 *
 * Tries, in order: `mod.LlamaCpp`, `mod.default`, `mod.default.LlamaCpp`.
 * Returns the first candidate that passes `isLlamaCppPluginLike`, or `null`
 * when none does (or when `mod` is not an object).
 */
function resolveLlamaCppPlugin(mod) {
    if (isObject(mod)) {
        const named = mod.LlamaCpp;
        if (isLlamaCppPluginLike(named)) {
            return named;
        }
        const fallback = mod.default;
        if (isLlamaCppPluginLike(fallback)) {
            return fallback;
        }
        if (isObject(fallback)) {
            const nested = fallback.LlamaCpp;
            if (isLlamaCppPluginLike(nested)) {
                return nested;
            }
        }
    }
    return null;
}
26
+ function isCapacitorNative() {
27
+ var _a;
28
+ const cap = globalThis.Capacitor;
29
+ return Boolean((_a = cap === null || cap === void 0 ? void 0 : cap.isNativePlatform) === null || _a === void 0 ? void 0 : _a.call(cap));
30
+ }
31
+ function detectPlatform() {
32
+ var _a;
33
+ const cap = globalThis.Capacitor;
34
+ const platform = (_a = cap === null || cap === void 0 ? void 0 : cap.getPlatform) === null || _a === void 0 ? void 0 : _a.call(cap);
35
+ if (platform === "ios")
36
+ return "ios";
37
+ if (platform === "android")
38
+ return "android";
39
+ return "web";
40
+ }
41
+ class CapacitorLlamaAdapter {
42
+ constructor() {
43
+ this.plugin = null;
44
+ /** Cached loader promise so concurrent `load()` calls don't race to register duplicate listeners. */
45
+ this.pluginLoadPromise = null;
46
+ this.loadedPath = null;
47
+ this.tokenIndex = 0;
48
+ this.tokenListeners = new Set();
49
+ this.pluginListenerHandle = null;
50
+ }
51
+ async loadPlugin() {
52
+ if (this.plugin)
53
+ return this.plugin;
54
+ if (this.pluginLoadPromise)
55
+ return this.pluginLoadPromise;
56
+ this.pluginLoadPromise = (async () => {
57
+ const plugin = resolveLlamaCppPlugin(await import("llama-cpp-capacitor"));
58
+ if (!plugin) {
59
+ throw new Error("llama-cpp-capacitor did not expose an initContext method");
60
+ }
61
+ const tokenListenerHandle = await plugin.addListener("@LlamaCpp_onToken", (data) => {
62
+ var _a, _b;
63
+ const token = (_b = (_a = data.tokenResult) === null || _a === void 0 ? void 0 : _a.token) !== null && _b !== void 0 ? _b : data.token;
64
+ if (!token)
65
+ return;
66
+ this.tokenIndex += 1;
67
+ for (const listener of this.tokenListeners) {
68
+ try {
69
+ listener(token, this.tokenIndex);
70
+ }
71
+ catch (_c) {
72
+ this.tokenListeners.delete(listener);
73
+ }
74
+ }
75
+ });
76
+ this.pluginListenerHandle = tokenListenerHandle !== null && tokenListenerHandle !== void 0 ? tokenListenerHandle : null;
77
+ this.plugin = plugin;
78
+ return plugin;
79
+ })();
80
+ try {
81
+ return await this.pluginLoadPromise;
82
+ }
83
+ catch (err) {
84
+ this.pluginLoadPromise = null;
85
+ throw err;
86
+ }
87
+ }
88
+ async getHardwareInfo() {
89
+ var _a;
90
+ const platform = detectPlatform();
91
+ const nav = globalThis
92
+ .navigator;
93
+ return {
94
+ platform,
95
+ deviceModel: platform,
96
+ totalRamGb: 0,
97
+ availableRamGb: null,
98
+ cpuCores: (_a = nav === null || nav === void 0 ? void 0 : nav.hardwareConcurrency) !== null && _a !== void 0 ? _a : 0,
99
+ gpu: null,
100
+ gpuSupported: platform !== "web",
101
+ };
102
+ }
103
+ async isLoaded() {
104
+ return {
105
+ loaded: this.loadedPath !== null,
106
+ modelPath: this.loadedPath,
107
+ };
108
+ }
109
+ currentModelPath() {
110
+ return this.loadedPath;
111
+ }
112
+ async load(options) {
113
+ var _a, _b;
114
+ if (!isCapacitorNative()) {
115
+ throw new Error("capacitor-llama is only available on iOS and Android builds");
116
+ }
117
+ const plugin = await this.loadPlugin();
118
+ if (this.loadedPath && this.loadedPath !== options.modelPath) {
119
+ await plugin.releaseAllContexts();
120
+ this.loadedPath = null;
121
+ }
122
+ await plugin.initContext({
123
+ contextId: CONTEXT_ID,
124
+ params: {
125
+ model: options.modelPath,
126
+ n_ctx: (_a = options.contextSize) !== null && _a !== void 0 ? _a : 4096,
127
+ n_gpu_layers: options.useGpu === false ? 0 : 99,
128
+ n_threads: (_b = options.maxThreads) !== null && _b !== void 0 ? _b : 0,
129
+ use_mmap: true,
130
+ },
131
+ });
132
+ this.loadedPath = options.modelPath;
133
+ }
134
+ async unload() {
135
+ if (!this.plugin || !this.loadedPath)
136
+ return;
137
+ try {
138
+ await this.plugin.releaseContext({ contextId: CONTEXT_ID });
139
+ }
140
+ catch (_a) {
141
+ await this.plugin.releaseAllContexts();
142
+ }
143
+ this.loadedPath = null;
144
+ }
145
+ async generate(options) {
146
+ var _a, _b, _c, _d;
147
+ if (!this.plugin || !this.loadedPath) {
148
+ throw new Error("No model loaded. Call load() first.");
149
+ }
150
+ this.tokenIndex = 0;
151
+ const params = {
152
+ n_predict: (_a = options.maxTokens) !== null && _a !== void 0 ? _a : 2048,
153
+ temperature: (_b = options.temperature) !== null && _b !== void 0 ? _b : 0.7,
154
+ top_p: (_c = options.topP) !== null && _c !== void 0 ? _c : 0.9,
155
+ };
156
+ if (options.stopSequences && options.stopSequences.length > 0) {
157
+ params.stop = options.stopSequences;
158
+ }
159
+ if (options.stream) {
160
+ params.emit_partial_completion = true;
161
+ }
162
+ const started = Date.now();
163
+ const result = await this.plugin.generateText({
164
+ contextId: CONTEXT_ID,
165
+ prompt: options.prompt,
166
+ params,
167
+ });
168
+ const duration = ((_d = result.timings) === null || _d === void 0 ? void 0 : _d.predicted_ms) != null
169
+ ? Math.round(result.timings.predicted_ms)
170
+ : Date.now() - started;
171
+ return {
172
+ text: result.text,
173
+ promptTokens: result.tokens_evaluated,
174
+ outputTokens: result.tokens_predicted,
175
+ durationMs: duration,
176
+ };
177
+ }
178
+ async cancelGenerate() {
179
+ if (!this.plugin)
180
+ return;
181
+ await this.plugin.stopCompletion({ contextId: CONTEXT_ID });
182
+ }
183
+ async embed(options) {
184
+ var _a;
185
+ if (!this.plugin || !this.loadedPath) {
186
+ throw new Error("No model loaded. Call load() first.");
187
+ }
188
+ if (typeof this.plugin.embedding !== "function") {
189
+ throw new Error("llama-cpp-capacitor does not expose embedding() on this build; upgrade or use a cloud embedding provider");
190
+ }
191
+ const params = {
192
+ embd_normalize: (_a = options.embdNormalize) !== null && _a !== void 0 ? _a : 0,
193
+ };
194
+ const result = await this.plugin.embedding({
195
+ contextId: CONTEXT_ID,
196
+ text: options.input,
197
+ params,
198
+ });
199
+ let tokenCount = 0;
200
+ if (typeof this.plugin.tokenize === "function") {
201
+ try {
202
+ const tokenized = await this.plugin.tokenize({
203
+ contextId: CONTEXT_ID,
204
+ text: options.input,
205
+ });
206
+ tokenCount = tokenized.tokens.length;
207
+ }
208
+ catch (err) {
209
+ const message = err instanceof Error ? err.message : String(err);
210
+ console.debug("[capacitor-llama] tokenize fallback", {
211
+ error: message,
212
+ });
213
+ tokenCount = 0;
214
+ }
215
+ }
216
+ return { embedding: result.embedding, tokens: tokenCount };
217
+ }
218
+ onToken(listener) {
219
+ this.tokenListeners.add(listener);
220
+ return () => {
221
+ this.tokenListeners.delete(listener);
222
+ };
223
+ }
224
+ async dispose() {
225
+ this.tokenListeners.clear();
226
+ if (this.pluginListenerHandle) {
227
+ await this.pluginListenerHandle.remove();
228
+ this.pluginListenerHandle = null;
229
+ }
230
+ await this.unload();
231
+ this.plugin = null;
232
+ this.pluginLoadPromise = null;
233
+ }
234
+ }
235
/** Module-level adapter instance shared by all importers and by the registered loader service. */
export const capacitorLlama = new CapacitorLlamaAdapter();
236
+ export function registerCapacitorLlamaLoader(runtime) {
237
+ if (typeof runtime.registerService !== "function")
238
+ return;
239
+ runtime.registerService("localInferenceLoader", {
240
+ async loadModel(args) {
241
+ await capacitorLlama.load({ modelPath: args.modelPath });
242
+ },
243
+ async unloadModel() {
244
+ await capacitorLlama.unload();
245
+ },
246
+ currentModelPath() {
247
+ return capacitorLlama.currentModelPath();
248
+ },
249
+ async generate(args) {
250
+ const result = await capacitorLlama.generate({
251
+ prompt: args.prompt,
252
+ stopSequences: args.stopSequences,
253
+ maxTokens: args.maxTokens,
254
+ temperature: args.temperature,
255
+ });
256
+ return result.text;
257
+ },
258
+ async embed(args) {
259
+ return capacitorLlama.embed({ input: args.input });
260
+ },
261
+ });
262
+ }
@@ -0,0 +1,92 @@
1
+ /**
2
+ * Eliza-flavoured Capacitor llama.cpp adapter contract.
3
+ *
4
+ * This mirrors the `LocalInferenceLoader` interface in @elizaos/app-core so
5
+ * `ActiveModelCoordinator` can swap between the desktop engine
6
+ * (node-llama-cpp) and the mobile Capacitor plugin without caring which is
7
+ * active. Native llama.cpp work is handled by `llama-cpp-capacitor`; this
8
+ * package is intentionally just a thin mapping layer.
9
+ */
10
/** Options accepted by `LlamaAdapter.load`. */
export interface LoadOptions {
    /**
     * Absolute or sandbox path to a GGUF file on device storage. On iOS this
     * lives under `Application Support/`. On Android under the app's internal
     * files dir.
     */
    modelPath: string;
    /** Context window size; the bundled adapter requests 4096 when this is omitted. */
    contextSize?: number;
    /**
     * GPU offload switch. The bundled adapter requests GPU layers unless this
     * is explicitly `false`, so leaving it unset behaves like `true`.
     */
    useGpu?: boolean;
    /** Cap on native thread count; the bundled adapter sends 0 when this is omitted. */
    maxThreads?: number;
}
24
/** Options accepted by `LlamaAdapter.generate`. */
export interface GenerateOptions {
    /** Prompt text forwarded to the native plugin. */
    prompt: string;
    /** Maximum tokens to generate; the bundled adapter uses 2048 when omitted. */
    maxTokens?: number;
    /** Sampling temperature; the bundled adapter uses 0.7 when omitted. */
    temperature?: number;
    /** Nucleus sampling threshold; the bundled adapter uses 0.9 when omitted. */
    topP?: number;
    /** Stop strings; only forwarded when the array is non-empty. */
    stopSequences?: string[];
    /** When true, partial completions are requested and tokens are delivered to `onToken` listeners. */
    stream?: boolean;
}
33
/** Outcome of a `LlamaAdapter.generate` call. */
export interface GenerateResult {
    /** Generated completion text. */
    text: string;
    /** Number of prompt tokens evaluated. */
    promptTokens: number;
    /** Number of tokens generated. */
    outputTokens: number;
    /** Generation time in milliseconds. */
    durationMs: number;
}
39
/** Hardware summary returned by `LlamaAdapter.getHardwareInfo`. */
export interface HardwareInfo {
    /** Platform the adapter is running on. */
    platform: "ios" | "android" | "web";
    /** Human-readable device model when the OS exposes one. */
    deviceModel: string;
    /** Total device RAM in gigabytes. */
    totalRamGb: number;
    /** Currently available RAM in gigabytes, or null when unknown. */
    availableRamGb: number | null;
    /** Number of CPU cores reported by the environment. */
    cpuCores: number;
    /** GPU backend details, or null when none are reported. */
    gpu: {
        backend: "metal" | "vulkan" | "gpu-delegate";
        available: boolean;
    } | null;
    /** True when the underlying llama.cpp build has GPU support compiled in. */
    gpuSupported: boolean;
}
53
/** Options accepted by `LlamaAdapter.embed`. */
export interface EmbedOptions {
    /** Raw text to embed. The adapter forwards this verbatim to the native plugin. */
    input: string;
    /**
     * Normalisation mode passed through to llama-cpp-capacitor's
     * `embd_normalize` parameter. The bundled adapter sends 0 when this is
     * omitted; set to 2 for L2-normalised vectors that match most cloud
     * embedding APIs.
     *
     * NOTE(review): upstream llama.cpp documents -1 = none, 0 = max-abs
     * int16, 2 = Euclidean. Confirm how the plugin interprets 0 before
     * relying on it meaning "off".
     */
    embdNormalize?: number;
}
63
/** Outcome of a `LlamaAdapter.embed` call. */
export interface EmbedResult {
    /** Embedding vector as returned by the native plugin. */
    embedding: number[];
    /**
     * Token count of the embedded input. The native plugin doesn't return
     * this directly, so adapters may estimate it via `tokenize` and report 0
     * when no estimate is available. The field is always present so
     * downstream accounting code never has to handle `undefined`.
     */
    tokens: number;
}
73
/** Contract implemented by the exported `capacitorLlama` adapter. */
export interface LlamaAdapter {
    /** Reports platform and coarse hardware facts. */
    getHardwareInfo(): Promise<HardwareInfo>;
    /** Reports whether a model is loaded and, if so, its path. */
    isLoaded(): Promise<{
        loaded: boolean;
        modelPath: string | null;
    }>;
    /** Synchronous accessor for the loaded model path; null when none is loaded. */
    currentModelPath(): string | null;
    /** Loads the model described by `options`. */
    load(options: LoadOptions): Promise<void>;
    /** Releases the loaded model, if any. */
    unload(): Promise<void>;
    /** Runs a completion against the loaded model. */
    generate(options: GenerateOptions): Promise<GenerateResult>;
    /** Requests cancellation of the in-flight completion. */
    cancelGenerate(): Promise<void>;
    /**
     * Fires when `generate({ stream: true })` emits a new token.
     *
     * @returns A function that unregisters the listener.
     */
    onToken(listener: (token: string, index: number) => void): () => void;
    /**
     * Compute a single sentence embedding. Returns the raw float vector and
     * (when known) the input token count. Throws when the underlying plugin
     * does not expose an embedding method on the active platform.
     */
    embed(options: EmbedOptions): Promise<EmbedResult>;
}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Eliza-flavoured Capacitor llama.cpp adapter contract.
3
+ *
4
+ * This mirrors the `LocalInferenceLoader` interface in @elizaos/app-core so
5
+ * `ActiveModelCoordinator` can swap between the desktop engine
6
+ * (node-llama-cpp) and the mobile Capacitor plugin without caring which is
7
+ * active. Native llama.cpp work is handled by `llama-cpp-capacitor`; this
8
+ * package is intentionally just a thin mapping layer.
9
+ */
10
+ export {};
@@ -0,0 +1,48 @@
1
+ /**
2
+ * Device-side half of the agent↔device inference bridge.
3
+ *
4
+ * Runs inside the mobile app (Capacitor iOS / Android) and dials out to
5
+ * the agent container over WebSocket. Receives `generate` requests,
6
+ * forwards to `capacitorLlama`, returns results. Auto-reconnects with
7
+ * exponential backoff when the link drops.
8
+ *
9
+ * Mirrors the message envelope defined in
10
+ * `@elizaos/app-core/src/services/local-inference/device-bridge.ts`.
11
+ * Keep the two in sync by hand — the message shape is the bridge
12
+ * contract.
13
+ */
14
/** Configuration accepted by `DeviceBridgeClient` and `startDeviceBridgeClient`. */
export interface DeviceBridgeClientConfig {
    /** Absolute WS URL of the agent: `wss://agent.example.com/api/local-inference/device-bridge`. */
    agentUrl: string;
    /** Shared pairing secret. Passed both as a `?token=` query param and in the register payload. */
    pairingToken?: string;
    /** Stable device identifier. Survives reinstalls when persisted by the host app. */
    deviceId: string;
    /**
     * Called on state transitions so the host app can show a pairing UI.
     * `detail` is an optional free-form string accompanying the new state.
     */
    onStateChange?: (
        state: "connecting" | "connected" | "disconnected" | "error",
        detail?: string,
    ) => void;
}
24
/**
 * Device-side bridge client. Only the declaration is visible here; per this
 * file's header comment it dials the agent over WebSocket, serves `generate`
 * requests through `capacitorLlama`, and reconnects with exponential backoff.
 */
export declare class DeviceBridgeClient {
    private socket;
    private reconnectAttempt;
    private stopped;
    private readonly config;
    /** @param config Connection settings for this client. */
    constructor(config: DeviceBridgeClientConfig);
    /** Starts the client. */
    start(): void;
    /** Stops the client. */
    stop(): void;
    private computeBackoffMs;
    private connect;
    private buildUrl;
    private scheduleReconnect;
    private sendRegister;
    private send;
    private handleAgentMessage;
}
40
+ /**
41
+ * Convenience helper for the mobile bootstrap: starts a bridge client
42
+ * using values from the Eliza config or hardcoded env.
43
+ *
44
+ * The host app is expected to call this once during Capacitor bootstrap.
45
+ * `agentUrl` and `pairingToken` come from the user's pairing flow and
46
+ * should be persisted across launches.
47
+ */
48
+ export declare function startDeviceBridgeClient(config: DeviceBridgeClientConfig): DeviceBridgeClient;