@nevescloud/pip 3.5.1 → 3.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -48,6 +48,18 @@ const pip = createPip({ onSubmit: rt.onSubmit, onSlash: rt.onSlash, slashSource:
48
48
  // the turn loop entirely).
49
49
  ```
50
50
 
51
+ ```js
52
+ // Chrome — on-device Gemini Nano via the Prompt API (zero download for Chrome 138+ users who already have the weights; ~2B-effective-param quality)
53
+ import { createRuntime } from 'https://cdn.jsdelivr.net/npm/@nevescloud/pip@latest/runtime.esm.js';
54
+ import { createPip } from 'https://cdn.jsdelivr.net/npm/@nevescloud/pip@latest/pip-core.esm.js';
55
+ import { chrome } from 'https://cdn.jsdelivr.net/npm/@nevescloud/pip@latest/providers/chrome.esm.js';
56
+
57
+ const rt = createRuntime({ provider: chrome({ temperature: 0.1 }) });
58
+ const pip = createPip({ onSubmit: rt.onSubmit, onSlash: rt.onSlash, slashSource: rt.slashSource });
59
+ // No bundle — Chrome doesn't need its own re-export of createPip + createRuntime
60
+ // (`bundle/anthropic` already brings those, and chrome() composes alongside).
61
+ ```
62
+
51
63
  On jsdelivr the `.esm.js` suffix is required — jsdelivr serves files by raw path, not via `package.json` exports. npm-installed consumers can use the shorter `@nevescloud/pip/bundle/anthropic` (Node ESM resolver honors the exports map). `pip/bundle.esm.js` (or `pip/bundle` via npm) is an alias for `bundle/anthropic` — the default when you haven't picked a brain. Bundles are sugar over the layered files; hosts with a different brain shape (UI only, custom provider, in-browser model) import the granular files directly. See [CONSUMERS.md](../../CONSUMERS.md) for the full entry-point list.
52
64
 
53
65
  ## Options
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@nevescloud/pip",
3
- "version": "3.5.1",
3
+ "version": "3.8.0",
4
4
  "description": "Floating assistant bubble + panel + chat runtime. ESM, no build.",
5
5
  "type": "module",
6
6
  "main": "pip-core.esm.js",
@@ -19,7 +19,8 @@
19
19
  "./bundle/local.esm.js": "./bundle/local.esm.js",
20
20
  "./providers/anthropic.esm.js": "./providers/anthropic.esm.js",
21
21
  "./providers/openai.esm.js": "./providers/openai.esm.js",
22
- "./providers/local.esm.js": "./providers/local.esm.js"
22
+ "./providers/local.esm.js": "./providers/local.esm.js",
23
+ "./providers/chrome.esm.js": "./providers/chrome.esm.js"
23
24
  },
24
25
  "files": [
25
26
  "pip-core.esm.js",
@@ -0,0 +1,169 @@
1
+ // Prompt-based tool calling for text-only providers (local, chrome).
2
+ //
3
+ // Models without a native tool-use protocol (Chrome's Prompt API) — and
4
+ // models whose native protocol is gated behind special tokens that
5
+ // transformers.js's TextStreamer strips with skip_special_tokens (Gemma
6
+ // 4's <|tool_call> family) — both need a text-channel convention.
7
+ //
8
+ // We use the community XML-JSON format: <tool_call>{"name":"…","arguments":{…}}</tool_call>.
9
+ // It survives any tokenizer setting (literal characters), parses with a
10
+ // regex, and is well-represented in instruction-tuned model training
11
+ // data (Hermes 2 Pro, Nous, and many fine-tunes converge here).
12
+ //
13
+ // Three exports:
14
+ // * buildToolSystemPrompt(systemPrompt, tools) — augments the system
15
+ // prompt with tool schemas + the response-format instruction.
16
+ // * flattenMessages(messages) — converts the runtime's structured
17
+ // tool_use/tool_result blocks back into the text channel so text-only
18
+ // providers can replay them.
19
+ // * createToolCallParser({ idGen }) — streaming parser. .feed(chunk)
20
+ // yields text_delta + tool_use events; .flush() emits any trailing text.
21
+
22
// Literal delimiters of the text-channel tool-call convention.
const OPEN = '<tool_call>';
const CLOSE = '</tool_call>';

/**
 * Append tool schemas and the tool-call response format to a system prompt.
 *
 * @param {string} [systemPrompt] Host-supplied system prompt; may be empty.
 * @param {Array<object>} [tools] Tool definitions; each contributes `name`,
 *   `description`, and `schema` (or `input_schema`) to the emitted JSON.
 * @returns {string} The unchanged prompt (or '') when there are no tools,
 *   otherwise the prompt followed by a blank line and the tool instructions.
 */
export function buildToolSystemPrompt(systemPrompt, tools) {
  const hasTools = Boolean(tools?.length);
  if (!hasTools) {
    return systemPrompt || '';
  }

  // Normalise every tool to the { name, description, parameters } shape that
  // is shown to the model; tools without a schema get an empty object schema.
  const toSpec = (tool) => ({
    name: tool.name,
    description: tool.description || '',
    parameters: tool.schema || tool.input_schema || { type: 'object', properties: {} },
  });
  const toolSpecs = tools.map(toSpec);

  // A concrete example of the expected output format, followed by the
  // fallback instruction for turns that need no tool.
  const exampleCall = `${OPEN}{"name":"tool_name","arguments":{"key":"value"}}${CLOSE}`;
  const lines = [
    'You have access to the following tools:',
    JSON.stringify(toolSpecs, null, 2),
    '',
    'When you want to call a tool, emit a single line in this exact format:',
    exampleCall,
    '',
    'A tool result will be returned to you wrapped in <tool_result>…</tool_result>.',
    'Use the result to compose your final answer. If no tool is needed, answer directly.',
  ];
  const instruction = lines.join('\n');

  if (!systemPrompt) {
    return instruction;
  }
  return `${systemPrompt}\n\n${instruction}`;
}
47
+
48
+ // Convert runtime's structured assistant turns ({content: [{type:'tool_use'},…]})
49
+ // and tool_result user turns back into flat text so text-only providers can
50
+ // replay the history. The runtime's tool loop produces these between
51
+ // iterations; the provider sees them on the next call.
52
/**
 * Flatten structured runtime messages into plain-text messages.
 *
 * String content passes through unchanged. Array content is rendered block
 * by block and joined with newlines:
 *   - `text` blocks contribute their text,
 *   - `tool_use` blocks become `<tool_call>{"name":…,"arguments":…}</tool_call>`,
 *   - `tool_result` blocks become `<tool_result>…</tool_result>`, with a
 *     `name="…"` attribute when the block carries a name.
 * Any other content shape yields an empty string.
 *
 * @param {Array<{role: string, content: (string|Array<object>)}>} messages
 * @returns {Array<{role: string, content: string}>} One flat message per input.
 */
export function flattenMessages(messages) {
  return (messages ?? []).map((m) => {
    if (typeof m.content === 'string') return { role: m.role, content: m.content };
    if (!Array.isArray(m.content)) return { role: m.role, content: '' };
    const parts = [];
    for (const block of m.content) {
      // Tolerate holes / nulls in the content array instead of throwing.
      if (!block) continue;
      if (block.type === 'text') {
        parts.push(block.text);
      } else if (block.type === 'tool_use') {
        const args = block.input == null ? {} : block.input;
        parts.push(`${OPEN}${JSON.stringify({ name: block.name, arguments: args })}${CLOSE}`);
      } else if (block.type === 'tool_result') {
        // JSON.stringify returns undefined for undefined/function values;
        // fall back to '' so the literal text "undefined" never reaches the
        // model as if it were a tool's output.
        const c = typeof block.content === 'string'
          ? block.content
          : (JSON.stringify(block.content) ?? '');
        const name = block.name ? ` name="${block.name}"` : '';
        parts.push(`<tool_result${name}>${c}</tool_result>`);
      }
    }
    return { role: m.role, content: parts.join('\n') };
  });
}
72
+
73
// Streaming parser. Feed chunks of generated text as they arrive; receive
// a flat list of runtime events. State is internal — one parser per turn.
//
// Behaviour:
//   * Text outside <tool_call>…</tool_call> emits as text_delta. Only a
//     trailing run of characters that could still grow into the opening tag
//     (i.e. a proper prefix of "<tool_call>") is held back, so a tag that
//     straddles chunks never leaks as text while ordinary text streams
//     without delay.
//   * A complete <tool_call>…</tool_call> block emits as tool_use. The
//     payload is parsed as JSON; malformed JSON falls back to a text
//     emission so the model's intent isn't silently dropped.
//   * flush() emits any remaining text and any partial unterminated
//     tool_call as text (rather than yielding a malformed tool_use that
//     would derail the runtime's loop).

/**
 * @param {{ idGen?: () => string }} [options] `idGen` produces the id for
 *   each tool_use event; defaults to a time + random based id.
 * @returns {{ feed(chunk: string): object[], flush(): object[] }}
 *   `feed` returns the events completed by this chunk; `flush` returns
 *   whatever is still buffered at end-of-stream.
 */
export function createToolCallParser({ idGen } = {}) {
  const id = idGen || (() => `tu_${Date.now().toString(36)}_${Math.random().toString(36).slice(2, 8)}`);
  let buf = '';
  let emitted = 0; // index up to which we've emitted (text or call body)
  let inCall = false;
  let callBodyStart = -1;

  // Emit buf[emitted, safeUpTo) as a text_delta and advance the cursor.
  function readTextUpTo(safeUpTo) {
    if (safeUpTo <= emitted) return null;
    const text = buf.slice(emitted, safeUpTo);
    emitted = safeUpTo;
    return text ? { type: 'text_delta', text } : null;
  }

  // Length of the longest un-emitted suffix of buf that is a proper prefix
  // of OPEN — the only text that must be withheld until the next chunk.
  function partialOpenLength() {
    const max = Math.min(OPEN.length - 1, buf.length - emitted);
    for (let len = max; len > 0; len -= 1) {
      if (OPEN.startsWith(buf.slice(buf.length - len))) return len;
    }
    return 0;
  }

  // Parse a call body; accepts `arguments` or `parameters` as the input key.
  // Returns null when the body is not JSON or lacks a string `name`.
  function parseCallBody(body) {
    try {
      const obj = JSON.parse(body.trim());
      if (obj && typeof obj.name === 'string') {
        return { name: obj.name, input: obj.arguments ?? obj.parameters ?? {} };
      }
    } catch {
      // Not valid JSON — the caller surfaces the raw text instead.
    }
    return null;
  }

  return {
    feed(chunk) {
      if (!chunk) return [];
      buf += chunk;
      const out = [];
      // Run until we either consume the buffer or can't make progress.
      // eslint-disable-next-line no-constant-condition
      while (true) {
        if (!inCall) {
          const openIdx = buf.indexOf(OPEN, emitted);
          if (openIdx === -1) {
            // No complete OPEN tag. Withhold only a tail that may be the
            // beginning of one; everything before it is safe to emit.
            const ev = readTextUpTo(buf.length - partialOpenLength());
            if (ev) out.push(ev);
            break;
          }
          // Emit text before the tag
          const ev = readTextUpTo(openIdx);
          if (ev) out.push(ev);
          inCall = true;
          callBodyStart = openIdx + OPEN.length;
          emitted = callBodyStart;
        } else {
          const closeIdx = buf.indexOf(CLOSE, callBodyStart);
          if (closeIdx === -1) break; // wait for more
          const body = buf.slice(callBodyStart, closeIdx);
          const parsed = parseCallBody(body);
          if (parsed) {
            out.push({ type: 'tool_use', id: id(), name: parsed.name, input: parsed.input });
          } else {
            // Couldn't parse — surface the literal text so the user sees
            // what the model emitted instead of a silent drop.
            out.push({ type: 'text_delta', text: OPEN + body + CLOSE });
          }
          inCall = false;
          callBodyStart = -1;
          emitted = closeIdx + CLOSE.length;
        }
      }
      return out;
    },

    flush() {
      const out = [];
      if (inCall) {
        // Unterminated <tool_call> at end-of-stream. Surface as text so
        // partial content isn't lost; the runtime won't try to dispatch.
        // `emitted` sits just past the opening tag, so step back over it.
        out.push({ type: 'text_delta', text: buf.slice(emitted - OPEN.length) });
        emitted = buf.length;
        inCall = false;
      } else if (emitted < buf.length) {
        out.push({ type: 'text_delta', text: buf.slice(emitted) });
        emitted = buf.length;
      }
      return out;
    },
  };
}
@@ -0,0 +1,311 @@
1
+ // Chrome's built-in Prompt API (on-device Gemini Nano / Gemma-derived).
2
+ // Runtime-compatible provider — slots into createRuntime as a peer of
3
+ // anthropic/openai/local. Zero download for users on Chrome that
4
+ // already has the weights; reply quality lands ~2B-effective-param.
5
+ //
6
+ // Usage:
7
+ // import { createRuntime } from '@nevescloud/pip/runtime.esm.js';
8
+ // import { chrome } from '@nevescloud/pip/providers/chrome.esm.js';
9
+ //
10
+ // const rt = createRuntime({
11
+ // provider: chrome({ temperature: 0.1 }),
12
+ // tools: [ { name: 'get_time', description: '…', schema: {…}, handler: …} ],
13
+ // });
14
+ //
15
+ // Optimizations (per developer.chrome.com/docs/ai/session-management):
16
+ //
17
+ // * Session cache. First turn calls LanguageModel.create() with the
18
+ // full history as initialPrompts; subsequent turns reuse the same
19
+ // session and only feed the newest turn via session.append(), then
20
+ // prompt with the latest user message. Avoids the create() cost on
21
+ // every chat exchange.
22
+ // * Cache invalidation. If runtime's `messages` shrinks (clear /
23
+ // regenerate / model swap-and-back), the system prompt changes, or
24
+ // the tool schema changes, we destroy() and rebuild — keeps session
25
+ // ↔ runtime in lockstep.
26
+ // * downloadprogress wired to pip's loading bar via the create()
27
+ // monitor option, mounted on the active turnEl. First load only.
28
+ // * AbortSignal flows through create() AND promptStreaming(), so a
29
+ // stop click cancels both model fetch and an in-flight prompt.
30
+ // * temperature/topK are origin-trial / Extensions only. If params()
31
+ // is missing (stable web), we omit both and warn once if the host
32
+ // passed either. If only one is set, we fill the other from
33
+ // LanguageModel.params() to satisfy the API's both-or-neither rule.
34
+ //
35
+ // Tool use (prompt-based — the Prompt API has no native tool channel):
36
+ //
37
+ // * When the runtime passes `tools`, the helper in _tool-prompt.esm.js
38
+ // injects JSON schemas + a response-format instruction into the
39
+ // system prompt and parses the model's stream for
40
+ // <tool_call>{"name":"…","arguments":{…}}</tool_call> blocks.
41
+ // * Detected calls yield as tool_use events with stopReason='tool_use';
42
+ // the runtime dispatches, appends tool_result turns, and re-invokes
43
+ // this provider. The cached session continues from where it left off.
44
+ // * Reliability scales with model size — Nano (~2B effective) handles
45
+ // simple single-tool calls; chained / nested calls are flaky. For
46
+ // hard guarantees, register the action as a slash command instead.
47
+
48
+ import { showLoading, hideLoading } from '../pip-core.esm.js';
49
+ import { buildToolSystemPrompt, flattenMessages, createToolCallParser } from './_tool-prompt.esm.js';
50
+
51
// Error message thrown by the provider when neither Prompt API entry point
// is present on globalThis.
const UNAVAILABLE = [
  "Chrome's built-in AI isn't available here. Use Chrome 138+ or enable ",
  'chrome://flags#prompt-api-for-gemini-nano on earlier versions.',
].join('');
54
+
55
/**
 * Locate the Prompt API entry point.
 *
 * Prefers the top-level `LanguageModel` global and falls back to the older
 * `ai.languageModel` namespace.
 *
 * @returns {object|null} The API object, or null when neither is present.
 */
function getApi() {
  const modern = globalThis.LanguageModel;
  if (modern) {
    return modern;
  }
  const legacy = globalThis.ai?.languageModel;
  return legacy ? legacy : null;
}
61
+
62
/**
 * Probe the Prompt API's readiness and wait out an in-progress download.
 *
 * Availability is a *hint*, not a gate. Chrome 148+ has been observed
 * returning 'unavailable' from availability() for the default config
 * (no expectedOutputs/Inputs specified) while create() with the same
 * opts succeeds — the availability check is stricter than create's own.
 * So every negative or failing probe is logged and reported back rather
 * than thrown; the one exception is a download that outlives the timeout.
 *
 * @param {object} LM Prompt API object exposing `availability()` (current)
 *   or `capabilities()` (older builds); may expose neither.
 * @param {object} [availabilityOpts] Options forwarded to `availability()`.
 * @param {{ pollIntervalMs?: number, timeoutMs?: number }} [polling]
 *   Cadence and upper bound for the 'downloading' wait. Defaults: poll
 *   every 1500 ms for at most five minutes.
 * @returns {Promise<string>} The reported state, 'unknown' when a probe
 *   threw, or 'available' when the API offers no probe at all.
 * @throws {Error} When the model is still downloading at the deadline.
 */
async function ensureAvailable(
  LM,
  availabilityOpts,
  { pollIntervalMs = 1500, timeoutMs = 5 * 60 * 1000 } = {},
) {
  if (typeof LM.availability === 'function') {
    let v;
    try { v = await LM.availability(availabilityOpts || {}); }
    catch (e) {
      // eslint-disable-next-line no-console
      console.warn('[pip/chrome] availability() threw — proceeding to create() to get the real error:', e?.message || e);
      return 'unknown';
    }
    if (v === 'unavailable') {
      // eslint-disable-next-line no-console
      console.warn('[pip/chrome] availability() returned "unavailable" with opts', availabilityOpts, '— proceeding to create() anyway; if Chrome rejects, the real reason will surface.');
      return v;
    }
    if (v === 'downloading') {
      const deadline = Date.now() + timeoutMs;
      while (Date.now() < deadline) {
        await new Promise((r) => setTimeout(r, pollIntervalMs));
        let next;
        try { next = await LM.availability(availabilityOpts || {}); }
        catch (e) {
          // Same policy as the first probe: a throwing availability() must
          // not abort the turn — let create() report the real problem.
          // eslint-disable-next-line no-console
          console.warn('[pip/chrome] availability() threw while polling a download — proceeding to create() to get the real error:', e?.message || e);
          return 'unknown';
        }
        if (next === 'available' || next === 'downloadable') return next;
        if (next === 'unavailable') {
          // eslint-disable-next-line no-console
          console.warn('[pip/chrome] availability flipped to "unavailable" mid-download — proceeding to create() to surface the real error.');
          return next;
        }
      }
      throw new Error("Chrome's built-in model is still downloading — try again in a minute.");
    }
    return v;
  }
  if (typeof LM.capabilities === 'function') {
    let c;
    try { c = await LM.capabilities(); }
    catch (e) {
      // eslint-disable-next-line no-console
      console.warn('[pip/chrome] capabilities() threw — proceeding to create() to get the real error:', e?.message || e);
      return 'unknown';
    }
    if (c?.available === 'no') {
      // eslint-disable-next-line no-console
      console.warn('[pip/chrome] capabilities() reported "no" — proceeding to create() to surface the real error.');
    }
    return c?.available || 'unknown';
  }
  // No probe offered at all — nothing to check before create().
  return 'available';
}
109
+
110
/**
 * Build a string that changes whenever the tool list changes.
 *
 * Each tool is serialised as a JSON tuple of name, description and
 * parameter schema, so names or descriptions that contain the separator
 * cannot collide with a different tool list, and a schema-only edit is
 * detected as well.
 *
 * @param {Array<object>} [tools] Tool definitions (`name`, `description`,
 *   `schema` or `input_schema`).
 * @returns {string} '' for no tools, otherwise the joined fingerprint.
 */
function toolsFingerprint(tools) {
  if (!tools?.length) return '';
  return tools
    .map((t) => JSON.stringify([t.name, t.description || '', t.schema || t.input_schema || null]))
    .join('|');
}
114
+
115
/**
 * Create a runtime provider backed by Chrome's built-in Prompt API.
 *
 * The returned function is invoked once per model call with
 * `{ messages, signal, system, tools, turnEl }` and returns an async
 * generator of runtime events: `text_delta`, `tool_use`, and a final
 * `turn_end` whose stopReason is 'tool_use' when a tool call was parsed
 * and 'end_turn' otherwise.
 *
 * One Prompt API session is cached per provider instance and reused across
 * calls; it is rebuilt when the system prompt or tool set changes, or when
 * the incoming history does not extend what the session has already seen.
 *
 * @param {object} [options]
 * @param {string} [options.systemPrompt] Used when the call supplies no `system`.
 * @param {number} [options.temperature] Applied only where `LanguageModel.params` exists.
 * @param {number} [options.topK] Applied only where `LanguageModel.params` exists.
 * @param {Array} [options.expectedInputs] Forwarded to availability()/create().
 * @param {Array} [options.expectedOutputs] Forwarded to availability()/create().
 * @returns {Function} Provider function for createRuntime.
 */
export function chrome({
  systemPrompt,
  temperature,
  topK,
  expectedInputs,
  expectedOutputs,
} = {}) {
  let sessionPromise = null;
  // Count of runtime messages the cached session has already seen. This
  // INCLUDES the tail message handed to promptStreaming(): the session keeps
  // that prompt in its own context, so it must never be append()ed again.
  let consumed = 0;
  let lastSystem = null;
  let lastToolsFp = '';
  let warnedNoParams = false;

  // Assemble the options bag for LanguageModel.create().
  async function buildOpts(LM, augmentedSystem, history, monitorFn, signal) {
    const opts = {};
    if (signal) opts.signal = signal;
    if (expectedInputs) opts.expectedInputs = expectedInputs;
    if (expectedOutputs) opts.expectedOutputs = expectedOutputs;
    if (monitorFn) opts.monitor = monitorFn;

    const initial = [];
    if (augmentedSystem) initial.push({ role: 'system', content: augmentedSystem });
    for (const m of history) {
      // History arrives already flattened to strings (tool_use blocks are
      // encoded as text), so it is safe to replay. Anything that is still
      // non-string is dropped.
      if (typeof m.content === 'string') {
        initial.push({ role: m.role, content: m.content });
      }
    }
    if (initial.length) opts.initialPrompts = initial;

    // temperature/topK are origin-trial / Extensions only. params() is
    // gated to the same builds — its presence is the feature flag.
    if (temperature != null || topK != null) {
      if (typeof LM.params === 'function') {
        let t = temperature;
        let k = topK;
        if ((t != null) !== (k != null)) {
          // Only one was supplied; the API wants both or neither, so fill
          // the other from the model defaults.
          try {
            const p = await LM.params();
            if (t == null) t = p?.defaultTemperature;
            if (k == null) k = p?.defaultTopK;
          } catch {
            // params() failed — fall through to the literal defaults below.
          }
        }
        if (t == null) t = 1.0;
        if (k == null) k = 40;
        opts.temperature = t;
        opts.topK = k;
      } else if (!warnedNoParams) {
        warnedNoParams = true;
        // eslint-disable-next-line no-console
        console.warn(
          '[pip/chrome] temperature/topK ignored — only supported on ' +
          'Prompt API for Chrome Extensions or with the Origin Trial enabled.'
        );
      }
    }
    return opts;
  }

  // Return the cached session brought up to date with `flatMessages`
  // (everything except the tail, which the caller prompts with), creating
  // or rebuilding it when the cache is stale.
  async function getSession(LM, augmentedSystem, toolsFp, flatMessages, turnEl, signal) {
    // The last clause: the session has seen `consumed` messages, so a valid
    // continuation must bring at least one more. Anything shorter (clear,
    // regenerate, history rewind) means the session context is ahead of the
    // runtime and has to be rebuilt.
    const needsReset =
      !sessionPromise ||
      augmentedSystem !== lastSystem ||
      toolsFp !== lastToolsFp ||
      flatMessages.length < consumed + 1;

    if (needsReset && sessionPromise) {
      try { (await sessionPromise).destroy?.(); } catch {
        // Best effort — a session that failed to build has nothing to destroy.
      }
      sessionPromise = null;
      consumed = 0;
    }

    if (!sessionPromise) {
      const history = flatMessages.slice(0, -1);
      const monitorFn = turnEl
        ? (m) => {
            try {
              m.addEventListener?.('downloadprogress', (e) => {
                const pct = Math.round((e.loaded || 0) * 100);
                showLoading(turnEl, `downloading model ${pct}%`, pct);
              });
            } catch {
              // Progress display is cosmetic; never fail creation over it.
            }
          }
        : undefined;
      const opts = await buildOpts(LM, augmentedSystem, history, monitorFn, signal);
      // Some Chrome builds reject role 'system' in initialPrompts and return
      // a generic "unable to create a session" error. Fall back to folding
      // the system content into the first user message — same effect.
      const tryCreate = async () => {
        try {
          return await LM.create(opts);
        } catch (err) {
          const msg = String(err?.message || err || '');
          if (augmentedSystem && /unable to create|initialPrompt|role/i.test(msg)) {
            // eslint-disable-next-line no-console
            console.warn('[pip/chrome] create() rejected first-pass opts; retrying with system folded into user prompt:', msg);
            const fallback = { ...opts };
            const prompts = [];
            // Fold system + history → tagged first user message.
            const folded = `<<SYSTEM>>\n${augmentedSystem}\n<<END_SYSTEM>>`;
            prompts.push({ role: 'user', content: folded });
            for (const m of history) {
              if (typeof m.content === 'string') prompts.push({ role: m.role, content: m.content });
            }
            fallback.initialPrompts = prompts;
            return await LM.create(fallback);
          }
          throw err;
        }
      };
      sessionPromise = tryCreate().then((s) => {
        if (turnEl) hideLoading(turnEl);
        s.addEventListener?.('contextoverflow', () => {
          // eslint-disable-next-line no-console
          console.warn('[pip/chrome] context window full — older turns evicted by Chrome.');
        });
        return s;
      }).catch((err) => {
        if (turnEl) hideLoading(turnEl);
        sessionPromise = null;
        // eslint-disable-next-line no-console
        console.warn('[pip/chrome] LM.create() failed:', err?.message || err, '— augmentedSystem chars:', augmentedSystem.length, 'history msgs:', history.length);
        throw err;
      });
      lastSystem = augmentedSystem;
      lastToolsFp = toolsFp;
    } else {
      // Cached session — append messages new since last call. `consumed`
      // already covers the previously prompted tail, so the window starts at
      // the assistant reply that followed it. Skip assistant role: the
      // session already generated those itself during the prior
      // promptStreaming() and re-feeding would duplicate.
      const session = await sessionPromise;
      const newSlice = flatMessages.slice(consumed, flatMessages.length - 1);
      for (const m of newSlice) {
        if (m.role === 'assistant') continue;
        if (typeof m.content === 'string' && typeof session.append === 'function') {
          try { await session.append([{ role: m.role, content: m.content }]); }
          catch (err) {
            // Keep the turn going (best effort) but make the drift visible.
            // eslint-disable-next-line no-console
            console.warn('[pip/chrome] session.append() failed — session context may be missing a turn:', err?.message || err);
          }
        }
      }
    }
    // The caller prompts with the tail next, so from here on the session has
    // seen every message in flatMessages.
    consumed = flatMessages.length;
    return sessionPromise;
  }

  return ({ messages, signal, system, tools, turnEl }) => (async function* () {
    const LM = getApi();
    if (!LM) throw new Error(UNAVAILABLE);

    const effectiveSystem = system || systemPrompt || '';
    const augmentedSystem = buildToolSystemPrompt(effectiveSystem, tools);
    const toolsFp = toolsFingerprint(tools);

    const availabilityOpts = {};
    if (expectedInputs) availabilityOpts.expectedInputs = expectedInputs;
    if (expectedOutputs) availabilityOpts.expectedOutputs = expectedOutputs;
    await ensureAvailable(LM, availabilityOpts);

    // Flatten structured messages (runtime's tool_use / tool_result blocks)
    // into the text channel the Prompt API speaks.
    const flat = flattenMessages(messages);
    const session = await getSession(LM, augmentedSystem, toolsFp, flat, turnEl, signal);
    const tail = flat[flat.length - 1];
    const userText = tail?.content || '';

    const parser = createToolCallParser();
    let sawToolUse = false;

    try {
      const stream = session.promptStreaming(userText, signal ? { signal } : undefined);
      for await (const chunk of stream) {
        if (signal?.aborted) throw new DOMException('Aborted', 'AbortError');
        if (typeof chunk !== 'string' || !chunk) continue;
        for (const ev of parser.feed(chunk)) {
          if (ev.type === 'tool_use') sawToolUse = true;
          yield ev;
        }
      }
      // Flush any pending text the parser buffered (final delta or an
      // unterminated tool_call surfaced as literal text).
      for (const ev of parser.flush()) {
        if (ev.type === 'tool_use') sawToolUse = true;
        yield ev;
      }
      yield { type: 'turn_end', stopReason: sawToolUse ? 'tool_use' : 'end_turn' };
    } catch (err) {
      // A failed or aborted prompt leaves the session in an unknown state;
      // drop it so the next call rebuilds from the runtime's history.
      try { session?.destroy?.(); } catch {
        // Nothing more to do if teardown itself fails.
      }
      sessionPromise = null;
      consumed = 0;
      lastSystem = null;
      lastToolsFp = '';
      throw err;
    }
  })();
}
@@ -1,7 +1,7 @@
1
1
  // In-browser model via transformers.js + WebGPU. Two shapes ship:
2
2
  //
3
- // 1. `local({ model, dtype, maxTokens, genParams })` — runtime-compatible
4
- // provider, slots into `createRuntime({ models: [{ provider: local(...) }] })`
3
+ // 1. `local({ model, dtype, maxTokens, genParams, chatTemplate })` — runtime-
4
+ // compatible provider, slots into `createRuntime({ models: [{ provider: local(...) }] })`
5
5
  // next to anthropic() and openai(). Wraps the renderer below and adapts
6
6
  // its setReplyText-callback paint into the runtime's async-generator
7
7
  // event protocol. Use this when local should participate in `/model`.
@@ -16,6 +16,7 @@
16
16
  // <think> pill rendering); the provider just adapts the call shape.
17
17
 
18
18
  import { showLoading, hideLoading } from '../pip-core.esm.js';
19
+ import { buildToolSystemPrompt, flattenMessages, createToolCallParser } from './_tool-prompt.esm.js';
19
20
 
20
21
  const TRANSFORMERS_URL = 'https://cdn.jsdelivr.net/npm/@huggingface/transformers';
21
22
 
@@ -194,9 +195,13 @@ export function createTransformersRenderer() {
194
195
  await ensureLoaded(turnEl);
195
196
 
196
197
  const tf = await loadTransformers();
198
+ // chatTemplate spread last so consumers can override defaults (e.g.
199
+ // Gemma 4 needs { enable_thinking: false } to suppress channeled
200
+ // thought-token leaks; Qwen has its own templating knobs).
197
201
  const inputs = tokenizer.apply_chat_template(messages, {
198
202
  add_generation_prompt: true,
199
203
  return_tensors: 'pt',
204
+ ...(config.chatTemplate || {}),
200
205
  });
201
206
 
202
207
  const start = performance.now();
@@ -261,19 +266,50 @@ export function createTransformersRenderer() {
261
266
 
262
267
  // Runtime-compatible provider. The renderer streams reply text via the
263
268
  // setReplyText callback (cumulative buffer per call); we proxy that
264
- // callback, diff each call into a `text_delta` event, and yield events
265
- // the runtime's turn loop already consumes. The <think> pill mounts onto
266
- // turnEl directly inside the renderer — unaffected, still works.
269
+ // callback, diff each call into the tool-aware streaming parser, and
270
+ // yield text_delta + tool_use events the runtime's turn loop consumes.
271
+ // The <think> pill mounts onto turnEl directly inside the renderer —
272
+ // unaffected, still works.
273
+ //
274
+ // Tool calling. When the runtime passes `tools`, we augment the system
275
+ // prompt with JSON schemas + the <tool_call>{…}</tool_call> emit
276
+ // convention (see _tool-prompt.esm.js). Models that follow it (Gemma 4
277
+ // is well-trained on this shape) get dispatched through the runtime's
278
+ // loop; tool_result turns are flattened back into text for the next
279
+ // model call. Gemma 4's native special-token format (<|tool_call>…) is
280
+ // NOT used because TextStreamer with skip_special_tokens drops the
281
+ // markers — the text-channel convention works regardless.
267
282
  //
268
283
  // One pitfall handled: the renderer occasionally re-paints the same
269
284
  // buffer (no new tokens emitted between calls), so the diff guards
270
285
  // against zero-length deltas. AbortSignal flows through naturally —
271
286
  // the underlying TextStreamer throws AbortError, which we surface.
272
- export function local({ model, dtype = 'q4', maxTokens = 256, genParams } = {}) {
287
+ export function local({
288
+ model,
289
+ dtype = 'q4',
290
+ maxTokens = 256,
291
+ genParams,
292
+ chatTemplate,
293
+ systemPrompt,
294
+ } = {}) {
273
295
  const renderer = createTransformersRenderer();
274
- if (model) renderer.setModel({ id: model, dtype, maxTokens, genParams });
296
+ if (model) renderer.setModel({ id: model, dtype, maxTokens, genParams, chatTemplate });
297
+
298
+ return ({ messages, signal, system, tools, turnEl, setReplyText }) => (async function* () {
299
+ const effectiveSystem = system || systemPrompt || '';
300
+ const augmentedSystem = buildToolSystemPrompt(effectiveSystem, tools);
301
+
302
+ // Flatten runtime's structured turns (tool_use/tool_result blocks)
303
+ // back into text-channel strings the chat template understands.
304
+ // Prepend the augmented system if any — apply_chat_template renders
305
+ // role:'system' into Gemma's developer/system slot natively.
306
+ const flat = flattenMessages(messages);
307
+ const renderMessages = augmentedSystem
308
+ ? [{ role: 'system', content: augmentedSystem }, ...flat]
309
+ : flat;
275
310
 
276
- return ({ messages, signal, turnEl, setReplyText }) => (async function* () {
311
+ const parser = createToolCallParser();
312
+ let sawToolUse = false;
277
313
  let lastFull = '';
278
314
  const queue = [];
279
315
  let wake = null;
@@ -281,16 +317,25 @@ export function local({ model, dtype = 'q4', maxTokens = 256, genParams } = {})
281
317
  let error = null;
282
318
 
283
319
  const proxySetReplyText = (_el, fullText) => {
320
+ if (fullText.length <= lastFull.length) return;
284
321
  const delta = fullText.slice(lastFull.length);
285
322
  lastFull = fullText;
286
- if (delta) {
287
- queue.push({ type: 'text_delta', text: delta });
288
- wake?.();
323
+ for (const ev of parser.feed(delta)) {
324
+ if (ev.type === 'tool_use') sawToolUse = true;
325
+ queue.push(ev);
289
326
  }
327
+ wake?.();
290
328
  };
291
329
 
292
- renderer.generate({ messages, turnEl, setReplyText: proxySetReplyText, signal })
293
- .then(() => { done = true; wake?.(); })
330
+ renderer.generate({ messages: renderMessages, turnEl, setReplyText: proxySetReplyText, signal })
331
+ .then(() => {
332
+ for (const ev of parser.flush()) {
333
+ if (ev.type === 'tool_use') sawToolUse = true;
334
+ queue.push(ev);
335
+ }
336
+ done = true;
337
+ wake?.();
338
+ })
294
339
  .catch((e) => { error = e; done = true; wake?.(); });
295
340
 
296
341
  while (true) {
@@ -300,7 +345,7 @@ export function local({ model, dtype = 'q4', maxTokens = 256, genParams } = {})
300
345
  }
301
346
 
302
347
  if (error) throw error;
303
- yield { type: 'turn_end', stopReason: 'end_turn' };
348
+ yield { type: 'turn_end', stopReason: sawToolUse ? 'tool_use' : 'end_turn' };
304
349
  })();
305
350
  }
306
351