@nevescloud/pip 3.7.0 → 3.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@nevescloud/pip",
3
- "version": "3.7.0",
3
+ "version": "3.8.0",
4
4
  "description": "Floating assistant bubble + panel + chat runtime. ESM, no build.",
5
5
  "type": "module",
6
6
  "main": "pip-core.esm.js",
@@ -0,0 +1,169 @@
1
+ // Prompt-based tool calling for text-only providers (local, chrome).
2
+ //
3
+ // Models without a native tool-use protocol (Chrome's Prompt API) — and
4
+ // models whose native protocol is gated behind special tokens that
5
+ // transformers.js's TextStreamer strips with skip_special_tokens (Gemma
6
+ // 4's <|tool_call> family) — both need a text-channel convention.
7
+ //
8
+ // We use the community XML-JSON format: <tool_call>{"name":"…","arguments":{…}}</tool_call>.
9
+ // It survives any tokenizer setting (literal characters), parses with a
10
+ // regex, and is well-represented in instruction-tuned model training
11
+ // data (Hermes 2 Pro, Nous, and many fine-tunes converge here).
12
+ //
13
+ // Three exports:
14
+ // * buildToolSystemPrompt(systemPrompt, tools) — augments the system
15
+ // prompt with tool schemas + the response-format instruction.
16
+ // * flattenMessages(messages) — converts the runtime's structured
17
+ // tool_use/tool_result blocks back into the text channel so text-only
18
+ // providers can replay them.
19
+ // * createToolCallParser({ idGen }) — streaming parser. .feed(chunk)
20
+ // yields text_delta + tool_use events; .flush() emits any trailing text.
21
+
22
+ const OPEN = '<tool_call>';
23
+ const CLOSE = '</tool_call>';
24
+
25
// Augment a system prompt with the tool catalogue and the <tool_call>
// response-format instruction.
//
//   systemPrompt  Base system prompt; nullish/empty is treated as ''.
//   tools         Array of { name, description?, schema? | input_schema? |
//                 parameters? }. Entries that are not objects or have no
//                 string name are ignored: the model could never call them
//                 back by name, and a schema without a name only confuses
//                 small models.
//
// Returns the base prompt unchanged when there is nothing callable,
// otherwise the base prompt followed by a blank line and the instruction
// block (or just the instruction block when the base prompt is empty).
export function buildToolSystemPrompt(systemPrompt, tools) {
  const base = systemPrompt || '';
  const callable = Array.isArray(tools)
    ? tools.filter((t) => t && typeof t.name === 'string' && t.name !== '')
    : [];
  if (!callable.length) return base;

  const schemas = callable.map((t) => ({
    name: t.name,
    description: t.description || '',
    // `parameters` is the OpenAI-style spelling; accepted as a third alias.
    parameters: t.schema || t.input_schema || t.parameters || { type: 'object', properties: {} },
  }));

  // The trailing example is critical for small models — Gemini Nano in
  // particular often emits `function_call` or `tool_use` keys without it.
  // The "answer directly otherwise" line keeps single-shot chat working.
  const instruction = [
    'You have access to the following tools:',
    JSON.stringify(schemas, null, 2),
    '',
    'When you want to call a tool, emit a single line in this exact format:',
    `${OPEN}{"name":"tool_name","arguments":{"key":"value"}}${CLOSE}`,
    '',
    'A tool result will be returned to you wrapped in <tool_result>…</tool_result>.',
    'Use the result to compose your final answer. If no tool is needed, answer directly.',
  ].join('\n');

  return base ? `${base}\n\n${instruction}` : instruction;
}
47
+
48
// Render a tool_result payload as text. Strings pass through, nullish
// becomes '' (never the literal "undefined"), everything else is JSON.
function renderToolResultContent(content) {
  if (typeof content === 'string') return content;
  if (content == null) return '';
  try {
    return JSON.stringify(content) ?? '';
  } catch {
    // Cyclic / BigInt payloads cannot be serialised; fall back to String().
    return String(content);
  }
}

// Convert the runtime's structured assistant turns
// ({content: [{type:'tool_use'},…]}) and tool_result user turns back into
// flat text so text-only providers can replay the history. The runtime's
// tool loop produces these between iterations; the provider sees them on
// the next call.
//
//   messages  Array of { role, content } where content is a string or an
//             array of text / tool_use / tool_result blocks. A non-array
//             `messages` yields [].
//
// Returns a new array of { role, content: string }. Content that is neither
// a string nor an array flattens to ''. Blocks that are not objects, or of
// an unknown type, are skipped.
export function flattenMessages(messages) {
  if (!Array.isArray(messages)) return [];
  return messages.map((m) => {
    const role = m?.role;
    const content = m?.content;
    if (typeof content === 'string') return { role, content };
    if (!Array.isArray(content)) return { role, content: '' };

    const parts = [];
    for (const block of content) {
      if (!block || typeof block !== 'object') continue;
      if (block.type === 'text') {
        if (block.text != null) parts.push(String(block.text));
      } else if (block.type === 'tool_use') {
        const args = block.input == null ? {} : block.input;
        parts.push(`${OPEN}${JSON.stringify({ name: block.name, arguments: args })}${CLOSE}`);
      } else if (block.type === 'tool_result') {
        // Escape quotes so an unusual tool name cannot break the attribute.
        const nameAttr = block.name
          ? ` name="${String(block.name).replaceAll('"', '&quot;')}"`
          : '';
        parts.push(`<tool_result${nameAttr}>${renderToolResultContent(block.content)}</tool_result>`);
      }
    }
    return { role, content: parts.join('\n') };
  });
}
72
+
73
// Streaming parser. Feed chunks of generated text as they arrive; receive
// a flat list of runtime events. State is internal — one parser per turn.
//
// Behaviour:
//   * Text outside <tool_call>…</tool_call> emits as text_delta. Only a
//     trailing fragment that could still grow into the opening tag (for
//     example "<tool_") is held back, so ordinary text streams without lag
//     while a tag split across chunks never leaks as text.
//   * A complete <tool_call>…</tool_call> block emits as tool_use. The
//     payload is parsed as JSON; malformed JSON falls back to a text
//     emission so the model's intent isn't silently dropped. `arguments`
//     (or the `parameters` alias) given as a JSON-encoded string is decoded
//     to an object when possible.
//   * flush() emits any remaining text and any partial unterminated
//     tool_call as text (rather than yielding a malformed tool_use that
//     would derail the runtime's loop).
//
//   idGen  Optional () => string used for tool_use ids; defaults to a
//          time + random based generator.
//
// Returns { feed(chunk) -> events[], flush() -> events[] } where an event is
// { type: 'text_delta', text } or { type: 'tool_use', id, name, input }.
export function createToolCallParser({ idGen } = {}) {
  const nextId = idGen || (() => `tu_${Date.now().toString(36)}_${Math.random().toString(36).slice(2, 8)}`);
  // Unconsumed input. Consumed text is dropped as we go so the buffer never
  // grows with the whole reply. While inCall is true, `pending` starts right
  // after the opening tag.
  let pending = '';
  let inCall = false;

  // Length of the longest suffix of `text` that is a proper prefix of OPEN,
  // i.e. how many trailing chars might be the start of a tag still arriving.
  function partialOpenLength(text) {
    const longest = Math.min(text.length, OPEN.length - 1);
    for (let len = longest; len > 0; len -= 1) {
      if (OPEN.startsWith(text.slice(-len))) return len;
    }
    return 0;
  }

  // Parse a call body into { name, input }, or null when it is not a JSON
  // object with a string `name`.
  function parseCallBody(body) {
    let obj;
    try {
      obj = JSON.parse(body.trim());
    } catch {
      return null;
    }
    if (!obj || typeof obj.name !== 'string') return null;
    let input = obj.arguments ?? obj.parameters ?? {};
    if (typeof input === 'string') {
      try {
        const decoded = JSON.parse(input);
        if (decoded && typeof decoded === 'object') input = decoded;
      } catch {
        // Not JSON — pass the raw string through unchanged.
      }
    }
    return { name: obj.name, input };
  }

  return {
    feed(chunk) {
      if (!chunk) return [];
      pending += chunk;
      const out = [];
      // Run until we either consume the buffer or can't make progress.
      // eslint-disable-next-line no-constant-condition
      while (true) {
        if (!inCall) {
          const openIdx = pending.indexOf(OPEN);
          if (openIdx === -1) {
            // No complete OPEN tag. Emit everything except a tail that may
            // still turn into one once the next chunk arrives.
            const keep = partialOpenLength(pending);
            const text = pending.slice(0, pending.length - keep);
            if (text) out.push({ type: 'text_delta', text });
            pending = pending.slice(pending.length - keep);
            break;
          }
          // Emit text before the tag, then step inside the call body.
          if (openIdx > 0) out.push({ type: 'text_delta', text: pending.slice(0, openIdx) });
          pending = pending.slice(openIdx + OPEN.length);
          inCall = true;
        } else {
          const closeIdx = pending.indexOf(CLOSE);
          if (closeIdx === -1) break; // wait for more
          const body = pending.slice(0, closeIdx);
          const parsed = parseCallBody(body);
          if (parsed) {
            out.push({ type: 'tool_use', id: nextId(), name: parsed.name, input: parsed.input });
          } else {
            // Couldn't parse — surface the literal text so the user sees
            // what the model emitted instead of a silent drop.
            out.push({ type: 'text_delta', text: OPEN + body + CLOSE });
          }
          pending = pending.slice(closeIdx + CLOSE.length);
          inCall = false;
        }
      }
      return out;
    },

    flush() {
      const out = [];
      if (inCall) {
        // Unterminated <tool_call> at end-of-stream. Surface as text so
        // partial content isn't lost; the runtime won't try to dispatch.
        out.push({ type: 'text_delta', text: OPEN + pending });
      } else if (pending) {
        out.push({ type: 'text_delta', text: pending });
      }
      pending = '';
      inCall = false;
      return out;
    },
  };
}
@@ -1,25 +1,52 @@
1
1
  // Chrome's built-in Prompt API (on-device Gemini Nano / Gemma-derived).
2
- // Wraps `LanguageModel.create()` + `session.promptStreaming()` into a
3
- // runtime-compatible provider — zero-download for users on Chrome that
4
- // already has the weights, reply quality in the ~2B-effective-param
5
- // range (well above what transformers.js practically pulls in-browser).
2
+ // Runtime-compatible provider slots into createRuntime as a peer of
3
+ // anthropic/openai/local. Zero download for users on Chrome that
4
+ // already has the weights; reply quality lands ~2B-effective-param.
6
5
  //
7
6
  // Usage:
8
7
  // import { createRuntime } from '@nevescloud/pip/runtime.esm.js';
9
8
  // import { chrome } from '@nevescloud/pip/providers/chrome.esm.js';
10
9
  //
11
- // const rt = createRuntime({ provider: chrome({ temperature: 0.1 }) });
10
+ // const rt = createRuntime({
11
+ // provider: chrome({ temperature: 0.1 }),
12
+ // tools: [ { name: 'get_time', description: '…', schema: {…}, handler: …} ],
13
+ // });
12
14
  //
13
- // Surface shifted across Chrome versions: the API moved from
14
- // `window.ai.languageModel` (earlier flag-gated builds) to the top-level
15
- // `LanguageModel` constructor as the Prompt API spec settled. We try
16
- // the newer surface first and fall back. Chrome 138+ ships the origin
17
- // trial; ~Chrome 148+ runs without a flag for many origins. Non-Chrome
18
- // browsers throw a friendly error on first invocation.
15
+ // Optimizations (per developer.chrome.com/docs/ai/session-management):
19
16
  //
20
- // Limitations: no tool-use, no images. Tools registered on the runtime
21
- // won't be exposed pip's turn loop still works for slash commands and
22
- // chat, but tool dispatch is a no-op with this provider.
17
+ // * Session cache. First turn calls LanguageModel.create() with the
18
+ // full history as initialPrompts; subsequent turns reuse the same
19
+ // session and only feed the newest turn via session.append(), then
20
+ // prompt with the latest user message. Avoids the create() cost on
21
+ // every chat exchange.
22
+ // * Cache invalidation. If runtime's `messages` shrinks (clear /
23
+ // regenerate / model swap-and-back), the system prompt changes, or
24
+ // the tool schema changes, we destroy() and rebuild — keeps session
25
+ // ↔ runtime in lockstep.
26
+ // * downloadprogress wired to pip's loading bar via the create()
27
+ // monitor option, mounted on the active turnEl. First load only.
28
+ // * AbortSignal flows through create() AND promptStreaming(), so a
29
+ // stop click cancels both model fetch and an in-flight prompt.
30
+ // * temperature/topK are origin-trial / Extensions only. If params()
31
+ // is missing (stable web), we omit both and warn once if the host
32
+ // passed either. If only one is set, we fill the other from
33
+ // LanguageModel.params() to satisfy the API's both-or-neither rule.
34
+ //
35
+ // Tool use (prompt-based — the Prompt API has no native tool channel):
36
+ //
37
+ // * When the runtime passes `tools`, the helper in _tool-prompt.esm.js
38
+ // injects JSON schemas + a response-format instruction into the
39
+ // system prompt and parses the model's stream for
40
+ // <tool_call>{"name":"…","arguments":{…}}</tool_call> blocks.
41
+ // * Detected calls yield as tool_use events with stopReason='tool_use';
42
+ // the runtime dispatches, appends tool_result turns, and re-invokes
43
+ // this provider. The cached session continues from where it left off.
44
+ // * Reliability scales with model size — Nano (~2B effective) handles
45
+ // simple single-tool calls; chained / nested calls are flaky. For
46
+ // hard guarantees, register the action as a slash command instead.
47
+
48
+ import { showLoading, hideLoading } from '../pip-core.esm.js';
49
+ import { buildToolSystemPrompt, flattenMessages, createToolCallParser } from './_tool-prompt.esm.js';
23
50
 
24
51
  const UNAVAILABLE =
25
52
  "Chrome's built-in AI isn't available here. Use Chrome 138+ or enable " +
@@ -32,80 +59,253 @@ function getApi() {
32
59
  return null;
33
60
  }
34
61
 
35
- async function ensureAvailable(LM) {
36
- // Newer: availability() 'available' | 'downloadable' | 'downloading' | 'unavailable'
37
- // Older: capabilities() → { available: 'readily' | 'after-download' | 'no' }
62
// Probe the Prompt API's availability and report the state it returned.
//
// Availability is a *hint*, not a gate. Chrome 148+ has been observed
// returning 'unavailable' from availability() for the default config
// (no expectedOutputs/Inputs specified) while create() with the same
// opts succeeds — the availability check is stricter than create's
// own. We only hard-gate when LM itself is missing (handled upstream);
// here we just surface state for diagnostics and handle 'downloading'
// (which create() can't tolerate on some builds).
//
//   LM                The LanguageModel API object.
//   availabilityOpts  Options forwarded to LM.availability(); {} if nullish.
//   pollMs/timeoutMs  How often and for how long to re-check while the state
//                     is 'downloading' (defaults: 1500 ms / 5 minutes).
//
// Returns the reported state string, 'unknown' when a probe throws or
// reports nothing, or 'available' when LM exposes neither probe.
// Throws only when the model is still downloading after timeoutMs.
async function ensureAvailable(
  LM,
  availabilityOpts,
  { pollMs = 1500, timeoutMs = 5 * 60 * 1000 } = {},
) {
  const probeOpts = availabilityOpts || {};

  // Newer surface: availability() → 'available' | 'downloadable' | 'downloading' | 'unavailable'
  if (typeof LM.availability === 'function') {
    let v;
    try {
      v = await LM.availability(probeOpts);
    } catch (e) {
      // eslint-disable-next-line no-console
      console.warn('[pip/chrome] availability() threw — proceeding to create() to get the real error:', e?.message || e);
      return 'unknown';
    }
    if (v === 'unavailable') {
      // eslint-disable-next-line no-console
      console.warn('[pip/chrome] availability() returned "unavailable" with opts', availabilityOpts, '— proceeding to create() anyway; if Chrome rejects, the real reason will surface.');
      return v;
    }
    if (v === 'downloading') {
      const deadline = Date.now() + timeoutMs;
      while (Date.now() < deadline) {
        await new Promise((r) => setTimeout(r, pollMs));
        let next;
        try {
          next = await LM.availability(probeOpts);
        } catch (e) {
          // Same policy as the first probe: a throwing hint must not abort
          // the turn — let create() report the real problem.
          // eslint-disable-next-line no-console
          console.warn('[pip/chrome] availability() threw — proceeding to create() to get the real error:', e?.message || e);
          return 'unknown';
        }
        if (next === 'available' || next === 'downloadable') return next;
        if (next === 'unavailable') {
          // eslint-disable-next-line no-console
          console.warn('[pip/chrome] availability flipped to "unavailable" mid-download — proceeding to create() to surface the real error.');
          return next;
        }
      }
      throw new Error("Chrome's built-in model is still downloading — try again in a minute.");
    }
    return v;
  }

  // Older surface: capabilities() → { available: 'readily' | 'after-download' | 'no' }
  if (typeof LM.capabilities === 'function') {
    const c = await LM.capabilities();
    if (c?.available === 'no') {
      // eslint-disable-next-line no-console
      console.warn('[pip/chrome] capabilities() reported "no" — proceeding to create() to surface the real error.');
    }
    return c?.available || 'unknown';
  }
  return 'available';
}
48
109
 
49
- export function chrome({ systemPrompt, temperature, topK } = {}) {
50
- return ({ messages, signal, system }) => (async function* () {
51
- const LM = getApi();
52
- if (!LM) throw new Error(UNAVAILABLE);
53
- await ensureAvailable(LM);
54
-
55
- // Runtime's per-call `system` wins over the factory default — same
56
- // precedence anthropic/openai providers use.
57
- const sys = system || systemPrompt;
58
- const initialPrompts = [];
59
- if (sys) initialPrompts.push({ role: 'system', content: sys });
60
- // Replay prior turns. The Prompt API doesn't model tool dispatch, so
61
- // skip non-string content (tool_use / tool_result turns) — they'd
62
- // serialize to "[object Object]" and confuse the model.
63
- for (const m of messages.slice(0, -1)) {
110
// Stable string identifying the tool set the cached session was built with.
// Covers name, description AND the parameter schema, so editing a tool's
// schema alone changes the fingerprint. Handlers are deliberately excluded.
// Returns '' when there are no tools.
function toolsFingerprint(tools) {
  if (!tools?.length) return '';
  return JSON.stringify(
    tools.map((t) => [
      t?.name,
      t?.description || '',
      t?.schema || t?.input_schema || null,
    ]),
  );
}
114
+
115
+ export function chrome({
116
+ systemPrompt,
117
+ temperature,
118
+ topK,
119
+ expectedInputs,
120
+ expectedOutputs,
121
+ } = {}) {
122
+ let sessionPromise = null;
123
+ let consumed = 0; // count of messages already fed into the cached session
124
+ let lastSystem = null;
125
+ let lastToolsFp = '';
126
+ let warnedNoParams = false;
127
+
128
+ async function buildOpts(LM, augmentedSystem, history, monitorFn, signal) {
129
+ const opts = {};
130
+ if (signal) opts.signal = signal;
131
+ if (expectedInputs) opts.expectedInputs = expectedInputs;
132
+ if (expectedOutputs) opts.expectedOutputs = expectedOutputs;
133
+ if (monitorFn) opts.monitor = monitorFn;
134
+
135
+ const initial = [];
136
+ if (augmentedSystem) initial.push({ role: 'system', content: augmentedSystem });
137
+ for (const m of history) {
138
+ // History arrives already flattened: assistant turns that carried
139
+ // tool_use blocks are plain strings here (their `<tool_call>…</tool_call>`
140
+ // text, produced by flattenMessages), so they are safe to replay as-is.
141
+ // Only entries whose content somehow ended up non-string are dropped.
64
142
  if (typeof m.content === 'string') {
65
- initialPrompts.push({ role: m.role, content: m.content });
143
+ initial.push({ role: m.role, content: m.content });
66
144
  }
67
145
  }
68
- const tail = messages[messages.length - 1];
69
- const userText = typeof tail?.content === 'string' ? tail.content : '';
146
+ if (initial.length) opts.initialPrompts = initial;
70
147
 
71
- const opts = {};
72
- if (initialPrompts.length) opts.initialPrompts = initialPrompts;
73
-
74
- // The Prompt API requires both topK and temperature to be set, or
75
- // neither passing one alone throws "Initializing a new session
76
- // must either specify both topK and temperature, or neither". If
77
- // the caller specified one, fetch the other's default from
78
- // LanguageModel.params() so a half-spec doesn't reject the session.
79
- const hasT = temperature != null;
80
- const hasK = topK != null;
81
- if (hasT || hasK) {
82
- let t = temperature, k = topK;
83
- if (hasT !== hasK && typeof LM.params === 'function') {
148
+ // temperature/topK are origin-trial / Extensions only. params() is
149
+ // gated to the same builds — its presence is the feature flag.
150
+ if (temperature != null || topK != null) {
151
+ if (typeof LM.params === 'function') {
152
+ let t = temperature, k = topK;
153
+ if ((t != null) !== (k != null)) {
154
+ try {
155
+ const p = await LM.params();
156
+ if (t == null) t = p?.defaultTemperature;
157
+ if (k == null) k = p?.defaultTopK;
158
+ } catch {}
159
+ }
160
+ if (t == null) t = 1.0;
161
+ if (k == null) k = 40;
162
+ opts.temperature = t;
163
+ opts.topK = k;
164
+ } else if (!warnedNoParams) {
165
+ warnedNoParams = true;
166
+ // eslint-disable-next-line no-console
167
+ console.warn(
168
+ '[pip/chrome] temperature/topK ignored — only supported on ' +
169
+ 'Prompt API for Chrome Extensions or with the Origin Trial enabled.'
170
+ );
171
+ }
172
+ }
173
+ return opts;
174
+ }
175
+
176
+ async function getSession(LM, augmentedSystem, toolsFp, flatMessages, turnEl, signal) {
177
+ const needsReset =
178
+ !sessionPromise ||
179
+ augmentedSystem !== lastSystem ||
180
+ toolsFp !== lastToolsFp ||
181
+ flatMessages.length < consumed + 1;
182
+
183
+ if (needsReset && sessionPromise) {
184
+ try { (await sessionPromise).destroy?.(); } catch {}
185
+ sessionPromise = null;
186
+ consumed = 0;
187
+ }
188
+
189
+ if (!sessionPromise) {
190
+ const history = flatMessages.slice(0, -1);
191
+ const monitorFn = turnEl
192
+ ? (m) => {
193
+ try {
194
+ m.addEventListener?.('downloadprogress', (e) => {
195
+ const pct = Math.round((e.loaded || 0) * 100);
196
+ showLoading(turnEl, `downloading model ${pct}%`, pct);
197
+ });
198
+ } catch {}
199
+ }
200
+ : undefined;
201
+ const opts = await buildOpts(LM, augmentedSystem, history, monitorFn, signal);
202
+ // Some Chrome builds reject role 'system' in initialPrompts and return
203
+ // a generic "unable to create a session" error. Fall back to folding
204
+ // the system content into the first user message — same effect.
205
+ const tryCreate = async () => {
84
206
  try {
85
- const p = await LM.params();
86
- if (!hasT) t = p?.defaultTemperature;
87
- if (!hasK) k = p?.defaultTopK;
88
- } catch {}
207
+ return await LM.create(opts);
208
+ } catch (err) {
209
+ const msg = String(err?.message || err || '');
210
+ if (augmentedSystem && /unable to create|initialPrompt|role/i.test(msg)) {
211
+ // eslint-disable-next-line no-console
212
+ console.warn('[pip/chrome] create() rejected first-pass opts; retrying with system folded into user prompt:', msg);
213
+ const fallback = { ...opts };
214
+ const prompts = [];
215
+ // Fold system + history → tagged first user message.
216
+ const folded = `<<SYSTEM>>\n${augmentedSystem}\n<<END_SYSTEM>>`;
217
+ prompts.push({ role: 'user', content: folded });
218
+ for (const m of history) {
219
+ if (typeof m.content === 'string') prompts.push({ role: m.role, content: m.content });
220
+ }
221
+ fallback.initialPrompts = prompts;
222
+ return await LM.create(fallback);
223
+ }
224
+ throw err;
225
+ }
226
+ };
227
+ sessionPromise = tryCreate().then((s) => {
228
+ if (turnEl) hideLoading(turnEl);
229
+ s.addEventListener?.('contextoverflow', () => {
230
+ // eslint-disable-next-line no-console
231
+ console.warn('[pip/chrome] context window full — older turns evicted by Chrome.');
232
+ });
233
+ return s;
234
+ }).catch((err) => {
235
+ if (turnEl) hideLoading(turnEl);
236
+ sessionPromise = null;
237
+ // eslint-disable-next-line no-console
238
+ console.warn('[pip/chrome] LM.create() failed:', err?.message || err, '— augmentedSystem chars:', augmentedSystem.length, 'history msgs:', history.length);
239
+ throw err;
240
+ });
241
+ consumed = history.length;
242
+ lastSystem = augmentedSystem;
243
+ lastToolsFp = toolsFp;
244
+ } else {
245
+ // Cached session — append messages new since last call. Skip
246
+ // assistant role: the session already generated those itself
247
+ // during the prior promptStreaming() and re-feeding would duplicate.
248
+ const session = await sessionPromise;
249
+ const newSlice = flatMessages.slice(consumed, flatMessages.length - 1);
250
+ for (const m of newSlice) {
251
+ if (m.role === 'assistant') continue;
252
+ if (typeof m.content === 'string' && typeof session.append === 'function') {
253
+ try { await session.append([{ role: m.role, content: m.content }]); }
254
+ catch {}
255
+ }
89
256
  }
90
- // Last-resort defaults if params() isn't available.
91
- if (t == null) t = 1.0;
92
- if (k == null) k = 40;
93
- opts.temperature = t;
94
- opts.topK = k;
257
+ consumed = flatMessages.length - 1;
95
258
  }
259
+ return sessionPromise;
260
+ }
261
+
262
+ return ({ messages, signal, system, tools, turnEl }) => (async function* () {
263
+ const LM = getApi();
264
+ if (!LM) throw new Error(UNAVAILABLE);
96
265
 
97
- const session = await LM.create(opts);
266
+ const effectiveSystem = system || systemPrompt || '';
267
+ const augmentedSystem = buildToolSystemPrompt(effectiveSystem, tools);
268
+ const toolsFp = toolsFingerprint(tools);
269
+
270
+ const availabilityOpts = {};
271
+ if (expectedInputs) availabilityOpts.expectedInputs = expectedInputs;
272
+ if (expectedOutputs) availabilityOpts.expectedOutputs = expectedOutputs;
273
+ await ensureAvailable(LM, availabilityOpts);
274
+
275
+ // Flatten structured messages (runtime's tool_use / tool_result blocks)
276
+ // into the text channel the Prompt API speaks.
277
+ const flat = flattenMessages(messages);
278
+ const session = await getSession(LM, augmentedSystem, toolsFp, flat, turnEl, signal);
279
+ const tail = flat[flat.length - 1];
280
+ const userText = tail?.content || '';
281
+
282
+ const parser = createToolCallParser();
283
+ let sawToolUse = false;
98
284
 
99
285
  try {
100
- for await (const chunk of session.promptStreaming(userText)) {
286
+ const stream = session.promptStreaming(userText, signal ? { signal } : undefined);
287
+ for await (const chunk of stream) {
101
288
  if (signal?.aborted) throw new DOMException('Aborted', 'AbortError');
102
- if (typeof chunk === 'string' && chunk) {
103
- yield { type: 'text_delta', text: chunk };
289
+ if (typeof chunk !== 'string' || !chunk) continue;
290
+ for (const ev of parser.feed(chunk)) {
291
+ if (ev.type === 'tool_use') sawToolUse = true;
292
+ yield ev;
104
293
  }
105
294
  }
106
- yield { type: 'turn_end', stopReason: 'end_turn' };
107
- } finally {
108
- try { session.destroy?.(); } catch {}
295
+ // Flush any pending text the parser buffered (final delta or an
296
+ // unterminated tool_call surfaced as literal text).
297
+ for (const ev of parser.flush()) {
298
+ if (ev.type === 'tool_use') sawToolUse = true;
299
+ yield ev;
300
+ }
301
+ yield { type: 'turn_end', stopReason: sawToolUse ? 'tool_use' : 'end_turn' };
302
+ } catch (err) {
303
+ try { session?.destroy?.(); } catch {}
304
+ sessionPromise = null;
305
+ consumed = 0;
306
+ lastSystem = null;
307
+ lastToolsFp = '';
308
+ throw err;
109
309
  }
110
310
  })();
111
311
  }
@@ -16,6 +16,7 @@
16
16
  // <think> pill rendering); the provider just adapts the call shape.
17
17
 
18
18
  import { showLoading, hideLoading } from '../pip-core.esm.js';
19
+ import { buildToolSystemPrompt, flattenMessages, createToolCallParser } from './_tool-prompt.esm.js';
19
20
 
20
21
  const TRANSFORMERS_URL = 'https://cdn.jsdelivr.net/npm/@huggingface/transformers';
21
22
 
@@ -265,19 +266,50 @@ export function createTransformersRenderer() {
265
266
 
266
267
  // Runtime-compatible provider. The renderer streams reply text via the
267
268
  // setReplyText callback (cumulative buffer per call); we proxy that
268
- // callback, diff each call into a `text_delta` event, and yield events
269
- // the runtime's turn loop already consumes. The <think> pill mounts onto
270
- // turnEl directly inside the renderer — unaffected, still works.
269
+ // callback, diff each call into the tool-aware streaming parser, and
270
+ // yield text_delta + tool_use events the runtime's turn loop consumes.
271
+ // The <think> pill mounts onto turnEl directly inside the renderer —
272
+ // unaffected, still works.
273
+ //
274
+ // Tool calling. When the runtime passes `tools`, we augment the system
275
+ // prompt with JSON schemas + the <tool_call>{…}</tool_call> emit
276
+ // convention (see _tool-prompt.esm.js). Models that follow it (Gemma 4
277
+ // is well-trained on this shape) get dispatched through the runtime's
278
+ // loop; tool_result turns are flattened back into text for the next
279
+ // model call. Gemma 4's native special-token format (<|tool_call>…) is
280
+ // NOT used because TextStreamer with skip_special_tokens drops the
281
+ // markers — the text-channel convention works regardless.
271
282
  //
272
283
  // One pitfall handled: the renderer occasionally re-paints the same
273
284
  // buffer (no new tokens emitted between calls), so the diff guards
274
285
  // against zero-length deltas. AbortSignal flows through naturally —
275
286
  // the underlying TextStreamer throws AbortError, which we surface.
276
- export function local({ model, dtype = 'q4', maxTokens = 256, genParams, chatTemplate } = {}) {
287
+ export function local({
288
+ model,
289
+ dtype = 'q4',
290
+ maxTokens = 256,
291
+ genParams,
292
+ chatTemplate,
293
+ systemPrompt,
294
+ } = {}) {
277
295
  const renderer = createTransformersRenderer();
278
296
  if (model) renderer.setModel({ id: model, dtype, maxTokens, genParams, chatTemplate });
279
297
 
280
- return ({ messages, signal, turnEl, setReplyText }) => (async function* () {
298
+ return ({ messages, signal, system, tools, turnEl, setReplyText }) => (async function* () {
299
+ const effectiveSystem = system || systemPrompt || '';
300
+ const augmentedSystem = buildToolSystemPrompt(effectiveSystem, tools);
301
+
302
+ // Flatten runtime's structured turns (tool_use/tool_result blocks)
303
+ // back into text-channel strings the chat template understands.
304
+ // Prepend the augmented system if any — apply_chat_template renders
305
+ // role:'system' into Gemma's developer/system slot natively.
306
+ const flat = flattenMessages(messages);
307
+ const renderMessages = augmentedSystem
308
+ ? [{ role: 'system', content: augmentedSystem }, ...flat]
309
+ : flat;
310
+
311
+ const parser = createToolCallParser();
312
+ let sawToolUse = false;
281
313
  let lastFull = '';
282
314
  const queue = [];
283
315
  let wake = null;
@@ -285,16 +317,25 @@ export function local({ model, dtype = 'q4', maxTokens = 256, genParams, chatTem
285
317
  let error = null;
286
318
 
287
319
  const proxySetReplyText = (_el, fullText) => {
320
+ if (fullText.length <= lastFull.length) return;
288
321
  const delta = fullText.slice(lastFull.length);
289
322
  lastFull = fullText;
290
- if (delta) {
291
- queue.push({ type: 'text_delta', text: delta });
292
- wake?.();
323
+ for (const ev of parser.feed(delta)) {
324
+ if (ev.type === 'tool_use') sawToolUse = true;
325
+ queue.push(ev);
293
326
  }
327
+ wake?.();
294
328
  };
295
329
 
296
- renderer.generate({ messages, turnEl, setReplyText: proxySetReplyText, signal })
297
- .then(() => { done = true; wake?.(); })
330
+ renderer.generate({ messages: renderMessages, turnEl, setReplyText: proxySetReplyText, signal })
331
+ .then(() => {
332
+ for (const ev of parser.flush()) {
333
+ if (ev.type === 'tool_use') sawToolUse = true;
334
+ queue.push(ev);
335
+ }
336
+ done = true;
337
+ wake?.();
338
+ })
298
339
  .catch((e) => { error = e; done = true; wake?.(); });
299
340
 
300
341
  while (true) {
@@ -304,7 +345,7 @@ export function local({ model, dtype = 'q4', maxTokens = 256, genParams, chatTem
304
345
  }
305
346
 
306
347
  if (error) throw error;
307
- yield { type: 'turn_end', stopReason: 'end_turn' };
348
+ yield { type: 'turn_end', stopReason: sawToolUse ? 'tool_use' : 'end_turn' };
308
349
  })();
309
350
  }
310
351