npm - @nevescloud/pip - Versions diffs - 3.7.0 → 3.8.0 - Mend

@nevescloud/pip 3.7.0 → 3.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json +1 -1
package/providers/_tool-prompt.esm.js +169 -0
package/providers/chrome.esm.js +268 -68
package/providers/local.esm.js +52 -11

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@nevescloud/pip",
-  "version": "3.7.0",
+  "version": "3.8.0",
   "description": "Floating assistant bubble + panel + chat runtime. ESM, no build.",
   "type": "module",
   "main": "pip-core.esm.js",

package/providers/_tool-prompt.esm.js ADDED Viewed

@@ -0,0 +1,169 @@
+// Prompt-based tool calling for text-only providers (local, chrome).
+//
+// Models without a native tool-use protocol (Chrome's Prompt API) — and
+// models whose native protocol is gated behind special tokens that
+// transformers.js's TextStreamer strips with skip_special_tokens (Gemma
+// 4's <|tool_call> family) — both need a text-channel convention.
+//
+// We use the community XML-JSON format: <tool_call>{"name":"…","arguments":{…}}</tool_call>.
+// It survives any tokenizer setting (literal characters), parses with a
+// regex, and is well-represented in instruction-tuned model training
+// data (Hermes 2 Pro, Nous, and many fine-tunes converge here).
+//
+// Three exports:
+//   * buildToolSystemPrompt(systemPrompt, tools) — augments the system
+//     prompt with tool schemas + the response-format instruction.
+//   * flattenMessages(messages) — converts the runtime's structured
+//     tool_use/tool_result blocks back into the text channel so text-only
+//     providers can replay them.
+//   * createToolCallParser({ idGen }) — streaming parser. .feed(chunk)
+//     returns an array of text_delta + tool_use events; .flush() returns
+//     any trailing text as a final text_delta.
+const OPEN = '<tool_call>'; // Literal tag that opens a tool-call block in the text channel.
+const CLOSE = '</tool_call>'; // Literal tag that closes a tool-call block.
+export function buildToolSystemPrompt(systemPrompt, tools) {
+  if (!tools?.length) return systemPrompt || '';
+  const schemas = tools.map((t) => ({
+    name: t.name,
+    description: t.description || '',
+    parameters: t.schema || t.input_schema || { type: 'object', properties: {} },
+  }));
+  // The trailing example is critical for small models — Gemini Nano in
+  // particular often emits `function_call` or `tool_use` keys without it.
+  // The "answer directly otherwise" line keeps single-shot chat working.
+  const instruction = [
+    'You have access to the following tools:',
+    JSON.stringify(schemas, null, 2),
+    '',
+    'When you want to call a tool, emit a single line in this exact format:',
+    `${OPEN}{"name":"tool_name","arguments":{"key":"value"}}${CLOSE}`,
+    '',
+    'A tool result will be returned to you wrapped in <tool_result>…</tool_result>.',
+    'Use the result to compose your final answer. If no tool is needed, answer directly.',
+  ].join('\n');
+  return systemPrompt ? `${systemPrompt}\n\n${instruction}` : instruction;
+}
+// Convert runtime's structured assistant turns ({content: [{type:'tool_use'},…]})
+// and tool_result user turns back into flat text so text-only providers can
+// replay the history. The runtime's tool loop produces these between
+// iterations; the provider sees them on the next call.
+export function flattenMessages(messages) {
+  return messages.map((m) => {
+    if (typeof m.content === 'string') return { role: m.role, content: m.content };
+    if (!Array.isArray(m.content)) return { role: m.role, content: '' };
+    const parts = [];
+    for (const block of m.content) {
+      if (block.type === 'text') {
+        parts.push(block.text);
+      } else if (block.type === 'tool_use') {
+        const args = block.input == null ? {} : block.input;
+        parts.push(`${OPEN}${JSON.stringify({ name: block.name, arguments: args })}${CLOSE}`);
+      } else if (block.type === 'tool_result') {
+        const c = typeof block.content === 'string' ? block.content : JSON.stringify(block.content);
+        const name = block.name ? ` name="${block.name}"` : '';
+        parts.push(`<tool_result${name}>${c}</tool_result>`);
+      }
+    }
+    return { role: m.role, content: parts.join('\n') };
+  });
+}
+// Streaming parser. Feed chunks of generated text as they arrive; receive
+// a flat list of runtime events. State is internal — one parser per turn.
+//
+// Behaviour:
+//   * Text outside <tool_call>…</tool_call> emits as text_delta. We hold
+//     back the trailing OPEN.length-1 chars on each feed so a partial
+//     opening tag spanning chunks doesn't leak as text.
+//   * A complete <tool_call>…</tool_call> block emits as tool_use. The
+//     payload is parsed as JSON; malformed JSON falls back to a text
+//     emission so the model's intent isn't silently dropped.
+//   * flush() emits any remaining text and any partial unterminated
+//     tool_call as text (rather than yielding a malformed tool_use that
+//     would derail the runtime's loop).
+export function createToolCallParser({ idGen } = {}) {
+  const id = idGen || (() => `tu_${Date.now().toString(36)}_${Math.random().toString(36).slice(2, 8)}`);
+  let buf = '';
+  let emitted = 0;   // index up to which we've emitted (text or call body)
+  let inCall = false;
+  let callBodyStart = -1;
+  function readTextUpTo(safeUpTo) {
+    if (safeUpTo <= emitted) return null;
+    const text = buf.slice(emitted, safeUpTo);
+    emitted = safeUpTo;
+    return text ? { type: 'text_delta', text } : null;
+  }
+  function parseCallBody(body) {
+    try {
+      const obj = JSON.parse(body.trim());
+      if (obj && typeof obj.name === 'string') {
+        return { name: obj.name, input: obj.arguments ?? obj.parameters ?? {} };
+      }
+    } catch {}
+    return null;
+  }
+  return {
+    feed(chunk) {
+      if (!chunk) return [];
+      buf += chunk;
+      const out = [];
+      // Run until we either consume the buffer or can't make progress.
+      // eslint-disable-next-line no-constant-condition
+      while (true) {
+        if (!inCall) {
+          const openIdx = buf.indexOf(OPEN, emitted);
+          if (openIdx === -1) {
+            // No OPEN tag yet. Hold back the last OPEN.length-1 chars in
+            // case a partial tag straddles the next chunk.
+            const safe = Math.max(emitted, buf.length - (OPEN.length - 1));
+            const ev = readTextUpTo(safe);
+            if (ev) out.push(ev);
+            break;
+          }
+          // Emit text before the tag
+          const ev = readTextUpTo(openIdx);
+          if (ev) out.push(ev);
+          inCall = true;
+          callBodyStart = openIdx + OPEN.length;
+          emitted = callBodyStart;
+        } else {
+          const closeIdx = buf.indexOf(CLOSE, callBodyStart);
+          if (closeIdx === -1) break; // wait for more
+          const body = buf.slice(callBodyStart, closeIdx);
+          const parsed = parseCallBody(body);
+          if (parsed) {
+            out.push({ type: 'tool_use', id: id(), name: parsed.name, input: parsed.input });
+          } else {
+            // Couldn't parse — surface the literal text so the user sees
+            // what the model emitted instead of a silent drop.
+            out.push({ type: 'text_delta', text: OPEN + body + CLOSE });
+          }
+          inCall = false;
+          callBodyStart = -1;
+          emitted = closeIdx + CLOSE.length;
+        }
+      }
+      return out;
+    },
+    flush() {
+      const out = [];
+      if (inCall) {
+        // Unterminated <tool_call> at end-of-stream. Surface as text so
+        // partial content isn't lost; the runtime won't try to dispatch.
+        out.push({ type: 'text_delta', text: buf.slice(emitted - OPEN.length) });
+        emitted = buf.length;
+        inCall = false;
+      } else if (emitted < buf.length) {
+        out.push({ type: 'text_delta', text: buf.slice(emitted) });
+        emitted = buf.length;
+      }
+      return out;
+    },
+  };
+}

package/providers/chrome.esm.js CHANGED Viewed

@@ -1,25 +1,52 @@
 // Chrome's built-in Prompt API (on-device Gemini Nano / Gemma-derived).
-// Wraps `LanguageModel.create()` + `session.promptStreaming()` into a
-// runtime-compatible provider — zero-download for users on Chrome that
-// already has the weights, reply quality in the ~2B-effective-param
-// range (well above what transformers.js practically pulls in-browser).
+// Runtime-compatible provider — slots into createRuntime as a peer of
+// anthropic/openai/local. Zero download for users on Chrome that
+// already has the weights; reply quality lands ~2B-effective-param.
 //
 // Usage:
 //   import { createRuntime } from '@nevescloud/pip/runtime.esm.js';
 //   import { chrome }        from '@nevescloud/pip/providers/chrome.esm.js';
 //
-//   const rt = createRuntime({ provider: chrome({ temperature: 0.1 }) });
+//   const rt = createRuntime({
+//     provider: chrome({ temperature: 0.1 }),
+//     tools: [ { name: 'get_time', description: '…', schema: {…}, handler: …} ],
+//   });
 //
-// Surface shifted across Chrome versions: the API moved from
-// `window.ai.languageModel` (earlier flag-gated builds) to the top-level
-// `LanguageModel` constructor as the Prompt API spec settled. We try
-// the newer surface first and fall back. Chrome 138+ ships the origin
-// trial; ~Chrome 148+ runs without a flag for many origins. Non-Chrome
-// browsers throw a friendly error on first invocation.
+// Optimizations (per developer.chrome.com/docs/ai/session-management):
 //
-// Limitations: no tool-use, no images. Tools registered on the runtime
-// won't be exposed — pip's turn loop still works for slash commands and
-// chat, but tool dispatch is a no-op with this provider.
+//   * Session cache. First turn calls LanguageModel.create() with the
+//     full history as initialPrompts; subsequent turns reuse the same
+//     session and only feed the newest turn via session.append(), then
+//     prompt with the latest user message. Avoids the create() cost on
+//     every chat exchange.
+//   * Cache invalidation. If runtime's `messages` shrinks (clear /
+//     regenerate / model swap-and-back), the system prompt changes, or
+//     the tool schema changes, we destroy() and rebuild — keeps session
+//     ↔ runtime in lockstep.
+//   * downloadprogress wired to pip's loading bar via the create()
+//     monitor option, mounted on the active turnEl. First load only.
+//   * AbortSignal flows through create() AND promptStreaming(), so a
+//     stop click cancels both model fetch and an in-flight prompt.
+//   * temperature/topK are origin-trial / Extensions only. If params()
+//     is missing (stable web), we omit both and warn once if the host
+//     passed either. If only one is set, we fill the other from
+//     LanguageModel.params() to satisfy the API's both-or-neither rule.
+//
+// Tool use (prompt-based — the Prompt API has no native tool channel):
+//
+//   * When the runtime passes `tools`, the helper in _tool-prompt.esm.js
+//     injects JSON schemas + a response-format instruction into the
+//     system prompt and parses the model's stream for
+//     <tool_call>{"name":"…","arguments":{…}}</tool_call> blocks.
+//   * Detected calls yield as tool_use events with stopReason='tool_use';
+//     the runtime dispatches, appends tool_result turns, and re-invokes
+//     this provider. The cached session continues from where it left off.
+//   * Reliability scales with model size — Nano (~2B effective) handles
+//     simple single-tool calls; chained / nested calls are flaky. For
+//     hard guarantees, register the action as a slash command instead.
+import { showLoading, hideLoading } from '../pip-core.esm.js';
+import { buildToolSystemPrompt, flattenMessages, createToolCallParser } from './_tool-prompt.esm.js';
 const UNAVAILABLE =
   "Chrome's built-in AI isn't available here. Use Chrome 138+ or enable " +
@@ -32,80 +59,253 @@ function getApi() {
   return null;
 }
-async function ensureAvailable(LM) {
-  // Newer: availability() → 'available' | 'downloadable' | 'downloading' | 'unavailable'
-  // Older: capabilities() → { available: 'readily' | 'after-download' | 'no' }
+async function ensureAvailable(LM, availabilityOpts) {
+  // Availability is a *hint*, not a gate. Chrome 148+ has been observed
+  // returning 'unavailable' from availability() for the default config
+  // (no expectedOutputs/Inputs specified) while create() with the same
+  // opts succeeds — the availability check is stricter than create's
+  // own. We only hard-gate when LM itself is missing (handled upstream);
+  // here we just surface state for diagnostics and handle 'downloading'
+  // (which create() can't tolerate on some builds).
   if (typeof LM.availability === 'function') {
-    const v = await LM.availability();
-    if (v === 'unavailable') throw new Error(UNAVAILABLE);
-    return;
+    let v;
+    try { v = await LM.availability(availabilityOpts || {}); }
+    catch (e) {
+      // eslint-disable-next-line no-console
+      console.warn('[pip/chrome] availability() threw — proceeding to create() to get the real error:', e?.message || e);
+      return 'unknown';
+    }
+    if (v === 'unavailable') {
+      // eslint-disable-next-line no-console
+      console.warn('[pip/chrome] availability() returned "unavailable" with opts', availabilityOpts, '— proceeding to create() anyway; if Chrome rejects, the real reason will surface.');
+      return v;
+    }
+    if (v === 'downloading') {
+      const deadline = Date.now() + 5 * 60 * 1000;
+      while (Date.now() < deadline) {
+        await new Promise((r) => setTimeout(r, 1500));
+        const next = await LM.availability(availabilityOpts || {});
+        if (next === 'available' || next === 'downloadable') return next;
+        if (next === 'unavailable') {
+          // eslint-disable-next-line no-console
+          console.warn('[pip/chrome] availability flipped to "unavailable" mid-download — proceeding to create() to surface the real error.');
+          return next;
+        }
+      }
+      throw new Error("Chrome's built-in model is still downloading — try again in a minute.");
+    }
+    return v;
   }
   if (typeof LM.capabilities === 'function') {
     const c = await LM.capabilities();
-    if (c?.available === 'no') throw new Error(UNAVAILABLE);
+    if (c?.available === 'no') {
+      // eslint-disable-next-line no-console
+      console.warn('[pip/chrome] capabilities() reported "no" — proceeding to create() to surface the real error.');
+    }
+    return c?.available || 'unknown';
   }
+  return 'available';
 }
-export function chrome({ systemPrompt, temperature, topK } = {}) {
-  return ({ messages, signal, system }) => (async function* () {
-    const LM = getApi();
-    if (!LM) throw new Error(UNAVAILABLE);
-    await ensureAvailable(LM);
-    // Runtime's per-call `system` wins over the factory default — same
-    // precedence anthropic/openai providers use.
-    const sys = system || systemPrompt;
-    const initialPrompts = [];
-    if (sys) initialPrompts.push({ role: 'system', content: sys });
-    // Replay prior turns. The Prompt API doesn't model tool dispatch, so
-    // skip non-string content (tool_use / tool_result turns) — they'd
-    // serialize to "[object Object]" and confuse the model.
-    for (const m of messages.slice(0, -1)) {
+function toolsFingerprint(tools) {
+  if (!tools?.length) return '';
+  return tools.map((t) => `${t.name}:${t.description || ''}`).join('|');
+}
+export function chrome({
+  systemPrompt,
+  temperature,
+  topK,
+  expectedInputs,
+  expectedOutputs,
+} = {}) {
+  let sessionPromise = null;
+  let consumed = 0;       // count of messages already fed into the cached session
+  let lastSystem = null;
+  let lastToolsFp = '';
+  let warnedNoParams = false;
+  async function buildOpts(LM, augmentedSystem, history, monitorFn, signal) {
+    const opts = {};
+    if (signal) opts.signal = signal;
+    if (expectedInputs) opts.expectedInputs = expectedInputs;
+    if (expectedOutputs) opts.expectedOutputs = expectedOutputs;
+    if (monitorFn) opts.monitor = monitorFn;
+    const initial = [];
+    if (augmentedSystem) initial.push({ role: 'system', content: augmentedSystem });
+    for (const m of history) {
+      // Replay every string-typed turn, including assistant turns such as
+      // `{ role: 'assistant', content: '…tool_call…' }` — flattenMessages has
+      // already encoded their tool_use blocks as text, so they are safe to
+      // replay. Only drop anything that ended up non-string.
       if (typeof m.content === 'string') {
-        initialPrompts.push({ role: m.role, content: m.content });
+        initial.push({ role: m.role, content: m.content });
       }
     }
-    const tail = messages[messages.length - 1];
-    const userText = typeof tail?.content === 'string' ? tail.content : '';
+    if (initial.length) opts.initialPrompts = initial;
-    const opts = {};
-    if (initialPrompts.length) opts.initialPrompts = initialPrompts;
-    // The Prompt API requires both topK and temperature to be set, or
-    // neither — passing one alone throws "Initializing a new session
-    // must either specify both topK and temperature, or neither". If
-    // the caller specified one, fetch the other's default from
-    // LanguageModel.params() so a half-spec doesn't reject the session.
-    const hasT = temperature != null;
-    const hasK = topK != null;
-    if (hasT || hasK) {
-      let t = temperature, k = topK;
-      if (hasT !== hasK && typeof LM.params === 'function') {
+    // temperature/topK are origin-trial / Extensions only. params() is
+    // gated to the same builds — its presence is the feature flag.
+    if (temperature != null || topK != null) {
+      if (typeof LM.params === 'function') {
+        let t = temperature, k = topK;
+        if ((t != null) !== (k != null)) {
+          try {
+            const p = await LM.params();
+            if (t == null) t = p?.defaultTemperature;
+            if (k == null) k = p?.defaultTopK;
+          } catch {}
+        }
+        if (t == null) t = 1.0;
+        if (k == null) k = 40;
+        opts.temperature = t;
+        opts.topK = k;
+      } else if (!warnedNoParams) {
+        warnedNoParams = true;
+        // eslint-disable-next-line no-console
+        console.warn(
+          '[pip/chrome] temperature/topK ignored — only supported on ' +
+          'Prompt API for Chrome Extensions or with the Origin Trial enabled.'
+        );
+      }
+    }
+    return opts;
+  }
+  async function getSession(LM, augmentedSystem, toolsFp, flatMessages, turnEl, signal) {
+    const needsReset =
+      !sessionPromise ||
+      augmentedSystem !== lastSystem ||
+      toolsFp !== lastToolsFp ||
+      flatMessages.length < consumed + 1;
+    if (needsReset && sessionPromise) {
+      try { (await sessionPromise).destroy?.(); } catch {}
+      sessionPromise = null;
+      consumed = 0;
+    }
+    if (!sessionPromise) {
+      const history = flatMessages.slice(0, -1);
+      const monitorFn = turnEl
+        ? (m) => {
+            try {
+              m.addEventListener?.('downloadprogress', (e) => {
+                const pct = Math.round((e.loaded || 0) * 100);
+                showLoading(turnEl, `downloading model ${pct}%`, pct);
+              });
+            } catch {}
+          }
+        : undefined;
+      const opts = await buildOpts(LM, augmentedSystem, history, monitorFn, signal);
+      // Some Chrome builds reject role 'system' in initialPrompts and return
+      // a generic "unable to create a session" error. Fall back to folding
+      // the system content into the first user message — same effect.
+      const tryCreate = async () => {
         try {
-          const p = await LM.params();
-          if (!hasT) t = p?.defaultTemperature;
-          if (!hasK) k = p?.defaultTopK;
-        } catch {}
+          return await LM.create(opts);
+        } catch (err) {
+          const msg = String(err?.message || err || '');
+          if (augmentedSystem && /unable to create|initialPrompt|role/i.test(msg)) {
+            // eslint-disable-next-line no-console
+            console.warn('[pip/chrome] create() rejected first-pass opts; retrying with system folded into user prompt:', msg);
+            const fallback = { ...opts };
+            const prompts = [];
+            // Fold the system prompt into a tagged first user message; history follows unchanged.
+            const folded = `<<SYSTEM>>\n${augmentedSystem}\n<<END_SYSTEM>>`;
+            prompts.push({ role: 'user', content: folded });
+            for (const m of history) {
+              if (typeof m.content === 'string') prompts.push({ role: m.role, content: m.content });
+            }
+            fallback.initialPrompts = prompts;
+            return await LM.create(fallback);
+          }
+          throw err;
+        }
+      };
+      sessionPromise = tryCreate().then((s) => {
+        if (turnEl) hideLoading(turnEl);
+        s.addEventListener?.('contextoverflow', () => {
+          // eslint-disable-next-line no-console
+          console.warn('[pip/chrome] context window full — older turns evicted by Chrome.');
+        });
+        return s;
+      }).catch((err) => {
+        if (turnEl) hideLoading(turnEl);
+        sessionPromise = null;
+        // eslint-disable-next-line no-console
+        console.warn('[pip/chrome] LM.create() failed:', err?.message || err, '— augmentedSystem chars:', augmentedSystem.length, 'history msgs:', history.length);
+        throw err;
+      });
+      consumed = history.length;
+      lastSystem = augmentedSystem;
+      lastToolsFp = toolsFp;
+    } else {
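+      // Cached session — append messages new since last call. Skip
+      // assistant role: the session already generated those itself
+      // during the prior promptStreaming() and re-feeding would duplicate.
+      // NOTE(review): `consumed` stops one short of the previous call's tail
+      // message, which was already sent via promptStreaming(); unless that
+      // tail is an assistant turn it looks like it gets appended a second
+      // time here — confirm, and consider counting the tail as consumed.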
+      const session = await sessionPromise;
+      const newSlice = flatMessages.slice(consumed, flatMessages.length - 1);
+      for (const m of newSlice) {
+        if (m.role === 'assistant') continue;
+        if (typeof m.content === 'string' && typeof session.append === 'function') {
+          try { await session.append([{ role: m.role, content: m.content }]); }
+          catch {}
+        }
       }
-      // Last-resort defaults if params() isn't available.
-      if (t == null) t = 1.0;
-      if (k == null) k = 40;
-      opts.temperature = t;
-      opts.topK = k;
+      consumed = flatMessages.length - 1;
     }
+    return sessionPromise;
+  }
+  return ({ messages, signal, system, tools, turnEl }) => (async function* () {
+    const LM = getApi();
+    if (!LM) throw new Error(UNAVAILABLE);
-    const session = await LM.create(opts);
+    const effectiveSystem = system || systemPrompt || '';
+    const augmentedSystem = buildToolSystemPrompt(effectiveSystem, tools);
+    const toolsFp = toolsFingerprint(tools);
+    const availabilityOpts = {};
+    if (expectedInputs) availabilityOpts.expectedInputs = expectedInputs;
+    if (expectedOutputs) availabilityOpts.expectedOutputs = expectedOutputs;
+    await ensureAvailable(LM, availabilityOpts);
+    // Flatten structured messages (runtime's tool_use / tool_result blocks)
+    // into the text channel the Prompt API speaks.
+    const flat = flattenMessages(messages);
+    const session = await getSession(LM, augmentedSystem, toolsFp, flat, turnEl, signal);
+    const tail = flat[flat.length - 1];
+    const userText = tail?.content || '';
+    const parser = createToolCallParser();
+    let sawToolUse = false;
     try {
-      for await (const chunk of session.promptStreaming(userText)) {
+      const stream = session.promptStreaming(userText, signal ? { signal } : undefined);
+      for await (const chunk of stream) {
         if (signal?.aborted) throw new DOMException('Aborted', 'AbortError');
-        if (typeof chunk === 'string' && chunk) {
-          yield { type: 'text_delta', text: chunk };
+        if (typeof chunk !== 'string' || !chunk) continue;
+        for (const ev of parser.feed(chunk)) {
+          if (ev.type === 'tool_use') sawToolUse = true;
+          yield ev;
         }
       }
-      yield { type: 'turn_end', stopReason: 'end_turn' };
-    } finally {
-      try { session.destroy?.(); } catch {}
+      // Flush any pending text the parser buffered (final delta or an
+      // unterminated tool_call surfaced as literal text).
+      for (const ev of parser.flush()) {
+        if (ev.type === 'tool_use') sawToolUse = true;
+        yield ev;
+      }
+      yield { type: 'turn_end', stopReason: sawToolUse ? 'tool_use' : 'end_turn' };
+    } catch (err) {
+      try { session?.destroy?.(); } catch {}
+      sessionPromise = null;
+      consumed = 0;
+      lastSystem = null;
+      lastToolsFp = '';
+      throw err;
     }
   })();
 }

package/providers/local.esm.js CHANGED Viewed

@@ -16,6 +16,7 @@
 // <think> pill rendering); the provider just adapts the call shape.
 import { showLoading, hideLoading } from '../pip-core.esm.js';
+import { buildToolSystemPrompt, flattenMessages, createToolCallParser } from './_tool-prompt.esm.js';
 const TRANSFORMERS_URL = 'https://cdn.jsdelivr.net/npm/@huggingface/transformers';
@@ -265,19 +266,50 @@ export function createTransformersRenderer() {
 // Runtime-compatible provider. The renderer streams reply text via the
 // setReplyText callback (cumulative buffer per call); we proxy that
-// callback, diff each call into a `text_delta` event, and yield events
-// the runtime's turn loop already consumes. The <think> pill mounts onto
-// turnEl directly inside the renderer — unaffected, still works.
+// callback, diff each call into the tool-aware streaming parser, and
+// yield text_delta + tool_use events the runtime's turn loop consumes.
+// The <think> pill mounts onto turnEl directly inside the renderer —
+// unaffected, still works.
+//
+// Tool calling. When the runtime passes `tools`, we augment the system
+// prompt with JSON schemas + the <tool_call>{…}</tool_call> emit
+// convention (see _tool-prompt.esm.js). Models that follow it (Gemma 4
+// is well-trained on this shape) get dispatched through the runtime's
+// loop; tool_result turns are flattened back into text for the next
+// model call. Gemma 4's native special-token format (<|tool_call>…) is
+// NOT used because TextStreamer with skip_special_tokens drops the
+// markers — the text-channel convention works regardless.
 //
 // One pitfall handled: the renderer occasionally re-paints the same
 // buffer (no new tokens emitted between calls), so the diff guards
 // against zero-length deltas. AbortSignal flows through naturally —
 // the underlying TextStreamer throws AbortError, which we surface.
-export function local({ model, dtype = 'q4', maxTokens = 256, genParams, chatTemplate } = {}) {
+export function local({
+  model,
+  dtype = 'q4',
+  maxTokens = 256,
+  genParams,
+  chatTemplate,
+  systemPrompt,
+} = {}) {
   const renderer = createTransformersRenderer();
   if (model) renderer.setModel({ id: model, dtype, maxTokens, genParams, chatTemplate });
-  return ({ messages, signal, turnEl, setReplyText }) => (async function* () {
+  return ({ messages, signal, system, tools, turnEl, setReplyText }) => (async function* () {
+    const effectiveSystem = system || systemPrompt || '';
+    const augmentedSystem = buildToolSystemPrompt(effectiveSystem, tools);
+    // Flatten runtime's structured turns (tool_use/tool_result blocks)
+    // back into text-channel strings the chat template understands.
+    // Prepend the augmented system if any — apply_chat_template renders
+    // role:'system' into Gemma's developer/system slot natively.
+    const flat = flattenMessages(messages);
+    const renderMessages = augmentedSystem
+      ? [{ role: 'system', content: augmentedSystem }, ...flat]
+      : flat;
+    const parser = createToolCallParser();
+    let sawToolUse = false;
     let lastFull = '';
     const queue = [];
     let wake = null;
@@ -285,16 +317,25 @@ export function local({ model, dtype = 'q4', maxTokens = 256, genParams, chatTem
     let error = null;
     const proxySetReplyText = (_el, fullText) => {
+      if (fullText.length <= lastFull.length) return;
       const delta = fullText.slice(lastFull.length);
       lastFull = fullText;
-      if (delta) {
-        queue.push({ type: 'text_delta', text: delta });
-        wake?.();
+      for (const ev of parser.feed(delta)) {
+        if (ev.type === 'tool_use') sawToolUse = true;
+        queue.push(ev);
       }
+      wake?.();
     };
-    renderer.generate({ messages, turnEl, setReplyText: proxySetReplyText, signal })
-      .then(() => { done = true; wake?.(); })
+    renderer.generate({ messages: renderMessages, turnEl, setReplyText: proxySetReplyText, signal })
+      .then(() => {
+        for (const ev of parser.flush()) {
+          if (ev.type === 'tool_use') sawToolUse = true;
+          queue.push(ev);
+        }
+        done = true;
+        wake?.();
+      })
       .catch((e) => { error = e; done = true; wake?.(); });
     while (true) {
@@ -304,7 +345,7 @@ export function local({ model, dtype = 'q4', maxTokens = 256, genParams, chatTem
     }
     if (error) throw error;
-    yield { type: 'turn_end', stopReason: 'end_turn' };
+    yield { type: 'turn_end', stopReason: sawToolUse ? 'tool_use' : 'end_turn' };
   })();
 }