@nevescloud/pip 3.7.0 → 3.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/providers/_tool-prompt.esm.js +169 -0
- package/providers/chrome.esm.js +268 -68
- package/providers/local.esm.js +52 -11
package/package.json
CHANGED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
// Prompt-based tool calling for text-only providers (local, chrome).
|
|
2
|
+
//
|
|
3
|
+
// Models without a native tool-use protocol (Chrome's Prompt API) — and
|
|
4
|
+
// models whose native protocol is gated behind special tokens that
|
|
5
|
+
// transformers.js's TextStreamer strips with skip_special_tokens (Gemma
|
|
6
|
+
// 4's <|tool_call> family) — both need a text-channel convention.
|
|
7
|
+
//
|
|
8
|
+
// We use the community XML-JSON format: <tool_call>{"name":"…","arguments":{…}}</tool_call>.
|
|
9
|
+
// It survives any tokenizer setting (literal characters), parses with a
|
|
10
|
+
// regex, and is well-represented in instruction-tuned model training
|
|
11
|
+
// data (Hermes 2 Pro, Nous, and many fine-tunes converge here).
|
|
12
|
+
//
|
|
13
|
+
// Three exports:
|
|
14
|
+
// * buildToolSystemPrompt(systemPrompt, tools) — augments the system
|
|
15
|
+
// prompt with tool schemas + the response-format instruction.
|
|
16
|
+
// * flattenMessages(messages) — converts the runtime's structured
|
|
17
|
+
// tool_use/tool_result blocks back into the text channel so text-only
|
|
18
|
+
// providers can replay them.
|
|
19
|
+
// * createToolCallParser({ idGen }) — streaming parser. .feed(chunk)
|
|
20
|
+
// yields text_delta + tool_use events; .flush() emits any trailing text.
|
|
21
|
+
|
|
22
|
+
const OPEN = '<tool_call>';
|
|
23
|
+
const CLOSE = '</tool_call>';
|
|
24
|
+
|
|
25
|
+
/**
 * Append tool-calling instructions to a system prompt.
 *
 * When `tools` is empty or missing, the system prompt is returned untouched
 * (or '' when it is falsy). Otherwise the tool list is serialised as
 * pretty-printed JSON and followed by the <tool_call> response-format
 * instructions, separated from any existing system prompt by a blank line.
 *
 * @param {string} [systemPrompt] - Host-supplied system prompt, if any.
 * @param {Array<object>} [tools] - Tool definitions; each may carry `name`,
 *   `description`, and a JSON schema under `schema` or `input_schema`.
 * @returns {string} The system prompt to hand to the model.
 */
export function buildToolSystemPrompt(systemPrompt, tools) {
  const toolCount = tools?.length;
  if (!toolCount) {
    return systemPrompt || '';
  }

  // Reduce each tool to the three fields shown to the model; a tool with no
  // schema is advertised as taking an empty object.
  const toSpec = (tool) => {
    const spec = {};
    spec.name = tool.name;
    spec.description = tool.description || '';
    spec.parameters = tool.schema || tool.input_schema || { type: 'object', properties: {} };
    return spec;
  };
  const specs = tools.map((tool) => toSpec(tool));
  const specsJson = JSON.stringify(specs, null, 2);

  // The trailing example is critical for small models — Gemini Nano in
  // particular often emits `function_call` or `tool_use` keys without it.
  const callExample = `${OPEN}{"name":"tool_name","arguments":{"key":"value"}}${CLOSE}`;

  const sections = [];
  sections.push('You have access to the following tools:', specsJson, '');
  sections.push('When you want to call a tool, emit a single line in this exact format:', callExample, '');
  // The "answer directly otherwise" line keeps single-shot chat working.
  sections.push(
    'A tool result will be returned to you wrapped in <tool_result>…</tool_result>.',
    'Use the result to compose your final answer. If no tool is needed, answer directly.',
  );
  const toolInstructions = sections.join('\n');

  if (systemPrompt) {
    return `${systemPrompt}\n\n${toolInstructions}`;
  }
  return toolInstructions;
}
|
|
47
|
+
|
|
48
|
+
// Convert runtime's structured assistant turns ({content: [{type:'tool_use'},…]})
|
|
49
|
+
// and tool_result user turns back into flat text so text-only providers can
|
|
50
|
+
// replay the history. The runtime's tool loop produces these between
|
|
51
|
+
// iterations; the provider sees them on the next call.
|
|
52
|
+
/**
 * Flatten structured chat messages into plain `{ role, content: string }`
 * turns that a text-only provider can replay.
 *
 * String content passes through unchanged. Array content is rendered block
 * by block and joined with newlines:
 *   - `text`        → the block's text
 *   - `tool_use`    → <tool_call>{"name":…,"arguments":…}</tool_call>
 *   - `tool_result` → <tool_result name="…">…</tool_result>
 * Any other content shape flattens to an empty string.
 *
 * @param {Array<{role: string, content: string | Array<object>}>} messages
 * @returns {Array<{role: string, content: string}>}
 */
export function flattenMessages(messages) {
  return messages.map((m) => {
    if (typeof m.content === 'string') return { role: m.role, content: m.content };
    if (!Array.isArray(m.content)) return { role: m.role, content: '' };

    const parts = [];
    for (const block of m.content) {
      // Tolerate holes / null entries instead of throwing on `block.type`.
      if (block == null) continue;

      if (block.type === 'text') {
        parts.push(block.text);
      } else if (block.type === 'tool_use') {
        const args = block.input == null ? {} : block.input;
        parts.push(`${OPEN}${JSON.stringify({ name: block.name, arguments: args })}${CLOSE}`);
      } else if (block.type === 'tool_result') {
        // JSON.stringify returns undefined for undefined / functions / symbols;
        // fall back to '' so the model never sees the literal text "undefined".
        const body = typeof block.content === 'string'
          ? block.content
          : JSON.stringify(block.content) ?? '';
        const name = block.name ? ` name="${block.name}"` : '';
        parts.push(`<tool_result${name}>${body}</tool_result>`);
      }
    }
    return { role: m.role, content: parts.join('\n') };
  });
}
|
|
72
|
+
|
|
73
|
+
// Streaming parser. Feed chunks of generated text as they arrive; receive
// a flat list of runtime events. State is internal — one parser per turn.
//
// Behaviour:
//   * Text outside <tool_call>…</tool_call> emits as text_delta. Only a
//     trailing run that is a proper prefix of the opening tag is held back,
//     so a tag split across chunks never leaks as text while ordinary text
//     (including short replies) streams immediately.
//   * A complete <tool_call>…</tool_call> block emits as tool_use. The
//     payload is parsed as JSON; malformed JSON falls back to a text
//     emission so the model's intent isn't silently dropped.
//   * flush() emits any remaining text and any partial unterminated
//     tool_call as text (rather than yielding a malformed tool_use that
//     would derail the runtime's loop).
/**
 * @param {{ idGen?: () => string }} [options] - `idGen` supplies tool_use ids;
 *   defaults to a time + random based `tu_…` id.
 * @returns {{ feed: (chunk: string) => object[], flush: () => object[] }}
 *   `feed` returns the events completed by that chunk; `flush` returns
 *   whatever is still buffered at end-of-stream.
 */
export function createToolCallParser({ idGen } = {}) {
  const id = idGen || (() => `tu_${Date.now().toString(36)}_${Math.random().toString(36).slice(2, 8)}`);
  let buf = '';
  let emitted = 0; // index up to which we've emitted (text or call body)
  let inCall = false;
  let callBodyStart = -1;

  // Emit buf[emitted, safeUpTo) as a text_delta and advance the cursor.
  function readTextUpTo(safeUpTo) {
    if (safeUpTo <= emitted) return null;
    const text = buf.slice(emitted, safeUpTo);
    emitted = safeUpTo;
    return text ? { type: 'text_delta', text } : null;
  }

  // Length of the longest un-emitted suffix of buf that could still grow
  // into OPEN once the next chunk arrives (0 when there is none).
  function pendingOpenLength() {
    const longest = Math.min(OPEN.length - 1, buf.length - emitted);
    for (let len = longest; len > 0; len -= 1) {
      if (OPEN.startsWith(buf.slice(buf.length - len))) return len;
    }
    return 0;
  }

  // Parse a call payload; returns null unless it is JSON with a string `name`.
  function parseCallBody(body) {
    try {
      const obj = JSON.parse(body.trim());
      if (obj && typeof obj.name === 'string') {
        // Accept `parameters` as an alias some models emit for `arguments`.
        return { name: obj.name, input: obj.arguments ?? obj.parameters ?? {} };
      }
    } catch {
      // Malformed JSON — the caller surfaces the raw text instead.
    }
    return null;
  }

  return {
    feed(chunk) {
      if (!chunk) return [];
      buf += chunk;
      const out = [];
      // Run until we either consume the buffer or can't make progress.
      // eslint-disable-next-line no-constant-condition
      while (true) {
        if (!inCall) {
          const openIdx = buf.indexOf(OPEN, emitted);
          if (openIdx === -1) {
            // No OPEN tag yet. Hold back only a tail that may be the start
            // of a tag straddling the next chunk; everything else is safe.
            const ev = readTextUpTo(buf.length - pendingOpenLength());
            if (ev) out.push(ev);
            break;
          }
          // Emit text before the tag
          const ev = readTextUpTo(openIdx);
          if (ev) out.push(ev);
          inCall = true;
          callBodyStart = openIdx + OPEN.length;
          emitted = callBodyStart;
        } else {
          const closeIdx = buf.indexOf(CLOSE, callBodyStart);
          if (closeIdx === -1) break; // wait for more
          const body = buf.slice(callBodyStart, closeIdx);
          const parsed = parseCallBody(body);
          if (parsed) {
            out.push({ type: 'tool_use', id: id(), name: parsed.name, input: parsed.input });
          } else {
            // Couldn't parse — surface the literal text so the user sees
            // what the model emitted instead of a silent drop.
            out.push({ type: 'text_delta', text: OPEN + body + CLOSE });
          }
          inCall = false;
          callBodyStart = -1;
          emitted = closeIdx + CLOSE.length;
        }
      }
      return out;
    },

    flush() {
      const out = [];
      if (inCall) {
        // Unterminated <tool_call> at end-of-stream. Surface as text so
        // partial content isn't lost; the runtime won't try to dispatch.
        out.push({ type: 'text_delta', text: buf.slice(emitted - OPEN.length) });
        emitted = buf.length;
        inCall = false;
        callBodyStart = -1;
      } else if (emitted < buf.length) {
        out.push({ type: 'text_delta', text: buf.slice(emitted) });
        emitted = buf.length;
      }
      return out;
    },
  };
}
|
package/providers/chrome.esm.js
CHANGED
|
@@ -1,25 +1,52 @@
|
|
|
1
1
|
// Chrome's built-in Prompt API (on-device Gemini Nano / Gemma-derived).
|
|
2
|
-
//
|
|
3
|
-
//
|
|
4
|
-
// already has the weights
|
|
5
|
-
// range (well above what transformers.js practically pulls in-browser).
|
|
2
|
+
// Runtime-compatible provider — slots into createRuntime as a peer of
|
|
3
|
+
// anthropic/openai/local. Zero download for users on Chrome that
|
|
4
|
+
// already has the weights; reply quality lands ~2B-effective-param.
|
|
6
5
|
//
|
|
7
6
|
// Usage:
|
|
8
7
|
// import { createRuntime } from '@nevescloud/pip/runtime.esm.js';
|
|
9
8
|
// import { chrome } from '@nevescloud/pip/providers/chrome.esm.js';
|
|
10
9
|
//
|
|
11
|
-
// const rt = createRuntime({
|
|
10
|
+
// const rt = createRuntime({
|
|
11
|
+
// provider: chrome({ temperature: 0.1 }),
|
|
12
|
+
// tools: [ { name: 'get_time', description: '…', schema: {…}, handler: …} ],
|
|
13
|
+
// });
|
|
12
14
|
//
|
|
13
|
-
//
|
|
14
|
-
// `window.ai.languageModel` (earlier flag-gated builds) to the top-level
|
|
15
|
-
// `LanguageModel` constructor as the Prompt API spec settled. We try
|
|
16
|
-
// the newer surface first and fall back. Chrome 138+ ships the origin
|
|
17
|
-
// trial; ~Chrome 148+ runs without a flag for many origins. Non-Chrome
|
|
18
|
-
// browsers throw a friendly error on first invocation.
|
|
15
|
+
// Optimizations (per developer.chrome.com/docs/ai/session-management):
|
|
19
16
|
//
|
|
20
|
-
//
|
|
21
|
-
//
|
|
22
|
-
//
|
|
17
|
+
// * Session cache. First turn calls LanguageModel.create() with the
|
|
18
|
+
// full history as initialPrompts; subsequent turns reuse the same
|
|
19
|
+
// session and only feed the newest turn via session.append(), then
|
|
20
|
+
// prompt with the latest user message. Avoids the create() cost on
|
|
21
|
+
// every chat exchange.
|
|
22
|
+
// * Cache invalidation. If runtime's `messages` shrinks (clear /
|
|
23
|
+
// regenerate / model swap-and-back), the system prompt changes, or
|
|
24
|
+
// the tool schema changes, we destroy() and rebuild — keeps session
|
|
25
|
+
// ↔ runtime in lockstep.
|
|
26
|
+
// * downloadprogress wired to pip's loading bar via the create()
|
|
27
|
+
// monitor option, mounted on the active turnEl. First load only.
|
|
28
|
+
// * AbortSignal flows through create() AND promptStreaming(), so a
|
|
29
|
+
// stop click cancels both model fetch and an in-flight prompt.
|
|
30
|
+
// * temperature/topK are origin-trial / Extensions only. If params()
|
|
31
|
+
// is missing (stable web), we omit both and warn once if the host
|
|
32
|
+
// passed either. If only one is set, we fill the other from
|
|
33
|
+
// LanguageModel.params() to satisfy the API's both-or-neither rule.
|
|
34
|
+
//
|
|
35
|
+
// Tool use (prompt-based — the Prompt API has no native tool channel):
|
|
36
|
+
//
|
|
37
|
+
// * When the runtime passes `tools`, the helper in _tool-prompt.esm.js
|
|
38
|
+
// injects JSON schemas + a response-format instruction into the
|
|
39
|
+
// system prompt and parses the model's stream for
|
|
40
|
+
// <tool_call>{"name":"…","arguments":{…}}</tool_call> blocks.
|
|
41
|
+
// * Detected calls yield as tool_use events with stopReason='tool_use';
|
|
42
|
+
// the runtime dispatches, appends tool_result turns, and re-invokes
|
|
43
|
+
// this provider. The cached session continues from where it left off.
|
|
44
|
+
// * Reliability scales with model size — Nano (~2B effective) handles
|
|
45
|
+
// simple single-tool calls; chained / nested calls are flaky. For
|
|
46
|
+
// hard guarantees, register the action as a slash command instead.
|
|
47
|
+
|
|
48
|
+
import { showLoading, hideLoading } from '../pip-core.esm.js';
|
|
49
|
+
import { buildToolSystemPrompt, flattenMessages, createToolCallParser } from './_tool-prompt.esm.js';
|
|
23
50
|
|
|
24
51
|
const UNAVAILABLE =
|
|
25
52
|
"Chrome's built-in AI isn't available here. Use Chrome 138+ or enable " +
|
|
@@ -32,80 +59,253 @@ function getApi() {
|
|
|
32
59
|
return null;
|
|
33
60
|
}
|
|
34
61
|
|
|
35
|
-
async function ensureAvailable(LM) {
|
|
36
|
-
//
|
|
37
|
-
//
|
|
62
|
+
/**
 * Probe the Prompt API's availability state for diagnostics.
 *
 * Availability is a *hint*, not a gate. Chrome 148+ has been observed
 * returning 'unavailable' from availability() for the default config
 * (no expectedOutputs/Inputs specified) while create() with the same
 * opts succeeds — the availability check is stricter than create's
 * own. We only hard-gate when LM itself is missing (handled upstream);
 * here we just surface state and wait out 'downloading' (which create()
 * can't tolerate on some builds). Consistent with that, a probe that
 * throws — on the first call, while polling, or via the legacy
 * capabilities() surface — is logged and reported as 'unknown' rather
 * than propagated.
 *
 * @param {object} LM - The LanguageModel API object.
 * @param {object} [availabilityOpts] - Options forwarded to LM.availability().
 * @param {{ pollIntervalMs?: number, timeoutMs?: number }} [timing] - How
 *   often to re-poll while the model downloads, and how long before giving up.
 * @returns {Promise<string>} The reported state, 'unknown' when the probe
 *   failed, or 'available' when LM exposes no probe at all.
 * @throws {Error} When the model is still downloading after `timeoutMs`.
 */
async function ensureAvailable(
  LM,
  availabilityOpts,
  { pollIntervalMs = 1500, timeoutMs = 5 * 60 * 1000 } = {},
) {
  if (typeof LM.availability === 'function') {
    let v;
    try {
      v = await LM.availability(availabilityOpts || {});
    } catch (e) {
      // eslint-disable-next-line no-console
      console.warn('[pip/chrome] availability() threw — proceeding to create() to get the real error:', e?.message || e);
      return 'unknown';
    }
    if (v === 'unavailable') {
      // eslint-disable-next-line no-console
      console.warn('[pip/chrome] availability() returned "unavailable" with opts', availabilityOpts, '— proceeding to create() anyway; if Chrome rejects, the real reason will surface.');
      return v;
    }
    if (v === 'downloading') {
      const deadline = Date.now() + timeoutMs;
      while (Date.now() < deadline) {
        await new Promise((r) => setTimeout(r, pollIntervalMs));
        let next;
        try {
          next = await LM.availability(availabilityOpts || {});
        } catch (e) {
          // Same policy as the first probe: a failing hint must not block create().
          // eslint-disable-next-line no-console
          console.warn('[pip/chrome] availability() threw mid-download — proceeding to create() to get the real error:', e?.message || e);
          return 'unknown';
        }
        if (next === 'available' || next === 'downloadable') return next;
        if (next === 'unavailable') {
          // eslint-disable-next-line no-console
          console.warn('[pip/chrome] availability flipped to "unavailable" mid-download — proceeding to create() to surface the real error.');
          return next;
        }
      }
      throw new Error("Chrome's built-in model is still downloading — try again in a minute.");
    }
    return v;
  }
  if (typeof LM.capabilities === 'function') {
    let c;
    try {
      c = await LM.capabilities();
    } catch (e) {
      // eslint-disable-next-line no-console
      console.warn('[pip/chrome] capabilities() threw — proceeding to create() to get the real error:', e?.message || e);
      return 'unknown';
    }
    if (c?.available === 'no') {
      // eslint-disable-next-line no-console
      console.warn('[pip/chrome] capabilities() reported "no" — proceeding to create() to surface the real error.');
    }
    return c?.available || 'unknown';
  }
  return 'available';
}
|
|
48
109
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
110
|
+
/**
 * Opaque string identifying a tool list, compared by equality to decide
 * whether a cached session must be rebuilt.
 *
 * Covers each tool's name, description, and schema, and is JSON-encoded so
 * that delimiter characters inside a name or description cannot make two
 * different tool lists produce the same fingerprint.
 *
 * @param {Array<object>} [tools]
 * @returns {string} '' for an empty or missing list.
 */
function toolsFingerprint(tools) {
  if (!tools?.length) return '';
  return JSON.stringify(
    tools.map((t) => [t.name, t.description || '', t.schema || t.input_schema || null]),
  );
}
|
|
114
|
+
|
|
115
|
+
/**
 * Create a runtime-compatible provider backed by Chrome's built-in Prompt API.
 *
 * The returned function is invoked once per turn with
 * `{ messages, signal, system, tools, turnEl }` and returns an async
 * generator of runtime events (`text_delta`, `tool_use`, then `turn_end`).
 * A single LanguageModel session is cached across turns and rebuilt when
 * the system prompt or tool set changes, or when the incoming history is
 * not a continuation of what the session has already seen.
 *
 * @param {object} [options]
 * @param {string} [options.systemPrompt] - Default system prompt; a per-call
 *   `system` takes precedence.
 * @param {number} [options.temperature] - Sampling temperature (only applied
 *   when LM.params exists).
 * @param {number} [options.topK] - Top-K (only applied when LM.params exists).
 * @param {Array} [options.expectedInputs] - Forwarded to availability()/create().
 * @param {Array} [options.expectedOutputs] - Forwarded to availability()/create().
 * @returns {(call: object) => AsyncGenerator<object>} The provider function.
 */
export function chrome({
  systemPrompt,
  temperature,
  topK,
  expectedInputs,
  expectedOutputs,
} = {}) {
  let sessionPromise = null;
  // Count of runtime messages the cached session has already seen,
  // INCLUDING the tail that was sent through promptStreaming().
  let consumed = 0;
  let lastSystem = null;
  let lastToolsFp = '';
  let warnedNoParams = false;

  // Assemble the options object for LM.create().
  async function buildOpts(LM, augmentedSystem, history, monitorFn, signal) {
    const opts = {};
    if (signal) opts.signal = signal;
    if (expectedInputs) opts.expectedInputs = expectedInputs;
    if (expectedOutputs) opts.expectedOutputs = expectedOutputs;
    if (monitorFn) opts.monitor = monitorFn;

    const initial = [];
    if (augmentedSystem) initial.push({ role: 'system', content: augmentedSystem });
    for (const m of history) {
      // History arrives already flattened, so assistant turns that carried
      // tool_use blocks are plain text and safe to replay. Only drop
      // anything that ended up non-string.
      if (typeof m.content === 'string') {
        initial.push({ role: m.role, content: m.content });
      }
    }
    if (initial.length) opts.initialPrompts = initial;

    // temperature/topK are origin-trial / Extensions only. params() is
    // gated to the same builds — its presence is the feature flag.
    if (temperature != null || topK != null) {
      if (typeof LM.params === 'function') {
        let t = temperature;
        let k = topK;
        // The API wants both or neither: fill the missing one from defaults.
        if ((t != null) !== (k != null)) {
          try {
            const p = await LM.params();
            if (t == null) t = p?.defaultTemperature;
            if (k == null) k = p?.defaultTopK;
          } catch {
            // Best-effort: fall through to the hard-coded defaults below.
          }
        }
        if (t == null) t = 1.0;
        if (k == null) k = 40;
        opts.temperature = t;
        opts.topK = k;
      } else if (!warnedNoParams) {
        warnedNoParams = true;
        // eslint-disable-next-line no-console
        console.warn(
          '[pip/chrome] temperature/topK ignored — only supported on ' +
          'Prompt API for Chrome Extensions or with the Origin Trial enabled.'
        );
      }
    }
    return opts;
  }

  // Return the cached session, first creating it or catching it up with
  // any messages that arrived since the previous turn.
  async function getSession(LM, augmentedSystem, toolsFp, flatMessages, turnEl, signal) {
    // A genuine continuation always carries at least one message beyond what
    // the session has seen. Anything else (clear, regenerate, shrink) means
    // the session no longer mirrors the runtime and must be rebuilt.
    const needsReset =
      !sessionPromise ||
      augmentedSystem !== lastSystem ||
      toolsFp !== lastToolsFp ||
      flatMessages.length <= consumed;

    if (needsReset && sessionPromise) {
      try {
        (await sessionPromise).destroy?.();
      } catch {
        // Best-effort teardown; the session is being discarded regardless.
      }
      sessionPromise = null;
      consumed = 0;
    }

    if (!sessionPromise) {
      const history = flatMessages.slice(0, -1);
      const monitorFn = turnEl
        ? (m) => {
            try {
              m.addEventListener?.('downloadprogress', (e) => {
                const pct = Math.round((e.loaded || 0) * 100);
                showLoading(turnEl, `downloading model ${pct}%`, pct);
              });
            } catch {
              // Progress display is cosmetic; never let it break create().
            }
          }
        : undefined;
      const opts = await buildOpts(LM, augmentedSystem, history, monitorFn, signal);
      // Some Chrome builds reject role 'system' in initialPrompts and return
      // a generic "unable to create a session" error. Fall back to folding
      // the system content into the first user message — same effect.
      const tryCreate = async () => {
        try {
          return await LM.create(opts);
        } catch (err) {
          const msg = String(err?.message || err || '');
          if (augmentedSystem && /unable to create|initialPrompt|role/i.test(msg)) {
            // eslint-disable-next-line no-console
            console.warn('[pip/chrome] create() rejected first-pass opts; retrying with system folded into user prompt:', msg);
            const fallback = { ...opts };
            const prompts = [];
            // Fold system + history → tagged first user message.
            const folded = `<<SYSTEM>>\n${augmentedSystem}\n<<END_SYSTEM>>`;
            prompts.push({ role: 'user', content: folded });
            for (const m of history) {
              if (typeof m.content === 'string') prompts.push({ role: m.role, content: m.content });
            }
            fallback.initialPrompts = prompts;
            return await LM.create(fallback);
          }
          throw err;
        }
      };
      sessionPromise = tryCreate().then((s) => {
        if (turnEl) hideLoading(turnEl);
        s.addEventListener?.('contextoverflow', () => {
          // eslint-disable-next-line no-console
          console.warn('[pip/chrome] context window full — older turns evicted by Chrome.');
        });
        return s;
      }).catch((err) => {
        if (turnEl) hideLoading(turnEl);
        sessionPromise = null;
        // eslint-disable-next-line no-console
        console.warn('[pip/chrome] LM.create() failed:', err?.message || err, '— augmentedSystem chars:', augmentedSystem.length, 'history msgs:', history.length);
        throw err;
      });
      lastSystem = augmentedSystem;
      lastToolsFp = toolsFp;
    } else {
      // Cached session — append only messages the session has not seen.
      // The previous tail is already in context (it was the prompt), so the
      // slice starts after it. Assistant turns are skipped: the session
      // generated those itself and re-feeding would duplicate them.
      const session = await sessionPromise;
      const newSlice = flatMessages.slice(consumed, flatMessages.length - 1);
      for (const m of newSlice) {
        if (m.role === 'assistant') continue;
        if (typeof m.content === 'string' && typeof session.append === 'function') {
          try {
            await session.append([{ role: m.role, content: m.content }]);
          } catch (err) {
            // Keep going, but make the lost context visible.
            // eslint-disable-next-line no-console
            console.warn('[pip/chrome] session.append() failed — the model will not see this turn:', err?.message || err);
          }
        }
      }
    }
    // Everything up to and including the tail is in the session once the
    // caller prompts with it; a failed prompt resets this bookkeeping.
    consumed = flatMessages.length;
    return sessionPromise;
  }

  return ({ messages, signal, system, tools, turnEl }) => (async function* () {
    const LM = getApi();
    if (!LM) throw new Error(UNAVAILABLE);

    const effectiveSystem = system || systemPrompt || '';
    const augmentedSystem = buildToolSystemPrompt(effectiveSystem, tools);
    const toolsFp = toolsFingerprint(tools);

    const availabilityOpts = {};
    if (expectedInputs) availabilityOpts.expectedInputs = expectedInputs;
    if (expectedOutputs) availabilityOpts.expectedOutputs = expectedOutputs;
    await ensureAvailable(LM, availabilityOpts);

    // Flatten structured messages (runtime's tool_use / tool_result blocks)
    // into the text channel the Prompt API speaks.
    const flat = flattenMessages(messages);
    const session = await getSession(LM, augmentedSystem, toolsFp, flat, turnEl, signal);
    const tail = flat[flat.length - 1];
    const userText = tail?.content || '';

    const parser = createToolCallParser();
    let sawToolUse = false;

    try {
      const stream = session.promptStreaming(userText, signal ? { signal } : undefined);
      for await (const chunk of stream) {
        if (signal?.aborted) throw new DOMException('Aborted', 'AbortError');
        if (typeof chunk !== 'string' || !chunk) continue;
        for (const ev of parser.feed(chunk)) {
          if (ev.type === 'tool_use') sawToolUse = true;
          yield ev;
        }
      }
      // Flush any pending text the parser buffered (final delta or an
      // unterminated tool_call surfaced as literal text).
      for (const ev of parser.flush()) {
        if (ev.type === 'tool_use') sawToolUse = true;
        yield ev;
      }
      yield { type: 'turn_end', stopReason: sawToolUse ? 'tool_use' : 'end_turn' };
    } catch (err) {
      // The session may hold a half-finished turn; drop it so the next call
      // rebuilds from the runtime's history.
      try {
        session?.destroy?.();
      } catch {
        // Best-effort teardown.
      }
      sessionPromise = null;
      consumed = 0;
      lastSystem = null;
      lastToolsFp = '';
      throw err;
    }
  })();
}
|
package/providers/local.esm.js
CHANGED
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
// <think> pill rendering); the provider just adapts the call shape.
|
|
17
17
|
|
|
18
18
|
import { showLoading, hideLoading } from '../pip-core.esm.js';
|
|
19
|
+
import { buildToolSystemPrompt, flattenMessages, createToolCallParser } from './_tool-prompt.esm.js';
|
|
19
20
|
|
|
20
21
|
const TRANSFORMERS_URL = 'https://cdn.jsdelivr.net/npm/@huggingface/transformers';
|
|
21
22
|
|
|
@@ -265,19 +266,50 @@ export function createTransformersRenderer() {
|
|
|
265
266
|
|
|
266
267
|
// Runtime-compatible provider. The renderer streams reply text via the
|
|
267
268
|
// setReplyText callback (cumulative buffer per call); we proxy that
|
|
268
|
-
// callback, diff each call into
|
|
269
|
-
// the runtime's turn loop
|
|
270
|
-
// turnEl directly inside the renderer —
|
|
269
|
+
// callback, diff each call into the tool-aware streaming parser, and
|
|
270
|
+
// yield text_delta + tool_use events the runtime's turn loop consumes.
|
|
271
|
+
// The <think> pill mounts onto turnEl directly inside the renderer —
|
|
272
|
+
// unaffected, still works.
|
|
273
|
+
//
|
|
274
|
+
// Tool calling. When the runtime passes `tools`, we augment the system
|
|
275
|
+
// prompt with JSON schemas + the <tool_call>{…}</tool_call> emit
|
|
276
|
+
// convention (see _tool-prompt.esm.js). Models that follow it (Gemma 4
|
|
277
|
+
// is well-trained on this shape) get dispatched through the runtime's
|
|
278
|
+
// loop; tool_result turns are flattened back into text for the next
|
|
279
|
+
// model call. Gemma 4's native special-token format (<|tool_call>…) is
|
|
280
|
+
// NOT used because TextStreamer with skip_special_tokens drops the
|
|
281
|
+
// markers — the text-channel convention works regardless.
|
|
271
282
|
//
|
|
272
283
|
// One pitfall handled: the renderer occasionally re-paints the same
|
|
273
284
|
// buffer (no new tokens emitted between calls), so the diff guards
|
|
274
285
|
// against zero-length deltas. AbortSignal flows through naturally —
|
|
275
286
|
// the underlying TextStreamer throws AbortError, which we surface.
|
|
276
|
-
export function local({
|
|
287
|
+
export function local({
|
|
288
|
+
model,
|
|
289
|
+
dtype = 'q4',
|
|
290
|
+
maxTokens = 256,
|
|
291
|
+
genParams,
|
|
292
|
+
chatTemplate,
|
|
293
|
+
systemPrompt,
|
|
294
|
+
} = {}) {
|
|
277
295
|
const renderer = createTransformersRenderer();
|
|
278
296
|
if (model) renderer.setModel({ id: model, dtype, maxTokens, genParams, chatTemplate });
|
|
279
297
|
|
|
280
|
-
return ({ messages, signal, turnEl, setReplyText }) => (async function* () {
|
|
298
|
+
return ({ messages, signal, system, tools, turnEl, setReplyText }) => (async function* () {
|
|
299
|
+
const effectiveSystem = system || systemPrompt || '';
|
|
300
|
+
const augmentedSystem = buildToolSystemPrompt(effectiveSystem, tools);
|
|
301
|
+
|
|
302
|
+
// Flatten runtime's structured turns (tool_use/tool_result blocks)
|
|
303
|
+
// back into text-channel strings the chat template understands.
|
|
304
|
+
// Prepend the augmented system if any — apply_chat_template renders
|
|
305
|
+
// role:'system' into Gemma's developer/system slot natively.
|
|
306
|
+
const flat = flattenMessages(messages);
|
|
307
|
+
const renderMessages = augmentedSystem
|
|
308
|
+
? [{ role: 'system', content: augmentedSystem }, ...flat]
|
|
309
|
+
: flat;
|
|
310
|
+
|
|
311
|
+
const parser = createToolCallParser();
|
|
312
|
+
let sawToolUse = false;
|
|
281
313
|
let lastFull = '';
|
|
282
314
|
const queue = [];
|
|
283
315
|
let wake = null;
|
|
@@ -285,16 +317,25 @@ export function local({ model, dtype = 'q4', maxTokens = 256, genParams, chatTem
|
|
|
285
317
|
let error = null;
|
|
286
318
|
|
|
287
319
|
const proxySetReplyText = (_el, fullText) => {
|
|
320
|
+
if (fullText.length <= lastFull.length) return;
|
|
288
321
|
const delta = fullText.slice(lastFull.length);
|
|
289
322
|
lastFull = fullText;
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
323
|
+
for (const ev of parser.feed(delta)) {
|
|
324
|
+
if (ev.type === 'tool_use') sawToolUse = true;
|
|
325
|
+
queue.push(ev);
|
|
293
326
|
}
|
|
327
|
+
wake?.();
|
|
294
328
|
};
|
|
295
329
|
|
|
296
|
-
renderer.generate({ messages, turnEl, setReplyText: proxySetReplyText, signal })
|
|
297
|
-
.then(() => {
|
|
330
|
+
renderer.generate({ messages: renderMessages, turnEl, setReplyText: proxySetReplyText, signal })
|
|
331
|
+
.then(() => {
|
|
332
|
+
for (const ev of parser.flush()) {
|
|
333
|
+
if (ev.type === 'tool_use') sawToolUse = true;
|
|
334
|
+
queue.push(ev);
|
|
335
|
+
}
|
|
336
|
+
done = true;
|
|
337
|
+
wake?.();
|
|
338
|
+
})
|
|
298
339
|
.catch((e) => { error = e; done = true; wake?.(); });
|
|
299
340
|
|
|
300
341
|
while (true) {
|
|
@@ -304,7 +345,7 @@ export function local({ model, dtype = 'q4', maxTokens = 256, genParams, chatTem
|
|
|
304
345
|
}
|
|
305
346
|
|
|
306
347
|
if (error) throw error;
|
|
307
|
-
yield { type: 'turn_end', stopReason: 'end_turn' };
|
|
348
|
+
yield { type: 'turn_end', stopReason: sawToolUse ? 'tool_use' : 'end_turn' };
|
|
308
349
|
})();
|
|
309
350
|
}
|
|
310
351
|
|