@webmcp-auto-ui/agent 2.5.26 → 2.5.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,640 @@
/// <reference lib="webworker" />
/**
 * transformers.worker.ts — Web Worker that runs transformers.js v4 (ONNX + WebGPU).
 *
 * Protocol (see transformers.ts for the main-thread side):
 *   main → worker: { type: 'load', modelId, entry, contextSize }
 *   main → worker: { type: 'generate', requestId, options,
 *                    prompt?, chatMessages?, image? }
 *     - `prompt` is a pre-built string (Gemma wire format). Used as-is.
 *     - `chatMessages` is a [{role, content}] array that the worker feeds to
 *       tokenizer.apply_chat_template (Qwen / Mistral native chat_template).
 *     - For vision turns, `image` is attached; for Qwen/Mistral VLMs the
 *       worker applies the chat_template first, then passes the string to
 *       processor(raw, prompt) — image first, text second.
 *   main → worker: { type: 'abort', requestId }
 *   main → worker: { type: 'reset' }
 *   main → worker: { type: 'dispose' }
 *
 *   worker → main: { type: 'progress', fileProgress, totalProgress, status, loaded, total }
 *   worker → main: { type: 'ready' }
 *   worker → main: { type: 'warning', message }
 *   worker → main: { type: 'error', message, requestId? }
 *   worker → main: { type: 'token', requestId, token }
 *   worker → main: { type: 'done', requestId, content, stats, usage }
 *
 * The worker keeps one model loaded at a time. The KV cache is intra-generate
 * only: `past_key_values` are disposed before every generate() call and never
 * reused across turns (see disposePastKeyValues()).
 */
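
// Illustrative main-thread usage — a sketch only, not the shipped client in
// transformers.ts. The worker URL, `catalogEntry`, the `render` helper, and
// the literal modelId/option values are assumptions for illustration:
//
//   const worker = new Worker(
//     new URL('./transformers.worker.js', import.meta.url),
//     { type: 'module' },
//   );
//   worker.addEventListener('message', (ev: MessageEvent) => {
//     const msg = ev.data;
//     if (msg.type === 'token') render(msg.token);            // stream into the UI
//     if (msg.type === 'done') console.log(msg.stats, msg.usage);
//     if (msg.type === 'error') console.error(msg.message);
//   });
//   worker.postMessage({ type: 'load', modelId: 'qwen3.5', entry: catalogEntry, contextSize: 8192 });
//   // ...after { type: 'ready' } arrives:
//   worker.postMessage({
//     type: 'generate',
//     requestId: crypto.randomUUID(),
//     chatMessages: [{ role: 'user', content: 'Hello!' }],
//     options: { maxTokens: 256, temperature: 0.7 },
//   });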

import type { ContentBlock } from '../types.js';
import type { TransformersModelEntry } from './transformers-models.js';

// --------------------------------------------------------------------------
// Gemma 4 chat_template override.
//
// The chat_template baked into onnx-community/gemma-4-E{2,4}B-it-ONNX is the
// VLM variant shipped by HF, which iterates `message.content` as a list
// ({% for part in content %}). Our worker feeds plain strings, which trips
// the minified Jinja `for` loop inside transformers.js with the opaque error
// "C is not iterable". We replace the template with a string-safe variant
// that accepts either a string or a list of {type: 'text', text} parts.
// Mirrors the approach in Chong's TurboQuant-WASM demo
// (demo/src/draw/prompts/preamble.ts on github.com/teamchong/turboquant-wasm).
// --------------------------------------------------------------------------

const GEMMA4_CHAT_TEMPLATE = `{{- bos_token -}}
{%- for message in messages -%}
{%- set role = message['role'] -%}
{%- if role == 'assistant' -%}{%- set role = 'model' -%}{%- endif -%}
<|turn>{{ role }}
{%- if message['content'] is string %}
{{ message['content'] | trim }}
{%- else -%}
{%- for part in message['content'] -%}
{%- if part['type'] == 'text' -%}{{ part['text'] | trim }}{%- endif -%}
{%- endfor -%}
{%- endif %}
<turn|>
{%- endfor -%}
{%- if add_generation_prompt -%}
<|turn>model
{%- endif -%}`;
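
// With the override in place, both content shapes below render the same wire
// format (a sketch; `tok` stands for the loaded Gemma 4 tokenizer):
//
//   tok.apply_chat_template(
//     [{ role: 'user', content: 'Hi' }],
//     { tokenize: false, add_generation_prompt: true },
//   );
//   tok.apply_chat_template(
//     [{ role: 'user', content: [{ type: 'text', text: 'Hi' }] }],
//     { tokenize: false, add_generation_prompt: true },
//   );
//
// The stock VLM template throws "C is not iterable" on the first shape.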
62
+
63
+ // --------------------------------------------------------------------------
64
+ // Lazy imports — resolved on first 'load'. Kept as `any` because the v4 API
65
+ // surface isn't fully typed yet.
66
+ // --------------------------------------------------------------------------
67
+
68
+ let transformersMod: any = null;
69
+ let processor: any = null;
70
+ let model: any = null;
71
+ let tokenizer: any = null;
72
+ let entry: TransformersModelEntry | null = null;
73
+ let stoppingCriteria: any = null;
74
+
/**
 * past_key_values slot. Populated transiently during a generate() call and
 * ALWAYS disposed before and after (see disposePastKeyValues()). The cache is
 * intra-generate only — `use_cache: true` in generateArgs keeps attention at
 * O(n) per token inside a single generate() — but it is NEVER retained across
 * generate() calls. Cross-turn reuse was disabled in commit 98d7d57 because
 * of a SWA mask/score shape desync; the later commit 9bb7d04 (perf:
 * re-enable intra-generate KV cache) restored caching within a single
 * generate() while keeping cross-turn reuse removed.
 */
let pastKeyValues: any = null;

/** Active generation request id — set on 'generate', cleared on 'done'/'error'. */
let activeRequestId: string | null = null;

// --------------------------------------------------------------------------
// Tool-call parser — loaded lazily with a best-effort fallback stub.
// --------------------------------------------------------------------------

type ParsedToolCallBlock =
  | { type: 'text'; text: string }
  | { type: 'tool_use'; id: string; name: string; input: Record<string, unknown> };

async function parseToolCalls(
  fullText: string,
  toolFormat: string,
): Promise<ParsedToolCallBlock[]> {
  try {
    // Optional fallback import — the module is shipped (../prompts/tool-call-parsers.ts);
    // the try/catch is defensive only, guarding against bundler quirks that
    // could drop the worker-side import.
    const mod: any = await import('../prompts/tool-call-parsers.js');
    const fn = mod.parseToolCalls ?? mod.default;
    if (typeof fn === 'function') return await fn(fullText, toolFormat);
  } catch {
    // Import resolution failed — fall through to stub.
  }
  // Stub: ship the raw text as a single text block. Parsing will arrive in a
  // later agent iteration.
  return [{ type: 'text', text: fullText }];
}
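
// Shape of a successful parse, for reference — the concrete wire syntax the
// parser accepts depends on `toolFormat`, so the input below is a made-up
// Hermes-style example, not the package's actual format:
//
//   await parseToolCalls(
//     'On it. <tool_call>{"name": "list_tabs", "arguments": {}}</tool_call>',
//     'hermes',
//   );
//   // → [ { type: 'text', text: 'On it. ' },
//   //     { type: 'tool_use', id: <generated>, name: 'list_tabs', input: {} } ]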

// --------------------------------------------------------------------------
// Cache note: transformers.js manages its own cache via the Cache Storage API
// (enabled by `env.useBrowserCache = true` below). No OPFS pre-download from
// this worker — the generic OPFS helper requires an explicit file list that
// transformers.js doesn't expose. Progress is surfaced via `progress_callback`
// in `fromPretrainedOpts`.
// --------------------------------------------------------------------------

// --------------------------------------------------------------------------
// Helpers
// --------------------------------------------------------------------------

function post(msg: any, transfer?: Transferable[]): void {
  (self as unknown as Worker).postMessage(msg, transfer ?? []);
}

function disposePastKeyValues(): void {
  if (!pastKeyValues) return;
  try {
    if (typeof pastKeyValues === 'object') {
      for (const v of Object.values(pastKeyValues) as any[]) {
        try { v?.dispose?.(); } catch {}
      }
    }
  } catch {}
  pastKeyValues = null;
}

function resetAll(): void {
  disposePastKeyValues();
  try { stoppingCriteria?.reset?.(); } catch {}
}

// --------------------------------------------------------------------------
// Model loading
// --------------------------------------------------------------------------

async function loadModel(modelEntry: TransformersModelEntry): Promise<void> {
  entry = modelEntry;

  post({
    type: 'progress',
    fileProgress: 0,
    totalProgress: 0,
    status: 'importing transformers.js',
    loaded: 0,
    total: modelEntry.size,
  });

  // Dynamic import — Web Workers don't inherit the document's import-map, so
  // the bare specifier can't resolve when externalized from the worker bundle.
  // Hardcode the CDN URL (mirrors the pin in app.html). Keep /* @vite-ignore */
  // to stop Vite from pre-resolving the runtime string.
  // Version pinning per family — Mistral3 was only fully wired (name → class
  // registry) in transformers.js 3.8.1; 4.1.0 regresses that path but adds
  // Gemma4/Qwen3.5. So route each family to the version that actually works.
  // Gemma 4 is additionally pinned to 4.0.1 because 4.1.0 shipped Jinja
  // regressions that combine badly with the VLM chat_template on
  // onnx-community/gemma-4-*. 4.0.1 is the version Chong validated in
  // TurboQuant-WASM. Qwen3.5 still needs 4.1.0 (that's where its class
  // registry landed).
  const TRANSFORMERS_URL = modelEntry.family === 'mistral'
    ? 'https://esm.sh/@huggingface/transformers@3.8.1'
    : modelEntry.family === 'gemma4'
      ? 'https://esm.sh/@huggingface/transformers@4.0.1'
      : 'https://esm.sh/@huggingface/transformers@4.1.0';
  const imported: any = await import(/* @vite-ignore */ TRANSFORMERS_URL);
  // Some CDN bundles park named exports under `.default`; flatten so the
  // destructure below finds them either way.
  transformersMod = imported?.AutoTokenizer ? imported : (imported?.default ?? imported);
  const topKeys = Object.keys(transformersMod ?? {});
  post({ type: 'warning', message: `[transformers] module loaded. ${topKeys.length} top-level keys. AutoTokenizer=${typeof transformersMod?.AutoTokenizer}, AutoModelForImageTextToText=${typeof transformersMod?.AutoModelForImageTextToText}, AutoModelForCausalLM=${typeof transformersMod?.AutoModelForCausalLM}` });
  const {
    AutoProcessor,
    AutoTokenizer,
    AutoModelForCausalLM,
    InterruptableStoppingCriteria,
    env,
  } = transformersMod;
  if (!AutoTokenizer || !AutoModelForCausalLM) {
    throw new Error(`[transformers] CDN module missing core exports. Keys seen: ${topKeys.slice(0, 40).join(',')}`);
  }
198
+
199
+ // Point ONNX Runtime WASM binaries to the jsdelivr CDN so they're not bundled.
200
+ // esm.sh hosts the JS modules; the native .wasm binaries are served by jsdelivr.
201
+ try {
202
+ if (env?.backends?.onnx?.wasm && modelEntry.family !== 'mistral') {
203
+ // Only override for 4.1.0 / ORT 1.26 (we host the matching .wasm binaries
204
+ // on jsdelivr). For the 3.8.1 path, ORT 1.22.0-dev is a transformers.js-
205
+ // internal build not mirrored on jsdelivr — let transformers.js use its
206
+ // default wasmPaths (which resolve against esm.sh, matching the JS bundle).
207
+ env.backends.onnx.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.26.0-dev.20260410-5e55544225/dist/';
208
+ }
209
+ if (env) {
210
+ env.allowLocalModels = false;
211
+ env.useBrowserCache = true;
212
+ }
213
+ } catch {}
214
+
215
+ stoppingCriteria = new InterruptableStoppingCriteria();
216
+
217
+ // Aggregated progress callback — sums loaded/total across every file we see,
218
+ // emitting a monotonic aggregate ratio. Two guards eliminate flicker:
219
+ // 1. Files with total < 1_000_000 bytes are ignored (configs, tokenizers,
220
+ // chat_templates are all <100KB and would jump instantly to 100%,
221
+ // momentarily overwriting the big weight-shard progress).
222
+ // 2. We emit sum(loaded) / sum(total) — so small-file completions cannot
223
+ // make the overall ratio regress.
224
+ const fileStats = new Map<string, { loaded: number; total: number }>();
225
+ const progressCallback = (p: any) => {
226
+ if (p?.status !== 'progress' || typeof p?.file !== 'string') return;
227
+ const loaded = typeof p.loaded === 'number' ? p.loaded : 0;
228
+ const total = typeof p.total === 'number' && p.total > 0 ? p.total : 0;
229
+ if (total < 1_000_000) return; // skip tiny files
230
+ fileStats.set(p.file, { loaded, total });
231
+ let sumLoaded = 0;
232
+ let sumTotal = 0;
233
+ for (const v of fileStats.values()) { sumLoaded += v.loaded; sumTotal += v.total; }
234
+ const fp = sumTotal > 0 ? sumLoaded / sumTotal : 0;
235
+ post({
236
+ type: 'progress',
237
+ fileProgress: fp,
238
+ totalProgress: fp,
239
+ status: 'downloading',
240
+ loaded: sumLoaded,
241
+ total: sumTotal,
242
+ });
243
+ };
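
  // Worked example with hypothetical sizes: two 1.8 GB decoder shards plus a
  // 200 kB tokenizer.json. The tokenizer is dropped by the 1 MB guard; once
  // shard A finishes and shard B is halfway, the UI sees
  //   fileProgress = (1.8e9 + 0.9e9) / 3.6e9 = 0.75
  // instead of briefly flashing 100% when the tokenizer completes.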

  // from_pretrained reads through the browser cache (env.useBrowserCache) and
  // reports real download progress via the aggregated callback above; the
  // callback's 1 MB guard is what keeps each sub-ONNX (embed/decoder/vision/
  // audio) and config file from flickering the loader with its own 0→100%.
  const fromPretrainedOpts = {
    dtype: modelEntry.dtype,
    device: 'webgpu' as const,
    progress_callback: progressCallback,
  };

  // Tokenizer + processor — the processor is required for VLMs and harmless
  // otherwise. Both loads share the aggregated progressCallback; its 1 MB
  // guard means the tiny tokenizer/config files never disturb the big
  // weight-download bar.
  post({
    type: 'progress',
    fileProgress: 1,
    totalProgress: 1,
    status: 'initializing tokenizer',
    loaded: modelEntry.size,
    total: modelEntry.size,
  });
  try {
    tokenizer = await AutoTokenizer.from_pretrained(modelEntry.repo, { progress_callback: progressCallback });
  } catch (err) {
    post({ type: 'warning', message: `tokenizer load: ${String(err)}` });
    tokenizer = null;
  }

  if (modelEntry.family === 'gemma4' && tokenizer) {
    // Override the VLM chat_template baked into onnx-community/gemma-4-* with a
    // string-safe variant. The shipped template iterates message.content as a
    // list ({% for part in content %}); our serializer emits strings, which
    // triggers "C is not iterable" inside transformers.js's minified Jinja
    // runtime. Mirrors the approach Chong takes in TurboQuant-WASM
    // (demo/src/draw/prompts/preamble.ts).
    try {
      (tokenizer as any).chat_template = GEMMA4_CHAT_TEMPLATE;
    } catch { /* best-effort */ }
  }

  try {
    processor = await AutoProcessor.from_pretrained(modelEntry.repo, { progress_callback: progressCallback });
  } catch {
    // Some text-only checkpoints ship without an AutoProcessor — that's fine.
    processor = null;
  }

  // Model class — pick a specialized VLM class when the catalog hints at one,
  // otherwise fall back to AutoModelForCausalLM.
  let ModelClass: any = AutoModelForCausalLM;
  if (modelEntry.modelClass) {
    const resolved = transformersMod[modelEntry.modelClass];
    if (resolved && typeof resolved.from_pretrained === 'function') {
      ModelClass = resolved;
      post({ type: 'warning', message: `[transformers] using ModelClass=${modelEntry.modelClass}` });
    } else {
      const autoKeys = Object.keys(transformersMod).filter((k) => k.startsWith('AutoModel') || k.includes('ForConditional') || k.includes('ForImageText')).join(',');
      post({ type: 'warning', message: `[transformers] modelClass '${modelEntry.modelClass}' not found. Available Auto* keys: ${autoKeys}. Falling back to AutoModelForCausalLM.` });
    }
  }
  if (!ModelClass || typeof ModelClass.from_pretrained !== 'function') {
    throw new Error(`[transformers] No usable model class. AutoModelForCausalLM=${typeof AutoModelForCausalLM}, mod keys sample: ${Object.keys(transformersMod).slice(0, 20).join(',')}`);
  }

  post({
    type: 'progress',
    fileProgress: 1,
    totalProgress: 1,
    status: 'initializing model weights',
    loaded: modelEntry.size,
    total: modelEntry.size,
  });
  try {
    model = await ModelClass.from_pretrained(modelEntry.repo, fromPretrainedOpts);
  } catch (err) {
    // WebGPU can fail on older drivers — fall back to WASM and warn the UI.
    post({
      type: 'warning',
      message: `WebGPU unavailable, falling back to WASM: ${String(err)}`,
    });
    model = await ModelClass.from_pretrained(modelEntry.repo, {
      ...fromPretrainedOpts,
      device: 'wasm',
    });
  }

  post({ type: 'ready' });
}

// --------------------------------------------------------------------------
// Generation
// --------------------------------------------------------------------------

interface GenerateOptions {
  maxTokens?: number;
  temperature?: number;
  topK?: number;
}

async function handleGenerate(
  requestId: string,
  prompt: string | undefined,
  chatMessages: Array<{ role: string; content: string }> | undefined,
  options: GenerateOptions,
  image?: Uint8Array,
): Promise<void> {
  if (!model || !entry) {
    post({ type: 'error', requestId, message: 'model not loaded' });
    return;
  }

  activeRequestId = requestId;

  const { TextStreamer, RawImage } = transformersMod;

  // If the main thread sent chatMessages, apply the tokenizer's native
  // chat_template (Jinja) now. This is how Qwen3 and Mistral produce correctly
  // tagged prompts (<|im_start|>user … / [INST] …). Falling back to the raw
  // string lets the Gemma path (custom wire format) keep working unchanged.
  let effectivePrompt: string | undefined;
  if (chatMessages && entry.family === 'mistral' && processor && typeof processor.apply_chat_template === 'function') {
    try {
      effectivePrompt = processor.apply_chat_template(chatMessages);
    } catch (err) {
      post({ type: 'warning', message: `processor.apply_chat_template failed, falling back to tokenizer: ${String(err)}` });
      // fall through to tokenizer branch below
    }
  }
  if (!effectivePrompt && chatMessages && tokenizer && typeof tokenizer.apply_chat_template === 'function') {
    try {
      effectivePrompt = tokenizer.apply_chat_template(chatMessages, {
        tokenize: false,
        add_generation_prompt: true,
      });
    } catch (err) {
      post({ type: 'warning', message: `apply_chat_template failed on string content, retrying with structured parts: ${String(err)}` });
      try {
        const structured = chatMessages.map(m => ({
          role: m.role,
          content: [{ type: 'text', text: m.content }],
        }));
        effectivePrompt = tokenizer.apply_chat_template(structured, {
          tokenize: false,
          add_generation_prompt: true,
        });
      } catch (err2) {
        post({ type: 'warning', message: `apply_chat_template failed twice, falling back to raw concat: ${String(err2)}` });
        effectivePrompt = chatMessages.map(m => `${m.role}: ${m.content}`).join('\n\n');
      }
    }
  }
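
  // For reference, the Qwen path yields standard ChatML — a single user turn
  // renders roughly as (sketch):
  //   '<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n'
  // while the Mistral tokenizer produces the [INST] … [/INST] framing.
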
  if (!effectivePrompt && typeof prompt === 'string') {
    effectivePrompt = prompt;
  }
  if (!effectivePrompt) {
    post({ type: 'error', requestId, message: 'generate requires either prompt or chatMessages' });
    activeRequestId = null;
    return;
  }

  const t0 = performance.now();
  let tokenCount = 0;
  let fullText = '';

  const streamerTokenizer = entry.family === 'mistral' ? (processor?.tokenizer ?? tokenizer) : tokenizer;
  const streamer = new TextStreamer(streamerTokenizer, {
    skip_prompt: true,
    skip_special_tokens: entry.family === 'mistral',
    token_callback_function: () => {
      tokenCount += 1;
    },
    callback_function: (token: string) => {
      fullText += token;
      post({ type: 'token', requestId, token });
    },
  });

  // Build model inputs — the VLM path goes through processor(image, prompt),
  // the text path through tokenizer(prompt).
  // KV reuse is disabled: the agent loop rebuilds the full prompt each turn,
  // so reusing past_key_values double-prefixes and triggers mask/score shape
  // mismatches (Where node broadcast error on dim 3).
  disposePastKeyValues();
  let inputs: any;
  let isVisionTurn = false;
  try {
    if (image && processor && entry.vision) {
      isVisionTurn = true;
      const blob = new Blob([image]);
      const raw: any = await RawImage.read(blob);
      // Mistral/Pixtral: let image_processor drive sizing (longest_edge), do NOT force 448×448.
      if (entry.family === 'mistral' && processor.image_processor) {
        try { processor.image_processor.size = { longest_edge: 480 }; } catch {}
      } else {
        try { raw.resize?.(448, 448); } catch {}
      }
      // processor(images, text, opts) — ARG ORDER MATTERS for the Pixtral processor.
      inputs = await processor(raw, effectivePrompt, { add_special_tokens: false });
    } else if (tokenizer) {
      // Text-only turn — always go through the tokenizer, even on a VLM.
      // VLM processors (Qwen3.5, Mistral3, Gemma4) expect messages-with-content-
      // blocks rather than a plain prompt string, so calling processor(prompt)
      // throws "X is not iterable" on the template path.
      inputs = await tokenizer(effectivePrompt, { return_tensors: 'pt' });
    } else {
      post({ type: 'error', requestId, message: 'no tokenizer/processor available' });
      activeRequestId = null;
      return;
    }
  } catch (err) {
    post({ type: 'error', requestId, message: `input preparation failed: ${String(err)}` });
    activeRequestId = null;
    return;
  }

  const generateArgs: any = {
    ...inputs,
    max_new_tokens: options.maxTokens ?? 2048,
    do_sample: true,
    return_dict_in_generate: true,
    // Keep the intra-generate KV cache (O(n) per token vs O(n²)). The SWA
    // desync bug only manifested when past_key_values were reused ACROSS
    // generate() calls, which we now prevent via disposePastKeyValues()
    // before each call.
    use_cache: true,
    // Sampling defaults — without these transformers.js degenerates into
    // single-token loops ("Salut! Salut! Salut!...") on Qwen3 especially.
    temperature: typeof options.temperature === 'number' ? options.temperature : 0.7,
    top_p: 0.9,
    top_k: typeof options.topK === 'number' ? options.topK : 50,
    repetition_penalty: 1.1,
    streamer,
    stopping_criteria: stoppingCriteria,
  };
  // past_key_values deliberately never reused (see comment above).

  if (entry.family === 'mistral') {
    generateArgs.do_sample = false;
    generateArgs.repetition_penalty = 1.2;
    delete generateArgs.temperature;
    delete generateArgs.top_p;
    delete generateArgs.top_k;
    delete generateArgs.return_dict_in_generate;
    delete generateArgs.stopping_criteria;
  }
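
  // Net effect for the mistral family: plain greedy decoding with
  //   { max_new_tokens, do_sample: false, use_cache: true,
  //     repetition_penalty: 1.2, streamer }
  // — no sampling knobs, no return_dict_in_generate, and (since
  // stopping_criteria is removed) no mid-stream 'abort' for Mistral runs.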

  let result: any;
  try {
    result = await model.generate(generateArgs);
  } catch (err) {
    const msg = String(err);
    // Abort / stopping-criteria interrupt → deliver what we have so far.
    if (!msg.includes('interrupt') && !msg.includes('stopping')) {
      post({ type: 'error', requestId, message: msg });
      activeRequestId = null;
      return;
    }
  }

  // KV cache intentionally not retained — the agent loop re-sends the full
  // prompt each turn, so a stale cache would double-prefix and break shapes.
  try {
    if (result?.past_key_values && typeof result.past_key_values === 'object') {
      for (const v of Object.values(result.past_key_values) as any[]) {
        try { v?.dispose?.(); } catch {}
      }
    }
  } catch {}

  // Parse thinking: everything before </think> is split off into `thinking`
  // (delivered on the 'done' message below); the rest becomes normal content.
  let thinking: string | undefined;
  let visible = fullText;
  const thinkEnd = fullText.indexOf('</think>');
  if (thinkEnd !== -1) {
    thinking = fullText.slice(0, thinkEnd).replace(/^<think>/, '').trim();
    visible = fullText.slice(thinkEnd + '</think>'.length).trim();
  }
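
  // e.g. fullText = '<think>check the tab list first</think>Here are your tabs.'
  //   → thinking = 'check the tab list first', visible = 'Here are your tabs.'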

  // Tool-call parsing (format-aware).
  const parsed = await parseToolCalls(visible, entry.toolFormat);
  const content: ContentBlock[] = [];
  for (const block of parsed) {
    // `thinking` is NOT embedded in the text blocks — it is delivered as a
    // top-level field on the 'done' message below, so the existing
    // ContentBlock surface stays unchanged.
    content.push(block.type === 'text' ? { type: 'text', text: block.text } : block);
  }
  if (content.length === 0) {
    content.push({ type: 'text', text: visible });
  }

  const latencyMs = performance.now() - t0;
  const tokensPerSec = tokenCount > 0 ? tokenCount / (latencyMs / 1000) : 0;

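  // input_ids.dims on a transformers.js Tensor is [batch, seqLen], so the last
  // entry is the prompt length; `size` (total element count) is the scalar
  // fallback when dims is unavailable.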
  let inputTokens = 0;
  try {
    const ids = (inputs?.input_ids?.dims ?? inputs?.input_ids?.size ?? 0);
    inputTokens = Array.isArray(ids) ? ids[ids.length - 1] : Number(ids) || 0;
  } catch {}

  post({
    type: 'done',
    requestId,
    content,
    stats: {
      tokensPerSec,
      totalTokens: tokenCount,
      latencyMs,
    },
    usage: {
      input_tokens: inputTokens,
      output_tokens: tokenCount,
    },
    // Thinking is exposed for observers that postMessage-proxy the worker.
    thinking,
  });

  activeRequestId = null;
}

// --------------------------------------------------------------------------
// Message dispatch
// --------------------------------------------------------------------------

self.addEventListener('message', async (ev: MessageEvent) => {
  const msg = ev.data;
  if (!msg || typeof msg !== 'object') return;

  try {
    switch (msg.type) {
      case 'load': {
        if (!msg.entry) {
          post({ type: 'error', message: 'missing entry in load message' });
          return;
        }
        await loadModel(msg.entry as TransformersModelEntry);
        return;
      }
      case 'generate': {
        const requestId: string = msg.requestId;
        const prompt: string | undefined = typeof msg.prompt === 'string' ? msg.prompt : undefined;
        const chatMessages: Array<{ role: string; content: string }> | undefined =
          Array.isArray(msg.chatMessages) ? msg.chatMessages : undefined;
        const options: GenerateOptions = msg.options ?? {};
        const image: Uint8Array | undefined = msg.image instanceof Uint8Array ? msg.image : undefined;
        await handleGenerate(requestId, prompt, chatMessages, options, image);
        return;
      }
      case 'abort': {
        // Shared stopping criteria across requests — any abort interrupts the
        // current generation, even when msg.requestId names a different
        // request; the main side filters by id. The pending main-thread
        // promise resolves via the 'done' or 'error' path depending on how
        // generate() unwinds.
        try { stoppingCriteria?.interrupt?.(); } catch {}
        return;
      }
      case 'reset': {
        resetAll();
        return;
      }
      case 'dispose': {
        resetAll();
        try { model?.dispose?.(); } catch {}
        model = null;
        tokenizer = null;
        processor = null;
        entry = null;
        transformersMod = null;
        return;
      }
      default:
        return;
    }
  } catch (err) {
    const requestId = msg?.requestId;
    post({ type: 'error', requestId, message: String(err) });
    activeRequestId = null;
  }
});

export {}; // Mark this file as a module for TS.