@inbrowser/model 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
1
+ /**
2
+ * Bundled `ModelPreset`s: the Gemma 4 family plus SmolLM2, Qwen, and DeepSeek-R1-Distill presets.
3
+ *
4
+ * Adding a model is one entry here — not a new factory. Consumers
5
+ * may also author their own via `definePreset` from the package root.
6
+ *
7
+ * Capabilities are declared statically from the upstream model card;
8
+ * the runtime engine confirms them after load.
9
+ *
10
+ * Reference: https://huggingface.co/onnx-community/gemma-4-E2B-it-ONNX
11
+ */
12
+ import type { ModelPreset } from './types.js';
13
+ /**
14
+ * Gemma 4 E2B (effective ~2.3B params). ~500MB on-device download.
15
+ * Comfortable fit for modern integrated GPUs; recommended starting
16
+ * point for the POC.
17
+ */
18
+ export declare const gemma4_E2B: ModelPreset;
19
+ /**
20
+ * Gemma 4 E4B (effective ~4.5B params). ~1.5GB on-device download.
21
+ * Higher quality; needs a discrete GPU's worth of WebGPU memory.
22
+ */
23
+ export declare const gemma4_E4B: ModelPreset;
24
+ /**
25
+ * SmolLM2 360M Instruct. ~180MB on-device download at q4f16.
26
+ *
27
+ * Demo + verification preset — small enough to fit ORT-Web's WASM
28
+ * backend on headless setups (no GPU required), and well under
29
+ * WebGPU's 1 GiB `maxBufferSize` cap. Cold-loads in seconds, decodes
30
+ * a token stream end-to-end without specialized hardware.
31
+ *
32
+ * Reference: https://huggingface.co/onnx-community/SmolLM2-360M-Instruct
33
+ */
34
+ export declare const smollm2_360m: ModelPreset;
35
+ /**
36
+ * Qwen 2.5 Coder 1.5B Instruct. ~1.28 GB on-device download at q4f16.
37
+ *
38
+ * Strong on code completion and fill-in-the-middle for size. Uses the
39
+ * Qwen2 vocabulary (~152K tokens) — embedding table stays well under
40
+ * WebGPU's 1 GiB `maxBufferSize` cap, so this runs on most modern
41
+ * GPUs that Gemma 4 can't reach. Native tool calling is enabled
42
+ * (`supportsTools: true`): the chat template has tool slots and the
43
+ * engine parses `<tool_call>` envelopes via `parseToolCalls()`.
44
+ *
45
+ * **Verification status:** real-GPU only. Headless WASM verify
46
+ * fetches and loads cleanly but the first decode step fails with
47
+ * `Shape mismatch attempting to re-use buffer. {1,1,1536} != {1,40,1536}`
48
+ * — an ORT-Web buffer-reuse optimization bug at this scale. The
49
+ * WebGPU path works on any real desktop GPU.
50
+ *
51
+ * Reference: https://huggingface.co/onnx-community/Qwen2.5-Coder-1.5B-Instruct
52
+ */
53
+ export declare const qwen2_5_coder_1_5b: ModelPreset;
54
+ /**
55
+ * Qwen 3 1.7B. ~1.36 GB on-device download at q4f16.
56
+ *
57
+ * Current frontier-for-size general model. Supports a "thinking mode"
58
+ * toggle in the chat template (off by default here — flip via a
59
+ * custom `chatTemplate` override on the preset spread). Same Qwen
60
+ * vocabulary as 2.5; embedding table fits comfortably under the
61
+ * WebGPU buffer cap.
62
+ *
63
+ * **Verification status:** real-GPU only. Headless WASM verify fails
64
+ * at session creation with `ERROR_CODE: 6, ERROR_MESSAGE: std::bad_alloc`
65
+ * — V8's WASM heap (~4 GB ceiling) is too tight for this model's
66
+ * load-time scratch allocations. The WebGPU path works on any real
67
+ * desktop GPU.
68
+ *
69
+ * Reference: https://huggingface.co/onnx-community/Qwen3-1.7B-ONNX
70
+ */
71
+ export declare const qwen3_1_7b: ModelPreset;
72
+ /**
73
+ * DeepSeek-R1-Distill-Qwen 1.5B. ~1.37 GB on-device download at q4f16.
74
+ *
75
+ * R1-style reasoning distilled into a Qwen2 1.5B base. Architecture
76
+ * is `Qwen2ForCausalLM` — same code path as `qwen2_5_coder_1_5b` and
77
+ * other Qwen 2.5 family presets, no engine changes needed.
78
+ *
79
+ * `supportsThinking: true` because the model emits its reasoning
80
+ * trace inside literal `<think>…</think>` tags before the answer.
81
+ * Consumers should wrap the engine's stream with `splitThinking()`
82
+ * from `@inbrowser/model` to receive `kind: 'thinking'` events
83
+ * separated from `kind: 'token'` output, then render the two streams
84
+ * differently (e.g., collapsible thinking pane + main output pane).
85
+ *
86
+ * Has a tool-aware chat template, but the distill drops the
87
+ * tool-trained head — leaving `supportsTools: false` until a
88
+ * polyfill or fine-tune restores it.
89
+ *
90
+ * Reference: https://huggingface.co/onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX
91
+ */
92
+ export declare const deepseek_r1_qwen_1_5b: ModelPreset;
93
+ //# sourceMappingURL=presets.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"presets.d.ts","sourceRoot":"","sources":["../src/presets.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAGH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AA8B9C;;;;GAIG;AACH,eAAO,MAAM,UAAU,EAAE,WAKvB,CAAC;AAEH;;;GAGG;AACH,eAAO,MAAM,UAAU,EAAE,WAKvB,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,YAAY,EAAE,WAWzB,CAAC;AAEH;;;;;;;;;;;;;;;;;GAiBG;AACH,eAAO,MAAM,kBAAkB,EAAE,WAe/B,CAAC;AAEH;;;;;;;;;;;;;;;;GAgBG;AACH,eAAO,MAAM,UAAU,EAAE,WAevB,CAAC;AAEH;;;;;;;;;;;;;;;;;;;GAmBG;AACH,eAAO,MAAM,qBAAqB,EAAE,WAoBlC,CAAC"}
@@ -0,0 +1,191 @@
1
+ /**
2
+ * Bundled `ModelPreset`s: the Gemma 4 family plus SmolLM2, Qwen, and DeepSeek-R1-Distill presets.
3
+ *
4
+ * Adding a model is one entry here — not a new factory. Consumers
5
+ * may also author their own via `definePreset` from the package root.
6
+ *
7
+ * Capabilities are declared statically from the upstream model card;
8
+ * the runtime engine confirms them after load.
9
+ *
10
+ * Reference: https://huggingface.co/onnx-community/gemma-4-E2B-it-ONNX
11
+ */
12
+ import { definePreset } from './engine.js';
13
+ const GEMMA_4_CAPS = { // Capability set shared by both Gemma 4 presets (`gemma4_E2B`, `gemma4_E4B`).
14
+ supportsTools: false, // native tool-calling absent; polyfill lives in @inbrowser/agent
15
+ supportsVision: false,
16
+ supportsAudio: true,
17
+ contextWindow: 128_000,
18
+ // Thinking: Gemma 4 / 3n chat templates accept `enable_thinking: true`
19
+ // and the engine still passes it through, so the model reasons before
20
+ // answering. We deliberately DO NOT set `thinkingTags`, because:
21
+ //
22
+ // Observed E2B + E4B output formats are inconsistent:
23
+ // - E2B sometimes emits content without any opening marker
24
+ // - E4B emits `<|channel>thought\n` opener
25
+ // - Close marker `<channel|>` is sometimes emitted, sometimes
26
+ // replaced by literal `Output\n`, sometimes omitted entirely
27
+ // - End-of-turn `<turn|>` token sometimes leaks
28
+ //
29
+ // A text-based parser can't reliably separate thinking from answer
30
+ // with that variance. Forcing it produces worse UX than just
31
+ // showing the model's reasoning inline.
32
+ //
33
+ // Without `thinkingTags`, the engine keeps the default
34
+ // `skip_special_tokens: true`, so all structural tokens
35
+ // (`<|channel>`, `<channel|>`, `<turn|>`) are stripped automatically.
36
+ // The user sees plain-text reasoning followed by the answer in the
37
+ // main output pane. Mixed content, but clean.
38
+ supportsThinking: true,
39
+ };
40
+ /**
41
+ * Gemma 4 E2B (effective ~2.3B params; ~500MB on-device download),
42
+ * loaded as `q4f16` with the shared `GEMMA_4_CAPS` capability set.
43
+ * Comfortable fit for modern integrated GPUs; recommended starting point for the POC.
44
+ */
45
+ export const gemma4_E2B = definePreset({
46
+ model: { modelId: 'onnx-community/gemma-4-E2B-it-ONNX' },
47
+ dtype: 'q4f16',
48
+ backend: 'auto',
49
+ capabilities: GEMMA_4_CAPS,
50
+ });
51
+ /**
52
+ * Gemma 4 E4B (effective ~4.5B params; ~1.5GB on-device download), loaded as `q4f16`
53
+ * with the shared `GEMMA_4_CAPS` capability set. Higher quality; needs a discrete GPU's worth of WebGPU memory.
54
+ */
55
+ export const gemma4_E4B = definePreset({
56
+ model: { modelId: 'onnx-community/gemma-4-E4B-it-ONNX' },
57
+ dtype: 'q4f16',
58
+ backend: 'auto',
59
+ capabilities: GEMMA_4_CAPS,
60
+ });
61
+ /**
62
+ * SmolLM2 360M Instruct. ~180MB on-device download at q4f16.
63
+ *
64
+ * Demo + verification preset — small enough to fit ORT-Web's WASM
65
+ * backend on headless setups (no GPU required), and well under
66
+ * WebGPU's 1 GiB `maxBufferSize` cap. Cold-loads in seconds, decodes
67
+ * a token stream end-to-end without specialized hardware.
68
+ *
69
+ * Reference: https://huggingface.co/onnx-community/SmolLM2-360M-Instruct (NOTE(review): the `modelId` below is `HuggingFaceTB/SmolLM2-360M-Instruct` — confirm which repo is intended)
70
+ */
71
+ export const smollm2_360m = definePreset({
72
+ model: { modelId: 'HuggingFaceTB/SmolLM2-360M-Instruct' },
73
+ dtype: 'q4f16',
74
+ backend: 'auto',
75
+ capabilities: {
76
+ supportsTools: false,
77
+ supportsVision: false,
78
+ supportsAudio: false,
79
+ contextWindow: 8_192,
80
+ supportsThinking: false,
81
+ },
82
+ });
83
+ /**
84
+ * Qwen 2.5 Coder 1.5B Instruct. ~1.28 GB on-device download at q4f16.
85
+ *
86
+ * Strong on code completion and fill-in-the-middle for size. Uses the
87
+ * Qwen2 vocabulary (~152K tokens) — embedding table stays well under
88
+ * WebGPU's 1 GiB `maxBufferSize` cap, so this runs on most modern
89
+ * GPUs that Gemma 4 can't reach. Native tool calling is enabled
90
+ * (`supportsTools: true`): the chat template has tool slots and the
91
+ * engine parses `<tool_call>` envelopes via `parseToolCalls()`.
92
+ *
93
+ * **Verification status:** real-GPU only. Headless WASM verify
94
+ * fetches and loads cleanly but the first decode step fails with
95
+ * `Shape mismatch attempting to re-use buffer. {1,1,1536} != {1,40,1536}`
96
+ * — an ORT-Web buffer-reuse optimization bug at this scale. The
97
+ * WebGPU path works on any real desktop GPU.
98
+ *
99
+ * Reference: https://huggingface.co/onnx-community/Qwen2.5-Coder-1.5B-Instruct
100
+ */
101
+ export const qwen2_5_coder_1_5b = definePreset({
102
+ model: { modelId: 'onnx-community/Qwen2.5-Coder-1.5B-Instruct' },
103
+ dtype: 'q4f16',
104
+ backend: 'auto',
105
+ capabilities: {
106
+ // Qwen 2.5's chat template includes tool slots and the model is
107
+ // trained to emit <tool_call>...</tool_call> envelopes. The engine
108
+ // threads `tools` into apply_chat_template and parses output with
109
+ // parseToolCalls() when this is true.
110
+ supportsTools: true,
111
+ supportsVision: false,
112
+ supportsAudio: false,
113
+ contextWindow: 32_768,
114
+ supportsThinking: false,
115
+ },
116
+ });
117
+ /**
118
+ * Qwen 3 1.7B. ~1.36 GB on-device download at q4f16.
119
+ *
120
+ * Current frontier-for-size general model. Supports a "thinking mode"
121
+ * toggle in the chat template (off by default here — flip via a
122
+ * custom `chatTemplate` override on the preset spread). Same Qwen
123
+ * vocabulary as 2.5; embedding table fits comfortably under the
124
+ * WebGPU buffer cap. Native tool calling is enabled (`supportsTools: true`).
125
+ *
126
+ * **Verification status:** real-GPU only. Headless WASM verify fails
127
+ * at session creation with `ERROR_CODE: 6, ERROR_MESSAGE: std::bad_alloc`
128
+ * — V8's WASM heap (~4 GB ceiling) is too tight for this model's
129
+ * load-time scratch allocations. The WebGPU path works on any real
130
+ * desktop GPU.
131
+ *
132
+ * Reference: https://huggingface.co/onnx-community/Qwen3-1.7B-ONNX
133
+ */
134
+ export const qwen3_1_7b = definePreset({
135
+ model: { modelId: 'onnx-community/Qwen3-1.7B-ONNX' },
136
+ dtype: 'q4f16',
137
+ backend: 'auto',
138
+ capabilities: {
139
+ // Qwen 3 family ships with first-class tool calling in its chat
140
+ // template. The engine threads `tools` through apply_chat_template
141
+ // and parses <tool_call>{...}</tool_call> envelopes from the
142
+ // output stream via parseToolCalls() when set.
143
+ supportsTools: true,
144
+ supportsVision: false,
145
+ supportsAudio: false,
146
+ contextWindow: 32_768,
147
+ supportsThinking: false,
148
+ },
149
+ });
150
+ /**
150
+ * DeepSeek-R1-Distill-Qwen 1.5B. ~1.37 GB on-device download at q4f16.
151
+ *
152
+ * R1-style reasoning distilled into a Qwen2 1.5B base. Architecture
153
+ * is `Qwen2ForCausalLM` — same code path as `qwen2_5_coder_1_5b` and
154
+ * other Qwen 2.5 family presets, no engine changes needed.
155
+ *
156
+ * `supportsThinking: true` because the model emits its reasoning
157
+ * trace inside literal `<think>…</think>` tags before the answer.
158
+ * Consumers should wrap the engine's stream with `splitThinking()`
159
+ * from `@inbrowser/model` to receive `kind: 'thinking'` events
160
+ * separated from `kind: 'token'` output, then render the two streams
161
+ * differently (e.g., collapsible thinking pane + main output pane).
162
+ *
163
+ * Has a tool-aware chat template, but the distill drops the
164
+ * tool-trained head — leaving `supportsTools: false` until a
165
+ * polyfill or fine-tune restores it.
166
+ *
167
+ * Reference: https://huggingface.co/onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX
168
+ */
169
+ export const deepseek_r1_qwen_1_5b = definePreset({
170
+ model: { modelId: 'onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX' },
171
+ dtype: 'q4f16',
172
+ backend: 'auto',
173
+ capabilities: {
174
+ supportsTools: false,
175
+ supportsVision: false,
176
+ supportsAudio: false,
177
+ contextWindow: 131_072,
178
+ supportsThinking: true,
179
+ // DeepSeek's literal-text tags — same as `splitThinking`'s default
180
+ // when no tags are passed, but declaring them on the preset lets
181
+ // consumers be model-agnostic: `splitThinking(stream, preset.capabilities.thinkingTags)`
182
+ // needs no consumer-side model-family branching. Presets that declare no
183
+ // `thinkingTags` (the Gemma 4 presets in this file) pass `undefined`, i.e. the same defaults.
184
+ thinkingTags: {
185
+ openTag: '<think>',
186
+ closeTag: '</think>',
187
+ },
188
+ },
189
+ });
191
+ //# sourceMappingURL=presets.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"presets.js","sourceRoot":"","sources":["../src/presets.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAG3C,MAAM,YAAY,GAAG;IACnB,aAAa,EAAE,KAAK,EAAE,iEAAiE;IACvF,cAAc,EAAE,KAAK;IACrB,aAAa,EAAE,IAAI;IACnB,aAAa,EAAE,OAAO;IACtB,gEAAgE;IAChE,+DAA+D;IAC/D,wDAAwD;IACxD,EAAE;IACF,sDAAsD;IACtD,6DAA6D;IAC7D,6CAA6C;IAC7C,gEAAgE;IAChE,iEAAiE;IACjE,kDAAkD;IAClD,EAAE;IACF,mEAAmE;IACnE,6DAA6D;IAC7D,wCAAwC;IACxC,EAAE;IACF,uDAAuD;IACvD,wDAAwD;IACxD,sEAAsE;IACtE,mEAAmE;IACnE,8CAA8C;IAC9C,gBAAgB,EAAE,IAAI;CACd,CAAC;AAEX;;;;GAIG;AACH,MAAM,CAAC,MAAM,UAAU,GAAgB,YAAY,CAAC;IAClD,KAAK,EAAE,EAAE,OAAO,EAAE,oCAAoC,EAAE;IACxD,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE,YAAY;CAC3B,CAAC,CAAC;AAEH;;;GAGG;AACH,MAAM,CAAC,MAAM,UAAU,GAAgB,YAAY,CAAC;IAClD,KAAK,EAAE,EAAE,OAAO,EAAE,oCAAoC,EAAE;IACxD,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE,YAAY;CAC3B,CAAC,CAAC;AAEH;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,YAAY,GAAgB,YAAY,CAAC;IACpD,KAAK,EAAE,EAAE,OAAO,EAAE,qCAAqC,EAAE;IACzD,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE;QACZ,aAAa,EAAE,KAAK;QACpB,cAAc,EAAE,KAAK;QACrB,aAAa,EAAE,KAAK;QACpB,aAAa,EAAE,KAAK;QACpB,gBAAgB,EAAE,KAAK;KACxB;CACF,CAAC,CAAC;AAEH;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAgB,YAAY,CAAC;IAC1D,KAAK,EAAE,EAAE,OAAO,EAAE,4CAA4C,EAAE;IAChE,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE;QACZ,gEAAgE;QAChE,mEAAmE;QACnE,kEAAkE;QAClE,sCAAsC;QACtC,aAAa,EAAE,IAAI;QACnB,cAAc,EAAE,KAAK;QACrB,aAAa,EAAE,KAAK;QACpB,aAAa,EAAE,MAAM;QACrB,gBAAgB,EAAE,KAAK;KACxB;CACF,CAAC,CAAC;AAEH;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,CAAC,MAAM,UAAU,GAAgB,YAAY,CAAC;IAClD,KAAK,EAAE,EAAE,OAAO,EAAE,gCAAgC,EAAE;IACpD,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE;QACZ,gEAAgE;QAChE,mEAAmE;QACnE,6DAA6D;QAC7D,+CAA+C;QAC/C,aAAa,EAAE,IAAI;QACnB,cAAc,EAAE,KAAK;QACrB,aAAa,EAAE,KAAK;QACpB,aAAa,EAAE,MAAM;QACrB,gBAAgB,EAAE,KAAK;KACxB;CACF,CAAC,CAAC;AAEH;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAAgB,YAAY,CAAC;IAC7D,KAAK,EAAE,EAAE,OAAO,EAAE,mDAAmD
,EAAE;IACvE,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE;QACZ,aAAa,EAAE,KAAK;QACpB,cAAc,EAAE,KAAK;QACrB,aAAa,EAAE,KAAK;QACpB,aAAa,EAAE,OAAO;QACtB,gBAAgB,EAAE,IAAI;QACtB,mEAAmE;QACnE,iEAAiE;QACjE,yFAAyF;QACzF,4DAA4D;QAC5D,0BAA0B;QAC1B,YAAY,EAAE;YACZ,OAAO,EAAE,SAAS;YAClB,QAAQ,EAAE,UAAU;SACrB;KACF;CACF,CAAC,CAAC"}
@@ -0,0 +1,57 @@
1
+ /**
2
+ * `splitThinking` — stream transformer that splits reasoning-tagged
3
+ * content out of a raw token stream.
4
+ *
5
+ * Reasoning models (DeepSeek R1, R1-Distill-*, some Qwen 3 thinking
6
+ * variants) emit their reasoning trace inside literal tags
7
+ * (`<think>…</think>` by default). The engine itself stays narrow —
8
+ * it just emits `token` events with whatever text the decoder
9
+ * produced. This utility wraps that stream and re-emits the same
10
+ * shape, except text inside the open/close tags is yielded as
11
+ * `kind: 'thinking'` instead of `kind: 'token'`.
12
+ *
13
+ * for await (const evt of splitThinking(engine.generate(msgs))) {
14
+ * if (evt.kind === 'thinking') showReasoning(evt.text);
15
+ * else if (evt.kind === 'token') showOutput(evt.text);
16
+ * }
17
+ *
18
+ * Tag matching is configurable (defaults to DeepSeek's
19
+ * `<think>` / `</think>`). The implementation buffers up to one
20
+ * tag-length minus one character so partial tags split across token
21
+ * boundaries (e.g., `<th` then `ink>`) resolve correctly.
22
+ *
23
+ * Pass-through behavior for non-token events: `usage` and `error`
24
+ * forward unchanged so terminal accounting is preserved.
25
+ */
26
+ import type { EngineEvent } from './types.js';
27
+ export interface ThinkingSplitOpts {
28
+ /** Tag that opens a reasoning block. Default: `<think>`. Must be non-empty unless `implicitOpen` is true, otherwise `splitThinking` throws. */
29
+ openTag?: string;
30
+ /** Tag that closes a reasoning block. Default: `</think>`. Must be non-empty, otherwise `splitThinking` throws. */
31
+ closeTag?: string;
32
+ /**
33
+ * When true, the stream is treated as starting *inside* the
34
+ * thinking channel — i.e., the opening tag is implicit. The first
35
+ * `closeTag` ends the block; subsequent text streams as `token`.
36
+ *
37
+ * Used for models where the chat template's `add_generation_prompt`
38
+ * pre-fills the opening marker, so generation begins inside
39
+ * thinking and the model only emits the close marker explicitly.
40
+ * Gemma 4 family works this way.
41
+ *
42
+ * Default: `false`.
43
+ */
44
+ implicitOpen?: boolean;
45
+ /**
46
+ * Literal substrings to strip from `token` events post-parse.
47
+ * Useful for cleaning up structural leak tokens that appear when
48
+ * the engine sets `skip_special_tokens: false` to expose channel
49
+ * markers (Gemma 4's `<turn|>` end-of-turn marker, for example).
50
+ *
51
+ * Stripping is applied AFTER mode classification — content inside
52
+ * thinking blocks is not affected. Default: `[]`.
53
+ */
54
+ stripTokens?: ReadonlyArray<string>;
55
+ }
56
+ export declare function splitThinking(source: AsyncIterable<EngineEvent>, opts?: ThinkingSplitOpts): AsyncIterable<EngineEvent>;
57
+ //# sourceMappingURL=think.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"think.d.ts","sourceRoot":"","sources":["../src/think.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAE9C,MAAM,WAAW,iBAAiB;IAChC,4DAA4D;IAC5D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,8DAA8D;IAC9D,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB;;;;;;;;;;;OAWG;IACH,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB;;;;;;;;OAQG;IACH,WAAW,CAAC,EAAE,aAAa,CAAC,MAAM,CAAC,CAAC;CACrC;AAED,wBAAuB,aAAa,CAClC,MAAM,EAAE,aAAa,CAAC,WAAW,CAAC,EAClC,IAAI,GAAE,iBAAsB,GAC3B,aAAa,CAAC,WAAW,CAAC,CAmH5B"}
package/dist/think.js ADDED
@@ -0,0 +1,138 @@
1
/**
 * `splitThinking` — stream transformer that splits reasoning-tagged
 * content out of a raw token stream.
 *
 * Reasoning models (DeepSeek R1, R1-Distill-*, some Qwen 3 thinking
 * variants) emit their reasoning trace inside literal tags
 * (`<think>…</think>` by default). The engine itself stays narrow —
 * it just emits `token` events with whatever text the decoder
 * produced. This utility wraps that stream and re-emits the same
 * shape, except text inside the open/close tags is yielded as
 * `kind: 'thinking'` instead of `kind: 'token'`.
 *
 *   for await (const evt of splitThinking(engine.generate(msgs))) {
 *     if (evt.kind === 'thinking') showReasoning(evt.text);
 *     else if (evt.kind === 'token') showOutput(evt.text);
 *   }
 *
 * Tag matching is configurable (defaults to DeepSeek's
 * `<think>` / `</think>`). Only the trailing characters that could
 * still grow into a tag or strip token are held back, so partial tags
 * split across token boundaries (e.g., `<th` then `ink>`) resolve
 * correctly while everything else is emitted as soon as it arrives.
 *
 * Strip tokens are removed as literal occurrences from `token` text
 * only; text inside a thinking block is never altered. A strip token
 * is never cut in half by the emit boundary, so it cannot leak.
 * `token` events whose text becomes empty after stripping are dropped.
 *
 * Pass-through behavior for non-token events: `usage` and `error`
 * forward unchanged, at the point they arrive, so terminal accounting
 * is preserved. NOTE(review): text still held back as a possible
 * partial tag is emitted after such an event; confirm consumers do not
 * stop reading at the first terminal event.
 *
 * @param {AsyncIterable<EngineEvent>} source Engine event stream to wrap.
 * @param {ThinkingSplitOpts} [opts] Optional settings (`openTag`,
 *   `closeTag`, `implicitOpen`, `stripTokens`). May be omitted,
 *   `undefined`, or `null`; every field has a default.
 * @yields {EngineEvent} `token` / `thinking` events plus every
 *   non-token event from `source`, unchanged.
 * @throws {Error} On first iteration, if `closeTag` is empty, or if
 *   `openTag` is empty while `implicitOpen` is not true.
 */
export async function* splitThinking(source, opts) {
    const options = opts ?? {};
    const open = options.openTag ?? '<think>';
    const close = options.closeTag ?? '</think>';
    const implicitOpen = options.implicitOpen ?? false;
    // Empty entries can never match anything; drop them up front.
    const stripTokens = (options.stripTokens ?? []).filter((t) => t.length > 0);
    if (close.length === 0) {
        throw new Error('splitThinking: closeTag must be non-empty');
    }
    if (!implicitOpen && open.length === 0) {
        throw new Error('splitThinking: openTag must be non-empty unless implicitOpen is true');
    }
    // With `implicitOpen` the stream starts inside the thinking block
    // (the chat template already emitted the opening marker); otherwise
    // start in `normal` and wait for the open tag.
    let mode = implicitOpen ? 'inside' : 'normal';
    // Text received but not yet emitted because its tail may still turn
    // into a tag or a strip token once the next chunk arrives.
    let buffer = '';
    for await (const evt of source) {
        if (evt.kind !== 'token') {
            yield evt;
            continue;
        }
        buffer += evt.text;
        // Loop because one chunk can contain several state transitions
        // (e.g. `<think>…</think>` end-to-end).
        while (buffer.length > 0) {
            if (mode === 'normal') {
                // Tag detection runs on the raw text; strip tokens are
                // only removed from what is actually emitted.
                const idx = open.length > 0 ? buffer.indexOf(open) : -1;
                if (idx === -1) {
                    const safeLen = safeTokenLength(buffer, open, stripTokens);
                    if (safeLen > 0) {
                        const text = removeStripTokens(buffer.slice(0, safeLen), stripTokens);
                        if (text.length > 0) {
                            yield { kind: 'token', text };
                        }
                        buffer = buffer.slice(safeLen);
                    }
                    break;
                }
                if (idx > 0) {
                    const text = removeStripTokens(buffer.slice(0, idx), stripTokens);
                    if (text.length > 0) {
                        yield { kind: 'token', text };
                    }
                }
                buffer = buffer.slice(idx + open.length);
                mode = 'inside';
            }
            else {
                const idx = buffer.indexOf(close);
                if (idx === -1) {
                    // Hold back only a trailing partial close tag; strip
                    // tokens do not apply to thinking text.
                    const safeLen = buffer.length - pendingPrefixLength(buffer, close);
                    if (safeLen > 0) {
                        yield { kind: 'thinking', text: buffer.slice(0, safeLen) };
                        buffer = buffer.slice(safeLen);
                    }
                    break;
                }
                if (idx > 0) {
                    yield { kind: 'thinking', text: buffer.slice(0, idx) };
                }
                buffer = buffer.slice(idx + close.length);
                mode = 'normal';
            }
        }
    }
    // Flush whatever is left. If the stream ended mid-thinking-block
    // (generation stopped before the close tag, or `implicitOpen` never
    // saw one), the remainder is reported as thinking rather than dropped.
    if (buffer.length > 0) {
        if (mode === 'inside') {
            yield { kind: 'thinking', text: buffer };
        }
        else {
            const text = removeStripTokens(buffer, stripTokens);
            if (text.length > 0) {
                yield { kind: 'token', text };
            }
        }
    }
}
/** Length of the longest suffix of `text` that is a proper prefix of `marker` (0 if none). */
function pendingPrefixLength(text, marker) {
    for (let len = Math.min(text.length, marker.length - 1); len > 0; len--) {
        if (text.endsWith(marker.slice(0, len))) {
            return len;
        }
    }
    return 0;
}
/**
 * How many leading characters of `text` can be emitted as `token` output
 * without cutting a possible open tag or any strip token in half.
 */
function safeTokenLength(text, open, stripTokens) {
    // Keep back a trailing partial open tag / partial strip token.
    let cut = text.length - pendingPrefixLength(text, open);
    for (const t of stripTokens) {
        cut = Math.min(cut, text.length - pendingPrefixLength(text, t));
    }
    // The cut must not land inside a complete strip token either, or the
    // two halves would be emitted separately and escape removal.
    let moved = true;
    while (moved && cut > 0) {
        moved = false;
        for (const t of stripTokens) {
            const at = text.indexOf(t, Math.max(0, cut - t.length + 1));
            if (at !== -1 && at < cut) {
                cut = at;
                moved = true;
            }
        }
    }
    return cut;
}
/** Remove every literal occurrence of each strip token, in the order given. */
function removeStripTokens(text, stripTokens) {
    let cleaned = text;
    for (const t of stripTokens) {
        cleaned = cleaned.split(t).join('');
    }
    return cleaned;
}
138
+ //# sourceMappingURL=think.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"think.js","sourceRoot":"","sources":["../src/think.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAkCH,MAAM,CAAC,KAAK,SAAS,CAAC,CAAC,aAAa,CAClC,MAAkC,EAClC,OAA0B,EAAE;IAE5B,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,IAAI,SAAS,CAAC;IACvC,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,IAAI,UAAU,CAAC;IAC1C,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,KAAK,CAAC;IAChD,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;IAC3C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,MAAM,IAAI,KAAK,CAAC,2CAA2C,CAAC,CAAC;IAC/D,CAAC;IACD,IAAI,CAAC,YAAY,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvC,MAAM,IAAI,KAAK,CAAC,sEAAsE,CAAC,CAAC;IAC1F,CAAC;IAED,oEAAoE;IACpE,+DAA+D;IAC/D,mEAAmE;IACnE,mEAAmE;IACnE,oBAAoB;IACpB,IAAI,IAAI,GAAwB,YAAY,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC;IACnE,qEAAqE;IACrE,mEAAmE;IACnE,wBAAwB;IACxB,IAAI,MAAM,GAAG,EAAE,CAAC;IAEhB,iEAAiE;IACjE,yDAAyD;IACzD,6DAA6D;IAC7D,0BAA0B;IAC1B,EAAE;IACF,kEAAkE;IAClE,gEAAgE;IAChE,iEAAiE;IACjE,gEAAgE;IAChE,qEAAqE;IACrE,mEAAmE;IACnE,kCAAkC;IAClC,MAAM,eAAe,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC9D,MAAM,gBAAgB,GACpB,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7E,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,eAAe,EAAE,gBAAgB,CAAC,CAAC;IAE7D,uEAAuE;IACvE,SAAS,SAAS,CAAC,IAAY;QAC7B,IAAI,OAAO,GAAG,IAAI,CAAC;QACnB,KAAK,MAAM,CAAC,IAAI,WAAW,EAAE,CAAC;YAC5B,IAAI,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;gBAAE,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC/D,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;IAC1C,CAAC;IAED,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QAC/B,IAAI,GAAG,CAAC,IAAI,KAAK,OAAO,EAAE,CAAC;YACzB,MAAM,GAAG,CAAC;YACV,SAAS;QACX,CAAC;QACD,MAAM,IAAI,GAAG,CAAC,IAAI,CAAC;QAEnB,oEAAoE;QACpE,gEAAgE;QAChE,0CAA0C;QAC1C,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,IAAI,IAAI,KAAK,QAAQ,EAAE,CAAC;gBACtB,6DAA6D;gBAC7D,0DAA0D;gBAC1D,uDAAuD;gBACvD,6D
AA6D;gBAC7D,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBACxD,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;oBACf,sDAAsD;oBACtD,0DAA0D;oBAC1D,qBAAqB;oBACrB,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,GAAG,QAAQ,CAAC;oBACzC,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;wBAChB,MAAM,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;wBAC1C,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;oBACjC,CAAC;oBACD,MAAM;gBACR,CAAC;gBACD,IAAI,GAAG,GAAG,CAAC,EAAE,CAAC;oBACZ,MAAM,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;gBACxC,CAAC;gBACD,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC;gBACzC,IAAI,GAAG,QAAQ,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,MAAM,GAAG,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;gBAClC,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;oBACf,wDAAwD;oBACxD,0DAA0D;oBAC1D,wCAAwC;oBACxC,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;oBACnD,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;wBAChB,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,EAAE,CAAC;wBAC3D,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;oBACjC,CAAC;oBACD,MAAM;gBACR,CAAC;gBACD,IAAI,GAAG,GAAG,CAAC,EAAE,CAAC;oBACZ,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;gBACzD,CAAC;gBACD,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;gBAC1C,IAAI,GAAG,QAAQ,CAAC;YAClB,CAAC;QACH,CAAC;IACH,CAAC;IAED,mEAAmE;IACnE,mEAAmE;IACnE,iEAAiE;IACjE,wBAAwB;IACxB,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,IAAI,IAAI,KAAK,QAAQ,EAAE,CAAC;YACtB,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC;QAC3C,CAAC;aAAM,CAAC;YACN,MAAM,SAAS,CAAC,MAAM,CAAC,CAAC;QAC1B,CAAC;IACH,CAAC;AACH,CAAC"}