npm - @inbrowser/model - Versions diffs - 0.1.0 - Mend

@inbrowser/model 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

package/AGENTS.md +50 -0
package/LICENSE +21 -0
package/README.md +63 -0
package/dist/adapters/agent.d.ts +19 -0
package/dist/adapters/agent.d.ts.map +1 -0
package/dist/adapters/agent.js +96 -0
package/dist/adapters/agent.js.map +1 -0
package/dist/adapters/relay.d.ts +17 -0
package/dist/adapters/relay.d.ts.map +1 -0
package/dist/adapters/relay.js +90 -0
package/dist/adapters/relay.js.map +1 -0
package/dist/engine.d.ts +35 -0
package/dist/engine.d.ts.map +1 -0
package/dist/engine.js +353 -0
package/dist/engine.js.map +1 -0
package/dist/index.d.ts +21 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +20 -0
package/dist/index.js.map +1 -0
package/dist/parse-tool-calls.d.ts +49 -0
package/dist/parse-tool-calls.d.ts.map +1 -0
package/dist/parse-tool-calls.js +115 -0
package/dist/parse-tool-calls.js.map +1 -0
package/dist/presets.d.ts +93 -0
package/dist/presets.d.ts.map +1 -0
package/dist/presets.js +191 -0
package/dist/presets.js.map +1 -0
package/dist/think.d.ts +57 -0
package/dist/think.d.ts.map +1 -0
package/dist/think.js +138 -0
package/dist/think.js.map +1 -0
package/dist/types.d.ts +291 -0
package/dist/types.d.ts.map +1 -0
package/dist/types.js +17 -0
package/dist/types.js.map +1 -0
package/dist/worker.d.ts +62 -0
package/dist/worker.d.ts.map +1 -0
package/dist/worker.js +493 -0
package/dist/worker.js.map +1 -0
package/package.json +65 -0

package/dist/presets.d.ts ADDED Viewed

@@ -0,0 +1,93 @@
+/**
+ * Bundled `ModelPreset`s for the Gemma 4, SmolLM2, Qwen 2.5 Coder,
+ * Qwen 3, and DeepSeek-R1-Distill model families.
+ *
+ * Adding a model is one entry here — not a new factory. Consumers
+ * may also author their own via `definePreset` from the package root.
+ *
+ * Capabilities are declared statically from the upstream model card;
+ * the runtime engine confirms them after load.
+ *
+ * Reference: https://huggingface.co/onnx-community/gemma-4-E2B-it-ONNX
+ */
+import type { ModelPreset } from './types.js';
+/**
+ * Gemma 4 E2B (effective ~2.3B params). ~500MB on-device download.
+ * Comfortable fit for modern integrated GPUs; recommended starting
+ * point for the POC.
+ *
+ * Loads `onnx-community/gemma-4-E2B-it-ONNX` at `q4f16` with
+ * `backend: 'auto'`. Declared capabilities: audio and thinking on,
+ * tools and vision off, 128,000-token context window.
+ */
+export declare const gemma4_E2B: ModelPreset;
+/**
+ * Gemma 4 E4B (effective ~4.5B params). ~1.5GB on-device download.
+ * Higher quality; needs a discrete GPU's worth of WebGPU memory.
+ *
+ * Loads `onnx-community/gemma-4-E4B-it-ONNX` at `q4f16` with
+ * `backend: 'auto'`. Declares the same capabilities as `gemma4_E2B`:
+ * audio and thinking on, tools and vision off, 128,000-token context
+ * window.
+ */
+export declare const gemma4_E4B: ModelPreset;
+/**
+ * SmolLM2 360M Instruct. ~180MB on-device download at q4f16.
+ *
+ * Demo + verification preset — small enough to fit ORT-Web's WASM
+ * backend on headless setups (no GPU required), and well under
+ * WebGPU's 1 GiB `maxBufferSize` cap. Cold-loads in seconds, decodes
+ * a token stream end-to-end without specialized hardware.
+ *
+ * Declared capabilities: text only — no tools, vision, audio, or
+ * thinking — with an 8,192-token context window.
+ *
+ * Reference (matches the preset's `modelId`):
+ * https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct
+ */
+export declare const smollm2_360m: ModelPreset;
+/**
+ * Qwen 2.5 Coder 1.5B Instruct. ~1.28 GB on-device download at q4f16.
+ *
+ * Strong on code completion and fill-in-the-middle for size. Uses the
+ * Qwen2 vocabulary (~152K tokens) — embedding table stays well under
+ * WebGPU's 1 GiB `maxBufferSize` cap, so this runs on most modern
+ * GPUs that Gemma 4 can't reach.
+ *
+ * Native tool calling is enabled (`supportsTools: true`): per the
+ * preset's own notes, Qwen 2.5's chat template includes tool slots,
+ * and the engine threads `tools` into `apply_chat_template` and
+ * parses `<tool_call>…</tool_call>` envelopes with `parseToolCalls()`.
+ *
+ * **Verification status:** real-GPU only. Headless WASM verify
+ * fetches and loads cleanly but the first decode step fails with
+ * `Shape mismatch attempting to re-use buffer. {1,1,1536} != {1,40,1536}`
+ * — an ORT-Web buffer-reuse optimization bug at this scale. The
+ * WebGPU path works on any real desktop GPU.
+ *
+ * Reference: https://huggingface.co/onnx-community/Qwen2.5-Coder-1.5B-Instruct
+ */
+export declare const qwen2_5_coder_1_5b: ModelPreset;
+/**
+ * Qwen 3 1.7B. ~1.36 GB on-device download at q4f16.
+ *
+ * Current frontier-for-size general model. Supports a "thinking mode"
+ * toggle in the chat template (off by default here — flip via a
+ * custom `chatTemplate` override on the preset spread). Same Qwen
+ * vocabulary as 2.5; embedding table fits comfortably under the
+ * WebGPU buffer cap.
+ *
+ * Declared capabilities: tool calling on (`supportsTools: true`),
+ * thinking off (`supportsThinking: false`), no vision or audio,
+ * 32,768-token context window.
+ *
+ * **Verification status:** real-GPU only. Headless WASM verify fails
+ * at session creation with `ERROR_CODE: 6, ERROR_MESSAGE: std::bad_alloc`
+ * — V8's WASM heap (~4 GB ceiling) is too tight for this model's
+ * load-time scratch allocations. The WebGPU path works on any real
+ * desktop GPU.
+ *
+ * Reference: https://huggingface.co/onnx-community/Qwen3-1.7B-ONNX
+ */
+export declare const qwen3_1_7b: ModelPreset;
+/**
+ * DeepSeek-R1-Distill-Qwen 1.5B. ~1.37 GB on-device download at q4f16.
+ *
+ * R1-style reasoning distilled into a Qwen2 1.5B base. Architecture
+ * is `Qwen2ForCausalLM` — same code path as `qwen2_5_coder_1_5b` and
+ * other Qwen 2.5 family presets, no engine changes needed.
+ *
+ * `supportsThinking: true` because the model emits its reasoning
+ * trace inside literal `<think>…</think>` tags before the answer.
+ * Consumers should wrap the engine's stream with `splitThinking()`
+ * from `@inbrowser/model` to receive `kind: 'thinking'` events
+ * separated from `kind: 'token'` output, then render the two streams
+ * differently (e.g., collapsible thinking pane + main output pane).
+ *
+ * The preset also declares `capabilities.thinkingTags`
+ * (`<think>` / `</think>`), so the tags can be passed straight to
+ * `splitThinking()` instead of being hard-coded by the consumer.
+ * Declared context window: 131,072 tokens.
+ *
+ * Has a tool-aware chat template, but the distill drops the
+ * tool-trained head — leaving `supportsTools: false` until a
+ * polyfill or fine-tune restores it.
+ *
+ * Reference: https://huggingface.co/onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX
+ */
+export declare const deepseek_r1_qwen_1_5b: ModelPreset;
+//# sourceMappingURL=presets.d.ts.map

package/dist/presets.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"presets.d.ts","sourceRoot":"","sources":["../src/presets.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAGH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AA8B9C;;;;GAIG;AACH,eAAO,MAAM,UAAU,EAAE,WAKvB,CAAC;AAEH;;;GAGG;AACH,eAAO,MAAM,UAAU,EAAE,WAKvB,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,YAAY,EAAE,WAWzB,CAAC;AAEH;;;;;;;;;;;;;;;;;GAiBG;AACH,eAAO,MAAM,kBAAkB,EAAE,WAe/B,CAAC;AAEH;;;;;;;;;;;;;;;;GAgBG;AACH,eAAO,MAAM,UAAU,EAAE,WAevB,CAAC;AAEH;;;;;;;;;;;;;;;;;;;GAmBG;AACH,eAAO,MAAM,qBAAqB,EAAE,WAoBlC,CAAC"}

package/dist/presets.js ADDED Viewed

@@ -0,0 +1,191 @@
+/**
+ * Bundled `ModelPreset`s for the Gemma 4, SmolLM2, Qwen 2.5 Coder,
+ * Qwen 3, and DeepSeek-R1-Distill model families.
+ *
+ * Adding a model is one entry here — not a new factory. Consumers
+ * may also author their own via `definePreset` from the package root.
+ *
+ * Capabilities are declared statically from the upstream model card;
+ * the runtime engine confirms them after load.
+ *
+ * Reference: https://huggingface.co/onnx-community/gemma-4-E2B-it-ONNX
+ */
+import { definePreset } from './engine.js';
+const GEMMA_4_CAPS = { // Capability flags shared by the bundled Gemma 4 presets (gemma4_E2B and gemma4_E4B).
+    supportsTools: false, // native tool-calling absent; polyfill lives in @inbrowser/agent
+    supportsVision: false,
+    supportsAudio: true, // NOTE(review): audio is declared while vision is not — confirm against the model card / ONNX export
+    contextWindow: 128_000,
+    // Gemma 4 / 3n templates accept `enable_thinking: true` and the
+    // engine still passes it through — the model will think before
+    // answering. We deliberately DO NOT set `thinkingTags`:
+    //
+    // Observed E2B + E4B output formats are inconsistent:
+    //   - E2B sometimes emits content without any opening marker
+    //   - E4B emits `<|channel>thought\n` opener
+    //   - Close marker `<channel|>` is sometimes emitted, sometimes
+    //     replaced by literal `Output\n`, sometimes omitted entirely
+    //   - End-of-turn `<turn|>` token sometimes leaks
+    //
+    // A text-based parser can't reliably separate thinking from answer
+    // with that variance. Forcing it produces worse UX than just
+    // showing the model's reasoning inline.
+    //
+    // Without `thinkingTags`, the engine keeps the default
+    // `skip_special_tokens: true`, so all structural tokens
+    // (`<|channel>`, `<channel|>`, `<turn|>`) are stripped automatically.
+    // The user sees plain-text reasoning followed by the answer in the
+    // main output pane. Mixed content, but clean.
+    supportsThinking: true,
+};
+/**
+ * Gemma 4 E2B (effective ~2.3B params). ~500MB on-device download.
+ * Comfortable fit for modern integrated GPUs; recommended starting
+ * point for the POC.
+ *
+ * Shares `GEMMA_4_CAPS` with `gemma4_E4B`; only the `modelId` differs
+ * between the two presets.
+ *
+ * NOTE(review): ~2.3B params at 4-bit weights would normally come to
+ * more than 1 GB (compare the ~1.28 GB quoted for the 1.5B Qwen
+ * preset in this file) — confirm the ~500MB figure against the repo.
+ */
+export const gemma4_E2B = definePreset({
+    model: { modelId: 'onnx-community/gemma-4-E2B-it-ONNX' },
+    dtype: 'q4f16',
+    backend: 'auto',
+    capabilities: GEMMA_4_CAPS,
+});
+/**
+ * Gemma 4 E4B (effective ~4.5B params). ~1.5GB on-device download.
+ * Higher quality; needs a discrete GPU's worth of WebGPU memory.
+ *
+ * Shares `GEMMA_4_CAPS` with `gemma4_E2B`; only the `modelId` differs
+ * between the two presets.
+ *
+ * NOTE(review): ~4.5B params at 4-bit weights would normally come to
+ * more than 2 GB — confirm the ~1.5GB figure against the repo.
+ */
+export const gemma4_E4B = definePreset({
+    model: { modelId: 'onnx-community/gemma-4-E4B-it-ONNX' },
+    dtype: 'q4f16',
+    backend: 'auto',
+    capabilities: GEMMA_4_CAPS,
+});
+/**
+ * SmolLM2 360M Instruct. ~180MB on-device download at q4f16.
+ *
+ * Demo + verification preset — small enough to fit ORT-Web's WASM
+ * backend on headless setups (no GPU required), and well under
+ * WebGPU's 1 GiB `maxBufferSize` cap. Cold-loads in seconds, decodes
+ * a token stream end-to-end without specialized hardware.
+ *
+ * Text-only preset: every optional capability (tools, vision, audio,
+ * thinking) is declared off.
+ *
+ * Reference (matches the `modelId` below):
+ * https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct
+ */
+export const smollm2_360m = definePreset({
+    model: { modelId: 'HuggingFaceTB/SmolLM2-360M-Instruct' },
+    dtype: 'q4f16',
+    backend: 'auto',
+    capabilities: {
+        supportsTools: false,
+        supportsVision: false,
+        supportsAudio: false,
+        contextWindow: 8_192,
+        supportsThinking: false,
+    },
+});
+/**
+ * Qwen 2.5 Coder 1.5B Instruct. ~1.28 GB on-device download at q4f16.
+ *
+ * Strong on code completion and fill-in-the-middle for size. Uses the
+ * Qwen2 vocabulary (~152K tokens) — embedding table stays well under
+ * WebGPU's 1 GiB `maxBufferSize` cap, so this runs on most modern
+ * GPUs that Gemma 4 can't reach.
+ *
+ * Native tool calling is enabled (`supportsTools: true`); see the
+ * note on the capability flag below for how the engine uses it.
+ *
+ * **Verification status:** real-GPU only. Headless WASM verify
+ * fetches and loads cleanly but the first decode step fails with
+ * `Shape mismatch attempting to re-use buffer. {1,1,1536} != {1,40,1536}`
+ * — an ORT-Web buffer-reuse optimization bug at this scale. The
+ * WebGPU path works on any real desktop GPU.
+ *
+ * Reference: https://huggingface.co/onnx-community/Qwen2.5-Coder-1.5B-Instruct
+ */
+export const qwen2_5_coder_1_5b = definePreset({
+    model: { modelId: 'onnx-community/Qwen2.5-Coder-1.5B-Instruct' },
+    dtype: 'q4f16',
+    backend: 'auto',
+    capabilities: {
+        // Qwen 2.5's chat template includes tool slots and the model is
+        // trained to emit <tool_call>...</tool_call> envelopes. The engine
+        // threads `tools` into apply_chat_template and parses output with
+        // parseToolCalls() when this is true.
+        supportsTools: true,
+        supportsVision: false,
+        supportsAudio: false,
+        contextWindow: 32_768,
+        supportsThinking: false,
+    },
+});
+/**
+ * Qwen 3 1.7B. ~1.36 GB on-device download at q4f16.
+ *
+ * Current frontier-for-size general model. Supports a "thinking mode"
+ * toggle in the chat template (off by default here — flip via a
+ * custom `chatTemplate` override on the preset spread). Same Qwen
+ * vocabulary as 2.5; embedding table fits comfortably under the
+ * WebGPU buffer cap.
+ *
+ * Tool calling is declared on (`supportsTools: true`); thinking is
+ * declared off (`supportsThinking: false`), in line with the default
+ * described above.
+ *
+ * **Verification status:** real-GPU only. Headless WASM verify fails
+ * at session creation with `ERROR_CODE: 6, ERROR_MESSAGE: std::bad_alloc`
+ * — V8's WASM heap (~4 GB ceiling) is too tight for this model's
+ * load-time scratch allocations. The WebGPU path works on any real
+ * desktop GPU.
+ *
+ * Reference: https://huggingface.co/onnx-community/Qwen3-1.7B-ONNX
+ */
+export const qwen3_1_7b = definePreset({
+    model: { modelId: 'onnx-community/Qwen3-1.7B-ONNX' },
+    dtype: 'q4f16',
+    backend: 'auto',
+    capabilities: {
+        // Qwen 3 family ships with first-class tool calling in its chat
+        // template. The engine threads `tools` through apply_chat_template
+        // and parses <tool_call>{...}</tool_call> envelopes from the
+        // output stream via parseToolCalls() when set.
+        supportsTools: true,
+        supportsVision: false,
+        supportsAudio: false,
+        contextWindow: 32_768,
+        supportsThinking: false,
+    },
+});
+/**
+ * DeepSeek-R1-Distill-Qwen 1.5B. ~1.37 GB on-device download at q4f16.
+ *
+ * R1-style reasoning distilled into a Qwen2 1.5B base. Architecture
+ * is `Qwen2ForCausalLM` — same code path as `qwen2_5_coder_1_5b` and
+ * other Qwen 2.5 family presets, no engine changes needed.
+ *
+ * `supportsThinking: true` because the model emits its reasoning
+ * trace inside literal `<think>…</think>` tags before the answer.
+ * Consumers should wrap the engine's stream with `splitThinking()`
+ * from `@inbrowser/model` to receive `kind: 'thinking'` events
+ * separated from `kind: 'token'` output, then render the two streams
+ * differently (e.g., collapsible thinking pane + main output pane).
+ *
+ * Has a tool-aware chat template, but the distill drops the
+ * tool-trained head — leaving `supportsTools: false` until a
+ * polyfill or fine-tune restores it.
+ *
+ * Reference: https://huggingface.co/onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX
+ */
+export const deepseek_r1_qwen_1_5b = definePreset({
+    model: { modelId: 'onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX' },
+    dtype: 'q4f16',
+    backend: 'auto',
+    capabilities: {
+        supportsTools: false,
+        supportsVision: false,
+        supportsAudio: false,
+        contextWindow: 131_072,
+        supportsThinking: true,
+        // DeepSeek's literal-text tags — identical to `splitThinking`'s
+        // defaults, but declaring them on the preset lets consumers stay
+        // model-agnostic: `splitThinking(stream, preset.capabilities.thinkingTags)`
+        // also works for presets that leave `thinkingTags` unset (the Gemma 4
+        // presets here), where the `<think>` / `</think>` defaults apply.
+        thinkingTags: {
+            openTag: '<think>',
+            closeTag: '</think>',
+        },
+    },
+});
+//# sourceMappingURL=presets.js.map

package/dist/presets.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"presets.js","sourceRoot":"","sources":["../src/presets.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAG3C,MAAM,YAAY,GAAG;IACnB,aAAa,EAAE,KAAK,EAAE,iEAAiE;IACvF,cAAc,EAAE,KAAK;IACrB,aAAa,EAAE,IAAI;IACnB,aAAa,EAAE,OAAO;IACtB,gEAAgE;IAChE,+DAA+D;IAC/D,wDAAwD;IACxD,EAAE;IACF,sDAAsD;IACtD,6DAA6D;IAC7D,6CAA6C;IAC7C,gEAAgE;IAChE,iEAAiE;IACjE,kDAAkD;IAClD,EAAE;IACF,mEAAmE;IACnE,6DAA6D;IAC7D,wCAAwC;IACxC,EAAE;IACF,uDAAuD;IACvD,wDAAwD;IACxD,sEAAsE;IACtE,mEAAmE;IACnE,8CAA8C;IAC9C,gBAAgB,EAAE,IAAI;CACd,CAAC;AAEX;;;;GAIG;AACH,MAAM,CAAC,MAAM,UAAU,GAAgB,YAAY,CAAC;IAClD,KAAK,EAAE,EAAE,OAAO,EAAE,oCAAoC,EAAE;IACxD,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE,YAAY;CAC3B,CAAC,CAAC;AAEH;;;GAGG;AACH,MAAM,CAAC,MAAM,UAAU,GAAgB,YAAY,CAAC;IAClD,KAAK,EAAE,EAAE,OAAO,EAAE,oCAAoC,EAAE;IACxD,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE,YAAY;CAC3B,CAAC,CAAC;AAEH;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,YAAY,GAAgB,YAAY,CAAC;IACpD,KAAK,EAAE,EAAE,OAAO,EAAE,qCAAqC,EAAE;IACzD,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE;QACZ,aAAa,EAAE,KAAK;QACpB,cAAc,EAAE,KAAK;QACrB,aAAa,EAAE,KAAK;QACpB,aAAa,EAAE,KAAK;QACpB,gBAAgB,EAAE,KAAK;KACxB;CACF,CAAC,CAAC;AAEH;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAgB,YAAY,CAAC;IAC1D,KAAK,EAAE,EAAE,OAAO,EAAE,4CAA4C,EAAE;IAChE,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE;QACZ,gEAAgE;QAChE,mEAAmE;QACnE,kEAAkE;QAClE,sCAAsC;QACtC,aAAa,EAAE,IAAI;QACnB,cAAc,EAAE,KAAK;QACrB,aAAa,EAAE,KAAK;QACpB,aAAa,EAAE,MAAM;QACrB,gBAAgB,EAAE,KAAK;KACxB;CACF,CAAC,CAAC;AAEH;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,CAAC,MAAM,UAAU,GAAgB,YAAY,CAAC;IAClD,KAAK,EAAE,EAAE,OAAO,EAAE,gCAAgC,EAAE;IACpD,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE;QACZ,gEAAgE;QAChE,mEAAmE;QACnE,6DAA6D;QAC7D,+CAA+C;QAC/C,aAAa,EAAE,IAAI;QACnB,cAAc,EAAE,KAAK;QACrB,aAAa,EAAE,KAAK;QACpB,aAAa,EAAE,MAAM;QACrB,gBAAgB,EAAE,KAAK;KACxB;CACF,CAAC,CAAC;AAEH;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAAgB,YAAY,CAAC;IAC7D,KAAK,EAAE,EAAE,OAAO,EAAE,mDAAmD
,EAAE;IACvE,KAAK,EAAE,OAAO;IACd,OAAO,EAAE,MAAM;IACf,YAAY,EAAE;QACZ,aAAa,EAAE,KAAK;QACpB,cAAc,EAAE,KAAK;QACrB,aAAa,EAAE,KAAK;QACpB,aAAa,EAAE,OAAO;QACtB,gBAAgB,EAAE,IAAI;QACtB,mEAAmE;QACnE,iEAAiE;QACjE,yFAAyF;QACzF,4DAA4D;QAC5D,0BAA0B;QAC1B,YAAY,EAAE;YACZ,OAAO,EAAE,SAAS;YAClB,QAAQ,EAAE,UAAU;SACrB;KACF;CACF,CAAC,CAAC"}

package/dist/think.d.ts ADDED Viewed

@@ -0,0 +1,57 @@
+/**
+ * `splitThinking` — stream transformer that splits reasoning-tagged
+ * content out of a raw token stream.
+ *
+ * Reasoning models (DeepSeek R1, R1-Distill-*, some Qwen 3 thinking
+ * variants) emit their reasoning trace inside literal tags
+ * (`<think>…</think>` by default). The engine itself stays narrow —
+ * it just emits `token` events with whatever text the decoder
+ * produced. This utility wraps that stream and re-emits the same
+ * shape, except text inside the open/close tags is yielded as
+ * `kind: 'thinking'` instead of `kind: 'token'`.
+ *
+ *   for await (const evt of splitThinking(engine.generate(msgs))) {
+ *     if (evt.kind === 'thinking') showReasoning(evt.text);
+ *     else if (evt.kind === 'token') showOutput(evt.text);
+ *   }
+ *
+ * Tag matching is configurable (defaults to DeepSeek's
+ * `<think>` / `</think>`). The implementation holds back a short
+ * tail of trailing characters (string code units, not bytes) so
+ * partial tags split across token boundaries (e.g., `<th` then
+ * `ink>`) resolve correctly.
+ *
+ * Pass-through behavior for non-token events: `usage` and `error`
+ * forward unchanged so terminal accounting is preserved.
+ */
+import type { EngineEvent } from './types.js';
+export interface ThinkingSplitOpts {
+    /**
+     * Tag that opens a reasoning block. Default: `<think>`.
+     * Must be non-empty unless `implicitOpen` is true; otherwise
+     * `splitThinking` throws.
+     */
+    openTag?: string;
+    /**
+     * Tag that closes a reasoning block. Default: `</think>`.
+     * Must be non-empty; otherwise `splitThinking` throws.
+     */
+    closeTag?: string;
+    /**
+     * When true, the stream is treated as starting *inside* the
+     * thinking channel — i.e., the opening tag is implicit. The first
+     * `closeTag` ends the block; subsequent text streams as `token`
+     * (a later `openTag`, when one is configured, still starts a new
+     * thinking block as usual).
+     *
+     * Used for models where the chat template's `add_generation_prompt`
+     * pre-fills the opening marker, so generation begins inside
+     * thinking and the model only emits the close marker explicitly.
+     * Gemma 4 family works this way.
+     *
+     * NOTE(review): the bundled Gemma 4 presets deliberately leave
+     * `thinkingTags` unset because of inconsistent marker output —
+     * confirm this example still reflects current guidance.
+     *
+     * Default: `false`.
+     */
+    implicitOpen?: boolean;
+    /**
+     * Literal substrings to strip from `token` events post-parse.
+     * Useful for cleaning up structural leak tokens that appear when
+     * the engine sets `skip_special_tokens: false` to expose channel
+     * markers (Gemma 4's `<turn|>` end-of-turn marker, for example).
+     *
+     * Matching is by exact substring (no patterns or wildcards).
+     *
+     * Stripping is applied AFTER mode classification — content inside
+     * thinking blocks is not affected. Default: `[]`.
+     */
+    stripTokens?: ReadonlyArray<string>;
+}
+export declare function splitThinking(source: AsyncIterable<EngineEvent>, opts?: ThinkingSplitOpts): AsyncIterable<EngineEvent>;
+//# sourceMappingURL=think.d.ts.map

package/dist/think.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"think.d.ts","sourceRoot":"","sources":["../src/think.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAE9C,MAAM,WAAW,iBAAiB;IAChC,4DAA4D;IAC5D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,8DAA8D;IAC9D,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB;;;;;;;;;;;OAWG;IACH,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB;;;;;;;;OAQG;IACH,WAAW,CAAC,EAAE,aAAa,CAAC,MAAM,CAAC,CAAC;CACrC;AAED,wBAAuB,aAAa,CAClC,MAAM,EAAE,aAAa,CAAC,WAAW,CAAC,EAClC,IAAI,GAAE,iBAAsB,GAC3B,aAAa,CAAC,WAAW,CAAC,CAmH5B"}

package/dist/think.js ADDED Viewed

@@ -0,0 +1,138 @@
+/**
+ * `splitThinking` — stream transformer that splits reasoning-tagged
+ * content out of a raw token stream.
+ *
+ * Reasoning models (DeepSeek R1, R1-Distill-*, some Qwen 3 thinking
+ * variants) emit their reasoning trace inside literal tags
+ * (`<think>…</think>` by default). The engine itself stays narrow —
+ * it just emits `token` events with whatever text the decoder
+ * produced. This utility wraps that stream and re-emits the same
+ * shape, except text inside the open/close tags is yielded as
+ * `kind: 'thinking'` instead of `kind: 'token'`.
+ *
+ *   for await (const evt of splitThinking(engine.generate(msgs))) {
+ *     if (evt.kind === 'thinking') showReasoning(evt.text);
+ *     else if (evt.kind === 'token') showOutput(evt.text);
+ *   }
+ *
+ * Tag matching is configurable (defaults to DeepSeek's
+ * `<think>` / `</think>`). The implementation buffers up to one
+ * tag-length minus one byte so partial tags split across token
+ * boundaries (e.g., `<th` then `ink>`) resolve correctly.
+ *
+ * Pass-through behavior for non-token events: `usage` and `error`
+ * forward unchanged so terminal accounting is preserved.
+ */
+export async function* splitThinking(source, opts = {}) {
+    const open = opts.openTag ?? '<think>';
+    const close = opts.closeTag ?? '</think>';
+    const implicitOpen = opts.implicitOpen ?? false;
+    const stripTokens = opts.stripTokens ?? [];
+    if (close.length === 0) {
+        throw new Error('splitThinking: closeTag must be non-empty');
+    }
+    if (!implicitOpen && open.length === 0) {
+        throw new Error('splitThinking: openTag must be non-empty unless implicitOpen is true');
+    }
+    // Initial state: when `implicitOpen` is true, the stream is treated
+    // as already inside the thinking block (Gemma 4: chat template
+    // primes generation inside <|channel>thought, so the first emitted
+    // token IS thinking content). Otherwise start in `normal` and wait
+    // for the open tag.
+    let mode = implicitOpen ? 'inside' : 'normal';
+    // `buffer` holds text we haven't decided how to emit yet — typically
+    // because the trailing characters could be a partial tag prefix OR
+    // a partial stripToken.
+    let buffer = '';
+    // Holdback: how many trailing bytes to keep in the buffer rather
+    // than emit, so partial matches (open-tag prefix, or any
+    // stripToken split across input boundaries) can resolve on a
+    // subsequent input chunk.
+    //
+    //   - openTag protection: keep up to `open.length - 1` chars so a
+    //     partial open-tag like `<thi` resolves when `nk>` arrives.
+    //   - stripToken protection: keep up to `maxStripLen` chars (NOT
+    //     `maxStripLen - 1`) so a stripToken that *starts* near the
+    //     boundary is fully held back rather than split. Splitting would
+    //     mean the literal-substring `includes(t)` check fails on both
+    //     halves and the token leaks.
+    const holdbackForOpen = open.length > 0 ? open.length - 1 : 0;
+    const holdbackForStrip = stripTokens.length > 0 ? Math.max(...stripTokens.map((t) => t.length)) : 0;
+    const holdback = Math.max(holdbackForOpen, holdbackForStrip);
+    // Helper: emit a token event with the configured strip-tokens applied.
+    function emitToken(text) {
+        let cleaned = text;
+        for (const t of stripTokens) {
+            if (cleaned.includes(t))
+                cleaned = cleaned.split(t).join('');
+        }
+        return { kind: 'token', text: cleaned };
+    }
+    for await (const evt of source) {
+        if (evt.kind !== 'token') {
+            yield evt;
+            continue;
+        }
+        buffer += evt.text;
+        // Drain the buffer as far as we can each iteration. We loop because
+        // a single token can contain `<think>…</think>` end-to-end; one
+        // pass per state transition handles that.
+        while (buffer.length > 0) {
+            if (mode === 'normal') {
+                // In normal mode we look for the next open tag (if defined).
+                // When openTag is empty (implicitOpen-with-no-return), we
+                // still need to respect stripToken holdback so partial
+                // stripTokens don't sneak past us at input-chunk boundaries.
+                const idx = open.length > 0 ? buffer.indexOf(open) : -1;
+                if (idx === -1) {
+                    // No open tag visible. Emit buffer minus the holdback
+                    // window so partial open-tag / stripToken matches resolve
+                    // on the next input.
+                    const safeLen = buffer.length - holdback;
+                    if (safeLen > 0) {
+                        yield emitToken(buffer.slice(0, safeLen));
+                        buffer = buffer.slice(safeLen);
+                    }
+                    break;
+                }
+                if (idx > 0) {
+                    yield emitToken(buffer.slice(0, idx));
+                }
+                buffer = buffer.slice(idx + open.length);
+                mode = 'inside';
+            }
+            else {
+                const idx = buffer.indexOf(close);
+                if (idx === -1) {
+                    // Holdback for close-tag protection inside thinking. We
+                    // don't need stripToken holdback here — stripTokens apply
+                    // to token events, not thinking events.
+                    const safeLen = buffer.length - (close.length - 1);
+                    if (safeLen > 0) {
+                        yield { kind: 'thinking', text: buffer.slice(0, safeLen) };
+                        buffer = buffer.slice(safeLen);
+                    }
+                    break;
+                }
+                if (idx > 0) {
+                    yield { kind: 'thinking', text: buffer.slice(0, idx) };
+                }
+                buffer = buffer.slice(idx + close.length);
+                mode = 'normal';
+            }
+        }
+    }
+    // Flush any residual buffer. If we ended mid-thinking-block (model
+    // hit max_new_tokens before the close, OR implicitOpen never saw a
+    // close), treat the remainder as thinking — alternative would be
+    // silently dropping it.
+    if (buffer.length > 0) {
+        if (mode === 'inside') {
+            yield { kind: 'thinking', text: buffer };
+        }
+        else {
+            yield emitToken(buffer);
+        }
+    }
+}
+//# sourceMappingURL=think.js.map

package/dist/think.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"think.js","sourceRoot":"","sources":["../src/think.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAkCH,MAAM,CAAC,KAAK,SAAS,CAAC,CAAC,aAAa,CAClC,MAAkC,EAClC,OAA0B,EAAE;IAE5B,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,IAAI,SAAS,CAAC;IACvC,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,IAAI,UAAU,CAAC;IAC1C,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,KAAK,CAAC;IAChD,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;IAC3C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,MAAM,IAAI,KAAK,CAAC,2CAA2C,CAAC,CAAC;IAC/D,CAAC;IACD,IAAI,CAAC,YAAY,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvC,MAAM,IAAI,KAAK,CAAC,sEAAsE,CAAC,CAAC;IAC1F,CAAC;IAED,oEAAoE;IACpE,+DAA+D;IAC/D,mEAAmE;IACnE,mEAAmE;IACnE,oBAAoB;IACpB,IAAI,IAAI,GAAwB,YAAY,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC;IACnE,qEAAqE;IACrE,mEAAmE;IACnE,wBAAwB;IACxB,IAAI,MAAM,GAAG,EAAE,CAAC;IAEhB,iEAAiE;IACjE,yDAAyD;IACzD,6DAA6D;IAC7D,0BAA0B;IAC1B,EAAE;IACF,kEAAkE;IAClE,gEAAgE;IAChE,iEAAiE;IACjE,gEAAgE;IAChE,qEAAqE;IACrE,mEAAmE;IACnE,kCAAkC;IAClC,MAAM,eAAe,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC9D,MAAM,gBAAgB,GACpB,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7E,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,eAAe,EAAE,gBAAgB,CAAC,CAAC;IAE7D,uEAAuE;IACvE,SAAS,SAAS,CAAC,IAAY;QAC7B,IAAI,OAAO,GAAG,IAAI,CAAC;QACnB,KAAK,MAAM,CAAC,IAAI,WAAW,EAAE,CAAC;YAC5B,IAAI,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;gBAAE,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC/D,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;IAC1C,CAAC;IAED,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QAC/B,IAAI,GAAG,CAAC,IAAI,KAAK,OAAO,EAAE,CAAC;YACzB,MAAM,GAAG,CAAC;YACV,SAAS;QACX,CAAC;QACD,MAAM,IAAI,GAAG,CAAC,IAAI,CAAC;QAEnB,oEAAoE;QACpE,gEAAgE;QAChE,0CAA0C;QAC1C,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,IAAI,IAAI,KAAK,QAAQ,EAAE,CAAC;gBACtB,6DAA6D;gBAC7D,0DAA0D;gBAC1D,uDAAuD;gBACvD,6D
AA6D;gBAC7D,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBACxD,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;oBACf,sDAAsD;oBACtD,0DAA0D;oBAC1D,qBAAqB;oBACrB,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,GAAG,QAAQ,CAAC;oBACzC,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;wBAChB,MAAM,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;wBAC1C,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;oBACjC,CAAC;oBACD,MAAM;gBACR,CAAC;gBACD,IAAI,GAAG,GAAG,CAAC,EAAE,CAAC;oBACZ,MAAM,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;gBACxC,CAAC;gBACD,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC;gBACzC,IAAI,GAAG,QAAQ,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,MAAM,GAAG,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;gBAClC,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;oBACf,wDAAwD;oBACxD,0DAA0D;oBAC1D,wCAAwC;oBACxC,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;oBACnD,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;wBAChB,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,EAAE,CAAC;wBAC3D,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;oBACjC,CAAC;oBACD,MAAM;gBACR,CAAC;gBACD,IAAI,GAAG,GAAG,CAAC,EAAE,CAAC;oBACZ,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;gBACzD,CAAC;gBACD,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;gBAC1C,IAAI,GAAG,QAAQ,CAAC;YAClB,CAAC;QACH,CAAC;IACH,CAAC;IAED,mEAAmE;IACnE,mEAAmE;IACnE,iEAAiE;IACjE,wBAAwB;IACxB,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,IAAI,IAAI,KAAK,QAAQ,EAAE,CAAC;YACtB,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC;QAC3C,CAAC;aAAM,CAAC;YACN,MAAM,SAAS,CAAC,MAAM,CAAC,CAAC;QAC1B,CAAC;IACH,CAAC;AACH,CAAC"}