npm - @chonkiejs/chunk - Versions diffs - 0.8.0 → 0.9.1 - Mend

@chonkiejs/chunk 0.8.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/package.json +1 -1
package/pkg/README.md +97 -0
package/pkg/chonkiejs_chunk.d.ts +171 -0
package/pkg/chonkiejs_chunk.js +504 -0
package/pkg/chonkiejs_chunk_bg.wasm +0 -0
package/pkg/chonkiejs_chunk_bg.wasm.d.ts +20 -0
package/pkg/package.json +16 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@chonkiejs/chunk",
-  "version": "0.8.0",
+  "version": "0.9.1",
   "description": "The fastest semantic text chunking library",
   "type": "module",
   "main": "index.js",

package/pkg/README.md ADDED Viewed

@@ -0,0 +1,97 @@
+<p align="center">
+  <img src="../../assets/memchunk_wide.png" alt="@chonkiejs/chunk" width="500">
+</p>
+<h1 align="center">@chonkiejs/chunk</h1>
+<p align="center">
+  <em>the fastest text chunking library — up to 1 TB/s throughput</em>
+</p>
+<p align="center">
+  <a href="https://crates.io/crates/chunk"><img src="https://img.shields.io/crates/v/chunk.svg?color=e74c3c" alt="crates.io"></a>
+  <a href="https://pypi.org/project/chonkie-core"><img src="https://img.shields.io/pypi/v/chonkie-core.svg?color=e67e22" alt="PyPI"></a>
+  <a href="https://www.npmjs.com/package/@chonkiejs/chunk"><img src="https://img.shields.io/npm/v/@chonkiejs/chunk.svg?color=2ecc71" alt="npm"></a>
+  <a href="https://github.com/chonkie-inc/chunk"><img src="https://img.shields.io/badge/github-chunk-3498db" alt="GitHub"></a>
+  <a href="LICENSE-MIT"><img src="https://img.shields.io/badge/license-MIT%2FApache--2.0-9b59b6.svg" alt="License"></a>
+</p>
+---
+you know how every chunking library claims to be fast? yeah, we actually meant it.
+**@chonkiejs/chunk** splits text at semantic boundaries (periods, newlines, the usual suspects) and does it stupid fast. we're talking "chunk the entire english wikipedia in 120ms" fast.
+want to know how? [read the blog post](https://minha.sh/posts/so,-you-want-to-chunk-really-fast) where we nerd out about SIMD instructions and lookup tables.
+## 📦 installation
+```bash
+npm install @chonkiejs/chunk
+```
+looking for [rust](https://github.com/chonkie-inc/chunk) or [python](https://github.com/chonkie-inc/chunk/tree/main/packages/python)?
+## 🚀 usage
+```javascript
+import { init, chunk } from '@chonkiejs/chunk';
+// initialize wasm (required once)
+await init();
+const text = "Hello world. How are you? I'm fine.\nThanks for asking.";
+// with defaults (4 KB target chunk size; splits at "\n", "." or "?")
+for (const slice of chunk(text)) {
+    console.log(slice);
+}
+// with custom size
+for (const slice of chunk(text, { size: 1024 })) {
+    console.log(slice);
+}
+// with custom delimiters
+for (const slice of chunk(text, { delimiters: ".?!\n" })) {
+    console.log(slice);
+}
+// with multi-byte pattern (e.g., metaspace ▁ for SentencePiece tokenizers)
+for (const slice of chunk(text, { pattern: "▁", prefix: true })) {
+    console.log(slice);
+}
+// with consecutive pattern handling (split at START of runs, not middle)
+for (const slice of chunk("word   next", { pattern: " ", consecutive: true })) {
+    console.log(slice);
+}
+// with forward fallback (search forward if no pattern in backward window)
+for (const slice of chunk(text, { pattern: " ", forwardFallback: true })) {
+    console.log(slice);
+}
+// collect all chunks
+const chunks = [...chunk(text)];
+```
+pass strings and get strings back. for zero-copy performance with binary data, pass `Uint8Array` and you'll get `Uint8Array` views back.
+## 📝 citation
+if you use @chonkiejs/chunk in your research, please cite it as follows:
+```bibtex
+@software{chunk2025,
+  author = {Minhas, Bhavnick},
+  title = {chunk: The fastest text chunking library},
+  year = {2025},
+  publisher = {GitHub},
+  howpublished = {\url{https://github.com/chonkie-inc/chunk}},
+}
+```
+## 📄 license
+licensed under either of [Apache License, Version 2.0](LICENSE-APACHE) or [MIT license](LICENSE-MIT) at your option.

package/pkg/chonkiejs_chunk.d.ts ADDED Viewed

@@ -0,0 +1,171 @@
+/* tslint:disable */
+/* eslint-disable */
+/**
+ * Handle to a chunker instance that lives inside the WebAssembly module.
+ *
+ * Build one with the constructor (single-byte delimiters) or with
+ * `Chunker.with_pattern` (multi-byte pattern), then either pull chunks one at
+ * a time with `next()` or fetch every offset at once with `collect_offsets()`.
+ */
+export class Chunker {
+  /**
+   * Release the WASM-side chunker. The JS glue zeroes this object's internal
+   * pointer before freeing it, so any later method call would hand WASM a
+   * null pointer.
+   */
+  free(): void;
+  /**
+   * Disposal hook; the JS glue installs it as an alias of `free()` when the
+   * runtime provides `Symbol.dispose`.
+   */
+  [Symbol.dispose](): void;
+  /**
+   * Create a new Chunker with a multi-byte pattern.
+   *
+   * @param text - The text to chunk (as Uint8Array)
+   * @param size - Target chunk size in bytes
+   * @param pattern - Multi-byte pattern to split on (as Uint8Array)
+   * @param prefix - Put pattern at start of next chunk (default: false)
+   * @param consecutive - Split at START of consecutive runs (default: false)
+   * @param forward_fallback - Search forward if no pattern in backward window (default: false)
+   */
+  static with_pattern(text: Uint8Array, size: number, pattern: Uint8Array, prefix?: boolean | null, consecutive?: boolean | null, forward_fallback?: boolean | null): Chunker;
+  /**
+   * Collect all chunk offsets as a flat array [start1, end1, start2, end2, ...].
+   * This is faster than iterating as it makes a single WASM call.
+   */
+  collect_offsets(): Uint32Array;
+  /**
+   * Create a new Chunker with single-byte delimiters.
+   *
+   * @param text - The text to chunk (as Uint8Array)
+   * @param size - Target chunk size in bytes (default: 4096)
+   * @param delimiters - Delimiter characters as string (default: "\n.?")
+   * @param prefix - Put delimiter at start of next chunk (default: false)
+   * @param consecutive - Split at START of consecutive runs (default: false)
+   * @param forward_fallback - Search forward if no delimiter in backward window (default: false)
+   */
+  constructor(text: Uint8Array, size?: number | null, delimiters?: string | null, prefix?: boolean | null, consecutive?: boolean | null, forward_fallback?: boolean | null);
+  /**
+   * Get the next chunk, or undefined if exhausted.
+   */
+  next(): Uint8Array | undefined;
+  /**
+   * Reset the chunker to iterate from the beginning.
+   */
+  reset(): void;
+}
+/**
+ * Fast chunking function that returns offsets in a single call.
+ * Returns a flat array [start1, end1, start2, end2, ...].
+ * Use this with subarray for maximum performance.
+ *
+ * The parameter list mirrors the `Chunker` constructor's signature.
+ * NOTE(review): defaults and flag semantics are presumably identical to the
+ * constructor's (4096-byte size, "\n.?" delimiters, flags false) — confirm.
+ *
+ * @returns Flat Uint32Array of start/end offset pairs into `text`; each pair
+ * is suitable for `text.subarray(start, end)` as in the example below
+ *
+ * @example Single-byte delimiters
+ * ```javascript
+ * const offsets = chunk_offsets(textBytes, 4096, ".\n?");
+ * const chunks = [];
+ * for (let i = 0; i < offsets.length; i += 2) {
+ *     chunks.push(textBytes.subarray(offsets[i], offsets[i + 1]));
+ * }
+ * ```
+ */
+export function chunk_offsets(text: Uint8Array, size?: number | null, delimiters?: string | null, prefix?: boolean | null, consecutive?: boolean | null, forward_fallback?: boolean | null): Uint32Array;
+/**
+ * Fast chunking function with multi-byte pattern support.
+ * Returns a flat array [start1, end1, start2, end2, ...].
+ *
+ * The parameter list mirrors `Chunker.with_pattern`; unlike `chunk_offsets`,
+ * `size` and `pattern` are required here.
+ * NOTE(review): flag semantics are presumably identical to
+ * `Chunker.with_pattern` — confirm.
+ *
+ * @returns Flat Uint32Array of start/end offset pairs
+ *
+ * @example Multi-byte pattern (metaspace)
+ * ```javascript
+ * const encoder = new TextEncoder();
+ * const metaspace = encoder.encode("▁");
+ * const offsets = chunk_offsets_pattern(textBytes, 4096, metaspace, true, true, true);
+ * ```
+ */
+export function chunk_offsets_pattern(text: Uint8Array, size: number, pattern: Uint8Array, prefix?: boolean | null, consecutive?: boolean | null, forward_fallback?: boolean | null): Uint32Array;
+/**
+ * Get the default delimiters ("\n.?").
+ *
+ * @returns The default delimiters as a Uint8Array
+ */
+export function default_delimiters(): Uint8Array;
+/**
+ * Get the default target size (4096 bytes).
+ *
+ * @returns The default target chunk size, in bytes
+ */
+export function default_target_size(): number;
+/**
+ * Find merge indices for combining segments within token limits.
+ *
+ * Returns indices marking where to split segments into chunks that
+ * respect the token budget. Use this to determine merge boundaries,
+ * then join strings in JavaScript.
+ *
+ * Reading the result: as the example shows, the first chunk starts at
+ * segment 0 and every later chunk starts at the previous end index.
+ *
+ * @param token_counts - Array of token counts for each segment
+ * @param chunk_size - Maximum tokens per merged chunk
+ * @returns Array of end indices (exclusive) for each chunk
+ *
+ * @example
+ * ```javascript
+ * const tokenCounts = new Uint32Array([1, 1, 1, 1, 1, 1, 1]);
+ * const indices = find_merge_indices(tokenCounts, 3);
+ * // indices = [3, 6, 7]
+ * // Use to slice: segments.slice(0, 3), segments.slice(3, 6), segments.slice(6, 7)
+ * ```
+ */
+export function find_merge_indices(token_counts: Uint32Array, chunk_size: number): Uint32Array;
+/**
+ * Split text at every delimiter occurrence, returning offsets.
+ * Unlike chunk_offsets which creates size-based chunks, this splits at
+ * **every** delimiter occurrence.
+ *
+ * Returns a flat array [start1, end1, start2, end2, ...].
+ *
+ * @param text - The text to split (as Uint8Array)
+ * @param delimiters - Delimiter characters as string (default: "\n.?")
+ * @param include_delim - Where to attach delimiter: "prev" (default), "next", or "none"
+ * @param min_chars - Minimum characters per segment (default: 0). Shorter segments are merged.
+ * @returns Flat Uint32Array of start/end offset pairs, one pair per segment
+ *
+ * @example
+ * ```javascript
+ * const offsets = split_offsets(textBytes, ".", "prev", 0);
+ * const segments = [];
+ * for (let i = 0; i < offsets.length; i += 2) {
+ *     segments.push(textBytes.subarray(offsets[i], offsets[i + 1]));
+ * }
+ * // ["Hello.", " World.", " Test."]
+ * ```
+ */
+export function split_offsets(text: Uint8Array, delimiters?: string | null, include_delim?: string | null, min_chars?: number | null): Uint32Array;
+/**
+ * Sources accepted by the async default initializer: something to fetch
+ * (`RequestInfo` / `URL`), an already-fetched `Response`, raw module bytes,
+ * or a compiled `WebAssembly.Module`.
+ */
+export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
+/**
+ * Raw exports of the instantiated WebAssembly module, as returned by the
+ * init functions. This is the low-level ABI the JS glue calls into: buffers
+ * are passed as numeric pointer/length arguments, and array-returning
+ * exports yield a two-number tuple that the glue reads as (pointer, length).
+ * Prefer the typed wrappers exported alongside.
+ */
+export interface InitOutput {
+  readonly memory: WebAssembly.Memory;
+  readonly __wbg_chunker_free: (a: number, b: number) => void;
+  readonly chunk_offsets: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => [number, number];
+  readonly chunk_offsets_pattern: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => [number, number];
+  readonly chunker_collect_offsets: (a: number) => [number, number];
+  readonly chunker_new: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => number;
+  readonly chunker_next: (a: number) => [number, number];
+  readonly chunker_reset: (a: number) => void;
+  readonly chunker_with_pattern: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => number;
+  readonly default_delimiters: () => [number, number];
+  readonly default_target_size: () => number;
+  readonly find_merge_indices: (a: number, b: number, c: number) => [number, number];
+  readonly split_offsets: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => [number, number];
+  readonly __wbindgen_externrefs: WebAssembly.Table;
+  readonly __wbindgen_malloc: (a: number, b: number) => number;
+  readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
+  readonly __wbindgen_free: (a: number, b: number, c: number) => void;
+  readonly __wbindgen_start: () => void;
+}
+/**
+ * Sources accepted by `initSync`: raw module bytes or an already compiled
+ * `WebAssembly.Module` (nothing that would require a fetch).
+ */
+export type SyncInitInput = BufferSource | WebAssembly.Module;
+/**
+* Instantiates the given `module`, which can either be bytes or
+* a precompiled `WebAssembly.Module`.
+*
+* If the module has already been initialized, the existing exports are
+* returned and nothing is instantiated again.
+*
+* @param {{ module: SyncInitInput }} module - Passing `SyncInitInput` directly is deprecated.
+*
+* @returns {InitOutput}
+*/
+export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;
+/**
+* If `module_or_path` is {RequestInfo} or {URL}, makes a request and
+* for everything else, calls `WebAssembly.instantiate` directly.
+*
+* When `module_or_path` is omitted, `chonkiejs_chunk_bg.wasm` is resolved
+* relative to this module's own URL and fetched. If the module has already
+* been initialized, the existing exports are returned immediately.
+*
+* @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Passing `InitInput` directly is deprecated.
+*
+* @returns {Promise<InitOutput>}
+*/
+export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;

package/pkg/chonkiejs_chunk.js ADDED Viewed

@@ -0,0 +1,504 @@
+// Exports of the instantiated WebAssembly module. Stays undefined until
+// initSync() or the default async initializer has finished.
+let wasm;
+function getArrayU32FromWasm0(ptr, len) {
+    ptr = ptr >>> 0;
+    return getUint32ArrayMemory0().subarray(ptr / 4, ptr / 4 + len);
+}
+function getArrayU8FromWasm0(ptr, len) {
+    ptr = ptr >>> 0;
+    return getUint8ArrayMemory0().subarray(ptr / 1, ptr / 1 + len);
+}
+function getStringFromWasm0(ptr, len) {
+    ptr = ptr >>> 0;
+    return decodeText(ptr, len);
+}
+let cachedUint32ArrayMemory0 = null;
+function getUint32ArrayMemory0() {
+    if (cachedUint32ArrayMemory0 === null || cachedUint32ArrayMemory0.byteLength === 0) {
+        cachedUint32ArrayMemory0 = new Uint32Array(wasm.memory.buffer);
+    }
+    return cachedUint32ArrayMemory0;
+}
+let cachedUint8ArrayMemory0 = null;
+function getUint8ArrayMemory0() {
+    if (cachedUint8ArrayMemory0 === null || cachedUint8ArrayMemory0.byteLength === 0) {
+        cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer);
+    }
+    return cachedUint8ArrayMemory0;
+}
+function isLikeNone(x) {
+    return x === undefined || x === null;
+}
+function passArray32ToWasm0(arg, malloc) {
+    const ptr = malloc(arg.length * 4, 4) >>> 0;
+    getUint32ArrayMemory0().set(arg, ptr / 4);
+    WASM_VECTOR_LEN = arg.length;
+    return ptr;
+}
+function passArray8ToWasm0(arg, malloc) {
+    const ptr = malloc(arg.length * 1, 1) >>> 0;
+    getUint8ArrayMemory0().set(arg, ptr / 1);
+    WASM_VECTOR_LEN = arg.length;
+    return ptr;
+}
+/**
+ * Encode a JS string as UTF-8 into WASM memory.
+ *
+ * Returns the byte address of the encoded string and stores its byte length
+ * in WASM_VECTOR_LEN.
+ *
+ * @param {string} arg - String to encode
+ * @param malloc - Allocator called as `malloc(size, align)`
+ * @param [realloc] - Optional reallocator called as
+ *   `realloc(ptr, oldSize, newSize, align)`; when omitted the string is
+ *   encoded up front and copied with a single allocation
+ */
+function passStringToWasm0(arg, malloc, realloc) {
+    // Simple path: no realloc available, so encode first and allocate exactly.
+    if (realloc === undefined) {
+        const buf = cachedTextEncoder.encode(arg);
+        const ptr = malloc(buf.length, 1) >>> 0;
+        getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf);
+        WASM_VECTOR_LEN = buf.length;
+        return ptr;
+    }
+    // Optimistic path: start with one byte per UTF-16 code unit.
+    let len = arg.length;
+    let ptr = malloc(len, 1) >>> 0;
+    const mem = getUint8ArrayMemory0();
+    let offset = 0;
+    // Copy leading ASCII code units directly; stop at the first non-ASCII one.
+    for (; offset < len; offset++) {
+        const code = arg.charCodeAt(offset);
+        if (code > 0x7F) break;
+        mem[ptr + offset] = code;
+    }
+    // A non-ASCII code unit was hit: encode the remainder with TextEncoder.
+    if (offset !== len) {
+        if (offset !== 0) {
+            arg = arg.slice(offset);
+        }
+        // Grow to the bytes already written plus 3 bytes per remaining code
+        // unit (the UTF-8 worst case per UTF-16 code unit); `len` is updated
+        // to the new capacity inside the call.
+        ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0;
+        // Re-fetch the memory view here rather than reusing `mem`.
+        const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len);
+        const ret = cachedTextEncoder.encodeInto(arg, view);
+        offset += ret.written;
+        // Shrink the allocation to the number of bytes actually written.
+        ptr = realloc(ptr, len, offset, 1) >>> 0;
+    }
+    WASM_VECTOR_LEN = offset;
+    return ptr;
+}
+// Shared UTF-8 decoder used by decodeText(); constructed with `fatal: true`
+// and `ignoreBOM: true`.
+let cachedTextDecoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
+// NOTE(review): argument-less decode() right after construction — presumably
+// a warm-up or feature probe; confirm against the wasm-bindgen source.
+cachedTextDecoder.decode();
+// Cumulative byte count at which decodeText() swaps in a fresh decoder.
+// NOTE(review): the name points at a Safari TextDecoder limit — confirm.
+const MAX_SAFARI_DECODE_BYTES = 2146435072;
+// Bytes decoded so far with the current cachedTextDecoder.
+let numBytesDecoded = 0;
+function decodeText(ptr, len) {
+    numBytesDecoded += len;
+    if (numBytesDecoded >= MAX_SAFARI_DECODE_BYTES) {
+        cachedTextDecoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
+        cachedTextDecoder.decode();
+        numBytesDecoded = len;
+    }
+    return cachedTextDecoder.decode(getUint8ArrayMemory0().subarray(ptr, ptr + len));
+}
+const cachedTextEncoder = new TextEncoder();
+if (!('encodeInto' in cachedTextEncoder)) {
+    cachedTextEncoder.encodeInto = function (arg, view) {
+        const buf = cachedTextEncoder.encode(arg);
+        view.set(buf);
+        return {
+            read: arg.length,
+            written: buf.length
+        };
+    }
+}
+// Length of the buffer most recently written by a pass*ToWasm0 helper
+// (element count for arrays, byte count for strings). Callers read it
+// immediately after each helper call.
+let WASM_VECTOR_LEN = 0;
+// Frees the WASM-side chunker when a registered Chunker wrapper is
+// garbage-collected. Where FinalizationRegistry is unavailable, a stub with
+// no-op register/unregister is used instead.
+const ChunkerFinalization = (typeof FinalizationRegistry === 'undefined')
+    ? { register: () => {}, unregister: () => {} }
+    : new FinalizationRegistry(ptr => wasm.__wbg_chunker_free(ptr >>> 0, 1));
+/**
+ * Chunker splits text at delimiter boundaries.
+ *
+ * @example Single-byte delimiters
+ * ```javascript
+ * const chunker = new Chunker(textBytes, 4096, ".\n?");
+ * let chunk;
+ * while ((chunk = chunker.next()) !== undefined) {
+ *     console.log(chunk);
+ * }
+ * ```
+ *
+ * @example Multi-byte pattern (e.g., metaspace for SentencePiece)
+ * ```javascript
+ * const encoder = new TextEncoder();
+ * const metaspace = encoder.encode("▁");
+ * const chunker = Chunker.with_pattern(textBytes, 4096, metaspace, true);
+ * ```
+ */
+export class Chunker {
+    static __wrap(ptr) {
+        ptr = ptr >>> 0;
+        const obj = Object.create(Chunker.prototype);
+        obj.__wbg_ptr = ptr;
+        ChunkerFinalization.register(obj, obj.__wbg_ptr, obj);
+        return obj;
+    }
+    __destroy_into_raw() {
+        const ptr = this.__wbg_ptr;
+        this.__wbg_ptr = 0;
+        ChunkerFinalization.unregister(this);
+        return ptr;
+    }
+    free() {
+        const ptr = this.__destroy_into_raw();
+        wasm.__wbg_chunker_free(ptr, 0);
+    }
+    /**
+     * Create a new Chunker with a multi-byte pattern.
+     *
+     * @param text - The text to chunk (as Uint8Array)
+     * @param size - Target chunk size in bytes
+     * @param pattern - Multi-byte pattern to split on (as Uint8Array)
+     * @param prefix - Put pattern at start of next chunk (default: false)
+     * @param consecutive - Split at START of consecutive runs (default: false)
+     * @param forward_fallback - Search forward if no pattern in backward window (default: false)
+     * @param {Uint8Array} text
+     * @param {number} size
+     * @param {Uint8Array} pattern
+     * @param {boolean | null} [prefix]
+     * @param {boolean | null} [consecutive]
+     * @param {boolean | null} [forward_fallback]
+     * @returns {Chunker}
+     */
+    static with_pattern(text, size, pattern, prefix, consecutive, forward_fallback) {
+        const ptr0 = passArray8ToWasm0(text, wasm.__wbindgen_malloc);
+        const len0 = WASM_VECTOR_LEN;
+        const ptr1 = passArray8ToWasm0(pattern, wasm.__wbindgen_malloc);
+        const len1 = WASM_VECTOR_LEN;
+        const ret = wasm.chunker_with_pattern(ptr0, len0, size, ptr1, len1, isLikeNone(prefix) ? 0xFFFFFF : prefix ? 1 : 0, isLikeNone(consecutive) ? 0xFFFFFF : consecutive ? 1 : 0, isLikeNone(forward_fallback) ? 0xFFFFFF : forward_fallback ? 1 : 0);
+        return Chunker.__wrap(ret);
+    }
+    /**
+     * Collect all chunk offsets as a flat array [start1, end1, start2, end2, ...].
+     * This is faster than iterating as it makes a single WASM call.
+     * @returns {Uint32Array}
+     */
+    collect_offsets() {
+        const ret = wasm.chunker_collect_offsets(this.__wbg_ptr);
+        var v1 = getArrayU32FromWasm0(ret[0], ret[1]).slice();
+        wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+        return v1;
+    }
+    /**
+     * Create a new Chunker with single-byte delimiters.
+     *
+     * @param text - The text to chunk (as Uint8Array)
+     * @param size - Target chunk size in bytes (default: 4096)
+     * @param delimiters - Delimiter characters as string (default: "\n.?")
+     * @param prefix - Put delimiter at start of next chunk (default: false)
+     * @param consecutive - Split at START of consecutive runs (default: false)
+     * @param forward_fallback - Search forward if no delimiter in backward window (default: false)
+     * @param {Uint8Array} text
+     * @param {number | null} [size]
+     * @param {string | null} [delimiters]
+     * @param {boolean | null} [prefix]
+     * @param {boolean | null} [consecutive]
+     * @param {boolean | null} [forward_fallback]
+     */
+    constructor(text, size, delimiters, prefix, consecutive, forward_fallback) {
+        const ptr0 = passArray8ToWasm0(text, wasm.__wbindgen_malloc);
+        const len0 = WASM_VECTOR_LEN;
+        var ptr1 = isLikeNone(delimiters) ? 0 : passStringToWasm0(delimiters, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+        var len1 = WASM_VECTOR_LEN;
+        const ret = wasm.chunker_new(ptr0, len0, isLikeNone(size) ? 0x100000001 : (size) >>> 0, ptr1, len1, isLikeNone(prefix) ? 0xFFFFFF : prefix ? 1 : 0, isLikeNone(consecutive) ? 0xFFFFFF : consecutive ? 1 : 0, isLikeNone(forward_fallback) ? 0xFFFFFF : forward_fallback ? 1 : 0);
+        this.__wbg_ptr = ret >>> 0;
+        ChunkerFinalization.register(this, this.__wbg_ptr, this);
+        return this;
+    }
+    /**
+     * Get the next chunk, or undefined if exhausted.
+     * @returns {Uint8Array | undefined}
+     */
+    next() {
+        const ret = wasm.chunker_next(this.__wbg_ptr);
+        let v1;
+        if (ret[0] !== 0) {
+            v1 = getArrayU8FromWasm0(ret[0], ret[1]).slice();
+            wasm.__wbindgen_free(ret[0], ret[1] * 1, 1);
+        }
+        return v1;
+    }
+    /**
+     * Reset the chunker to iterate from the beginning.
+     */
+    reset() {
+        wasm.chunker_reset(this.__wbg_ptr);
+    }
+}
+if (Symbol.dispose) Chunker.prototype[Symbol.dispose] = Chunker.prototype.free;
+/**
+ * Fast chunking function that returns offsets in a single call.
+ * Returns a flat array [start1, end1, start2, end2, ...].
+ * Use this with subarray for maximum performance.
+ *
+ * @example Single-byte delimiters
+ * ```javascript
+ * const offsets = chunk_offsets(textBytes, 4096, ".\n?");
+ * const chunks = [];
+ * for (let i = 0; i < offsets.length; i += 2) {
+ *     chunks.push(textBytes.subarray(offsets[i], offsets[i + 1]));
+ * }
+ * ```
+ * @param {Uint8Array} text
+ * @param {number | null} [size]
+ * @param {string | null} [delimiters]
+ * @param {boolean | null} [prefix]
+ * @param {boolean | null} [consecutive]
+ * @param {boolean | null} [forward_fallback]
+ * @returns {Uint32Array}
+ */
+export function chunk_offsets(text, size, delimiters, prefix, consecutive, forward_fallback) {
+    const ptr0 = passArray8ToWasm0(text, wasm.__wbindgen_malloc);
+    const len0 = WASM_VECTOR_LEN;
+    var ptr1 = isLikeNone(delimiters) ? 0 : passStringToWasm0(delimiters, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    var len1 = WASM_VECTOR_LEN;
+    const ret = wasm.chunk_offsets(ptr0, len0, isLikeNone(size) ? 0x100000001 : (size) >>> 0, ptr1, len1, isLikeNone(prefix) ? 0xFFFFFF : prefix ? 1 : 0, isLikeNone(consecutive) ? 0xFFFFFF : consecutive ? 1 : 0, isLikeNone(forward_fallback) ? 0xFFFFFF : forward_fallback ? 1 : 0);
+    var v3 = getArrayU32FromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v3;
+}
+/**
+ * Fast chunking function with multi-byte pattern support.
+ * Returns a flat array [start1, end1, start2, end2, ...].
+ *
+ * @example Multi-byte pattern (metaspace)
+ * ```javascript
+ * const encoder = new TextEncoder();
+ * const metaspace = encoder.encode("▁");
+ * const offsets = chunk_offsets_pattern(textBytes, 4096, metaspace, true, true, true);
+ * ```
+ * @param {Uint8Array} text
+ * @param {number} size
+ * @param {Uint8Array} pattern
+ * @param {boolean | null} [prefix]
+ * @param {boolean | null} [consecutive]
+ * @param {boolean | null} [forward_fallback]
+ * @returns {Uint32Array}
+ */
+export function chunk_offsets_pattern(text, size, pattern, prefix, consecutive, forward_fallback) {
+    const ptr0 = passArray8ToWasm0(text, wasm.__wbindgen_malloc);
+    const len0 = WASM_VECTOR_LEN;
+    const ptr1 = passArray8ToWasm0(pattern, wasm.__wbindgen_malloc);
+    const len1 = WASM_VECTOR_LEN;
+    const ret = wasm.chunk_offsets_pattern(ptr0, len0, size, ptr1, len1, isLikeNone(prefix) ? 0xFFFFFF : prefix ? 1 : 0, isLikeNone(consecutive) ? 0xFFFFFF : consecutive ? 1 : 0, isLikeNone(forward_fallback) ? 0xFFFFFF : forward_fallback ? 1 : 0);
+    var v3 = getArrayU32FromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v3;
+}
+/**
+ * Get the default delimiters ("\n.?").
+ * @returns {Uint8Array}
+ */
+export function default_delimiters() {
+    const ret = wasm.default_delimiters();
+    var v1 = getArrayU8FromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 1, 1);
+    return v1;
+}
+/**
+ * Get the default target size (4096 bytes).
+ * @returns {number}
+ */
+export function default_target_size() {
+    const ret = wasm.default_target_size();
+    return ret >>> 0;
+}
+/**
+ * Find merge indices for combining segments within token limits.
+ *
+ * Returns indices marking where to split segments into chunks that
+ * respect the token budget. Use this to determine merge boundaries,
+ * then join strings in JavaScript.
+ *
+ * @param token_counts - Array of token counts for each segment
+ * @param chunk_size - Maximum tokens per merged chunk
+ * @returns Array of end indices (exclusive) for each chunk
+ *
+ * @example
+ * ```javascript
+ * const tokenCounts = new Uint32Array([1, 1, 1, 1, 1, 1, 1]);
+ * const indices = find_merge_indices(tokenCounts, 3);
+ * // indices = [3, 6, 7]
+ * // Use to slice: segments.slice(0, 3), segments.slice(3, 6), segments.slice(6, 7)
+ * ```
+ * @param {Uint32Array} token_counts
+ * @param {number} chunk_size
+ * @returns {Uint32Array}
+ */
+export function find_merge_indices(token_counts, chunk_size) {
+    const ptr0 = passArray32ToWasm0(token_counts, wasm.__wbindgen_malloc);
+    const len0 = WASM_VECTOR_LEN;
+    const ret = wasm.find_merge_indices(ptr0, len0, chunk_size);
+    var v2 = getArrayU32FromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v2;
+}
+/**
+ * Split text at every delimiter occurrence, returning offsets.
+ * Unlike chunk_offsets which creates size-based chunks, this splits at
+ * **every** delimiter occurrence.
+ *
+ * Returns a flat array [start1, end1, start2, end2, ...].
+ *
+ * @param text - The text to split (as Uint8Array)
+ * @param delimiters - Delimiter characters as string (default: "\n.?")
+ * @param include_delim - Where to attach delimiter: "prev" (default), "next", or "none"
+ * @param min_chars - Minimum characters per segment (default: 0). Shorter segments are merged.
+ *
+ * @example
+ * ```javascript
+ * const offsets = split_offsets(textBytes, ".", "prev", 0);
+ * const segments = [];
+ * for (let i = 0; i < offsets.length; i += 2) {
+ *     segments.push(textBytes.subarray(offsets[i], offsets[i + 1]));
+ * }
+ * // ["Hello.", " World.", " Test."]
+ * ```
+ * @param {Uint8Array} text
+ * @param {string | null} [delimiters]
+ * @param {string | null} [include_delim]
+ * @param {number | null} [min_chars]
+ * @returns {Uint32Array}
+ */
+export function split_offsets(text, delimiters, include_delim, min_chars) {
+    const ptr0 = passArray8ToWasm0(text, wasm.__wbindgen_malloc);
+    const len0 = WASM_VECTOR_LEN;
+    var ptr1 = isLikeNone(delimiters) ? 0 : passStringToWasm0(delimiters, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    var len1 = WASM_VECTOR_LEN;
+    var ptr2 = isLikeNone(include_delim) ? 0 : passStringToWasm0(include_delim, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    var len2 = WASM_VECTOR_LEN;
+    const ret = wasm.split_offsets(ptr0, len0, ptr1, len1, ptr2, len2, isLikeNone(min_chars) ? 0x100000001 : (min_chars) >>> 0);
+    var v4 = getArrayU32FromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v4;
+}
+const EXPECTED_RESPONSE_TYPES = new Set(['basic', 'cors', 'default']);
+async function __wbg_load(module, imports) {
+    if (typeof Response === 'function' && module instanceof Response) {
+        if (typeof WebAssembly.instantiateStreaming === 'function') {
+            try {
+                return await WebAssembly.instantiateStreaming(module, imports);
+            } catch (e) {
+                const validResponse = module.ok && EXPECTED_RESPONSE_TYPES.has(module.type);
+                if (validResponse && module.headers.get('Content-Type') !== 'application/wasm') {
+                    console.warn("`WebAssembly.instantiateStreaming` failed because your server does not serve Wasm with `application/wasm` MIME type. Falling back to `WebAssembly.instantiate` which is slower. Original error:\n", e);
+                } else {
+                    throw e;
+                }
+            }
+        }
+        const bytes = await module.arrayBuffer();
+        return await WebAssembly.instantiate(bytes, imports);
+    } else {
+        const instance = await WebAssembly.instantiate(module, imports);
+        if (instance instanceof WebAssembly.Instance) {
+            return { instance, module };
+        } else {
+            return instance;
+        }
+    }
+}
+function __wbg_get_imports() {
+    const imports = {};
+    imports.wbg = {};
+    imports.wbg.__wbg___wbindgen_throw_dd24417ed36fc46e = function(arg0, arg1) {
+        throw new Error(getStringFromWasm0(arg0, arg1));
+    };
+    imports.wbg.__wbindgen_init_externref_table = function() {
+        const table = wasm.__wbindgen_externrefs;
+        const offset = table.grow(4);
+        table.set(0, undefined);
+        table.set(offset + 0, undefined);
+        table.set(offset + 1, null);
+        table.set(offset + 2, true);
+        table.set(offset + 3, false);
+    };
+    return imports;
+}
+function __wbg_finalize_init(instance, module) {
+    wasm = instance.exports;
+    __wbg_init.__wbindgen_wasm_module = module;
+    cachedUint32ArrayMemory0 = null;
+    cachedUint8ArrayMemory0 = null;
+    wasm.__wbindgen_start();
+    return wasm;
+}
+function initSync(module) {
+    if (wasm !== undefined) return wasm;
+    if (typeof module !== 'undefined') {
+        if (Object.getPrototypeOf(module) === Object.prototype) {
+            ({module} = module)
+        } else {
+            console.warn('using deprecated parameters for `initSync()`; pass a single object instead')
+        }
+    }
+    const imports = __wbg_get_imports();
+    if (!(module instanceof WebAssembly.Module)) {
+        module = new WebAssembly.Module(module);
+    }
+    const instance = new WebAssembly.Instance(module, imports);
+    return __wbg_finalize_init(instance, module);
+}
+async function __wbg_init(module_or_path) {
+    if (wasm !== undefined) return wasm;
+    if (typeof module_or_path !== 'undefined') {
+        if (Object.getPrototypeOf(module_or_path) === Object.prototype) {
+            ({module_or_path} = module_or_path)
+        } else {
+            console.warn('using deprecated parameters for the initialization function; pass a single object instead')
+        }
+    }
+    if (typeof module_or_path === 'undefined') {
+        module_or_path = new URL('chonkiejs_chunk_bg.wasm', import.meta.url);
+    }
+    const imports = __wbg_get_imports();
+    if (typeof module_or_path === 'string' || (typeof Request === 'function' && module_or_path instanceof Request) || (typeof URL === 'function' && module_or_path instanceof URL)) {
+        module_or_path = fetch(module_or_path);
+    }
+    const { instance, module } = await __wbg_load(await module_or_path, imports);
+    return __wbg_finalize_init(instance, module);
+}
+export { initSync };
+export default __wbg_init;

package/pkg/chonkiejs_chunk_bg.wasm ADDED Viewed

Binary file

package/pkg/chonkiejs_chunk_bg.wasm.d.ts ADDED Viewed

@@ -0,0 +1,20 @@
+/* tslint:disable */
+/* eslint-disable */
+// Type declarations for the raw exports of chonkiejs_chunk_bg.wasm. The
+// entries match the members of the `InitOutput` interface declared in
+// chonkiejs_chunk.d.ts: buffers are passed as numeric pointer/length
+// arguments rather than typed arrays.
+export const memory: WebAssembly.Memory;
+export const __wbg_chunker_free: (a: number, b: number) => void;
+export const chunk_offsets: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => [number, number];
+export const chunk_offsets_pattern: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => [number, number];
+export const chunker_collect_offsets: (a: number) => [number, number];
+export const chunker_new: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => number;
+export const chunker_next: (a: number) => [number, number];
+export const chunker_reset: (a: number) => void;
+export const chunker_with_pattern: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => number;
+export const default_delimiters: () => [number, number];
+export const default_target_size: () => number;
+export const find_merge_indices: (a: number, b: number, c: number) => [number, number];
+export const split_offsets: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => [number, number];
+export const __wbindgen_externrefs: WebAssembly.Table;
+export const __wbindgen_malloc: (a: number, b: number) => number;
+export const __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
+export const __wbindgen_free: (a: number, b: number, c: number) => void;
+export const __wbindgen_start: () => void;

package/pkg/package.json ADDED Viewed

@@ -0,0 +1,16 @@
+{
+  "name": "chonkiejs-chunk",
+  "type": "module",
+  "version": "0.9.1",
+  "license": "MIT OR Apache-2.0",
+  "files": [
+    "chonkiejs_chunk_bg.wasm",
+    "chonkiejs_chunk.js",
+    "chonkiejs_chunk.d.ts"
+  ],
+  "main": "chonkiejs_chunk.js",
+  "types": "chonkiejs_chunk.d.ts",
+  "sideEffects": [
+    "./snippets/*"
+  ]
+}