npm - memchunk - Versions diffs - 0.2.0 → 0.3.0 - Mend

memchunk 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/README.md CHANGED

@@ -40,33 +40,43 @@ import { init, chunk } from 'memchunk';
 // initialize wasm (required once)
 await init();
-const text = new TextEncoder().encode("Hello world. How are you? I'm fine.\nThanks for asking.");
+const text = "Hello world. How are you? I'm fine.\nThanks for asking.";
 // with defaults (4KB chunks, split at \n . ?)
 for (const slice of chunk(text)) {
-    console.log(new TextDecoder().decode(slice));
+    console.log(slice);
 }
 // with custom size
 for (const slice of chunk(text, { size: 1024 })) {
-    console.log(new TextDecoder().decode(slice));
+    console.log(slice);
 }
 // with custom delimiters
 for (const slice of chunk(text, { delimiters: ".?!\n" })) {
-    console.log(new TextDecoder().decode(slice));
+    console.log(slice);
 }
-// with both
-for (const slice of chunk(text, { size: 8192, delimiters: "\n" })) {
-    console.log(new TextDecoder().decode(slice));
+// with multi-byte pattern (e.g., metaspace ▁ for SentencePiece tokenizers)
+for (const slice of chunk(text, { pattern: "▁", prefix: true })) {
+    console.log(slice);
+}
+// with consecutive pattern handling (split at START of runs, not middle)
+for (const slice of chunk("word   next", { pattern: " ", consecutive: true })) {
+    console.log(slice);
+}
+// with forward fallback (search forward if no pattern in backward window)
+for (const slice of chunk(text, { pattern: " ", forwardFallback: true })) {
+    console.log(slice);
 }
 // collect all chunks
 const chunks = [...chunk(text)];
 ```
-chunks are returned as `Uint8Array` subarrays (zero-copy views of the original text).
+pass strings and get strings back. for zero-copy performance with binary data, pass `Uint8Array` and you'll get `Uint8Array` views back.
 ## 📝 citation

package/index.js CHANGED

@@ -7,9 +7,15 @@
  *
  * await init();
  *
- * const text = new TextEncoder().encode("Hello. World. Test.");
- * for (const slice of chunk(text, { size: 10, delimiters: "." })) {
- *     console.log(new TextDecoder().decode(slice));
+ * // Simple string API - strings in, strings out
+ * for (const slice of chunk("Hello. World. Test.", { size: 10 })) {
+ *     console.log(slice);
+ * }
+ *
+ * // Or use bytes for zero-copy performance
+ * const bytes = new TextEncoder().encode("Hello. World.");
+ * for (const slice of chunk(bytes, { size: 10 })) {
+ *     console.log(slice); // Uint8Array
  * }
  * ```
  */
@@ -19,31 +25,65 @@ import initWasm, {
     default_target_size,
     default_delimiters,
     chunk_offsets as wasmChunkOffsets,
+    chunk_offsets_pattern as wasmChunkOffsetsPattern,
 } from './pkg/memchunk_wasm.js';
 export { default_target_size, default_delimiters };
+const encoder = new TextEncoder();
+const decoder = new TextDecoder();
+/**
+ * Convert input to bytes if it's a string.
+ * @param {string | Uint8Array} input
+ * @returns {Uint8Array}
+ */
+function toBytes(input) {
+    return typeof input === 'string' ? encoder.encode(input) : input;
+}
 /**
  * Split text into chunks at delimiter boundaries.
- * Returns an iterator of zero-copy Uint8Array subarray views.
+ * Accepts strings or Uint8Array. Returns the same type as input.
  *
- * @param {Uint8Array} text - The text to chunk as bytes
+ * @param {string | Uint8Array} text - The text to chunk
  * @param {Object} [options] - Options
  * @param {number} [options.size=4096] - Target chunk size in bytes
  * @param {string} [options.delimiters="\n.?"] - Delimiter characters
- * @yields {Uint8Array} Zero-copy subarray views of the original text
+ * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on
+ * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
+ * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
+ * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window
+ * @yields {string | Uint8Array} Chunks (same type as input)
+ *
+ * @example
+ * // String input returns strings
+ * for (const slice of chunk("Hello. World.", { size: 10 })) {
+ *     console.log(slice);
+ * }
  *
  * @example
- * const text = new TextEncoder().encode("Hello. World. Test.");
- * for (const slice of chunk(text, { size: 10, delimiters: "." })) {
- *     console.log(new TextDecoder().decode(slice));
+ * // With pattern (e.g., metaspace for SentencePiece)
+ * for (const slice of chunk("Hello▁World▁Test", { pattern: "▁", prefix: true })) {
+ *     console.log(slice);
  * }
  */
 export function* chunk(text, options = {}) {
-    const { size, delimiters } = options;
-    const flat = wasmChunkOffsets(text, size, delimiters);
+    const isString = typeof text === 'string';
+    const bytes = toBytes(text);
+    const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;
+    let flat;
+    if (pattern) {
+        const patternBytes = toBytes(pattern);
+        flat = wasmChunkOffsetsPattern(bytes, size ?? 4096, patternBytes, prefix, consecutive, forwardFallback);
+    } else {
+        flat = wasmChunkOffsets(bytes, size, delimiters, prefix);
+    }
     for (let i = 0; i < flat.length; i += 2) {
-        yield text.subarray(flat[i], flat[i + 1]);
+        const slice = bytes.subarray(flat[i], flat[i + 1]);
+        yield isString ? decoder.decode(slice) : slice;
     }
 }
@@ -51,15 +91,28 @@ export function* chunk(text, options = {}) {
  * Get chunk offsets without creating views.
  * Returns an array of [start, end] offset pairs.
  *
- * @param {Uint8Array} text - The text to chunk as bytes
+ * @param {string | Uint8Array} text - The text to chunk
  * @param {Object} [options] - Options
  * @param {number} [options.size=4096] - Target chunk size in bytes
  * @param {string} [options.delimiters="\n.?"] - Delimiter characters
- * @returns {Array<[number, number]>} Array of [start, end] offset pairs
+ * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on
+ * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
+ * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
+ * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window
+ * @returns {Array<[number, number]>} Array of [start, end] byte offset pairs
  */
 export function chunk_offsets(text, options = {}) {
-    const { size, delimiters } = options;
-    const flat = wasmChunkOffsets(text, size, delimiters);
+    const bytes = toBytes(text);
+    const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;
+    let flat;
+    if (pattern) {
+        const patternBytes = toBytes(pattern);
+        flat = wasmChunkOffsetsPattern(bytes, size ?? 4096, patternBytes, prefix, consecutive, forwardFallback);
+    } else {
+        flat = wasmChunkOffsets(bytes, size, delimiters, prefix);
+    }
     const pairs = [];
     for (let i = 0; i < flat.length; i += 2) {
         pairs.push([flat[i], flat[i + 1]]);
@@ -82,26 +135,54 @@ export async function init() {
 /**
  * Chunker splits text at delimiter boundaries.
  * Implements Symbol.iterator for use in for...of loops.
+ *
+ * @example
+ * // String input
+ * const chunker = new Chunker("Hello. World. Test.", { size: 10 });
+ * for (const slice of chunker) {
+ *     console.log(slice); // strings
+ * }
+ *
+ * @example
+ * // With pattern
+ * const chunker = new Chunker("Hello▁World", { pattern: "▁", prefix: true });
+ * for (const slice of chunker) {
+ *     console.log(slice);
+ * }
  */
 export class Chunker {
     /**
      * Create a new Chunker.
-     * @param {Uint8Array} text - The text to chunk as bytes
+     * @param {string | Uint8Array} text - The text to chunk
      * @param {Object} [options] - Options
      * @param {number} [options.size=4096] - Target chunk size in bytes
      * @param {string} [options.delimiters="\n.?"] - Delimiter characters
+     * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on
+     * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
+     * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
+     * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window
      */
     constructor(text, options = {}) {
-        const { size, delimiters } = options;
-        this._chunker = new WasmChunker(text, size, delimiters);
+        this._isString = typeof text === 'string';
+        const bytes = toBytes(text);
+        const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;
+        if (pattern) {
+            const patternBytes = toBytes(pattern);
+            this._chunker = WasmChunker.with_pattern(bytes, size ?? 4096, patternBytes, prefix, consecutive, forwardFallback);
+        } else {
+            this._chunker = new WasmChunker(bytes, size, delimiters, prefix);
+        }
     }
     /**
      * Get the next chunk, or undefined if exhausted.
-     * @returns {Uint8Array | undefined}
+     * @returns {string | Uint8Array | undefined}
      */
     next() {
-        return this._chunker.next();
+        const chunk = this._chunker.next();
+        if (chunk === undefined) return undefined;
+        return this._isString ? decoder.decode(chunk) : chunk;
     }
     /**
@@ -138,7 +219,7 @@ export class Chunker {
     *[Symbol.iterator]() {
         let chunk;
         while ((chunk = this._chunker.next()) !== undefined) {
-            yield chunk;
+            yield this._isString ? decoder.decode(chunk) : chunk;
         }
     }
 }

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "memchunk",
-  "version": "0.2.0",
+  "version": "0.3.0",
   "description": "The fastest semantic text chunking library",
   "type": "module",
   "main": "index.js",