@chonkiejs/chunk 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/index.js +35 -0
  2. package/package.json +1 -1
package/index.js CHANGED
@@ -27,6 +27,7 @@ import initWasm, {
27
27
  chunk_offsets as wasmChunkOffsets,
28
28
  chunk_offsets_pattern as wasmChunkOffsetsPattern,
29
29
  split_offsets as wasmSplitOffsets,
30
+ merge_splits as wasmMergeSplits,
30
31
  } from './pkg/chonkiejs_chunk.js';
31
32
 
32
33
  export { default_target_size, default_delimiters };
@@ -181,6 +182,40 @@ export function split_offsets(text, options = {}) {
181
182
  return pairs;
182
183
  }
183
184
 
185
/**
 * Group segments into chunks according to their token counts, keeping each
 * merged chunk within a token budget.
 *
 * JavaScript counterpart of Chonkie's Cython `_merge_splits` function; the
 * RecursiveChunker uses it to combine small segments into larger chunks.
 * The merging itself is delegated to the WASM export; this wrapper only
 * reshapes the flat result into two parallel arrays.
 *
 * @param {number[] | Uint32Array} tokenCounts - Token count of each segment
 * @param {number} chunkSize - Maximum number of tokens allowed per merged chunk
 * @param {boolean} [combineWhitespace=false] - When true, each join costs +1 token for whitespace
 * @returns {{indices: number[], tokenCounts: number[]}} Parallel arrays: one
 *   index and one token count per merged chunk
 *
 * @example
 * const result = merge_splits([1, 1, 1, 1, 1, 1, 1], 3);
 * // result.indices = [3, 6, 7]
 * // result.tokenCounts = [3, 3, 1]
 *
 * @example
 * // With whitespace tokens
 * const result = merge_splits([1, 1, 1, 1, 1, 1, 1], 5, true);
 * // result.indices = [3, 6, 7]
 * // result.tokenCounts = [5, 5, 1] (3 tokens + 2 whitespace joins per chunk)
 */
export function merge_splits(tokenCounts, chunkSize, combineWhitespace = false) {
  // The WASM call yields one flat sequence of interleaved pairs:
  // [index0, count0, index1, count1, ...].
  const interleaved = wasmMergeSplits(tokenCounts, chunkSize, combineWhitespace);
  const merged = { indices: [], tokenCounts: [] };

  let cursor = 0;
  while (cursor < interleaved.length) {
    // Even slot -> chunk index, following odd slot -> that chunk's token count.
    merged.indices.push(interleaved[cursor]);
    merged.tokenCounts.push(interleaved[cursor + 1]);
    cursor += 2;
  }

  return merged;
}
218
+
184
219
  let initialized = false;
185
220
 
186
221
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@chonkiejs/chunk",
3
- "version": "0.6.0",
3
+ "version": "0.8.0",
4
4
  "description": "The fastest semantic text chunking library",
5
5
  "type": "module",
6
6
  "main": "index.js",