@chonkiejs/chunk 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +35 -0
- package/package.json +1 -1
package/index.js
CHANGED
|
@@ -27,6 +27,7 @@ import initWasm, {
|
|
|
27
27
|
chunk_offsets as wasmChunkOffsets,
|
|
28
28
|
chunk_offsets_pattern as wasmChunkOffsetsPattern,
|
|
29
29
|
split_offsets as wasmSplitOffsets,
|
|
30
|
+
merge_splits as wasmMergeSplits,
|
|
30
31
|
} from './pkg/chonkiejs_chunk.js';
|
|
31
32
|
|
|
32
33
|
export { default_target_size, default_delimiters };
|
|
@@ -181,6 +182,40 @@ export function split_offsets(text, options = {}) {
|
|
|
181
182
|
return pairs;
|
|
182
183
|
}
|
|
183
184
|
|
|
185
|
+
/**
 * Merge segments based on token counts, respecting chunk size limits.
 *
 * This is the equivalent of Chonkie's Cython `_merge_splits` function.
 * Used by RecursiveChunker to merge small segments into larger chunks
 * that fit within a token budget.
 *
 * The WASM binding returns one flat sequence of interleaved
 * `(index, tokenCount)` pairs; this wrapper splits it into two parallel
 * arrays of equal length.
 *
 * NOTE(review): presumably the WASM module must be initialized before this
 * is called — the initialization logic is outside this function; confirm.
 *
 * @param {number[] | Uint32Array} tokenCounts - Array of token counts for each segment
 * @param {number} chunkSize - Maximum tokens per merged chunk
 * @param {boolean} [combineWhitespace=false] - If true, adds +1 token per join for whitespace
 * @returns {{indices: number[], tokenCounts: number[]}} Parallel arrays with one
 *   entry per merged chunk. Per the examples below, each index is the exclusive
 *   end position (into the input `tokenCounts`) of a merged chunk, and the
 *   matching token count is that chunk's total.
 * @throws {Error} If the WASM result does not contain complete
 *   `(index, tokenCount)` pairs (i.e. has odd length).
 *
 * @example
 * const result = merge_splits([1, 1, 1, 1, 1, 1, 1], 3);
 * // result.indices = [3, 6, 7]
 * // result.tokenCounts = [3, 3, 1]
 *
 * @example
 * // With whitespace tokens
 * const result = merge_splits([1, 1, 1, 1, 1, 1, 1], 5, true);
 * // result.indices = [3, 6, 7]
 * // result.tokenCounts = [5, 5, 1] (3 tokens + 2 whitespace joins per chunk)
 */
export function merge_splits(tokenCounts, chunkSize, combineWhitespace = false) {
    const flat = wasmMergeSplits(tokenCounts, chunkSize, combineWhitespace);

    // Each merged chunk occupies two consecutive slots. An odd length means a
    // dangling index with no count, which would otherwise surface as an
    // `undefined` entry in `tokenCounts` — fail loudly instead.
    if (flat.length % 2 !== 0) {
        throw new Error(
            `merge_splits: expected interleaved (index, tokenCount) pairs, got odd length ${flat.length}`
        );
    }

    const indices = [];
    const counts = [];
    for (let i = 0; i < flat.length; i += 2) {
        indices.push(flat[i]);
        counts.push(flat[i + 1]);
    }
    return { indices, tokenCounts: counts };
}
|
|
218
|
+
|
|
184
219
|
// NOTE(review): presumably records whether the WASM module has been initialized;
// the code that reads/writes this flag is outside this view — confirm.
let initialized = false;
|
|
185
220
|
|
|
186
221
|
/**
|