@chonkiejs/chunk 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +61 -0
- package/package.json +1 -1
package/index.js
CHANGED
|
@@ -26,6 +26,7 @@ import initWasm, {
|
|
|
26
26
|
default_delimiters,
|
|
27
27
|
chunk_offsets as wasmChunkOffsets,
|
|
28
28
|
chunk_offsets_pattern as wasmChunkOffsetsPattern,
|
|
29
|
+
split_offsets as wasmSplitOffsets,
|
|
29
30
|
} from './pkg/chonkiejs_chunk.js';
|
|
30
31
|
|
|
31
32
|
export { default_target_size, default_delimiters };
|
|
@@ -120,6 +121,66 @@ export function chunk_offsets(text, options = {}) {
|
|
|
120
121
|
return pairs;
|
|
121
122
|
}
|
|
122
123
|
|
|
124
|
+
/**
 * Lazily split text at each delimiter occurrence.
 *
 * chunk() builds size-based chunks; this generator instead cuts at
 * **every** delimiter it finds.
 *
 * @param {string | Uint8Array} text - The text to split
 * @param {Object} [options] - Options
 * @param {string} [options.delimiters="\n.?"] - Delimiter characters
 * @param {string} [options.includeDelim="prev"] - Where to attach delimiter: "prev", "next", or "none"
 * @param {number} [options.minChars=0] - Minimum characters per segment. Shorter segments are merged.
 * @yields {string | Uint8Array} Segments (same type as input)
 *
 * @example
 * // A string in means strings out
 * for (const segment of split("Hello. World. Test.", { delimiters: "." })) {
 *   console.log(segment); // "Hello.", " World.", " Test."
 * }
 */
export function* split(text, options = {}) {
  const input = toBytes(text);
  const wantsString = typeof text === 'string';

  // The WASM call hands back a flat sequence: start0, end0, start1, end1, ...
  const offsets = wasmSplitOffsets(
    input,
    options.delimiters,
    options.includeDelim,
    options.minChars,
  );

  let cursor = 0;
  while (cursor < offsets.length) {
    const start = offsets[cursor];
    const end = offsets[cursor + 1];
    cursor += 2;

    // subarray() produces a view over the input bytes.
    const segment = input.subarray(start, end);
    if (wantsString) {
      yield decoder.decode(segment);
    } else {
      yield segment;
    }
  }
}
|
|
154
|
+
|
|
155
|
+
/**
 * Compute split offsets only; no views over the input are created.
 *
 * chunk_offsets() builds size-based chunks; this function instead cuts at
 * **every** delimiter it finds.
 *
 * @param {string | Uint8Array} text - The text to split
 * @param {Object} [options] - Options
 * @param {string} [options.delimiters="\n.?"] - Delimiter characters
 * @param {string} [options.includeDelim="prev"] - Where to attach delimiter: "prev", "next", or "none"
 * @param {number} [options.minChars=0] - Minimum characters per segment. Shorter segments are merged.
 * @returns {Array<[number, number]>} Array of [start, end] byte offset pairs
 *
 * @example
 * const offsets = split_offsets("Hello. World.", { delimiters: "." });
 * // [[0, 6], [6, 13]]
 */
export function split_offsets(text, options = {}) {
  const input = toBytes(text);

  // The WASM call hands back a flat sequence: start0, end0, start1, end1, ...
  const offsets = wasmSplitOffsets(
    input,
    options.delimiters,
    options.includeDelim,
    options.minChars,
  );

  // Regroup the flat sequence into [start, end] tuples, one per two entries.
  const pairCount = Math.ceil(offsets.length / 2);
  return Array.from({ length: pairCount }, (_, k) => [
    offsets[2 * k],
    offsets[2 * k + 1],
  ]);
}
|
|
183
|
+
|
|
123
184
|
let initialized = false;
|
|
124
185
|
|
|
125
186
|
/**
|