@chonkiejs/chunk 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. package/index.js +61 -0
  2. package/package.json +1 -1
package/index.js CHANGED
@@ -26,6 +26,7 @@ import initWasm, {
26
26
  default_delimiters,
27
27
  chunk_offsets as wasmChunkOffsets,
28
28
  chunk_offsets_pattern as wasmChunkOffsetsPattern,
29
+ split_offsets as wasmSplitOffsets,
29
30
  } from './pkg/chonkiejs_chunk.js';
30
31
 
31
32
  export { default_target_size, default_delimiters };
@@ -120,6 +121,66 @@ export function chunk_offsets(text, options = {}) {
120
121
  return pairs;
121
122
  }
122
123
 
124
+ /**
125
+ * Split text at every delimiter occurrence.
126
+ * Unlike chunk() which creates size-based chunks, this splits at
127
+ * **every** delimiter occurrence.
128
+ *
129
+ * @param {string | Uint8Array} text - The text to split
130
+ * @param {Object} [options] - Options
131
+ * @param {string} [options.delimiters="\n.?"] - Delimiter characters
132
+ * @param {string} [options.includeDelim="prev"] - Where to attach delimiter: "prev", "next", or "none"
133
+ * @param {number} [options.minChars=0] - Minimum characters per segment. Shorter segments are merged.
134
+ * @yields {string | Uint8Array} Segments (same type as input)
135
+ *
136
+ * @example
137
+ * // String input returns strings
138
+ * for (const segment of split("Hello. World. Test.", { delimiters: "." })) {
139
+ * console.log(segment); // "Hello.", " World.", " Test."
140
+ * }
141
+ */
142
+ export function* split(text, options = {}) {
143
+ const isString = typeof text === 'string';
144
+ const bytes = toBytes(text);
145
+ const { delimiters, includeDelim, minChars } = options;
146
+
147
+ const flat = wasmSplitOffsets(bytes, delimiters, includeDelim, minChars);
148
+
149
+ for (let i = 0; i < flat.length; i += 2) {
150
+ const slice = bytes.subarray(flat[i], flat[i + 1]);
151
+ yield isString ? decoder.decode(slice) : slice;
152
+ }
153
+ }
154
+
155
+ /**
156
+ * Get split offsets without creating views.
157
+ * Unlike chunk_offsets() which creates size-based chunks, this splits at
158
+ * **every** delimiter occurrence.
159
+ *
160
+ * @param {string | Uint8Array} text - The text to split
161
+ * @param {Object} [options] - Options
162
+ * @param {string} [options.delimiters="\n.?"] - Delimiter characters
163
+ * @param {string} [options.includeDelim="prev"] - Where to attach delimiter: "prev", "next", or "none"
164
+ * @param {number} [options.minChars=0] - Minimum characters per segment. Shorter segments are merged.
165
+ * @returns {Array<[number, number]>} Array of [start, end] byte offset pairs
166
+ *
167
+ * @example
168
+ * const offsets = split_offsets("Hello. World.", { delimiters: "." });
169
+ * // [[0, 6], [6, 13]]
170
+ */
171
+ export function split_offsets(text, options = {}) {
172
+ const bytes = toBytes(text);
173
+ const { delimiters, includeDelim, minChars } = options;
174
+
175
+ const flat = wasmSplitOffsets(bytes, delimiters, includeDelim, minChars);
176
+
177
+ const pairs = [];
178
+ for (let i = 0; i < flat.length; i += 2) {
179
+ pairs.push([flat[i], flat[i + 1]]);
180
+ }
181
+ return pairs;
182
+ }
183
+
123
184
  let initialized = false;
124
185
 
125
186
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@chonkiejs/chunk",
3
- "version": "0.5.0",
3
+ "version": "0.6.0",
4
4
  "description": "The fastest semantic text chunking library",
5
5
  "type": "module",
6
6
  "main": "index.js",