ralph-hero-knowledge-index 0.1.20 → 0.1.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/.mcp.json +1 -1
- package/dist/chunker.d.ts +37 -0
- package/dist/chunker.js +219 -0
- package/dist/chunker.js.map +1 -0
- package/package.json +1 -1
- package/src/__tests__/chunker.test.ts +246 -0
- package/src/chunker.ts +279 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ralph-knowledge",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.21",
|
|
4
4
|
"description": "Knowledge graph for ralph-hero: semantic search, relationship traversal, and document indexing across thoughts/ documents. Optional companion to ralph-hero.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Chad Dubiel",
|
package/.mcp.json
CHANGED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RecursiveCharacterTextSplitter-style chunker.
|
|
3
|
+
*
|
|
4
|
+
* Splits long text into overlapping chunks while preserving the original
|
|
5
|
+
* character offsets (charStart, charEnd) so downstream code can reconstruct
|
|
6
|
+
* positions. Mirrors the semantics of LangChain's RecursiveCharacterTextSplitter:
|
|
7
|
+
* tries each separator in order, snapping chunk boundaries to the highest-priority
|
|
8
|
+
* separator that keeps pieces under chunkSize.
|
|
9
|
+
*
|
|
10
|
+
* Defaults correspond to 512-token chunks with 64-token overlap
|
|
11
|
+
* (approx: 1 token ~= 4 chars for English text).
|
|
12
|
+
*/
|
|
13
|
+
/** A contiguous slice of the source text produced by `chunkText`. */
export interface Chunk {
    // Zero-based position of this chunk in the returned array.
    index: number;
    // Exact substring of the source: text.slice(charStart, charEnd).
    content: string;
    // Inclusive start offset into the original text (JS string index).
    charStart: number;
    // Exclusive end offset into the original text.
    charEnd: number;
}
/** Optional tuning knobs for `chunkText`. */
export interface ChunkerOptions {
    // Maximum chunk length in characters (default 2048 in chunker.js).
    chunkSize?: number;
    // Approximate character overlap between consecutive chunks (default 256).
    chunkOverlap?: number;
    // Separators tried in priority order; "" enables character splitting.
    separators?: string[];
}
/**
 * Split `text` into overlapping chunks.
 *
 * Semantics:
 * - Empty input -> empty array.
 * - Short input (<= chunkSize) -> single chunk covering the whole text.
 * - For each chunk, `text.slice(charStart, charEnd) === content`.
 * - `charStart` is monotonically non-decreasing across chunks.
 * - Consecutive chunks overlap by ~`chunkOverlap` chars (snapped to atom
 *   boundaries; may differ by up to the largest atom size).
 * - Each chunk's content length is bounded by chunkSize + a small slack for
 *   the separator that snapped the boundary.
 */
export declare function chunkText(text: string, opts?: ChunkerOptions): Chunk[];
|
package/dist/chunker.js
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RecursiveCharacterTextSplitter-style chunker.
|
|
3
|
+
*
|
|
4
|
+
* Splits long text into overlapping chunks while preserving the original
|
|
5
|
+
* character offsets (charStart, charEnd) so downstream code can reconstruct
|
|
6
|
+
* positions. Mirrors the semantics of LangChain's RecursiveCharacterTextSplitter:
|
|
7
|
+
* tries each separator in order, snapping chunk boundaries to the highest-priority
|
|
8
|
+
* separator that keeps pieces under chunkSize.
|
|
9
|
+
*
|
|
10
|
+
* Defaults correspond to 512-token chunks with 64-token overlap
|
|
11
|
+
* (approx: 1 token ~= 4 chars for English text).
|
|
12
|
+
*/
|
|
13
|
+
// ~512 tokens at ~4 chars per token (see module header).
const DEFAULT_CHUNK_SIZE = 2048;
// ~64 tokens of overlap between consecutive chunks.
const DEFAULT_CHUNK_OVERLAP = 256;
// Tried in priority order; "" is the character-split last resort.
const DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", " ", ""];
|
|
16
|
+
/**
|
|
17
|
+
* Pick the first separator from `separators` that occurs in `piece.text`.
|
|
18
|
+
* Falls back to the last separator (typically `""`) if none match — this is
|
|
19
|
+
* the sentinel that lets us character-split oversized pieces with no natural
|
|
20
|
+
* boundary.
|
|
21
|
+
*/
|
|
22
|
+
/**
 * Choose the highest-priority separator that occurs in `pieceText`.
 * The empty-string separator always matches — it is the sentinel that lets
 * callers character-split oversized pieces with no natural boundary.
 * Returns the matched separator plus the lower-priority separators left to try.
 */
function pickSeparator(pieceText, separators) {
    const matchIdx = separators.findIndex(
        (sep) => sep === "" || pieceText.includes(sep)
    );
    if (matchIdx === -1) {
        // Unreachable with DEFAULT_SEPARATORS, which ends in "".
        return { separator: "", remaining: [] };
    }
    return {
        separator: separators[matchIdx],
        remaining: separators.slice(matchIdx + 1),
    };
}
|
|
35
|
+
/**
|
|
36
|
+
* Split a piece on `separator` while retaining absolute char offsets.
|
|
37
|
+
* When separator is empty, split into single-character pieces.
|
|
38
|
+
*/
|
|
39
|
+
/**
 * Split a piece on `separator`, preserving absolute character offsets.
 * The separator stays glued to the piece that precedes it so that
 * `text.slice(charStart, charEnd)` reconstructs content bit-for-bit.
 * An empty separator means "split into individual characters".
 */
function splitOnSeparator(piece, separator) {
    const pieces = [];
    if (separator === "") {
        // Character-split sentinel: one piece per character.
        for (let offset = 0; offset < piece.text.length; offset++) {
            pieces.push({ text: piece.text[offset], start: piece.start + offset });
        }
        return pieces;
    }
    let from = 0;
    for (;;) {
        const hit = piece.text.indexOf(separator, from);
        if (hit === -1) {
            break;
        }
        const end = hit + separator.length;
        pieces.push({
            text: piece.text.slice(from, end),
            start: piece.start + from,
        });
        from = end;
    }
    // Trailing remainder after the last separator occurrence, if any.
    if (from < piece.text.length) {
        pieces.push({
            text: piece.text.slice(from),
            start: piece.start + from,
        });
    }
    return pieces;
}
|
|
69
|
+
/**
|
|
70
|
+
* Recursively flatten a piece into "atoms" — pieces small enough to merge
|
|
71
|
+
* greedily into chunks. Pieces larger than chunkSize are split with the next
|
|
72
|
+
* separator in line; pieces that fit are returned as-is.
|
|
73
|
+
*/
|
|
74
|
+
/**
 * Recursively flatten a piece into "atoms": pieces small enough to merge
 * greedily into chunks. An oversized piece is divided with the best-matching
 * separator; any sub-piece still over chunkSize recurses on the remaining
 * (lower-priority) separators, bottoming out at a character split.
 */
function flattenToAtoms(piece, separators, chunkSize) {
    // Small enough already — this piece is itself an atom.
    if (piece.text.length <= chunkSize) {
        return [piece];
    }
    const { separator, remaining } = pickSeparator(piece.text, separators);
    const splits = splitOnSeparator(piece, separator);
    if (splits.length <= 1) {
        // The chosen separator did not divide the piece. Try lower-priority
        // separators, or give up and return the oversized piece as-is.
        return remaining.length === 0
            ? [piece]
            : flattenToAtoms(piece, remaining, chunkSize);
    }
    const atoms = [];
    for (const sub of splits) {
        if (sub.text.length <= chunkSize) {
            atoms.push(sub);
        }
        else if (remaining.length > 0) {
            atoms.push(...flattenToAtoms(sub, remaining, chunkSize));
        }
        else {
            // Out of separators: character-split so no atom exceeds chunkSize.
            atoms.push(...splitOnSeparator(sub, ""));
        }
    }
    return atoms;
}
|
|
107
|
+
/**
|
|
108
|
+
* Build a chunk object from a contiguous run of atoms.
|
|
109
|
+
* `charStart` is taken from the first atom, `charEnd` from the last atom's
|
|
110
|
+
* end boundary, and `content` is `text.slice(start, end)` — this guarantees
|
|
111
|
+
* `text.slice(charStart, charEnd) === content`.
|
|
112
|
+
*/
|
|
113
|
+
/**
 * Build a chunk object from a contiguous run of atoms.
 * `charStart` comes from the first atom and `charEnd` from the last atom's
 * end boundary. Slicing the original text (rather than joining atom texts)
 * guarantees `text.slice(charStart, charEnd) === content`.
 */
function buildChunk(originalText, atoms, index) {
    const charStart = atoms[0].start;
    const lastAtom = atoms[atoms.length - 1];
    const charEnd = lastAtom.start + lastAtom.text.length;
    return {
        index,
        content: originalText.slice(charStart, charEnd),
        charStart,
        charEnd,
    };
}
|
|
125
|
+
/**
|
|
126
|
+
* Compute the start position for the next chunk's atoms given the previous
|
|
127
|
+
* chunk ended at `prevEnd`. We walk backward through the atom list to find
|
|
128
|
+
* the atom whose start >= prevEnd - chunkOverlap; that atom begins the
|
|
129
|
+
* overlap region.
|
|
130
|
+
*/
|
|
131
|
+
/**
 * Compute the atom index where the next chunk should begin, given the
 * previous chunk ended at character offset `prevEnd` and used atoms up to
 * `lastEndAtomIndex`. Walks backward to the earliest atom whose start lies
 * inside the overlap window [prevEnd - chunkOverlap, prevEnd); when no atom
 * qualifies (or overlap is disabled) the next chunk starts fresh after the
 * previous chunk's last atom.
 */
function findOverlapStartIndex(atoms, lastEndAtomIndex, prevEnd, chunkOverlap) {
    const noOverlap = lastEndAtomIndex + 1;
    if (chunkOverlap <= 0) {
        return noOverlap;
    }
    const targetStart = prevEnd - chunkOverlap;
    // Scan backward while atoms still begin inside the overlap window; the
    // last (earliest) index visited that satisfies start >= targetStart wins.
    let idx = noOverlap;
    for (let i = lastEndAtomIndex; i >= 0 && atoms[i].start >= targetStart; i--) {
        idx = i;
    }
    return idx;
}
|
|
153
|
+
/**
|
|
154
|
+
* Split `text` into overlapping chunks.
|
|
155
|
+
*
|
|
156
|
+
* Semantics:
|
|
157
|
+
* - Empty input -> empty array.
|
|
158
|
+
* - Short input (<= chunkSize) -> single chunk covering the whole text.
|
|
159
|
+
* - For each chunk, `text.slice(charStart, charEnd) === content`.
|
|
160
|
+
* - `charStart` is monotonically non-decreasing across chunks.
|
|
161
|
+
* - Consecutive chunks overlap by ~`chunkOverlap` chars (snapped to atom
|
|
162
|
+
* boundaries; may differ by up to the largest atom size).
|
|
163
|
+
* - Each chunk's content length is bounded by chunkSize + a small slack for
|
|
164
|
+
* the separator that snapped the boundary.
|
|
165
|
+
*/
|
|
166
|
+
/**
 * Split `text` into overlapping chunks.
 *
 * Semantics:
 * - Empty input -> empty array.
 * - Short input (<= chunkSize) -> single chunk covering the whole text.
 * - For each chunk, `text.slice(charStart, charEnd) === content`.
 * - `charStart` is strictly increasing across chunks, so the walk always
 *   terminates and every chunk is distinct.
 * - Consecutive chunks overlap by ~`chunkOverlap` chars (snapped to atom
 *   boundaries; may differ by up to the largest atom size).
 * - Each chunk's content length is bounded by chunkSize + a small slack for
 *   the separator that snapped the boundary.
 *
 * Throws when chunkOverlap >= chunkSize (the overlap window would never
 * allow forward progress).
 */
export function chunkText(text, opts = {}) {
    const chunkSize = opts.chunkSize ?? DEFAULT_CHUNK_SIZE;
    const chunkOverlap = opts.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
    const separators = opts.separators ?? DEFAULT_SEPARATORS;
    if (text.length === 0) {
        return [];
    }
    if (chunkOverlap >= chunkSize) {
        throw new Error(`chunker: chunkOverlap (${chunkOverlap}) must be smaller than chunkSize (${chunkSize})`);
    }
    // Fast path for short docs — no need to walk separators.
    if (text.length <= chunkSize) {
        return [
            {
                index: 0,
                content: text,
                charStart: 0,
                charEnd: text.length,
            },
        ];
    }
    const atoms = flattenToAtoms({ text, start: 0 }, separators, chunkSize);
    const chunks = [];
    let chunkIdx = 0;
    let i = 0;
    while (i < atoms.length) {
        // Greedily pack atoms into this chunk until adding one more would push
        // the chunk content past chunkSize.
        let runLen = 0;
        let j = i;
        while (j < atoms.length) {
            const atomLen = atoms[j].text.length;
            // Always include at least one atom per chunk to ensure progress.
            if (j > i && runLen + atomLen > chunkSize) {
                break;
            }
            runLen += atomLen;
            j++;
        }
        const runAtoms = atoms.slice(i, j);
        const chunk = buildChunk(text, runAtoms, chunkIdx);
        chunks.push(chunk);
        chunkIdx++;
        if (j >= atoms.length) {
            break;
        }
        // Compute overlap start for the next chunk.
        const lastEndAtomIdx = j - 1;
        const nextStart = findOverlapStartIndex(atoms, lastEndAtomIdx, chunk.charEnd, chunkOverlap);
        // BUG FIX: findOverlapStartIndex can return the index this chunk
        // already started at when the chunk is a single atom shorter than
        // chunkOverlap (e.g. a tiny atom followed by a near-chunkSize atom:
        // chunkSize=100, overlap=99, atom lengths [50, 60] looped forever).
        // Clamp the next start to at least one atom past this chunk's start
        // so the loop always makes forward progress.
        i = Math.max(nextStart, i + 1);
    }
    return chunks;
}
|
|
219
|
+
//# sourceMappingURL=chunker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.js","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAeH,MAAM,kBAAkB,GAAG,IAAI,CAAC;AAChC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAClC,MAAM,kBAAkB,GAAa,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC;AAWnE;;;;;GAKG;AACH,SAAS,aAAa,CAAC,SAAiB,EAAE,UAAoB;IAI5D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAE,CAAC;QAC3B,IAAI,GAAG,KAAK,EAAE,EAAE,CAAC;YACf,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE,SAAS,EAAE,UAAU,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;QAChE,CAAC;QACD,IAAI,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC5B,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE,SAAS,EAAE,UAAU,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;QAChE,CAAC;IACH,CAAC;IACD,8DAA8D;IAC9D,OAAO,EAAE,SAAS,EAAE,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;AAC1C,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB,CAAC,KAAY,EAAE,SAAiB;IACvD,IAAI,SAAS,KAAK,EAAE,EAAE,CAAC;QACrB,MAAM,GAAG,GAAY,EAAE,CAAC;QACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC,CAAE,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;QAC7D,CAAC;QACD,OAAO,GAAG,CAAC;IACb,CAAC;IAED,MAAM,GAAG,GAAY,EAAE,CAAC;IACxB,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,IAAI,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAChD,OAAO,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;QAClB,uEAAuE;QACvE,wDAAwD;QACxD,MAAM,QAAQ,GAAG,GAAG,GAAG,SAAS,CAAC,MAAM,CAAC;QACxC,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,QAAQ,CAAC;YACxC,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,MAAM;SAC5B,CAAC,CAAC;QACH,MAAM,GAAG,QAAQ,CAAC;QAClB,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAC9C,CAAC;IACD,IAAI,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QAC/B,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;YAC9B,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,MAAM;SAC5B,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;GAIG;AACH,SAAS,cAAc,C
ACrB,KAAY,EACZ,UAAoB,EACpB,SAAiB;IAEjB,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QACnC,OAAO,CAAC,KAAK,CAAC,CAAC;IACjB,CAAC;IACD,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,GAAG,aAAa,CAAC,KAAK,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;IACvE,MAAM,MAAM,GAAG,gBAAgB,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IAElD,2EAA2E;IAC3E,8DAA8D;IAC9D,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACvB,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,mEAAmE;YACnE,OAAO,CAAC,KAAK,CAAC,CAAC;QACjB,CAAC;QACD,OAAO,cAAc,CAAC,KAAK,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC;IACrD,CAAC;IAED,MAAM,GAAG,GAAY,EAAE,CAAC;IACxB,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QACzB,IAAI,GAAG,CAAC,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YACjC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChB,CAAC;aAAM,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChC,KAAK,MAAM,IAAI,IAAI,cAAc,CAAC,GAAG,EAAE,SAAS,EAAE,SAAS,CAAC,EAAE,CAAC;gBAC7D,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjB,CAAC;QACH,CAAC;aAAM,CAAC;YACN,mEAAmE;YACnE,qCAAqC;YACrC,GAAG,CAAC,IAAI,CAAC,GAAG,gBAAgB,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,CAAC;QACzC,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;GAKG;AACH,SAAS,UAAU,CACjB,YAAoB,EACpB,KAAc,EACd,KAAa;IAEb,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC;IACxB,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC;IACtC,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC;IAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC;IAC9C,OAAO;QACL,KAAK;QACL,OAAO,EAAE,YAAY,CAAC,KAAK,CAAC,SAAS,EAAE,OAAO,CAAC;QAC/C,SAAS;QACT,OAAO;KACR,CAAC;AACJ,CAAC;AAED;;;;;GAKG;AACH,SAAS,qBAAqB,CAC5B,KAAc,EACd,gBAAwB,EACxB,OAAe,EACf,YAAoB;IAEpB,IAAI,YAAY,IAAI,CAAC,EAAE,CAAC;QACtB,OAAO,gBAAgB,GAAG,CAAC,CAAC;IAC9B,CAAC;IACD,MAAM,WAAW,GAAG,OAAO,GAAG,YAAY,CAAC;IAC3C,8EAA8E;IAC9E,IAAI,cAAc,GAAG,gBAAgB,GAAG,CAAC,CAAC;IAC1C,KAAK,IAAI,CAAC,GAAG,gBAAgB,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,IAAI,KAAK,CAAC,CAAC,CAAE,CAAC,KAAK,IAAI,WAAW,EAAE,CAAC;YACnC,cAAc,GAAG,CAAC,CAAC;QACrB,CAAC;aAAM,CAAC;YACN,MAAM;QACR,CAAC;IACH,CAAC;IACD,0EAA0E;IAC1E,oBAAoB;IACpB,IAAI,cAAc,GAAG,gBAAgB,EAAE,CAAC;QACtC,cAAc,GAAG,gBAAgB,GAAG,CAAC
,CAAC;IACxC,CAAC;IACD,OAAO,cAAc,CAAC;AACxB,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,OAAuB,EAAE;IAC/D,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,kBAAkB,CAAC;IACvD,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,qBAAqB,CAAC;IAChE,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,IAAI,kBAAkB,CAAC;IAEzD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtB,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,IAAI,YAAY,IAAI,SAAS,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CACb,0BAA0B,YAAY,qCAAqC,SAAS,GAAG,CACxF,CAAC;IACJ,CAAC;IAED,yDAAyD;IACzD,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC7B,OAAO;YACL;gBACE,KAAK,EAAE,CAAC;gBACR,OAAO,EAAE,IAAI;gBACb,SAAS,EAAE,CAAC;gBACZ,OAAO,EAAE,IAAI,CAAC,MAAM;aACrB;SACF,CAAC;IACJ,CAAC;IAED,MAAM,KAAK,GAAG,cAAc,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;IAExE,MAAM,MAAM,GAAY,EAAE,CAAC;IAC3B,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,CAAC,GAAG,CAAC,CAAC;IAEV,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QACxB,uEAAuE;QACvE,oCAAoC;QACpC,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,IAAI,CAAC,GAAG,CAAC,CAAC;QACV,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,MAAM,CAAC;YACtC,iEAAiE;YACjE,IAAI,CAAC,GAAG,CAAC,IAAI,MAAM,GAAG,OAAO,GAAG,SAAS,EAAE,CAAC;gBAC1C,MAAM;YACR,CAAC;YACD,MAAM,IAAI,OAAO,CAAC;YAClB,CAAC,EAAE,CAAC;QACN,CAAC;QACD,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QACnC,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAC;QACnD,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACnB,QAAQ,EAAE,CAAC;QAEX,IAAI,CAAC,IAAI,KAAK,CAAC,MAAM,EAAE,CAAC;YACtB,MAAM;QACR,CAAC;QAED,4CAA4C;QAC5C,MAAM,cAAc,GAAG,CAAC,GAAG,CAAC,CAAC;QAC7B,MAAM,SAAS,GAAG,qBAAqB,CACrC,KAAK,EACL,cAAc,EACd,KAAK,CAAC,OAAO,EACb,YAAY,CACb,CAAC;QACF,CAAC,GAAG,SAAS,CAAC;IAChB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
package/package.json
CHANGED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { chunkText, type Chunk } from "../chunker.js";
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Test suite for the recursive character text splitter.
|
|
6
|
+
*
|
|
7
|
+
* Invariants asserted (all must hold for every chunk returned):
|
|
8
|
+
* - text.slice(charStart, charEnd) === content
|
|
9
|
+
* - charStart is monotonically non-decreasing
|
|
10
|
+
* - content.length <= chunkSize + (longest separator length)
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/**
 * Assert the chunker's core invariants for every chunk in `chunks`:
 * - `index` matches the chunk's array position
 * - `content` reconstructs bit-for-bit via text.slice(charStart, charEnd)
 * - `charStart` is monotonically non-decreasing
 * - `content.length <= chunkSize + separatorSlack`
 */
function assertCoreInvariants(
  text: string,
  chunks: Chunk[],
  chunkSize: number,
  // Allowance for the separator the chunker keeps attached to a piece.
  separatorSlack = 2,
): void {
  let prevStart = -1;
  for (let i = 0; i < chunks.length; i++) {
    const c = chunks[i]!;
    expect(c.index).toBe(i);
    expect(text.slice(c.charStart, c.charEnd)).toBe(c.content);
    expect(c.charStart).toBeGreaterThanOrEqual(prevStart);
    expect(c.content.length).toBeLessThanOrEqual(chunkSize + separatorSlack);
    prevStart = c.charStart;
  }
}
|
|
29
|
+
|
|
30
|
+
describe("chunkText — empty and short inputs", () => {
|
|
31
|
+
it("returns [] for empty string", () => {
|
|
32
|
+
expect(chunkText("")).toEqual([]);
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
it("returns a single chunk for a short doc", () => {
|
|
36
|
+
const text = "short doc";
|
|
37
|
+
const chunks = chunkText(text);
|
|
38
|
+
expect(chunks).toHaveLength(1);
|
|
39
|
+
expect(chunks[0]!.charStart).toBe(0);
|
|
40
|
+
expect(chunks[0]!.charEnd).toBe(text.length);
|
|
41
|
+
expect(chunks[0]!.content).toBe(text);
|
|
42
|
+
expect(chunks[0]!.index).toBe(0);
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
it("returns a single chunk at exactly chunkSize", () => {
|
|
46
|
+
const text = "x".repeat(2048);
|
|
47
|
+
const chunks = chunkText(text, { chunkSize: 2048 });
|
|
48
|
+
expect(chunks).toHaveLength(1);
|
|
49
|
+
expect(chunks[0]!.content).toBe(text);
|
|
50
|
+
});
|
|
51
|
+
|
|
52
|
+
it("returns a single chunk when text.length < chunkSize even with unusual separators", () => {
|
|
53
|
+
const chunks = chunkText("hello world", {
|
|
54
|
+
chunkSize: 100,
|
|
55
|
+
chunkOverlap: 10,
|
|
56
|
+
separators: ["##", "\n"],
|
|
57
|
+
});
|
|
58
|
+
expect(chunks).toHaveLength(1);
|
|
59
|
+
expect(chunks[0]!.content).toBe("hello world");
|
|
60
|
+
});
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
describe("chunkText — long documents", () => {
|
|
64
|
+
it("produces >= 4 chunks for an 8K-char paragraph-rich doc with chunkSize=2048", () => {
|
|
65
|
+
const paragraph = "The quick brown fox jumps over the lazy dog. ".repeat(20);
|
|
66
|
+
const text = Array.from({ length: 10 }, () => paragraph).join("\n\n");
|
|
67
|
+
expect(text.length).toBeGreaterThan(8000);
|
|
68
|
+
|
|
69
|
+
const chunks = chunkText(text, { chunkSize: 2048, chunkOverlap: 256 });
|
|
70
|
+
expect(chunks.length).toBeGreaterThanOrEqual(4);
|
|
71
|
+
assertCoreInvariants(text, chunks, 2048);
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
it("bounds content.length at chunkSize + separator slack", () => {
|
|
75
|
+
const text = Array.from({ length: 400 }, (_, i) => `Sentence ${i}.`).join(" ");
|
|
76
|
+
const chunks = chunkText(text, { chunkSize: 512, chunkOverlap: 64 });
|
|
77
|
+
assertCoreInvariants(text, chunks, 512);
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
it("keeps charStart monotonically non-decreasing", () => {
|
|
81
|
+
const text = "x".repeat(10_000);
|
|
82
|
+
const chunks = chunkText(text, { chunkSize: 256, chunkOverlap: 32 });
|
|
83
|
+
for (let i = 1; i < chunks.length; i++) {
|
|
84
|
+
expect(chunks[i]!.charStart).toBeGreaterThanOrEqual(chunks[i - 1]!.charStart);
|
|
85
|
+
}
|
|
86
|
+
});
|
|
87
|
+
|
|
88
|
+
it("reconstructs content bit-for-bit from offsets for every chunk", () => {
|
|
89
|
+
const text = Array.from({ length: 500 }, (_, i) => `Para ${i}.\n\n`).join("");
|
|
90
|
+
const chunks = chunkText(text, { chunkSize: 1024, chunkOverlap: 128 });
|
|
91
|
+
for (const c of chunks) {
|
|
92
|
+
expect(text.slice(c.charStart, c.charEnd)).toBe(c.content);
|
|
93
|
+
}
|
|
94
|
+
});
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
describe("chunkText — overlap behavior", () => {
|
|
98
|
+
it("produces overlap of approximately chunkOverlap between consecutive chunks", () => {
|
|
99
|
+
const text = "abcdefghij ".repeat(500); // ~5500 chars
|
|
100
|
+
const chunkOverlap = 256;
|
|
101
|
+
const chunks = chunkText(text, { chunkSize: 1024, chunkOverlap });
|
|
102
|
+
|
|
103
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
104
|
+
|
|
105
|
+
for (let i = 1; i < chunks.length; i++) {
|
|
106
|
+
const prev = chunks[i - 1]!;
|
|
107
|
+
const curr = chunks[i]!;
|
|
108
|
+
// Overlap in characters = prev.charEnd - curr.charStart.
|
|
109
|
+
const overlap = prev.charEnd - curr.charStart;
|
|
110
|
+
expect(overlap).toBeGreaterThan(0);
|
|
111
|
+
// Tolerance: +/- 16 chars (allows for snap to atom boundary).
|
|
112
|
+
expect(Math.abs(overlap - chunkOverlap)).toBeLessThanOrEqual(16);
|
|
113
|
+
}
|
|
114
|
+
});
|
|
115
|
+
|
|
116
|
+
it("makes forward progress even when chunkOverlap is zero", () => {
|
|
117
|
+
const text = "abcdefghij ".repeat(300);
|
|
118
|
+
const chunks = chunkText(text, { chunkSize: 512, chunkOverlap: 0 });
|
|
119
|
+
for (let i = 1; i < chunks.length; i++) {
|
|
120
|
+
expect(chunks[i]!.charStart).toBeGreaterThanOrEqual(chunks[i - 1]!.charEnd);
|
|
121
|
+
}
|
|
122
|
+
});
|
|
123
|
+
|
|
124
|
+
it("rejects chunkOverlap >= chunkSize", () => {
|
|
125
|
+
// Only need enough text to trigger the chunking path past the fast return.
|
|
126
|
+
const text = "x".repeat(2048);
|
|
127
|
+
expect(() =>
|
|
128
|
+
chunkText(text, { chunkSize: 100, chunkOverlap: 100 }),
|
|
129
|
+
).toThrow(/chunkOverlap/);
|
|
130
|
+
expect(() =>
|
|
131
|
+
chunkText(text, { chunkSize: 100, chunkOverlap: 200 }),
|
|
132
|
+
).toThrow(/chunkOverlap/);
|
|
133
|
+
});
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
describe("chunkText — unicode correctness", () => {
|
|
137
|
+
it("preserves character boundaries with emoji + CJK", () => {
|
|
138
|
+
// Mix emoji (surrogate pairs), CJK, and ASCII; repeat enough to exceed chunkSize.
|
|
139
|
+
const segment =
|
|
140
|
+
"Hello. こんにちは。 你好。 안녕하세요。 Emojis: 🎉🚀✨. ";
|
|
141
|
+
const text = segment.repeat(100); // plenty of chunks
|
|
142
|
+
const chunks = chunkText(text, { chunkSize: 512, chunkOverlap: 64 });
|
|
143
|
+
|
|
144
|
+
assertCoreInvariants(text, chunks, 512);
|
|
145
|
+
|
|
146
|
+
// Slicing must produce the same string as content — if indices split a
|
|
147
|
+
// surrogate pair the JS string would still compare equal char-by-char,
|
|
148
|
+
// but we additionally verify no lone surrogate prefix/suffix on boundaries.
|
|
149
|
+
for (const c of chunks) {
|
|
150
|
+
expect(text.slice(c.charStart, c.charEnd)).toBe(c.content);
|
|
151
|
+
}
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
it("handles pure CJK doc (no ASCII separators beyond ideographic space)", () => {
|
|
155
|
+
const text = "这是一个很长的中文文档。".repeat(300);
|
|
156
|
+
const chunks = chunkText(text, { chunkSize: 256, chunkOverlap: 32 });
|
|
157
|
+
assertCoreInvariants(text, chunks, 256);
|
|
158
|
+
// Must cover the whole doc (last chunk ends at text.length).
|
|
159
|
+
expect(chunks[chunks.length - 1]!.charEnd).toBe(text.length);
|
|
160
|
+
});
|
|
161
|
+
});
|
|
162
|
+
|
|
163
|
+
describe("chunkText — separator priority", () => {
|
|
164
|
+
it("prefers \\n\\n over other separators so code fences stay intact", () => {
|
|
165
|
+
// A large markdown doc with a code fence that would exceed chunkSize if
|
|
166
|
+
// we naively split on every newline; ensure it's kept whole by preferring
|
|
167
|
+
// \n\n as the outer boundary.
|
|
168
|
+
const prefix = "# Heading\n\nSome text before the fence.\n\n";
|
|
169
|
+
const fence =
|
|
170
|
+
"```typescript\n" +
|
|
171
|
+
"const x = 1;\n".repeat(60) +
|
|
172
|
+
"```";
|
|
173
|
+
const suffix = "\n\nText after the fence.";
|
|
174
|
+
const text = prefix + fence + suffix;
|
|
175
|
+
|
|
176
|
+
const chunkSize = fence.length + 50; // Large enough to keep fence whole.
|
|
177
|
+
const chunks = chunkText(text, {
|
|
178
|
+
chunkSize,
|
|
179
|
+
chunkOverlap: 64,
|
|
180
|
+
separators: ["\n\n", "\n", ". ", " ", ""],
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
// Fence should appear intact in at least one chunk.
|
|
184
|
+
const anyChunkContainsWholeFence = chunks.some(c => c.content.includes(fence));
|
|
185
|
+
expect(anyChunkContainsWholeFence).toBe(true);
|
|
186
|
+
|
|
187
|
+
// And no chunk should contain just a fragment of the opening backticks
|
|
188
|
+
// separated from its closing ones — i.e., if a chunk starts with ``` it
|
|
189
|
+
// must also contain the matching closing ```.
|
|
190
|
+
for (const c of chunks) {
|
|
191
|
+
const opens = (c.content.match(/```/g) ?? []).length;
|
|
192
|
+
// Either zero fences (text-only chunk) or an even number (complete fences).
|
|
193
|
+
expect(opens % 2 === 0 || c.content.includes(fence)).toBe(true);
|
|
194
|
+
}
|
|
195
|
+
});
|
|
196
|
+
|
|
197
|
+
it("honors custom separators — '##' heading splitter", () => {
|
|
198
|
+
const text =
|
|
199
|
+
"Intro paragraph here.\n\n" +
|
|
200
|
+
"## Section A\nContent for A.\n\n" +
|
|
201
|
+
"## Section B\nContent for B with a bit more text.\n\n" +
|
|
202
|
+
"## Section C\nFinal content goes here.";
|
|
203
|
+
const chunks = chunkText(text, {
|
|
204
|
+
chunkSize: 40, // small -> forces splitting
|
|
205
|
+
chunkOverlap: 8,
|
|
206
|
+
separators: ["##", "\n", " ", ""],
|
|
207
|
+
});
|
|
208
|
+
|
|
209
|
+
// Custom separator must be honored: the chunker must split on '##'.
|
|
210
|
+
// Because the separator stays attached to the preceding piece (to keep
|
|
211
|
+
// text.slice(start,end) === content), chunk boundaries occur *just after*
|
|
212
|
+
// each '##' occurrence — verify that at least one chunk's charEnd aligns
|
|
213
|
+
// with a '##' position + 2.
|
|
214
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
215
|
+
const sectionStarts = [
|
|
216
|
+
text.indexOf("## Section A"),
|
|
217
|
+
text.indexOf("## Section B"),
|
|
218
|
+
text.indexOf("## Section C"),
|
|
219
|
+
];
|
|
220
|
+
const endPositions = chunks.map(c => c.charEnd);
|
|
221
|
+
const boundariesHit = sectionStarts.filter(pos =>
|
|
222
|
+
endPositions.includes(pos + 2),
|
|
223
|
+
);
|
|
224
|
+
expect(boundariesHit.length).toBeGreaterThan(0);
|
|
225
|
+
});
|
|
226
|
+
|
|
227
|
+
it("character-splits as a last resort when no separator applies", () => {
|
|
228
|
+
// A single token longer than chunkSize with no separators; must still
|
|
229
|
+
// produce chunks (via the "" sentinel).
|
|
230
|
+
const text = "a".repeat(300);
|
|
231
|
+
const chunks = chunkText(text, {
|
|
232
|
+
chunkSize: 100,
|
|
233
|
+
chunkOverlap: 10,
|
|
234
|
+
separators: ["\n\n", "\n", ""],
|
|
235
|
+
});
|
|
236
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
237
|
+
for (const c of chunks) {
|
|
238
|
+
expect(c.content.length).toBeLessThanOrEqual(100);
|
|
239
|
+
}
|
|
240
|
+
// Overall coverage: concatenating (with overlap removed) must reconstruct.
|
|
241
|
+
const firstChar = chunks[0]!.charStart;
|
|
242
|
+
const lastEnd = chunks[chunks.length - 1]!.charEnd;
|
|
243
|
+
expect(firstChar).toBe(0);
|
|
244
|
+
expect(lastEnd).toBe(text.length);
|
|
245
|
+
});
|
|
246
|
+
});
|
package/src/chunker.ts
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RecursiveCharacterTextSplitter-style chunker.
|
|
3
|
+
*
|
|
4
|
+
* Splits long text into overlapping chunks while preserving the original
|
|
5
|
+
* character offsets (charStart, charEnd) so downstream code can reconstruct
|
|
6
|
+
* positions. Mirrors the semantics of LangChain's RecursiveCharacterTextSplitter:
|
|
7
|
+
* tries each separator in order, snapping chunk boundaries to the highest-priority
|
|
8
|
+
* separator that keeps pieces under chunkSize.
|
|
9
|
+
*
|
|
10
|
+
* Defaults correspond to 512-token chunks with 64-token overlap
|
|
11
|
+
* (approx: 1 token ~= 4 chars for English text).
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/** A contiguous slice of the source text produced by `chunkText`. */
export interface Chunk {
  // Zero-based position of this chunk in the returned array.
  index: number;
  // Exact substring of the source: text.slice(charStart, charEnd).
  content: string;
  // Inclusive start offset into the original text (JS string index).
  charStart: number;
  // Exclusive end offset into the original text.
  charEnd: number;
}

/** Optional tuning knobs for `chunkText`. */
export interface ChunkerOptions {
  // Maximum chunk length in characters (default DEFAULT_CHUNK_SIZE).
  chunkSize?: number;
  // Approximate character overlap between consecutive chunks.
  chunkOverlap?: number;
  // Separators tried in priority order; "" enables character splitting.
  separators?: string[];
}

// ~512 tokens at ~4 chars per token (see module header).
const DEFAULT_CHUNK_SIZE = 2048;
// ~64 tokens of overlap between consecutive chunks.
const DEFAULT_CHUNK_OVERLAP = 256;
// Tried in priority order; "" is the character-split last resort.
const DEFAULT_SEPARATORS: string[] = ["\n\n", "\n", ". ", " ", ""];

/**
 * A piece of the original text with an absolute offset back to the source.
 * Used as the internal working type while recursing through separators.
 */
interface Piece {
  text: string;
  start: number;
}
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Pick the first separator from `separators` that occurs in `piece.text`.
|
|
42
|
+
* Falls back to the last separator (typically `""`) if none match — this is
|
|
43
|
+
* the sentinel that lets us character-split oversized pieces with no natural
|
|
44
|
+
* boundary.
|
|
45
|
+
*/
|
|
46
|
+
function pickSeparator(pieceText: string, separators: string[]): {
|
|
47
|
+
separator: string;
|
|
48
|
+
remaining: string[];
|
|
49
|
+
} {
|
|
50
|
+
for (let i = 0; i < separators.length; i++) {
|
|
51
|
+
const sep = separators[i]!;
|
|
52
|
+
if (sep === "") {
|
|
53
|
+
return { separator: sep, remaining: separators.slice(i + 1) };
|
|
54
|
+
}
|
|
55
|
+
if (pieceText.includes(sep)) {
|
|
56
|
+
return { separator: sep, remaining: separators.slice(i + 1) };
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
// Should be unreachable when DEFAULT_SEPARATORS ends with "".
|
|
60
|
+
return { separator: "", remaining: [] };
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Split a piece on `separator` while retaining absolute char offsets.
|
|
65
|
+
* When separator is empty, split into single-character pieces.
|
|
66
|
+
*/
|
|
67
|
+
function splitOnSeparator(piece: Piece, separator: string): Piece[] {
|
|
68
|
+
if (separator === "") {
|
|
69
|
+
const out: Piece[] = [];
|
|
70
|
+
for (let i = 0; i < piece.text.length; i++) {
|
|
71
|
+
out.push({ text: piece.text[i]!, start: piece.start + i });
|
|
72
|
+
}
|
|
73
|
+
return out;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
const out: Piece[] = [];
|
|
77
|
+
let cursor = 0;
|
|
78
|
+
let idx = piece.text.indexOf(separator, cursor);
|
|
79
|
+
while (idx !== -1) {
|
|
80
|
+
// Keep the separator attached to the preceding piece so reconstruction
|
|
81
|
+
// via text.slice(charStart, charEnd) works bit-for-bit.
|
|
82
|
+
const sliceEnd = idx + separator.length;
|
|
83
|
+
out.push({
|
|
84
|
+
text: piece.text.slice(cursor, sliceEnd),
|
|
85
|
+
start: piece.start + cursor,
|
|
86
|
+
});
|
|
87
|
+
cursor = sliceEnd;
|
|
88
|
+
idx = piece.text.indexOf(separator, cursor);
|
|
89
|
+
}
|
|
90
|
+
if (cursor < piece.text.length) {
|
|
91
|
+
out.push({
|
|
92
|
+
text: piece.text.slice(cursor),
|
|
93
|
+
start: piece.start + cursor,
|
|
94
|
+
});
|
|
95
|
+
}
|
|
96
|
+
return out;
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* Recursively flatten a piece into "atoms" — pieces small enough to merge
|
|
101
|
+
* greedily into chunks. Pieces larger than chunkSize are split with the next
|
|
102
|
+
* separator in line; pieces that fit are returned as-is.
|
|
103
|
+
*/
|
|
104
|
+
function flattenToAtoms(
|
|
105
|
+
piece: Piece,
|
|
106
|
+
separators: string[],
|
|
107
|
+
chunkSize: number,
|
|
108
|
+
): Piece[] {
|
|
109
|
+
if (piece.text.length <= chunkSize) {
|
|
110
|
+
return [piece];
|
|
111
|
+
}
|
|
112
|
+
const { separator, remaining } = pickSeparator(piece.text, separators);
|
|
113
|
+
const splits = splitOnSeparator(piece, separator);
|
|
114
|
+
|
|
115
|
+
// If the separator didn't actually reduce the piece (e.g., no occurrence),
|
|
116
|
+
// fall through to the next separator with the original piece.
|
|
117
|
+
if (splits.length <= 1) {
|
|
118
|
+
if (remaining.length === 0) {
|
|
119
|
+
// No more separators — return whatever we have, even if oversized.
|
|
120
|
+
return [piece];
|
|
121
|
+
}
|
|
122
|
+
return flattenToAtoms(piece, remaining, chunkSize);
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
const out: Piece[] = [];
|
|
126
|
+
for (const sub of splits) {
|
|
127
|
+
if (sub.text.length <= chunkSize) {
|
|
128
|
+
out.push(sub);
|
|
129
|
+
} else if (remaining.length > 0) {
|
|
130
|
+
for (const leaf of flattenToAtoms(sub, remaining, chunkSize)) {
|
|
131
|
+
out.push(leaf);
|
|
132
|
+
}
|
|
133
|
+
} else {
|
|
134
|
+
// Last-resort: character-split oversized atom so we never return a
|
|
135
|
+
// single atom larger than chunkSize.
|
|
136
|
+
out.push(...splitOnSeparator(sub, ""));
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
return out;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Build a chunk object from a contiguous run of atoms.
|
|
144
|
+
* `charStart` is taken from the first atom, `charEnd` from the last atom's
|
|
145
|
+
* end boundary, and `content` is `text.slice(start, end)` — this guarantees
|
|
146
|
+
* `text.slice(charStart, charEnd) === content`.
|
|
147
|
+
*/
|
|
148
|
+
function buildChunk(
|
|
149
|
+
originalText: string,
|
|
150
|
+
atoms: Piece[],
|
|
151
|
+
index: number,
|
|
152
|
+
): Chunk {
|
|
153
|
+
const first = atoms[0]!;
|
|
154
|
+
const last = atoms[atoms.length - 1]!;
|
|
155
|
+
const charStart = first.start;
|
|
156
|
+
const charEnd = last.start + last.text.length;
|
|
157
|
+
return {
|
|
158
|
+
index,
|
|
159
|
+
content: originalText.slice(charStart, charEnd),
|
|
160
|
+
charStart,
|
|
161
|
+
charEnd,
|
|
162
|
+
};
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
/**
|
|
166
|
+
* Compute the start position for the next chunk's atoms given the previous
|
|
167
|
+
* chunk ended at `prevEnd`. We walk backward through the atom list to find
|
|
168
|
+
* the atom whose start >= prevEnd - chunkOverlap; that atom begins the
|
|
169
|
+
* overlap region.
|
|
170
|
+
*/
|
|
171
|
+
function findOverlapStartIndex(
|
|
172
|
+
atoms: Piece[],
|
|
173
|
+
lastEndAtomIndex: number,
|
|
174
|
+
prevEnd: number,
|
|
175
|
+
chunkOverlap: number,
|
|
176
|
+
): number {
|
|
177
|
+
if (chunkOverlap <= 0) {
|
|
178
|
+
return lastEndAtomIndex + 1;
|
|
179
|
+
}
|
|
180
|
+
const targetStart = prevEnd - chunkOverlap;
|
|
181
|
+
// Find the earliest atom in [0..lastEndAtomIndex] whose start >= targetStart.
|
|
182
|
+
let overlapAtomIdx = lastEndAtomIndex + 1;
|
|
183
|
+
for (let i = lastEndAtomIndex; i >= 0; i--) {
|
|
184
|
+
if (atoms[i]!.start >= targetStart) {
|
|
185
|
+
overlapAtomIdx = i;
|
|
186
|
+
} else {
|
|
187
|
+
break;
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
// If overlap produced no progress (no atoms found), step forward to avoid
|
|
191
|
+
// an infinite loop.
|
|
192
|
+
if (overlapAtomIdx > lastEndAtomIndex) {
|
|
193
|
+
overlapAtomIdx = lastEndAtomIndex + 1;
|
|
194
|
+
}
|
|
195
|
+
return overlapAtomIdx;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* Split `text` into overlapping chunks.
|
|
200
|
+
*
|
|
201
|
+
* Semantics:
|
|
202
|
+
* - Empty input -> empty array.
|
|
203
|
+
* - Short input (<= chunkSize) -> single chunk covering the whole text.
|
|
204
|
+
* - For each chunk, `text.slice(charStart, charEnd) === content`.
|
|
205
|
+
* - `charStart` is monotonically non-decreasing across chunks.
|
|
206
|
+
* - Consecutive chunks overlap by ~`chunkOverlap` chars (snapped to atom
|
|
207
|
+
* boundaries; may differ by up to the largest atom size).
|
|
208
|
+
* - Each chunk's content length is bounded by chunkSize + a small slack for
|
|
209
|
+
* the separator that snapped the boundary.
|
|
210
|
+
*/
|
|
211
|
+
export function chunkText(text: string, opts: ChunkerOptions = {}): Chunk[] {
|
|
212
|
+
const chunkSize = opts.chunkSize ?? DEFAULT_CHUNK_SIZE;
|
|
213
|
+
const chunkOverlap = opts.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
|
|
214
|
+
const separators = opts.separators ?? DEFAULT_SEPARATORS;
|
|
215
|
+
|
|
216
|
+
if (text.length === 0) {
|
|
217
|
+
return [];
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
if (chunkOverlap >= chunkSize) {
|
|
221
|
+
throw new Error(
|
|
222
|
+
`chunker: chunkOverlap (${chunkOverlap}) must be smaller than chunkSize (${chunkSize})`,
|
|
223
|
+
);
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
// Fast path for short docs — no need to walk separators.
|
|
227
|
+
if (text.length <= chunkSize) {
|
|
228
|
+
return [
|
|
229
|
+
{
|
|
230
|
+
index: 0,
|
|
231
|
+
content: text,
|
|
232
|
+
charStart: 0,
|
|
233
|
+
charEnd: text.length,
|
|
234
|
+
},
|
|
235
|
+
];
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
const atoms = flattenToAtoms({ text, start: 0 }, separators, chunkSize);
|
|
239
|
+
|
|
240
|
+
const chunks: Chunk[] = [];
|
|
241
|
+
let chunkIdx = 0;
|
|
242
|
+
let i = 0;
|
|
243
|
+
|
|
244
|
+
while (i < atoms.length) {
|
|
245
|
+
// Greedily pack atoms into this chunk until adding one more would push
|
|
246
|
+
// the chunk content past chunkSize.
|
|
247
|
+
let runLen = 0;
|
|
248
|
+
let j = i;
|
|
249
|
+
while (j < atoms.length) {
|
|
250
|
+
const atomLen = atoms[j]!.text.length;
|
|
251
|
+
// Always include at least one atom per chunk to ensure progress.
|
|
252
|
+
if (j > i && runLen + atomLen > chunkSize) {
|
|
253
|
+
break;
|
|
254
|
+
}
|
|
255
|
+
runLen += atomLen;
|
|
256
|
+
j++;
|
|
257
|
+
}
|
|
258
|
+
const runAtoms = atoms.slice(i, j);
|
|
259
|
+
const chunk = buildChunk(text, runAtoms, chunkIdx);
|
|
260
|
+
chunks.push(chunk);
|
|
261
|
+
chunkIdx++;
|
|
262
|
+
|
|
263
|
+
if (j >= atoms.length) {
|
|
264
|
+
break;
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
// Compute overlap start for the next chunk.
|
|
268
|
+
const lastEndAtomIdx = j - 1;
|
|
269
|
+
const nextStart = findOverlapStartIndex(
|
|
270
|
+
atoms,
|
|
271
|
+
lastEndAtomIdx,
|
|
272
|
+
chunk.charEnd,
|
|
273
|
+
chunkOverlap,
|
|
274
|
+
);
|
|
275
|
+
i = nextStart;
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
return chunks;
|
|
279
|
+
}
|