ralph-hero-knowledge-index 0.1.20 → 0.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ralph-knowledge",
3
- "version": "0.1.20",
3
+ "version": "0.1.21",
4
4
  "description": "Knowledge graph for ralph-hero: semantic search, relationship traversal, and document indexing across thoughts/ documents. Optional companion to ralph-hero.",
5
5
  "author": {
6
6
  "name": "Chad Dubiel",
package/.mcp.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "mcpServers": {
3
3
  "ralph-knowledge": {
4
4
  "command": "npx",
5
- "args": ["-y", "ralph-hero-knowledge-index@0.1.20"]
5
+ "args": ["-y", "ralph-hero-knowledge-index@0.1.21"]
6
6
  }
7
7
  }
8
8
  }
@@ -0,0 +1,37 @@
1
+ /**
2
+ * RecursiveCharacterTextSplitter-style chunker.
3
+ *
4
+ * Splits long text into overlapping chunks while preserving the original
5
+ * character offsets (charStart, charEnd) so downstream code can reconstruct
6
+ * positions. Mirrors the semantics of LangChain's RecursiveCharacterTextSplitter:
7
+ * tries each separator in order, snapping chunk boundaries to the highest-priority
8
+ * separator that keeps pieces under chunkSize.
9
+ *
10
+ * Defaults correspond to 512-token chunks with 64-token overlap
11
+ * (approx: 1 token ~= 4 chars for English text).
12
+ */
13
/** One contiguous slice of the source text, as returned by `chunkText`. */
export interface Chunk {
    /** Zero-based position of this chunk in the returned array. */
    index: number;
    /** The chunk text; always equal to `text.slice(charStart, charEnd)`. */
    content: string;
    /** Start offset into the original string (inclusive, as used by `String.prototype.slice`). */
    charStart: number;
    /** End offset into the original string (exclusive, as used by `String.prototype.slice`). */
    charEnd: number;
}
19
/** Optional settings for `chunkText`; omitted fields fall back to the module defaults. */
export interface ChunkerOptions {
    /** Target maximum chunk length, measured in string length units (default 2048). */
    chunkSize?: number;
    /** Desired overlap between consecutive chunks (default 256); must be smaller than `chunkSize`. */
    chunkOverlap?: number;
    /** Separators tried in priority order (default `["\n\n", "\n", ". ", " ", ""]`). */
    separators?: string[];
}
24
/**
 * Split `text` into overlapping chunks.
 *
 * Semantics:
 * - Empty input -> empty array.
 * - Short input (<= chunkSize) -> single chunk covering the whole text.
 * - For each chunk, `text.slice(charStart, charEnd) === content`.
 * - `charStart` is monotonically non-decreasing across chunks.
 * - Consecutive chunks overlap by ~`chunkOverlap` chars (snapped to atom
 *   boundaries; may differ by up to the largest atom size).
 * - Each chunk's content length is bounded by chunkSize + a small slack for
 *   the separator that snapped the boundary.
 *
 * @param text - The document to split.
 * @param opts - Optional chunk size, overlap and separator list.
 * @returns The chunks in document order, with `index` counting from 0.
 * @throws Error when `text` is non-empty and `chunkOverlap >= chunkSize`.
 */
export declare function chunkText(text: string, opts?: ChunkerOptions): Chunk[];
@@ -0,0 +1,219 @@
1
+ /**
2
+ * RecursiveCharacterTextSplitter-style chunker.
3
+ *
4
+ * Splits long text into overlapping chunks while preserving the original
5
+ * character offsets (charStart, charEnd) so downstream code can reconstruct
6
+ * positions. Mirrors the semantics of LangChain's RecursiveCharacterTextSplitter:
7
+ * tries each separator in order, snapping chunk boundaries to the highest-priority
8
+ * separator that keeps pieces under chunkSize.
9
+ *
10
+ * Defaults correspond to 512-token chunks with 64-token overlap
11
+ * (approx: 1 token ~= 4 chars for English text).
12
+ */
13
// Default chunk length in characters (~512 tokens at ~4 chars per token).
const DEFAULT_CHUNK_SIZE = 2048;
// Default overlap in characters (~64 tokens at ~4 chars per token).
const DEFAULT_CHUNK_OVERLAP = 256;
// Tried in order: paragraph break, line break, sentence end, space, and
// finally "" which means "split into individual characters".
const DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", " ", ""];
16
/**
 * Choose the separator to split `pieceText` with.
 *
 * Scans `separators` in priority order and returns the first entry that is
 * either the empty string or occurs in `pieceText`, together with the
 * lower-priority entries that follow it. If nothing qualifies, the result is
 * the empty-string sentinel with no remaining separators, which makes the
 * caller fall back to character splitting.
 */
function pickSeparator(pieceText, separators) {
    const hit = separators.findIndex((candidate) => candidate === "" || pieceText.includes(candidate));
    if (hit === -1) {
        return { separator: "", remaining: [] };
    }
    return { separator: separators[hit], remaining: separators.slice(hit + 1) };
}
35
/**
 * Split a piece on `separator` while retaining absolute offsets.
 *
 * - Non-empty separator: every occurrence stays attached to the END of the
 *   piece that precedes it, so the returned pieces tile the input exactly and
 *   `text.slice(start, start + piece.text.length)` reproduces each piece.
 * - Empty separator: the piece is split into individual characters. Iteration
 *   is by Unicode code point, so a surrogate pair (e.g. an emoji) is kept as
 *   one two-unit piece instead of being cut into two lone surrogates.
 *
 * `start` values remain string indices (UTF-16 code units).
 */
function splitOnSeparator(piece, separator) {
    const out = [];
    if (separator === "") {
        let offset = 0;
        // for...of walks code points; `symbol.length` is 1 or 2 code units.
        for (const symbol of piece.text) {
            out.push({ text: symbol, start: piece.start + offset });
            offset += symbol.length;
        }
        return out;
    }
    let cursor = 0;
    let idx = piece.text.indexOf(separator, cursor);
    while (idx !== -1) {
        // Keep the separator attached to the preceding piece so reconstruction
        // via text.slice(charStart, charEnd) works bit-for-bit.
        const sliceEnd = idx + separator.length;
        out.push({
            text: piece.text.slice(cursor, sliceEnd),
            start: piece.start + cursor,
        });
        cursor = sliceEnd;
        idx = piece.text.indexOf(separator, cursor);
    }
    // Trailing remainder after the last separator, if any.
    if (cursor < piece.text.length) {
        out.push({
            text: piece.text.slice(cursor),
            start: piece.start + cursor,
        });
    }
    return out;
}
69
+ /**
70
+ * Recursively flatten a piece into "atoms" — pieces small enough to merge
71
+ * greedily into chunks. Pieces larger than chunkSize are split with the next
72
+ * separator in line; pieces that fit are returned as-is.
73
+ */
74
+ function flattenToAtoms(piece, separators, chunkSize) {
75
+ if (piece.text.length <= chunkSize) {
76
+ return [piece];
77
+ }
78
+ const { separator, remaining } = pickSeparator(piece.text, separators);
79
+ const splits = splitOnSeparator(piece, separator);
80
+ // If the separator didn't actually reduce the piece (e.g., no occurrence),
81
+ // fall through to the next separator with the original piece.
82
+ if (splits.length <= 1) {
83
+ if (remaining.length === 0) {
84
+ // No more separators — return whatever we have, even if oversized.
85
+ return [piece];
86
+ }
87
+ return flattenToAtoms(piece, remaining, chunkSize);
88
+ }
89
+ const out = [];
90
+ for (const sub of splits) {
91
+ if (sub.text.length <= chunkSize) {
92
+ out.push(sub);
93
+ }
94
+ else if (remaining.length > 0) {
95
+ for (const leaf of flattenToAtoms(sub, remaining, chunkSize)) {
96
+ out.push(leaf);
97
+ }
98
+ }
99
+ else {
100
+ // Last-resort: character-split oversized atom so we never return a
101
+ // single atom larger than chunkSize.
102
+ out.push(...splitOnSeparator(sub, ""));
103
+ }
104
+ }
105
+ return out;
106
+ }
107
/**
 * Assemble a chunk from a contiguous, non-empty run of atoms.
 *
 * The chunk spans from the first atom's start to the end of the last atom,
 * and `content` is cut directly from `originalText` with those offsets, so
 * `originalText.slice(charStart, charEnd) === content` holds by construction.
 */
function buildChunk(originalText, atoms, index) {
    const head = atoms[0];
    const tail = atoms[atoms.length - 1];
    const charStart = head.start;
    const charEnd = tail.start + tail.text.length;
    const content = originalText.slice(charStart, charEnd);
    return { index, content, charStart, charEnd };
}
125
/**
 * Find the atom index at which the next chunk should start so that it
 * overlaps the previous chunk by roughly `chunkOverlap` characters.
 *
 * Starting just past the previous chunk's last atom (`lastEndAtomIndex + 1`),
 * the index is moved backwards for as long as the preceding atom starts at or
 * after `prevEnd - chunkOverlap`. With `chunkOverlap <= 0` no atoms are
 * re-used and `lastEndAtomIndex + 1` is returned.
 *
 * Note: the result depends only on atom offsets. It can be as low as 0 and
 * is not guaranteed to lie after the previous chunk's first atom.
 */
function findOverlapStartIndex(atoms, lastEndAtomIndex, prevEnd, chunkOverlap) {
    let candidate = lastEndAtomIndex + 1;
    if (chunkOverlap <= 0) {
        return candidate;
    }
    const threshold = prevEnd - chunkOverlap;
    while (candidate > 0 && atoms[candidate - 1].start >= threshold) {
        candidate--;
    }
    return candidate;
}
153
+ /**
154
+ * Split `text` into overlapping chunks.
155
+ *
156
+ * Semantics:
157
+ * - Empty input -> empty array.
158
+ * - Short input (<= chunkSize) -> single chunk covering the whole text.
159
+ * - For each chunk, `text.slice(charStart, charEnd) === content`.
160
+ * - `charStart` is monotonically non-decreasing across chunks.
161
+ * - Consecutive chunks overlap by ~`chunkOverlap` chars (snapped to atom
162
+ * boundaries; may differ by up to the largest atom size).
163
+ * - Each chunk's content length is bounded by chunkSize + a small slack for
164
+ * the separator that snapped the boundary.
165
+ */
166
+ export function chunkText(text, opts = {}) {
167
+ const chunkSize = opts.chunkSize ?? DEFAULT_CHUNK_SIZE;
168
+ const chunkOverlap = opts.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
169
+ const separators = opts.separators ?? DEFAULT_SEPARATORS;
170
+ if (text.length === 0) {
171
+ return [];
172
+ }
173
+ if (chunkOverlap >= chunkSize) {
174
+ throw new Error(`chunker: chunkOverlap (${chunkOverlap}) must be smaller than chunkSize (${chunkSize})`);
175
+ }
176
+ // Fast path for short docs — no need to walk separators.
177
+ if (text.length <= chunkSize) {
178
+ return [
179
+ {
180
+ index: 0,
181
+ content: text,
182
+ charStart: 0,
183
+ charEnd: text.length,
184
+ },
185
+ ];
186
+ }
187
+ const atoms = flattenToAtoms({ text, start: 0 }, separators, chunkSize);
188
+ const chunks = [];
189
+ let chunkIdx = 0;
190
+ let i = 0;
191
+ while (i < atoms.length) {
192
+ // Greedily pack atoms into this chunk until adding one more would push
193
+ // the chunk content past chunkSize.
194
+ let runLen = 0;
195
+ let j = i;
196
+ while (j < atoms.length) {
197
+ const atomLen = atoms[j].text.length;
198
+ // Always include at least one atom per chunk to ensure progress.
199
+ if (j > i && runLen + atomLen > chunkSize) {
200
+ break;
201
+ }
202
+ runLen += atomLen;
203
+ j++;
204
+ }
205
+ const runAtoms = atoms.slice(i, j);
206
+ const chunk = buildChunk(text, runAtoms, chunkIdx);
207
+ chunks.push(chunk);
208
+ chunkIdx++;
209
+ if (j >= atoms.length) {
210
+ break;
211
+ }
212
+ // Compute overlap start for the next chunk.
213
+ const lastEndAtomIdx = j - 1;
214
+ const nextStart = findOverlapStartIndex(atoms, lastEndAtomIdx, chunk.charEnd, chunkOverlap);
215
+ i = nextStart;
216
+ }
217
+ return chunks;
218
+ }
219
+ //# sourceMappingURL=chunker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker.js","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAeH,MAAM,kBAAkB,GAAG,IAAI,CAAC;AAChC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAClC,MAAM,kBAAkB,GAAa,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC;AAWnE;;;;;GAKG;AACH,SAAS,aAAa,CAAC,SAAiB,EAAE,UAAoB;IAI5D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAE,CAAC;QAC3B,IAAI,GAAG,KAAK,EAAE,EAAE,CAAC;YACf,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE,SAAS,EAAE,UAAU,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;QAChE,CAAC;QACD,IAAI,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC5B,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE,SAAS,EAAE,UAAU,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;QAChE,CAAC;IACH,CAAC;IACD,8DAA8D;IAC9D,OAAO,EAAE,SAAS,EAAE,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;AAC1C,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB,CAAC,KAAY,EAAE,SAAiB;IACvD,IAAI,SAAS,KAAK,EAAE,EAAE,CAAC;QACrB,MAAM,GAAG,GAAY,EAAE,CAAC;QACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC,CAAE,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;QAC7D,CAAC;QACD,OAAO,GAAG,CAAC;IACb,CAAC;IAED,MAAM,GAAG,GAAY,EAAE,CAAC;IACxB,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,IAAI,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAChD,OAAO,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;QAClB,uEAAuE;QACvE,wDAAwD;QACxD,MAAM,QAAQ,GAAG,GAAG,GAAG,SAAS,CAAC,MAAM,CAAC;QACxC,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,QAAQ,CAAC;YACxC,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,MAAM;SAC5B,CAAC,CAAC;QACH,MAAM,GAAG,QAAQ,CAAC;QAClB,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAC9C,CAAC;IACD,IAAI,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QAC/B,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;YAC9B,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,MAAM;SAC5B,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;GAIG;AACH,SAAS,cAAc
,CACrB,KAAY,EACZ,UAAoB,EACpB,SAAiB;IAEjB,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QACnC,OAAO,CAAC,KAAK,CAAC,CAAC;IACjB,CAAC;IACD,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,GAAG,aAAa,CAAC,KAAK,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;IACvE,MAAM,MAAM,GAAG,gBAAgB,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IAElD,2EAA2E;IAC3E,8DAA8D;IAC9D,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACvB,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,mEAAmE;YACnE,OAAO,CAAC,KAAK,CAAC,CAAC;QACjB,CAAC;QACD,OAAO,cAAc,CAAC,KAAK,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC;IACrD,CAAC;IAED,MAAM,GAAG,GAAY,EAAE,CAAC;IACxB,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QACzB,IAAI,GAAG,CAAC,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YACjC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChB,CAAC;aAAM,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChC,KAAK,MAAM,IAAI,IAAI,cAAc,CAAC,GAAG,EAAE,SAAS,EAAE,SAAS,CAAC,EAAE,CAAC;gBAC7D,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjB,CAAC;QACH,CAAC;aAAM,CAAC;YACN,mEAAmE;YACnE,qCAAqC;YACrC,GAAG,CAAC,IAAI,CAAC,GAAG,gBAAgB,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,CAAC;QACzC,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;GAKG;AACH,SAAS,UAAU,CACjB,YAAoB,EACpB,KAAc,EACd,KAAa;IAEb,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC;IACxB,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC;IACtC,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC;IAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC;IAC9C,OAAO;QACL,KAAK;QACL,OAAO,EAAE,YAAY,CAAC,KAAK,CAAC,SAAS,EAAE,OAAO,CAAC;QAC/C,SAAS;QACT,OAAO;KACR,CAAC;AACJ,CAAC;AAED;;;;;GAKG;AACH,SAAS,qBAAqB,CAC5B,KAAc,EACd,gBAAwB,EACxB,OAAe,EACf,YAAoB;IAEpB,IAAI,YAAY,IAAI,CAAC,EAAE,CAAC;QACtB,OAAO,gBAAgB,GAAG,CAAC,CAAC;IAC9B,CAAC;IACD,MAAM,WAAW,GAAG,OAAO,GAAG,YAAY,CAAC;IAC3C,8EAA8E;IAC9E,IAAI,cAAc,GAAG,gBAAgB,GAAG,CAAC,CAAC;IAC1C,KAAK,IAAI,CAAC,GAAG,gBAAgB,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,IAAI,KAAK,CAAC,CAAC,CAAE,CAAC,KAAK,IAAI,WAAW,EAAE,CAAC;YACnC,cAAc,GAAG,CAAC,CAAC;QACrB,CAAC;aAAM,CAAC;YACN,MAAM;QACR,CAAC;IACH,CAAC;IACD,0EAA0E;IAC1E,oBAAoB;IACpB,IAAI,cAAc,GAAG,gBAAgB,EAAE,CAAC;QACtC,cAAc,GAAG,gBAAgB,GAAG,CA
AC,CAAC;IACxC,CAAC;IACD,OAAO,cAAc,CAAC;AACxB,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,OAAuB,EAAE;IAC/D,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,kBAAkB,CAAC;IACvD,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,qBAAqB,CAAC;IAChE,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,IAAI,kBAAkB,CAAC;IAEzD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtB,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,IAAI,YAAY,IAAI,SAAS,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CACb,0BAA0B,YAAY,qCAAqC,SAAS,GAAG,CACxF,CAAC;IACJ,CAAC;IAED,yDAAyD;IACzD,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC7B,OAAO;YACL;gBACE,KAAK,EAAE,CAAC;gBACR,OAAO,EAAE,IAAI;gBACb,SAAS,EAAE,CAAC;gBACZ,OAAO,EAAE,IAAI,CAAC,MAAM;aACrB;SACF,CAAC;IACJ,CAAC;IAED,MAAM,KAAK,GAAG,cAAc,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;IAExE,MAAM,MAAM,GAAY,EAAE,CAAC;IAC3B,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,CAAC,GAAG,CAAC,CAAC;IAEV,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QACxB,uEAAuE;QACvE,oCAAoC;QACpC,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,IAAI,CAAC,GAAG,CAAC,CAAC;QACV,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,MAAM,CAAC;YACtC,iEAAiE;YACjE,IAAI,CAAC,GAAG,CAAC,IAAI,MAAM,GAAG,OAAO,GAAG,SAAS,EAAE,CAAC;gBAC1C,MAAM;YACR,CAAC;YACD,MAAM,IAAI,OAAO,CAAC;YAClB,CAAC,EAAE,CAAC;QACN,CAAC;QACD,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QACnC,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAC;QACnD,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACnB,QAAQ,EAAE,CAAC;QAEX,IAAI,CAAC,IAAI,KAAK,CAAC,MAAM,EAAE,CAAC;YACtB,MAAM;QACR,CAAC;QAED,4CAA4C;QAC5C,MAAM,cAAc,GAAG,CAAC,GAAG,CAAC,CAAC;QAC7B,MAAM,SAAS,GAAG,qBAAqB,CACrC,KAAK,EACL,cAAc,EACd,KAAK,CAAC,OAAO,EACb,YAAY,CACb,CAAC;QACF,CAAC,GAAG,SAAS,CAAC;IAChB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ralph-hero-knowledge-index",
3
- "version": "0.1.20",
3
+ "version": "0.1.21",
4
4
  "type": "module",
5
5
  "main": "dist/index.js",
6
6
  "bin": {
@@ -0,0 +1,246 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { chunkText, type Chunk } from "../chunker.js";
3
+
4
+ /**
5
+ * Test suite for the recursive character text splitter.
6
+ *
7
+ * Invariants asserted (all must hold for every chunk returned):
8
+ * - text.slice(charStart, charEnd) === content
9
+ * - charStart is monotonically non-decreasing
10
+ * - content.length <= chunkSize + (longest separator length)
11
+ */
12
+
13
/**
 * Assert the invariants every chunk list must satisfy: sequential `index`,
 * `content` equal to the slice addressed by its offsets, non-decreasing
 * `charStart`, and length at most `chunkSize + separatorSlack`.
 */
function assertCoreInvariants(
  text: string,
  chunks: Chunk[],
  chunkSize: number,
  separatorSlack = 2,
): void {
  const maxLength = chunkSize + separatorSlack;
  let previousStart = -1;
  chunks.forEach((chunk, position) => {
    expect(chunk.index).toBe(position);
    expect(text.slice(chunk.charStart, chunk.charEnd)).toBe(chunk.content);
    expect(chunk.charStart).toBeGreaterThanOrEqual(previousStart);
    expect(chunk.content.length).toBeLessThanOrEqual(maxLength);
    previousStart = chunk.charStart;
  });
}
29
+
30
// Inputs that never reach the splitting loop: empty text and text that fits
// in a single chunk.
describe("chunkText — empty and short inputs", () => {
  it("returns [] for empty string", () => {
    expect(chunkText("")).toEqual([]);
  });

  it("returns a single chunk for a short doc", () => {
    const text = "short doc";
    const chunks = chunkText(text);
    expect(chunks).toHaveLength(1);
    expect(chunks[0]!.charStart).toBe(0);
    expect(chunks[0]!.charEnd).toBe(text.length);
    expect(chunks[0]!.content).toBe(text);
    expect(chunks[0]!.index).toBe(0);
  });

  // Boundary case: length === chunkSize still takes the single-chunk path.
  it("returns a single chunk at exactly chunkSize", () => {
    const text = "x".repeat(2048);
    const chunks = chunkText(text, { chunkSize: 2048 });
    expect(chunks).toHaveLength(1);
    expect(chunks[0]!.content).toBe(text);
  });

  // The separator list is irrelevant when the text already fits.
  it("returns a single chunk when text.length < chunkSize even with unusual separators", () => {
    const chunks = chunkText("hello world", {
      chunkSize: 100,
      chunkOverlap: 10,
      separators: ["##", "\n"],
    });
    expect(chunks).toHaveLength(1);
    expect(chunks[0]!.content).toBe("hello world");
  });
});
62
+
63
// Documents larger than chunkSize: chunk count, length bound, offset ordering
// and exact reconstruction from offsets.
describe("chunkText — long documents", () => {
  it("produces >= 4 chunks for an 8K-char paragraph-rich doc with chunkSize=2048", () => {
    // 10 paragraphs of 900 chars joined by blank lines (~9K chars in total).
    const paragraph = "The quick brown fox jumps over the lazy dog. ".repeat(20);
    const text = Array.from({ length: 10 }, () => paragraph).join("\n\n");
    expect(text.length).toBeGreaterThan(8000);

    const chunks = chunkText(text, { chunkSize: 2048, chunkOverlap: 256 });
    expect(chunks.length).toBeGreaterThanOrEqual(4);
    assertCoreInvariants(text, chunks, 2048);
  });

  it("bounds content.length at chunkSize + separator slack", () => {
    const text = Array.from({ length: 400 }, (_, i) => `Sentence ${i}.`).join(" ");
    const chunks = chunkText(text, { chunkSize: 512, chunkOverlap: 64 });
    assertCoreInvariants(text, chunks, 512);
  });

  it("keeps charStart monotonically non-decreasing", () => {
    // No separators occur in the text, so this exercises character splitting.
    const text = "x".repeat(10_000);
    const chunks = chunkText(text, { chunkSize: 256, chunkOverlap: 32 });
    for (let i = 1; i < chunks.length; i++) {
      expect(chunks[i]!.charStart).toBeGreaterThanOrEqual(chunks[i - 1]!.charStart);
    }
  });

  it("reconstructs content bit-for-bit from offsets for every chunk", () => {
    const text = Array.from({ length: 500 }, (_, i) => `Para ${i}.\n\n`).join("");
    const chunks = chunkText(text, { chunkSize: 1024, chunkOverlap: 128 });
    for (const c of chunks) {
      expect(text.slice(c.charStart, c.charEnd)).toBe(c.content);
    }
  });
});
96
+
97
// Overlap between consecutive chunks and validation of the overlap option.
describe("chunkText — overlap behavior", () => {
  it("produces overlap of approximately chunkOverlap between consecutive chunks", () => {
    const text = "abcdefghij ".repeat(500); // ~5500 chars
    const chunkOverlap = 256;
    const chunks = chunkText(text, { chunkSize: 1024, chunkOverlap });

    expect(chunks.length).toBeGreaterThan(1);

    for (let i = 1; i < chunks.length; i++) {
      const prev = chunks[i - 1]!;
      const curr = chunks[i]!;
      // Overlap in characters = prev.charEnd - curr.charStart.
      const overlap = prev.charEnd - curr.charStart;
      expect(overlap).toBeGreaterThan(0);
      // Tolerance: +/- 16 chars (allows for snap to atom boundary; each
      // "abcdefghij " atom is 11 chars long).
      expect(Math.abs(overlap - chunkOverlap)).toBeLessThanOrEqual(16);
    }
  });

  it("makes forward progress even when chunkOverlap is zero", () => {
    const text = "abcdefghij ".repeat(300);
    const chunks = chunkText(text, { chunkSize: 512, chunkOverlap: 0 });
    // With no overlap each chunk must start at or after the previous end.
    for (let i = 1; i < chunks.length; i++) {
      expect(chunks[i]!.charStart).toBeGreaterThanOrEqual(chunks[i - 1]!.charEnd);
    }
  });

  it("rejects chunkOverlap >= chunkSize", () => {
    // The option check runs for any non-empty text (only the empty-string
    // early return precedes it), so the exact length here does not matter.
    const text = "x".repeat(2048);
    expect(() =>
      chunkText(text, { chunkSize: 100, chunkOverlap: 100 }),
    ).toThrow(/chunkOverlap/);
    expect(() =>
      chunkText(text, { chunkSize: 100, chunkOverlap: 200 }),
    ).toThrow(/chunkOverlap/);
  });
});
135
+
136
// Non-ASCII input: offsets must stay consistent and chunk boundaries must not
// cut through a surrogate pair.
describe("chunkText — unicode correctness", () => {
  it("preserves character boundaries with emoji + CJK", () => {
    // Mix emoji (surrogate pairs), CJK, and ASCII; repeat enough to exceed chunkSize.
    const segment =
      "Hello. こんにちは。 你好。 안녕하세요。 Emojis: 🎉🚀✨. ";
    const text = segment.repeat(100); // plenty of chunks
    const chunks = chunkText(text, { chunkSize: 512, chunkOverlap: 64 });

    assertCoreInvariants(text, chunks, 512);

    for (const c of chunks) {
      // Slicing must produce the same string as content. This alone would
      // still pass if an offset split a surrogate pair, because both sides
      // would contain the same lone surrogate.
      expect(text.slice(c.charStart, c.charEnd)).toBe(c.content);

      // So additionally verify the boundaries themselves: a chunk must not
      // start with a low surrogate or end with a high surrogate.
      const firstUnit = c.content.charCodeAt(0);
      const lastUnit = c.content.charCodeAt(c.content.length - 1);
      expect(firstUnit >= 0xdc00 && firstUnit <= 0xdfff).toBe(false);
      expect(lastUnit >= 0xd800 && lastUnit <= 0xdbff).toBe(false);
    }
  });

  it("handles pure CJK doc (no ASCII separators beyond ideographic space)", () => {
    // None of the default separators occur in this text, so it is split
    // into characters and re-packed.
    const text = "这是一个很长的中文文档。".repeat(300);
    const chunks = chunkText(text, { chunkSize: 256, chunkOverlap: 32 });
    assertCoreInvariants(text, chunks, 256);
    // Must cover the whole doc (last chunk ends at text.length).
    expect(chunks[chunks.length - 1]!.charEnd).toBe(text.length);
  });
});
162
+
163
// Separator handling: priority order, custom separator lists, and the
// character-level fallback.
describe("chunkText — separator priority", () => {
  it("prefers \\n\\n over other separators so code fences stay intact", () => {
    // A large markdown doc with a code fence that would exceed chunkSize if
    // we naively split on every newline; ensure it's kept whole by preferring
    // \n\n as the outer boundary.
    const prefix = "# Heading\n\nSome text before the fence.\n\n";
    const fence =
      "```typescript\n" +
      "const x = 1;\n".repeat(60) +
      "```";
    const suffix = "\n\nText after the fence.";
    const text = prefix + fence + suffix;

    const chunkSize = fence.length + 50; // Large enough to keep fence whole.
    const chunks = chunkText(text, {
      chunkSize,
      chunkOverlap: 64,
      separators: ["\n\n", "\n", ". ", " ", ""],
    });

    // Fence should appear intact in at least one chunk.
    const anyChunkContainsWholeFence = chunks.some(c => c.content.includes(fence));
    expect(anyChunkContainsWholeFence).toBe(true);

    // And no chunk should contain just a fragment of the opening backticks
    // separated from its closing ones — i.e., if a chunk starts with ``` it
    // must also contain the matching closing ```.
    for (const c of chunks) {
      const opens = (c.content.match(/```/g) ?? []).length;
      // Either zero fences (text-only chunk) or an even number (complete fences).
      expect(opens % 2 === 0 || c.content.includes(fence)).toBe(true);
    }
  });

  it("honors custom separators — '##' heading splitter", () => {
    const text =
      "Intro paragraph here.\n\n" +
      "## Section A\nContent for A.\n\n" +
      "## Section B\nContent for B with a bit more text.\n\n" +
      "## Section C\nFinal content goes here.";
    const chunks = chunkText(text, {
      chunkSize: 40, // small -> forces splitting
      chunkOverlap: 8,
      separators: ["##", "\n", " ", ""],
    });

    // Custom separator must be honored: the chunker must split on '##'.
    // Because the separator stays attached to the preceding piece (to keep
    // text.slice(start,end) === content), chunk boundaries occur *just after*
    // each '##' occurrence — verify that at least one chunk's charEnd aligns
    // with a '##' position + 2.
    expect(chunks.length).toBeGreaterThan(1);
    const sectionStarts = [
      text.indexOf("## Section A"),
      text.indexOf("## Section B"),
      text.indexOf("## Section C"),
    ];
    const endPositions = chunks.map(c => c.charEnd);
    const boundariesHit = sectionStarts.filter(pos =>
      endPositions.includes(pos + 2),
    );
    expect(boundariesHit.length).toBeGreaterThan(0);
  });

  it("character-splits as a last resort when no separator applies", () => {
    // A single token longer than chunkSize with no separators; must still
    // produce chunks (via the "" sentinel).
    const text = "a".repeat(300);
    const chunks = chunkText(text, {
      chunkSize: 100,
      chunkOverlap: 10,
      separators: ["\n\n", "\n", ""],
    });
    expect(chunks.length).toBeGreaterThan(1);
    for (const c of chunks) {
      expect(c.content.length).toBeLessThanOrEqual(100);
    }
    // Coverage check on the outer bounds only: the first chunk starts at 0
    // and the last chunk ends at the end of the text.
    const firstChar = chunks[0]!.charStart;
    const lastEnd = chunks[chunks.length - 1]!.charEnd;
    expect(firstChar).toBe(0);
    expect(lastEnd).toBe(text.length);
  });
});
package/src/chunker.ts ADDED
@@ -0,0 +1,279 @@
1
+ /**
2
+ * RecursiveCharacterTextSplitter-style chunker.
3
+ *
4
+ * Splits long text into overlapping chunks while preserving the original
5
+ * character offsets (charStart, charEnd) so downstream code can reconstruct
6
+ * positions. Mirrors the semantics of LangChain's RecursiveCharacterTextSplitter:
7
+ * tries each separator in order, snapping chunk boundaries to the highest-priority
8
+ * separator that keeps pieces under chunkSize.
9
+ *
10
+ * Defaults correspond to 512-token chunks with 64-token overlap
11
+ * (approx: 1 token ~= 4 chars for English text).
12
+ */
13
+
14
/** One contiguous slice of the source text, as returned by `chunkText`. */
export interface Chunk {
  /** Zero-based position of this chunk in the returned array. */
  index: number;
  /** The chunk text; always equal to `text.slice(charStart, charEnd)`. */
  content: string;
  /** Start offset into the original string (inclusive, as used by `String.prototype.slice`). */
  charStart: number;
  /** End offset into the original string (exclusive, as used by `String.prototype.slice`). */
  charEnd: number;
}
20
+
21
/** Optional settings for `chunkText`; omitted fields fall back to the module defaults. */
export interface ChunkerOptions {
  /** Target maximum chunk length, measured in string length units (default 2048). */
  chunkSize?: number;
  /** Desired overlap between consecutive chunks (default 256); must be smaller than `chunkSize`. */
  chunkOverlap?: number;
  /** Separators tried in priority order (default `["\n\n", "\n", ". ", " ", ""]`). */
  separators?: string[];
}
26
+
27
// Default chunk length in characters (~512 tokens at ~4 chars per token).
const DEFAULT_CHUNK_SIZE = 2048;
// Default overlap in characters (~64 tokens at ~4 chars per token).
const DEFAULT_CHUNK_OVERLAP = 256;
// Tried in order: paragraph break, line break, sentence end, space, and
// finally "" which means "split into individual characters".
const DEFAULT_SEPARATORS: string[] = ["\n\n", "\n", ". ", " ", ""];
30
+
31
/**
 * A piece of the original text with an absolute offset back to the source.
 * Used as the internal working type while recursing through separators.
 */
interface Piece {
  /** The substring itself (including any trailing separator kept by the splitter). */
  text: string;
  /** Index of `text`'s first character within the original input string. */
  start: number;
}
39
+
40
/**
 * Choose the separator to split `pieceText` with.
 *
 * Scans `separators` in priority order and returns the first entry that is
 * either the empty string or occurs in `pieceText`, together with the
 * lower-priority entries that follow it. If nothing qualifies, the result is
 * the empty-string sentinel with no remaining separators, which makes the
 * caller fall back to character splitting.
 */
function pickSeparator(pieceText: string, separators: string[]): {
  separator: string;
  remaining: string[];
} {
  const hit = separators.findIndex(
    (candidate) => candidate === "" || pieceText.includes(candidate),
  );
  if (hit === -1) {
    return { separator: "", remaining: [] };
  }
  return { separator: separators[hit]!, remaining: separators.slice(hit + 1) };
}
62
+
63
/**
 * Split a piece on `separator` while retaining absolute offsets.
 *
 * - Non-empty separator: every occurrence stays attached to the END of the
 *   piece that precedes it, so the returned pieces tile the input exactly and
 *   `text.slice(start, start + piece.text.length)` reproduces each piece.
 * - Empty separator: the piece is split into individual characters. Iteration
 *   is by Unicode code point, so a surrogate pair (e.g. an emoji) is kept as
 *   one two-unit piece instead of being cut into two lone surrogates.
 *
 * `start` values remain string indices (UTF-16 code units).
 *
 * @param piece - The piece to split.
 * @param separator - Separator to split on; `""` requests a character split.
 * @returns The sub-pieces in order; empty when `piece.text` is empty.
 */
function splitOnSeparator(piece: Piece, separator: string): Piece[] {
  const out: Piece[] = [];

  if (separator === "") {
    let offset = 0;
    // for...of walks code points; `symbol.length` is 1 or 2 code units.
    for (const symbol of piece.text) {
      out.push({ text: symbol, start: piece.start + offset });
      offset += symbol.length;
    }
    return out;
  }

  let cursor = 0;
  let idx = piece.text.indexOf(separator, cursor);
  while (idx !== -1) {
    // Keep the separator attached to the preceding piece so reconstruction
    // via text.slice(charStart, charEnd) works bit-for-bit.
    const sliceEnd = idx + separator.length;
    out.push({
      text: piece.text.slice(cursor, sliceEnd),
      start: piece.start + cursor,
    });
    cursor = sliceEnd;
    idx = piece.text.indexOf(separator, cursor);
  }
  // Trailing remainder after the last separator, if any.
  if (cursor < piece.text.length) {
    out.push({
      text: piece.text.slice(cursor),
      start: piece.start + cursor,
    });
  }
  return out;
}
98
+
99
/**
 * Recursively flatten a piece into "atoms": pieces small enough to be merged
 * greedily into chunks.
 *
 * A piece that already fits in `chunkSize` is returned as-is. Otherwise it is
 * split with the highest-priority applicable separator, and any sub-piece
 * that is still too large is flattened again with the remaining separators.
 * When no separators remain, an oversized sub-piece is split into characters.
 *
 * @param piece - The piece to flatten.
 * @param separators - Separators still available, in priority order.
 * @param chunkSize - Maximum length an atom should have.
 * @returns The atoms in document order. A piece is returned oversized only
 *   when splitting yields at most one part and no separators remain.
 */
function flattenToAtoms(
  piece: Piece,
  separators: string[],
  chunkSize: number,
): Piece[] {
  if (piece.text.length <= chunkSize) {
    return [piece];
  }
  const { separator, remaining } = pickSeparator(piece.text, separators);
  const splits = splitOnSeparator(piece, separator);
  const exhausted = remaining.length === 0;

  // The separator did not divide the piece: retry with the next one, or give
  // up and return the piece unchanged when none are left.
  if (splits.length <= 1) {
    return exhausted ? [piece] : flattenToAtoms(piece, remaining, chunkSize);
  }

  const out: Piece[] = [];
  for (const sub of splits) {
    if (sub.text.length <= chunkSize) {
      out.push(sub);
      continue;
    }
    const leaves = exhausted
      ? splitOnSeparator(sub, "") // last resort: character split
      : flattenToAtoms(sub, remaining, chunkSize);
    // Append one by one. `out.push(...leaves)` would pass every leaf as a
    // call argument and can throw RangeError on very long pieces.
    for (const leaf of leaves) {
      out.push(leaf);
    }
  }
  return out;
}
141
+
142
/**
 * Assemble a chunk from a contiguous, non-empty run of atoms.
 *
 * The chunk spans from the first atom's start to the end of the last atom,
 * and `content` is cut directly from `originalText` with those offsets, so
 * `originalText.slice(charStart, charEnd) === content` holds by construction.
 *
 * @param originalText - The full input text the atoms were derived from.
 * @param atoms - The run of atoms that makes up this chunk.
 * @param index - Position of the chunk in the output array.
 */
function buildChunk(
  originalText: string,
  atoms: Piece[],
  index: number,
): Chunk {
  const head = atoms[0]!;
  const tail = atoms[atoms.length - 1]!;
  const charStart = head.start;
  const charEnd = tail.start + tail.text.length;
  const content = originalText.slice(charStart, charEnd);
  return { index, content, charStart, charEnd };
}
164
+
165
/**
 * Find the atom index at which the next chunk should start so that it
 * overlaps the previous chunk by roughly `chunkOverlap` characters.
 *
 * Starting just past the previous chunk's last atom (`lastEndAtomIndex + 1`),
 * the index is moved backwards for as long as the preceding atom starts at or
 * after `prevEnd - chunkOverlap`. With `chunkOverlap <= 0` no atoms are
 * re-used and `lastEndAtomIndex + 1` is returned.
 *
 * Note: the result depends only on atom offsets. It can be as low as 0 and
 * is not guaranteed to lie after the previous chunk's first atom.
 *
 * @param atoms - All atoms of the document, in order.
 * @param lastEndAtomIndex - Index of the last atom in the previous chunk.
 * @param prevEnd - `charEnd` of the previous chunk.
 * @param chunkOverlap - Desired overlap in characters.
 */
function findOverlapStartIndex(
  atoms: Piece[],
  lastEndAtomIndex: number,
  prevEnd: number,
  chunkOverlap: number,
): number {
  let candidate = lastEndAtomIndex + 1;
  if (chunkOverlap <= 0) {
    return candidate;
  }
  const threshold = prevEnd - chunkOverlap;
  while (candidate > 0 && atoms[candidate - 1]!.start >= threshold) {
    candidate--;
  }
  return candidate;
}
197
+
198
/**
 * Split `text` into overlapping chunks.
 *
 * Semantics:
 * - Empty input -> empty array.
 * - Short input (<= chunkSize) -> single chunk covering the whole text.
 * - For each chunk, `text.slice(charStart, charEnd) === content`.
 * - `charStart` is monotonically non-decreasing across chunks.
 * - Consecutive chunks overlap by ~`chunkOverlap` chars (snapped to atom
 *   boundaries; may differ by up to the largest atom size). The overlap is
 *   shortened, possibly to zero, when carrying all of it would leave no room
 *   for the next atom.
 * - Each chunk's content length is bounded by chunkSize + a small slack for
 *   the separator that snapped the boundary.
 * - Every chunk after the first contains at least one atom that the previous
 *   chunk did not, so the loop always terminates.
 *
 * @param text - The document to split.
 * @param opts - Optional chunk size, overlap and separator list.
 * @returns The chunks in document order, with `index` counting from 0.
 * @throws Error when `text` is non-empty and `chunkOverlap >= chunkSize`.
 */
export function chunkText(text: string, opts: ChunkerOptions = {}): Chunk[] {
  const chunkSize = opts.chunkSize ?? DEFAULT_CHUNK_SIZE;
  const chunkOverlap = opts.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
  const separators = opts.separators ?? DEFAULT_SEPARATORS;

  if (text.length === 0) {
    return [];
  }

  if (chunkOverlap >= chunkSize) {
    throw new Error(
      `chunker: chunkOverlap (${chunkOverlap}) must be smaller than chunkSize (${chunkSize})`,
    );
  }

  // Fast path for short docs — no need to walk separators.
  if (text.length <= chunkSize) {
    return [
      {
        index: 0,
        content: text,
        charStart: 0,
        charEnd: text.length,
      },
    ];
  }

  const atoms = flattenToAtoms({ text, start: 0 }, separators, chunkSize);

  const chunks: Chunk[] = [];
  let i = 0;

  while (i < atoms.length) {
    // Greedily pack atoms into this chunk until adding one more would push
    // the chunk content past chunkSize.
    let runLen = 0;
    let j = i;
    while (j < atoms.length) {
      const atomLen = atoms[j]!.text.length;
      // Always include at least one atom per chunk to ensure progress.
      if (j > i && runLen + atomLen > chunkSize) {
        break;
      }
      runLen += atomLen;
      j++;
    }
    const chunk = buildChunk(text, atoms.slice(i, j), chunks.length);
    chunks.push(chunk);

    if (j >= atoms.length) {
      break;
    }

    // Compute overlap start for the next chunk.
    let nextStart = findOverlapStartIndex(atoms, j - 1, chunk.charEnd, chunkOverlap);

    // Drop leading overlap atoms until atom j (the first new one) fits next
    // to the carried overlap. Without this, a short atom followed by a large
    // one makes the next chunk a copy of (part of) this one and the loop
    // never advances. Since atoms[i..j) plus atom j exceed chunkSize, this
    // always ends with nextStart > i.
    const nextAtomLen = atoms[j]!.text.length;
    let carried = 0;
    for (let k = nextStart; k < j; k++) {
      carried += atoms[k]!.text.length;
    }
    while (nextStart < j && carried + nextAtomLen > chunkSize) {
      carried -= atoms[nextStart]!.text.length;
      nextStart++;
    }
    i = nextStart;
  }

  return chunks;
}