ralph-hero-knowledge-index 0.1.30 → 0.1.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/.mcp.json +1 -1
- package/dist/chunker.js +12 -1
- package/dist/chunker.js.map +1 -1
- package/package.json +1 -1
- package/src/__tests__/chunker.test.ts +56 -0
- package/src/chunker.ts +12 -1
package/.claude-plugin/plugin.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ralph-knowledge",
|
|
3
|
-
"version": "0.1.30",
|
|
3
|
+
"version": "0.1.31",
|
|
4
4
|
"description": "Knowledge graph for ralph-hero: semantic search, relationship traversal, and document indexing across thoughts/ documents. Optional companion to ralph-hero.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Chad Dubiel",
|
package/.mcp.json
CHANGED
package/dist/chunker.js
CHANGED
|
@@ -127,6 +127,10 @@ function buildChunk(originalText, atoms, index) {
|
|
|
127
127
|
* chunk ended at `prevEnd`. We walk backward through the atom list to find
|
|
128
128
|
* the atom whose start >= prevEnd - chunkOverlap; that atom begins the
|
|
129
129
|
* overlap region.
|
|
130
|
+
*
|
|
131
|
+
* NOTE: this function may return an index <= the previous chunk's first atom
|
|
132
|
+
* when the previous chunk consisted of a single atom shorter than chunkOverlap.
|
|
133
|
+
* The caller in chunkText clamps the result to ensure forward progress.
|
|
130
134
|
*/
|
|
131
135
|
function findOverlapStartIndex(atoms, lastEndAtomIndex, prevEnd, chunkOverlap) {
|
|
132
136
|
if (chunkOverlap <= 0) {
|
|
@@ -212,7 +216,14 @@ export function chunkText(text, opts = {}) {
|
|
|
212
216
|
// Compute overlap start for the next chunk.
|
|
213
217
|
const lastEndAtomIdx = j - 1;
|
|
214
218
|
const nextStart = findOverlapStartIndex(atoms, lastEndAtomIdx, chunk.charEnd, chunkOverlap);
|
|
215
|
-
|
|
219
|
+
// Ensure forward progress: when a chunk consists of a single atom shorter
|
|
220
|
+
// than chunkOverlap, the overlap walk would land back on the same atom.
|
|
221
|
+
// Clamp to i + 1 in that case so we always advance, accepting that the
|
|
222
|
+
// resulting chunks will not overlap (this is the only correct option:
|
|
223
|
+
// overlap requires that the next chunk start within the previous chunk's
|
|
224
|
+
// span, but if the previous chunk has only one atom, there is no earlier
|
|
225
|
+
// position within its span to start from).
|
|
226
|
+
i = nextStart > i ? nextStart : i + 1;
|
|
216
227
|
}
|
|
217
228
|
return chunks;
|
|
218
229
|
}
|
package/dist/chunker.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"chunker.js","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAeH,MAAM,kBAAkB,GAAG,IAAI,CAAC;AAChC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAClC,MAAM,kBAAkB,GAAa,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC;AAWnE;;;;;GAKG;AACH,SAAS,aAAa,CAAC,SAAiB,EAAE,UAAoB;IAI5D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAE,CAAC;QAC3B,IAAI,GAAG,KAAK,EAAE,EAAE,CAAC;YACf,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE,SAAS,EAAE,UAAU,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;QAChE,CAAC;QACD,IAAI,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC5B,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE,SAAS,EAAE,UAAU,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;QAChE,CAAC;IACH,CAAC;IACD,8DAA8D;IAC9D,OAAO,EAAE,SAAS,EAAE,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;AAC1C,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB,CAAC,KAAY,EAAE,SAAiB;IACvD,IAAI,SAAS,KAAK,EAAE,EAAE,CAAC;QACrB,MAAM,GAAG,GAAY,EAAE,CAAC;QACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC,CAAE,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;QAC7D,CAAC;QACD,OAAO,GAAG,CAAC;IACb,CAAC;IAED,MAAM,GAAG,GAAY,EAAE,CAAC;IACxB,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,IAAI,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAChD,OAAO,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;QAClB,uEAAuE;QACvE,wDAAwD;QACxD,MAAM,QAAQ,GAAG,GAAG,GAAG,SAAS,CAAC,MAAM,CAAC;QACxC,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,QAAQ,CAAC;YACxC,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,MAAM;SAC5B,CAAC,CAAC;QACH,MAAM,GAAG,QAAQ,CAAC;QAClB,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAC9C,CAAC;IACD,IAAI,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QAC/B,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;YAC9B,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,MAAM;SAC5B,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;GAIG;AACH,SAAS,cAAc,C
ACrB,KAAY,EACZ,UAAoB,EACpB,SAAiB;IAEjB,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QACnC,OAAO,CAAC,KAAK,CAAC,CAAC;IACjB,CAAC;IACD,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,GAAG,aAAa,CAAC,KAAK,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;IACvE,MAAM,MAAM,GAAG,gBAAgB,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IAElD,2EAA2E;IAC3E,8DAA8D;IAC9D,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACvB,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,mEAAmE;YACnE,OAAO,CAAC,KAAK,CAAC,CAAC;QACjB,CAAC;QACD,OAAO,cAAc,CAAC,KAAK,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC;IACrD,CAAC;IAED,MAAM,GAAG,GAAY,EAAE,CAAC;IACxB,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QACzB,IAAI,GAAG,CAAC,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YACjC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChB,CAAC;aAAM,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChC,KAAK,MAAM,IAAI,IAAI,cAAc,CAAC,GAAG,EAAE,SAAS,EAAE,SAAS,CAAC,EAAE,CAAC;gBAC7D,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjB,CAAC;QACH,CAAC;aAAM,CAAC;YACN,mEAAmE;YACnE,qCAAqC;YACrC,GAAG,CAAC,IAAI,CAAC,GAAG,gBAAgB,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,CAAC;QACzC,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;GAKG;AACH,SAAS,UAAU,CACjB,YAAoB,EACpB,KAAc,EACd,KAAa;IAEb,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC;IACxB,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC;IACtC,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC;IAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC;IAC9C,OAAO;QACL,KAAK;QACL,OAAO,EAAE,YAAY,CAAC,KAAK,CAAC,SAAS,EAAE,OAAO,CAAC;QAC/C,SAAS;QACT,OAAO;KACR,CAAC;AACJ,CAAC;AAED
|
|
1
|
+
{"version":3,"file":"chunker.js","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAeH,MAAM,kBAAkB,GAAG,IAAI,CAAC;AAChC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAClC,MAAM,kBAAkB,GAAa,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC;AAWnE;;;;;GAKG;AACH,SAAS,aAAa,CAAC,SAAiB,EAAE,UAAoB;IAI5D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAE,CAAC;QAC3B,IAAI,GAAG,KAAK,EAAE,EAAE,CAAC;YACf,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE,SAAS,EAAE,UAAU,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;QAChE,CAAC;QACD,IAAI,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC5B,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE,SAAS,EAAE,UAAU,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;QAChE,CAAC;IACH,CAAC;IACD,8DAA8D;IAC9D,OAAO,EAAE,SAAS,EAAE,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;AAC1C,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB,CAAC,KAAY,EAAE,SAAiB;IACvD,IAAI,SAAS,KAAK,EAAE,EAAE,CAAC;QACrB,MAAM,GAAG,GAAY,EAAE,CAAC;QACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC,CAAE,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;QAC7D,CAAC;QACD,OAAO,GAAG,CAAC;IACb,CAAC;IAED,MAAM,GAAG,GAAY,EAAE,CAAC;IACxB,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,IAAI,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAChD,OAAO,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;QAClB,uEAAuE;QACvE,wDAAwD;QACxD,MAAM,QAAQ,GAAG,GAAG,GAAG,SAAS,CAAC,MAAM,CAAC;QACxC,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,QAAQ,CAAC;YACxC,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,MAAM;SAC5B,CAAC,CAAC;QACH,MAAM,GAAG,QAAQ,CAAC;QAClB,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAC9C,CAAC;IACD,IAAI,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QAC/B,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;YAC9B,KAAK,EAAE,KAAK,CAAC,KAAK,GAAG,MAAM;SAC5B,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;GAIG;AACH,SAAS,cAAc,C
ACrB,KAAY,EACZ,UAAoB,EACpB,SAAiB;IAEjB,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QACnC,OAAO,CAAC,KAAK,CAAC,CAAC;IACjB,CAAC;IACD,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,GAAG,aAAa,CAAC,KAAK,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;IACvE,MAAM,MAAM,GAAG,gBAAgB,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IAElD,2EAA2E;IAC3E,8DAA8D;IAC9D,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACvB,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,mEAAmE;YACnE,OAAO,CAAC,KAAK,CAAC,CAAC;QACjB,CAAC;QACD,OAAO,cAAc,CAAC,KAAK,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC;IACrD,CAAC;IAED,MAAM,GAAG,GAAY,EAAE,CAAC;IACxB,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QACzB,IAAI,GAAG,CAAC,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YACjC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChB,CAAC;aAAM,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChC,KAAK,MAAM,IAAI,IAAI,cAAc,CAAC,GAAG,EAAE,SAAS,EAAE,SAAS,CAAC,EAAE,CAAC;gBAC7D,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjB,CAAC;QACH,CAAC;aAAM,CAAC;YACN,mEAAmE;YACnE,qCAAqC;YACrC,GAAG,CAAC,IAAI,CAAC,GAAG,gBAAgB,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,CAAC;QACzC,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;GAKG;AACH,SAAS,UAAU,CACjB,YAAoB,EACpB,KAAc,EACd,KAAa;IAEb,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC;IACxB,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC;IACtC,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC;IAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC;IAC9C,OAAO;QACL,KAAK;QACL,OAAO,EAAE,YAAY,CAAC,KAAK,CAAC,SAAS,EAAE,OAAO,CAAC;QAC/C,SAAS;QACT,OAAO;KACR,CAAC;AACJ,CAAC;AAED;;;;;;;;;GASG;AACH,SAAS,qBAAqB,CAC5B,KAAc,EACd,gBAAwB,EACxB,OAAe,EACf,YAAoB;IAEpB,IAAI,YAAY,IAAI,CAAC,EAAE,CAAC;QACtB,OAAO,gBAAgB,GAAG,CAAC,CAAC;IAC9B,CAAC;IACD,MAAM,WAAW,GAAG,OAAO,GAAG,YAAY,CAAC;IAC3C,8EAA8E;IAC9E,IAAI,cAAc,GAAG,gBAAgB,GAAG,CAAC,CAAC;IAC1C,KAAK,IAAI,CAAC,GAAG,gBAAgB,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,IAAI,KAAK,CAAC,CAAC,CAAE,CAAC,KAAK,IAAI,WAAW,EAAE,CAAC;YACnC,cAAc,GAAG,CAAC,CAAC;QACrB,CAAC;aAAM,CAAC;YACN,MAAM;QACR,CAAC;IACH,CAAC;IACD,0EAA0E;IAC1E,oBAAoB;IACpB,IAAI,cAAc,GAAG,gBAAgB,EAAE,CAAC;QACtC,cAAc,GAAG,gBAAgB,GAAG,
CAAC,CAAC;IACxC,CAAC;IACD,OAAO,cAAc,CAAC;AACxB,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,OAAuB,EAAE;IAC/D,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,kBAAkB,CAAC;IACvD,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,qBAAqB,CAAC;IAChE,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,IAAI,kBAAkB,CAAC;IAEzD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtB,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,IAAI,YAAY,IAAI,SAAS,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CACb,0BAA0B,YAAY,qCAAqC,SAAS,GAAG,CACxF,CAAC;IACJ,CAAC;IAED,yDAAyD;IACzD,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC7B,OAAO;YACL;gBACE,KAAK,EAAE,CAAC;gBACR,OAAO,EAAE,IAAI;gBACb,SAAS,EAAE,CAAC;gBACZ,OAAO,EAAE,IAAI,CAAC,MAAM;aACrB;SACF,CAAC;IACJ,CAAC;IAED,MAAM,KAAK,GAAG,cAAc,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;IAExE,MAAM,MAAM,GAAY,EAAE,CAAC;IAC3B,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,CAAC,GAAG,CAAC,CAAC;IAEV,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QACxB,uEAAuE;QACvE,oCAAoC;QACpC,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,IAAI,CAAC,GAAG,CAAC,CAAC;QACV,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,MAAM,CAAC;YACtC,iEAAiE;YACjE,IAAI,CAAC,GAAG,CAAC,IAAI,MAAM,GAAG,OAAO,GAAG,SAAS,EAAE,CAAC;gBAC1C,MAAM;YACR,CAAC;YACD,MAAM,IAAI,OAAO,CAAC;YAClB,CAAC,EAAE,CAAC;QACN,CAAC;QACD,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QACnC,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAC;QACnD,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACnB,QAAQ,EAAE,CAAC;QAEX,IAAI,CAAC,IAAI,KAAK,CAAC,MAAM,EAAE,CAAC;YACtB,MAAM;QACR,CAAC;QAED,4CAA4C;QAC5C,MAAM,cAAc,GAAG,CAAC,GAAG,CAAC,CAAC;QAC7B,MAAM,SAAS,GAAG,qBAAqB,CACrC,KAAK,EACL,cAAc,EACd,KAAK,CAAC,OAAO,EACb,YAAY,CACb,CAAC;QACF,0EAA0E;QAC1E,wEAAwE;QACxE,uEAAuE;QACvE,sEAAsE;QACtE,yEAAyE;QACzE,yEAAyE;QACzE,2CAA2C;QAC3C,CAAC,GAAG,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;IACxC,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
package/package.json
CHANGED
package/src/__tests__/chunker.test.ts
CHANGED
|
@@ -244,3 +244,59 @@ describe("chunkText — separator priority", () => {
|
|
|
244
244
|
expect(lastEnd).toBe(text.length);
|
|
245
245
|
});
|
|
246
246
|
});
|
|
247
|
+
|
|
248
|
+
describe("chunkText — forward progress invariant", () => {
|
|
249
|
+
// Regression test for GH-916. Pre-fix this OOMs.
|
|
250
|
+
it("terminates when a chunk would consist of a single atom shorter than chunkOverlap", () => {
|
|
251
|
+
// Trigger: atom N is short (< 256 chars), atom N+1 is large enough that
|
|
252
|
+
// packing them together would exceed chunkSize (2048).
|
|
253
|
+
const shortAtom = "short paragraph that is under 256 chars.\n\n";
|
|
254
|
+
const longAtom = "x".repeat(1900) + "\n\n";
|
|
255
|
+
const text = shortAtom + longAtom + "tail.";
|
|
256
|
+
const chunks = chunkText(text);
|
|
257
|
+
// We don't care about the exact count; we care that chunkText returns at
|
|
258
|
+
// all (pre-fix it loops forever) and that all chunks have non-empty content.
|
|
259
|
+
expect(chunks.length).toBeGreaterThan(0);
|
|
260
|
+
expect(chunks.length).toBeLessThan(100); // sanity: no runaway
|
|
261
|
+
for (const c of chunks) {
|
|
262
|
+
expect(c.content.length).toBeGreaterThan(0);
|
|
263
|
+
}
|
|
264
|
+
});
|
|
265
|
+
|
|
266
|
+
it("makes strict forward progress: chunk[i+1].charStart > chunk[i].charStart", () => {
|
|
267
|
+
// Build an input that triggers many single-atom chunks.
|
|
268
|
+
const blocks = [];
|
|
269
|
+
for (let i = 0; i < 20; i++) {
|
|
270
|
+
blocks.push("short " + i + ".\n\n");
|
|
271
|
+
blocks.push("x".repeat(1900) + "\n\n");
|
|
272
|
+
}
|
|
273
|
+
const chunks = chunkText(blocks.join(""));
|
|
274
|
+
expect(chunks.length).toBeGreaterThan(0);
|
|
275
|
+
for (let i = 1; i < chunks.length; i++) {
|
|
276
|
+
expect(chunks[i]!.charStart).toBeGreaterThan(chunks[i - 1]!.charStart);
|
|
277
|
+
}
|
|
278
|
+
});
|
|
279
|
+
|
|
280
|
+
it("chunks the GH-916 fixture file in bounded time and memory", () => {
|
|
281
|
+
// Path-independent fixture: a markdown sample that mirrors the trigger
|
|
282
|
+
// pattern (short paragraphs interspersed with large code blocks).
|
|
283
|
+
const sample = [
|
|
284
|
+
"# Heading\n\n",
|
|
285
|
+
"Short intro paragraph.\n\n",
|
|
286
|
+
"```python\n" + "code line\n".repeat(180) + "```\n\n",
|
|
287
|
+
"Short follow-up.\n\n",
|
|
288
|
+
"```python\n" + "more code\n".repeat(180) + "```\n\n",
|
|
289
|
+
"End.",
|
|
290
|
+
].join("");
|
|
291
|
+
const start = Date.now();
|
|
292
|
+
const chunks = chunkText(sample);
|
|
293
|
+
const elapsed = Date.now() - start;
|
|
294
|
+
expect(elapsed).toBeLessThan(1000); // pre-fix this would not return
|
|
295
|
+
expect(chunks.length).toBeGreaterThan(0);
|
|
296
|
+
expect(chunks.length).toBeLessThan(50); // sanity: no runaway
|
|
297
|
+
// Spot-check the canonical invariant
|
|
298
|
+
for (const c of chunks) {
|
|
299
|
+
expect(sample.slice(c.charStart, c.charEnd)).toBe(c.content);
|
|
300
|
+
}
|
|
301
|
+
});
|
|
302
|
+
});
|
package/src/chunker.ts
CHANGED
|
@@ -167,6 +167,10 @@ function buildChunk(
|
|
|
167
167
|
* chunk ended at `prevEnd`. We walk backward through the atom list to find
|
|
168
168
|
* the atom whose start >= prevEnd - chunkOverlap; that atom begins the
|
|
169
169
|
* overlap region.
|
|
170
|
+
*
|
|
171
|
+
* NOTE: this function may return an index <= the previous chunk's first atom
|
|
172
|
+
* when the previous chunk consisted of a single atom shorter than chunkOverlap.
|
|
173
|
+
* The caller in chunkText clamps the result to ensure forward progress.
|
|
170
174
|
*/
|
|
171
175
|
function findOverlapStartIndex(
|
|
172
176
|
atoms: Piece[],
|
|
@@ -272,7 +276,14 @@ export function chunkText(text: string, opts: ChunkerOptions = {}): Chunk[] {
|
|
|
272
276
|
chunk.charEnd,
|
|
273
277
|
chunkOverlap,
|
|
274
278
|
);
|
|
275
|
-
|
|
279
|
+
// Ensure forward progress: when a chunk consists of a single atom shorter
|
|
280
|
+
// than chunkOverlap, the overlap walk would land back on the same atom.
|
|
281
|
+
// Clamp to i + 1 in that case so we always advance, accepting that the
|
|
282
|
+
// resulting chunks will not overlap (this is the only correct option:
|
|
283
|
+
// overlap requires that the next chunk start within the previous chunk's
|
|
284
|
+
// span, but if the previous chunk has only one atom, there is no earlier
|
|
285
|
+
// position within its span to start from).
|
|
286
|
+
i = nextStart > i ? nextStart : i + 1;
|
|
276
287
|
}
|
|
277
288
|
|
|
278
289
|
return chunks;
|