voctar 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/chunking/strategies/fixed.d.ts.map +1 -1
- package/dist/src/chunking/strategies/fixed.js +8 -1
- package/dist/src/chunking/strategies/fixed.js.map +1 -1
- package/dist/src/chunking/strategies/paragraph.js +2 -2
- package/dist/src/chunking/strategies/paragraph.js.map +1 -1
- package/dist/src/chunking/strategies/recursive.d.ts.map +1 -1
- package/dist/src/chunking/strategies/recursive.js +10 -1
- package/dist/src/chunking/strategies/recursive.js.map +1 -1
- package/dist/src/chunking/strategies/sentence.js +2 -2
- package/dist/src/chunking/strategies/sentence.js.map +1 -1
- package/dist/src/chunking/utils/tokenizer.d.ts.map +1 -1
- package/dist/src/chunking/utils/tokenizer.js +2 -1
- package/dist/src/chunking/utils/tokenizer.js.map +1 -1
- package/package.json +1 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fixed.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/fixed.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;
|
|
1
|
+
{"version":3,"file":"fixed.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/fixed.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;IAqE1E;;OAEG;IACH,OAAO,CAAC,qBAAqB;IA0B7B;;OAEG;IACH,OAAO,CAAC,cAAc;CAuBvB"}
|
|
@@ -42,9 +42,16 @@ class FixedSizeChunkingStrategy {
|
|
|
42
42
|
...options.metadata,
|
|
43
43
|
},
|
|
44
44
|
});
|
|
45
|
+
if (endChar >= normalizedText.length) {
|
|
46
|
+
break;
|
|
47
|
+
}
|
|
45
48
|
// Calculate overlap position using token count
|
|
46
49
|
const overlapText = this.getOverlapText(chunkText, overlap);
|
|
47
|
-
|
|
50
|
+
const nextStartChar = endChar - overlapText.length;
|
|
51
|
+
if (nextStartChar <= startChar) {
|
|
52
|
+
break;
|
|
53
|
+
}
|
|
54
|
+
startChar = nextStartChar;
|
|
48
55
|
chunkIndex++;
|
|
49
56
|
// Avoid creating tiny overlapping chunks at the end
|
|
50
57
|
if (normalizedText.length - startChar < overlapText.length) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fixed.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/fixed.ts"],"names":[],"mappings":";;;AAAA,+BAA+B;AAC/B,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,OAAO,IAAI,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,CAAC;QAC5E,MAAM,kBAAkB,GAAG,OAAO,CAAC,kBAAkB,IAAI,KAAK,CAAC;QAE/D,8CAA8C;QAC9C,MAAM,cAAc,GAAG,kBAAkB;YACvC,CAAC,CAAC,IAAI;YACN,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QAErC,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,OAAO,SAAS,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC;YACzC,6BAA6B;YAC7B,MAAM,aAAa,GAAG,cAAc,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;YACtD,MAAM,SAAS,GAAG,IAAI,CAAC,qBAAqB,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;YAErE,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACzC,MAAM;YACR,CAAC;YAED,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU;oBACV,WAAW,EAAE,CAAC,EAAE,+CAA+C;oBAC/D,SAAS;oBACT,OAAO;oBACP,MAAM,EAAE,IAAA,uBAAW,EAAC,SAAS,CAAC;oBAC9B,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;YAEH,+CAA+C;YAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YAC5D,
|
|
1
|
+
{"version":3,"file":"fixed.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/fixed.ts"],"names":[],"mappings":";;;AAAA,+BAA+B;AAC/B,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,OAAO,IAAI,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,CAAC;QAC5E,MAAM,kBAAkB,GAAG,OAAO,CAAC,kBAAkB,IAAI,KAAK,CAAC;QAE/D,8CAA8C;QAC9C,MAAM,cAAc,GAAG,kBAAkB;YACvC,CAAC,CAAC,IAAI;YACN,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QAErC,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,OAAO,SAAS,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC;YACzC,6BAA6B;YAC7B,MAAM,aAAa,GAAG,cAAc,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;YACtD,MAAM,SAAS,GAAG,IAAI,CAAC,qBAAqB,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;YAErE,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACzC,MAAM;YACR,CAAC;YAED,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU;oBACV,WAAW,EAAE,CAAC,EAAE,+CAA+C;oBAC/D,SAAS;oBACT,OAAO;oBACP,MAAM,EAAE,IAAA,uBAAW,EAAC,SAAS,CAAC;oBAC9B,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;YAEH,IAAI,OAAO,IAAI,cAAc,CAAC,MAAM,EAAE,CAAC;gBACrC,MAAM;YACR,CAAC;YAED,+CAA+C;YAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YAC5D,MAAM,aAAa,GAAG,OAAO,GAAG,WAAW,CAAC,MAAM,CAAC;YACnD,IAAI,aAAa,IAAI,SAAS,EAAE,CAAC;gBAC/B,MAAM;YACR,CAAC;YAED,SAAS,GAAG,aAAa,CAAC;YAC1B,UAAU,EAAE,CAAC;YAEb,oDAAoD;YACpD,IAAI,cAAc,CAAC,MAAM,GAAG,SAAS,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC;gBAC3D,MAAM;YACR,CAAC;QACH,CAAC;QAED,oCAAoC;QACpC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YACrB,KAAK,CAAC,QAAQ,CAAC,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,qBAAqB,CAAC,IAAY,EAAE,SAAiB;QAC3D,IAAI,IAAA,uBAAW,EAAC,IAAI,CAAC,IAAI,SAAS,EAAE,CAAC;YACnC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,iDAAiD;QACjD,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YACrC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,SAAS,EAAE,CAAC;gBACxB,SAAS,GAAG,SAAS,CAAC;gBACtB,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,kBAAkB;IACtF,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,IAAY,EAAE,aAAqB;QACxD,IAAI,aAAa,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEnC,6CAA6C;QAC7C,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,aAAa,EAAE,CAAC;gBAC5B,SAAS,GAAG,SAAS,CAAC;gBACtB,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,uBAAuB;IACzF,CAAC;CACF;AAjID,8DAiIC"}
|
|
@@ -43,8 +43,8 @@ class ParagraphChunkingStrategy {
|
|
|
43
43
|
...options.metadata,
|
|
44
44
|
},
|
|
45
45
|
});
|
|
46
|
-
// Keep last N paragraphs for overlap
|
|
47
|
-
const overlapParagraphs = currentChunk.slice(-overlap);
|
|
46
|
+
// Keep last N paragraphs for overlap. slice(-0) equals slice(0), so handle zero explicitly.
|
|
47
|
+
const overlapParagraphs = overlap > 0 ? currentChunk.slice(-overlap) : [];
|
|
48
48
|
currentChunk = [...overlapParagraphs, paragraph];
|
|
49
49
|
currentTokens = (0, tokenizer_1.countTokens)(overlapParagraphs.join('\n\n')) + paragraphTokens;
|
|
50
50
|
startChar = endChar - (overlapParagraphs.join('\n\n').length);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"paragraph.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/paragraph.ts"],"names":[],"mappings":";;;AAAA,oCAAoC;AACpC,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,WAAW,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,kCAAkC;QAExE,6BAA6B;QAC7B,MAAM,UAAU,GAAG,IAAI;aACpB,KAAK,CAAC,SAAS,CAAC;aAChB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAE7B,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAa,EAAE,CAAC;QAChC,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,eAAe,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAE/C,gGAAgG;YAChG,IAAI,aAAa,GAAG,eAAe,GAAG,OAAO,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzE,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;gBACnD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;gBAE7C,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,IAAA,SAAM,GAAE;oBACZ,IAAI,EAAE,SAAS;oBACf,QAAQ,EAAE;wBACR,UAAU;wBACV,UAAU;wBACV,WAAW,EAAE,CAAC,EAAE,wBAAwB;wBACxC,SAAS;wBACT,OAAO;wBACP,UAAU,EAAE,YAAY,CAAC,MAAM;wBAC/B,GAAG,OAAO,CAAC,QAAQ;qBACpB;iBACF,CAAC,CAAC;gBAEH,
|
|
1
|
+
{"version":3,"file":"paragraph.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/paragraph.ts"],"names":[],"mappings":";;;AAAA,oCAAoC;AACpC,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,WAAW,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,kCAAkC;QAExE,6BAA6B;QAC7B,MAAM,UAAU,GAAG,IAAI;aACpB,KAAK,CAAC,SAAS,CAAC;aAChB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAE7B,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAa,EAAE,CAAC;QAChC,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,eAAe,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAE/C,gGAAgG;YAChG,IAAI,aAAa,GAAG,eAAe,GAAG,OAAO,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzE,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;gBACnD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;gBAE7C,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,IAAA,SAAM,GAAE;oBACZ,IAAI,EAAE,SAAS;oBACf,QAAQ,EAAE;wBACR,UAAU;wBACV,UAAU;wBACV,WAAW,EAAE,CAAC,EAAE,wBAAwB;wBACxC,SAAS;wBACT,OAAO;wBACP,UAAU,EAAE,YAAY,CAAC,MAAM;wBAC/B,GAAG,OAAO,CAAC,QAAQ;qBACpB;iBACF,CAAC,CAAC;gBAEH,4FAA4F;gBAC5F,MAAM,iBAAiB,GAAG,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;gBAC1E,YAAY,GAAG,CAAC,GAAG,iBAAiB,EAAE,SAAS,CAAC,CAAC;gBACjD,aAAa,GAAG,IAAA,uBAAW,EAAC,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,eAAe,CAAC;gBAC9E,SAAS,GAAG,OAAO,GAAG,CAAC,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC;gBAC9D,UAAU,EAAE,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBAC7B,aAAa,IAAI,eAAe,CAAC;YACnC,CAAC;QACH,CAAC;QAED,uCAAuC;QACvC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;YACnD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU;oBACV,WAAW,EAAE,CAAC;oBACd,SAAS;oBACT,OAAO;oBACP,UAAU,EAAE,YAAY,CAAC,MAAM;oBAC/B,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;QACL,CAAC;QAED,qBAAqB;QACrB,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YACrB,KAAK,CAAC,QAAQ,CAAC,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AArFD,8DAqFC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"recursive.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/recursive.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;IAoD1E,OAAO,CAAC,cAAc;IA8CtB,OAAO,CAAC,WAAW;IAsCnB;;OAEG;IACH,OAAO,CAAC,cAAc;
|
|
1
|
+
{"version":3,"file":"recursive.d.ts","sourceRoot":"","sources":["../../../../src/chunking/strategies/recursive.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAGzE,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,IAAI,MAAM;IAIjB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,KAAK,EAAE;IAoD1E,OAAO,CAAC,cAAc;IA8CtB,OAAO,CAAC,WAAW;IAsCnB;;OAEG;IACH,OAAO,CAAC,cAAc;IAwBtB,OAAO,CAAC,iBAAiB;IAkCzB;;OAEG;IACH,OAAO,CAAC,qBAAqB;CAyB9B"}
|
|
@@ -124,6 +124,8 @@ class RecursiveChunkingStrategy {
|
|
|
124
124
|
* Get overlap text that is approximately 'overlapTokens' tokens
|
|
125
125
|
*/
|
|
126
126
|
getOverlapText(text, overlapTokens) {
|
|
127
|
+
if (overlapTokens === 0)
|
|
128
|
+
return '';
|
|
127
129
|
// Binary search for the right amount of text
|
|
128
130
|
let start = 0;
|
|
129
131
|
let end = text.length;
|
|
@@ -153,9 +155,16 @@ class RecursiveChunkingStrategy {
|
|
|
153
155
|
break;
|
|
154
156
|
chunks.push(chunkText);
|
|
155
157
|
const chunkLength = chunkText.length;
|
|
158
|
+
if (start + chunkLength >= text.length) {
|
|
159
|
+
break;
|
|
160
|
+
}
|
|
156
161
|
// Calculate overlap start position
|
|
157
162
|
const overlapText = this.getOverlapText(chunkText, overlap);
|
|
158
|
-
start
|
|
163
|
+
const nextStart = start + chunkLength - overlapText.length;
|
|
164
|
+
if (nextStart <= start) {
|
|
165
|
+
break;
|
|
166
|
+
}
|
|
167
|
+
start = nextStart;
|
|
159
168
|
if (text.length - start < overlapText.length) {
|
|
160
169
|
break;
|
|
161
170
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"recursive.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/recursive.ts"],"names":[],"mappings":";;;AAAA,2EAA2E;AAC3E,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,WAAW,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,OAAO,IAAI,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,0CAA0C;QAEvH,iFAAiF;QACjF,MAAM,iBAAiB,GAAG;YACxB,MAAM,EAAG,YAAY;YACrB,IAAI,EAAK,aAAa;YACtB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,SAAS;YAClB,IAAI,EAAK,SAAS;YAClB,GAAG,EAAM,OAAO;YAChB,EAAE,EAAO,YAAY;SACtB,CAAC;QAEF,MAAM,UAAU,GAAG,OAAO,CAAC,SAAS;YAClC,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;YAC9E,CAAC,CAAC,iBAAiB,CAAC;QAEtB,MAAM,MAAM,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;QAEvE,yCAAyC;QACzC,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,MAAM,MAAM,GAAY,EAAE,CAAC;QAE3B,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,EAAE,KAAK,EAAE,EAAE;YAClC,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU,EAAE,KAAK;oBACjB,WAAW,EAAE,MAAM,CAAC,MAAM;oBAC1B,SAAS;oBACT,OAAO;oBACP,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;YAEH,wCAAwC;YACxC,SAAS,GAAG,OAAO,GAAG,OAAO,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,cAAc,CACpB,IAAY,EACZ,OAAe,EAAE,2BAA2B;IAC5C,OAAe,EAAE,2BAA2B;IAC5C,UAAoB;QAEpB,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,4DAA4D;QAC5D,MAAM,UAAU,GAAG,IAAA,uBAAW,EAAC,IAAI,CAAC,CAAC;QACrC,IAAI,UAAU,IAAI,OAAO,EAAE,CAAC;YAC1B,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC1C,CAAC;QAED,8BAA8B;QAC9B,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;YACnC,IAAI,SAAS,KAAK,EAAE,EAAE,CAAC;gBACrB,uCAAuC;gBACvC,OAAO,IAAI,CAAC,iBAAiB,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;YACxD,CAAC;YAED,IAAI,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC7B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;gBACrC,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;gBAErE,yEAAyE;gBACzE,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;oBAC3B,MAAM,WAAW,GAAG,IAAA,uBAAW,EAAC,KAAK,CAAC,CAAC;oBACvC,IAAI,WAAW,GAAG,OAAO,EAAE,CAAC;wBAC1B,kCAAkC;wBAClC,MAAM,kBAAkB,GAAG,UAAU,CAAC,OAAO,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;wBAC7D,MAAM,mBAAmB,GAAG,UAAU,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;wBACjE,WAAW,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,mBAAmB,CAAC,CAAC,CAAC;oBACzF,CAAC;yBAAM,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;wBACxB,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC;oBACjC,CAAC;gBACH,CAAC;gBAED,OAAO,WAAW,CAAC;YACrB,CAAC;QACH,CAAC;QAED,8BAA8B;QAC9B,OAAO,IAAI,CAAC,iBAAiB,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;IACxD,CAAC;IAEO,WAAW,CACjB,MAAgB,EAChB,SAAiB,EACjB,OAAe,EAAE,2BAA2B;IAC5C,OAAe,CAAC,2BAA2B;;QAE3C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,YAAY,GAAG,EAAE,CAAC;QAEtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;YACxB,MAAM,KAAK,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC;YAChE,MAAM,QAAQ,GAAG,YAAY,GAAG,KAAK,CAAC;YACtC,MAAM,cAAc,GAAG,IAAA,uBAAW,EAAC,QAAQ,CAAC,CAAC;YAE7C,IAAI,cAAc,IAAI,OAAO,EAAE,CAAC;gBAC9B,YAAY,GAAG,QAAQ,CAAC;YAC1B,CAAC;iBAAM,CAAC;gBACN,IAAI,YAAY,EAAE,CAAC;oBACjB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;oBAC1B,2CAA2C;oBAC3C,8DAA8D;oBAC9D,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;oBAC/D,YAAY,GAAG,WAAW,GAAG,KAAK,CAAC;gBACrC,CAAC;qBAAM,CAAC;oBACN,iFAAiF;oBACjF,YAAY,GAAG,KAAK,CAAC;gBACvB,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,YAAY,EAAE,CAAC;YACjB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,IAAY,EAAE,aAAqB;QACxD,6CAA6C;QAC7C,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,aAAa,EAAE,CAAC;gBAC5B,SAAS,GAAG,SAAS,CAAC;gBACtB,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,uBAAuB;IACzF,CAAC;IAEO,iBAAiB,CAAC,IAAY,EAAE,OAAe,EAAE,OAAe;QACtE,wEAAwE;QACxE,wEAAwE;QACxE,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,IAAI,CAAC,qBAAqB,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,CAAC;YACzE,IAAI,CAAC,SAAS;gBAAE,MAAM;YAEtB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACvB,MAAM,WAAW,GAAG,SAAS,CAAC,MAAM,CAAC;YAErC,mCAAmC;YACnC,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YAC5D,KAAK,
|
|
1
|
+
{"version":3,"file":"recursive.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/recursive.ts"],"names":[],"mappings":";;;AAAA,2EAA2E;AAC3E,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,yBAAyB;IACpC,OAAO;QACL,OAAO,WAAW,CAAC;IACrB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,OAAO,IAAI,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,0CAA0C;QAEvH,iFAAiF;QACjF,MAAM,iBAAiB,GAAG;YACxB,MAAM,EAAG,YAAY;YACrB,IAAI,EAAK,aAAa;YACtB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,WAAW;YACpB,IAAI,EAAK,SAAS;YAClB,IAAI,EAAK,SAAS;YAClB,GAAG,EAAM,OAAO;YAChB,EAAE,EAAO,YAAY;SACtB,CAAC;QAEF,MAAM,UAAU,GAAG,OAAO,CAAC,SAAS;YAClC,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;YAC9E,CAAC,CAAC,iBAAiB,CAAC;QAEtB,MAAM,MAAM,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;QAEvE,yCAAyC;QACzC,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,MAAM,MAAM,GAAY,EAAE,CAAC;QAE3B,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,EAAE,KAAK,EAAE,EAAE;YAClC,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU,EAAE,KAAK;oBACjB,WAAW,EAAE,MAAM,CAAC,MAAM;oBAC1B,SAAS;oBACT,OAAO;oBACP,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;YAEH,wCAAwC;YACxC,SAAS,GAAG,OAAO,GAAG,OAAO,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,cAAc,CACpB,IAAY,EACZ,OAAe,EAAE,2BAA2B;IAC5C,OAAe,EAAE,2BAA2B;IAC5C,UAAoB;QAEpB,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,4DAA4D;QAC5D,MAAM,UAAU,GAAG,IAAA,uBAAW,EAAC,IAAI,CAAC,CAAC;QACrC,IAAI,UAAU,IAAI,OAAO,EAAE,CAAC;YAC1B,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC1C,CAAC;QAED,8BAA8B;QAC9B,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;YACnC,IAAI,SAAS,KAAK,EAAE,EAAE,CAAC;gBACrB,uCAAuC;gBACvC,OAAO,IAAI,CAAC,iBAAiB,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;YACxD,CAAC;YAED,IAAI,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC7B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;gBACrC,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;gBAErE,yEAAyE;gBACzE,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;oBAC3B,MAAM,WAAW,GAAG,IAAA,uBAAW,EAAC,KAAK,CAAC,CAAC;oBACvC,IAAI,WAAW,GAAG,OAAO,EAAE,CAAC;wBAC1B,kCAAkC;wBAClC,MAAM,kBAAkB,GAAG,UAAU,CAAC,OAAO,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;wBAC7D,MAAM,mBAAmB,GAAG,UAAU,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;wBACjE,WAAW,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,mBAAmB,CAAC,CAAC,CAAC;oBACzF,CAAC;yBAAM,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;wBACxB,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC;oBACjC,CAAC;gBACH,CAAC;gBAED,OAAO,WAAW,CAAC;YACrB,CAAC;QACH,CAAC;QAED,8BAA8B;QAC9B,OAAO,IAAI,CAAC,iBAAiB,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;IACxD,CAAC;IAEO,WAAW,CACjB,MAAgB,EAChB,SAAiB,EACjB,OAAe,EAAE,2BAA2B;IAC5C,OAAe,CAAC,2BAA2B;;QAE3C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,YAAY,GAAG,EAAE,CAAC;QAEtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;YACxB,MAAM,KAAK,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC;YAChE,MAAM,QAAQ,GAAG,YAAY,GAAG,KAAK,CAAC;YACtC,MAAM,cAAc,GAAG,IAAA,uBAAW,EAAC,QAAQ,CAAC,CAAC;YAE7C,IAAI,cAAc,IAAI,OAAO,EAAE,CAAC;gBAC9B,YAAY,GAAG,QAAQ,CAAC;YAC1B,CAAC;iBAAM,CAAC;gBACN,IAAI,YAAY,EAAE,CAAC;oBACjB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;oBAC1B,2CAA2C;oBAC3C,8DAA8D;oBAC9D,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;oBAC/D,YAAY,GAAG,WAAW,GAAG,KAAK,CAAC;gBACrC,CAAC;qBAAM,CAAC;oBACN,iFAAiF;oBACjF,YAAY,GAAG,KAAK,CAAC;gBACvB,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,YAAY,EAAE,CAAC;YACjB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,IAAY,EAAE,aAAqB;QACxD,IAAI,aAAa,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEnC,6CAA6C;QAC7C,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,aAAa,EAAE,CAAC;gBAC5B,SAAS,GAAG,SAAS,CAAC;gBACtB,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,uBAAuB;IACzF,CAAC;IAEO,iBAAiB,CAAC,IAAY,EAAE,OAAe,EAAE,OAAe;QACtE,wEAAwE;QACxE,wEAAwE;QACxE,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,IAAI,CAAC,qBAAqB,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,CAAC;YACzE,IAAI,CAAC,SAAS;gBAAE,MAAM;YAEtB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACvB,MAAM,WAAW,GAAG,SAAS,CAAC,MAAM,CAAC;YAErC,IAAI,KAAK,GAAG,WAAW,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;gBACvC,MAAM;YACR,CAAC;YAED,mCAAmC;YACnC,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YAC5D,MAAM,SAAS,GAAG,KAAK,GAAG,WAAW,GAAG,WAAW,CAAC,MAAM,CAAC;YAC3D,IAAI,SAAS,IAAI,KAAK,EAAE,CAAC;gBACvB,MAAM;YACR,CAAC;YAED,KAAK,GAAG,SAAS,CAAC;YAElB,IAAI,IAAI,CAAC,MAAM,GAAG,KAAK,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC;gBAC7C,MAAM;YACR,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,qBAAqB,CAAC,IAAY,EAAE,SAAiB;QAC3D,IAAI,IAAA,uBAAW,EAAC,IAAI,CAAC,IAAI,SAAS,EAAE,CAAC;YACnC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,iDAAiD;QACjD,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,SAAS,GAAG,EAAE,CAAC;QAEnB,OAAO,KAAK,GAAG,GAAG,EAAE,CAAC;YACnB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YACrC,MAAM,MAAM,GAAG,IAAA,uBAAW,EAAC,SAAS,CAAC,CAAC;YAEtC,IAAI,MAAM,IAAI,SAAS,EAAE,CAAC;gBACxB,SAAS,GAAG,SAAS,CAAC;gBACtB,KAAK,GAAG,GAAG,GAAG,CAAC,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,GAAG,GAAG,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;QAED,OAAO,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,kBAAkB;IACtF,CAAC;CACF;AAtOD,8DAsOC"}
|
|
@@ -40,8 +40,8 @@ class SentenceChunkingStrategy {
|
|
|
40
40
|
...options.metadata,
|
|
41
41
|
},
|
|
42
42
|
});
|
|
43
|
-
// Keep last N sentences for overlap
|
|
44
|
-
const overlapSentences = currentChunk.slice(-overlap);
|
|
43
|
+
// Keep last N sentences for overlap. slice(-0) equals slice(0), so handle zero explicitly.
|
|
44
|
+
const overlapSentences = overlap > 0 ? currentChunk.slice(-overlap) : [];
|
|
45
45
|
currentChunk = [...overlapSentences, sentence];
|
|
46
46
|
currentTokens = (0, tokenizer_1.countTokens)(overlapSentences.join(' ')) + sentenceTokens;
|
|
47
47
|
startChar = endChar - (overlapSentences.join(' ').length);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sentence.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/sentence.ts"],"names":[],"mappings":";;;AAAA,mCAAmC;AACnC,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,wBAAwB;IACnC,OAAO;QACL,OAAO,UAAU,CAAC;IACpB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,iCAAiC;QAEvE,4BAA4B;QAC5B,MAAM,SAAS,GAAG,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,CAAC;QAEhD,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAa,EAAE,CAAC;QAChC,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;YAC9B,MAAM,cAAc,GAAG,IAAA,uBAAW,EAAC,QAAQ,CAAC,CAAC;YAE7C,+FAA+F;YAC/F,IAAI,aAAa,GAAG,cAAc,GAAG,OAAO,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxE,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;gBAChD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;gBAE7C,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,IAAA,SAAM,GAAE;oBACZ,IAAI,EAAE,SAAS;oBACf,QAAQ,EAAE;wBACR,UAAU;wBACV,UAAU;wBACV,WAAW,EAAE,CAAC,EAAE,wBAAwB;wBACxC,SAAS;wBACT,OAAO;wBACP,SAAS,EAAE,YAAY,CAAC,MAAM;wBAC9B,GAAG,OAAO,CAAC,QAAQ;qBACpB;iBACF,CAAC,CAAC;gBAEH,
|
|
1
|
+
{"version":3,"file":"sentence.js","sourceRoot":"","sources":["../../../../src/chunking/strategies/sentence.ts"],"names":[],"mappings":";;;AAAA,mCAAmC;AACnC,+BAAoC;AAEpC,kDAAiD;AAEjD,MAAa,wBAAwB;IACnC,OAAO;QACL,OAAO,UAAU,CAAC;IACpB,CAAC;IAED,KAAK,CAAC,IAAY,EAAE,UAAkB,EAAE,OAAwB;QAC9D,uDAAuD;QACvD,MAAM,UAAU,GAAI,OAAe,CAAC,UAAU,IAAI,IAAI,CAAC;QACvD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,IAAI,IAAI,EAAE,UAAU,CAAC,CAAC;QACnE,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,iCAAiC;QAEvE,4BAA4B;QAC5B,MAAM,SAAS,GAAG,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,CAAC;QAEhD,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAa,EAAE,CAAC;QAChC,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;YAC9B,MAAM,cAAc,GAAG,IAAA,uBAAW,EAAC,QAAQ,CAAC,CAAC;YAE7C,+FAA+F;YAC/F,IAAI,aAAa,GAAG,cAAc,GAAG,OAAO,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxE,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;gBAChD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;gBAE7C,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,IAAA,SAAM,GAAE;oBACZ,IAAI,EAAE,SAAS;oBACf,QAAQ,EAAE;wBACR,UAAU;wBACV,UAAU;wBACV,WAAW,EAAE,CAAC,EAAE,wBAAwB;wBACxC,SAAS;wBACT,OAAO;wBACP,SAAS,EAAE,YAAY,CAAC,MAAM;wBAC9B,GAAG,OAAO,CAAC,QAAQ;qBACpB;iBACF,CAAC,CAAC;gBAEH,2FAA2F;gBAC3F,MAAM,gBAAgB,GAAG,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;gBACzE,YAAY,GAAG,CAAC,GAAG,gBAAgB,EAAE,QAAQ,CAAC,CAAC;gBAC/C,aAAa,GAAG,IAAA,uBAAW,EAAC,gBAAgB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,cAAc,CAAC;gBACzE,SAAS,GAAG,OAAO,GAAG,CAAC,gBAAgB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC;gBAC1D,UAAU,EAAE,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;gBAC5B,aAAa,IAAI,cAAc,CAAC;YAClC,CAAC;QACH,CAAC;QAED,uCAAuC;QACvC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YAChD,MAAM,OAAO,GAAG,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YAE7C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAA,SAAM,GAAE;gBACZ,IAAI,EAAE,SAAS;gBACf,QAAQ,EAAE;oBACR,UAAU;oBACV,UAAU;oBACV,WAAW,EAAE,CAAC;oBACd,SAAS;oBACT,OAAO;oBACP,SAAS,EAAE,YAAY,CAAC,MAAM;oBAC9B,GAAG,OAAO,CAAC,QAAQ;iBACpB;aACF,CAAC,CAAC;QACL,CAAC;QAED,qBAAqB;QACrB,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YACrB,KAAK,CAAC,QAAQ,CAAC,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,kBAAkB,CAAC,IAAY;QACrC,gEAAgE;QAChE,+BAA+B;QAC/B,MAAM,SAAS,GAAa,EAAE,CAAC;QAE/B,qDAAqD;QACrD,IAAI,UAAU,GAAG,IAAI;aAClB,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC;aACtB,OAAO,CAAC,QAAQ,EAAE,KAAK,CAAC;aACxB,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC;aACtB,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC;aACtB,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC;aACtB,OAAO,CAAC,QAAQ,EAAE,KAAK,CAAC;aACxB,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC;aACxB,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;QAE5B,+BAA+B;QAC/B,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;QAElD,IAAI,eAAe,GAAG,EAAE,CAAC;QAEzB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC/B,eAAe,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;gBAC/B,IAAI,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC;oBAC3B,SAAS,CAAC,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,CAAC,CAAC;gBACzC,CAAC;gBACD,eAAe,GAAG,EAAE,CAAC;YACvB,CAAC;iBAAM,CAAC;gBACN,eAAe,IAAI,IAAI,CAAC;YAC1B,CAAC;QACH,CAAC;QAED,4BAA4B;QAC5B,IAAI,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC;YAC3B,SAAS,CAAC,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,CAAC,CAAC;QACzC,CAAC;QAED,OAAO,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC7C,CAAC;CACF;AA3HD,4DA2HC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../../../src/chunking/utils/tokenizer.ts"],"names":[],"mappings":"AAoBA;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,
|
|
1
|
+
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../../../src/chunking/utils/tokenizer.ts"],"names":[],"mappings":"AAoBA;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAchD;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAMnD"}
|
|
@@ -15,7 +15,7 @@ function getEmbeddingEncoding() {
|
|
|
15
15
|
if (!cachedEncoding) {
|
|
16
16
|
// Use cl100k_base encoding which is used by text-embedding-3 models
|
|
17
17
|
// This is compatible with GPT-4 and text-embedding-3 models
|
|
18
|
-
cachedEncoding = (0, tiktoken_1.encoding_for_model)('
|
|
18
|
+
cachedEncoding = (0, tiktoken_1.encoding_for_model)('text-embedding-3-small');
|
|
19
19
|
}
|
|
20
20
|
return cachedEncoding;
|
|
21
21
|
}
|
|
@@ -31,6 +31,7 @@ function countTokens(text) {
|
|
|
31
31
|
return encoding.encode(text).length;
|
|
32
32
|
}
|
|
33
33
|
catch (error) {
|
|
34
|
+
console.error("using fallback tokenizer, error:", error);
|
|
34
35
|
// Fallback to approximation if tiktoken fails
|
|
35
36
|
// Rough approximation: 1 token ≈ 4 characters for English text
|
|
36
37
|
return Math.ceil(text.length / 4);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../../../src/chunking/utils/tokenizer.ts"],"names":[],"mappings":";;AAuBA,
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../../../src/chunking/utils/tokenizer.ts"],"names":[],"mappings":";;AAuBA,kCAcC;AAMD,wCAMC;AAjDD,iDAAiD;AACjD,4CAA4C;AAC5C,uCAA8C;AAE9C,wCAAwC;AACxC,IAAI,cAAc,GAAiD,IAAI,CAAC;AAExE;;;GAGG;AACH,SAAS,oBAAoB;IAC3B,IAAI,CAAC,cAAc,EAAE,CAAC;QACpB,oEAAoE;QACpE,4DAA4D;QAC5D,cAAc,GAAG,IAAA,6BAAkB,EAAC,wBAAwB,CAAC,CAAC;IAChE,CAAC;IACD,OAAO,cAAc,CAAC;AACxB,CAAC;AAED;;GAEG;AACH,SAAgB,WAAW,CAAC,IAAY;IACtC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,CAAC,CAAC;IACX,CAAC;IAED,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,oBAAoB,EAAE,CAAC;QACxC,OAAO,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;IACtC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,kCAAkC,EAAE,KAAK,CAAC,CAAC;QACzD,8CAA8C;QAC9C,+DAA+D;QAC/D,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACpC,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,SAAgB,cAAc,CAAC,IAAY;IACzC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,CAAC,CAAC;IACX,CAAC;IACD,+DAA+D;IAC/D,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACpC,CAAC"}
|