@keel_flow/kb-pipeline 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/LICENSE +21 -0
  2. package/dist/chunking/ast.d.ts +8 -0
  3. package/dist/chunking/ast.d.ts.map +1 -0
  4. package/dist/chunking/ast.js +86 -0
  5. package/dist/chunking/ast.js.map +1 -0
  6. package/dist/chunking/contextual-fallback.d.ts +15 -0
  7. package/dist/chunking/contextual-fallback.d.ts.map +1 -0
  8. package/dist/chunking/contextual-fallback.js +33 -0
  9. package/dist/chunking/contextual-fallback.js.map +1 -0
  10. package/dist/chunking/fixed.d.ts +6 -0
  11. package/dist/chunking/fixed.d.ts.map +1 -0
  12. package/dist/chunking/fixed.js +24 -0
  13. package/dist/chunking/fixed.js.map +1 -0
  14. package/dist/chunking/index.d.ts +24 -0
  15. package/dist/chunking/index.d.ts.map +1 -0
  16. package/dist/chunking/index.js +86 -0
  17. package/dist/chunking/index.js.map +1 -0
  18. package/dist/chunking/late.d.ts +11 -0
  19. package/dist/chunking/late.d.ts.map +1 -0
  20. package/dist/chunking/late.js +27 -0
  21. package/dist/chunking/late.js.map +1 -0
  22. package/dist/chunking/recursive.d.ts +7 -0
  23. package/dist/chunking/recursive.d.ts.map +1 -0
  24. package/dist/chunking/recursive.js +87 -0
  25. package/dist/chunking/recursive.js.map +1 -0
  26. package/dist/embedding/index.d.ts +14 -0
  27. package/dist/embedding/index.d.ts.map +1 -0
  28. package/dist/embedding/index.js +33 -0
  29. package/dist/embedding/index.js.map +1 -0
  30. package/dist/embedding/local-minilm.d.ts +8 -0
  31. package/dist/embedding/local-minilm.d.ts.map +1 -0
  32. package/dist/embedding/local-minilm.js +51 -0
  33. package/dist/embedding/local-minilm.js.map +1 -0
  34. package/dist/embedding/openai-text-3-small.d.ts +9 -0
  35. package/dist/embedding/openai-text-3-small.d.ts.map +1 -0
  36. package/dist/embedding/openai-text-3-small.js +51 -0
  37. package/dist/embedding/openai-text-3-small.js.map +1 -0
  38. package/dist/embedding/types.d.ts +8 -0
  39. package/dist/embedding/types.d.ts.map +1 -0
  40. package/dist/embedding/types.js +2 -0
  41. package/dist/embedding/types.js.map +1 -0
  42. package/dist/embedding/voyage-context-3.d.ts +9 -0
  43. package/dist/embedding/voyage-context-3.d.ts.map +1 -0
  44. package/dist/embedding/voyage-context-3.js +55 -0
  45. package/dist/embedding/voyage-context-3.js.map +1 -0
  46. package/dist/index.d.ts +9 -0
  47. package/dist/index.d.ts.map +1 -0
  48. package/dist/index.js +5 -0
  49. package/dist/index.js.map +1 -0
  50. package/dist/pipeline.d.ts +51 -0
  51. package/dist/pipeline.d.ts.map +1 -0
  52. package/dist/pipeline.js +90 -0
  53. package/dist/pipeline.js.map +1 -0
  54. package/dist/retrieval/bm25.d.ts +14 -0
  55. package/dist/retrieval/bm25.d.ts.map +1 -0
  56. package/dist/retrieval/bm25.js +60 -0
  57. package/dist/retrieval/bm25.js.map +1 -0
  58. package/dist/retrieval/dense.d.ts +11 -0
  59. package/dist/retrieval/dense.d.ts.map +1 -0
  60. package/dist/retrieval/dense.js +27 -0
  61. package/dist/retrieval/dense.js.map +1 -0
  62. package/dist/retrieval/index.d.ts +71 -0
  63. package/dist/retrieval/index.d.ts.map +1 -0
  64. package/dist/retrieval/index.js +90 -0
  65. package/dist/retrieval/index.js.map +1 -0
  66. package/dist/retrieval/rerank.d.ts +34 -0
  67. package/dist/retrieval/rerank.d.ts.map +1 -0
  68. package/dist/retrieval/rerank.js +101 -0
  69. package/dist/retrieval/rerank.js.map +1 -0
  70. package/dist/retrieval/rrf.d.ts +17 -0
  71. package/dist/retrieval/rrf.d.ts.map +1 -0
  72. package/dist/retrieval/rrf.js +23 -0
  73. package/dist/retrieval/rrf.js.map +1 -0
  74. package/package.json +47 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 jglasskatz
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,8 @@
1
+ export interface AstChunkOpts {
2
+ language?: "typescript" | "javascript" | "python" | "auto";
3
+ maxSize?: number;
4
+ preserveImports?: boolean;
5
+ }
6
+ export declare function astImportPrefixLength(text: string, language?: "typescript" | "javascript" | "python" | "auto"): number;
7
+ export declare function astChunk(text: string, opts?: AstChunkOpts): string[];
8
+ //# sourceMappingURL=ast.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ast.d.ts","sourceRoot":"","sources":["../../src/chunking/ast.ts"],"names":[],"mappings":"AAEA,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,EAAE,YAAY,GAAG,YAAY,GAAG,QAAQ,GAAG,MAAM,CAAC;IAC3D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,eAAe,CAAC,EAAE,OAAO,CAAC;CAC3B;AAOD,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,YAAY,GAAG,YAAY,GAAG,QAAQ,GAAG,MAAM,GAAG,MAAM,CAGtH;AAED,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,YAAiB,GAAG,MAAM,EAAE,CAoCxE"}
@@ -0,0 +1,86 @@
1
+ import { recursiveChunk } from "./recursive.js";
2
+ // Regex-based heuristic boundary chunker for TypeScript/JavaScript and Python.
3
+ // tree-sitter would give exact AST boundaries, but its native bindings often
4
+ // fail to install in restricted environments; we deliberately fall back to a
5
+ // regex pass that handles `function`/`class`/`export`/`def` boundaries with
6
+ // import preservation. Behaviour matches the documented fallback in the spec.
7
/**
 * Returns the character length of the leading import block of `text`, as
 * computed by `extractImports`.
 *
 * @param text - Source text to inspect.
 * @param language - "typescript", "javascript" or "python"; when omitted or
 *   "auto" the language is guessed with `detectLanguage`.
 * @returns Length of the extracted import prefix (0 when there is none).
 */
export function astImportPrefixLength(text, language) {
    const needsDetection = language === undefined || language === "auto";
    const resolved = needsDetection ? detectLanguage(text) : language;
    const importBlock = extractImports(text, resolved);
    return importBlock.length;
}
11
/**
 * Splits source code at top-level declaration boundaries found by
 * `findBoundaries`.
 *
 * Each segment is emitted with the file's import block prepended (unless
 * `preserveImports` is false). A segment that is still longer than `maxSize`
 * after the prefix is added is handed to `recursiveChunk`.
 *
 * @param text - Source text to chunk; an empty string yields `[]`.
 * @param opts - `language` (default: auto-detected), `maxSize` (default 1200)
 *   and `preserveImports` (default true).
 * @returns The chunk strings in document order.
 */
export function astChunk(text, opts = {}) {
    const requested = opts.language;
    const lang = requested === undefined || requested === "auto"
        ? detectLanguage(text)
        : requested;
    const maxSize = opts.maxSize ?? 1200;
    const preserveImports = opts.preserveImports ?? true;
    if (text.length === 0) {
        return [];
    }
    let imports = "";
    let body = text;
    if (preserveImports) {
        imports = extractImports(text, lang);
        body = text.slice(imports.length);
    }
    // A piece that fits is kept whole; anything larger goes through the
    // generic recursive splitter.
    const fitOrSplit = (piece) => piece.length <= maxSize ? [piece] : recursiveChunk(piece, { maxSize });
    const boundaries = findBoundaries(body, lang);
    if (boundaries.length <= 1) {
        return fitOrSplit(imports + body);
    }
    return boundaries.flatMap((start, idx) => {
        const end = boundaries[idx + 1] ?? body.length;
        return fitOrSplit(imports + body.slice(start, end));
    });
}
47
/**
 * Guesses the language of a source snippet with two regex probes.
 *
 * Python is checked first (`def name(` anywhere, or a line starting with
 * `class Name(`); TypeScript is assumed when an `interface Name` or a
 * `: type` annotation followed by `=`, `,` or `)` is seen; everything else
 * is reported as JavaScript.
 */
function detectLanguage(text) {
    const pythonHint = /\bdef\s+\w+\s*\(|^\s*class\s+\w+\s*\(/m;
    const typescriptHint = /\binterface\s+\w+|:\s*\w+\s*[=,)]/;
    if (pythonHint.test(text)) {
        return "python";
    }
    return typescriptHint.test(text) ? "typescript" : "javascript";
}
54
/**
 * Returns the leading import block of `text`: the run of import statements,
 * blank lines and `//` / `#` comment lines at the top of the file.
 *
 * The result is always an exact prefix of `text`, so callers can safely use
 * its length as a slice offset:
 *  - when scanning stops before the end, the prefix ends with the newline
 *    that terminates the last consumed line;
 *  - when every line is consumed, `text` itself is returned (no newline is
 *    invented).
 *
 * Multi-line imports are kept together: after `import {` (JS/TS) or
 * `from x import (` (Python) the following lines are consumed until the
 * bracket is closed, so a statement is never cut in half.
 *
 * @param text - Full source text.
 * @param lang - "python" selects Python import syntax; anything else uses
 *   the JavaScript/TypeScript pattern.
 * @returns The import prefix, or "" when the first line is already code.
 */
function extractImports(text, lang) {
    const lines = text.split("\n");
    const isPython = lang === "python";
    const importRe = isPython
        ? /^\s*(import\s+|from\s+\S+\s+import\s+)/
        : /^\s*(import\s+|export\s+\*\s+from|export\s+\{[^}]*\}\s+from)/;
    // Bracket pair that lets an import statement span several lines.
    const opener = isPython ? "(" : "{";
    const closer = isPython ? ")" : "}";
    const occurrences = (line, ch) => line.split(ch).length - 1;
    let consumed = 0; // number of leading lines that belong to the prefix
    let depth = 0; // > 0 while inside an unclosed multi-line import
    for (; consumed < lines.length; consumed++) {
        const line = lines[consumed] ?? "";
        if (depth > 0) {
            depth = Math.max(0, depth + occurrences(line, opener) - occurrences(line, closer));
            continue;
        }
        if (importRe.test(line)) {
            depth = Math.max(0, occurrences(line, opener) - occurrences(line, closer));
            continue;
        }
        const trimmed = line.trim();
        if (trimmed === "" || trimmed.startsWith("//") || trimmed.startsWith("#")) {
            continue;
        }
        break;
    }
    if (consumed === 0) {
        return "";
    }
    if (consumed === lines.length) {
        // The whole text is import/comment/blank lines: return it unchanged
        // rather than appending a newline the input never had.
        return text;
    }
    return lines.slice(0, consumed).join("\n") + "\n";
}
74
/**
 * Finds the offsets in `body` at which a top-level declaration starts.
 *
 * Python: lines starting with `def` or `class`. Otherwise: lines starting
 * with an (optionally exported / default / async) `function` or `class`.
 * Offset 0 is always present as the first entry so the text before the
 * first declaration forms its own segment.
 *
 * @returns Ascending start offsets; `[0]` when nothing matches.
 */
function findBoundaries(body, lang) {
    const declarationRe = lang === "python"
        ? /^(?:def|class)\s+\w+/gm
        : /^(?:export\s+(?:default\s+)?(?:async\s+)?(?:function|class)|(?:async\s+)?function|class)\s+\w+/gm;
    const offsets = Array.from(body.matchAll(declarationRe), (match) => match.index);
    return offsets[0] === 0 ? offsets : [0, ...offsets];
}
86
+ //# sourceMappingURL=ast.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ast.js","sourceRoot":"","sources":["../../src/chunking/ast.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAQhD,+EAA+E;AAC/E,6EAA6E;AAC7E,6EAA6E;AAC7E,4EAA4E;AAC5E,8EAA8E;AAC9E,MAAM,UAAU,qBAAqB,CAAC,IAAY,EAAE,QAA0D;IAC5G,MAAM,IAAI,GAAG,QAAQ,KAAK,MAAM,IAAI,QAAQ,KAAK,SAAS,CAAC,CAAC,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC;IAC7F,OAAO,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,MAAM,CAAC;AAC3C,CAAC;AAED,MAAM,UAAU,QAAQ,CAAC,IAAY,EAAE,OAAqB,EAAE;IAC5D,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,KAAK,MAAM,IAAI,IAAI,CAAC,QAAQ,KAAK,SAAS;QAClE,CAAC,CAAC,cAAc,CAAC,IAAI,CAAC;QACtB,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC;IAClB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC;IACrC,MAAM,eAAe,GAAG,IAAI,CAAC,eAAe,IAAI,IAAI,CAAC;IAErD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,MAAM,OAAO,GAAG,eAAe,CAAC,CAAC,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAClE,MAAM,IAAI,GAAG,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAEjE,MAAM,UAAU,GAAG,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IAC9C,IAAI,UAAU,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QAC3B,MAAM,MAAM,GAAG,OAAO,GAAG,IAAI,CAAC;QAC9B,IAAI,MAAM,CAAC,MAAM,IAAI,OAAO;YAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QAC9C,OAAO,cAAc,CAAC,MAAM,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;IAC7C,CAAC;IAED,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACjC,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC;QAC7C,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;IACxC,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;QAC3B,MAAM,SAAS,GAAG,OAAO,GAAG,GAAG,CAAC;QAChC,IAAI,SAAS,CAAC,MAAM,IAAI,OAAO,EAAE,CAAC;YAChC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACzB,CAAC;aAAM,CAAC;YACN,KAAK,MAAM,GAAG,IAAI,cAAc,CAAC,SAAS,EAAE,EAAE,OAAO,EAAE,CAAC;gBAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC7E,CAAC;IACH,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,cAA
c,CAAC,IAAY;IAClC,IAAI,wCAAwC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,QAAQ,CAAC;IACzE,IAAI,mCAAmC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,YAAY,CAAC;IACxE,OAAO,YAAY,CAAC;AACtB,CAAC;AAED,SAAS,cAAc,CAAC,IAAY,EAAE,IAA4C;IAChF,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC/B,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,MAAM,QAAQ,GACZ,IAAI,KAAK,QAAQ;QACf,CAAC,CAAC,wCAAwC;QAC1C,CAAC,CAAC,8DAA8D,CAAC;IACrE,IAAI,CAAC,GAAG,CAAC,CAAC;IACV,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7B,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YAC7G,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACjB,CAAC;aAAM,CAAC;YACN,MAAM;QACR,CAAC;IACH,CAAC;IACD,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAChC,OAAO,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;AAC/B,CAAC;AAED,SAAS,cAAc,CAAC,IAAY,EAAE,IAA4C;IAChF,MAAM,EAAE,GACN,IAAI,KAAK,QAAQ;QACf,CAAC,CAAC,wBAAwB;QAC1B,CAAC,CAAC,kGAAkG,CAAC;IACzG,MAAM,SAAS,GAAa,EAAE,CAAC;IAC/B,IAAI,CAAyB,CAAC;IAC9B,OAAO,CAAC,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI;QAAE,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;IAC7D,IAAI,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC;QAAE,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;IAC7C,OAAO,SAAS,CAAC;AACnB,CAAC"}
@@ -0,0 +1,15 @@
1
+ export interface ContextProvider {
2
+ generate(args: {
3
+ systemPrompt: string;
4
+ userPrompt: string;
5
+ }): Promise<string>;
6
+ }
7
+ export interface AddChunkContextArgs {
8
+ chunk: string;
9
+ document: string;
10
+ provider: ContextProvider;
11
+ documentBudgetChars?: number;
12
+ }
13
+ export declare function addChunkContext(args: AddChunkContextArgs): Promise<string>;
14
+ export declare function prependContext(chunk: string, contextParagraph: string): string;
15
+ //# sourceMappingURL=contextual-fallback.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"contextual-fallback.d.ts","sourceRoot":"","sources":["../../src/chunking/contextual-fallback.ts"],"names":[],"mappings":"AAKA,MAAM,WAAW,eAAe;IAI9B,QAAQ,CAAC,IAAI,EAAE;QAAE,YAAY,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CAC/E;AAED,MAAM,WAAW,mBAAmB;IAClC,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,eAAe,CAAC;IAG1B,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAKD,wBAAsB,eAAe,CAAC,IAAI,EAAE,mBAAmB,GAAG,OAAO,CAAC,MAAM,CAAC,CAqBhF;AAED,wBAAgB,cAAc,CAAC,KAAK,EAAE,MAAM,EAAE,gBAAgB,EAAE,MAAM,GAAG,MAAM,CAG9E"}
@@ -0,0 +1,33 @@
1
+ // Per-chunk LLM context generator (Anthropic Contextual Retrieval recipe). Only
2
+ // invoked when the embedder is NOT context-aware (so we get the same lift on
3
+ // OpenAI / MiniLM that voyage-context-3 gets for free). The prompt asks for a
4
+ // short paragraph explaining where the chunk sits in the document.
5
// System prompt sent with every context-generation request.
const SYSTEM_PROMPT = "You write a single ~50–100 token context paragraph that situates a chunk inside its source document. The paragraph is prepended to the chunk before embedding, so retrieval can use both the chunk's content and its surrounding context. Output the paragraph and nothing else — no preamble, no markdown.";
/**
 * Asks `args.provider` for a short paragraph that situates `args.chunk`
 * inside `args.document`.
 *
 * The document is cut to its first `documentBudgetChars` characters
 * (default 16 000) and marked as truncated when it is longer than that.
 *
 * @param args - `chunk`, `document`, `provider` and optional
 *   `documentBudgetChars`.
 * @returns The provider's reply with surrounding whitespace trimmed.
 */
export async function addChunkContext(args) {
    const budget = args.documentBudgetChars ?? 16_000;
    let doc = args.document;
    if (doc.length > budget) {
        doc = doc.slice(0, budget) + "\n[... truncated ...]";
    }
    const documentSection = ["<document>", doc, "</document>"];
    const chunkSection = ["<chunk>", args.chunk, "</chunk>"];
    const instruction = "Write the ~50–100 token context paragraph for this chunk.";
    const userPrompt = [...documentSection, "", ...chunkSection, "", instruction].join("\n");
    const reply = await args.provider.generate({ systemPrompt: SYSTEM_PROMPT, userPrompt });
    return reply.trim();
}
28
/**
 * Puts `contextParagraph` in front of `chunk`, separated by a blank line.
 * An empty paragraph leaves the chunk unchanged.
 */
export function prependContext(chunk, contextParagraph) {
    const hasContext = contextParagraph.length !== 0;
    return hasContext ? contextParagraph + "\n\n" + chunk : chunk;
}
33
+ //# sourceMappingURL=contextual-fallback.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"contextual-fallback.js","sourceRoot":"","sources":["../../src/chunking/contextual-fallback.ts"],"names":[],"mappings":"AAAA,gFAAgF;AAChF,6EAA6E;AAC7E,8EAA8E;AAC9E,mEAAmE;AAkBnE,MAAM,aAAa,GACjB,6SAA6S,CAAC;AAEhT,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,IAAyB;IAC7D,MAAM,SAAS,GAAG,IAAI,CAAC,mBAAmB,IAAI,MAAM,CAAC;IACrD,MAAM,GAAG,GAAG,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,SAAS;QAC1C,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,GAAG,uBAAuB;QAC7D,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC;IAClB,MAAM,UAAU,GAAG;QACjB,YAAY;QACZ,GAAG;QACH,aAAa;QACb,EAAE;QACF,SAAS;QACT,IAAI,CAAC,KAAK;QACV,UAAU;QACV,EAAE;QACF,2DAA2D;KAC5D,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACb,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC;QACvC,YAAY,EAAE,aAAa;QAC3B,UAAU;KACX,CAAC,CAAC;IACH,OAAO,GAAG,CAAC,IAAI,EAAE,CAAC;AACpB,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,KAAa,EAAE,gBAAwB;IACpE,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IAChD,OAAO,GAAG,gBAAgB,OAAO,KAAK,EAAE,CAAC;AAC3C,CAAC"}
@@ -0,0 +1,6 @@
1
+ export interface FixedChunkOpts {
2
+ windowSize?: number;
3
+ overlap?: number;
4
+ }
5
+ export declare function fixedChunk(text: string, opts?: FixedChunkOpts): string[];
6
+ //# sourceMappingURL=fixed.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fixed.d.ts","sourceRoot":"","sources":["../../src/chunking/fixed.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,cAAc;IAC7B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,cAAmB,GAAG,MAAM,EAAE,CAmB5E"}
@@ -0,0 +1,24 @@
1
/**
 * Cuts `text` into fixed-size windows that overlap by `overlap` characters.
 *
 * @param text - Text to split; "" yields `[]`, text no longer than the
 *   window yields `[text]`.
 * @param opts - `windowSize` (default 800) and `overlap` (default 100).
 * @returns Windows in order; the last one ends exactly at the end of `text`.
 * @throws Error when `windowSize <= 0`, or — only once the text actually
 *   needs splitting — when `overlap` is outside `[0, windowSize)`.
 */
export function fixedChunk(text, opts = {}) {
    const windowSize = opts.windowSize ?? 800;
    const overlap = opts.overlap ?? 100;
    if (windowSize <= 0) {
        throw new Error("windowSize must be > 0");
    }
    const total = text.length;
    if (total === 0) {
        return [];
    }
    if (total <= windowSize) {
        return [text];
    }
    if (overlap < 0 || overlap >= windowSize) {
        throw new Error("overlap must be in [0, windowSize)");
    }
    const stride = windowSize - overlap;
    const windows = [];
    for (let start = 0; start < total; start += stride) {
        const end = Math.min(start + windowSize, total);
        windows.push(text.slice(start, end));
        // Stop as soon as a window reaches the end so no tail-only duplicate is emitted.
        if (end === total) {
            break;
        }
    }
    return windows;
}
24
+ //# sourceMappingURL=fixed.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fixed.js","sourceRoot":"","sources":["../../src/chunking/fixed.ts"],"names":[],"mappings":"AAKA,MAAM,UAAU,UAAU,CAAC,IAAY,EAAE,OAAuB,EAAE;IAChE,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,IAAI,GAAG,CAAC;IAC1C,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC;IACpC,IAAI,UAAU,IAAI,CAAC;QAAE,MAAM,IAAI,KAAK,CAAC,wBAAwB,CAAC,CAAC;IAC/D,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACjC,IAAI,IAAI,CAAC,MAAM,IAAI,UAAU;QAAE,OAAO,CAAC,IAAI,CAAC,CAAC;IAC7C,IAAI,OAAO,GAAG,CAAC,IAAI,OAAO,IAAI,UAAU,EAAE,CAAC;QACzC,MAAM,IAAI,KAAK,CAAC,oCAAoC,CAAC,CAAC;IACxD,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAC3B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,UAAU,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QACtD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;QACpC,IAAI,GAAG,KAAK,IAAI,CAAC,MAAM;YAAE,MAAM;QAC/B,KAAK,IAAI,UAAU,GAAG,OAAO,CAAC;IAChC,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,24 @@
1
+ export type ChunkMode = "fixed" | "recursive" | "ast" | "late";
2
+ export interface ChunkDocumentOpts {
3
+ content: string;
4
+ mode?: ChunkMode;
5
+ mimeType?: string;
6
+ maxSize?: number;
7
+ overlap?: number;
8
+ }
9
+ export interface Chunk {
10
+ content: string;
11
+ ordinal: number;
12
+ charStart: number;
13
+ charEnd: number;
14
+ mode: ChunkMode;
15
+ }
16
+ export declare function chunkDocument(opts: ChunkDocumentOpts): Chunk[];
17
+ export { fixedChunk } from "./fixed.js";
18
+ export { recursiveChunk } from "./recursive.js";
19
+ export { astChunk } from "./ast.js";
20
+ export { meanPoolByRange } from "./late.js";
21
+ export type { LateChunkRange, LateChunkTokenPosition } from "./late.js";
22
+ export { addChunkContext, prependContext } from "./contextual-fallback.js";
23
+ export type { ContextProvider, AddChunkContextArgs } from "./contextual-fallback.js";
24
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/chunking/index.ts"],"names":[],"mappings":"AAIA,MAAM,MAAM,SAAS,GAAG,OAAO,GAAG,WAAW,GAAG,KAAK,GAAG,MAAM,CAAC;AAE/D,MAAM,WAAW,iBAAiB;IAChC,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,SAAS,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,KAAK;IACpB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,EAAE,SAAS,CAAC;CACjB;AAWD,wBAAgB,aAAa,CAAC,IAAI,EAAE,iBAAiB,GAAG,KAAK,EAAE,CAiD9D;AAiBD,OAAO,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AACxC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AACpC,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,YAAY,EAAE,cAAc,EAAE,sBAAsB,EAAE,MAAM,WAAW,CAAC;AACxE,OAAO,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAC3E,YAAY,EAAE,eAAe,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC"}
@@ -0,0 +1,86 @@
1
+ import { fixedChunk } from "./fixed.js";
2
+ import { recursiveChunk } from "./recursive.js";
3
+ import { astChunk, astImportPrefixLength } from "./ast.js";
4
// MIME-type prefixes that mark a document as source code.
const CODE_MIME_PREFIXES = [
    // TypeScript
    "text/x-typescript",
    "text/typescript",
    "application/typescript",
    // JavaScript
    "text/javascript",
    "application/javascript",
    // Python
    "text/x-python",
];
12
/**
 * Chunks a document and annotates every chunk with its ordinal and its
 * character range in the original content.
 *
 * Mode selection: `opts.mode`, or `autoMode(opts.mimeType)` when omitted.
 * "ast" uses `astChunk`, "recursive" and "late" both use `recursiveChunk`,
 * anything else uses `fixedChunk`.
 *
 * Range recovery: chunks are located with `indexOf`, searching from just
 * after the previous chunk's start so overlapping chunks are still found.
 * In "ast" mode a chunk may carry the file's import block as a prefix that
 * does not sit next to the segment in the original text; that prefix is
 * stripped before searching — but only from chunks that really start with
 * it. Sub-chunks produced by the recursive fallback do not carry the prefix,
 * and stripping them blindly would remove real content and shift the range.
 *
 * @param opts - `content`, optional `mode`, `mimeType`, `maxSize`
 *   (default 800) and `overlap` (default 100).
 * @returns One `{ content, ordinal, charStart, charEnd, mode }` per chunk.
 *   When a chunk cannot be located, the range falls back to
 *   `[cursor, cursor + content.length)`.
 */
export function chunkDocument(opts) {
    const mode = opts.mode ?? autoMode(opts.mimeType);
    const maxSize = opts.maxSize ?? 800;
    const overlap = opts.overlap ?? 100;
    let raw;
    if (mode === "ast") {
        raw = astChunk(opts.content, { maxSize, language: pickLanguage(opts.mimeType) });
    }
    else if (mode === "recursive") {
        raw = recursiveChunk(opts.content, { maxSize, overlap });
    }
    else if (mode === "late") {
        raw = recursiveChunk(opts.content, { maxSize, overlap });
    }
    else {
        raw = fixedChunk(opts.content, { windowSize: maxSize, overlap });
    }
    const importPrefixLen = mode === "ast" ? astImportPrefixLength(opts.content, pickLanguage(opts.mimeType)) : 0;
    // The import block is the head of the document; used to recognise chunks
    // that actually had it prepended.
    const importPrefix = opts.content.slice(0, importPrefixLen);
    let cursor = 0;
    return raw.map((content, i) => {
        const carriesImports = importPrefixLen > 0 && content.startsWith(importPrefix);
        const searchText = carriesImports ? content.slice(importPrefixLen) : content;
        const searchFrom = i === 0 ? 0 : cursor;
        const found = searchText.length > 0
            ? opts.content.indexOf(searchText, searchFrom)
            : opts.content.indexOf(content, searchFrom);
        let charStart;
        let charEnd;
        if (found >= 0) {
            charStart = found;
            charEnd = found + searchText.length;
            // Advance by one only: the next chunk may overlap this one.
            cursor = charStart + 1;
        }
        else {
            const directFound = opts.content.indexOf(content, searchFrom);
            if (directFound >= 0) {
                charStart = directFound;
                charEnd = directFound + content.length;
                cursor = charStart + 1;
            }
            else {
                // Chunk text does not occur verbatim; report a best-effort range.
                charStart = cursor;
                charEnd = cursor + content.length;
            }
        }
        return {
            content,
            ordinal: i,
            charStart,
            charEnd,
            mode,
        };
    });
}
65
/**
 * Chooses the default chunking mode for a MIME type: "ast" when it starts
 * with one of `CODE_MIME_PREFIXES`, "recursive" otherwise (including when no
 * MIME type is given).
 */
function autoMode(mimeType) {
    if (!mimeType) {
        return "recursive";
    }
    for (const prefix of CODE_MIME_PREFIXES) {
        if (mimeType.startsWith(prefix)) {
            return "ast";
        }
    }
    return "recursive";
}
70
/**
 * Maps a MIME type to a chunker language by substring match, checking
 * "typescript", then "javascript", then "python". Returns "auto" when the
 * MIME type is missing or matches none of them.
 */
function pickLanguage(mimeType) {
    if (!mimeType) {
        return "auto";
    }
    const known = ["typescript", "javascript", "python"];
    return known.find((lang) => mimeType.includes(lang)) ?? "auto";
}
81
+ export { fixedChunk } from "./fixed.js";
82
+ export { recursiveChunk } from "./recursive.js";
83
+ export { astChunk } from "./ast.js";
84
+ export { meanPoolByRange } from "./late.js";
85
+ export { addChunkContext, prependContext } from "./contextual-fallback.js";
86
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/chunking/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AACxC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,EAAE,QAAQ,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAoB3D,MAAM,kBAAkB,GAAG;IACzB,mBAAmB;IACnB,iBAAiB;IACjB,wBAAwB;IACxB,iBAAiB;IACjB,wBAAwB;IACxB,eAAe;CAChB,CAAC;AAEF,MAAM,UAAU,aAAa,CAAC,IAAuB;IACnD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,IAAI,QAAQ,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IAClD,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC;IACpC,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC;IAEpC,IAAI,GAAa,CAAC;IAClB,IAAI,IAAI,KAAK,KAAK,EAAE,CAAC;QACnB,GAAG,GAAG,QAAQ,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,QAAQ,EAAE,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;IACnF,CAAC;SAAM,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;QAChC,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC,CAAC;IAC3D,CAAC;SAAM,IAAI,IAAI,KAAK,MAAM,EAAE,CAAC;QAC3B,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC,CAAC;IAC3D,CAAC;SAAM,CAAC;QACN,GAAG,GAAG,UAAU,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,UAAU,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC,CAAC;IACnE,CAAC;IAED,MAAM,eAAe,GAAG,IAAI,KAAK,KAAK,CAAC,CAAC,CAAC,qBAAqB,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC9G,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,EAAE,EAAE;QAC5B,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC;QAClD,MAAM,UAAU,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QACxC,MAAM,KAAK,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC;YACjC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,UAAU,EAAE,UAAU,CAAC;YAC9C,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC;QAC9C,IAAI,SAAiB,CAAC;QACtB,IAAI,OAAe,CAAC;QACpB,IAAI,KAAK,IAAI,CAAC,EAAE,CAAC;YACf,SAAS,GAAG,KAAK,CAAC;YAClB,OAAO,GAAG,KAAK,GAAG,UAAU,CAAC,MAAM,CAAC;YACpC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;QACzB,CAAC;aAAM,CAAC;YACN,MAAM,WAAW,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC;YAC9D,IAAI,WAAW,IAAI,CAAC,EAAE,CAAC;gBACrB,SAA
S,GAAG,WAAW,CAAC;gBACxB,OAAO,GAAG,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC;gBACvC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YACzB,CAAC;iBAAM,CAAC;gBACN,SAAS,GAAG,MAAM,CAAC;gBACnB,OAAO,GAAG,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;YACpC,CAAC;QACH,CAAC;QACD,OAAO;YACL,OAAO;YACP,OAAO,EAAE,CAAC;YACV,SAAS;YACT,OAAO;YACP,IAAI;SACL,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,QAAQ,CAAC,QAAiB;IACjC,IAAI,QAAQ,IAAI,kBAAkB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAAE,OAAO,KAAK,CAAC;IACrF,OAAO,WAAW,CAAC;AACrB,CAAC;AAED,SAAS,YAAY,CACnB,QAAiB;IAEjB,IAAI,CAAC,QAAQ;QAAE,OAAO,MAAM,CAAC;IAC7B,IAAI,QAAQ,CAAC,QAAQ,CAAC,YAAY,CAAC;QAAE,OAAO,YAAY,CAAC;IACzD,IAAI,QAAQ,CAAC,QAAQ,CAAC,YAAY,CAAC;QAAE,OAAO,YAAY,CAAC;IACzD,IAAI,QAAQ,CAAC,QAAQ,CAAC,QAAQ,CAAC;QAAE,OAAO,QAAQ,CAAC;IACjD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,OAAO,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AACxC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AACpC,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAE5C,OAAO,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC"}
@@ -0,0 +1,11 @@
1
+ export interface LateChunkRange {
2
+ start: number;
3
+ end: number;
4
+ }
5
+ export interface LateChunkTokenPosition {
6
+ charStart: number;
7
+ charEnd: number;
8
+ vector: number[];
9
+ }
10
+ export declare function meanPoolByRange(tokens: LateChunkTokenPosition[], ranges: LateChunkRange[]): number[][];
11
+ //# sourceMappingURL=late.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"late.d.ts","sourceRoot":"","sources":["../../src/chunking/late.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,cAAc;IAC7B,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;CACb;AAED,MAAM,WAAW,sBAAsB;IACrC,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,EAAE,CAAC;CAClB;AAMD,wBAAgB,eAAe,CAC7B,MAAM,EAAE,sBAAsB,EAAE,EAChC,MAAM,EAAE,cAAc,EAAE,GACvB,MAAM,EAAE,EAAE,CAEZ"}
@@ -0,0 +1,27 @@
1
+ // Late chunking: given token-level embeddings over the entire document and a
2
+ // list of (charStart, charEnd) chunk ranges, returns one mean-pooled vector
3
+ // per chunk. The contract: the token positions are sorted; we accumulate any
4
+ // token whose midpoint falls inside the chunk's char range and average.
5
/**
 * Produces one mean-pooled vector per chunk range.
 *
 * @param tokens - Token embeddings with their character spans.
 * @param ranges - Chunk ranges as `{ start, end }` character offsets.
 * @returns Pooled vectors, in the same order as `ranges`.
 */
export function meanPoolByRange(tokens, ranges) {
    const pooled = [];
    for (const range of ranges) {
        pooled.push(meanPoolOne(tokens, range));
    }
    return pooled;
}
8
/**
 * Averages the vectors of all tokens whose character midpoint lies in
 * `[range.start, range.end)`.
 *
 * When no token qualifies, a zero vector is returned whose length is taken
 * from the first token (or an empty array when there are no tokens).
 */
function meanPoolOne(tokens, range) {
    const inRange = [];
    for (const token of tokens) {
        const midpoint = (token.charStart + token.charEnd) / 2;
        if (midpoint >= range.start && midpoint < range.end) {
            inRange.push(token);
        }
    }
    if (inRange.length === 0) {
        return new Array(tokens[0]?.vector.length ?? 0).fill(0);
    }
    // The first matching token fixes the dimensionality; missing components
    // of shorter vectors count as 0.
    const dim = inRange[0]?.vector.length ?? 0;
    const sums = new Array(dim).fill(0);
    for (const token of inRange) {
        for (let d = 0; d < dim; d++) {
            sums[d] += token.vector[d] ?? 0;
        }
    }
    return sums.map((total) => total / inRange.length);
}
27
+ //# sourceMappingURL=late.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"late.js","sourceRoot":"","sources":["../../src/chunking/late.ts"],"names":[],"mappings":"AAWA,6EAA6E;AAC7E,4EAA4E;AAC5E,6EAA6E;AAC7E,wEAAwE;AACxE,MAAM,UAAU,eAAe,CAC7B,MAAgC,EAChC,MAAwB;IAExB,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC;AACnD,CAAC;AAED,SAAS,WAAW,CAAC,MAAgC,EAAE,KAAqB;IAC1E,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;QACnC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC1C,OAAO,GAAG,IAAI,KAAK,CAAC,KAAK,IAAI,GAAG,GAAG,KAAK,CAAC,GAAG,CAAC;IAC/C,CAAC,CAAC,CAAC;IACH,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,MAAM,IAAI,CAAC,CAAC;QAC1C,OAAO,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAa,CAAC;IAC5C,CAAC;IACD,MAAM,GAAG,GAAG,QAAQ,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,MAAM,IAAI,CAAC,CAAC;IAC5C,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAa,CAAC;IAC/C,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;QAC3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;IAC9E,CAAC;IACD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE;QAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,QAAQ,CAAC,MAAM,CAAC;IACvE,OAAO,GAAG,CAAC;AACb,CAAC"}
@@ -0,0 +1,7 @@
1
+ export interface RecursiveChunkOpts {
2
+ maxSize?: number;
3
+ separators?: string[];
4
+ overlap?: number;
5
+ }
6
+ export declare function recursiveChunk(text: string, opts?: RecursiveChunkOpts): string[];
7
+ //# sourceMappingURL=recursive.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"recursive.d.ts","sourceRoot":"","sources":["../../src/chunking/recursive.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,kBAAkB;IACjC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAKD,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,kBAAuB,GAAG,MAAM,EAAE,CAUpF"}
@@ -0,0 +1,87 @@
1
+ // Recursive separator-priority splitter — for each separator in order, splits the
2
+ // text and merges adjacent pieces while staying under maxSize. Falls back to a
3
+ // hard slice when a single piece exceeds maxSize and no smaller separator helps.
4
/**
 * Splits `text` along a priority list of separators and merges adjacent
 * pieces back together while they fit into `maxSize`.
 *
 * @param text - Text to split; "" yields `[]`, text that already fits
 *   yields `[text]`.
 * @param opts - `maxSize` (default 800), `overlap` (default 100) and
 *   `separators` (default `["\n\n", "\n", " ", ""]`; "" means hard slicing).
 * @returns The chunks in document order.
 * @throws Error when non-empty text has to be split with a `maxSize` that is
 *   not a positive number. Without this guard the hard-slice fallback
 *   advances by `maxSize` characters per step and would never terminate.
 */
export function recursiveChunk(text, opts = {}) {
    const maxSize = opts.maxSize ?? 800;
    const overlap = opts.overlap ?? 100;
    const separators = opts.separators ?? ["\n\n", "\n", " ", ""];
    if (text.length === 0)
        return [];
    // Written as a negated `>` so NaN is rejected as well.
    if (!(maxSize > 0))
        throw new Error("maxSize must be > 0");
    if (text.length <= maxSize)
        return [text];
    const split = splitWithSeparator(text, separators, maxSize);
    return mergeAdjacent(split, maxSize, overlap);
}
15
/**
 * Breaks `text` into pieces of at most `maxSize` characters using the first
 * separator in `separators`; pieces that are still too long are split again
 * with the remaining separators.
 *
 * The empty-string separator means "slice at fixed width". When the list is
 * exhausted, `[""]` is substituted so the recursion always ends in a hard
 * slice. An empty `separators` array returns the text unsplit.
 */
function splitWithSeparator(text, separators, maxSize) {
    if (text.length <= maxSize) {
        return [text];
    }
    const [sep, ...remaining] = separators;
    if (sep === undefined) {
        return [text];
    }
    const pieces = sep === "" ? sliceFixed(text, maxSize) : splitKeepDelimiter(text, sep);
    const fallback = remaining.length > 0 ? remaining : [""];
    return pieces.flatMap((piece) => piece.length <= maxSize
        ? [piece]
        : splitWithSeparator(piece, fallback, maxSize));
}
42
/**
 * Splits `text` on `sep`, leaving each separator attached to the end of the
 * piece that precedes it, so concatenating the pieces restores the input.
 * `sep` must be non-empty.
 */
function splitKeepDelimiter(text, sep) {
    const pieces = [];
    let pos = 0;
    while (pos < text.length) {
        const hit = text.indexOf(sep, pos);
        // No further separator: the rest of the text is the final piece.
        const stop = hit === -1 ? text.length : hit + sep.length;
        pieces.push(text.slice(pos, stop));
        pos = stop;
    }
    return pieces.filter((piece) => piece.length > 0);
}
56
/**
 * Cuts `text` into consecutive slices of `size` characters; the last slice
 * may be shorter. `size` must be positive.
 */
function sliceFixed(text, size) {
    const slices = [];
    let offset = 0;
    while (offset < text.length) {
        slices.push(text.slice(offset, offset + size));
        offset += size;
    }
    return slices;
}
62
/**
 * Greedily merges consecutive parts into chunks of at most `maxSize`
 * characters, carrying the tail of each finished chunk into the next one as
 * overlap.
 *
 * The carried overlap is capped at half of `maxSize` (and floored at 0).
 * With a larger overlap, "overlap + next part" can exceed `maxSize` by more
 * than one hard slice can absorb, which used to leave oversized chunks in
 * the output (and, for `overlap >= maxSize`, a slice that made no progress).
 * For `overlap <= maxSize / 2` the result is unchanged.
 *
 * @param parts - Pieces in document order, each at most `maxSize` long.
 * @param maxSize - Maximum chunk length in characters.
 * @param overlap - Requested number of trailing characters to repeat at the
 *   start of the following chunk.
 * @returns Merged chunks, none longer than `maxSize`.
 */
function mergeAdjacent(parts, maxSize, overlap) {
    const carry = Math.max(0, Math.min(overlap, Math.floor(maxSize / 2)));
    const merged = [];
    let current = "";
    for (const p of parts) {
        if (current.length === 0) {
            current = p;
            continue;
        }
        if (current.length + p.length <= maxSize) {
            current += p;
            continue;
        }
        merged.push(current);
        const overlapText = carry > 0 ? current.slice(-carry) : "";
        current = overlapText + p;
        if (current.length > maxSize) {
            // Overlap plus part does not fit: emit a full-size chunk and keep
            // the rest. The remainder is at most 2 * carry <= maxSize long.
            merged.push(current.slice(0, maxSize));
            current = current.slice(maxSize - carry);
        }
    }
    if (current.length > 0)
        merged.push(current);
    return merged;
}
87
+ //# sourceMappingURL=recursive.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"recursive.js","sourceRoot":"","sources":["../../src/chunking/recursive.ts"],"names":[],"mappings":"AAMA,kFAAkF;AAClF,+EAA+E;AAC/E,iFAAiF;AACjF,MAAM,UAAU,cAAc,CAAC,IAAY,EAAE,OAA2B,EAAE;IACxE,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC;IACpC,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC;IACpC,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC;IAE9D,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACjC,IAAI,IAAI,CAAC,MAAM,IAAI,OAAO;QAAE,OAAO,CAAC,IAAI,CAAC,CAAC;IAE1C,MAAM,KAAK,GAAG,kBAAkB,CAAC,IAAI,EAAE,UAAU,EAAE,OAAO,CAAC,CAAC;IAC5D,OAAO,aAAa,CAAC,KAAK,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;AAChD,CAAC;AAED,SAAS,kBAAkB,CAAC,IAAY,EAAE,UAAoB,EAAE,OAAe;IAC7E,IAAI,IAAI,CAAC,MAAM,IAAI,OAAO;QAAE,OAAO,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAC1B,IAAI,GAAG,KAAK,SAAS;QAAE,OAAO,CAAC,IAAI,CAAC,CAAC;IAErC,IAAI,KAAe,CAAC;IACpB,IAAI,GAAG,KAAK,EAAE,EAAE,CAAC;QACf,KAAK,GAAG,UAAU,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACpC,CAAC;SAAM,CAAC;QACN,KAAK,GAAG,kBAAkB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IACxC,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,IAAI,CAAC,MAAM,IAAI,OAAO,EAAE,CAAC;YAC3B,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACpB,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YACjC,KAAK,MAAM,GAAG,IAAI,kBAAkB,CAAC,IAAI,EAAE,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,OAAO,CAAC,EAAE,CAAC;gBACnF,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACnB,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,kBAAkB,CAAC,IAAY,EAAE,GAAW;IACnD,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,IAAI,CAAC,GAAG,CAAC,CAAC;IACV,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;QACjC,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;YACf,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YACxB,MAAM;QACR,CAAC;QACD,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC;QAC1C,CAAC,GAAG,GA
AG,GAAG,GAAG,CAAC,MAAM,CAAC;IACvB,CAAC;IACD,OAAO,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACzC,CAAC;AAED,SAAS,UAAU,CAAC,IAAY,EAAE,IAAY;IAC5C,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,IAAI,IAAI;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC;IAChF,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,aAAa,CAAC,KAAe,EAAE,OAAe,EAAE,OAAe;IACtE,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,OAAO,GAAG,EAAE,CAAC;IACjB,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACzB,OAAO,GAAG,CAAC,CAAC;YACZ,SAAS;QACX,CAAC;QACD,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,IAAI,OAAO,EAAE,CAAC;YACzC,OAAO,IAAI,CAAC,CAAC;QACf,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACrB,MAAM,WAAW,GAAG,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC/D,OAAO,GAAG,WAAW,GAAG,CAAC,CAAC;YAC1B,IAAI,OAAO,CAAC,MAAM,GAAG,OAAO,EAAE,CAAC;gBAC7B,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;gBACvC,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,OAAO,GAAG,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,OAAO,CAAC,CAAC;YACzE,CAAC;QACH,CAAC;IACH,CAAC;IACD,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC7C,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,14 @@
1
+ import type { Embedder } from "./types.js";
2
+ import { createVoyageContext3Embedder } from "./voyage-context-3.js";
3
+ import { createOpenAIEmbedder } from "./openai-text-3-small.js";
4
+ import { createLocalMiniLMEmbedder } from "./local-minilm.js";
5
/**
 * Model identifiers accepted by `createEmbedder`.
 * NOTE(review): `"auto"` presumably requests the same key-based selection as
 * omitting the model — confirm against the `createEmbedder` implementation.
 */
+ export type EmbedderModel = "voyage-context-3" | "text-embedding-3-small" | "all-MiniLM-L6-v2" | "auto";
6
/** Options accepted by `createEmbedder`. */
+ export interface CreateEmbedderOpts {
7
/** Embedder to build. When omitted, one is picked from the provider keys found in `env`. */
+ model?: EmbedderModel;
8
/**
 * Requested vector size. Forwarded as `outputDimension` to the Voyage
 * embedder and as `dimensions` to the OpenAI embedder; the local MiniLM
 * embedder is created without it.
 */
+ dimensions?: number;
9
/** Environment to read `VOYAGE_API_KEY` / `OPENAI_API_KEY` from. Defaults to `process.env`. */
+ env?: NodeJS.ProcessEnv;
10
+ }
11
/**
 * Creates an `Embedder` for `opts.model`, or for a model chosen from the
 * provider keys in the environment when `opts.model` is omitted.
 */
+ export declare function createEmbedder(opts?: CreateEmbedderOpts): Embedder;
12
+ export { createVoyageContext3Embedder, createOpenAIEmbedder, createLocalMiniLMEmbedder, };
13
+ export type { Embedder, EmbedKind } from "./types.js";
14
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/embedding/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAC3C,OAAO,EAAE,4BAA4B,EAAE,MAAM,uBAAuB,CAAC;AACrE,OAAO,EAAE,oBAAoB,EAAE,MAAM,0BAA0B,CAAC;AAChE,OAAO,EAAE,yBAAyB,EAAE,MAAM,mBAAmB,CAAC;AAE9D,MAAM,MAAM,aAAa,GACrB,kBAAkB,GAClB,wBAAwB,GACxB,kBAAkB,GAClB,MAAM,CAAC;AAEX,MAAM,WAAW,kBAAkB;IACjC,KAAK,CAAC,EAAE,aAAa,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,GAAG,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC;CACzB;AAID,wBAAgB,cAAc,CAAC,IAAI,GAAE,kBAAuB,GAAG,QAAQ,CAmBtE;AAQD,OAAO,EACL,4BAA4B,EAC5B,oBAAoB,EACpB,yBAAyB,GAC1B,CAAC;AACF,YAAY,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC"}
@@ -0,0 +1,33 @@
1
+ import { createVoyageContext3Embedder } from "./voyage-context-3.js";
2
+ import { createOpenAIEmbedder } from "./openai-text-3-small.js";
3
+ import { createLocalMiniLMEmbedder } from "./local-minilm.js";
4
/**
 * Build an embedder for the requested model, or auto-select one.
 *
 * When `opts.model` is omitted or is `"auto"`, the model is chosen by
 * `autoModel` from the provider keys present in the environment
 * (precedence: voyage > openai > local). Callers can force a specific model
 * by naming it. Any value other than the two hosted model ids yields the
 * local MiniLM embedder.
 *
 * `opts.dimensions` is forwarded as `outputDimension` to the Voyage embedder
 * and as `dimensions` to the OpenAI embedder; the local embedder is created
 * without it. An API key is only passed along when the matching environment
 * variable is non-empty.
 *
 * @param {object} [opts] Optional `model`, `dimensions` and `env`.
 * @returns The embedder created by the selected factory.
 */
export function createEmbedder(opts = {}) {
    const env = opts.env ?? process.env;
    const requested = opts.model;
    // `"auto"` is a non-nullish string, so `??` alone would let it through
    // and skip key detection; resolve it exactly like an omitted model.
    const model = requested == null || requested === "auto" ? autoModel(env) : requested;
    if (model === "voyage-context-3") {
        return createVoyageContext3Embedder({
            ...(env["VOYAGE_API_KEY"] ? { apiKey: env["VOYAGE_API_KEY"] } : {}),
            ...(opts.dimensions !== undefined
                ? { outputDimension: opts.dimensions }
                : {}),
        });
    }
    if (model === "text-embedding-3-small") {
        return createOpenAIEmbedder({
            ...(env["OPENAI_API_KEY"] ? { apiKey: env["OPENAI_API_KEY"] } : {}),
            ...(opts.dimensions !== undefined ? { dimensions: opts.dimensions } : {}),
        });
    }
    return createLocalMiniLMEmbedder();
}
25
/**
 * Pick a model id from the provider keys present in `env`.
 *
 * The first non-empty key wins, checked in the order Voyage, then OpenAI;
 * with neither set, the local MiniLM model id is returned.
 *
 * @param {Record<string, string | undefined>} env Environment variables.
 * @returns {string} The selected model id.
 */
function autoModel(env) {
    const byProviderKey = [
        ["VOYAGE_API_KEY", "voyage-context-3"],
        ["OPENAI_API_KEY", "text-embedding-3-small"],
    ];
    for (const [keyName, modelId] of byProviderKey) {
        if (env[keyName]) {
            return modelId;
        }
    }
    return "all-MiniLM-L6-v2";
}
32
+ export { createVoyageContext3Embedder, createOpenAIEmbedder, createLocalMiniLMEmbedder, };
33
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/embedding/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,4BAA4B,EAAE,MAAM,uBAAuB,CAAC;AACrE,OAAO,EAAE,oBAAoB,EAAE,MAAM,0BAA0B,CAAC;AAChE,OAAO,EAAE,yBAAyB,EAAE,MAAM,mBAAmB,CAAC;AAc9D,oEAAoE;AACpE,2EAA2E;AAC3E,MAAM,UAAU,cAAc,CAAC,OAA2B,EAAE;IAC1D,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,IAAI,OAAO,CAAC,GAAG,CAAC;IACpC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,SAAS,CAAC,GAAG,CAAC,CAAC;IAE3C,IAAI,KAAK,KAAK,kBAAkB,EAAE,CAAC;QACjC,OAAO,4BAA4B,CAAC;YAClC,GAAG,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,GAAG,CAAC,gBAAgB,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACnE,GAAG,CAAC,IAAI,CAAC,UAAU,KAAK,SAAS;gBAC/B,CAAC,CAAC,EAAE,eAAe,EAAE,IAAI,CAAC,UAAqC,EAAE;gBACjE,CAAC,CAAC,EAAE,CAAC;SACR,CAAC,CAAC;IACL,CAAC;IACD,IAAI,KAAK,KAAK,wBAAwB,EAAE,CAAC;QACvC,OAAO,oBAAoB,CAAC;YAC1B,GAAG,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,GAAG,CAAC,gBAAgB,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACnE,GAAG,CAAC,IAAI,CAAC,UAAU,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SAC1E,CAAC,CAAC;IACL,CAAC;IACD,OAAO,yBAAyB,EAAE,CAAC;AACrC,CAAC;AAED,SAAS,SAAS,CAAC,GAAsB;IACvC,IAAI,GAAG,CAAC,gBAAgB,CAAC;QAAE,OAAO,kBAAkB,CAAC;IACrD,IAAI,GAAG,CAAC,gBAAgB,CAAC;QAAE,OAAO,wBAAwB,CAAC;IAC3D,OAAO,kBAAkB,CAAC;AAC5B,CAAC;AAED,OAAO,EACL,4BAA4B,EAC5B,oBAAoB,EACpB,yBAAyB,GAC1B,CAAC"}
@@ -0,0 +1,8 @@
1
+ import type { Embedder } from "./types.js";
2
/** Options accepted by `createLocalMiniLMEmbedder`. */
+ export interface LocalMiniLMOpts {
3
/**
 * Optional replacement for the default pipeline loader. When provided it is
 * used instead of importing `@huggingface/transformers`. It must resolve to a
 * function that takes a text plus an options object and resolves to an
 * object whose `data` holds the embedding values.
 */
+ pipelineFactory?: () => Promise<(text: string, opts: object) => Promise<{
4
+ data: Float32Array;
5
+ }>>;
6
+ }
7
/**
 * Creates an `Embedder` backed by the local MiniLM feature-extraction
 * pipeline, or by `opts.pipelineFactory` when one is supplied.
 */
+ export declare function createLocalMiniLMEmbedder(opts?: LocalMiniLMOpts): Embedder;
8
+ //# sourceMappingURL=local-minilm.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"local-minilm.d.ts","sourceRoot":"","sources":["../../src/embedding/local-minilm.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,QAAQ,EAAa,MAAM,YAAY,CAAC;AAuBtD,MAAM,WAAW,eAAe;IAG9B,eAAe,CAAC,EAAE,MAAM,OAAO,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;QAAE,IAAI,EAAE,YAAY,CAAA;KAAE,CAAC,CAAC,CAAC;CAClG;AAED,wBAAgB,yBAAyB,CAAC,IAAI,GAAE,eAAoB,GAAG,QAAQ,CAyC9E"}
@@ -0,0 +1,51 @@
1
// Model id passed to the feature-extraction pipeline and reported as the
// embedder's `modelId`.
+ const MODEL = "Xenova/all-MiniLM-L6-v2";
2
// Value reported as the embedder's `dim`.
// NOTE(review): presumably the vector length this model produces — the code
// does not check it against the pipeline output.
+ const DIM = 384;
3
// Loads the optional @huggingface/transformers peer dependency on demand via
// an unconditional dynamic import. If the import fails, the rejection is
// replaced by an Error carrying install guidance, with the original failure
// attached as `cause`.
// NOTE(review): an earlier note here said consumers can mock this loader with
// vi.mock("@keel_flow/kb-pipeline/dist/embedding/loader.js"); this function is
// module-private in this file, so confirm that path before relying on it.
async function loadHuggingface() {
    const explainMissingPeer = (cause) => {
        const message = "The local-minilm embedder requires @huggingface/transformers, which is an optional peer dependency of @keel_flow/kb-pipeline. " +
            "Install it with: pnpm add @huggingface/transformers — " +
            "or set OPENAI_API_KEY to use the OpenAI text-embedding-3-small path instead.";
        throw new Error(message, { cause });
    };
    return import("@huggingface/transformers").catch(explainMissingPeer);
}
18
/**
 * Create an embedder backed by a lazily built feature-extraction pipeline.
 *
 * The pipeline is built on the first non-empty `embed` call and the pending
 * promise is reused by later calls. If building fails, the cached promise is
 * cleared so a later call starts over. `opts.pipelineFactory`, when given,
 * replaces the default loader.
 *
 * @param {object} [opts] Optional `pipelineFactory` override.
 * @returns An embedder exposing `modelId`, `dim`, `isContextAware` and
 *   `embed(texts, kind)`.
 */
export function createLocalMiniLMEmbedder(opts = {}) {
    let cached = null;
    const buildPipeline = async () => {
        if (opts.pipelineFactory) {
            return opts.pipelineFactory();
        }
        const hf = await loadHuggingface();
        return hf.pipeline("feature-extraction", MODEL, { revision: "main" });
    };
    const getPipeline = () => {
        if (!cached) {
            const pending = buildPipeline();
            cached = pending;
            // Forget a failed build so the next call can retry; the caller
            // still sees the rejection through the promise returned below.
            pending.catch(() => {
                cached = null;
            });
        }
        return cached;
    };
    return {
        modelId: MODEL,
        dim: DIM,
        isContextAware: false,
        // Texts are embedded one at a time, in input order; the second
        // argument is accepted but not used.
        async embed(texts, _kind) {
            if (texts.length === 0) {
                return [];
            }
            const run = await getPipeline();
            const vectors = [];
            for (const text of texts) {
                const { data } = await run(text, { pooling: "mean", normalize: true });
                vectors.push(Array.from(data));
            }
            return vectors;
        },
    };
}
51
+ //# sourceMappingURL=local-minilm.js.map