@convex-dev/rag 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +371 -0
- package/dist/client/_generated/_ignore.d.ts +1 -0
- package/dist/client/_generated/_ignore.d.ts.map +1 -0
- package/dist/client/_generated/_ignore.js +3 -0
- package/dist/client/_generated/_ignore.js.map +1 -0
- package/dist/client/defaultChunker.d.ts +15 -0
- package/dist/client/defaultChunker.d.ts.map +1 -0
- package/dist/client/defaultChunker.js +148 -0
- package/dist/client/defaultChunker.js.map +1 -0
- package/dist/client/fileUtils.d.ts +24 -0
- package/dist/client/fileUtils.d.ts.map +1 -0
- package/dist/client/fileUtils.js +179 -0
- package/dist/client/fileUtils.js.map +1 -0
- package/dist/client/index.d.ts +442 -0
- package/dist/client/index.d.ts.map +1 -0
- package/dist/client/index.js +597 -0
- package/dist/client/index.js.map +1 -0
- package/dist/client/types.d.ts +29 -0
- package/dist/client/types.d.ts.map +1 -0
- package/dist/client/types.js +2 -0
- package/dist/client/types.js.map +1 -0
- package/dist/component/_generated/api.d.ts +439 -0
- package/dist/component/_generated/api.d.ts.map +1 -0
- package/dist/component/_generated/api.js +22 -0
- package/dist/component/_generated/api.js.map +1 -0
- package/dist/component/_generated/dataModel.d.ts +60 -0
- package/dist/component/_generated/server.d.ts +149 -0
- package/dist/component/_generated/server.d.ts.map +1 -0
- package/dist/component/_generated/server.js +74 -0
- package/dist/component/_generated/server.js.map +1 -0
- package/dist/component/chunks.d.ts +139 -0
- package/dist/component/chunks.d.ts.map +1 -0
- package/dist/component/chunks.js +413 -0
- package/dist/component/chunks.js.map +1 -0
- package/dist/component/convex.config.d.ts +3 -0
- package/dist/component/convex.config.d.ts.map +1 -0
- package/dist/component/convex.config.js +6 -0
- package/dist/component/convex.config.js.map +1 -0
- package/dist/component/embeddings/importance.d.ts +21 -0
- package/dist/component/embeddings/importance.d.ts.map +1 -0
- package/dist/component/embeddings/importance.js +67 -0
- package/dist/component/embeddings/importance.js.map +1 -0
- package/dist/component/embeddings/index.d.ts +23 -0
- package/dist/component/embeddings/index.d.ts.map +1 -0
- package/dist/component/embeddings/index.js +54 -0
- package/dist/component/embeddings/index.js.map +1 -0
- package/dist/component/embeddings/tables.d.ts +39 -0
- package/dist/component/embeddings/tables.d.ts.map +1 -0
- package/dist/component/embeddings/tables.js +53 -0
- package/dist/component/embeddings/tables.js.map +1 -0
- package/dist/component/entries.d.ts +167 -0
- package/dist/component/entries.d.ts.map +1 -0
- package/dist/component/entries.js +409 -0
- package/dist/component/entries.js.map +1 -0
- package/dist/component/filters.d.ts +46 -0
- package/dist/component/filters.d.ts.map +1 -0
- package/dist/component/filters.js +72 -0
- package/dist/component/filters.js.map +1 -0
- package/dist/component/namespaces.d.ts +131 -0
- package/dist/component/namespaces.d.ts.map +1 -0
- package/dist/component/namespaces.js +222 -0
- package/dist/component/namespaces.js.map +1 -0
- package/dist/component/schema.d.ts +1697 -0
- package/dist/component/schema.d.ts.map +1 -0
- package/dist/component/schema.js +88 -0
- package/dist/component/schema.js.map +1 -0
- package/dist/component/search.d.ts +20 -0
- package/dist/component/search.d.ts.map +1 -0
- package/dist/component/search.js +69 -0
- package/dist/component/search.js.map +1 -0
- package/dist/package.json +3 -0
- package/dist/react/index.d.ts +2 -0
- package/dist/react/index.d.ts.map +1 -0
- package/dist/react/index.js +6 -0
- package/dist/react/index.js.map +1 -0
- package/dist/shared.d.ts +479 -0
- package/dist/shared.d.ts.map +1 -0
- package/dist/shared.js +98 -0
- package/dist/shared.js.map +1 -0
- package/package.json +97 -0
- package/src/client/_generated/_ignore.ts +1 -0
- package/src/client/defaultChunker.test.ts +243 -0
- package/src/client/defaultChunker.ts +183 -0
- package/src/client/fileUtils.ts +179 -0
- package/src/client/index.test.ts +475 -0
- package/src/client/index.ts +1125 -0
- package/src/client/setup.test.ts +28 -0
- package/src/client/types.ts +69 -0
- package/src/component/_generated/api.d.ts +439 -0
- package/src/component/_generated/api.js +23 -0
- package/src/component/_generated/dataModel.d.ts +60 -0
- package/src/component/_generated/server.d.ts +149 -0
- package/src/component/_generated/server.js +90 -0
- package/src/component/chunks.test.ts +915 -0
- package/src/component/chunks.ts +555 -0
- package/src/component/convex.config.ts +7 -0
- package/src/component/embeddings/importance.test.ts +249 -0
- package/src/component/embeddings/importance.ts +75 -0
- package/src/component/embeddings/index.test.ts +482 -0
- package/src/component/embeddings/index.ts +99 -0
- package/src/component/embeddings/tables.ts +114 -0
- package/src/component/entries.test.ts +341 -0
- package/src/component/entries.ts +546 -0
- package/src/component/filters.ts +119 -0
- package/src/component/namespaces.ts +299 -0
- package/src/component/schema.ts +106 -0
- package/src/component/search.test.ts +445 -0
- package/src/component/search.ts +97 -0
- package/src/component/setup.test.ts +5 -0
- package/src/react/index.ts +7 -0
- package/src/shared.ts +247 -0
- package/src/vitest.config.ts +7 -0
package/package.json
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@convex-dev/rag",
|
|
3
|
+
"description": "A rag component for Convex.",
|
|
4
|
+
"repository": "github:get-convex/rag",
|
|
5
|
+
"homepage": "https://github.com/get-convex/rag#readme",
|
|
6
|
+
"bugs": {
|
|
7
|
+
"email": "support@convex.dev",
|
|
8
|
+
"url": "https://github.com/get-convex/rag/issues"
|
|
9
|
+
},
|
|
10
|
+
"version": "0.1.7",
|
|
11
|
+
"license": "Apache-2.0",
|
|
12
|
+
"keywords": [
|
|
13
|
+
"convex",
|
|
14
|
+
"component",
|
|
15
|
+
"document",
|
|
16
|
+
"embeddings",
|
|
17
|
+
"rag",
|
|
18
|
+
"search",
|
|
19
|
+
"semantic",
|
|
20
|
+
"vector"
|
|
21
|
+
],
|
|
22
|
+
"type": "module",
|
|
23
|
+
"scripts": {
|
|
24
|
+
"example": "cd example && npm run dev",
|
|
25
|
+
"dev": "run-p -r 'example' 'build:watch'",
|
|
26
|
+
"dashboard": "cd example && npx convex dashboard",
|
|
27
|
+
"all": "run-p -r 'example' 'build:watch' 'test:watch'",
|
|
28
|
+
"setup": "npm i && npm run build && cd example && npm i && npx convex dev --once && printf 'VITE_CONVEX_SITE_URL=' >> .env.local && npx convex env get CONVEX_SITE_URL >> .env.local",
|
|
29
|
+
"build:watch": "cd src && npx chokidar -d 1000 '../tsconfig.json' '**/*.ts' -c 'npm run build' --initial",
|
|
30
|
+
"build": "tsc --project ./tsconfig.build.json && npm run copy:dts && echo '{\\n \"type\": \"module\"\\n}' > dist/package.json",
|
|
31
|
+
"copy:dts": "rsync -a --include='*/' --include='*.d.ts' --exclude='*' src/ dist/ || cpy 'src/**/*.d.ts' 'dist/' --parents",
|
|
32
|
+
"typecheck": "tsc --noEmit",
|
|
33
|
+
"clean": "rm -rf dist tsconfig.build.tsbuildinfo",
|
|
34
|
+
"alpha": "npm run clean && npm run build && run-p test lint typecheck && npm version prerelease --preid alpha && npm publish --tag alpha && git push --tags",
|
|
35
|
+
"release": "npm run clean && npm run build && run-p test lint typecheck && npm version patch && npm publish && git push --tags",
|
|
36
|
+
"test": "vitest run --typecheck --config ./src/vitest.config.ts",
|
|
37
|
+
"test:watch": "vitest --typecheck --config ./src/vitest.config.ts",
|
|
38
|
+
"test:debug": "vitest --inspect-brk --no-file-parallelism --config ./src/vitest.config.ts",
|
|
39
|
+
"test:coverage": "vitest run --coverage --coverage.reporter=text",
|
|
40
|
+
"lint": "eslint src",
|
|
41
|
+
"version": "pbcopy <<<$npm_package_version; vim CHANGELOG.md && git add CHANGELOG.md"
|
|
42
|
+
},
|
|
43
|
+
"files": [
|
|
44
|
+
"dist",
|
|
45
|
+
"src"
|
|
46
|
+
],
|
|
47
|
+
"exports": {
|
|
48
|
+
"./package.json": "./package.json",
|
|
49
|
+
".": {
|
|
50
|
+
"@convex-dev/component-source": "./src/client/index.ts",
|
|
51
|
+
"types": "./dist/client/index.d.ts",
|
|
52
|
+
"default": "./dist/client/index.js"
|
|
53
|
+
},
|
|
54
|
+
"./react": {
|
|
55
|
+
"@convex-dev/component-source": "./src/react/index.ts",
|
|
56
|
+
"types": "./dist/react/index.d.ts",
|
|
57
|
+
"default": "./dist/react/index.js"
|
|
58
|
+
},
|
|
59
|
+
"./convex.config": {
|
|
60
|
+
"@convex-dev/component-source": "./src/component/convex.config.ts",
|
|
61
|
+
"types": "./dist/component/convex.config.d.ts",
|
|
62
|
+
"default": "./dist/component/convex.config.js"
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
"peerDependencies": {
|
|
66
|
+
"@ai-sdk/provider": "^1.1.3",
|
|
67
|
+
"@convex-dev/workpool": "^0.2.14",
|
|
68
|
+
"ai": "^4.3.16",
|
|
69
|
+
"convex": "^1.24.8",
|
|
70
|
+
"convex-helpers": "^0.1.94"
|
|
71
|
+
},
|
|
72
|
+
"devDependencies": {
|
|
73
|
+
"@ai-sdk/openai": "^1.3.22",
|
|
74
|
+
"@arethetypeswrong/cli": "^0.17.4",
|
|
75
|
+
"@edge-runtime/vm": "^5.0.0",
|
|
76
|
+
"@eslint/js": "9.29.0",
|
|
77
|
+
"@types/node": "18.17.0",
|
|
78
|
+
"@typescript-eslint/eslint-plugin": "8.35.0",
|
|
79
|
+
"@typescript-eslint/parser": "8.35.0",
|
|
80
|
+
"chokidar-cli": "3.0.0",
|
|
81
|
+
"convex-helpers": "0.1.95",
|
|
82
|
+
"convex-test": "0.0.37",
|
|
83
|
+
"cpy-cli": "5.0.0",
|
|
84
|
+
"eslint": "9.29.0",
|
|
85
|
+
"eslint-plugin-react": "^7.37.5",
|
|
86
|
+
"eslint-plugin-react-hooks": "^5.2.0",
|
|
87
|
+
"globals": "^15.9.0",
|
|
88
|
+
"npm-run-all2": "7.0.2",
|
|
89
|
+
"prettier": "3.2.5",
|
|
90
|
+
"typescript": "5.5",
|
|
91
|
+
"typescript-eslint": "8.4.0",
|
|
92
|
+
"vitest": "3.2.4"
|
|
93
|
+
},
|
|
94
|
+
"main": "./dist/client/index.js",
|
|
95
|
+
"types": "./dist/client/index.d.ts",
|
|
96
|
+
"module": "./dist/client/index.js"
|
|
97
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// This is only here so convex-test can detect a _generated folder
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
import { defaultChunker } from "./defaultChunker.js";
|
|
2
|
+
import { describe, test, expect } from "vitest";
|
|
3
|
+
|
|
4
|
+
describe("defaultChunker", () => {
|
|
5
|
+
test("handles empty text", () => {
|
|
6
|
+
expect(defaultChunker("")).toEqual([]);
|
|
7
|
+
expect(defaultChunker(" ")).toEqual([" "]);
|
|
8
|
+
});
|
|
9
|
+
|
|
10
|
+
test("chunks paragraphs that fit within limits", () => {
|
|
11
|
+
const text = `This is the first paragraph with about 100 characters. It should be combined with others.
|
|
12
|
+
|
|
13
|
+
This is the second paragraph with similar length to make a good chunk together.
|
|
14
|
+
|
|
15
|
+
This is the third paragraph that will likely be in the next chunk.`;
|
|
16
|
+
|
|
17
|
+
const chunks = defaultChunker(text);
|
|
18
|
+
|
|
19
|
+
// Should combine all paragraphs since total length (238 chars) is well within limits
|
|
20
|
+
expect(chunks.length).toBe(1);
|
|
21
|
+
expect(chunks[0]).toBe(text);
|
|
22
|
+
chunks.forEach((chunk: string) => {
|
|
23
|
+
expect(chunk.length).toBeGreaterThan(0);
|
|
24
|
+
expect(chunk.length).toBeLessThanOrEqual(2000);
|
|
25
|
+
});
|
|
26
|
+
expect(chunks.join("\n")).toBe(text);
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
test("combines small paragraphs to meet minimum character limit", () => {
|
|
30
|
+
const text = `Short para 1.
|
|
31
|
+
|
|
32
|
+
Short para 2.
|
|
33
|
+
|
|
34
|
+
Short para 3.
|
|
35
|
+
|
|
36
|
+
Short para 4.`;
|
|
37
|
+
|
|
38
|
+
const chunks = defaultChunker(text, {
|
|
39
|
+
minCharsSoftLimit: 50,
|
|
40
|
+
maxCharsSoftLimit: 200,
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
// Should combine multiple short paragraphs
|
|
44
|
+
chunks.forEach((chunk: string) => {
|
|
45
|
+
expect(chunk.length).toBeGreaterThanOrEqual(50);
|
|
46
|
+
expect(chunk.length).toBeLessThanOrEqual(200);
|
|
47
|
+
});
|
|
48
|
+
expect(chunks.length).toBe(1);
|
|
49
|
+
expect(chunks[0]).toBe(
|
|
50
|
+
"Short para 1.\n\nShort para 2.\n\nShort para 3.\n\nShort para 4."
|
|
51
|
+
);
|
|
52
|
+
expect(chunks.join("\n")).toBe(text);
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
test("splits large paragraphs by lines", () => {
|
|
56
|
+
const longParagraph = Array(50)
|
|
57
|
+
.fill("This is a line that makes the paragraph very long.")
|
|
58
|
+
.join("\n");
|
|
59
|
+
|
|
60
|
+
const chunks = defaultChunker(longParagraph, {
|
|
61
|
+
minLines: 2,
|
|
62
|
+
minCharsSoftLimit: 200,
|
|
63
|
+
maxCharsSoftLimit: 500,
|
|
64
|
+
});
|
|
65
|
+
|
|
66
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
67
|
+
chunks.forEach((chunk: string) => {
|
|
68
|
+
expect(chunk.length).toBeLessThanOrEqual(500);
|
|
69
|
+
// Each chunk should have at least 2 lines (minLines)
|
|
70
|
+
expect(chunk.split("\n").length).toBeGreaterThanOrEqual(2);
|
|
71
|
+
});
|
|
72
|
+
expect(chunks.join("\n")).toBe(longParagraph);
|
|
73
|
+
});
|
|
74
|
+
|
|
75
|
+
test("respects minLines constraint when splitting", () => {
|
|
76
|
+
const text =
|
|
77
|
+
"Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8";
|
|
78
|
+
|
|
79
|
+
const chunks = defaultChunker(text, {
|
|
80
|
+
minLines: 3,
|
|
81
|
+
minCharsSoftLimit: 10,
|
|
82
|
+
maxCharsSoftLimit: 30, // Very small to force splitting
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
chunks.forEach((chunk: string) => {
|
|
86
|
+
const lineCount = chunk.split("\n").length;
|
|
87
|
+
expect(lineCount).toBeGreaterThanOrEqual(3);
|
|
88
|
+
});
|
|
89
|
+
expect(chunks.join("\n")).toBe(text);
|
|
90
|
+
});
|
|
91
|
+
|
|
92
|
+
test("handles mixed content with paragraphs and large sections", () => {
|
|
93
|
+
const text = `Small paragraph 1.
|
|
94
|
+
|
|
95
|
+
Small paragraph 2.
|
|
96
|
+
|
|
97
|
+
This is a very long paragraph that definitely exceeds the maximum character limit and should be split by lines instead of being treated as a single paragraph unit.
|
|
98
|
+
Line 2 of the long paragraph.
|
|
99
|
+
Line 3 of the long paragraph.
|
|
100
|
+
Line 4 of the long paragraph.
|
|
101
|
+
Line 5 of the long paragraph.
|
|
102
|
+
|
|
103
|
+
Another small paragraph at the end.`;
|
|
104
|
+
|
|
105
|
+
const chunks = defaultChunker(text, {
|
|
106
|
+
minLines: 1,
|
|
107
|
+
minCharsSoftLimit: 100,
|
|
108
|
+
maxCharsSoftLimit: 300,
|
|
109
|
+
});
|
|
110
|
+
|
|
111
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
112
|
+
chunks.forEach((chunk: string) => {
|
|
113
|
+
expect(chunk.length).toBeLessThanOrEqual(300);
|
|
114
|
+
expect(chunk.trim().length).toBeGreaterThan(0);
|
|
115
|
+
});
|
|
116
|
+
expect(chunks.join("\n")).toBe(text);
|
|
117
|
+
});
|
|
118
|
+
|
|
119
|
+
test("uses custom delimiter", () => {
|
|
120
|
+
const text = "Section 1\n---\nSection 2\n---\nSection 3";
|
|
121
|
+
|
|
122
|
+
const chunks = defaultChunker(text, {
|
|
123
|
+
delimiter: "\n---\n",
|
|
124
|
+
minCharsSoftLimit: 5,
|
|
125
|
+
maxCharsSoftLimit: 50,
|
|
126
|
+
});
|
|
127
|
+
|
|
128
|
+
expect(chunks.length).toBeGreaterThan(0);
|
|
129
|
+
// Should be able to reconstruct original text with join("\n")
|
|
130
|
+
expect(chunks.join("\n")).toBe(text);
|
|
131
|
+
});
|
|
132
|
+
|
|
133
|
+
test("handles single line that exceeds max limit", () => {
|
|
134
|
+
const veryLongLine = "A".repeat(3000);
|
|
135
|
+
|
|
136
|
+
const chunks = defaultChunker(veryLongLine, {
|
|
137
|
+
minLines: 1,
|
|
138
|
+
minCharsSoftLimit: 200,
|
|
139
|
+
maxCharsSoftLimit: 1000,
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
// Should split even a single line if it's too long
|
|
143
|
+
expect(chunks.length).toBe(1);
|
|
144
|
+
expect(chunks.join("\n")).toBe(veryLongLine);
|
|
145
|
+
});
|
|
146
|
+
|
|
147
|
+
test("splits single line exceeding hard limit with custom hard limit", () => {
|
|
148
|
+
const longLine = "A".repeat(15000);
|
|
149
|
+
|
|
150
|
+
const chunks = defaultChunker(longLine, {
|
|
151
|
+
minLines: 1,
|
|
152
|
+
minCharsSoftLimit: 200,
|
|
153
|
+
maxCharsSoftLimit: 1000,
|
|
154
|
+
maxCharsHardLimit: 5000,
|
|
155
|
+
});
|
|
156
|
+
|
|
157
|
+
// Should be split into multiple chunks
|
|
158
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
159
|
+
|
|
160
|
+
// Each chunk should not exceed the hard limit
|
|
161
|
+
chunks.forEach((chunk: string) => {
|
|
162
|
+
expect(chunk.length).toBeLessThanOrEqual(5000);
|
|
163
|
+
});
|
|
164
|
+
|
|
165
|
+
// Content should be preserved when joined back together
|
|
166
|
+
expect(chunks.join("")).toBe(longLine);
|
|
167
|
+
});
|
|
168
|
+
|
|
169
|
+
test("splits extremely long single line with default hard limit", () => {
|
|
170
|
+
// Create a line that exceeds the default hard limit of 10000
|
|
171
|
+
const extremelyLongLine = "B".repeat(25000);
|
|
172
|
+
|
|
173
|
+
const chunks = defaultChunker(extremelyLongLine, {
|
|
174
|
+
minLines: 1,
|
|
175
|
+
minCharsSoftLimit: 200,
|
|
176
|
+
maxCharsSoftLimit: 1000,
|
|
177
|
+
// Using default maxCharsHardLimit of 10000
|
|
178
|
+
});
|
|
179
|
+
|
|
180
|
+
// Should be split into multiple chunks
|
|
181
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
182
|
+
|
|
183
|
+
// Each chunk should not exceed the default hard limit
|
|
184
|
+
chunks.forEach((chunk: string) => {
|
|
185
|
+
expect(chunk.length).toBeLessThanOrEqual(10000);
|
|
186
|
+
});
|
|
187
|
+
|
|
188
|
+
// Content should be preserved when joined back together
|
|
189
|
+
expect(chunks.join("")).toBe(extremelyLongLine);
|
|
190
|
+
|
|
191
|
+
// Should have at least 3 chunks for 25000 characters with 10000 limit
|
|
192
|
+
expect(chunks.length).toBeGreaterThanOrEqual(3);
|
|
193
|
+
});
|
|
194
|
+
|
|
195
|
+
test("verifies hard limit splitting with different character patterns", () => {
|
|
196
|
+
const longLine = "A".repeat(15000);
|
|
197
|
+
|
|
198
|
+
const chunks = defaultChunker(longLine, {
|
|
199
|
+
minLines: 1,
|
|
200
|
+
minCharsSoftLimit: 200,
|
|
201
|
+
maxCharsSoftLimit: 1000,
|
|
202
|
+
maxCharsHardLimit: 5000,
|
|
203
|
+
});
|
|
204
|
+
|
|
205
|
+
// Should be split into multiple chunks
|
|
206
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
207
|
+
|
|
208
|
+
// Each chunk should not exceed the hard limit
|
|
209
|
+
chunks.forEach((chunk: string) => {
|
|
210
|
+
expect(chunk.length).toBeLessThanOrEqual(5000);
|
|
211
|
+
});
|
|
212
|
+
|
|
213
|
+
// Content should be preserved when joined back together
|
|
214
|
+
expect(chunks.join("")).toBe(longLine);
|
|
215
|
+
});
|
|
216
|
+
|
|
217
|
+
test("preserves content without losing text", () => {
|
|
218
|
+
const originalText = `Paragraph 1 with some content.
|
|
219
|
+
|
|
220
|
+
Paragraph 2 with different content.
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
Paragraph 3 with more content.`;
|
|
224
|
+
|
|
225
|
+
const chunks = defaultChunker(originalText);
|
|
226
|
+
const reconstructed = chunks.join("\n");
|
|
227
|
+
|
|
228
|
+
// Should be able to reconstruct original text with join("\n")
|
|
229
|
+
expect(reconstructed).toBe(originalText);
|
|
230
|
+
|
|
231
|
+
// All original words should be preserved
|
|
232
|
+
const originalWords = originalText.split(/\s+/).filter((w) => w.length > 0);
|
|
233
|
+
const reconstructedWords = reconstructed
|
|
234
|
+
.split(/\s+/)
|
|
235
|
+
.filter((w) => w.length > 0);
|
|
236
|
+
|
|
237
|
+
expect(reconstructedWords.length).toBe(originalWords.length);
|
|
238
|
+
originalWords.forEach((word) => {
|
|
239
|
+
expect(reconstructed).toContain(word);
|
|
240
|
+
});
|
|
241
|
+
expect(chunks.join("\n")).toBe(originalText);
|
|
242
|
+
});
|
|
243
|
+
});
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
/**
 * Chunk text for embedding.
 *
 * By default, it will chunk into paragraphs and target
 * 200-2000 characters per chunk (only less than 1 line if the hard limit is reached).
 *
 * Per the accompanying tests, joining the returned chunks with "\n"
 * reconstructs the original text, except for pieces produced by
 * hard-splitting a single oversized line, which rejoin with "".
 *
 * @param text Text to chunk; an empty string yields [].
 * @param minLines Minimum number of lines a chunk must contain before it may
 *   be closed at a section (delimiter) boundary.
 * @param minCharsSoftLimit Preferred minimum characters per chunk (soft).
 * @param maxCharsSoftLimit Preferred maximum characters per chunk (soft); a
 *   single line between the soft and hard limits is emitted whole.
 * @param maxCharsHardLimit Absolute per-chunk maximum; longer lines are
 *   force-split by bisection (see maybeSplitLine).
 * @param delimiter Section separator; the default "\n\n" treats blank lines
 *   as paragraph breaks.
 * @returns The ordered list of chunks.
 */
export function defaultChunker(
  text: string,
  {
    minLines = 1,
    minCharsSoftLimit = 200,
    maxCharsSoftLimit = 2000,
    maxCharsHardLimit = 10000,
    delimiter = "\n\n",
  }: {
    minLines?: number;
    minCharsSoftLimit?: number;
    maxCharsSoftLimit?: number;
    maxCharsHardLimit?: number;
    delimiter?: string;
  } = {}
): string[] {
  if (!text) return [];

  // Split text into individual lines
  const lines = text.split("\n");
  const chunks: string[] = [];

  // Lines accumulated for the chunk currently being built.
  let currentChunk: string[] = [];

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];

    // Check if this line starts a new section (based on delimiter pattern)
    const isNewSection = shouldStartNewSection(lines, i, delimiter);

    // Calculate potential chunk if we add this line
    const potentialChunk = [...currentChunk, line].join("\n");

    // If adding this line would exceed max chars, finalize current chunk first
    if (potentialChunk.length > maxCharsSoftLimit && currentChunk.length > 0) {
      const trimmedChunk = removeTrailingEmptyLines(currentChunk);
      chunks.push(trimmedChunk.join("\n"));

      // Split the line if it exceeds hard limit
      const splitLines = maybeSplitLine(line, maxCharsHardLimit);
      // Add all but the last split piece as separate chunks
      for (let j = 0; j < splitLines.length - 1; j++) {
        chunks.push(splitLines[j]);
      }
      // Keep the last piece for potential combination with next lines
      currentChunk = [splitLines[splitLines.length - 1]];
      continue;
    }

    // If we're starting a new section and current chunk meets minimum
    // requirements: at least minLines lines, and roughly the soft character
    // minimum — capped at 150, presumably so large minimums still permit
    // paragraph breaks (TODO confirm intent of the 0.8/150 constants).
    if (
      isNewSection &&
      currentChunk.length >= minLines &&
      currentChunk.join("\n").length >= Math.min(minCharsSoftLimit * 0.8, 150)
    ) {
      // Simple logic: only split if potential chunk would exceed the soft max limit
      if (potentialChunk.length > maxCharsSoftLimit) {
        // When splitting at delimiter boundary, preserve natural empty lines (don't remove trailing empty lines)
        chunks.push(currentChunk.join("\n"));
        currentChunk = [line];
        continue;
      }
    }

    // Add line to current chunk
    currentChunk.push(line);

    // If current chunk is too big, split it
    if (currentChunk.join("\n").length > maxCharsSoftLimit) {
      if (currentChunk.length === 1) {
        // Single line too long - split it if it exceeds hard limit
        const splitLines = maybeSplitLine(line, maxCharsHardLimit);
        if (splitLines.length > 1) {
          // Line was split - add all but the last piece as separate chunks
          for (let j = 0; j < splitLines.length - 1; j++) {
            chunks.push(splitLines[j]);
          }
          // Keep the last piece for potential combination with next lines
          currentChunk = [splitLines[splitLines.length - 1]];
        } else {
          // Line doesn't exceed hard limit, keep it as is
          chunks.push(line);
          currentChunk = [];
        }
      } else {
        // Remove last line and finalize chunk
        const lastLine = currentChunk.pop()!;
        const trimmedChunk = removeTrailingEmptyLines(currentChunk);
        chunks.push(trimmedChunk.join("\n"));
        currentChunk = [lastLine];
      }
    }
  }

  // Add remaining chunk, splitting if it exceeds hard limit
  if (currentChunk.length > 0) {
    const remainingText = currentChunk.join("\n");
    if (remainingText.length > maxCharsHardLimit) {
      // Split the remaining chunk if it exceeds hard limit
      const splitLines = maybeSplitLine(remainingText, maxCharsHardLimit);
      chunks.push(...splitLines);
    } else {
      const trimmedChunk = removeTrailingEmptyLines(currentChunk);
      chunks.push(trimmedChunk.join("\n"));
    }
  }

  return chunks;
}
|
|
116
|
+
|
|
117
|
+
function maybeSplitLine(line: string, maxCharsHardLimit: number): string[] {
|
|
118
|
+
const inputs = [line]; // in reverse order
|
|
119
|
+
const lines: string[] = [];
|
|
120
|
+
while (inputs.length > 0) {
|
|
121
|
+
const input = inputs.pop()!;
|
|
122
|
+
if (input.length <= maxCharsHardLimit) {
|
|
123
|
+
lines.push(input);
|
|
124
|
+
continue;
|
|
125
|
+
}
|
|
126
|
+
// split it in half
|
|
127
|
+
const splitIndex = Math.floor(input.length / 2);
|
|
128
|
+
const candidate = input.slice(0, splitIndex);
|
|
129
|
+
const rest = input.slice(splitIndex);
|
|
130
|
+
if (candidate.length < maxCharsHardLimit) {
|
|
131
|
+
lines.push(candidate, rest);
|
|
132
|
+
} else {
|
|
133
|
+
inputs.push(rest, candidate);
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
return lines;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
function shouldStartNewSection(
|
|
140
|
+
lines: string[],
|
|
141
|
+
index: number,
|
|
142
|
+
delimiter: string
|
|
143
|
+
): boolean {
|
|
144
|
+
if (index === 0) return false;
|
|
145
|
+
|
|
146
|
+
// For default "\n\n" delimiter, check for blank lines
|
|
147
|
+
if (delimiter === "\n\n") {
|
|
148
|
+
return lines[index - 1] === "";
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
// For custom delimiters, check if previous lines match the delimiter pattern
|
|
152
|
+
const delimiterLines = delimiter.split("\n");
|
|
153
|
+
if (delimiterLines.length <= 1) return false;
|
|
154
|
+
|
|
155
|
+
// Check if the delimiter pattern appears before this line
|
|
156
|
+
for (let i = 0; i < delimiterLines.length - 1; i++) {
|
|
157
|
+
const checkIndex = index - delimiterLines.length + 1 + i;
|
|
158
|
+
if (checkIndex < 0 || lines[checkIndex] !== delimiterLines[i]) {
|
|
159
|
+
return false;
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
return true;
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
function removeTrailingEmptyLines(lines: string[]): string[] {
|
|
167
|
+
// Don't remove anything if there's only one line
|
|
168
|
+
if (lines.length <= 1) {
|
|
169
|
+
return lines;
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
// Find the last non-empty line
|
|
173
|
+
for (let i = lines.length - 1; i >= 0; i--) {
|
|
174
|
+
if (lines[i].trim() !== "") {
|
|
175
|
+
return lines.slice(0, i + 1);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
// If all lines are empty, keep at least one
|
|
180
|
+
return lines.length > 0 ? [lines[0]] : [];
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
export default defaultChunker;
|