@convex-dev/rag 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/LICENSE +201 -0
  2. package/README.md +371 -0
  3. package/dist/client/_generated/_ignore.d.ts +1 -0
  4. package/dist/client/_generated/_ignore.d.ts.map +1 -0
  5. package/dist/client/_generated/_ignore.js +3 -0
  6. package/dist/client/_generated/_ignore.js.map +1 -0
  7. package/dist/client/defaultChunker.d.ts +15 -0
  8. package/dist/client/defaultChunker.d.ts.map +1 -0
  9. package/dist/client/defaultChunker.js +148 -0
  10. package/dist/client/defaultChunker.js.map +1 -0
  11. package/dist/client/fileUtils.d.ts +24 -0
  12. package/dist/client/fileUtils.d.ts.map +1 -0
  13. package/dist/client/fileUtils.js +179 -0
  14. package/dist/client/fileUtils.js.map +1 -0
  15. package/dist/client/index.d.ts +442 -0
  16. package/dist/client/index.d.ts.map +1 -0
  17. package/dist/client/index.js +597 -0
  18. package/dist/client/index.js.map +1 -0
  19. package/dist/client/types.d.ts +29 -0
  20. package/dist/client/types.d.ts.map +1 -0
  21. package/dist/client/types.js +2 -0
  22. package/dist/client/types.js.map +1 -0
  23. package/dist/component/_generated/api.d.ts +439 -0
  24. package/dist/component/_generated/api.d.ts.map +1 -0
  25. package/dist/component/_generated/api.js +22 -0
  26. package/dist/component/_generated/api.js.map +1 -0
  27. package/dist/component/_generated/dataModel.d.ts +60 -0
  28. package/dist/component/_generated/server.d.ts +149 -0
  29. package/dist/component/_generated/server.d.ts.map +1 -0
  30. package/dist/component/_generated/server.js +74 -0
  31. package/dist/component/_generated/server.js.map +1 -0
  32. package/dist/component/chunks.d.ts +139 -0
  33. package/dist/component/chunks.d.ts.map +1 -0
  34. package/dist/component/chunks.js +413 -0
  35. package/dist/component/chunks.js.map +1 -0
  36. package/dist/component/convex.config.d.ts +3 -0
  37. package/dist/component/convex.config.d.ts.map +1 -0
  38. package/dist/component/convex.config.js +6 -0
  39. package/dist/component/convex.config.js.map +1 -0
  40. package/dist/component/embeddings/importance.d.ts +21 -0
  41. package/dist/component/embeddings/importance.d.ts.map +1 -0
  42. package/dist/component/embeddings/importance.js +67 -0
  43. package/dist/component/embeddings/importance.js.map +1 -0
  44. package/dist/component/embeddings/index.d.ts +23 -0
  45. package/dist/component/embeddings/index.d.ts.map +1 -0
  46. package/dist/component/embeddings/index.js +54 -0
  47. package/dist/component/embeddings/index.js.map +1 -0
  48. package/dist/component/embeddings/tables.d.ts +39 -0
  49. package/dist/component/embeddings/tables.d.ts.map +1 -0
  50. package/dist/component/embeddings/tables.js +53 -0
  51. package/dist/component/embeddings/tables.js.map +1 -0
  52. package/dist/component/entries.d.ts +167 -0
  53. package/dist/component/entries.d.ts.map +1 -0
  54. package/dist/component/entries.js +409 -0
  55. package/dist/component/entries.js.map +1 -0
  56. package/dist/component/filters.d.ts +46 -0
  57. package/dist/component/filters.d.ts.map +1 -0
  58. package/dist/component/filters.js +72 -0
  59. package/dist/component/filters.js.map +1 -0
  60. package/dist/component/namespaces.d.ts +131 -0
  61. package/dist/component/namespaces.d.ts.map +1 -0
  62. package/dist/component/namespaces.js +222 -0
  63. package/dist/component/namespaces.js.map +1 -0
  64. package/dist/component/schema.d.ts +1697 -0
  65. package/dist/component/schema.d.ts.map +1 -0
  66. package/dist/component/schema.js +88 -0
  67. package/dist/component/schema.js.map +1 -0
  68. package/dist/component/search.d.ts +20 -0
  69. package/dist/component/search.d.ts.map +1 -0
  70. package/dist/component/search.js +69 -0
  71. package/dist/component/search.js.map +1 -0
  72. package/dist/package.json +3 -0
  73. package/dist/react/index.d.ts +2 -0
  74. package/dist/react/index.d.ts.map +1 -0
  75. package/dist/react/index.js +6 -0
  76. package/dist/react/index.js.map +1 -0
  77. package/dist/shared.d.ts +479 -0
  78. package/dist/shared.d.ts.map +1 -0
  79. package/dist/shared.js +98 -0
  80. package/dist/shared.js.map +1 -0
  81. package/package.json +97 -0
  82. package/src/client/_generated/_ignore.ts +1 -0
  83. package/src/client/defaultChunker.test.ts +243 -0
  84. package/src/client/defaultChunker.ts +183 -0
  85. package/src/client/fileUtils.ts +179 -0
  86. package/src/client/index.test.ts +475 -0
  87. package/src/client/index.ts +1125 -0
  88. package/src/client/setup.test.ts +28 -0
  89. package/src/client/types.ts +69 -0
  90. package/src/component/_generated/api.d.ts +439 -0
  91. package/src/component/_generated/api.js +23 -0
  92. package/src/component/_generated/dataModel.d.ts +60 -0
  93. package/src/component/_generated/server.d.ts +149 -0
  94. package/src/component/_generated/server.js +90 -0
  95. package/src/component/chunks.test.ts +915 -0
  96. package/src/component/chunks.ts +555 -0
  97. package/src/component/convex.config.ts +7 -0
  98. package/src/component/embeddings/importance.test.ts +249 -0
  99. package/src/component/embeddings/importance.ts +75 -0
  100. package/src/component/embeddings/index.test.ts +482 -0
  101. package/src/component/embeddings/index.ts +99 -0
  102. package/src/component/embeddings/tables.ts +114 -0
  103. package/src/component/entries.test.ts +341 -0
  104. package/src/component/entries.ts +546 -0
  105. package/src/component/filters.ts +119 -0
  106. package/src/component/namespaces.ts +299 -0
  107. package/src/component/schema.ts +106 -0
  108. package/src/component/search.test.ts +445 -0
  109. package/src/component/search.ts +97 -0
  110. package/src/component/setup.test.ts +5 -0
  111. package/src/react/index.ts +7 -0
  112. package/src/shared.ts +247 -0
  113. package/src/vitest.config.ts +7 -0
package/package.json ADDED
@@ -0,0 +1,97 @@
1
+ {
2
+ "name": "@convex-dev/rag",
3
+ "description": "A rag component for Convex.",
4
+ "repository": "github:get-convex/rag",
5
+ "homepage": "https://github.com/get-convex/rag#readme",
6
+ "bugs": {
7
+ "email": "support@convex.dev",
8
+ "url": "https://github.com/get-convex/rag/issues"
9
+ },
10
+ "version": "0.1.7",
11
+ "license": "Apache-2.0",
12
+ "keywords": [
13
+ "convex",
14
+ "component",
15
+ "document",
16
+ "embeddings",
17
+ "rag",
18
+ "search",
19
+ "semantic",
20
+ "vector"
21
+ ],
22
+ "type": "module",
23
+ "scripts": {
24
+ "example": "cd example && npm run dev",
25
+ "dev": "run-p -r 'example' 'build:watch'",
26
+ "dashboard": "cd example && npx convex dashboard",
27
+ "all": "run-p -r 'example' 'build:watch' 'test:watch'",
28
+ "setup": "npm i && npm run build && cd example && npm i && npx convex dev --once && printf 'VITE_CONVEX_SITE_URL=' >> .env.local && npx convex env get CONVEX_SITE_URL >> .env.local",
29
+ "build:watch": "cd src && npx chokidar -d 1000 '../tsconfig.json' '**/*.ts' -c 'npm run build' --initial",
30
+ "build": "tsc --project ./tsconfig.build.json && npm run copy:dts && echo '{\\n \"type\": \"module\"\\n}' > dist/package.json",
31
+ "copy:dts": "rsync -a --include='*/' --include='*.d.ts' --exclude='*' src/ dist/ || cpy 'src/**/*.d.ts' 'dist/' --parents",
32
+ "typecheck": "tsc --noEmit",
33
+ "clean": "rm -rf dist tsconfig.build.tsbuildinfo",
34
+ "alpha": "npm run clean && npm run build && run-p test lint typecheck && npm version prerelease --preid alpha && npm publish --tag alpha && git push --tags",
35
+ "release": "npm run clean && npm run build && run-p test lint typecheck && npm version patch && npm publish && git push --tags",
36
+ "test": "vitest run --typecheck --config ./src/vitest.config.ts",
37
+ "test:watch": "vitest --typecheck --config ./src/vitest.config.ts",
38
+ "test:debug": "vitest --inspect-brk --no-file-parallelism --config ./src/vitest.config.ts",
39
+ "test:coverage": "vitest run --coverage --coverage.reporter=text",
40
+ "lint": "eslint src",
41
+ "version": "pbcopy <<<$npm_package_version; vim CHANGELOG.md && git add CHANGELOG.md"
42
+ },
43
+ "files": [
44
+ "dist",
45
+ "src"
46
+ ],
47
+ "exports": {
48
+ "./package.json": "./package.json",
49
+ ".": {
50
+ "@convex-dev/component-source": "./src/client/index.ts",
51
+ "types": "./dist/client/index.d.ts",
52
+ "default": "./dist/client/index.js"
53
+ },
54
+ "./react": {
55
+ "@convex-dev/component-source": "./src/react/index.ts",
56
+ "types": "./dist/react/index.d.ts",
57
+ "default": "./dist/react/index.js"
58
+ },
59
+ "./convex.config": {
60
+ "@convex-dev/component-source": "./src/component/convex.config.ts",
61
+ "types": "./dist/component/convex.config.d.ts",
62
+ "default": "./dist/component/convex.config.js"
63
+ }
64
+ },
65
+ "peerDependencies": {
66
+ "@ai-sdk/provider": "^1.1.3",
67
+ "@convex-dev/workpool": "^0.2.14",
68
+ "ai": "^4.3.16",
69
+ "convex": "^1.24.8",
70
+ "convex-helpers": "^0.1.94"
71
+ },
72
+ "devDependencies": {
73
+ "@ai-sdk/openai": "^1.3.22",
74
+ "@arethetypeswrong/cli": "^0.17.4",
75
+ "@edge-runtime/vm": "^5.0.0",
76
+ "@eslint/js": "9.29.0",
77
+ "@types/node": "18.17.0",
78
+ "@typescript-eslint/eslint-plugin": "8.35.0",
79
+ "@typescript-eslint/parser": "8.35.0",
80
+ "chokidar-cli": "3.0.0",
81
+ "convex-helpers": "0.1.95",
82
+ "convex-test": "0.0.37",
83
+ "cpy-cli": "5.0.0",
84
+ "eslint": "9.29.0",
85
+ "eslint-plugin-react": "^7.37.5",
86
+ "eslint-plugin-react-hooks": "^5.2.0",
87
+ "globals": "^15.9.0",
88
+ "npm-run-all2": "7.0.2",
89
+ "prettier": "3.2.5",
90
+ "typescript": "5.5",
91
+ "typescript-eslint": "8.4.0",
92
+ "vitest": "3.2.4"
93
+ },
94
+ "main": "./dist/client/index.js",
95
+ "types": "./dist/client/index.d.ts",
96
+ "module": "./dist/client/index.js"
97
+ }
@@ -0,0 +1 @@
1
+ // This is only here so convex-test can detect a _generated folder
@@ -0,0 +1,243 @@
1
+ import { defaultChunker } from "./defaultChunker.js";
2
+ import { describe, test, expect } from "vitest";
3
+
4
+ describe("defaultChunker", () => {
5
+ test("handles empty text", () => {
6
+ expect(defaultChunker("")).toEqual([]);
7
+ expect(defaultChunker(" ")).toEqual([" "]);
8
+ });
9
+
10
+ test("chunks paragraphs that fit within limits", () => {
11
+ const text = `This is the first paragraph with about 100 characters. It should be combined with others.
12
+
13
+ This is the second paragraph with similar length to make a good chunk together.
14
+
15
+ This is the third paragraph that will likely be in the next chunk.`;
16
+
17
+ const chunks = defaultChunker(text);
18
+
19
+ // Should combine all paragraphs since total length (238 chars) is well within limits
20
+ expect(chunks.length).toBe(1);
21
+ expect(chunks[0]).toBe(text);
22
+ chunks.forEach((chunk: string) => {
23
+ expect(chunk.length).toBeGreaterThan(0);
24
+ expect(chunk.length).toBeLessThanOrEqual(2000);
25
+ });
26
+ expect(chunks.join("\n")).toBe(text);
27
+ });
28
+
29
+ test("combines small paragraphs to meet minimum character limit", () => {
30
+ const text = `Short para 1.
31
+
32
+ Short para 2.
33
+
34
+ Short para 3.
35
+
36
+ Short para 4.`;
37
+
38
+ const chunks = defaultChunker(text, {
39
+ minCharsSoftLimit: 50,
40
+ maxCharsSoftLimit: 200,
41
+ });
42
+
43
+ // Should combine multiple short paragraphs
44
+ chunks.forEach((chunk: string) => {
45
+ expect(chunk.length).toBeGreaterThanOrEqual(50);
46
+ expect(chunk.length).toBeLessThanOrEqual(200);
47
+ });
48
+ expect(chunks.length).toBe(1);
49
+ expect(chunks[0]).toBe(
50
+ "Short para 1.\n\nShort para 2.\n\nShort para 3.\n\nShort para 4."
51
+ );
52
+ expect(chunks.join("\n")).toBe(text);
53
+ });
54
+
55
+ test("splits large paragraphs by lines", () => {
56
+ const longParagraph = Array(50)
57
+ .fill("This is a line that makes the paragraph very long.")
58
+ .join("\n");
59
+
60
+ const chunks = defaultChunker(longParagraph, {
61
+ minLines: 2,
62
+ minCharsSoftLimit: 200,
63
+ maxCharsSoftLimit: 500,
64
+ });
65
+
66
+ expect(chunks.length).toBeGreaterThan(1);
67
+ chunks.forEach((chunk: string) => {
68
+ expect(chunk.length).toBeLessThanOrEqual(500);
69
+ // Each chunk should have at least 2 lines (minLines)
70
+ expect(chunk.split("\n").length).toBeGreaterThanOrEqual(2);
71
+ });
72
+ expect(chunks.join("\n")).toBe(longParagraph);
73
+ });
74
+
75
+ test("respects minLines constraint when splitting", () => {
76
+ const text =
77
+ "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8";
78
+
79
+ const chunks = defaultChunker(text, {
80
+ minLines: 3,
81
+ minCharsSoftLimit: 10,
82
+ maxCharsSoftLimit: 30, // Very small to force splitting
83
+ });
84
+
85
+ chunks.forEach((chunk: string) => {
86
+ const lineCount = chunk.split("\n").length;
87
+ expect(lineCount).toBeGreaterThanOrEqual(3);
88
+ });
89
+ expect(chunks.join("\n")).toBe(text);
90
+ });
91
+
92
+ test("handles mixed content with paragraphs and large sections", () => {
93
+ const text = `Small paragraph 1.
94
+
95
+ Small paragraph 2.
96
+
97
+ This is a very long paragraph that definitely exceeds the maximum character limit and should be split by lines instead of being treated as a single paragraph unit.
98
+ Line 2 of the long paragraph.
99
+ Line 3 of the long paragraph.
100
+ Line 4 of the long paragraph.
101
+ Line 5 of the long paragraph.
102
+
103
+ Another small paragraph at the end.`;
104
+
105
+ const chunks = defaultChunker(text, {
106
+ minLines: 1,
107
+ minCharsSoftLimit: 100,
108
+ maxCharsSoftLimit: 300,
109
+ });
110
+
111
+ expect(chunks.length).toBeGreaterThan(1);
112
+ chunks.forEach((chunk: string) => {
113
+ expect(chunk.length).toBeLessThanOrEqual(300);
114
+ expect(chunk.trim().length).toBeGreaterThan(0);
115
+ });
116
+ expect(chunks.join("\n")).toBe(text);
117
+ });
118
+
119
+ test("uses custom delimiter", () => {
120
+ const text = "Section 1\n---\nSection 2\n---\nSection 3";
121
+
122
+ const chunks = defaultChunker(text, {
123
+ delimiter: "\n---\n",
124
+ minCharsSoftLimit: 5,
125
+ maxCharsSoftLimit: 50,
126
+ });
127
+
128
+ expect(chunks.length).toBeGreaterThan(0);
129
+ // Should be able to reconstruct original text with join("\n")
130
+ expect(chunks.join("\n")).toBe(text);
131
+ });
132
+
133
+ test("handles single line that exceeds max limit", () => {
134
+ const veryLongLine = "A".repeat(3000);
135
+
136
+ const chunks = defaultChunker(veryLongLine, {
137
+ minLines: 1,
138
+ minCharsSoftLimit: 200,
139
+ maxCharsSoftLimit: 1000,
140
+ });
141
+
142
+ // Should split even a single line if it's too long
143
+ expect(chunks.length).toBe(1);
144
+ expect(chunks.join("\n")).toBe(veryLongLine);
145
+ });
146
+
147
+ test("splits single line exceeding hard limit with custom hard limit", () => {
148
+ const longLine = "A".repeat(15000);
149
+
150
+ const chunks = defaultChunker(longLine, {
151
+ minLines: 1,
152
+ minCharsSoftLimit: 200,
153
+ maxCharsSoftLimit: 1000,
154
+ maxCharsHardLimit: 5000,
155
+ });
156
+
157
+ // Should be split into multiple chunks
158
+ expect(chunks.length).toBeGreaterThan(1);
159
+
160
+ // Each chunk should not exceed the hard limit
161
+ chunks.forEach((chunk: string) => {
162
+ expect(chunk.length).toBeLessThanOrEqual(5000);
163
+ });
164
+
165
+ // Content should be preserved when joined back together
166
+ expect(chunks.join("")).toBe(longLine);
167
+ });
168
+
169
+ test("splits extremely long single line with default hard limit", () => {
170
+ // Create a line that exceeds the default hard limit of 10000
171
+ const extremelyLongLine = "B".repeat(25000);
172
+
173
+ const chunks = defaultChunker(extremelyLongLine, {
174
+ minLines: 1,
175
+ minCharsSoftLimit: 200,
176
+ maxCharsSoftLimit: 1000,
177
+ // Using default maxCharsHardLimit of 10000
178
+ });
179
+
180
+ // Should be split into multiple chunks
181
+ expect(chunks.length).toBeGreaterThan(1);
182
+
183
+ // Each chunk should not exceed the default hard limit
184
+ chunks.forEach((chunk: string) => {
185
+ expect(chunk.length).toBeLessThanOrEqual(10000);
186
+ });
187
+
188
+ // Content should be preserved when joined back together
189
+ expect(chunks.join("")).toBe(extremelyLongLine);
190
+
191
+ // Should have at least 3 chunks for 25000 characters with 10000 limit
192
+ expect(chunks.length).toBeGreaterThanOrEqual(3);
193
+ });
194
+
195
+ test("verifies hard limit splitting with different character patterns", () => {
196
+ const longLine = "A".repeat(15000);
197
+
198
+ const chunks = defaultChunker(longLine, {
199
+ minLines: 1,
200
+ minCharsSoftLimit: 200,
201
+ maxCharsSoftLimit: 1000,
202
+ maxCharsHardLimit: 5000,
203
+ });
204
+
205
+ // Should be split into multiple chunks
206
+ expect(chunks.length).toBeGreaterThan(1);
207
+
208
+ // Each chunk should not exceed the hard limit
209
+ chunks.forEach((chunk: string) => {
210
+ expect(chunk.length).toBeLessThanOrEqual(5000);
211
+ });
212
+
213
+ // Content should be preserved when joined back together
214
+ expect(chunks.join("")).toBe(longLine);
215
+ });
216
+
217
+ test("preserves content without losing text", () => {
218
+ const originalText = `Paragraph 1 with some content.
219
+
220
+ Paragraph 2 with different content.
221
+
222
+
223
+ Paragraph 3 with more content.`;
224
+
225
+ const chunks = defaultChunker(originalText);
226
+ const reconstructed = chunks.join("\n");
227
+
228
+ // Should be able to reconstruct original text with join("\n")
229
+ expect(reconstructed).toBe(originalText);
230
+
231
+ // All original words should be preserved
232
+ const originalWords = originalText.split(/\s+/).filter((w) => w.length > 0);
233
+ const reconstructedWords = reconstructed
234
+ .split(/\s+/)
235
+ .filter((w) => w.length > 0);
236
+
237
+ expect(reconstructedWords.length).toBe(originalWords.length);
238
+ originalWords.forEach((word) => {
239
+ expect(reconstructed).toContain(word);
240
+ });
241
+ expect(chunks.join("\n")).toBe(originalText);
242
+ });
243
+ });
@@ -0,0 +1,183 @@
1
+ /**
2
+ * Chunk text for embedding.
3
+ *
4
+ * The text is split on "\n" and the lines are packed greedily: a chunk is closed as soon as
5
+ * adding the next line would make it longer than `maxCharsSoftLimit` (2000 by default).
+ * Trailing blank lines are dropped from a chunk when it is closed this way, and from the final chunk.
+ * A single line longer than the soft limit is kept whole unless it is also longer than
+ * `maxCharsHardLimit`, in which case it is cut into pieces by `maybeSplitLine`.
+ *
+ * @param text The text to chunk. An empty string yields an empty array.
+ * @param options.minLines Only read by the section-boundary check inside the loop (default 1).
+ * @param options.minCharsSoftLimit Only read by the section-boundary check inside the loop (default 200).
+ * @param options.maxCharsSoftLimit Chunk length above which the current chunk is closed (default 2000).
+ * @param options.maxCharsHardLimit Length above which a single line / the final remainder is split (default 10000).
+ * @param options.delimiter Section delimiter passed to `shouldStartNewSection` (default "\n\n").
+ * @returns The chunks, in document order.
6
+ */
7
+ export function defaultChunker(
8
+ text: string,
9
+ {
10
+ minLines = 1,
11
+ minCharsSoftLimit = 200,
12
+ maxCharsSoftLimit = 2000,
13
+ maxCharsHardLimit = 10000,
14
+ delimiter = "\n\n",
15
+ }: {
16
+ minLines?: number;
17
+ minCharsSoftLimit?: number;
18
+ maxCharsSoftLimit?: number;
19
+ maxCharsHardLimit?: number;
20
+ delimiter?: string;
21
+ } = {}
22
+ ): string[] {
23
+ if (!text) return [];
24
+
25
+ // Split text into individual lines
26
+ const lines = text.split("\n");
27
+ const chunks: string[] = [];
28
+
29
+ let currentChunk: string[] = [];
30
+
31
+ for (let i = 0; i < lines.length; i++) {
32
+ const line = lines[i];
33
+
34
+ // Check if this line starts a new section (based on delimiter pattern)
35
+ const isNewSection = shouldStartNewSection(lines, i, delimiter);
36
+
37
+ // Calculate potential chunk if we add this line
38
+ const potentialChunk = [...currentChunk, line].join("\n");
39
+
40
+ // If adding this line would exceed max chars, finalize current chunk first (without its trailing blank lines)
41
+ if (potentialChunk.length > maxCharsSoftLimit && currentChunk.length > 0) {
42
+ const trimmedChunk = removeTrailingEmptyLines(currentChunk);
43
+ chunks.push(trimmedChunk.join("\n"));
44
+
45
+ // Split the line if it exceeds hard limit
46
+ const splitLines = maybeSplitLine(line, maxCharsHardLimit);
47
+ // Add all but the last split piece as separate chunks
48
+ for (let j = 0; j < splitLines.length - 1; j++) {
49
+ chunks.push(splitLines[j]);
50
+ }
51
+ // Keep the last piece for potential combination with next lines
52
+ currentChunk = [splitLines[splitLines.length - 1]];
53
+ continue;
54
+ }
55
+
56
+ // If we're starting a new section and current chunk meets minimum requirements. NOTE(review): the check above already `continue`s whenever potentialChunk exceeds maxCharsSoftLimit with a non-empty currentChunk, so the inner split below looks reachable only when currentChunk is empty (i.e. minLines <= 0) - confirm whether minLines / minCharsSoftLimit / delimiter are meant to have more effect.
57
+ if (
58
+ isNewSection &&
59
+ currentChunk.length >= minLines &&
60
+ currentChunk.join("\n").length >= Math.min(minCharsSoftLimit * 0.8, 150)
61
+ ) {
62
+ // Simple logic: only split if potential chunk would exceed the soft max limit
63
+ if (potentialChunk.length > maxCharsSoftLimit) {
64
+ // When splitting at delimiter boundary, preserve natural empty lines (don't remove trailing empty lines)
65
+ chunks.push(currentChunk.join("\n"));
66
+ currentChunk = [line];
67
+ continue;
68
+ }
69
+ }
70
+
71
+ // Add line to current chunk
72
+ currentChunk.push(line);
73
+
74
+ // If current chunk is too big, split it
75
+ if (currentChunk.join("\n").length > maxCharsSoftLimit) {
76
+ if (currentChunk.length === 1) {
77
+ // Single line too long - split it if it exceeds hard limit
78
+ const splitLines = maybeSplitLine(line, maxCharsHardLimit);
79
+ if (splitLines.length > 1) {
80
+ // Line was split - add all but the last piece as separate chunks
81
+ for (let j = 0; j < splitLines.length - 1; j++) {
82
+ chunks.push(splitLines[j]);
83
+ }
84
+ // Keep the last piece for potential combination with next lines
85
+ currentChunk = [splitLines[splitLines.length - 1]];
86
+ } else {
87
+ // Line doesn't exceed hard limit, keep it as is (emitted as a chunk of its own)
88
+ chunks.push(line);
89
+ currentChunk = [];
90
+ }
91
+ } else {
92
+ // Remove last line and finalize chunk. NOTE(review): a multi-line chunk over the soft limit appears to be caught by the first check in the loop before the line is pushed, so this branch looks unreachable - confirm.
93
+ const lastLine = currentChunk.pop()!;
94
+ const trimmedChunk = removeTrailingEmptyLines(currentChunk);
95
+ chunks.push(trimmedChunk.join("\n"));
96
+ currentChunk = [lastLine];
97
+ }
98
+ }
99
+ }
100
+
101
+ // Add remaining chunk, splitting if it exceeds hard limit
102
+ if (currentChunk.length > 0) {
103
+ const remainingText = currentChunk.join("\n");
104
+ if (remainingText.length > maxCharsHardLimit) {
105
+ // Split the remaining chunk if it exceeds hard limit (every resulting piece becomes its own chunk)
106
+ const splitLines = maybeSplitLine(remainingText, maxCharsHardLimit);
107
+ chunks.push(...splitLines);
108
+ } else {
109
+ const trimmedChunk = removeTrailingEmptyLines(currentChunk);
110
+ chunks.push(trimmedChunk.join("\n"));
111
+ }
112
+ }
113
+
114
+ return chunks;
115
+ }
116
+
117
/**
 * Split `line` into pieces of at most `maxCharsHardLimit` UTF-16 code units.
 *
 * Oversized input is halved repeatedly, left half first, so the pieces come
 * back in document order and concatenate to exactly `line`.
 *
 * @param line The text to split. Returned unchanged (as a one-element array)
 *   when it already fits.
 * @param maxCharsHardLimit Maximum length of each piece. A limit below 1 (or
 *   NaN) cannot be satisfied by any non-empty piece, so the line is returned
 *   unsplit instead of looping forever.
 * @returns The pieces in order. Each is within the limit, except a single
 *   surrogate pair when the limit is 1, which is kept intact rather than torn
 *   into two invalid halves.
 */
function maybeSplitLine(line: string, maxCharsHardLimit: number): string[] {
  // Written as a negated >= so that NaN is rejected too.
  if (!(maxCharsHardLimit >= 1)) {
    return [line];
  }

  const pending = [line]; // stack: the next piece to examine is on top
  const pieces: string[] = [];
  while (pending.length > 0) {
    const input = pending.pop()!;
    if (input.length <= maxCharsHardLimit) {
      pieces.push(input);
      continue;
    }

    // Cut at the midpoint. input.length >= 2 here, so 1 <= splitIndex < length.
    let splitIndex = Math.floor(input.length / 2);

    // Never cut between a high and a low surrogate: that would emit two
    // strings containing lone surrogates (invalid text for an embedder).
    const before = input.charCodeAt(splitIndex - 1);
    const after = input.charCodeAt(splitIndex);
    const cutsSurrogatePair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (cutsSurrogatePair) {
      // Prefer cutting just before the pair; if that would leave an empty
      // left half, cut just after it instead.
      splitIndex = splitIndex > 1 ? splitIndex - 1 : splitIndex + 1;
    }
    if (splitIndex >= input.length) {
      // The whole input is one surrogate pair and cannot be divided further.
      pieces.push(input);
      continue;
    }

    // Push the right half first so the left half is processed (and emitted) first.
    // Both halves are re-checked against the limit on the next iterations.
    pending.push(input.slice(splitIndex), input.slice(0, splitIndex));
  }
  return pieces;
}
138
+
139
/**
 * Report whether the line at `index` begins right after an occurrence of
 * `delimiter` in the original text.
 *
 * @param lines The text split on "\n".
 * @param index Index of the line being examined.
 * @param delimiter The section delimiter. Only delimiters containing at least
 *   one "\n" can mark a line boundary; any other delimiter yields `false`.
 * @returns `true` when the delimiter ends at the start of `lines[index]`.
 */
function shouldStartNewSection(
  lines: string[],
  index: number,
  delimiter: string
): boolean {
  if (index === 0) return false;

  // For default "\n\n" delimiter, a blank previous line marks a new paragraph.
  if (delimiter === "\n\n") {
    return lines[index - 1] === "";
  }

  // Splitting the delimiter on "\n" gives: the tail of the line where the
  // delimiter starts, zero or more whole lines, and the head of the line
  // where it ends (the current line).
  const delimiterLines = delimiter.split("\n");
  const lastPiece = delimiterLines.length - 1;
  if (lastPiece < 1) return false;

  const firstIndex = index - lastPiece;
  if (firstIndex < 0 || index >= lines.length) return false;

  // The first piece is only the END of its line (e.g. "" for "\n---\n"), so it
  // must be matched as a suffix; comparing it to the whole line would require
  // the line before the delimiter to be empty.
  if (!lines[firstIndex].endsWith(delimiterLines[0])) return false;

  // Pieces strictly inside the delimiter are complete lines.
  for (let i = 1; i < lastPiece; i++) {
    if (lines[firstIndex + i] !== delimiterLines[i]) return false;
  }

  // The last piece is the START of the current line ("" when the delimiter
  // ends with a newline, which always matches).
  return lines[index].startsWith(delimiterLines[lastPiece]);
}
165
+
166
/**
 * Drop blank (empty or whitespace-only) lines from the end of `lines`.
 *
 * Arrays with zero or one element are returned as-is (same reference). If
 * every line is blank, a one-element array holding the first line is returned
 * so the result is never empty for non-empty input.
 */
function removeTrailingEmptyLines(lines: string[]): string[] {
  if (lines.length <= 1) {
    return lines;
  }

  // Walk back from the end until a line with visible content is found.
  let end = lines.length;
  while (end > 0 && lines[end - 1].trim() === "") {
    end -= 1;
  }

  // end === 0 means every line was blank: keep the first one.
  return end > 0 ? lines.slice(0, end) : [lines[0]];
}
182
+
183
+ export default defaultChunker;