@convex-dev/rag 0.3.1 → 0.3.3-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/README.md +327 -98
  2. package/dist/client/defaultChunker.d.ts.map +1 -1
  3. package/dist/client/defaultChunker.js +47 -16
  4. package/dist/client/defaultChunker.js.map +1 -1
  5. package/dist/client/fileUtils.d.ts +4 -2
  6. package/dist/client/fileUtils.d.ts.map +1 -1
  7. package/dist/client/fileUtils.js +5 -3
  8. package/dist/client/fileUtils.js.map +1 -1
  9. package/dist/client/hybridRank.d.ts +23 -0
  10. package/dist/client/hybridRank.d.ts.map +1 -0
  11. package/dist/client/hybridRank.js +21 -0
  12. package/dist/client/hybridRank.js.map +1 -0
  13. package/dist/client/index.d.ts +18 -35
  14. package/dist/client/index.d.ts.map +1 -1
  15. package/dist/client/index.js +12 -27
  16. package/dist/client/index.js.map +1 -1
  17. package/dist/component/_generated/api.d.ts +1 -0
  18. package/dist/component/chunks.d.ts +1 -0
  19. package/dist/component/chunks.d.ts.map +1 -1
  20. package/dist/component/chunks.js +31 -2
  21. package/dist/component/chunks.js.map +1 -1
  22. package/dist/component/entries.d.ts +2 -2
  23. package/dist/component/entries.d.ts.map +1 -1
  24. package/dist/component/entries.js +1 -1
  25. package/dist/component/entries.js.map +1 -1
  26. package/dist/shared.d.ts +2 -2
  27. package/dist/shared.d.ts.map +1 -1
  28. package/package.json +1 -1
  29. package/src/client/defaultChunker.test.ts +1 -1
  30. package/src/client/defaultChunker.ts +73 -17
  31. package/src/client/fileUtils.ts +8 -4
  32. package/src/client/hybridRank.ts +39 -0
  33. package/src/client/index.test.ts +11 -7
  34. package/src/client/index.ts +25 -58
  35. package/src/component/_generated/api.d.ts +1 -0
  36. package/src/component/chunks.test.ts +11 -1
  37. package/src/component/chunks.ts +33 -3
  38. package/src/component/entries.ts +3 -3
  39. package/src/shared.ts +2 -2
package/README.md CHANGED
@@ -1,12 +1,14 @@
  # Convex RAG Component

- [![npm version](https://badge.fury.io/js/@convex-dev%2Fmemory.svg)](https://badge.fury.io/js/@convex-dev%2Fmemory)
+ [![npm version](https://badge.fury.io/js/@convex-dev%2Frag.svg)](https://badge.fury.io/js/@convex-dev%2Frag)

  <!-- START: Include on https://convex.dev/components -->

  A component for semantic search, usually used to look up context for LLMs.
  Use with an Agent for Retrieval-Augmented Generation (RAG).

+ [![Use AI to search HUGE amounts of text with the RAG Component](https://thumbs.video-to-markdown.com/1ff18153.jpg)](https://youtu.be/dGmtAmdAaFs)
+
  ## ✨ Key Features

  - **Add Content**: Add or replace content with text chunks and embeddings.
@@ -57,23 +59,13 @@ import { RAG } from "@convex-dev/rag";
  // Any AI SDK model that supports embeddings will work.
  import { openai } from "@ai-sdk/openai";

- const rag = new RAG<FilterTypes>(components.rag, {
-   filterNames: ["category", "contentType", "categoryAndType"],
+ const rag = new RAG(components.rag, {
    textEmbeddingModel: openai.embedding("text-embedding-3-small"),
-   embeddingDimension: 1536,
+   embeddingDimension: 1536, // Needs to match your embedding model
  });
-
- // Optional: Add type safety to your filters.
- type FilterTypes = {
-   category: string;
-   contentType: string;
-   categoryAndType: { category: string; contentType: string };
- };
  ```

- ## Usage Examples
-
- ### Add context to RAG
+ ## Add context to RAG

  Add content with text chunks. Each call to `add` will create a new **entry**.
  It will embed the chunks automatically if you don't provide them.
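
For context on the `components.rag` argument above: the component itself is registered in `convex.config.ts` before the client can be constructed. A minimal sketch, assuming the standard Convex component registration pattern (this file is not part of the diff):

```ts
// convex/convex.config.ts — assumed standard component registration
import { defineApp } from "convex/server";
import rag from "@convex-dev/rag/convex.config";

const app = defineApp();
app.use(rag); // makes components.rag available to the RAG client above
export default app;
```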
@@ -91,52 +83,9 @@ export const add = action({
  });
  ```

- See below for how to add content asynchronously, e.g. to handle large files.
-
- ### Generate a response based on RAG context
-
- You can use the `generateText` function to generate a response based on RAG context. This will automatically search for relevant entries and use them as context for the LLM, using default formatting.
-
- The arguments to `generateText` are compatible with all arguments to `generateText` from the AI SDK.
-
- To have more control over the context and prompting, you can use the `search` function to get the context, and then use any model to generate a response.
- See below for more details.
-
- ```ts
- export const askQuestion = action({
-   args: {
-     prompt: v.string(),
-   },
-   handler: async (ctx, args) => {
-     const userId = await getAuthUserId(ctx);
-     const { text, context } = await rag.generateText(ctx, {
-       search: { namespace: userId, limit: 10 },
-       prompt: args.prompt,
-       model: openai.chat("gpt-4o-mini"),
-     });
-     return { answer: text, context };
-   },
- ```
-
- Note: You can specify any of the search options available on `rag.search`.
- See below for more details.
-
- ### Using your own content splitter
-
- By default, the component uses the `defaultChunker` to split the content into chunks.
- You can pass in your own content chunks to the `add` or `addAsync` functions.
-
- ```ts
- const chunks = await textSplitter.split(content);
- await rag.add(ctx, { namespace: "global", chunks });
- ```
-
- Note: The `textSplitter` here could be LangChain, Mastra, or something custom.
- The simplest version makes an array of strings like `content.split("\n")`.
+ See below for how to chunk the text yourself or add content asynchronously, e.g. to handle large files.

- Note: you can pass in an async iterator instead of an array to handle large content.
-
- ### Semantic Search
+ ## Semantic Search

  Search across content with vector similarity

@@ -157,7 +106,7 @@ export const search = action({
      const { results, text, entries } = await rag.search(ctx, {
        namespace: "global",
        query: args.query,
-       limit: 10
+       limit: 10,
        vectorScoreThreshold: 0.5, // Only return results with a score >= 0.5
      });

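
Calling this action from a client works like any Convex action. A minimal sketch, assuming a React client and that the action above lives in `convex/example.ts`:

```ts
// Client-side usage sketch (the file layout is an assumption)
import { useAction } from "convex/react";
import { api } from "../convex/_generated/api";

export function useRagSearch() {
  const search = useAction(api.example.search);
  // Resolves to the { results, text, entries } shape shown above.
  return (query: string) => search({ query });
}
```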
@@ -166,40 +115,93 @@ export const search = action({
  });
  ```

- ### Using keys to gracefully replace content
+ ## Generate a response based on RAG context

- When you add content to a namespace, you can provide a `key` to uniquely identify the content.
- If you add content with the same key, it will replace the existing content.
+ Once you have searched for the context, you can use it with an LLM.
+
+ Generally you'll already be using something to make LLM requests, e.g.
+ the [Agent Component](https://www.convex.dev/components/agent),
+ which tracks the message history for you.
+ See the [Agent Component docs](https://docs.convex.dev/agents)
+ for more details on doing RAG with the Agent Component.
+
+ However, if you just want a one-off response, you can use the `generateText`
+ function as a convenience.
+
+ This will automatically search for relevant entries and use them as context
+ for the LLM, using default formatting.
+
+ The arguments to `generateText` are compatible with all arguments to
+ `generateText` from the AI SDK.

  ```ts
- await rag.add(ctx, { namespace: userId, key: "my-file.txt", text });
+ export const askQuestion = action({
+   args: {
+     prompt: v.string(),
+   },
+   handler: async (ctx, args) => {
+     const userId = await getAuthUserId(ctx);
+     const { text, context } = await rag.generateText(ctx, {
+       search: { namespace: userId, limit: 10 },
+       prompt: args.prompt,
+       model: openai.chat("gpt-4o-mini"),
+     });
+     return { answer: text, context };
+   },
  ```

- When a new document is added, it will start with a status of "pending" while
- it chunks, embeds, and inserts the data into the database.
- Once all data is inserted, it will iterate over the chunks and swap the old
- content embeddings with the new ones, and then update the status to "ready",
- marking the previous version as "replaced".
+ Note: You can specify any of the search options available on `rag.search`.

- The old content is kept around by default, so in-flight searches will get
- results for old vector search results.
- See below for more details on deleting.
+ ## Filtered Search

- This means that if searches are happening while the document is being added,
- they will see the old content results
- This is useful if you want to add content to a namespace and then immediately
- search for it, or if you want to add content to a namespace and then immediately
- add more content to the same namespace.
+ You can provide filters when adding content and use them to search.
+ To do this, you'll need to give the RAG component a list of the filter names.
+ You can optionally provide a type parameter for type safety (no runtime validation).

- ### Filtered Search
+ Note: these filters can be OR'd together when searching. In order to get an AND,
+ you provide a filter with a more complex value, such as `categoryAndType` below.
+
+ ```ts
+ // convex/example.ts
+ import { components } from "./_generated/api";
+ import { RAG } from "@convex-dev/rag";
+ // Any AI SDK model that supports embeddings will work.
+ import { openai } from "@ai-sdk/openai";
+
+ // Optional: Add type safety to your filters.
+ type FilterTypes = {
+   category: string;
+   contentType: string;
+   categoryAndType: { category: string; contentType: string };
+ };
+
+ const rag = new RAG<FilterTypes>(components.rag, {
+   textEmbeddingModel: openai.embedding("text-embedding-3-small"),
+   embeddingDimension: 1536, // Needs to match your embedding model
+   filterNames: ["category", "contentType", "categoryAndType"],
+ });
+ ```
+
+ Adding content with filters:
+
+ ```ts
+ await rag.add(ctx, {
+   namespace: "global",
+   text,
+   filterValues: [
+     { name: "category", value: "news" },
+     { name: "contentType", value: "article" },
+     { name: "categoryAndType", value: { category: "news", contentType: "article" } },
+   ],
+ });
+ ```

  Search with metadata filters:

  ```ts
- export const searchByCategory = action({
+ export const searchForNewsOrSports = action({
    args: {
      query: v.string(),
-     category: v.string(),
    },
    handler: async (ctx, args) => {
      const userId = await getUserId(ctx);
@@ -208,7 +210,10 @@
      const results = await rag.search(ctx, {
        namespace: userId,
        query: args.query,
-       filters: [{ name: "category", value: args.category }],
+       filters: [
+         { name: "category", value: "news" },
+         { name: "category", value: "sports" },
+       ],
        limit: 10,
      });

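
Because multiple filters are OR'd together, matching on *both* fields goes through a composite filter instead. A sketch reusing the `categoryAndType` filter defined earlier in this README:

```ts
// AND semantics: one filter whose value combines both fields.
const newsArticles = await rag.search(ctx, {
  namespace: userId,
  query: args.query,
  filters: [
    { name: "categoryAndType", value: { category: "news", contentType: "article" } },
  ],
  limit: 10,
});
```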
@@ -257,14 +262,14 @@ export const searchWithContext = action({
  });
  ```

- ### Formatting results
+ ## Formatting results

  Formatting the results for use in a prompt depends a bit on the use case.
  By default, the results will be sorted by score, not necessarily in the order
  they appear in the original text. You may want to sort them by the order they
  appear in the original text so they follow the flow of the original document.

- For convenienct, the `text` field of the search results is a string formatted
+ For convenience, the `text` field of the search results is a string formatted
  with `...` separating non-sequential chunks, `---` separating entries, and
  `# Title:` at each entry boundary (if titles are available).

@@ -274,14 +279,18 @@ console.log(text);
  ```

  ```txt
- # Title 1:
+ ## Title 1:
  Chunk 1 contents
  Chunk 2 contents
+
  ...
+
  Chunk 8 contents
  Chunk 9 contents
+
  ---
- # Title 2:
+
+ ## Title 2:
  Chunk 4 contents
  Chunk 5 contents
  ```
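
If you want chunks in document order rather than score order, you can re-sort `results` before formatting. A rough sketch, where `entryId` and `order` are assumed fields on each result (check the `SearchResult` type below for the exact shape):

```ts
// Hypothetical re-sort by position within each entry; field names are assumptions.
const inDocumentOrder = [...results].sort((a, b) =>
  a.entryId === b.entryId
    ? a.order - b.order
    : a.entryId.localeCompare(b.entryId),
);
```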
@@ -330,7 +339,49 @@ await generateText({
  });
  ```

- ### Providing custom embeddings per-chunk
+ ## Using keys to gracefully replace content
+
+ When you add content to a namespace, you can provide a `key` to uniquely identify the content.
+ If you add content with the same key, it will make a new entry to replace the old one.
+
+ ```ts
+ await rag.add(ctx, { namespace: userId, key: "my-file.txt", text });
+ ```
+
+ When a new document is added, it will start with a status of "pending" while
+ it chunks, embeds, and inserts the data into the database.
+ Once all data is inserted, it will iterate over the chunks and swap the old
+ content embeddings with the new ones, and then update the status to "ready",
+ marking the previous version as "replaced".
+
+ The old content is kept around by default, so in-flight searches will get
+ results for old vector search results.
+ See below for more details on deleting.
+
+ This means that if searches are happening while the document is being added,
+ they will see the old content results
+ This is useful if you want to add content to a namespace and then immediately
+ search for it, or if you want to add content to a namespace and then immediately
+ add more content to the same namespace.
+
+ ## Using your own content splitter
+
+ By default, the component uses the `defaultChunker` to split the content into chunks.
+ You can pass in your own content chunks to the `add` or `addAsync` functions.
+
+ ```ts
+ const chunks = await textSplitter.split(content);
+ await rag.add(ctx, { namespace: "global", chunks });
+ ```
+
+ Note: The `textSplitter` here could be LangChain, Mastra, or something custom.
+ The simplest version makes an array of strings like `content.split("\n")`.
+
+ Note: you can pass in an async iterator instead of an array to handle large content.
+ Or use the `addAsync` function (see below).
+
+
+ ## Providing custom embeddings per-chunk

  In addition to the text, you can provide your own embeddings for each chunk.

@@ -348,7 +399,7 @@ const chunksWithEmbeddings = await Promise.all(chunks.map(async chunk => {
  await rag.add(ctx, { namespace: "global", chunks });
  ```

- ### Add Entries Asynchronously using File Storage
+ ## Add Entries Asynchronously using File Storage

  For large files, you can upload them to file storage, then provide a chunker
  action to split them into chunks.
@@ -462,18 +513,196 @@ Generally you'd do this:
  1. Periodically by querying:

  ```ts
- const toDelete = await rag.list(ctx, {
-   status: "replaced",
-   paginationOpts: { cursor: null, numItems: 100 }
+ // in convex/crons.ts
+ import { cronJobs } from "convex/server";
+ import { internal } from "./_generated/api.js";
+ import { internalMutation } from "./_generated/server.js";
+ import { v } from "convex/values";
+ import { rag } from "./example.js";
+ import { assert } from "convex-helpers";
+
+ const WEEK = 7 * 24 * 60 * 60 * 1000;
+
+ export const deleteOldContent = internalMutation({
+   args: { cursor: v.optional(v.string()) },
+   handler: async (ctx, args) => {
+     const toDelete = await rag.list(ctx, {
+       status: "replaced",
+       paginationOpts: { cursor: args.cursor ?? null, numItems: 100 },
+     });
+
+     for (const entry of toDelete.page) {
+       assert(entry.status === "replaced");
+       if (entry.replacedAt >= Date.now() - WEEK) {
+         return; // we're done when we catch up to a week ago
+       }
+       await rag.delete(ctx, { entryId: entry.entryId });
+     }
+     if (!toDelete.isDone) {
+       await ctx.scheduler.runAfter(0, internal.example.deleteOldContent, {
+         cursor: toDelete.continueCursor,
+       });
+     }
+   },
  });

- for (const entry of toDelete) {
-   assert(entry.status === "replaced");
-   if (entry.replacedAt >= Date.now() - ONE_WEEK_MS) {
-     break;
-   }
-   await rag.delete(ctx, { entryId: entry.entryId });
- }
+ // See example/convex/crons.ts for a complete example.
+ const crons = cronJobs();
+ crons.interval("deleteOldContent", { hours: 1 }, internal.crons.deleteOldContent, {});
+ export default crons;
+ ```
+
+ ## Working with types
+
+ You can use the provided types to validate and store data.
+ `import { ... } from "@convex-dev/rag";`
+
+ Types for the various elements:
+
+ `Entry`, `EntryFilter`, `SearchEntry`, `SearchResult`
+
+ - `SearchEntry` is an `Entry` with a `text` field including the combined search
+   results for that entry, whereas a `SearchResult` is a specific chunk result,
+   along with surrounding chunks.
+
+ `EntryId`, `NamespaceId`
+
+ - While the `EntryId` and `NamespaceId` are strings under the hood, they are
+   given more specific types to make it easier to use them correctly.
+
+ Validators can be used in `args` and schema table definitions:
+ `vEntry`, `vEntryId`, `vNamespaceId`, `vSearchEntry`, `vSearchResult`
+
+ e.g. `defineTable({ myDocTitle: v.string(), entryId: vEntryId })`
+
+ The validators for the branded IDs will only validate they are strings,
+ but will have the more specific types, to provide type safety.
+
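
Putting the `defineTable` one-liner above into a full schema file, as a minimal sketch (table and field names are illustrative):

```ts
// convex/schema.ts — illustrative; only vEntryId comes from the package
import { defineSchema, defineTable } from "convex/server";
import { v } from "convex/values";
import { vEntryId } from "@convex-dev/rag";

export default defineSchema({
  documents: defineTable({
    myDocTitle: v.string(),
    entryId: vEntryId, // typed as EntryId, validated as a string
  }),
});
```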
+ ## Utility Functions
+
+ In addition to the function on the `rag` instance, there are other utilities
+ provided:
+
+ ### `defaultChunker`
+
+ This is the default chunker used by the `add` and `addAsync` functions.
+
+ It is customizable, but by default:
+ - It tries to break up the text into paragraphs between 100-1k characters.
+ - It will combine paragraphs to meet the minimum character count (100).
+ - It will break up paragraphs into separate lines to keep it under 1k.
+ - It will not split up a single line unless it's longer than 10k characters.
+
+ ```ts
+ import { defaultChunker } from "@convex-dev/rag";
+
+ const chunks = defaultChunker(text, {
+   // these are the defaults
+   minLines: 1,
+   minCharsSoftLimit: 100,
+   maxCharsSoftLimit: 1000,
+   maxCharsHardLimit: 10000,
+   delimiter: "\n\n",
+ });
+ ```
+
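
To see how those limits behave on your own content, it's easy to log chunk sizes. A quick sketch, where `loadSampleText` is a hypothetical stand-in for your own data:

```ts
// Eyeball the chunk size distribution against the soft/hard limits.
const sample = await loadSampleText(); // hypothetical loader
defaultChunker(sample).forEach((chunk, i) => {
  console.log(`chunk ${i}: ${chunk.length} chars, ${chunk.split("\n").length} lines`);
});
```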
+ ### `hybridRank`
+
+ This is an implementation of "Reciprocal Rank Fusion" for ranking search results
+ based on multiple scoring arrays. The premise is that if both arrays of results
+ are sorted by score, the best results show up near the top of both arrays and
+ should be preferred over results higher in one but much lower in the other.
+
+ ```ts
+ import { hybridRank } from "@convex-dev/rag";
+
+ const textSearchResults = [id1, id2, id3];
+ const vectorSearchResults = [id2, id3, id1];
+ const results = hybridRank([
+   textSearchResults,
+   vectorSearchResults,
+ ]);
+ // results = [id2, id1, id3]
+ ```
+
+ It can take more than two arrays, and you can provide weights for each array.
+
+ ```ts
+ const recentSearchResults = [id5, id4, id3];
+ const results = hybridRank([
+   textSearchResults,
+   vectorSearchResults,
+   recentSearchResults,
+ ], {
+   weights: [2, 1, 3], // prefer recent results more than text or vector
+ });
+ // results = [ id3, id5, id1, id2, id4 ]
+ ```
+
+ To have it more biased towards the top few results, you can set the `k` value
+ to a lower number (10 by default).
+
+ ```ts
+ const results = hybridRank([
+   textSearchResults,
+   vectorSearchResults,
+   recentSearchResults,
+ ], { k: 1 });
+ // results = [ id5, id1, id3, id2, id4 ]
+ ```
+
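
The scoring behind Reciprocal Rank Fusion is compact enough to sketch: each id accumulates `weight / (k + rank)` across the rankings it appears in. This is the general algorithm rather than this package's exact source, but it reproduces the first example above (`[id2, id1, id3]`):

```ts
// Generic RRF sketch: higher combined score = better rank.
function rrfSketch<Id>(
  rankings: Id[][],
  { k = 10, weights = [] as number[] } = {},
): Id[] {
  const scores = new Map<Id, number>();
  rankings.forEach((ranking, i) => {
    const weight = weights[i] ?? 1;
    ranking.forEach((id, rank) => {
      scores.set(id, (scores.get(id) ?? 0) + weight / (k + rank));
    });
  });
  return [...scores.entries()].sort((a, b) => b[1] - a[1]).map(([id]) => id);
}
```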
656
+
657
+ This generates the hash of a file's contents, which can be used to avoid
658
+ adding the same file twice.
659
+
660
+ Note: doing `blob.arrayBuffer()` will consume the blob's data, so you'll need
661
+ to make a new blob to use it after calling this function.
662
+
663
+ ```ts
664
+ import { contentHashFromArrayBuffer } from "@convex-dev/rag";
665
+
666
+ export const addFile = action({
667
+ args: { bytes: v.bytes() },
668
+ handler: async (ctx, { bytes }) => {
669
+
670
+ const hash = await contentHashFromArrayBuffer(bytes);
671
+
672
+ const existing = await rag.findEntryByContentHash(ctx, {
673
+ namespace: "global",
674
+ key: "my-file.txt",
675
+ contentHash: hash,
676
+ });
677
+ if (existing) {
678
+ console.log("File contents are the same, skipping");
679
+ return;
680
+ }
681
+ const blob = new Blob([bytes], { type: "text/plain" });
682
+ //...
683
+ },
684
+ });
685
+ ```
686
+
687
+ ### `guessMimeTypeFromExtension`
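
A content hash like this is typically just a digest of the raw bytes. As an illustration of the idea (an assumption, not this package's exact implementation), here is a hex-encoded SHA-256 via Web Crypto:

```ts
// Sketch: hex SHA-256 of an ArrayBuffer; illustrative, not the package's code.
async function sketchContentHash(buffer: ArrayBuffer): Promise<string> {
  const digest = await crypto.subtle.digest("SHA-256", buffer);
  return [...new Uint8Array(digest)]
    .map((byte) => byte.toString(16).padStart(2, "0"))
    .join("");
}
```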
688
+
689
+ This guesses the mime type of a file from its extension.
690
+
691
+ ```ts
692
+ import { guessMimeTypeFromExtension } from "@convex-dev/rag";
693
+
694
+ const mimeType = guessMimeTypeFromExtension("my-file.mjs");
695
+ console.log(mimeType); // "text/javascript"
696
+ ```
697
+
698
+ ### `guessMimeTypeFromContents`
699
+
700
+ This guesses the mime type of a file from the first few bytes of its contents.
701
+
702
+ ```ts
703
+ import { guessMimeTypeFromContents } from "@convex-dev/rag";
704
+
705
+ const mimeType = guessMimeTypeFromContents(await file.arrayBuffer());
477
706
  ```
478
707
 
479
708
  ### Example Usage
@@ -482,5 +711,5 @@ See more example usage in [example.ts](./example/convex/example.ts).

  ### Running the example

- Run the example with `npm i && npm run example`.
+ Run the example with `npm i && npm run setup && npm run example`.
  <!-- END: Include on https://convex.dev/components -->
package/dist/client/defaultChunker.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"defaultChunker.d.ts","sourceRoot":"","sources":["../../src/client/defaultChunker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,EACE,QAAY,EACZ,iBAAuB,EACvB,iBAAwB,EACxB,iBAAyB,EACzB,SAAkB,GACnB,GAAE;IACD,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,SAAS,CAAC,EAAE,MAAM,CAAC;CACf,GACL,MAAM,EAAE,CA6FV;AAoED,eAAe,cAAc,CAAC"}
+ {"version":3,"file":"defaultChunker.d.ts","sourceRoot":"","sources":["../../src/client/defaultChunker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,EACE,QAAY,EACZ,iBAAuB,EACvB,iBAAwB,EACxB,iBAAyB,EACzB,SAAkB,GACnB,GAAE;IACD,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,SAAS,CAAC,EAAE,MAAM,CAAC;CACf,GACL,MAAM,EAAE,CA6HV;AA4FD,eAAe,cAAc,CAAC"}
package/dist/client/defaultChunker.js CHANGED
@@ -4,7 +4,7 @@
   * By default, it will chunk into paragraphs and target
   * 200-2000 characters per chunk (only less than 1 line if the hard limit is reached).
   */
- export function defaultChunker(text, { minLines = 1, minCharsSoftLimit = 200, maxCharsSoftLimit = 2000, maxCharsHardLimit = 10000, delimiter = "\n\n", } = {}) {
+ export function defaultChunker(text, { minLines = 1, minCharsSoftLimit = 100, maxCharsSoftLimit = 1000, maxCharsHardLimit = 10000, delimiter = "\n\n", } = {}) {
      if (!text)
          return [];
      // Split text into individual lines
@@ -19,13 +19,17 @@ export function defaultChunker(text, { minLines = 1, minCharsSoftLimit = 200, ma
      const potentialChunk = [...currentChunk, line].join("\n");
      // If adding this line would exceed max chars, finalize current chunk first
      if (potentialChunk.length > maxCharsSoftLimit && currentChunk.length > 0) {
-         const trimmedChunk = removeTrailingEmptyLines(currentChunk);
-         chunks.push(trimmedChunk.join("\n"));
+         const processedChunk = processChunkForOutput(currentChunk, lines, i - currentChunk.length);
+         if (processedChunk.trim()) {
+             chunks.push(processedChunk);
+         }
          // Split the line if it exceeds hard limit
          const splitLines = maybeSplitLine(line, maxCharsHardLimit);
          // Add all but the last split piece as separate chunks
          for (let j = 0; j < splitLines.length - 1; j++) {
-             chunks.push(splitLines[j]);
+             if (splitLines[j].trim()) {
+                 chunks.push(splitLines[j]);
+             }
          }
          // Keep the last piece for potential combination with next lines
          currentChunk = [splitLines[splitLines.length - 1]];
@@ -37,8 +41,11 @@ export function defaultChunker(text, { minLines = 1, minCharsSoftLimit = 200, ma
          currentChunk.join("\n").length >= Math.min(minCharsSoftLimit * 0.8, 150)) {
          // Simple logic: only split if potential chunk would exceed the soft max limit
          if (potentialChunk.length > maxCharsSoftLimit) {
-             // When splitting at delimiter boundary, preserve natural empty lines (don't remove trailing empty lines)
-             chunks.push(currentChunk.join("\n"));
+             // When splitting at delimiter boundary, preserve natural empty lines and trailing newlines
+             const processedChunk = processChunkForOutput(currentChunk, lines, i - currentChunk.length);
+             if (processedChunk.trim()) {
+                 chunks.push(processedChunk);
+             }
              currentChunk = [line];
              continue;
          }
@@ -53,22 +60,28 @@ export function defaultChunker(text, { minLines = 1, minCharsSoftLimit = 200, ma
          if (splitLines.length > 1) {
              // Line was split - add all but the last piece as separate chunks
              for (let j = 0; j < splitLines.length - 1; j++) {
-                 chunks.push(splitLines[j]);
+                 if (splitLines[j].trim()) {
+                     chunks.push(splitLines[j]);
+                 }
              }
              // Keep the last piece for potential combination with next lines
              currentChunk = [splitLines[splitLines.length - 1]];
          }
          else {
              // Line doesn't exceed hard limit, keep it as is
-             chunks.push(line);
+             if (line.trim()) {
+                 chunks.push(line);
+             }
              currentChunk = [];
          }
      }
      else {
          // Remove last line and finalize chunk
          const lastLine = currentChunk.pop();
-         const trimmedChunk = removeTrailingEmptyLines(currentChunk);
-         chunks.push(trimmedChunk.join("\n"));
+         const processedChunk = processChunkForOutput(currentChunk, lines, i - currentChunk.length);
+         if (processedChunk.trim()) {
+             chunks.push(processedChunk);
+         }
          currentChunk = [lastLine];
      }
  }
@@ -79,14 +92,32 @@ export function defaultChunker(text, { minLines = 1, minCharsSoftLimit = 200, ma
      if (remainingText.length > maxCharsHardLimit) {
          // Split the remaining chunk if it exceeds hard limit
          const splitLines = maybeSplitLine(remainingText, maxCharsHardLimit);
-         chunks.push(...splitLines);
+         chunks.push(...splitLines.filter((chunk) => chunk.trim()));
      }
      else {
-         const trimmedChunk = removeTrailingEmptyLines(currentChunk);
-         chunks.push(trimmedChunk.join("\n"));
+         const processedChunk = processChunkForOutput(currentChunk, lines, lines.length - currentChunk.length);
+         if (processedChunk.trim()) {
+             chunks.push(processedChunk);
+         }
      }
  }
- return chunks;
+ // Filter out any empty chunks that might have slipped through
+ return chunks.filter((chunk) => chunk.trim().length > 0);
+ }
+ function processChunkForOutput(chunkLines, allLines, startIndex) {
+     if (chunkLines.length === 0)
+         return "";
+     // Remove trailing empty lines but preserve meaningful structure
+     const trimmedLines = removeTrailingEmptyLines(chunkLines);
+     // Check if we should preserve some trailing newlines by looking at the original context
+     const endIndex = startIndex + chunkLines.length - 1;
+     const hasTrailingNewlines = endIndex < allLines.length - 1 && chunkLines.length > trimmedLines.length;
+     // If we removed empty lines but there are more lines after this chunk,
+     // preserve one trailing newline to maintain paragraph separation
+     if (hasTrailingNewlines && trimmedLines.length > 0) {
+         return trimmedLines.join("\n") + "\n";
+     }
+     return trimmedLines.join("\n");
  }
  function maybeSplitLine(line, maxCharsHardLimit) {
      const inputs = [line]; // in reverse order
@@ -141,8 +172,8 @@ function removeTrailingEmptyLines(lines) {
              return lines.slice(0, i + 1);
          }
      }
-     // If all lines are empty, keep at least one
-     return lines.length > 0 ? [lines[0]] : [];
+     // If all lines are empty, return empty array instead of keeping empty strings
+     return [];
  }
  export default defaultChunker;
  //# sourceMappingURL=defaultChunker.js.map
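
The chunker changes above tighten the empty-chunk handling: whitespace-only chunks are now filtered out rather than kept. A quick check of that documented behavior, assuming the package export:

```ts
import { defaultChunker } from "@convex-dev/rag";

// All-empty input now yields no chunks...
console.assert(defaultChunker("").length === 0);
console.assert(defaultChunker("\n\n\n").length === 0);
// ...and no returned chunk is pure whitespace.
for (const chunk of defaultChunker("a\n\n\n\nb")) {
  console.assert(chunk.trim().length > 0);
}
```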