@opensaas/stack-rag 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. package/.turbo/turbo-build.log +4 -0
  2. package/CHANGELOG.md +10 -0
  3. package/CLAUDE.md +565 -0
  4. package/LICENSE +21 -0
  5. package/README.md +406 -0
  6. package/dist/config/index.d.ts +63 -0
  7. package/dist/config/index.d.ts.map +1 -0
  8. package/dist/config/index.js +94 -0
  9. package/dist/config/index.js.map +1 -0
  10. package/dist/config/plugin.d.ts +38 -0
  11. package/dist/config/plugin.d.ts.map +1 -0
  12. package/dist/config/plugin.js +215 -0
  13. package/dist/config/plugin.js.map +1 -0
  14. package/dist/config/plugin.test.d.ts +2 -0
  15. package/dist/config/plugin.test.d.ts.map +1 -0
  16. package/dist/config/plugin.test.js +554 -0
  17. package/dist/config/plugin.test.js.map +1 -0
  18. package/dist/config/types.d.ts +249 -0
  19. package/dist/config/types.d.ts.map +1 -0
  20. package/dist/config/types.js +5 -0
  21. package/dist/config/types.js.map +1 -0
  22. package/dist/fields/embedding.d.ts +85 -0
  23. package/dist/fields/embedding.d.ts.map +1 -0
  24. package/dist/fields/embedding.js +81 -0
  25. package/dist/fields/embedding.js.map +1 -0
  26. package/dist/fields/embedding.test.d.ts +2 -0
  27. package/dist/fields/embedding.test.d.ts.map +1 -0
  28. package/dist/fields/embedding.test.js +323 -0
  29. package/dist/fields/embedding.test.js.map +1 -0
  30. package/dist/fields/index.d.ts +6 -0
  31. package/dist/fields/index.d.ts.map +1 -0
  32. package/dist/fields/index.js +5 -0
  33. package/dist/fields/index.js.map +1 -0
  34. package/dist/index.d.ts +8 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +9 -0
  37. package/dist/index.js.map +1 -0
  38. package/dist/mcp/index.d.ts +19 -0
  39. package/dist/mcp/index.d.ts.map +1 -0
  40. package/dist/mcp/index.js +18 -0
  41. package/dist/mcp/index.js.map +1 -0
  42. package/dist/providers/index.d.ts +38 -0
  43. package/dist/providers/index.d.ts.map +1 -0
  44. package/dist/providers/index.js +68 -0
  45. package/dist/providers/index.js.map +1 -0
  46. package/dist/providers/ollama.d.ts +49 -0
  47. package/dist/providers/ollama.d.ts.map +1 -0
  48. package/dist/providers/ollama.js +151 -0
  49. package/dist/providers/ollama.js.map +1 -0
  50. package/dist/providers/openai.d.ts +41 -0
  51. package/dist/providers/openai.d.ts.map +1 -0
  52. package/dist/providers/openai.js +126 -0
  53. package/dist/providers/openai.js.map +1 -0
  54. package/dist/providers/providers.test.d.ts +2 -0
  55. package/dist/providers/providers.test.d.ts.map +1 -0
  56. package/dist/providers/providers.test.js +224 -0
  57. package/dist/providers/providers.test.js.map +1 -0
  58. package/dist/providers/types.d.ts +88 -0
  59. package/dist/providers/types.d.ts.map +1 -0
  60. package/dist/providers/types.js +2 -0
  61. package/dist/providers/types.js.map +1 -0
  62. package/dist/runtime/batch.d.ts +183 -0
  63. package/dist/runtime/batch.d.ts.map +1 -0
  64. package/dist/runtime/batch.js +240 -0
  65. package/dist/runtime/batch.js.map +1 -0
  66. package/dist/runtime/batch.test.d.ts +2 -0
  67. package/dist/runtime/batch.test.d.ts.map +1 -0
  68. package/dist/runtime/batch.test.js +251 -0
  69. package/dist/runtime/batch.test.js.map +1 -0
  70. package/dist/runtime/chunking.d.ts +42 -0
  71. package/dist/runtime/chunking.d.ts.map +1 -0
  72. package/dist/runtime/chunking.js +264 -0
  73. package/dist/runtime/chunking.js.map +1 -0
  74. package/dist/runtime/chunking.test.d.ts +2 -0
  75. package/dist/runtime/chunking.test.d.ts.map +1 -0
  76. package/dist/runtime/chunking.test.js +212 -0
  77. package/dist/runtime/chunking.test.js.map +1 -0
  78. package/dist/runtime/embeddings.d.ts +147 -0
  79. package/dist/runtime/embeddings.d.ts.map +1 -0
  80. package/dist/runtime/embeddings.js +201 -0
  81. package/dist/runtime/embeddings.js.map +1 -0
  82. package/dist/runtime/embeddings.test.d.ts +2 -0
  83. package/dist/runtime/embeddings.test.d.ts.map +1 -0
  84. package/dist/runtime/embeddings.test.js +366 -0
  85. package/dist/runtime/embeddings.test.js.map +1 -0
  86. package/dist/runtime/index.d.ts +14 -0
  87. package/dist/runtime/index.d.ts.map +1 -0
  88. package/dist/runtime/index.js +18 -0
  89. package/dist/runtime/index.js.map +1 -0
  90. package/dist/runtime/search.d.ts +135 -0
  91. package/dist/runtime/search.d.ts.map +1 -0
  92. package/dist/runtime/search.js +101 -0
  93. package/dist/runtime/search.js.map +1 -0
  94. package/dist/storage/index.d.ts +41 -0
  95. package/dist/storage/index.d.ts.map +1 -0
  96. package/dist/storage/index.js +73 -0
  97. package/dist/storage/index.js.map +1 -0
  98. package/dist/storage/json.d.ts +34 -0
  99. package/dist/storage/json.d.ts.map +1 -0
  100. package/dist/storage/json.js +82 -0
  101. package/dist/storage/json.js.map +1 -0
  102. package/dist/storage/pgvector.d.ts +53 -0
  103. package/dist/storage/pgvector.d.ts.map +1 -0
  104. package/dist/storage/pgvector.js +168 -0
  105. package/dist/storage/pgvector.js.map +1 -0
  106. package/dist/storage/sqlite-vss.d.ts +49 -0
  107. package/dist/storage/sqlite-vss.d.ts.map +1 -0
  108. package/dist/storage/sqlite-vss.js +148 -0
  109. package/dist/storage/sqlite-vss.js.map +1 -0
  110. package/dist/storage/storage.test.d.ts +2 -0
  111. package/dist/storage/storage.test.d.ts.map +1 -0
  112. package/dist/storage/storage.test.js +440 -0
  113. package/dist/storage/storage.test.js.map +1 -0
  114. package/dist/storage/types.d.ts +79 -0
  115. package/dist/storage/types.d.ts.map +1 -0
  116. package/dist/storage/types.js +49 -0
  117. package/dist/storage/types.js.map +1 -0
  118. package/package.json +82 -0
  119. package/src/config/index.ts +116 -0
  120. package/src/config/plugin.test.ts +664 -0
  121. package/src/config/plugin.ts +257 -0
  122. package/src/config/types.ts +283 -0
  123. package/src/fields/embedding.test.ts +408 -0
  124. package/src/fields/embedding.ts +150 -0
  125. package/src/fields/index.ts +6 -0
  126. package/src/index.ts +33 -0
  127. package/src/mcp/index.ts +21 -0
  128. package/src/providers/index.ts +81 -0
  129. package/src/providers/ollama.ts +186 -0
  130. package/src/providers/openai.ts +161 -0
  131. package/src/providers/providers.test.ts +275 -0
  132. package/src/providers/types.ts +100 -0
  133. package/src/runtime/batch.test.ts +332 -0
  134. package/src/runtime/batch.ts +424 -0
  135. package/src/runtime/chunking.test.ts +258 -0
  136. package/src/runtime/chunking.ts +334 -0
  137. package/src/runtime/embeddings.test.ts +441 -0
  138. package/src/runtime/embeddings.ts +380 -0
  139. package/src/runtime/index.ts +51 -0
  140. package/src/runtime/search.ts +243 -0
  141. package/src/storage/index.ts +86 -0
  142. package/src/storage/json.ts +106 -0
  143. package/src/storage/pgvector.ts +206 -0
  144. package/src/storage/sqlite-vss.ts +193 -0
  145. package/src/storage/storage.test.ts +521 -0
  146. package/src/storage/types.ts +126 -0
  147. package/tsconfig.json +13 -0
  148. package/tsconfig.tsbuildinfo +1 -0
  149. package/vitest.config.ts +18 -0
@@ -0,0 +1,424 @@
1
+ /**
2
+ * Batch processing utilities with rate limiting and progress tracking
3
+ */
4
+
5
+ import type { EmbeddingProvider } from '../providers/types.js'
6
+ import type { StoredEmbedding } from '../config/types.js'
7
+ import { generateEmbeddings } from './embeddings.js'
8
+
9
export interface BatchProcessOptions {
  /** Provider used to generate the embeddings. */
  provider: EmbeddingProvider

  /** Texts that should be embedded. */
  texts: string[]

  /**
   * How many texts are sent per batch.
   * @default 10
   */
  batchSize?: number

  /**
   * Maximum number of requests per minute (one batch counts as one request).
   * @default 100
   */
  rateLimit?: number

  /** Callback that receives progress snapshots while batches are processed. */
  onProgress?: (progress: BatchProgress) => void

  /**
   * Callback invoked when a batch fails.
   * When omitted, the failure is thrown instead.
   */
  onError?: (error: BatchError) => void

  /**
   * How many times a failed batch is retried.
   * @default 3
   */
  maxRetries?: number

  /**
   * Delay before the first retry, in milliseconds.
   * @default 1000
   */
  retryDelay?: number

  /**
   * Whether a source hash is included in the metadata.
   * @default true
   */
  includeSourceHash?: boolean
}
61
+
62
export interface BatchProgress {
  /** Count of items handled so far. */
  processed: number

  /** Count of items in the whole run. */
  total: number

  /** Count of items that failed. */
  failed: number

  /** Completion percentage, from 0 to 100. */
  percentage: number

  /** Number of the current batch, starting at 1. */
  currentBatch: number

  /** Number of batches in the whole run. */
  totalBatches: number
}
93
+
94
export interface BatchError {
  /** Number of the batch that failed. */
  batchNumber: number

  /** The texts that were part of the failed batch. */
  items: string[]

  /** The error that caused the failure. */
  error: Error

  /** How many retry attempts were made. */
  retries: number
}
115
+
116
export interface BatchProcessResult {
  /** Embeddings that were generated successfully. */
  embeddings: StoredEmbedding[]

  /** Texts that could not be processed, each paired with its error. */
  failed: Array<{ text: string; error: Error }>

  /** Summary statistics for the whole run. */
  stats: {
    total: number
    successful: number
    failed: number
    duration: number
  }
}
137
+
138
/**
 * Process embeddings in batches with rate limiting and retry logic.
 *
 * Texts are split into consecutive batches of `batchSize`. Each batch is sent
 * through `generateEmbeddings` wrapped in `retryWithBackoff`. After every
 * batch, whether it succeeded or failed, `onProgress` is invoked and the
 * rate-limit delay is applied before the next batch starts.
 *
 * When a batch fails after all retries:
 * - with `onError`: the callback is invoked and every text of the batch is
 *   recorded in `failed`, then processing continues with the next batch;
 * - without `onError`: the error is thrown and processing stops.
 *
 * @param options - See {@link BatchProcessOptions}
 * @returns Generated embeddings, failed texts and run statistics
 * @throws RangeError if `batchSize` is not at least 1
 * @throws The batch error when a batch fails and no `onError` is provided
 *
 * @example
 * ```typescript
 * const result = await batchProcess({
 *   provider: createEmbeddingProvider({ type: 'openai', apiKey: '...' }),
 *   texts: largeTextArray,
 *   batchSize: 10,
 *   rateLimit: 60, // 60 requests per minute
 *   onProgress: (progress) => {
 *     console.log(`Progress: ${progress.percentage}%`)
 *   },
 * })
 *
 * console.log(`Successfully processed: ${result.stats.successful}`)
 * console.log(`Failed: ${result.stats.failed}`)
 * ```
 */
export async function batchProcess(options: BatchProcessOptions): Promise<BatchProcessResult> {
  const {
    provider,
    texts,
    batchSize = 10,
    rateLimit = 100,
    onProgress,
    onError,
    maxRetries = 3,
    retryDelay = 1000,
    includeSourceHash = true,
  } = options

  // A zero or negative batch size would make the loop below spin forever and
  // NaN would silently drop every text, so reject those values up front.
  const effectiveBatchSize = Math.floor(batchSize)
  if (!(effectiveBatchSize >= 1)) {
    throw new RangeError(`batchSize must be a positive integer, received ${batchSize}`)
  }

  const startTime = Date.now()
  const totalBatches = Math.ceil(texts.length / effectiveBatchSize)
  const embeddings: StoredEmbedding[] = []
  const failed: Array<{ text: string; error: Error }> = []

  // Calculate delay between batches to respect rate limit. Non-finite or
  // non-positive delays (e.g. from rateLimit <= 0) mean "do not throttle".
  const delayBetweenBatches = calculateBatchDelay(rateLimit)
  const shouldThrottle = Number.isFinite(delayBetweenBatches) && delayBetweenBatches > 0

  for (let offset = 0; offset < texts.length; offset += effectiveBatchSize) {
    const batchNumber = Math.floor(offset / effectiveBatchSize) + 1
    const batch = texts.slice(offset, offset + effectiveBatchSize)

    try {
      // Process batch with retry logic
      const batchEmbeddings = await retryWithBackoff(
        async () =>
          generateEmbeddings({
            provider,
            texts: batch,
            includeSourceHash,
            batchSize: batch.length,
          }),
        maxRetries,
        retryDelay,
      )

      embeddings.push(...batchEmbeddings)
    } catch (error) {
      const batchError: BatchError = {
        batchNumber,
        items: batch,
        error: error instanceof Error ? error : new Error(String(error)),
        retries: maxRetries,
      }

      // If no error handler, throw the error
      if (!onError) {
        throw error
      }
      onError(batchError)

      // Add all items in batch to failed list
      for (const text of batch) {
        failed.push({ text, error: batchError.error })
      }
    }

    // Report progress for every batch. This lives outside the try block so an
    // exception thrown by the callback is not mistaken for a batch failure.
    if (onProgress) {
      const processed = Math.min(offset + effectiveBatchSize, texts.length)
      onProgress({
        processed,
        total: texts.length,
        failed: failed.length,
        percentage: Math.round((processed / texts.length) * 100),
        currentBatch: batchNumber,
        totalBatches,
      })
    }

    // Rate limiting: wait before next batch (except for last batch). Applied
    // after failed batches too, since they also consumed provider requests.
    if (batchNumber < totalBatches && shouldThrottle) {
      await sleep(delayBetweenBatches)
    }
  }

  const duration = Date.now() - startTime

  return {
    embeddings,
    failed,
    stats: {
      total: texts.length,
      successful: embeddings.length,
      failed: failed.length,
      duration,
    },
  }
}
253
+
254
/**
 * Rate limiter for controlling API request rate.
 *
 * Keeps the timestamps of the requests granted during the last 60 seconds and
 * only grants a new slot while fewer than `requestsPerMinute` are recorded.
 */
export class RateLimiter {
  private requestTimestamps: number[] = []
  private readonly requestsPerMinute: number

  /**
   * @param requestsPerMinute - Maximum number of slots granted per 60 seconds
   * @throws RangeError if `requestsPerMinute` is not greater than 0; with such
   *   a value no slot could ever be granted and `waitForSlot` would never resolve
   */
  constructor(requestsPerMinute: number) {
    if (!(requestsPerMinute > 0)) {
      throw new RangeError(
        `requestsPerMinute must be greater than 0, received ${requestsPerMinute}`,
      )
    }
    this.requestsPerMinute = requestsPerMinute
  }

  /**
   * Wait until rate limit allows next request.
   *
   * Resolves immediately when a slot is free; otherwise re-checks once the
   * oldest recorded request is due to leave the 60-second window (but never
   * sooner than 100 ms).
   */
  async waitForSlot(): Promise<void> {
    for (;;) {
      const now = Date.now()
      const windowStart = now - 60_000

      // Remove timestamps older than 1 minute
      this.requestTimestamps = this.requestTimestamps.filter((t) => t > windowStart)

      // Check if we can make a request
      if (this.requestTimestamps.length < this.requestsPerMinute) {
        this.requestTimestamps.push(now)
        return
      }

      // Calculate wait time until oldest request expires, then try again
      const waitTime = this.requestTimestamps[0] + 60_000 - now
      await new Promise<void>((resolve) => setTimeout(resolve, Math.max(waitTime, 100)))
    }
  }

  /**
   * Execute a function with rate limiting.
   *
   * @param fn - Function to run once a slot has been acquired
   * @returns Whatever `fn` resolves to
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    await this.waitForSlot()
    return fn()
  }
}
305
+
306
/**
 * Calculate delay between batches to respect rate limit.
 *
 * Each batch counts as one request, so the delay is one minute divided by the
 * allowed number of requests, rounded up to whole milliseconds.
 *
 * @param requestsPerMinute - Allowed requests per minute
 * @returns Delay in milliseconds; 0 (no throttling) when the rate is not a
 *   positive finite number
 */
function calculateBatchDelay(requestsPerMinute: number): number {
  // Largest delay Node.js timers accept; anything above is coerced to 1 ms.
  const MAX_TIMER_DELAY_MS = 2_147_483_647

  // Zero, negative, NaN or infinite rates cannot be turned into a meaningful
  // delay (division by zero yields Infinity, negative rates a negative delay).
  if (!Number.isFinite(requestsPerMinute) || requestsPerMinute <= 0) {
    return 0
  }

  // Single division avoids the extra rounding of going through requests/second
  return Math.min(Math.ceil(60_000 / requestsPerMinute), MAX_TIMER_DELAY_MS)
}
317
+
318
/**
 * Pause for the given number of milliseconds.
 *
 * @param ms - How long to wait, in milliseconds
 * @returns A promise that resolves once the timer fires
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms)
  })
}
324
+
325
/**
 * Retry a function with exponential backoff.
 *
 * `fn` is always attempted at least once. After a failure the function waits
 * `initialDelay * 2^attempt` milliseconds before the next attempt, until
 * `maxRetries` retries have been used up.
 *
 * @param fn - Operation to attempt
 * @param maxRetries - Number of retries after the first attempt; negative or
 *   NaN values are treated as 0, fractional values are rounded down
 * @param initialDelay - Delay before the first retry, in milliseconds
 * @returns The value `fn` resolves to
 * @throws The last failure, wrapped in an `Error` if a non-Error was thrown
 */
async function retryWithBackoff<T>(
  fn: () => Promise<T>,
  maxRetries: number,
  initialDelay: number,
): Promise<T> {
  // Guarantee at least one attempt. Without this, a negative or NaN
  // maxRetries would never call fn and would end up throwing `undefined`.
  const retries = Number.isNaN(maxRetries) ? 0 : Math.max(0, Math.floor(maxRetries))

  for (let attempt = 0; ; attempt++) {
    try {
      return await fn()
    } catch (error) {
      const failure = error instanceof Error ? error : new Error(String(error))

      // Don't retry (or sleep) once the retry budget is exhausted
      if (attempt >= retries) {
        throw failure
      }

      // Exponential backoff: delay * 2^attempt
      await sleep(initialDelay * 2 ** attempt)
    }
  }
}
354
+
355
/**
 * Queue for processing items with concurrency control.
 *
 * Items are handed to `processor` in insertion order, with at most
 * `concurrency` processor calls in flight at any time.
 */
export class ProcessingQueue<T, R> {
  private queue: Array<{ item: T; resolve: (value: R) => void; reject: (error: Error) => void }> =
    []
  private processing = 0
  private readonly concurrency: number
  private readonly processor: (item: T) => Promise<R>

  /**
   * @param processor - Function applied to every queued item
   * @param concurrency - Maximum number of items processed at once (default 1)
   * @throws RangeError if `concurrency` is not at least 1; with such a value
   *   nothing would ever be dequeued and every `add()` promise would hang
   */
  constructor(processor: (item: T) => Promise<R>, concurrency: number = 1) {
    if (!(concurrency >= 1)) {
      throw new RangeError(`concurrency must be at least 1, received ${concurrency}`)
    }
    this.processor = processor
    this.concurrency = concurrency
  }

  /**
   * Add an item to the queue.
   *
   * @returns A promise settled with the processor's result for this item, or
   *   rejected with its failure (wrapped in an `Error` if needed)
   */
  async add(item: T): Promise<R> {
    return new Promise((resolve, reject) => {
      this.queue.push({ item, resolve, reject })
      // processNext handles its own errors, so it is safe to fire and forget
      void this.processNext()
    })
  }

  /**
   * Add multiple items to the queue.
   *
   * @returns Results in the same order as `items`; rejects as soon as any
   *   single item fails (`Promise.all` semantics)
   */
  async addBatch(items: T[]): Promise<R[]> {
    return Promise.all(items.map((item) => this.add(item)))
  }

  /**
   * Process next item in queue, if the concurrency limit allows it.
   */
  private async processNext(): Promise<void> {
    if (this.processing >= this.concurrency) {
      return
    }

    // Dequeue before touching the counter so it can never be incremented
    // without a matching decrement.
    const next = this.queue.shift()
    if (!next) {
      return
    }

    this.processing++
    try {
      const result = await this.processor(next.item)
      next.resolve(result)
    } catch (error) {
      next.reject(error instanceof Error ? error : new Error(String(error)))
    } finally {
      this.processing--
      // Freed a slot: pull the next waiting item, if any
      void this.processNext()
    }
  }

  /**
   * Get current queue size (items waiting, not counting those in flight)
   */
  get size(): number {
    return this.queue.length
  }

  /**
   * Get number of items currently being processed
   */
  get activeCount(): number {
    return this.processing
  }
}
@@ -0,0 +1,258 @@
1
+ import { describe, it, expect } from 'vitest'
2
+ import { chunkText, estimateTokenCount, mergeSmallChunks } from './chunking.js'
3
+
4
// Behavioral tests for chunkText, grouped by chunking strategy.
describe('chunkText', () => {
  describe('recursive strategy', () => {
    it('should chunk text at paragraph boundaries', () => {
      const input = 'Paragraph 1\n\nParagraph 2\n\nParagraph 3'
      const result = chunkText(input, { strategy: 'recursive', chunkSize: 15, chunkOverlap: 0 })

      expect(result).toHaveLength(3)
      const [first, second, third] = result
      expect(first.text).toBe('Paragraph 1\n\n')
      expect(second.text).toBe('Paragraph 2\n\n')
      expect(third.text).toBe('Paragraph 3')
    })

    it('should handle overlap between chunks', () => {
      const input = 'First chunk here. Second chunk here. Third chunk here.'
      const result = chunkText(input, { strategy: 'recursive', chunkSize: 20, chunkOverlap: 5 })

      expect(result.length).toBeGreaterThan(1)

      // Every chunk after the first should repeat part of its predecessor's tail
      result.slice(1).forEach((current, position) => {
        const previous = result[position]
        const tail = previous.text.slice(-5)
        // Overlap might not be exact due to sentence boundaries
        const firstTailWord = tail.trim().split(' ')[0]
        expect(current.text).toContain(firstTailWord)
      })
    })

    it('should respect chunk size limits', () => {
      const input = 'A'.repeat(1000)
      const result = chunkText(input, { strategy: 'recursive', chunkSize: 100, chunkOverlap: 0 })

      result.forEach((piece) => {
        expect(piece.text.length).toBeLessThanOrEqual(100)
      })
    })

    it('should track chunk positions correctly', () => {
      const input = 'Start. Middle. End.'
      const [only] = chunkText(input, { strategy: 'recursive', chunkSize: 100, chunkOverlap: 0 })

      expect(only.start).toBe(0)
      expect(only.end).toBe(input.length)
      expect(only.text).toBe(input)
    })
  })

  describe('sentence strategy', () => {
    it('should preserve sentence boundaries', () => {
      const input = 'First sentence. Second sentence. Third sentence.'
      const result = chunkText(input, { strategy: 'sentence', chunkSize: 20, chunkOverlap: 0 })

      expect(result.length).toBeGreaterThan(1)

      // Each chunk should end with sentence punctuation
      result.forEach((piece) => {
        expect(piece.text.trim()).toMatch(/[.!?]$/)
      })
    })

    it('should handle text with no sentences', () => {
      const input = 'No sentence markers here'
      const result = chunkText(input, { strategy: 'sentence', chunkSize: 10, chunkOverlap: 0 })

      expect(result).toHaveLength(1)
      expect(result[0].text).toBe(input)
    })
  })

  describe('sliding-window strategy', () => {
    it('should create overlapping fixed-size chunks', () => {
      const input = 'A'.repeat(100)
      const result = chunkText(input, {
        strategy: 'sliding-window',
        chunkSize: 30,
        chunkOverlap: 10,
      })

      expect(result.length).toBeGreaterThan(1)

      // Check fixed size (except possibly last chunk)
      result.slice(0, -1).forEach((piece) => {
        expect(piece.text.length).toBe(30)
      })

      // Check overlap
      result.slice(1).forEach((current, position) => {
        const previous = result[position]
        expect(current.start).toBe(previous.start + 20) // step = chunkSize - overlap
      })
    })

    it('should skip empty chunks', () => {
      // Just whitespace
      const result = chunkText(' ', { strategy: 'sliding-window', chunkSize: 10, chunkOverlap: 0 })

      expect(result).toHaveLength(0)
    })
  })

  describe('token-aware strategy', () => {
    it('should estimate token limits', () => {
      const input = 'A'.repeat(400) // ~100 tokens at 4 chars/token
      const result = chunkText(input, { strategy: 'token-aware', tokenLimit: 50, chunkOverlap: 0 })

      expect(result.length).toBeGreaterThanOrEqual(2)

      // Each chunk should be roughly under token limit * 4 chars
      result.forEach((piece) => {
        expect(piece.text.length).toBeLessThanOrEqual(50 * 4)
      })
    })
  })

  describe('edge cases', () => {
    it('should handle empty text', () => {
      expect(chunkText('', { chunkSize: 100 })).toHaveLength(0)
    })

    it('should handle text smaller than chunk size', () => {
      const input = 'Small text'
      const result = chunkText(input, { chunkSize: 1000 })

      expect(result).toHaveLength(1)
      const [only] = result
      expect(only.text).toBe(input)
      expect(only.start).toBe(0)
      expect(only.end).toBe(input.length)
    })

    it('should throw error if overlap >= chunk size', () => {
      const invalidCall = () => {
        chunkText('text', { chunkSize: 10, chunkOverlap: 10 })
      }
      expect(invalidCall).toThrow('chunkOverlap must be less than chunkSize')
    })

    it('should assign correct chunk indexes', () => {
      const input = 'A'.repeat(300)
      const result = chunkText(input, { chunkSize: 100, chunkOverlap: 0 })

      for (let position = 0; position < result.length; position++) {
        expect(result[position].index).toBe(position)
      }
    })
  })
})
183
+
184
// Pins the estimates returned by estimateTokenCount for short, empty and long input.
describe('estimateTokenCount', () => {
  it('should estimate token count', () => {
    // "Hello world" is 11 chars / 4 = ~3 tokens
    const estimated = estimateTokenCount('Hello world')

    expect(estimated).toBe(3)
  })

  it('should handle empty text', () => {
    const estimated = estimateTokenCount('')

    expect(estimated).toBe(0)
  })

  it('should handle long text', () => {
    const longInput = 'A'.repeat(1000)

    expect(estimateTokenCount(longInput)).toBe(250) // 1000 / 4
  })
})
204
+
205
// Tests for mergeSmallChunks: merging behavior, re-indexing and metadata handling.
describe('mergeSmallChunks', () => {
  it('should merge chunks below minimum size', () => {
    const input = [
      { text: 'A', start: 0, end: 1, index: 0 },
      { text: 'B', start: 1, end: 2, index: 1 },
      { text: 'C', start: 2, end: 3, index: 2 },
    ]

    const result = mergeSmallChunks(input, 2)

    expect(result.length).toBeLessThan(input.length)
    expect(result[0].text.length).toBeGreaterThanOrEqual(2)
  })

  it('should not merge chunks already above minimum size', () => {
    const input = [
      { text: 'AAA', start: 0, end: 3, index: 0 },
      { text: 'BBB', start: 3, end: 6, index: 1 },
    ]

    expect(mergeSmallChunks(input, 2)).toHaveLength(2)
  })

  it('should handle empty array', () => {
    expect(mergeSmallChunks([], 10)).toHaveLength(0)
  })

  it('should update chunk indexes after merge', () => {
    const input = [
      { text: 'A', start: 0, end: 1, index: 0 },
      { text: 'B', start: 1, end: 2, index: 1 },
    ]

    const result = mergeSmallChunks(input, 5)

    for (let position = 0; position < result.length; position++) {
      expect(result[position].index).toBe(position)
    }
  })

  it('should merge metadata from merged chunks', () => {
    const input = [
      { text: 'A', start: 0, end: 1, index: 0, metadata: { foo: 1 } },
      { text: 'B', start: 1, end: 2, index: 1, metadata: { bar: 2 } },
    ]

    const [combined] = mergeSmallChunks(input, 5)

    expect(combined.metadata).toEqual({ foo: 1, bar: 2 })
  })
})