@opensaas/stack-rag 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +4 -0
- package/CHANGELOG.md +10 -0
- package/CLAUDE.md +565 -0
- package/LICENSE +21 -0
- package/README.md +406 -0
- package/dist/config/index.d.ts +63 -0
- package/dist/config/index.d.ts.map +1 -0
- package/dist/config/index.js +94 -0
- package/dist/config/index.js.map +1 -0
- package/dist/config/plugin.d.ts +38 -0
- package/dist/config/plugin.d.ts.map +1 -0
- package/dist/config/plugin.js +215 -0
- package/dist/config/plugin.js.map +1 -0
- package/dist/config/plugin.test.d.ts +2 -0
- package/dist/config/plugin.test.d.ts.map +1 -0
- package/dist/config/plugin.test.js +554 -0
- package/dist/config/plugin.test.js.map +1 -0
- package/dist/config/types.d.ts +249 -0
- package/dist/config/types.d.ts.map +1 -0
- package/dist/config/types.js +5 -0
- package/dist/config/types.js.map +1 -0
- package/dist/fields/embedding.d.ts +85 -0
- package/dist/fields/embedding.d.ts.map +1 -0
- package/dist/fields/embedding.js +81 -0
- package/dist/fields/embedding.js.map +1 -0
- package/dist/fields/embedding.test.d.ts +2 -0
- package/dist/fields/embedding.test.d.ts.map +1 -0
- package/dist/fields/embedding.test.js +323 -0
- package/dist/fields/embedding.test.js.map +1 -0
- package/dist/fields/index.d.ts +6 -0
- package/dist/fields/index.d.ts.map +1 -0
- package/dist/fields/index.js +5 -0
- package/dist/fields/index.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +9 -0
- package/dist/index.js.map +1 -0
- package/dist/mcp/index.d.ts +19 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/index.js +18 -0
- package/dist/mcp/index.js.map +1 -0
- package/dist/providers/index.d.ts +38 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +68 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/providers/ollama.d.ts +49 -0
- package/dist/providers/ollama.d.ts.map +1 -0
- package/dist/providers/ollama.js +151 -0
- package/dist/providers/ollama.js.map +1 -0
- package/dist/providers/openai.d.ts +41 -0
- package/dist/providers/openai.d.ts.map +1 -0
- package/dist/providers/openai.js +126 -0
- package/dist/providers/openai.js.map +1 -0
- package/dist/providers/providers.test.d.ts +2 -0
- package/dist/providers/providers.test.d.ts.map +1 -0
- package/dist/providers/providers.test.js +224 -0
- package/dist/providers/providers.test.js.map +1 -0
- package/dist/providers/types.d.ts +88 -0
- package/dist/providers/types.d.ts.map +1 -0
- package/dist/providers/types.js +2 -0
- package/dist/providers/types.js.map +1 -0
- package/dist/runtime/batch.d.ts +183 -0
- package/dist/runtime/batch.d.ts.map +1 -0
- package/dist/runtime/batch.js +240 -0
- package/dist/runtime/batch.js.map +1 -0
- package/dist/runtime/batch.test.d.ts +2 -0
- package/dist/runtime/batch.test.d.ts.map +1 -0
- package/dist/runtime/batch.test.js +251 -0
- package/dist/runtime/batch.test.js.map +1 -0
- package/dist/runtime/chunking.d.ts +42 -0
- package/dist/runtime/chunking.d.ts.map +1 -0
- package/dist/runtime/chunking.js +264 -0
- package/dist/runtime/chunking.js.map +1 -0
- package/dist/runtime/chunking.test.d.ts +2 -0
- package/dist/runtime/chunking.test.d.ts.map +1 -0
- package/dist/runtime/chunking.test.js +212 -0
- package/dist/runtime/chunking.test.js.map +1 -0
- package/dist/runtime/embeddings.d.ts +147 -0
- package/dist/runtime/embeddings.d.ts.map +1 -0
- package/dist/runtime/embeddings.js +201 -0
- package/dist/runtime/embeddings.js.map +1 -0
- package/dist/runtime/embeddings.test.d.ts +2 -0
- package/dist/runtime/embeddings.test.d.ts.map +1 -0
- package/dist/runtime/embeddings.test.js +366 -0
- package/dist/runtime/embeddings.test.js.map +1 -0
- package/dist/runtime/index.d.ts +14 -0
- package/dist/runtime/index.d.ts.map +1 -0
- package/dist/runtime/index.js +18 -0
- package/dist/runtime/index.js.map +1 -0
- package/dist/runtime/search.d.ts +135 -0
- package/dist/runtime/search.d.ts.map +1 -0
- package/dist/runtime/search.js +101 -0
- package/dist/runtime/search.js.map +1 -0
- package/dist/storage/index.d.ts +41 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/dist/storage/index.js +73 -0
- package/dist/storage/index.js.map +1 -0
- package/dist/storage/json.d.ts +34 -0
- package/dist/storage/json.d.ts.map +1 -0
- package/dist/storage/json.js +82 -0
- package/dist/storage/json.js.map +1 -0
- package/dist/storage/pgvector.d.ts +53 -0
- package/dist/storage/pgvector.d.ts.map +1 -0
- package/dist/storage/pgvector.js +168 -0
- package/dist/storage/pgvector.js.map +1 -0
- package/dist/storage/sqlite-vss.d.ts +49 -0
- package/dist/storage/sqlite-vss.d.ts.map +1 -0
- package/dist/storage/sqlite-vss.js +148 -0
- package/dist/storage/sqlite-vss.js.map +1 -0
- package/dist/storage/storage.test.d.ts +2 -0
- package/dist/storage/storage.test.d.ts.map +1 -0
- package/dist/storage/storage.test.js +440 -0
- package/dist/storage/storage.test.js.map +1 -0
- package/dist/storage/types.d.ts +79 -0
- package/dist/storage/types.d.ts.map +1 -0
- package/dist/storage/types.js +49 -0
- package/dist/storage/types.js.map +1 -0
- package/package.json +82 -0
- package/src/config/index.ts +116 -0
- package/src/config/plugin.test.ts +664 -0
- package/src/config/plugin.ts +257 -0
- package/src/config/types.ts +283 -0
- package/src/fields/embedding.test.ts +408 -0
- package/src/fields/embedding.ts +150 -0
- package/src/fields/index.ts +6 -0
- package/src/index.ts +33 -0
- package/src/mcp/index.ts +21 -0
- package/src/providers/index.ts +81 -0
- package/src/providers/ollama.ts +186 -0
- package/src/providers/openai.ts +161 -0
- package/src/providers/providers.test.ts +275 -0
- package/src/providers/types.ts +100 -0
- package/src/runtime/batch.test.ts +332 -0
- package/src/runtime/batch.ts +424 -0
- package/src/runtime/chunking.test.ts +258 -0
- package/src/runtime/chunking.ts +334 -0
- package/src/runtime/embeddings.test.ts +441 -0
- package/src/runtime/embeddings.ts +380 -0
- package/src/runtime/index.ts +51 -0
- package/src/runtime/search.ts +243 -0
- package/src/storage/index.ts +86 -0
- package/src/storage/json.ts +106 -0
- package/src/storage/pgvector.ts +206 -0
- package/src/storage/sqlite-vss.ts +193 -0
- package/src/storage/storage.test.ts +521 -0
- package/src/storage/types.ts +126 -0
- package/tsconfig.json +13 -0
- package/tsconfig.tsbuildinfo +1 -0
- package/vitest.config.ts +18 -0
|
@@ -0,0 +1,424 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Batch processing utilities with rate limiting and progress tracking
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { EmbeddingProvider } from '../providers/types.js'
|
|
6
|
+
import type { StoredEmbedding } from '../config/types.js'
|
|
7
|
+
import { generateEmbeddings } from './embeddings.js'
|
|
8
|
+
|
|
9
|
+
/**
 * Options for `batchProcess`: which provider to call, what to embed,
 * and how aggressively to batch, throttle, and retry.
 */
export interface BatchProcessOptions {
  /**
   * Embedding provider to use
   */
  provider: EmbeddingProvider

  /**
   * Array of texts to process
   */
  texts: string[]

  /**
   * Batch size for processing (number of texts sent per request)
   * @default 10
   */
  batchSize?: number

  /**
   * Rate limit in requests per minute (each batch counts as one request)
   * @default 100
   */
  rateLimit?: number

  /**
   * Progress callback called after each batch
   */
  onProgress?: (progress: BatchProgress) => void

  /**
   * Error callback called when a batch fails
   * If not provided, errors will be thrown
   */
  onError?: (error: BatchError) => void

  /**
   * Number of retries for failed batches (in addition to the first attempt)
   * @default 3
   */
  maxRetries?: number

  /**
   * Initial retry delay in milliseconds; doubles after each failed attempt
   * (exponential backoff)
   * @default 1000
   */
  retryDelay?: number

  /**
   * Whether to include source hash in metadata
   * @default true
   */
  includeSourceHash?: boolean
}
|
|
61
|
+
|
|
62
|
+
/**
 * Snapshot passed to the `onProgress` callback after a batch completes.
 */
export interface BatchProgress {
  /**
   * Number of items processed so far
   */
  processed: number

  /**
   * Total number of items to process
   */
  total: number

  /**
   * Number of items that failed
   */
  failed: number

  /**
   * Percentage completed (0-100)
   */
  percentage: number

  /**
   * Current batch number (1-indexed)
   */
  currentBatch: number

  /**
   * Total number of batches
   */
  totalBatches: number
}
|
|
93
|
+
|
|
94
|
+
/**
 * Describes a batch that failed after exhausting its retries; delivered to
 * the `onError` callback when one is supplied.
 */
export interface BatchError {
  /**
   * Batch number that failed (1-indexed)
   */
  batchNumber: number

  /**
   * Items in the failed batch
   */
  items: string[]

  /**
   * Error that occurred
   */
  error: Error

  /**
   * Number of retry attempts made
   */
  retries: number
}
|
|
115
|
+
|
|
116
|
+
/**
 * Aggregate result returned by `batchProcess`.
 */
export interface BatchProcessResult {
  /**
   * Successfully generated embeddings
   */
  embeddings: StoredEmbedding[]

  /**
   * Texts that failed to process, each paired with the error of its batch
   */
  failed: Array<{ text: string; error: Error }>

  /**
   * Total processing statistics
   */
  stats: {
    // Total number of input texts
    total: number
    // Number of embeddings successfully generated
    successful: number
    // Number of texts that failed
    failed: number
    // Wall-clock duration of the whole run in milliseconds
    duration: number
  }
}
|
|
137
|
+
|
|
138
|
+
/**
|
|
139
|
+
* Process embeddings in batches with rate limiting and retry logic
|
|
140
|
+
*
|
|
141
|
+
* @example
|
|
142
|
+
* ```typescript
|
|
143
|
+
* const result = await batchProcess({
|
|
144
|
+
* provider: createEmbeddingProvider({ type: 'openai', apiKey: '...' }),
|
|
145
|
+
* texts: largeTextArray,
|
|
146
|
+
* batchSize: 10,
|
|
147
|
+
* rateLimit: 60, // 60 requests per minute
|
|
148
|
+
* onProgress: (progress) => {
|
|
149
|
+
* console.log(`Progress: ${progress.percentage}%`)
|
|
150
|
+
* },
|
|
151
|
+
* })
|
|
152
|
+
*
|
|
153
|
+
* console.log(`Successfully processed: ${result.stats.successful}`)
|
|
154
|
+
* console.log(`Failed: ${result.stats.failed}`)
|
|
155
|
+
* ```
|
|
156
|
+
*/
|
|
157
|
+
export async function batchProcess(options: BatchProcessOptions): Promise<BatchProcessResult> {
|
|
158
|
+
const {
|
|
159
|
+
provider,
|
|
160
|
+
texts,
|
|
161
|
+
batchSize = 10,
|
|
162
|
+
rateLimit = 100,
|
|
163
|
+
onProgress,
|
|
164
|
+
onError,
|
|
165
|
+
maxRetries = 3,
|
|
166
|
+
retryDelay = 1000,
|
|
167
|
+
includeSourceHash = true,
|
|
168
|
+
} = options
|
|
169
|
+
|
|
170
|
+
const startTime = Date.now()
|
|
171
|
+
const totalBatches = Math.ceil(texts.length / batchSize)
|
|
172
|
+
const embeddings: StoredEmbedding[] = []
|
|
173
|
+
const failed: Array<{ text: string; error: Error }> = []
|
|
174
|
+
|
|
175
|
+
// Calculate delay between batches to respect rate limit
|
|
176
|
+
const delayBetweenBatches = calculateBatchDelay(rateLimit)
|
|
177
|
+
|
|
178
|
+
for (let i = 0; i < texts.length; i += batchSize) {
|
|
179
|
+
const batchNumber = Math.floor(i / batchSize) + 1
|
|
180
|
+
const batch = texts.slice(i, i + batchSize)
|
|
181
|
+
|
|
182
|
+
try {
|
|
183
|
+
// Process batch with retry logic
|
|
184
|
+
const batchEmbeddings = await retryWithBackoff(
|
|
185
|
+
async () =>
|
|
186
|
+
generateEmbeddings({
|
|
187
|
+
provider,
|
|
188
|
+
texts: batch,
|
|
189
|
+
includeSourceHash,
|
|
190
|
+
batchSize: batch.length,
|
|
191
|
+
}),
|
|
192
|
+
maxRetries,
|
|
193
|
+
retryDelay,
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
embeddings.push(...batchEmbeddings)
|
|
197
|
+
|
|
198
|
+
// Report progress
|
|
199
|
+
if (onProgress) {
|
|
200
|
+
const processed = Math.min(i + batchSize, texts.length)
|
|
201
|
+
onProgress({
|
|
202
|
+
processed,
|
|
203
|
+
total: texts.length,
|
|
204
|
+
failed: failed.length,
|
|
205
|
+
percentage: Math.round((processed / texts.length) * 100),
|
|
206
|
+
currentBatch: batchNumber,
|
|
207
|
+
totalBatches,
|
|
208
|
+
})
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
// Rate limiting: wait before next batch (except for last batch)
|
|
212
|
+
if (batchNumber < totalBatches && delayBetweenBatches > 0) {
|
|
213
|
+
await sleep(delayBetweenBatches)
|
|
214
|
+
}
|
|
215
|
+
} catch (error) {
|
|
216
|
+
const batchError: BatchError = {
|
|
217
|
+
batchNumber,
|
|
218
|
+
items: batch,
|
|
219
|
+
error: error instanceof Error ? error : new Error(String(error)),
|
|
220
|
+
retries: maxRetries,
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
if (onError) {
|
|
224
|
+
onError(batchError)
|
|
225
|
+
} else {
|
|
226
|
+
// If no error handler, throw the error
|
|
227
|
+
throw error
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
// Add all items in batch to failed list
|
|
231
|
+
for (const text of batch) {
|
|
232
|
+
failed.push({
|
|
233
|
+
text,
|
|
234
|
+
error: batchError.error,
|
|
235
|
+
})
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
const duration = Date.now() - startTime
|
|
241
|
+
|
|
242
|
+
return {
|
|
243
|
+
embeddings,
|
|
244
|
+
failed,
|
|
245
|
+
stats: {
|
|
246
|
+
total: texts.length,
|
|
247
|
+
successful: embeddings.length,
|
|
248
|
+
failed: failed.length,
|
|
249
|
+
duration,
|
|
250
|
+
},
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
/**
|
|
255
|
+
* Rate limiter for controlling API request rate
|
|
256
|
+
*/
|
|
257
|
+
export class RateLimiter {
|
|
258
|
+
private queue: Array<() => void> = []
|
|
259
|
+
private requestTimestamps: number[] = []
|
|
260
|
+
private readonly requestsPerMinute: number
|
|
261
|
+
|
|
262
|
+
constructor(requestsPerMinute: number) {
|
|
263
|
+
this.requestsPerMinute = requestsPerMinute
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
/**
|
|
267
|
+
* Wait until rate limit allows next request
|
|
268
|
+
*/
|
|
269
|
+
async waitForSlot(): Promise<void> {
|
|
270
|
+
return new Promise((resolve) => {
|
|
271
|
+
const tryAcquire = () => {
|
|
272
|
+
const now = Date.now()
|
|
273
|
+
const oneMinuteAgo = now - 60_000
|
|
274
|
+
|
|
275
|
+
// Remove timestamps older than 1 minute
|
|
276
|
+
this.requestTimestamps = this.requestTimestamps.filter((t) => t > oneMinuteAgo)
|
|
277
|
+
|
|
278
|
+
// Check if we can make a request
|
|
279
|
+
if (this.requestTimestamps.length < this.requestsPerMinute) {
|
|
280
|
+
this.requestTimestamps.push(now)
|
|
281
|
+
resolve()
|
|
282
|
+
return
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
// Calculate wait time until oldest request expires
|
|
286
|
+
const oldestTimestamp = this.requestTimestamps[0]
|
|
287
|
+
const waitTime = oldestTimestamp + 60_000 - now
|
|
288
|
+
|
|
289
|
+
// Try again after wait time
|
|
290
|
+
setTimeout(tryAcquire, Math.max(waitTime, 100))
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
tryAcquire()
|
|
294
|
+
})
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
/**
|
|
298
|
+
* Execute a function with rate limiting
|
|
299
|
+
*/
|
|
300
|
+
async execute<T>(fn: () => Promise<T>): Promise<T> {
|
|
301
|
+
await this.waitForSlot()
|
|
302
|
+
return fn()
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
/**
|
|
307
|
+
* Calculate delay between batches to respect rate limit
|
|
308
|
+
*/
|
|
309
|
+
function calculateBatchDelay(requestsPerMinute: number): number {
|
|
310
|
+
// Each batch counts as one request
|
|
311
|
+
const requestsPerSecond = requestsPerMinute / 60
|
|
312
|
+
const delayPerRequest = 1000 / requestsPerSecond
|
|
313
|
+
|
|
314
|
+
// Return delay in milliseconds
|
|
315
|
+
return Math.ceil(delayPerRequest)
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
/**
|
|
319
|
+
* Sleep for specified milliseconds
|
|
320
|
+
*/
|
|
321
|
+
function sleep(ms: number): Promise<void> {
|
|
322
|
+
return new Promise((resolve) => setTimeout(resolve, ms))
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
/**
|
|
326
|
+
* Retry a function with exponential backoff
|
|
327
|
+
*/
|
|
328
|
+
async function retryWithBackoff<T>(
|
|
329
|
+
fn: () => Promise<T>,
|
|
330
|
+
maxRetries: number,
|
|
331
|
+
initialDelay: number,
|
|
332
|
+
): Promise<T> {
|
|
333
|
+
let lastError: Error | undefined
|
|
334
|
+
|
|
335
|
+
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
336
|
+
try {
|
|
337
|
+
return await fn()
|
|
338
|
+
} catch (error) {
|
|
339
|
+
lastError = error instanceof Error ? error : new Error(String(error))
|
|
340
|
+
|
|
341
|
+
// Don't retry on last attempt
|
|
342
|
+
if (attempt === maxRetries) {
|
|
343
|
+
break
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
// Exponential backoff: delay * 2^attempt
|
|
347
|
+
const delay = initialDelay * Math.pow(2, attempt)
|
|
348
|
+
await sleep(delay)
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
throw lastError
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
/**
|
|
356
|
+
* Queue for processing items with concurrency control
|
|
357
|
+
*/
|
|
358
|
+
export class ProcessingQueue<T, R> {
|
|
359
|
+
private queue: Array<{ item: T; resolve: (value: R) => void; reject: (error: Error) => void }> =
|
|
360
|
+
[]
|
|
361
|
+
private processing = 0
|
|
362
|
+
private readonly concurrency: number
|
|
363
|
+
private readonly processor: (item: T) => Promise<R>
|
|
364
|
+
|
|
365
|
+
constructor(processor: (item: T) => Promise<R>, concurrency: number = 1) {
|
|
366
|
+
this.processor = processor
|
|
367
|
+
this.concurrency = concurrency
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
/**
|
|
371
|
+
* Add an item to the queue
|
|
372
|
+
*/
|
|
373
|
+
async add(item: T): Promise<R> {
|
|
374
|
+
return new Promise((resolve, reject) => {
|
|
375
|
+
this.queue.push({ item, resolve, reject })
|
|
376
|
+
this.processNext()
|
|
377
|
+
})
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
/**
|
|
381
|
+
* Add multiple items to the queue
|
|
382
|
+
*/
|
|
383
|
+
async addBatch(items: T[]): Promise<R[]> {
|
|
384
|
+
return Promise.all(items.map((item) => this.add(item)))
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
/**
|
|
388
|
+
* Process next item in queue
|
|
389
|
+
*/
|
|
390
|
+
private async processNext(): Promise<void> {
|
|
391
|
+
if (this.processing >= this.concurrency || this.queue.length === 0) {
|
|
392
|
+
return
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
this.processing++
|
|
396
|
+
const next = this.queue.shift()
|
|
397
|
+
|
|
398
|
+
if (next) {
|
|
399
|
+
try {
|
|
400
|
+
const result = await this.processor(next.item)
|
|
401
|
+
next.resolve(result)
|
|
402
|
+
} catch (error) {
|
|
403
|
+
next.reject(error instanceof Error ? error : new Error(String(error)))
|
|
404
|
+
} finally {
|
|
405
|
+
this.processing--
|
|
406
|
+
this.processNext()
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
/**
|
|
412
|
+
* Get current queue size
|
|
413
|
+
*/
|
|
414
|
+
get size(): number {
|
|
415
|
+
return this.queue.length
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
/**
|
|
419
|
+
* Get number of items currently being processed
|
|
420
|
+
*/
|
|
421
|
+
get activeCount(): number {
|
|
422
|
+
return this.processing
|
|
423
|
+
}
|
|
424
|
+
}
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest'
|
|
2
|
+
import { chunkText, estimateTokenCount, mergeSmallChunks } from './chunking.js'
|
|
3
|
+
|
|
4
|
+
// Unit tests for chunkText, covering each chunking strategy
// ('recursive', 'sentence', 'sliding-window', 'token-aware') plus edge cases.
describe('chunkText', () => {
  describe('recursive strategy', () => {
    it('should chunk text at paragraph boundaries', () => {
      const text = 'Paragraph 1\n\nParagraph 2\n\nParagraph 3'
      const chunks = chunkText(text, {
        strategy: 'recursive',
        chunkSize: 15,
        chunkOverlap: 0,
      })

      expect(chunks).toHaveLength(3)
      expect(chunks[0].text).toBe('Paragraph 1\n\n')
      expect(chunks[1].text).toBe('Paragraph 2\n\n')
      expect(chunks[2].text).toBe('Paragraph 3')
    })

    it('should handle overlap between chunks', () => {
      const text = 'First chunk here. Second chunk here. Third chunk here.'
      const chunks = chunkText(text, {
        strategy: 'recursive',
        chunkSize: 20,
        chunkOverlap: 5,
      })

      expect(chunks.length).toBeGreaterThan(1)

      // Check that chunks have some overlap
      for (let i = 1; i < chunks.length; i++) {
        const prevChunk = chunks[i - 1].text
        const currChunk = chunks[i].text
        const overlap = prevChunk.slice(-5)
        // Overlap might not be exact due to sentence boundaries, so only the
        // first word of the expected overlap is required to appear.
        expect(currChunk).toContain(overlap.trim().split(' ')[0])
      }
    })

    it('should respect chunk size limits', () => {
      const text = 'A'.repeat(1000)
      const chunks = chunkText(text, {
        strategy: 'recursive',
        chunkSize: 100,
        chunkOverlap: 0,
      })

      // No chunk may exceed the configured chunkSize.
      for (const chunk of chunks) {
        expect(chunk.text.length).toBeLessThanOrEqual(100)
      }
    })

    it('should track chunk positions correctly', () => {
      const text = 'Start. Middle. End.'
      const chunks = chunkText(text, {
        strategy: 'recursive',
        chunkSize: 100,
        chunkOverlap: 0,
      })

      // Text fits in one chunk, so start/end span the whole input.
      expect(chunks[0].start).toBe(0)
      expect(chunks[0].end).toBe(text.length)
      expect(chunks[0].text).toBe(text)
    })
  })

  describe('sentence strategy', () => {
    it('should preserve sentence boundaries', () => {
      const text = 'First sentence. Second sentence. Third sentence.'
      const chunks = chunkText(text, {
        strategy: 'sentence',
        chunkSize: 20,
        chunkOverlap: 0,
      })

      expect(chunks.length).toBeGreaterThan(1)

      // Each chunk should end with sentence punctuation
      for (const chunk of chunks) {
        expect(chunk.text.trim()).toMatch(/[.!?]$/)
      }
    })

    it('should handle text with no sentences', () => {
      const text = 'No sentence markers here'
      const chunks = chunkText(text, {
        strategy: 'sentence',
        chunkSize: 10,
        chunkOverlap: 0,
      })

      // With no sentence boundaries the whole text comes back as one chunk,
      // even though it exceeds chunkSize.
      expect(chunks).toHaveLength(1)
      expect(chunks[0].text).toBe(text)
    })
  })

  describe('sliding-window strategy', () => {
    it('should create overlapping fixed-size chunks', () => {
      const text = 'A'.repeat(100)
      const chunks = chunkText(text, {
        strategy: 'sliding-window',
        chunkSize: 30,
        chunkOverlap: 10,
      })

      expect(chunks.length).toBeGreaterThan(1)

      // Check fixed size (except possibly last chunk)
      for (let i = 0; i < chunks.length - 1; i++) {
        expect(chunks[i].text.length).toBe(30)
      }

      // Check overlap
      for (let i = 1; i < chunks.length; i++) {
        const prevChunk = chunks[i - 1]
        const currChunk = chunks[i]
        expect(currChunk.start).toBe(prevChunk.start + 20) // step = chunkSize - overlap
      }
    })

    it('should skip empty chunks', () => {
      const text = '     ' // Just whitespace
      const chunks = chunkText(text, {
        strategy: 'sliding-window',
        chunkSize: 10,
        chunkOverlap: 0,
      })

      expect(chunks).toHaveLength(0)
    })
  })

  describe('token-aware strategy', () => {
    it('should estimate token limits', () => {
      const text = 'A'.repeat(400) // ~100 tokens at 4 chars/token
      const chunks = chunkText(text, {
        strategy: 'token-aware',
        tokenLimit: 50,
        chunkOverlap: 0,
      })

      expect(chunks.length).toBeGreaterThanOrEqual(2)

      // Each chunk should be roughly under token limit * 4 chars
      for (const chunk of chunks) {
        expect(chunk.text.length).toBeLessThanOrEqual(50 * 4)
      }
    })
  })

  describe('edge cases', () => {
    it('should handle empty text', () => {
      const chunks = chunkText('', { chunkSize: 100 })
      expect(chunks).toHaveLength(0)
    })

    it('should handle text smaller than chunk size', () => {
      const text = 'Small text'
      const chunks = chunkText(text, { chunkSize: 1000 })

      expect(chunks).toHaveLength(1)
      expect(chunks[0].text).toBe(text)
      expect(chunks[0].start).toBe(0)
      expect(chunks[0].end).toBe(text.length)
    })

    it('should throw error if overlap >= chunk size', () => {
      expect(() => {
        chunkText('text', { chunkSize: 10, chunkOverlap: 10 })
      }).toThrow('chunkOverlap must be less than chunkSize')
    })

    it('should assign correct chunk indexes', () => {
      const text = 'A'.repeat(300)
      const chunks = chunkText(text, { chunkSize: 100, chunkOverlap: 0 })

      // Indexes are sequential and 0-based.
      chunks.forEach((chunk, i) => {
        expect(chunk.index).toBe(i)
      })
    })
  })
})
|
|
183
|
+
|
|
184
|
+
describe('estimateTokenCount', () => {
|
|
185
|
+
it('should estimate token count', () => {
|
|
186
|
+
const text = 'Hello world'
|
|
187
|
+
const count = estimateTokenCount(text)
|
|
188
|
+
|
|
189
|
+
// "Hello world" is 11 chars / 4 = ~3 tokens
|
|
190
|
+
expect(count).toBe(3)
|
|
191
|
+
})
|
|
192
|
+
|
|
193
|
+
it('should handle empty text', () => {
|
|
194
|
+
expect(estimateTokenCount('')).toBe(0)
|
|
195
|
+
})
|
|
196
|
+
|
|
197
|
+
it('should handle long text', () => {
|
|
198
|
+
const text = 'A'.repeat(1000)
|
|
199
|
+
const count = estimateTokenCount(text)
|
|
200
|
+
|
|
201
|
+
expect(count).toBe(250) // 1000 / 4
|
|
202
|
+
})
|
|
203
|
+
})
|
|
204
|
+
|
|
205
|
+
describe('mergeSmallChunks', () => {
|
|
206
|
+
it('should merge chunks below minimum size', () => {
|
|
207
|
+
const chunks = [
|
|
208
|
+
{ text: 'A', start: 0, end: 1, index: 0 },
|
|
209
|
+
{ text: 'B', start: 1, end: 2, index: 1 },
|
|
210
|
+
{ text: 'C', start: 2, end: 3, index: 2 },
|
|
211
|
+
]
|
|
212
|
+
|
|
213
|
+
const merged = mergeSmallChunks(chunks, 2)
|
|
214
|
+
|
|
215
|
+
expect(merged.length).toBeLessThan(chunks.length)
|
|
216
|
+
expect(merged[0].text.length).toBeGreaterThanOrEqual(2)
|
|
217
|
+
})
|
|
218
|
+
|
|
219
|
+
it('should not merge chunks already above minimum size', () => {
|
|
220
|
+
const chunks = [
|
|
221
|
+
{ text: 'AAA', start: 0, end: 3, index: 0 },
|
|
222
|
+
{ text: 'BBB', start: 3, end: 6, index: 1 },
|
|
223
|
+
]
|
|
224
|
+
|
|
225
|
+
const merged = mergeSmallChunks(chunks, 2)
|
|
226
|
+
|
|
227
|
+
expect(merged).toHaveLength(2)
|
|
228
|
+
})
|
|
229
|
+
|
|
230
|
+
it('should handle empty array', () => {
|
|
231
|
+
const merged = mergeSmallChunks([], 10)
|
|
232
|
+
expect(merged).toHaveLength(0)
|
|
233
|
+
})
|
|
234
|
+
|
|
235
|
+
it('should update chunk indexes after merge', () => {
|
|
236
|
+
const chunks = [
|
|
237
|
+
{ text: 'A', start: 0, end: 1, index: 0 },
|
|
238
|
+
{ text: 'B', start: 1, end: 2, index: 1 },
|
|
239
|
+
]
|
|
240
|
+
|
|
241
|
+
const merged = mergeSmallChunks(chunks, 5)
|
|
242
|
+
|
|
243
|
+
merged.forEach((chunk, i) => {
|
|
244
|
+
expect(chunk.index).toBe(i)
|
|
245
|
+
})
|
|
246
|
+
})
|
|
247
|
+
|
|
248
|
+
it('should merge metadata from merged chunks', () => {
|
|
249
|
+
const chunks = [
|
|
250
|
+
{ text: 'A', start: 0, end: 1, index: 0, metadata: { foo: 1 } },
|
|
251
|
+
{ text: 'B', start: 1, end: 2, index: 1, metadata: { bar: 2 } },
|
|
252
|
+
]
|
|
253
|
+
|
|
254
|
+
const merged = mergeSmallChunks(chunks, 5)
|
|
255
|
+
|
|
256
|
+
expect(merged[0].metadata).toEqual({ foo: 1, bar: 2 })
|
|
257
|
+
})
|
|
258
|
+
})
|