@opensaas/stack-rag 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +4 -0
- package/CHANGELOG.md +10 -0
- package/CLAUDE.md +565 -0
- package/LICENSE +21 -0
- package/README.md +406 -0
- package/dist/config/index.d.ts +63 -0
- package/dist/config/index.d.ts.map +1 -0
- package/dist/config/index.js +94 -0
- package/dist/config/index.js.map +1 -0
- package/dist/config/plugin.d.ts +38 -0
- package/dist/config/plugin.d.ts.map +1 -0
- package/dist/config/plugin.js +215 -0
- package/dist/config/plugin.js.map +1 -0
- package/dist/config/plugin.test.d.ts +2 -0
- package/dist/config/plugin.test.d.ts.map +1 -0
- package/dist/config/plugin.test.js +554 -0
- package/dist/config/plugin.test.js.map +1 -0
- package/dist/config/types.d.ts +249 -0
- package/dist/config/types.d.ts.map +1 -0
- package/dist/config/types.js +5 -0
- package/dist/config/types.js.map +1 -0
- package/dist/fields/embedding.d.ts +85 -0
- package/dist/fields/embedding.d.ts.map +1 -0
- package/dist/fields/embedding.js +81 -0
- package/dist/fields/embedding.js.map +1 -0
- package/dist/fields/embedding.test.d.ts +2 -0
- package/dist/fields/embedding.test.d.ts.map +1 -0
- package/dist/fields/embedding.test.js +323 -0
- package/dist/fields/embedding.test.js.map +1 -0
- package/dist/fields/index.d.ts +6 -0
- package/dist/fields/index.d.ts.map +1 -0
- package/dist/fields/index.js +5 -0
- package/dist/fields/index.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +9 -0
- package/dist/index.js.map +1 -0
- package/dist/mcp/index.d.ts +19 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/index.js +18 -0
- package/dist/mcp/index.js.map +1 -0
- package/dist/providers/index.d.ts +38 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +68 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/providers/ollama.d.ts +49 -0
- package/dist/providers/ollama.d.ts.map +1 -0
- package/dist/providers/ollama.js +151 -0
- package/dist/providers/ollama.js.map +1 -0
- package/dist/providers/openai.d.ts +41 -0
- package/dist/providers/openai.d.ts.map +1 -0
- package/dist/providers/openai.js +126 -0
- package/dist/providers/openai.js.map +1 -0
- package/dist/providers/providers.test.d.ts +2 -0
- package/dist/providers/providers.test.d.ts.map +1 -0
- package/dist/providers/providers.test.js +224 -0
- package/dist/providers/providers.test.js.map +1 -0
- package/dist/providers/types.d.ts +88 -0
- package/dist/providers/types.d.ts.map +1 -0
- package/dist/providers/types.js +2 -0
- package/dist/providers/types.js.map +1 -0
- package/dist/runtime/batch.d.ts +183 -0
- package/dist/runtime/batch.d.ts.map +1 -0
- package/dist/runtime/batch.js +240 -0
- package/dist/runtime/batch.js.map +1 -0
- package/dist/runtime/batch.test.d.ts +2 -0
- package/dist/runtime/batch.test.d.ts.map +1 -0
- package/dist/runtime/batch.test.js +251 -0
- package/dist/runtime/batch.test.js.map +1 -0
- package/dist/runtime/chunking.d.ts +42 -0
- package/dist/runtime/chunking.d.ts.map +1 -0
- package/dist/runtime/chunking.js +264 -0
- package/dist/runtime/chunking.js.map +1 -0
- package/dist/runtime/chunking.test.d.ts +2 -0
- package/dist/runtime/chunking.test.d.ts.map +1 -0
- package/dist/runtime/chunking.test.js +212 -0
- package/dist/runtime/chunking.test.js.map +1 -0
- package/dist/runtime/embeddings.d.ts +147 -0
- package/dist/runtime/embeddings.d.ts.map +1 -0
- package/dist/runtime/embeddings.js +201 -0
- package/dist/runtime/embeddings.js.map +1 -0
- package/dist/runtime/embeddings.test.d.ts +2 -0
- package/dist/runtime/embeddings.test.d.ts.map +1 -0
- package/dist/runtime/embeddings.test.js +366 -0
- package/dist/runtime/embeddings.test.js.map +1 -0
- package/dist/runtime/index.d.ts +14 -0
- package/dist/runtime/index.d.ts.map +1 -0
- package/dist/runtime/index.js +18 -0
- package/dist/runtime/index.js.map +1 -0
- package/dist/runtime/search.d.ts +135 -0
- package/dist/runtime/search.d.ts.map +1 -0
- package/dist/runtime/search.js +101 -0
- package/dist/runtime/search.js.map +1 -0
- package/dist/storage/index.d.ts +41 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/dist/storage/index.js +73 -0
- package/dist/storage/index.js.map +1 -0
- package/dist/storage/json.d.ts +34 -0
- package/dist/storage/json.d.ts.map +1 -0
- package/dist/storage/json.js +82 -0
- package/dist/storage/json.js.map +1 -0
- package/dist/storage/pgvector.d.ts +53 -0
- package/dist/storage/pgvector.d.ts.map +1 -0
- package/dist/storage/pgvector.js +168 -0
- package/dist/storage/pgvector.js.map +1 -0
- package/dist/storage/sqlite-vss.d.ts +49 -0
- package/dist/storage/sqlite-vss.d.ts.map +1 -0
- package/dist/storage/sqlite-vss.js +148 -0
- package/dist/storage/sqlite-vss.js.map +1 -0
- package/dist/storage/storage.test.d.ts +2 -0
- package/dist/storage/storage.test.d.ts.map +1 -0
- package/dist/storage/storage.test.js +440 -0
- package/dist/storage/storage.test.js.map +1 -0
- package/dist/storage/types.d.ts +79 -0
- package/dist/storage/types.d.ts.map +1 -0
- package/dist/storage/types.js +49 -0
- package/dist/storage/types.js.map +1 -0
- package/package.json +82 -0
- package/src/config/index.ts +116 -0
- package/src/config/plugin.test.ts +664 -0
- package/src/config/plugin.ts +257 -0
- package/src/config/types.ts +283 -0
- package/src/fields/embedding.test.ts +408 -0
- package/src/fields/embedding.ts +150 -0
- package/src/fields/index.ts +6 -0
- package/src/index.ts +33 -0
- package/src/mcp/index.ts +21 -0
- package/src/providers/index.ts +81 -0
- package/src/providers/ollama.ts +186 -0
- package/src/providers/openai.ts +161 -0
- package/src/providers/providers.test.ts +275 -0
- package/src/providers/types.ts +100 -0
- package/src/runtime/batch.test.ts +332 -0
- package/src/runtime/batch.ts +424 -0
- package/src/runtime/chunking.test.ts +258 -0
- package/src/runtime/chunking.ts +334 -0
- package/src/runtime/embeddings.test.ts +441 -0
- package/src/runtime/embeddings.ts +380 -0
- package/src/runtime/index.ts +51 -0
- package/src/runtime/search.ts +243 -0
- package/src/storage/index.ts +86 -0
- package/src/storage/json.ts +106 -0
- package/src/storage/pgvector.ts +206 -0
- package/src/storage/sqlite-vss.ts +193 -0
- package/src/storage/storage.test.ts +521 -0
- package/src/storage/types.ts +126 -0
- package/tsconfig.json +13 -0
- package/tsconfig.tsbuildinfo +1 -0
- package/vitest.config.ts +18 -0
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text chunking utilities for splitting documents into smaller segments
|
|
3
|
+
* suitable for embedding generation.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * Name of the algorithm `chunkText` uses to split a document:
 * - `'recursive'`: split on a prioritised list of separators, falling back to finer ones
 * - `'sentence'`: pack whole sentences together up to the target size
 * - `'sliding-window'`: fixed-size windows that advance by `chunkSize - chunkOverlap`
 * - `'token-aware'`: recursive splitting sized from an estimated token budget
 */
export type ChunkingStrategy = 'recursive' | 'sentence' | 'sliding-window' | 'token-aware'
|
|
7
|
+
|
|
8
|
+
export interface ChunkingOptions {
  /**
   * Maximum chunk length in characters; `chunkText` falls back to 1000.
   * With the 'token-aware' strategy and no `tokenLimit`, this number is used
   * as the token budget instead.
   */
  chunkSize?: number
  /**
   * Amount of text repeated between consecutive chunks, in characters;
   * `chunkText` falls back to 200 and rejects an overlap that is not smaller
   * than the chunk size. The 'token-aware' strategy scales this value by its
   * characters-per-token estimate, i.e. treats it as a token count.
   */
  chunkOverlap?: number
  /** Splitting algorithm; `chunkText` falls back to 'recursive' */
  strategy?: ChunkingStrategy
  /** Separators tried by the 'recursive' strategy, most preferred first */
  separators?: string[]
  /** Per-chunk token budget, only read by the 'token-aware' strategy */
  tokenLimit?: number
}
|
|
20
|
+
|
|
21
|
+
export interface TextChunk {
  /** Text carried by this chunk */
  text: string
  /** Character offset in the source text where the chunk begins (inclusive) */
  start: number
  /** Character offset in the source text where the chunk stops (exclusive) */
  end: number
  /** Zero-based position of the chunk within the produced list */
  index: number
  /** Optional free-form data attached to the chunk */
  metadata?: Record<string, unknown>
}
|
|
33
|
+
|
|
34
|
+
/**
 * Separators tried in order by the recursive strategy: paragraph break, line
 * break, sentence end, single space, and finally the empty string (which
 * splits the text into individual UTF-16 code units).
 */
const DEFAULT_SEPARATORS = ['\n\n', '\n', '. ', ' ', '']
|
|
35
|
+
|
|
36
|
+
/**
 * Split text into chunks using the specified strategy.
 *
 * Defaults: 1000-character chunks, an overlap of 200, the 'recursive'
 * strategy and the built-in separator list.
 *
 * The overlap is validated against the size limit that is actually in effect:
 * `tokenLimit` for the 'token-aware' strategy when it is set to a non-zero
 * value, `chunkSize` in every other case.
 *
 * @param text - Source text; empty or whitespace-only input yields no chunks
 * @param options - Chunking configuration
 * @returns The chunks produced by the selected strategy
 * @throws Error when `chunkOverlap` is not smaller than the effective size
 *   limit, or when `strategy` is not one of the known strategies
 */
export function chunkText(text: string, options: ChunkingOptions = {}): TextChunk[] {
  const {
    chunkSize = 1000,
    chunkOverlap = 200,
    strategy = 'recursive',
    separators = DEFAULT_SEPARATORS,
    tokenLimit,
  } = options

  // Handle empty text early
  if (!text || text.trim().length === 0) {
    return []
  }

  // Only the token-aware strategy reads tokenLimit; an unset or zero value
  // falls back to chunkSize, exactly as before.
  const tokenBudget = strategy === 'token-aware' && tokenLimit ? tokenLimit : undefined
  const sizeLimit = tokenBudget === undefined ? chunkSize : tokenBudget

  if (chunkOverlap >= sizeLimit) {
    throw new Error(
      tokenBudget === undefined
        ? 'chunkOverlap must be less than chunkSize'
        : 'chunkOverlap must be less than tokenLimit',
    )
  }

  switch (strategy) {
    case 'recursive':
      return recursiveChunk(text, chunkSize, chunkOverlap, separators)
    case 'sentence':
      return sentenceChunk(text, chunkSize, chunkOverlap)
    case 'sliding-window':
      return slidingWindowChunk(text, chunkSize, chunkOverlap)
    case 'token-aware':
      return tokenAwareChunk(text, sizeLimit, chunkOverlap)
    default:
      throw new Error(`Unknown chunking strategy: ${strategy}`)
  }
}
|
|
70
|
+
|
|
71
|
+
/**
 * Recursive text splitting - tries the first separator (by default paragraphs,
 * then lines, sentences, words and single characters) and only falls back to
 * the next separator for pieces that are still larger than `chunkSize`.
 *
 * @param text - Text to split
 * @param chunkSize - Maximum chunk length in characters
 * @param overlap - Number of trailing characters of a chunk that are repeated
 *   at the start of the following chunk
 * @param separators - Separators in priority order
 * @returns Non-blank chunks in document order; `start`/`end` are offsets into
 *   `text` such that `text.slice(start, end) === chunk.text`
 */
function recursiveChunk(
  text: string,
  chunkSize: number,
  overlap: number,
  separators: string[],
): TextChunk[] {
  const chunks: TextChunk[] = []

  // Record `content` (located at `startPos` in the original text) unless it is blank
  function emit(content: string, startPos: number): void {
    if (content.trim()) {
      chunks.push({
        text: content,
        start: startPos,
        end: startPos + content.length,
        index: chunks.length,
      })
    }
  }

  // Fixed-width fallback used once every separator has been tried
  function forceSplit(content: string, startPos: number): void {
    const stride = chunkSize - overlap
    // Guarantee forward progress even if an overlap >= chunkSize slips through
    const step = stride > 0 ? stride : Math.max(chunkSize, 1)

    let pos = 0
    while (pos < content.length) {
      const end = Math.min(pos + chunkSize, content.length)
      emit(content.slice(pos, end), startPos + pos)
      // The window reached the end of the content: stepping further would
      // only re-emit text that the last chunk already covers
      if (end === content.length) break
      pos += step
    }
  }

  function splitRecursive(content: string, startPos: number, sepIndex: number): void {
    if (content.length <= chunkSize) {
      emit(content, startPos)
      return
    }

    if (sepIndex >= separators.length) {
      // No more separators, force split at chunkSize
      forceSplit(content, startPos)
      return
    }

    const separator = separators[sepIndex]
    const parts = content.split(separator)

    let currentChunk = ''
    let chunkStart = startPos

    // Emit the accumulated text, or split it with the next separator when it
    // is still too large (happens when a single part exceeds chunkSize)
    const flush = (): void => {
      if (!currentChunk.trim()) return
      if (currentChunk.length > chunkSize) {
        splitRecursive(currentChunk, chunkStart, sepIndex + 1)
      } else {
        emit(currentChunk, chunkStart)
      }
    }

    for (let i = 0; i < parts.length; i++) {
      // Re-attach the separator so the parts concatenate back to `content`
      const part = parts[i] + (i < parts.length - 1 ? separator : '')

      if (currentChunk.length + part.length <= chunkSize) {
        currentChunk += part
        continue
      }

      flush()
      // Always move past the flushed text, even when it was blank and not
      // emitted; otherwise every later offset is shifted
      chunkStart += currentChunk.length

      // Handle overlap: carry the tail of the flushed text into the next chunk
      if (overlap > 0 && currentChunk.length >= overlap) {
        currentChunk = currentChunk.slice(-overlap) + part
        chunkStart -= overlap
      } else {
        currentChunk = part
      }
    }

    flush()
  }

  splitRecursive(text, 0, 0)
  return chunks
}
|
|
169
|
+
|
|
170
|
+
/**
 * Sentence-based chunking - preserves sentence boundaries.
 *
 * A sentence is a run of text closed by one or more of `.`, `!` or `?`. Text
 * left over after the last terminator is treated as a final sentence so that
 * it is not dropped. Sentences are packed into a chunk until adding the next
 * one would exceed `chunkSize`; a single sentence longer than `chunkSize` is
 * kept whole.
 *
 * @param text - Text to split
 * @param chunkSize - Target chunk length in characters
 * @param overlap - Character budget for the whole trailing sentences that are
 *   repeated at the start of the next chunk
 * @returns Chunks in document order with offsets into `text`
 */
function sentenceChunk(text: string, chunkSize: number, overlap: number): TextChunk[] {
  type SentenceSpan = { text: string; start: number; end: number }

  const chunks: TextChunk[] = []

  // Split into sentences (simple regex, can be improved). The `$` alternative
  // captures a trailing fragment that has no closing punctuation.
  const sentenceRegex = /[^.!?]+(?:[.!?]+|$)/g
  const sentences: SentenceSpan[] = []

  let match: RegExpExecArray | null
  while ((match = sentenceRegex.exec(text)) !== null) {
    // Only the unterminated tail can be blank; it carries no content
    if (!match[0].trim()) continue
    sentences.push({
      text: match[0],
      start: match.index,
      end: match.index + match[0].length,
    })
  }

  if (sentences.length === 0) {
    // No sentences found, return whole text as one chunk
    return [
      {
        text: text,
        start: 0,
        end: text.length,
        index: 0,
      },
    ]
  }

  // Turn a non-empty group of consecutive sentences into one chunk
  const pushChunk = (group: SentenceSpan[]): void => {
    chunks.push({
      text: group.map((s) => s.text).join(''),
      start: group[0].start,
      end: group[group.length - 1].end,
      index: chunks.length,
    })
  }

  let currentChunk: SentenceSpan[] = []
  let currentLength = 0

  for (const sentence of sentences) {
    if (currentLength + sentence.text.length > chunkSize && currentChunk.length > 0) {
      pushChunk(currentChunk)

      // Seed the next chunk with as many trailing whole sentences as fit in
      // the overlap budget
      const overlapSentences: SentenceSpan[] = []
      let overlapLength = 0
      if (overlap > 0) {
        for (let j = currentChunk.length - 1; j >= 0; j--) {
          const candidate = currentChunk[j]
          if (overlapLength + candidate.text.length > overlap) break
          overlapSentences.unshift(candidate)
          overlapLength += candidate.text.length
        }
      }

      currentChunk = overlapSentences
      currentLength = overlapLength
    }

    currentChunk.push(sentence)
    currentLength += sentence.text.length
  }

  // Add final chunk
  if (currentChunk.length > 0) {
    pushChunk(currentChunk)
  }

  return chunks
}
|
|
256
|
+
|
|
257
|
+
/**
 * Sliding window chunking - cuts windows of `chunkSize` characters, each one
 * starting `chunkSize - overlap` characters after the previous window.
 * Windows that contain only whitespace are not reported.
 */
function slidingWindowChunk(text: string, chunkSize: number, overlap: number): TextChunk[] {
  const windows: TextChunk[] = []
  const stride = chunkSize - overlap
  let offset = 0

  while (offset < text.length) {
    const stop = Math.min(offset + chunkSize, text.length)
    const piece = text.slice(offset, stop)

    if (piece.trim()) {
      windows.push({ text: piece, start: offset, end: stop, index: windows.length })
    }

    // The window touched the end of the text, nothing is left to cover
    if (stop === text.length) {
      break
    }

    offset += stride
  }

  return windows
}
|
|
283
|
+
|
|
284
|
+
/**
 * Token-aware chunking - converts a token budget into a character budget with
 * a rough estimate of ~4 characters per token (the real ratio depends on the
 * tokenizer) and hands the work to the recursive splitter with the default
 * separators. The overlap is scaled by the same factor.
 */
function tokenAwareChunk(text: string, tokenLimit: number, overlap: number): TextChunk[] {
  const CHARS_PER_TOKEN = 4 // Rough estimate
  const toChars = (tokens: number): number => tokens * CHARS_PER_TOKEN

  return recursiveChunk(text, toChars(tokenLimit), toChars(overlap), DEFAULT_SEPARATORS)
}
|
|
296
|
+
|
|
297
|
+
/**
 * Approximate the number of tokens in a text, assuming ~4 characters per
 * token and rounding up.
 */
export function estimateTokenCount(text: string): number {
  const CHARS_PER_TOKEN = 4
  const charCount = text.length
  return Math.ceil(charCount / CHARS_PER_TOKEN)
}
|
|
304
|
+
|
|
305
|
+
/**
 * Merge small chunks to improve efficiency.
 *
 * Walks the chunks in order and folds every chunk whose text is shorter than
 * `minSize` into the chunk that follows it. The last chunk has no successor,
 * so it is kept as is even when it is shorter than `minSize`.
 *
 * When two merged chunks overlap (the next chunk starts before the current
 * one ends and really repeats its tail), the shared text is kept only once so
 * the merged text still matches its `start`/`end` span.
 *
 * @param chunks - Chunks ordered by position in the source text
 * @param minSize - Minimum text length, in characters, a chunk needs to stay on its own
 * @returns A new array of chunks, re-indexed from 0; the input objects are not modified
 */
export function mergeSmallChunks(chunks: TextChunk[], minSize: number): TextChunk[] {
  if (chunks.length === 0) return []

  const merged: TextChunk[] = []
  // Copy the first chunk so the caller's object is not aliased, and number it
  // like every other chunk in the result
  let current: TextChunk = { ...chunks[0], index: 0 }

  for (let i = 1; i < chunks.length; i++) {
    const next = chunks[i]

    if (current.text.length < minSize) {
      // Merge with next chunk
      current = {
        text: joinChunkText(current, next),
        start: current.start,
        end: next.end,
        index: merged.length,
        metadata: { ...current.metadata, ...next.metadata },
      }
    } else {
      merged.push(current)
      current = { ...next, index: merged.length }
    }
  }

  merged.push(current)
  return merged
}

/**
 * Concatenate the texts of two consecutive chunks, dropping the prefix of
 * `next` that repeats the tail of `current`. The offsets are only trusted when
 * the texts confirm the overlap; otherwise the texts are joined unchanged.
 */
function joinChunkText(current: TextChunk, next: TextChunk): string {
  const shared = current.end - next.start
  const repeatsTail =
    shared > 0 &&
    shared <= next.text.length &&
    current.text.endsWith(next.text.slice(0, shared))

  return repeatsTail ? current.text + next.text.slice(shared) : current.text + next.text
}
|