@opensaas/stack-rag 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. package/.turbo/turbo-build.log +4 -0
  2. package/CHANGELOG.md +10 -0
  3. package/CLAUDE.md +565 -0
  4. package/LICENSE +21 -0
  5. package/README.md +406 -0
  6. package/dist/config/index.d.ts +63 -0
  7. package/dist/config/index.d.ts.map +1 -0
  8. package/dist/config/index.js +94 -0
  9. package/dist/config/index.js.map +1 -0
  10. package/dist/config/plugin.d.ts +38 -0
  11. package/dist/config/plugin.d.ts.map +1 -0
  12. package/dist/config/plugin.js +215 -0
  13. package/dist/config/plugin.js.map +1 -0
  14. package/dist/config/plugin.test.d.ts +2 -0
  15. package/dist/config/plugin.test.d.ts.map +1 -0
  16. package/dist/config/plugin.test.js +554 -0
  17. package/dist/config/plugin.test.js.map +1 -0
  18. package/dist/config/types.d.ts +249 -0
  19. package/dist/config/types.d.ts.map +1 -0
  20. package/dist/config/types.js +5 -0
  21. package/dist/config/types.js.map +1 -0
  22. package/dist/fields/embedding.d.ts +85 -0
  23. package/dist/fields/embedding.d.ts.map +1 -0
  24. package/dist/fields/embedding.js +81 -0
  25. package/dist/fields/embedding.js.map +1 -0
  26. package/dist/fields/embedding.test.d.ts +2 -0
  27. package/dist/fields/embedding.test.d.ts.map +1 -0
  28. package/dist/fields/embedding.test.js +323 -0
  29. package/dist/fields/embedding.test.js.map +1 -0
  30. package/dist/fields/index.d.ts +6 -0
  31. package/dist/fields/index.d.ts.map +1 -0
  32. package/dist/fields/index.js +5 -0
  33. package/dist/fields/index.js.map +1 -0
  34. package/dist/index.d.ts +8 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +9 -0
  37. package/dist/index.js.map +1 -0
  38. package/dist/mcp/index.d.ts +19 -0
  39. package/dist/mcp/index.d.ts.map +1 -0
  40. package/dist/mcp/index.js +18 -0
  41. package/dist/mcp/index.js.map +1 -0
  42. package/dist/providers/index.d.ts +38 -0
  43. package/dist/providers/index.d.ts.map +1 -0
  44. package/dist/providers/index.js +68 -0
  45. package/dist/providers/index.js.map +1 -0
  46. package/dist/providers/ollama.d.ts +49 -0
  47. package/dist/providers/ollama.d.ts.map +1 -0
  48. package/dist/providers/ollama.js +151 -0
  49. package/dist/providers/ollama.js.map +1 -0
  50. package/dist/providers/openai.d.ts +41 -0
  51. package/dist/providers/openai.d.ts.map +1 -0
  52. package/dist/providers/openai.js +126 -0
  53. package/dist/providers/openai.js.map +1 -0
  54. package/dist/providers/providers.test.d.ts +2 -0
  55. package/dist/providers/providers.test.d.ts.map +1 -0
  56. package/dist/providers/providers.test.js +224 -0
  57. package/dist/providers/providers.test.js.map +1 -0
  58. package/dist/providers/types.d.ts +88 -0
  59. package/dist/providers/types.d.ts.map +1 -0
  60. package/dist/providers/types.js +2 -0
  61. package/dist/providers/types.js.map +1 -0
  62. package/dist/runtime/batch.d.ts +183 -0
  63. package/dist/runtime/batch.d.ts.map +1 -0
  64. package/dist/runtime/batch.js +240 -0
  65. package/dist/runtime/batch.js.map +1 -0
  66. package/dist/runtime/batch.test.d.ts +2 -0
  67. package/dist/runtime/batch.test.d.ts.map +1 -0
  68. package/dist/runtime/batch.test.js +251 -0
  69. package/dist/runtime/batch.test.js.map +1 -0
  70. package/dist/runtime/chunking.d.ts +42 -0
  71. package/dist/runtime/chunking.d.ts.map +1 -0
  72. package/dist/runtime/chunking.js +264 -0
  73. package/dist/runtime/chunking.js.map +1 -0
  74. package/dist/runtime/chunking.test.d.ts +2 -0
  75. package/dist/runtime/chunking.test.d.ts.map +1 -0
  76. package/dist/runtime/chunking.test.js +212 -0
  77. package/dist/runtime/chunking.test.js.map +1 -0
  78. package/dist/runtime/embeddings.d.ts +147 -0
  79. package/dist/runtime/embeddings.d.ts.map +1 -0
  80. package/dist/runtime/embeddings.js +201 -0
  81. package/dist/runtime/embeddings.js.map +1 -0
  82. package/dist/runtime/embeddings.test.d.ts +2 -0
  83. package/dist/runtime/embeddings.test.d.ts.map +1 -0
  84. package/dist/runtime/embeddings.test.js +366 -0
  85. package/dist/runtime/embeddings.test.js.map +1 -0
  86. package/dist/runtime/index.d.ts +14 -0
  87. package/dist/runtime/index.d.ts.map +1 -0
  88. package/dist/runtime/index.js +18 -0
  89. package/dist/runtime/index.js.map +1 -0
  90. package/dist/runtime/search.d.ts +135 -0
  91. package/dist/runtime/search.d.ts.map +1 -0
  92. package/dist/runtime/search.js +101 -0
  93. package/dist/runtime/search.js.map +1 -0
  94. package/dist/storage/index.d.ts +41 -0
  95. package/dist/storage/index.d.ts.map +1 -0
  96. package/dist/storage/index.js +73 -0
  97. package/dist/storage/index.js.map +1 -0
  98. package/dist/storage/json.d.ts +34 -0
  99. package/dist/storage/json.d.ts.map +1 -0
  100. package/dist/storage/json.js +82 -0
  101. package/dist/storage/json.js.map +1 -0
  102. package/dist/storage/pgvector.d.ts +53 -0
  103. package/dist/storage/pgvector.d.ts.map +1 -0
  104. package/dist/storage/pgvector.js +168 -0
  105. package/dist/storage/pgvector.js.map +1 -0
  106. package/dist/storage/sqlite-vss.d.ts +49 -0
  107. package/dist/storage/sqlite-vss.d.ts.map +1 -0
  108. package/dist/storage/sqlite-vss.js +148 -0
  109. package/dist/storage/sqlite-vss.js.map +1 -0
  110. package/dist/storage/storage.test.d.ts +2 -0
  111. package/dist/storage/storage.test.d.ts.map +1 -0
  112. package/dist/storage/storage.test.js +440 -0
  113. package/dist/storage/storage.test.js.map +1 -0
  114. package/dist/storage/types.d.ts +79 -0
  115. package/dist/storage/types.d.ts.map +1 -0
  116. package/dist/storage/types.js +49 -0
  117. package/dist/storage/types.js.map +1 -0
  118. package/package.json +82 -0
  119. package/src/config/index.ts +116 -0
  120. package/src/config/plugin.test.ts +664 -0
  121. package/src/config/plugin.ts +257 -0
  122. package/src/config/types.ts +283 -0
  123. package/src/fields/embedding.test.ts +408 -0
  124. package/src/fields/embedding.ts +150 -0
  125. package/src/fields/index.ts +6 -0
  126. package/src/index.ts +33 -0
  127. package/src/mcp/index.ts +21 -0
  128. package/src/providers/index.ts +81 -0
  129. package/src/providers/ollama.ts +186 -0
  130. package/src/providers/openai.ts +161 -0
  131. package/src/providers/providers.test.ts +275 -0
  132. package/src/providers/types.ts +100 -0
  133. package/src/runtime/batch.test.ts +332 -0
  134. package/src/runtime/batch.ts +424 -0
  135. package/src/runtime/chunking.test.ts +258 -0
  136. package/src/runtime/chunking.ts +334 -0
  137. package/src/runtime/embeddings.test.ts +441 -0
  138. package/src/runtime/embeddings.ts +380 -0
  139. package/src/runtime/index.ts +51 -0
  140. package/src/runtime/search.ts +243 -0
  141. package/src/storage/index.ts +86 -0
  142. package/src/storage/json.ts +106 -0
  143. package/src/storage/pgvector.ts +206 -0
  144. package/src/storage/sqlite-vss.ts +193 -0
  145. package/src/storage/storage.test.ts +521 -0
  146. package/src/storage/types.ts +126 -0
  147. package/tsconfig.json +13 -0
  148. package/tsconfig.tsbuildinfo +1 -0
  149. package/vitest.config.ts +18 -0
@@ -0,0 +1,334 @@
1
+ /**
2
+ * Text chunking utilities for splitting documents into smaller segments
3
+ * suitable for embedding generation.
4
+ */
5
+
6
/**
 * Names of the splitting algorithms that `chunkText` can dispatch to.
 */
export type ChunkingStrategy =
  | 'recursive'
  | 'sentence'
  | 'sliding-window'
  | 'token-aware'
7
+
8
/**
 * Tuning knobs accepted by `chunkText`. Every field is optional.
 */
export interface ChunkingOptions {
  /**
   * Target chunk size in characters.
   * `chunkText` uses 1000 when this is omitted.
   */
  chunkSize?: number
  /**
   * Overlap between consecutive chunks in characters.
   * `chunkText` uses 200 when this is omitted and throws if the value is not
   * smaller than `chunkSize`. The token-aware strategy scales this value by
   * the same characters-per-token factor it applies to the token limit.
   */
  chunkOverlap?: number
  /**
   * Strategy for chunking text.
   * `chunkText` uses `'recursive'` when this is omitted.
   */
  strategy?: ChunkingStrategy
  /**
   * Separators for the recursive strategy, in priority order.
   * `chunkText` falls back to the module's default separator list.
   */
  separators?: string[]
  /**
   * Token limit for the token-aware strategy.
   * When unset (or 0), `chunkText` passes `chunkSize` in its place.
   */
  tokenLimit?: number
}
20
+
21
/**
 * One segment produced by the chunkers in this module.
 */
export interface TextChunk {
  /**
   * The chunked text content.
   */
  text: string
  /**
   * Start position in the original text.
   */
  start: number
  /**
   * End position in the original text. The chunkers in this module compute
   * it as the offset just past the chunk's last character.
   */
  end: number
  /**
   * Chunk index: the chunk's position in the returned array.
   */
  index: number
  /**
   * Optional free-form metadata about the chunk. The chunkers in this module
   * never set it; `mergeSmallChunks` combines it when merging.
   */
  metadata?: Record<string, unknown>
}
33
+
34
/**
 * Separator fallback order for recursive splitting: paragraph break, line
 * break, sentence end, single space, and finally the empty string (which
 * makes `String.prototype.split` cut between individual UTF-16 code units).
 */
const DEFAULT_SEPARATORS = [
  '\n\n',
  '\n',
  '. ',
  ' ',
  '',
]
35
+
36
/**
 * Split text into chunks using the requested strategy.
 *
 * Defaults: 1000-character chunks, 200 characters of overlap, the
 * `'recursive'` strategy and the module's default separator list.
 *
 * @param text - Text to split. Empty or whitespace-only input yields `[]`.
 * @param options - Optional tuning; see `ChunkingOptions`.
 * @returns The chunks produced by the selected strategy.
 * @throws Error when `chunkOverlap` is not smaller than `chunkSize`, or when
 *   `strategy` is not one of the known strategy names.
 */
export function chunkText(text: string, options: ChunkingOptions = {}): TextChunk[] {
  const {
    chunkSize: size = 1000,
    chunkOverlap: overlapChars = 200,
    strategy: mode = 'recursive',
    separators: splitOn = DEFAULT_SEPARATORS,
    tokenLimit,
  } = options

  // Nothing to chunk: bail out before validating the options.
  const isBlank = !text || text.trim().length === 0
  if (isBlank) {
    return []
  }

  // Note: this comparison uses chunkSize for every strategy, token-aware included.
  if (overlapChars >= size) {
    throw new Error('chunkOverlap must be less than chunkSize')
  }

  if (mode === 'recursive') {
    return recursiveChunk(text, size, overlapChars, splitOn)
  }
  if (mode === 'sentence') {
    return sentenceChunk(text, size, overlapChars)
  }
  if (mode === 'sliding-window') {
    return slidingWindowChunk(text, size, overlapChars)
  }
  if (mode === 'token-aware') {
    // A missing (or zero) tokenLimit falls back to the character chunk size.
    return tokenAwareChunk(text, tokenLimit || size, overlapChars)
  }

  throw new Error(`Unknown chunking strategy: ${mode}`)
}
70
+
71
/**
 * Recursive text splitting - tries each separator in priority order
 * (paragraphs, then lines, then sentences, then words with the default list)
 * and only falls back to a hard cut when no separator is left.
 *
 * Pieces produced by a separator are packed greedily into chunks of at most
 * `chunkSize` characters. A packed chunk that is still too long (a single
 * oversized piece) is split again with the next separator. Whitespace-only
 * chunks are never emitted, but they still count toward offsets so every
 * emitted chunk satisfies `text.slice(chunk.start, chunk.end) === chunk.text`.
 *
 * @param text - Text to split.
 * @param chunkSize - Maximum chunk length in characters.
 * @param overlap - Number of trailing characters of a flushed chunk that are
 *   repeated at the start of the next one.
 * @param separators - Separators to try, in priority order.
 * @returns Chunks in document order with sequential `index` values.
 */
function recursiveChunk(
  text: string,
  chunkSize: number,
  overlap: number,
  separators: string[],
): TextChunk[] {
  const chunks: TextChunk[] = []

  // Record `content` (located at `start` in the original text) unless it is blank.
  function emit(content: string, start: number): void {
    if (content.trim()) {
      chunks.push({
        text: content,
        start,
        end: start + content.length,
        index: chunks.length,
      })
    }
  }

  function splitRecursive(content: string, startPos: number, sepIndex: number): void {
    if (content.length <= chunkSize) {
      emit(content, startPos)
      return
    }

    if (sepIndex >= separators.length) {
      // No more separators, force split at chunkSize.
      // The step is clamped so an overlap >= chunkSize can never stall the loop.
      const step = Math.max(1, chunkSize - overlap)
      for (let pos = 0; pos < content.length; pos += step) {
        const end = Math.min(pos + chunkSize, content.length)
        emit(content.slice(pos, end), startPos + pos)
        // Stop once the content is covered; a further window would only
        // repeat the tail of the chunk that was just emitted.
        if (end === content.length) break
      }
      return
    }

    const separator = separators[sepIndex]
    const parts = content.split(separator)

    let currentChunk = ''
    let chunkStart = startPos

    for (let i = 0; i < parts.length; i++) {
      // Re-attach the separator that split() removed (except after the last part).
      const part = parts[i] + (i < parts.length - 1 ? separator : '')

      if (currentChunk.length + part.length <= chunkSize) {
        currentChunk += part
        continue
      }

      // Flush what has been accumulated. splitRecursive emits it directly
      // when it fits and otherwise retries with the next separator.
      if (currentChunk.trim()) {
        splitRecursive(currentChunk, chunkStart, sepIndex + 1)
      }
      // Advance past the flushed text even when it was blank and therefore
      // not emitted; otherwise all later offsets would drift.
      chunkStart += currentChunk.length

      // Handle overlap: carry the tail of the flushed text into the next chunk.
      if (overlap > 0 && currentChunk.length >= overlap) {
        currentChunk = currentChunk.slice(-overlap) + part
        chunkStart -= overlap
      } else {
        currentChunk = part
      }
    }

    if (currentChunk.trim()) {
      splitRecursive(currentChunk, chunkStart, sepIndex + 1)
    }
  }

  splitRecursive(text, 0, 0)
  return chunks
}
169
+
170
/**
 * Sentence-based chunking - preserves sentence boundaries.
 *
 * A sentence is a run of non-terminator characters followed by one or more
 * of `.`, `!` or `?`. Text left over after the last terminator is kept as a
 * final sentence (unless it is only whitespace) so that an unterminated
 * closing sentence is not lost. Sentences are packed greedily; a chunk is
 * flushed when adding the next sentence would exceed `chunkSize`. A single
 * sentence longer than `chunkSize` is emitted as one oversized chunk.
 *
 * @param text - Text to split.
 * @param chunkSize - Target chunk size in characters.
 * @param overlap - Character budget for trailing sentences of a flushed
 *   chunk that are repeated at the start of the next chunk (whole sentences
 *   only).
 * @returns Chunks in document order. When no sentence is found the whole
 *   text is returned as a single chunk.
 */
function sentenceChunk(text: string, chunkSize: number, overlap: number): TextChunk[] {
  type Sentence = { text: string; start: number; end: number }

  const chunks: TextChunk[] = []

  // Split into sentences (simple regex, can be improved)
  const sentenceRegex = /[^.!?]+[.!?]+/g
  const sentences: Sentence[] = []
  let consumed = 0

  let match: RegExpExecArray | null
  while ((match = sentenceRegex.exec(text)) !== null) {
    const matchEnd = match.index + match[0].length
    sentences.push({ text: match[0], start: match.index, end: matchEnd })
    consumed = matchEnd
  }

  // The regex requires a terminator, so anything after the last one is not
  // matched. Keep it as a final sentence instead of dropping it.
  const tail = text.slice(consumed)
  if (tail.trim()) {
    sentences.push({ text: tail, start: consumed, end: text.length })
  }

  if (sentences.length === 0) {
    // No sentences found, return whole text as one chunk
    return [
      {
        text: text,
        start: 0,
        end: text.length,
        index: 0,
      },
    ]
  }

  // Turn the sentences collected so far into a chunk.
  const flush = (group: Sentence[]): void => {
    chunks.push({
      text: group.map((s) => s.text).join(''),
      start: group[0].start,
      end: group[group.length - 1].end,
      index: chunks.length,
    })
  }

  let currentChunk: Sentence[] = []
  let currentLength = 0

  for (const sentence of sentences) {
    if (currentLength + sentence.text.length > chunkSize && currentChunk.length > 0) {
      flush(currentChunk)

      if (overlap > 0) {
        // Walk backwards, keeping whole sentences while they fit the overlap budget.
        let overlapLength = 0
        const overlapSentences: Sentence[] = []

        for (let j = currentChunk.length - 1; j >= 0; j--) {
          const candidate = currentChunk[j]
          if (overlapLength + candidate.text.length > overlap) break
          overlapSentences.unshift(candidate)
          overlapLength += candidate.text.length
        }

        currentChunk = overlapSentences
        currentLength = overlapLength
      } else {
        currentChunk = []
        currentLength = 0
      }
    }

    currentChunk.push(sentence)
    currentLength += sentence.text.length
  }

  // Add final chunk
  if (currentChunk.length > 0) {
    flush(currentChunk)
  }

  return chunks
}
256
+
257
/**
 * Sliding window chunking - cuts fixed-size windows that advance by
 * `chunkSize - overlap` characters each time.
 *
 * Windows that contain only whitespace are skipped. The walk ends as soon as
 * a window reaches the end of the text.
 *
 * @param text - Text to split.
 * @param chunkSize - Window length in characters.
 * @param overlap - Characters shared by consecutive windows.
 * @returns The non-blank windows in document order.
 */
function slidingWindowChunk(text: string, chunkSize: number, overlap: number): TextChunk[] {
  const stride = chunkSize - overlap
  const windows: TextChunk[] = []

  let offset = 0
  while (offset < text.length) {
    const stop = Math.min(offset + chunkSize, text.length)
    const piece = text.slice(offset, stop)

    if (piece.trim()) {
      windows.push({ text: piece, start: offset, end: stop, index: windows.length })
    }

    // The last window touched the end of the text: nothing left to cover.
    if (stop === text.length) {
      break
    }

    offset += stride
  }

  return windows
}
283
+
284
/**
 * Token-aware chunking - converts a token budget into a character budget and
 * delegates to the recursive strategy with the default separators.
 *
 * The conversion uses a rough estimate of ~4 characters per token (the real
 * ratio depends on the tokenizer). Both the limit and the overlap are scaled
 * by that factor.
 *
 * @param text - Text to split.
 * @param tokenLimit - Approximate maximum tokens per chunk.
 * @param overlap - Overlap, scaled by the same factor as `tokenLimit`.
 * @returns Chunks produced by the recursive strategy.
 */
function tokenAwareChunk(text: string, tokenLimit: number, overlap: number): TextChunk[] {
  const charsPerToken = 4 // Rough estimate
  const toChars = (tokens: number): number => tokens * charsPerToken

  return recursiveChunk(text, toChars(tokenLimit), toChars(overlap), DEFAULT_SEPARATORS)
}
296
+
297
/**
 * Roughly estimate how many tokens a text holds, assuming four characters
 * per token and rounding up.
 *
 * @param text - Text to measure.
 * @returns The estimated token count; 0 for an empty string.
 */
export function estimateTokenCount(text: string): number {
  const approxCharsPerToken = 4
  const { length: charCount } = text
  return Math.ceil(charCount / approxCharsPerToken)
}
304
+
305
/**
 * Merge small chunks to improve efficiency.
 *
 * Walks the chunks in order. While the chunk being built is shorter than
 * `minSize` characters, the next chunk is appended to it; otherwise it is
 * emitted and the next chunk starts a new one. Texts are concatenated as-is
 * (overlapping input chunks therefore repeat their shared text), `start` is
 * taken from the first chunk of a merge and `end` from the last, and
 * metadata objects are shallow-merged with later chunks winning. A trailing
 * chunk may still be shorter than `minSize` because nothing follows it.
 *
 * The input array and its chunk objects are not modified.
 *
 * @param chunks - Chunks in document order.
 * @param minSize - Minimum text length (in characters) a chunk must reach
 *   before it is emitted.
 * @returns New chunks whose `index` values run sequentially from 0,
 *   regardless of the indices carried by the input.
 */
export function mergeSmallChunks(chunks: TextChunk[], minSize: number): TextChunk[] {
  const [first, ...rest] = chunks
  if (first === undefined) return []

  const merged: TextChunk[] = []
  // Copy the first chunk and reset its index so the output is always
  // numbered from 0, even when the caller passes a slice of a longer list.
  let current: TextChunk = { ...first, index: 0 }

  for (const next of rest) {
    if (current.text.length < minSize) {
      // Merge with next chunk
      current = {
        text: current.text + next.text,
        start: current.start,
        end: next.end,
        index: merged.length,
        metadata: { ...current.metadata, ...next.metadata },
      }
    } else {
      merged.push(current)
      current = { ...next, index: merged.length }
    }
  }

  merged.push(current)
  return merged
}