@opensaas/stack-rag 0.1.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +132 -0
- package/dist/config/index.d.ts.map +1 -1
- package/dist/config/index.js +9 -0
- package/dist/config/index.js.map +1 -1
- package/dist/config/plugin.d.ts.map +1 -1
- package/dist/config/plugin.js +32 -0
- package/dist/config/plugin.js.map +1 -1
- package/dist/config/plugin.test.js +70 -14
- package/dist/config/plugin.test.js.map +1 -1
- package/dist/config/types.d.ts +135 -0
- package/dist/config/types.d.ts.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/providers/openai.js +1 -1
- package/dist/providers/openai.js.map +1 -1
- package/dist/runtime/build-time.d.ts +100 -0
- package/dist/runtime/build-time.d.ts.map +1 -0
- package/dist/runtime/build-time.js +185 -0
- package/dist/runtime/build-time.js.map +1 -0
- package/dist/runtime/index.d.ts +3 -0
- package/dist/runtime/index.d.ts.map +1 -1
- package/dist/runtime/index.js +6 -0
- package/dist/runtime/index.js.map +1 -1
- package/dist/runtime/markdown.d.ts +33 -0
- package/dist/runtime/markdown.d.ts.map +1 -0
- package/dist/runtime/markdown.js +94 -0
- package/dist/runtime/markdown.js.map +1 -0
- package/dist/runtime/provider-helpers.d.ts +56 -0
- package/dist/runtime/provider-helpers.d.ts.map +1 -0
- package/dist/runtime/provider-helpers.js +95 -0
- package/dist/runtime/provider-helpers.js.map +1 -0
- package/dist/runtime/types.d.ts +29 -0
- package/dist/runtime/types.d.ts.map +1 -0
- package/dist/runtime/types.js +6 -0
- package/dist/runtime/types.js.map +1 -0
- package/dist/storage/index.d.ts +1 -0
- package/dist/storage/index.d.ts.map +1 -1
- package/dist/storage/index.js +1 -0
- package/dist/storage/index.js.map +1 -1
- package/dist/storage/json-file.d.ts +53 -0
- package/dist/storage/json-file.d.ts.map +1 -0
- package/dist/storage/json-file.js +124 -0
- package/dist/storage/json-file.js.map +1 -0
- package/dist/storage/storage.test.js +1 -0
- package/dist/storage/storage.test.js.map +1 -1
- package/package.json +6 -5
- package/src/config/index.ts +9 -0
- package/src/config/plugin.test.ts +70 -14
- package/src/config/plugin.ts +37 -0
- package/src/config/types.ts +158 -0
- package/src/index.ts +6 -0
- package/src/providers/openai.ts +1 -1
- package/src/runtime/build-time.ts +216 -0
- package/src/runtime/index.ts +18 -0
- package/src/runtime/markdown.ts +119 -0
- package/src/runtime/provider-helpers.ts +115 -0
- package/src/runtime/types.ts +30 -0
- package/src/storage/index.ts +1 -0
- package/src/storage/json-file.ts +157 -0
- package/src/storage/storage.test.ts +1 -0
- package/tsconfig.tsbuildinfo +1 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@opensaas/stack-rag",
|
|
3
|
-
"version": "0.1.7",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "RAG and AI embeddings integration for OpenSaas Stack",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -51,8 +51,8 @@
|
|
|
51
51
|
"url": "https://github.com/OpenSaasAU/stack/issues"
|
|
52
52
|
},
|
|
53
53
|
"peerDependencies": {
|
|
54
|
-
"
|
|
55
|
-
"
|
|
54
|
+
"@opensaas/stack-core": "^0",
|
|
55
|
+
"openai": "^6.8.0"
|
|
56
56
|
},
|
|
57
57
|
"peerDependenciesMeta": {
|
|
58
58
|
"openai": {
|
|
@@ -60,16 +60,17 @@
|
|
|
60
60
|
}
|
|
61
61
|
},
|
|
62
62
|
"dependencies": {
|
|
63
|
+
"dotenv": "^16.4.7",
|
|
63
64
|
"zod": "^4.1.12"
|
|
64
65
|
},
|
|
65
66
|
"devDependencies": {
|
|
66
67
|
"@types/node": "^24.7.2",
|
|
67
68
|
"@vitest/coverage-v8": "^4.0.4",
|
|
68
69
|
"@vitest/ui": "^4.0.0",
|
|
69
|
-
"openai": "^6.
|
|
70
|
+
"openai": "^6.8.0",
|
|
70
71
|
"typescript": "^5.9.3",
|
|
71
72
|
"vitest": "^4.0.0",
|
|
72
|
-
"@opensaas/stack-core": "0.
|
|
73
|
+
"@opensaas/stack-core": "0.3.0"
|
|
73
74
|
},
|
|
74
75
|
"scripts": {
|
|
75
76
|
"build": "tsc",
|
package/src/config/index.ts
CHANGED
|
@@ -19,6 +19,15 @@ export function normalizeRAGConfig(config: RAGConfig): NormalizedRAGConfig {
|
|
|
19
19
|
maxTokens: config.chunking?.maxTokens || 500,
|
|
20
20
|
overlap: config.chunking?.overlap || 50,
|
|
21
21
|
},
|
|
22
|
+
buildTime: config.buildTime
|
|
23
|
+
? {
|
|
24
|
+
enabled: config.buildTime.enabled,
|
|
25
|
+
outputPath: config.buildTime.outputPath || '.embeddings/embeddings.json',
|
|
26
|
+
chunkSize: config.buildTime.chunkSize || 500,
|
|
27
|
+
chunkOverlap: config.buildTime.chunkOverlap || 50,
|
|
28
|
+
differential: config.buildTime.differential ?? true,
|
|
29
|
+
}
|
|
30
|
+
: null,
|
|
22
31
|
enableMcpTools: config.enableMcpTools ?? true,
|
|
23
32
|
batchSize: config.batchSize || 10,
|
|
24
33
|
rateLimit: config.rateLimit || 100,
|
|
@@ -78,7 +78,11 @@ describe('RAG Plugin', () => {
|
|
|
78
78
|
|
|
79
79
|
const mockContext = {
|
|
80
80
|
config: {
|
|
81
|
-
db: {
|
|
81
|
+
db: {
|
|
82
|
+
provider: 'sqlite',
|
|
83
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
84
|
+
prismaClientConstructor: (() => null) as any,
|
|
85
|
+
},
|
|
82
86
|
lists: {
|
|
83
87
|
Article: {
|
|
84
88
|
fields: {
|
|
@@ -122,7 +126,11 @@ describe('RAG Plugin', () => {
|
|
|
122
126
|
|
|
123
127
|
const mockContext = {
|
|
124
128
|
config: {
|
|
125
|
-
db: {
|
|
129
|
+
db: {
|
|
130
|
+
provider: 'sqlite',
|
|
131
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
132
|
+
prismaClientConstructor: (() => null) as any,
|
|
133
|
+
},
|
|
126
134
|
lists: {
|
|
127
135
|
Article: {
|
|
128
136
|
fields: {
|
|
@@ -158,7 +166,11 @@ describe('RAG Plugin', () => {
|
|
|
158
166
|
|
|
159
167
|
const mockContext = {
|
|
160
168
|
config: {
|
|
161
|
-
db: {
|
|
169
|
+
db: {
|
|
170
|
+
provider: 'sqlite',
|
|
171
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
172
|
+
prismaClientConstructor: (() => null) as any,
|
|
173
|
+
},
|
|
162
174
|
lists: {
|
|
163
175
|
Article: {
|
|
164
176
|
fields: {
|
|
@@ -197,7 +209,11 @@ describe('RAG Plugin', () => {
|
|
|
197
209
|
|
|
198
210
|
const mockContext = {
|
|
199
211
|
config: {
|
|
200
|
-
db: {
|
|
212
|
+
db: {
|
|
213
|
+
provider: 'sqlite',
|
|
214
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
215
|
+
prismaClientConstructor: (() => null) as any,
|
|
216
|
+
},
|
|
201
217
|
lists: {},
|
|
202
218
|
},
|
|
203
219
|
setPluginData: vi.fn(),
|
|
@@ -231,7 +247,11 @@ describe('RAG Plugin', () => {
|
|
|
231
247
|
|
|
232
248
|
const mockContext = {
|
|
233
249
|
config: {
|
|
234
|
-
db: {
|
|
250
|
+
db: {
|
|
251
|
+
provider: 'sqlite',
|
|
252
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
253
|
+
prismaClientConstructor: (() => null) as any,
|
|
254
|
+
},
|
|
235
255
|
lists: {
|
|
236
256
|
Article: {
|
|
237
257
|
fields: {
|
|
@@ -275,7 +295,11 @@ describe('RAG Plugin', () => {
|
|
|
275
295
|
|
|
276
296
|
const mockContext = {
|
|
277
297
|
config: {
|
|
278
|
-
db: {
|
|
298
|
+
db: {
|
|
299
|
+
provider: 'sqlite',
|
|
300
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
301
|
+
prismaClientConstructor: (() => null) as any,
|
|
302
|
+
},
|
|
279
303
|
lists: {
|
|
280
304
|
Article: {
|
|
281
305
|
fields: {
|
|
@@ -324,7 +348,11 @@ describe('RAG Plugin', () => {
|
|
|
324
348
|
|
|
325
349
|
const mockContext = {
|
|
326
350
|
config: {
|
|
327
|
-
db: {
|
|
351
|
+
db: {
|
|
352
|
+
provider: 'sqlite',
|
|
353
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
354
|
+
prismaClientConstructor: (() => null) as any,
|
|
355
|
+
},
|
|
328
356
|
lists: {
|
|
329
357
|
Article: {
|
|
330
358
|
fields: {
|
|
@@ -366,7 +394,11 @@ describe('RAG Plugin', () => {
|
|
|
366
394
|
|
|
367
395
|
const mockContext = {
|
|
368
396
|
config: {
|
|
369
|
-
db: {
|
|
397
|
+
db: {
|
|
398
|
+
provider: 'sqlite',
|
|
399
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
400
|
+
prismaClientConstructor: (() => null) as any,
|
|
401
|
+
},
|
|
370
402
|
lists: {
|
|
371
403
|
Article: {
|
|
372
404
|
fields: {
|
|
@@ -401,7 +433,11 @@ describe('RAG Plugin', () => {
|
|
|
401
433
|
|
|
402
434
|
const mockContext = {
|
|
403
435
|
config: {
|
|
404
|
-
db: {
|
|
436
|
+
db: {
|
|
437
|
+
provider: 'sqlite',
|
|
438
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
439
|
+
prismaClientConstructor: (() => null) as any,
|
|
440
|
+
},
|
|
405
441
|
lists: {
|
|
406
442
|
User: {
|
|
407
443
|
fields: {
|
|
@@ -435,7 +471,11 @@ describe('RAG Plugin', () => {
|
|
|
435
471
|
|
|
436
472
|
const mockContext = {
|
|
437
473
|
config: {
|
|
438
|
-
db: {
|
|
474
|
+
db: {
|
|
475
|
+
provider: 'sqlite',
|
|
476
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
477
|
+
prismaClientConstructor: (() => null) as any,
|
|
478
|
+
},
|
|
439
479
|
lists: {
|
|
440
480
|
BlogPost: {
|
|
441
481
|
fields: {
|
|
@@ -483,7 +523,11 @@ describe('RAG Plugin', () => {
|
|
|
483
523
|
|
|
484
524
|
const mockContext = {
|
|
485
525
|
config: {
|
|
486
|
-
db: {
|
|
526
|
+
db: {
|
|
527
|
+
provider: 'sqlite',
|
|
528
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
529
|
+
prismaClientConstructor: (() => null) as any,
|
|
530
|
+
},
|
|
487
531
|
lists: {
|
|
488
532
|
Article: {
|
|
489
533
|
fields: {
|
|
@@ -537,7 +581,11 @@ describe('RAG Plugin', () => {
|
|
|
537
581
|
|
|
538
582
|
const mockContext = {
|
|
539
583
|
config: {
|
|
540
|
-
db: {
|
|
584
|
+
db: {
|
|
585
|
+
provider: 'sqlite',
|
|
586
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
587
|
+
prismaClientConstructor: (() => null) as any,
|
|
588
|
+
},
|
|
541
589
|
lists: {
|
|
542
590
|
Article: {
|
|
543
591
|
fields: {
|
|
@@ -574,7 +622,11 @@ describe('RAG Plugin', () => {
|
|
|
574
622
|
|
|
575
623
|
const mockContext = {
|
|
576
624
|
config: {
|
|
577
|
-
db: {
|
|
625
|
+
db: {
|
|
626
|
+
provider: 'sqlite',
|
|
627
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
628
|
+
prismaClientConstructor: (() => null) as any,
|
|
629
|
+
},
|
|
578
630
|
lists: {
|
|
579
631
|
Article: {
|
|
580
632
|
fields: {
|
|
@@ -613,7 +665,11 @@ describe('RAG Plugin', () => {
|
|
|
613
665
|
|
|
614
666
|
const mockContext = {
|
|
615
667
|
config: {
|
|
616
|
-
db: {
|
|
668
|
+
db: {
|
|
669
|
+
provider: 'sqlite',
|
|
670
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
671
|
+
prismaClientConstructor: (() => null) as any,
|
|
672
|
+
},
|
|
617
673
|
lists: {},
|
|
618
674
|
},
|
|
619
675
|
setPluginData: vi.fn(),
|
package/src/config/plugin.ts
CHANGED
|
@@ -45,6 +45,11 @@ export function ragPlugin(config: RAGConfig): Plugin {
|
|
|
45
45
|
name: 'rag',
|
|
46
46
|
version: '0.1.0',
|
|
47
47
|
|
|
48
|
+
runtimeServiceTypes: {
|
|
49
|
+
import: "import type { RAGRuntimeServices } from '@opensaas/stack-rag'",
|
|
50
|
+
typeName: 'RAGRuntimeServices',
|
|
51
|
+
},
|
|
52
|
+
|
|
48
53
|
init: async (context) => {
|
|
49
54
|
// First pass: Scan for searchable() wrapped fields and inject embedding fields
|
|
50
55
|
for (const [listName, listConfig] of Object.entries(context.config.lists)) {
|
|
@@ -247,6 +252,38 @@ export function ragPlugin(config: RAGConfig): Plugin {
|
|
|
247
252
|
// Access at runtime via: config._pluginData.rag
|
|
248
253
|
context.setPluginData<NormalizedRAGConfig>('rag', normalized)
|
|
249
254
|
},
|
|
255
|
+
|
|
256
|
+
runtime: () => {
|
|
257
|
+
// Provide RAG-related utilities at runtime
|
|
258
|
+
return {
|
|
259
|
+
/**
|
|
260
|
+
* Generate embedding for a given text
|
|
261
|
+
* Uses the configured embedding provider
|
|
262
|
+
*/
|
|
263
|
+
generateEmbedding: async (text: string) => {
|
|
264
|
+
const ragConfig = normalized
|
|
265
|
+
if (!ragConfig || !ragConfig.provider) {
|
|
266
|
+
throw new Error('RAG plugin not configured')
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
const provider = createEmbeddingProvider(ragConfig.provider)
|
|
270
|
+
return await provider.embed(text)
|
|
271
|
+
},
|
|
272
|
+
|
|
273
|
+
/**
|
|
274
|
+
* Generate embeddings for multiple texts (batch)
|
|
275
|
+
*/
|
|
276
|
+
generateEmbeddings: async (texts: string[]) => {
|
|
277
|
+
const ragConfig = normalized
|
|
278
|
+
if (!ragConfig || !ragConfig.provider) {
|
|
279
|
+
throw new Error('RAG plugin not configured')
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
const provider = createEmbeddingProvider(ragConfig.provider)
|
|
283
|
+
return await provider.embedBatch(texts)
|
|
284
|
+
},
|
|
285
|
+
}
|
|
286
|
+
},
|
|
250
287
|
}
|
|
251
288
|
}
|
|
252
289
|
|
package/src/config/types.ts
CHANGED
|
@@ -155,6 +155,42 @@ export type VectorStorageConfig =
|
|
|
155
155
|
| JsonStorageConfig
|
|
156
156
|
| CustomStorageConfig
|
|
157
157
|
|
|
158
|
+
/**
|
|
159
|
+
* Build-time embedding generation configuration
|
|
160
|
+
*/
|
|
161
|
+
export type BuildTimeConfig = {
|
|
162
|
+
/**
|
|
163
|
+
* Enable build-time embedding generation
|
|
164
|
+
*/
|
|
165
|
+
enabled: boolean
|
|
166
|
+
|
|
167
|
+
/**
|
|
168
|
+
* Output path for embeddings JSON file
|
|
169
|
+
* Relative to project root
|
|
170
|
+
* @default '.embeddings/embeddings.json'
|
|
171
|
+
*/
|
|
172
|
+
outputPath?: string
|
|
173
|
+
|
|
174
|
+
/**
|
|
175
|
+
* Chunk size for text splitting (in characters)
|
|
176
|
+
* @default 500
|
|
177
|
+
*/
|
|
178
|
+
chunkSize?: number
|
|
179
|
+
|
|
180
|
+
/**
|
|
181
|
+
* Overlap between chunks (in characters)
|
|
182
|
+
* @default 50
|
|
183
|
+
*/
|
|
184
|
+
chunkOverlap?: number
|
|
185
|
+
|
|
186
|
+
/**
|
|
187
|
+
* Whether to enable differential updates
|
|
188
|
+
* Only regenerate embeddings for changed content
|
|
189
|
+
* @default true
|
|
190
|
+
*/
|
|
191
|
+
differential?: boolean
|
|
192
|
+
}
|
|
193
|
+
|
|
158
194
|
/**
|
|
159
195
|
* Main RAG configuration
|
|
160
196
|
*/
|
|
@@ -191,6 +227,13 @@ export type RAGConfig = {
|
|
|
191
227
|
*/
|
|
192
228
|
chunking?: ChunkingConfig
|
|
193
229
|
|
|
230
|
+
/**
|
|
231
|
+
* Build-time embedding generation configuration
|
|
232
|
+
* When enabled, embeddings are generated at build time and stored in a JSON file
|
|
233
|
+
* instead of being generated at runtime via hooks
|
|
234
|
+
*/
|
|
235
|
+
buildTime?: BuildTimeConfig
|
|
236
|
+
|
|
194
237
|
/**
|
|
195
238
|
* Whether to enable MCP tools for semantic search
|
|
196
239
|
* Requires MCP to be enabled in main config
|
|
@@ -219,6 +262,7 @@ export type NormalizedRAGConfig = {
|
|
|
219
262
|
providers: Record<string, EmbeddingProviderConfig>
|
|
220
263
|
storage: VectorStorageConfig
|
|
221
264
|
chunking: Required<ChunkingConfig>
|
|
265
|
+
buildTime: Required<BuildTimeConfig> | null
|
|
222
266
|
enableMcpTools: boolean
|
|
223
267
|
batchSize: number
|
|
224
268
|
rateLimit: number
|
|
@@ -340,3 +384,117 @@ export type SearchableMetadata = {
|
|
|
340
384
|
*/
|
|
341
385
|
chunking?: ChunkingConfig
|
|
342
386
|
}
|
|
387
|
+
|
|
388
|
+
/**
|
|
389
|
+
* A chunk of text with its embedding
|
|
390
|
+
* Used in build-time generation output
|
|
391
|
+
*/
|
|
392
|
+
export type EmbeddingChunk = {
|
|
393
|
+
/**
|
|
394
|
+
* The text content of this chunk
|
|
395
|
+
*/
|
|
396
|
+
text: string
|
|
397
|
+
|
|
398
|
+
/**
|
|
399
|
+
* The embedding vector for this chunk
|
|
400
|
+
*/
|
|
401
|
+
embedding: number[]
|
|
402
|
+
|
|
403
|
+
/**
|
|
404
|
+
* Metadata about the chunk
|
|
405
|
+
*/
|
|
406
|
+
metadata: {
|
|
407
|
+
/**
|
|
408
|
+
* Index of this chunk within the document
|
|
409
|
+
*/
|
|
410
|
+
chunkIndex: number
|
|
411
|
+
|
|
412
|
+
/**
|
|
413
|
+
* Start character position in original text
|
|
414
|
+
*/
|
|
415
|
+
startOffset: number
|
|
416
|
+
|
|
417
|
+
/**
|
|
418
|
+
* End character position in original text
|
|
419
|
+
*/
|
|
420
|
+
endOffset: number
|
|
421
|
+
|
|
422
|
+
/**
|
|
423
|
+
* Whether this chunk represents a document title
|
|
424
|
+
* Title chunks receive boosted scoring during search
|
|
425
|
+
*/
|
|
426
|
+
isTitle?: boolean
|
|
427
|
+
|
|
428
|
+
/**
|
|
429
|
+
* Additional custom metadata
|
|
430
|
+
*/
|
|
431
|
+
[key: string]: unknown
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
/**
|
|
436
|
+
* Document with embeddings
|
|
437
|
+
* Used in build-time generation output
|
|
438
|
+
*/
|
|
439
|
+
export type EmbeddedDocument = {
|
|
440
|
+
/**
|
|
441
|
+
* Document ID or slug
|
|
442
|
+
*/
|
|
443
|
+
id: string
|
|
444
|
+
|
|
445
|
+
/**
|
|
446
|
+
* Document title
|
|
447
|
+
*/
|
|
448
|
+
title?: string
|
|
449
|
+
|
|
450
|
+
/**
|
|
451
|
+
* The chunks of this document with embeddings
|
|
452
|
+
*/
|
|
453
|
+
chunks: EmbeddingChunk[]
|
|
454
|
+
|
|
455
|
+
/**
|
|
456
|
+
* Embedding metadata
|
|
457
|
+
*/
|
|
458
|
+
embeddingMetadata: EmbeddingMetadata
|
|
459
|
+
|
|
460
|
+
/**
|
|
461
|
+
* When the embeddings were generated
|
|
462
|
+
*/
|
|
463
|
+
generatedAt: string
|
|
464
|
+
|
|
465
|
+
/**
|
|
466
|
+
* Hash of the source content (for differential updates)
|
|
467
|
+
*/
|
|
468
|
+
contentHash: string
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
/**
|
|
472
|
+
* Build-time embeddings index file format
|
|
473
|
+
*/
|
|
474
|
+
export type EmbeddingsIndex = {
|
|
475
|
+
/**
|
|
476
|
+
* Version of the embeddings format
|
|
477
|
+
*/
|
|
478
|
+
version: string
|
|
479
|
+
|
|
480
|
+
/**
|
|
481
|
+
* Embedding configuration used to generate these embeddings
|
|
482
|
+
*/
|
|
483
|
+
config: {
|
|
484
|
+
provider: string
|
|
485
|
+
model: string
|
|
486
|
+
dimensions: number
|
|
487
|
+
chunkSize: number
|
|
488
|
+
chunkOverlap: number
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
/**
|
|
492
|
+
* Documents with embeddings
|
|
493
|
+
*/
|
|
494
|
+
documents: Record<string, EmbeddedDocument>
|
|
495
|
+
|
|
496
|
+
/**
|
|
497
|
+
* When the index was generated
|
|
498
|
+
*/
|
|
499
|
+
generatedAt: string
|
|
500
|
+
}
|
package/src/index.ts
CHANGED
|
@@ -15,6 +15,9 @@ export {
|
|
|
15
15
|
// Plugin export
|
|
16
16
|
export { ragPlugin } from './config/plugin.js'
|
|
17
17
|
|
|
18
|
+
// Runtime type exports
|
|
19
|
+
export type { RAGRuntimeServices } from './runtime/types.js'
|
|
20
|
+
|
|
18
21
|
export type {
|
|
19
22
|
RAGConfig,
|
|
20
23
|
NormalizedRAGConfig,
|
|
@@ -30,4 +33,7 @@ export type {
|
|
|
30
33
|
EmbeddingMetadata,
|
|
31
34
|
StoredEmbedding,
|
|
32
35
|
SearchResult,
|
|
36
|
+
EmbeddingsIndex,
|
|
37
|
+
EmbeddedDocument,
|
|
38
|
+
EmbeddingChunk,
|
|
33
39
|
} from './config/types.js'
|
package/src/providers/openai.ts
CHANGED
|
@@ -17,7 +17,7 @@ async function getOpenAI() {
|
|
|
17
17
|
try {
|
|
18
18
|
const module = await import('openai')
|
|
19
19
|
return module.default
|
|
20
|
-
} catch
|
|
20
|
+
} catch {
|
|
21
21
|
throw new Error(
|
|
22
22
|
'OpenAI package not found. Install it with: npm install openai\n' +
|
|
23
23
|
'Make sure to run: pnpm install openai',
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Build-time utilities for generating and managing embeddings
|
|
3
|
+
* Used by CLI tools and custom build scripts
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { readFileSync, existsSync } from 'node:fs'
|
|
7
|
+
import { createHash } from 'node:crypto'
|
|
8
|
+
import type { EmbeddingProvider } from '../providers/types.js'
|
|
9
|
+
import type { EmbeddingsIndex, EmbeddedDocument, EmbeddingChunk } from '../config/types.js'
|
|
10
|
+
|
|
11
|
+
/**
 * Simple character-based text chunking for build-time generation
 *
 * Splits `text` into consecutive windows of at most `chunkSize` characters.
 * Each window starts `chunkSize - overlap` characters after the previous one,
 * so neighbouring chunks share `overlap` characters. The last chunk may be
 * shorter than `chunkSize`.
 *
 * @param text - Text to chunk
 * @param chunkSize - Size of each chunk in characters
 * @param overlap - Overlap between chunks in characters; must be smaller than `chunkSize`
 * @returns Array of text chunks (empty when `text` is empty)
 * @throws RangeError when `text` is non-empty and `chunkSize - overlap` is not
 *   a positive number, because the window could then never advance
 *
 * @example
 * ```typescript
 * import { simpleChunkText } from '@opensaas/stack-rag/runtime'
 *
 * const chunks = simpleChunkText("Long document...", 500, 50)
 * ```
 */
export function simpleChunkText(text: string, chunkSize: number, overlap: number): string[] {
  // Nothing to split; keeps the historical behaviour of returning [] for empty input.
  if (text.length === 0) {
    return []
  }

  // Distance between the start of one chunk and the start of the next.
  // A zero or negative step would make the loop below spin forever.
  const step = chunkSize - overlap
  if (!(step > 0)) {
    throw new RangeError(
      `simpleChunkText: chunkSize (${chunkSize}) must be greater than overlap (${overlap})`,
    )
  }

  const chunks: string[] = []
  for (let start = 0; start < text.length; start += step) {
    chunks.push(text.slice(start, Math.min(start + chunkSize, text.length)))
  }

  return chunks
}
|
|
41
|
+
|
|
42
|
+
/**
 * Fingerprint a piece of content with SHA-256
 *
 * The digest is what build-time generation stores as `contentHash`, so two
 * runs over unchanged content produce the same value.
 *
 * @param content - Content to hash
 * @returns The SHA-256 digest as a hexadecimal string
 *
 * @example
 * ```typescript
 * import { hashContent } from '@opensaas/stack-rag/runtime'
 *
 * const hash = hashContent("document content")
 * ```
 */
export function hashContent(content: string): string {
  const hasher = createHash('sha256')
  hasher.update(content)
  return hasher.digest('hex')
}
|
|
58
|
+
|
|
59
|
+
/**
 * Load existing embeddings index from file
 *
 * Used for differential updates - only regenerate embeddings for changed content.
 *
 * The file is read synchronously and parsed as JSON. The parsed value is only
 * accepted when it is an object with an object-valued `documents` property;
 * anything else (unreadable file, invalid JSON, `null`, an array, a missing
 * `documents` map) is treated as "can't be loaded". No deeper validation of
 * individual documents is performed.
 *
 * @param filePath - Path to embeddings JSON file
 * @returns Loaded index or null if file doesn't exist or can't be loaded
 *
 * @example
 * ```typescript
 * import { loadExistingIndex } from '@opensaas/stack-rag/runtime'
 *
 * const existing = loadExistingIndex('.embeddings/docs.json')
 * if (existing) {
 *   console.log(`Found ${Object.keys(existing.documents).length} existing documents`)
 * }
 * ```
 */
export function loadExistingIndex(filePath: string): EmbeddingsIndex | null {
  // A missing file is the normal first-run case, so it is not worth a warning.
  if (!existsSync(filePath)) {
    return null
  }

  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value)

  let parsed: unknown
  try {
    parsed = JSON.parse(readFileSync(filePath, 'utf-8'))
  } catch {
    // Read or parse failure: fall through to the shared warning below.
    parsed = undefined
  }

  // Callers index into `documents` straight away, so reject shapes that would
  // only blow up later instead of handing them back as a valid index.
  if (isRecord(parsed) && isRecord(parsed.documents)) {
    return parsed as EmbeddingsIndex
  }

  console.warn(`Warning: Could not load existing embeddings from ${filePath}`)
  return null
}
|
|
90
|
+
|
|
91
|
+
/**
 * Generate embeddings for a document with chunking
 *
 * Main utility for build-time embedding generation. Chunks the document,
 * generates embeddings for each chunk, and returns a complete EmbeddedDocument.
 *
 * When a title is given it is embedded as an extra chunk placed before the
 * content chunks, marked with `isTitle: true` and `chunkIndex: -1`. Content
 * chunks are produced by `simpleChunkText`, and their `startOffset` /
 * `endOffset` are derived from the chunk index using the same
 * `chunkSize - chunkOverlap` step. All texts are sent to the provider in a
 * single `embedBatch` call; the call is skipped when there is nothing to embed.
 *
 * Keys in `options.metadata` are spread last into each chunk's metadata, so
 * they take precedence over the generated fields of the same name.
 *
 * @param documentId - Unique identifier for the document
 * @param content - Document content (plain text)
 * @param provider - Embedding provider instance
 * @param options - Generation options
 * @returns Complete embedded document ready to be added to index
 * @throws Error when the provider returns a different number of embeddings
 *   than the number of texts it was given
 *
 * @example
 * ```typescript
 * import { generateDocumentEmbeddings } from '@opensaas/stack-rag/runtime'
 * import { createEmbeddingProvider } from '@opensaas/stack-rag/providers'
 *
 * const provider = createEmbeddingProvider({
 *   type: 'openai',
 *   apiKey: process.env.OPENAI_API_KEY
 * })
 *
 * const doc = await generateDocumentEmbeddings(
 *   'docs/getting-started',
 *   'Document content here...',
 *   provider,
 *   {
 *     title: 'Getting Started',
 *     chunkSize: 500,
 *     chunkOverlap: 50,
 *     metadata: { section: 'guides' }
 *   }
 * )
 * ```
 */
export async function generateDocumentEmbeddings(
  documentId: string,
  content: string,
  provider: EmbeddingProvider,
  options: {
    title?: string
    chunkSize: number
    chunkOverlap: number
    metadata?: Record<string, unknown>
  },
): Promise<EmbeddedDocument> {
  const { title, chunkSize, chunkOverlap, metadata = {} } = options

  // Hash content for differential updates
  const contentHash = hashContent(content)

  // Title (if any) goes first, followed by the content chunks in order.
  const contentChunks = simpleChunkText(content, chunkSize, chunkOverlap)
  const titleCount = title ? 1 : 0
  const texts = title ? [title, ...contentChunks] : contentChunks

  // One batch request for everything; skip the request when there is nothing to embed.
  const embeddings = texts.length > 0 ? await provider.embedBatch(texts) : []
  if (embeddings.length !== texts.length) {
    // Without this check a short response would silently store `undefined` embeddings.
    throw new Error(
      `Embedding provider returned ${embeddings.length} embeddings for ${texts.length} chunks of document "${documentId}"`,
    )
  }

  // Same stride simpleChunkText uses, so offsets line up with the chunk text.
  const step = chunkSize - chunkOverlap

  const chunks: EmbeddingChunk[] = texts.map((text, position) => {
    const embedding = embeddings[position]

    if (position < titleCount) {
      return {
        text,
        embedding,
        metadata: {
          chunkIndex: -1, // Special index for title
          startOffset: 0,
          endOffset: 0,
          isTitle: true,
          ...metadata,
        },
      }
    }

    const chunkIndex = position - titleCount
    const startOffset = chunkIndex * step
    return {
      text,
      embedding,
      metadata: {
        chunkIndex,
        startOffset,
        endOffset: Math.min(startOffset + chunkSize, content.length),
        ...metadata,
      },
    }
  })

  // Single timestamp so the document and its embedding metadata always agree.
  const generatedAt = new Date().toISOString()

  return {
    id: documentId,
    title,
    chunks,
    embeddingMetadata: {
      model: provider.model,
      provider: provider.type,
      dimensions: provider.dimensions,
      generatedAt,
    },
    generatedAt,
    contentHash,
  }
}
|