@opensaas/stack-rag 0.1.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +132 -0
  3. package/dist/config/index.d.ts.map +1 -1
  4. package/dist/config/index.js +9 -0
  5. package/dist/config/index.js.map +1 -1
  6. package/dist/config/plugin.d.ts.map +1 -1
  7. package/dist/config/plugin.js +32 -0
  8. package/dist/config/plugin.js.map +1 -1
  9. package/dist/config/plugin.test.js +70 -14
  10. package/dist/config/plugin.test.js.map +1 -1
  11. package/dist/config/types.d.ts +135 -0
  12. package/dist/config/types.d.ts.map +1 -1
  13. package/dist/index.d.ts +2 -1
  14. package/dist/index.d.ts.map +1 -1
  15. package/dist/providers/openai.js +1 -1
  16. package/dist/providers/openai.js.map +1 -1
  17. package/dist/runtime/build-time.d.ts +100 -0
  18. package/dist/runtime/build-time.d.ts.map +1 -0
  19. package/dist/runtime/build-time.js +185 -0
  20. package/dist/runtime/build-time.js.map +1 -0
  21. package/dist/runtime/index.d.ts +3 -0
  22. package/dist/runtime/index.d.ts.map +1 -1
  23. package/dist/runtime/index.js +6 -0
  24. package/dist/runtime/index.js.map +1 -1
  25. package/dist/runtime/markdown.d.ts +33 -0
  26. package/dist/runtime/markdown.d.ts.map +1 -0
  27. package/dist/runtime/markdown.js +94 -0
  28. package/dist/runtime/markdown.js.map +1 -0
  29. package/dist/runtime/provider-helpers.d.ts +56 -0
  30. package/dist/runtime/provider-helpers.d.ts.map +1 -0
  31. package/dist/runtime/provider-helpers.js +95 -0
  32. package/dist/runtime/provider-helpers.js.map +1 -0
  33. package/dist/runtime/types.d.ts +29 -0
  34. package/dist/runtime/types.d.ts.map +1 -0
  35. package/dist/runtime/types.js +6 -0
  36. package/dist/runtime/types.js.map +1 -0
  37. package/dist/storage/index.d.ts +1 -0
  38. package/dist/storage/index.d.ts.map +1 -1
  39. package/dist/storage/index.js +1 -0
  40. package/dist/storage/index.js.map +1 -1
  41. package/dist/storage/json-file.d.ts +53 -0
  42. package/dist/storage/json-file.d.ts.map +1 -0
  43. package/dist/storage/json-file.js +124 -0
  44. package/dist/storage/json-file.js.map +1 -0
  45. package/dist/storage/storage.test.js +1 -0
  46. package/dist/storage/storage.test.js.map +1 -1
  47. package/package.json +6 -5
  48. package/src/config/index.ts +9 -0
  49. package/src/config/plugin.test.ts +70 -14
  50. package/src/config/plugin.ts +37 -0
  51. package/src/config/types.ts +158 -0
  52. package/src/index.ts +6 -0
  53. package/src/providers/openai.ts +1 -1
  54. package/src/runtime/build-time.ts +216 -0
  55. package/src/runtime/index.ts +18 -0
  56. package/src/runtime/markdown.ts +119 -0
  57. package/src/runtime/provider-helpers.ts +115 -0
  58. package/src/runtime/types.ts +30 -0
  59. package/src/storage/index.ts +1 -0
  60. package/src/storage/json-file.ts +157 -0
  61. package/src/storage/storage.test.ts +1 -0
  62. package/tsconfig.tsbuildinfo +1 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@opensaas/stack-rag",
3
- "version": "0.1.7",
3
+ "version": "0.3.0",
4
4
  "description": "RAG and AI embeddings integration for OpenSaas Stack",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -51,8 +51,8 @@
51
51
  "url": "https://github.com/OpenSaasAU/stack/issues"
52
52
  },
53
53
  "peerDependencies": {
54
- "openai": "^6.8.0",
55
- "@opensaas/stack-core": "0.1.7"
54
+ "@opensaas/stack-core": "^0",
55
+ "openai": "^6.8.0"
56
56
  },
57
57
  "peerDependenciesMeta": {
58
58
  "openai": {
@@ -60,16 +60,17 @@
60
60
  }
61
61
  },
62
62
  "dependencies": {
63
+ "dotenv": "^16.4.7",
63
64
  "zod": "^4.1.12"
64
65
  },
65
66
  "devDependencies": {
66
67
  "@types/node": "^24.7.2",
67
68
  "@vitest/coverage-v8": "^4.0.4",
68
69
  "@vitest/ui": "^4.0.0",
69
- "openai": "^6.0.0",
70
+ "openai": "^6.8.0",
70
71
  "typescript": "^5.9.3",
71
72
  "vitest": "^4.0.0",
72
- "@opensaas/stack-core": "0.1.7"
73
+ "@opensaas/stack-core": "0.3.0"
73
74
  },
74
75
  "scripts": {
75
76
  "build": "tsc",
@@ -19,6 +19,15 @@ export function normalizeRAGConfig(config: RAGConfig): NormalizedRAGConfig {
19
19
  maxTokens: config.chunking?.maxTokens || 500,
20
20
  overlap: config.chunking?.overlap || 50,
21
21
  },
22
+ buildTime: config.buildTime
23
+ ? {
24
+ enabled: config.buildTime.enabled,
25
+ outputPath: config.buildTime.outputPath || '.embeddings/embeddings.json',
26
+ chunkSize: config.buildTime.chunkSize || 500,
27
+ chunkOverlap: config.buildTime.chunkOverlap || 50,
28
+ differential: config.buildTime.differential ?? true,
29
+ }
30
+ : null,
22
31
  enableMcpTools: config.enableMcpTools ?? true,
23
32
  batchSize: config.batchSize || 10,
24
33
  rateLimit: config.rateLimit || 100,
@@ -78,7 +78,11 @@ describe('RAG Plugin', () => {
78
78
 
79
79
  const mockContext = {
80
80
  config: {
81
- db: { provider: 'sqlite', url: 'file:./test.db' },
81
+ db: {
82
+ provider: 'sqlite',
83
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
84
+ prismaClientConstructor: (() => null) as any,
85
+ },
82
86
  lists: {
83
87
  Article: {
84
88
  fields: {
@@ -122,7 +126,11 @@ describe('RAG Plugin', () => {
122
126
 
123
127
  const mockContext = {
124
128
  config: {
125
- db: { provider: 'sqlite', url: 'file:./test.db' },
129
+ db: {
130
+ provider: 'sqlite',
131
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
132
+ prismaClientConstructor: (() => null) as any,
133
+ },
126
134
  lists: {
127
135
  Article: {
128
136
  fields: {
@@ -158,7 +166,11 @@ describe('RAG Plugin', () => {
158
166
 
159
167
  const mockContext = {
160
168
  config: {
161
- db: { provider: 'sqlite', url: 'file:./test.db' },
169
+ db: {
170
+ provider: 'sqlite',
171
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
172
+ prismaClientConstructor: (() => null) as any,
173
+ },
162
174
  lists: {
163
175
  Article: {
164
176
  fields: {
@@ -197,7 +209,11 @@ describe('RAG Plugin', () => {
197
209
 
198
210
  const mockContext = {
199
211
  config: {
200
- db: { provider: 'sqlite', url: 'file:./test.db' },
212
+ db: {
213
+ provider: 'sqlite',
214
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
215
+ prismaClientConstructor: (() => null) as any,
216
+ },
201
217
  lists: {},
202
218
  },
203
219
  setPluginData: vi.fn(),
@@ -231,7 +247,11 @@ describe('RAG Plugin', () => {
231
247
 
232
248
  const mockContext = {
233
249
  config: {
234
- db: { provider: 'sqlite', url: 'file:./test.db' },
250
+ db: {
251
+ provider: 'sqlite',
252
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
253
+ prismaClientConstructor: (() => null) as any,
254
+ },
235
255
  lists: {
236
256
  Article: {
237
257
  fields: {
@@ -275,7 +295,11 @@ describe('RAG Plugin', () => {
275
295
 
276
296
  const mockContext = {
277
297
  config: {
278
- db: { provider: 'sqlite', url: 'file:./test.db' },
298
+ db: {
299
+ provider: 'sqlite',
300
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
301
+ prismaClientConstructor: (() => null) as any,
302
+ },
279
303
  lists: {
280
304
  Article: {
281
305
  fields: {
@@ -324,7 +348,11 @@ describe('RAG Plugin', () => {
324
348
 
325
349
  const mockContext = {
326
350
  config: {
327
- db: { provider: 'sqlite', url: 'file:./test.db' },
351
+ db: {
352
+ provider: 'sqlite',
353
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
354
+ prismaClientConstructor: (() => null) as any,
355
+ },
328
356
  lists: {
329
357
  Article: {
330
358
  fields: {
@@ -366,7 +394,11 @@ describe('RAG Plugin', () => {
366
394
 
367
395
  const mockContext = {
368
396
  config: {
369
- db: { provider: 'sqlite', url: 'file:./test.db' },
397
+ db: {
398
+ provider: 'sqlite',
399
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
400
+ prismaClientConstructor: (() => null) as any,
401
+ },
370
402
  lists: {
371
403
  Article: {
372
404
  fields: {
@@ -401,7 +433,11 @@ describe('RAG Plugin', () => {
401
433
 
402
434
  const mockContext = {
403
435
  config: {
404
- db: { provider: 'sqlite', url: 'file:./test.db' },
436
+ db: {
437
+ provider: 'sqlite',
438
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
439
+ prismaClientConstructor: (() => null) as any,
440
+ },
405
441
  lists: {
406
442
  User: {
407
443
  fields: {
@@ -435,7 +471,11 @@ describe('RAG Plugin', () => {
435
471
 
436
472
  const mockContext = {
437
473
  config: {
438
- db: { provider: 'sqlite', url: 'file:./test.db' },
474
+ db: {
475
+ provider: 'sqlite',
476
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
477
+ prismaClientConstructor: (() => null) as any,
478
+ },
439
479
  lists: {
440
480
  BlogPost: {
441
481
  fields: {
@@ -483,7 +523,11 @@ describe('RAG Plugin', () => {
483
523
 
484
524
  const mockContext = {
485
525
  config: {
486
- db: { provider: 'sqlite', url: 'file:./test.db' },
526
+ db: {
527
+ provider: 'sqlite',
528
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
529
+ prismaClientConstructor: (() => null) as any,
530
+ },
487
531
  lists: {
488
532
  Article: {
489
533
  fields: {
@@ -537,7 +581,11 @@ describe('RAG Plugin', () => {
537
581
 
538
582
  const mockContext = {
539
583
  config: {
540
- db: { provider: 'sqlite', url: 'file:./test.db' },
584
+ db: {
585
+ provider: 'sqlite',
586
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
587
+ prismaClientConstructor: (() => null) as any,
588
+ },
541
589
  lists: {
542
590
  Article: {
543
591
  fields: {
@@ -574,7 +622,11 @@ describe('RAG Plugin', () => {
574
622
 
575
623
  const mockContext = {
576
624
  config: {
577
- db: { provider: 'sqlite', url: 'file:./test.db' },
625
+ db: {
626
+ provider: 'sqlite',
627
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
628
+ prismaClientConstructor: (() => null) as any,
629
+ },
578
630
  lists: {
579
631
  Article: {
580
632
  fields: {
@@ -613,7 +665,11 @@ describe('RAG Plugin', () => {
613
665
 
614
666
  const mockContext = {
615
667
  config: {
616
- db: { provider: 'sqlite', url: 'file:./test.db' },
668
+ db: {
669
+ provider: 'sqlite',
670
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
671
+ prismaClientConstructor: (() => null) as any,
672
+ },
617
673
  lists: {},
618
674
  },
619
675
  setPluginData: vi.fn(),
@@ -45,6 +45,11 @@ export function ragPlugin(config: RAGConfig): Plugin {
45
45
  name: 'rag',
46
46
  version: '0.1.0',
47
47
 
48
+ runtimeServiceTypes: {
49
+ import: "import type { RAGRuntimeServices } from '@opensaas/stack-rag'",
50
+ typeName: 'RAGRuntimeServices',
51
+ },
52
+
48
53
  init: async (context) => {
49
54
  // First pass: Scan for searchable() wrapped fields and inject embedding fields
50
55
  for (const [listName, listConfig] of Object.entries(context.config.lists)) {
@@ -247,6 +252,38 @@ export function ragPlugin(config: RAGConfig): Plugin {
247
252
  // Access at runtime via: config._pluginData.rag
248
253
  context.setPluginData<NormalizedRAGConfig>('rag', normalized)
249
254
  },
255
+
256
+ runtime: () => {
257
+ // Provide RAG-related utilities at runtime
258
+ return {
259
+ /**
260
+ * Generate embedding for a given text
261
+ * Uses the configured embedding provider
262
+ */
263
+ generateEmbedding: async (text: string) => {
264
+ const ragConfig = normalized
265
+ if (!ragConfig || !ragConfig.provider) {
266
+ throw new Error('RAG plugin not configured')
267
+ }
268
+
269
+ const provider = createEmbeddingProvider(ragConfig.provider)
270
+ return await provider.embed(text)
271
+ },
272
+
273
+ /**
274
+ * Generate embeddings for multiple texts (batch)
275
+ */
276
+ generateEmbeddings: async (texts: string[]) => {
277
+ const ragConfig = normalized
278
+ if (!ragConfig || !ragConfig.provider) {
279
+ throw new Error('RAG plugin not configured')
280
+ }
281
+
282
+ const provider = createEmbeddingProvider(ragConfig.provider)
283
+ return await provider.embedBatch(texts)
284
+ },
285
+ }
286
+ },
250
287
  }
251
288
  }
252
289
 
@@ -155,6 +155,42 @@ export type VectorStorageConfig =
155
155
  | JsonStorageConfig
156
156
  | CustomStorageConfig
157
157
 
158
+ /**
159
+ * Build-time embedding generation configuration (optional fields are given defaults by normalizeRAGConfig)
160
+ */
161
+ export type BuildTimeConfig = {
162
+ /**
163
+ * Enable build-time embedding generation
164
+ */
165
+ enabled: boolean
166
+
167
+ /**
168
+ * Output path for embeddings JSON file (an empty string is treated as unset and falls back to the default)
169
+ * Relative to project root
170
+ * @default '.embeddings/embeddings.json'
171
+ */
172
+ outputPath?: string
173
+
174
+ /**
175
+ * Chunk size for text splitting (in characters); 0 is treated as unset and falls back to the default
176
+ * @default 500
177
+ */
178
+ chunkSize?: number
179
+
180
+ /**
181
+ * Overlap between chunks (in characters); 0 is treated as unset and falls back to the default
182
+ * @default 50
183
+ */
184
+ chunkOverlap?: number
185
+
186
+ /**
187
+ * Whether to enable differential updates (an explicit false is preserved; only undefined/null defaults to true)
188
+ * Only regenerate embeddings for changed content
189
+ * @default true
190
+ */
191
+ differential?: boolean
192
+ }
193
+
158
194
  /**
159
195
  * Main RAG configuration
160
196
  */
@@ -191,6 +227,13 @@ export type RAGConfig = {
191
227
  */
192
228
  chunking?: ChunkingConfig
193
229
 
230
+ /**
231
+ * Build-time embedding generation configuration
232
+ * When enabled, embeddings are generated at build time and stored in a JSON file
233
+ * instead of being generated at runtime via hooks
234
+ */
235
+ buildTime?: BuildTimeConfig
236
+
194
237
  /**
195
238
  * Whether to enable MCP tools for semantic search
196
239
  * Requires MCP to be enabled in main config
@@ -219,6 +262,7 @@ export type NormalizedRAGConfig = {
219
262
  providers: Record<string, EmbeddingProviderConfig>
220
263
  storage: VectorStorageConfig
221
264
  chunking: Required<ChunkingConfig>
265
+ buildTime: Required<BuildTimeConfig> | null
222
266
  enableMcpTools: boolean
223
267
  batchSize: number
224
268
  rateLimit: number
@@ -340,3 +384,117 @@ export type SearchableMetadata = {
340
384
  */
341
385
  chunking?: ChunkingConfig
342
386
  }
387
+
388
+ /**
389
+ * A chunk of text with its embedding
390
+ * Used in build-time generation output (this is the element type produced by generateDocumentEmbeddings)
391
+ */
392
+ export type EmbeddingChunk = {
393
+ /**
394
+ * The text content of this chunk (the document title itself for a title chunk)
395
+ */
396
+ text: string
397
+
398
+ /**
399
+ * The embedding vector for this chunk
400
+ */
401
+ embedding: number[]
402
+
403
+ /**
404
+ * Metadata about the chunk
405
+ */
406
+ metadata: {
407
+ /**
408
+ * Index of this chunk within the document content, starting at 0 (generateDocumentEmbeddings uses -1 for the title chunk)
409
+ */
410
+ chunkIndex: number
411
+
412
+ /**
413
+ * Start character position in original text (generateDocumentEmbeddings writes 0 for the title chunk)
414
+ */
415
+ startOffset: number
416
+
417
+ /**
418
+ * End character position in original text, exclusive and capped at the content length (0 for the title chunk)
419
+ */
420
+ endOffset: number
421
+
422
+ /**
423
+ * Whether this chunk represents a document title
424
+ * Title chunks receive boosted scoring during search
425
+ */
426
+ isTitle?: boolean
427
+
428
+ /**
429
+ * Additional custom metadata (generateDocumentEmbeddings copies its `metadata` option onto every chunk)
430
+ */
431
+ [key: string]: unknown
432
+ }
433
+ }
434
+
435
+ /**
436
+ * Document with embeddings
437
+ * Used in build-time generation output (return type of generateDocumentEmbeddings)
438
+ */
439
+ export type EmbeddedDocument = {
440
+ /**
441
+ * Document ID or slug
442
+ */
443
+ id: string
444
+
445
+ /**
446
+ * Document title (when non-empty, generateDocumentEmbeddings also embeds it as a separate title chunk)
447
+ */
448
+ title?: string
449
+
450
+ /**
451
+ * The chunks of this document with embeddings (title chunk first when present, then content chunks in order)
452
+ */
453
+ chunks: EmbeddingChunk[]
454
+
455
+ /**
456
+ * Embedding metadata (model, provider type and dimensions as reported by the embedding provider)
457
+ */
458
+ embeddingMetadata: EmbeddingMetadata
459
+
460
+ /**
461
+ * When the embeddings were generated (generateDocumentEmbeddings writes Date#toISOString output)
462
+ */
463
+ generatedAt: string
464
+
465
+ /**
466
+ * Hash of the source content (for differential updates); generateDocumentEmbeddings stores a SHA-256 hex digest
467
+ */
468
+ contentHash: string
469
+ }
470
+
471
+ /**
472
+ * Build-time embeddings index file format (the shape loadExistingIndex returns after parsing the JSON file)
473
+ */
474
+ export type EmbeddingsIndex = {
475
+ /**
476
+ * Version of the embeddings format
477
+ */
478
+ version: string
479
+
480
+ /**
481
+ * Embedding configuration used to generate these embeddings
482
+ * NOTE(review): chunkSize/chunkOverlap are presumably in characters, matching BuildTimeConfig - confirm against the writer
483
+ */
484
+ config: {
485
+ provider: string
486
+ model: string
487
+ dimensions: number
488
+ chunkSize: number
489
+ chunkOverlap: number
490
+ }
491
+
492
+ /**
493
+ * Documents with embeddings, keyed by string (presumably the document id - confirm against the writer)
494
+ */
495
+ documents: Record<string, EmbeddedDocument>
496
+
497
+ /**
498
+ * When the index was generated (presumably an ISO 8601 string like EmbeddedDocument.generatedAt - confirm)
499
+ */
500
+ generatedAt: string
501
+ }
package/src/index.ts CHANGED
@@ -15,6 +15,9 @@ export {
15
15
  // Plugin export
16
16
  export { ragPlugin } from './config/plugin.js'
17
17
 
18
+ // Runtime type exports
19
+ export type { RAGRuntimeServices } from './runtime/types.js'
20
+
18
21
  export type {
19
22
  RAGConfig,
20
23
  NormalizedRAGConfig,
@@ -30,4 +33,7 @@ export type {
30
33
  EmbeddingMetadata,
31
34
  StoredEmbedding,
32
35
  SearchResult,
36
+ EmbeddingsIndex,
37
+ EmbeddedDocument,
38
+ EmbeddingChunk,
33
39
  } from './config/types.js'
@@ -17,7 +17,7 @@ async function getOpenAI() {
17
17
  try {
18
18
  const module = await import('openai')
19
19
  return module.default
20
- } catch (error) {
20
+ } catch {
21
21
  throw new Error(
22
22
  'OpenAI package not found. Install it with: npm install openai\n' +
23
23
  'Make sure to run: pnpm install openai',
@@ -0,0 +1,216 @@
1
+ /**
2
+ * Build-time utilities for generating and managing embeddings
3
+ * Used by CLI tools and custom build scripts
4
+ */
5
+
6
+ import { readFileSync, existsSync } from 'node:fs'
7
+ import { createHash } from 'node:crypto'
8
+ import type { EmbeddingProvider } from '../providers/types.js'
9
+ import type { EmbeddingsIndex, EmbeddedDocument, EmbeddingChunk } from '../config/types.js'
10
+
11
/**
 * Simple character-based text chunking for build-time generation
 *
 * Simpler than the runtime chunking strategies, optimized for build-time batch processing.
 * Splits `text` into windows of `chunkSize` characters, each starting
 * `chunkSize - overlap` characters after the previous one. The last window is
 * cut off at the end of the text, so it may be shorter than `chunkSize`.
 *
 * @param text - Text to chunk
 * @param chunkSize - Size of each chunk in characters; must be greater than 0
 * @param overlap - Overlap between consecutive chunks in characters; must be smaller than `chunkSize`
 * @returns Array of text chunks in document order (empty when `text` is empty)
 * @throws RangeError when `chunkSize` is not positive, or when `overlap` is not
 *   smaller than `chunkSize` (the window would never advance)
 *
 * @example
 * ```typescript
 * import { simpleChunkText } from '@opensaas/stack-rag/runtime'
 *
 * const chunks = simpleChunkText("Long document...", 500, 50)
 * ```
 */
export function simpleChunkText(text: string, chunkSize: number, overlap: number): string[] {
  // Distance between the start of one chunk and the start of the next.
  const step = chunkSize - overlap

  // Both checks are written as negated comparisons so that NaN is rejected too.
  if (!(chunkSize > 0)) {
    throw new RangeError(`chunkSize must be greater than 0 (received ${chunkSize})`)
  }
  // Without this guard a non-positive step keeps `start` in place (or moves it
  // backwards) and the loop below would push chunks until memory runs out.
  if (!(step > 0)) {
    throw new RangeError(
      `overlap must be smaller than chunkSize (received overlap ${overlap}, chunkSize ${chunkSize})`,
    )
  }

  const chunks: string[] = []

  for (let start = 0; start < text.length; start += step) {
    chunks.push(text.slice(start, Math.min(start + chunkSize, text.length)))
  }

  return chunks
}
41
+
42
/**
 * Produce a SHA-256 digest of a string, used for change detection
 *
 * Identical input always yields the same digest, so comparing a stored hash
 * with a freshly computed one tells whether the content changed.
 *
 * @param content - Content to hash
 * @returns The digest as a lowercase hexadecimal string
 *
 * @example
 * ```typescript
 * import { hashContent } from '@opensaas/stack-rag/runtime'
 *
 * const hash = hashContent("document content")
 * ```
 */
export function hashContent(content: string): string {
  const hasher = createHash('sha256')
  hasher.update(content)
  return hasher.digest('hex')
}
58
+
59
+ /**
60
+ * Load existing embeddings index from file
61
+ *
62
+ * Used for differential updates - only regenerate embeddings for changed content.
63
+ *
64
+ * @param filePath - Path to embeddings JSON file
65
+ * @returns Loaded index or null if file doesn't exist or can't be loaded
66
+ *
67
+ * @example
68
+ * ```typescript
69
+ * import { loadExistingIndex } from '@opensaas/stack-rag/runtime'
70
+ *
71
+ * const existing = loadExistingIndex('.embeddings/docs.json')
72
+ * if (existing) {
73
+ * console.log(`Found ${Object.keys(existing.documents).length} existing documents`)
74
+ * }
75
+ * ```
76
+ */
77
+ export function loadExistingIndex(filePath: string): EmbeddingsIndex | null {
78
+ if (!existsSync(filePath)) {
79
+ return null
80
+ }
81
+
82
+ try {
83
+ const content = readFileSync(filePath, 'utf-8')
84
+ return JSON.parse(content) as EmbeddingsIndex
85
+ } catch {
86
+ console.warn(`Warning: Could not load existing embeddings from ${filePath}`)
87
+ return null
88
+ }
89
+ }
90
+
91
/**
 * Generate embeddings for a document with chunking
 *
 * Main utility for build-time embedding generation. Chunks the document with
 * `simpleChunkText`, embeds the title (if any) and every content chunk in a
 * single `provider.embedBatch` call, and returns a complete EmbeddedDocument.
 *
 * Chunk layout in the result:
 * - When `options.title` is a non-empty string it becomes the first chunk, marked
 *   with `isTitle: true`, `chunkIndex: -1` and zero offsets.
 * - Content chunks follow in order with `chunkIndex` starting at 0 and offsets
 *   pointing into `content` (`endOffset` is exclusive).
 * - `options.metadata` is copied onto every chunk; the computed fields above
 *   take precedence over same-named custom keys.
 *
 * The provider is not called at all when there is nothing to embed
 * (empty content and no title).
 *
 * @param documentId - Unique identifier for the document
 * @param content - Document content (plain text)
 * @param provider - Embedding provider instance
 * @param options - Generation options
 * @returns Complete embedded document ready to be added to index
 * @throws Error when the provider returns a different number of embeddings
 *   than the number of texts it was given
 *
 * @example
 * ```typescript
 * import { generateDocumentEmbeddings } from '@opensaas/stack-rag/runtime'
 * import { createEmbeddingProvider } from '@opensaas/stack-rag/providers'
 *
 * const provider = createEmbeddingProvider({
 *   type: 'openai',
 *   apiKey: process.env.OPENAI_API_KEY
 * })
 *
 * const doc = await generateDocumentEmbeddings(
 *   'docs/getting-started',
 *   'Document content here...',
 *   provider,
 *   {
 *     title: 'Getting Started',
 *     chunkSize: 500,
 *     chunkOverlap: 50,
 *     metadata: { section: 'guides' }
 *   }
 * )
 * ```
 */
export async function generateDocumentEmbeddings(
  documentId: string,
  content: string,
  provider: EmbeddingProvider,
  options: {
    title?: string
    chunkSize: number
    chunkOverlap: number
    metadata?: Record<string, unknown>
  },
): Promise<EmbeddedDocument> {
  const { title, chunkSize, chunkOverlap, metadata = {} } = options

  // Hash content for differential updates
  const contentHash = hashContent(content)

  // The title (when present and non-empty) is embedded first, followed by the content chunks.
  const contentChunks = simpleChunkText(content, chunkSize, chunkOverlap)
  const titleCount = title ? 1 : 0
  const texts = title ? [title, ...contentChunks] : contentChunks

  // One batch call for everything; skipped entirely when there is nothing to embed.
  const embeddings: number[][] = texts.length > 0 ? await provider.embedBatch(texts) : []

  // A short or long response would silently attach missing/wrong vectors to chunks.
  if (embeddings.length !== texts.length) {
    throw new Error(
      `Embedding provider returned ${embeddings.length} embeddings for ${texts.length} chunks of document "${documentId}"`,
    )
  }

  // Distance between the starts of consecutive content chunks (mirrors simpleChunkText).
  const stride = chunkSize - chunkOverlap

  const chunks: EmbeddingChunk[] = texts.map((text, position) => {
    const embedding = embeddings[position]

    if (position < titleCount) {
      return {
        text,
        embedding,
        metadata: {
          // Custom metadata goes first so it cannot overwrite the computed fields.
          ...metadata,
          chunkIndex: -1, // Special index for title
          startOffset: 0,
          endOffset: 0,
          isTitle: true,
        },
      }
    }

    const chunkIndex = position - titleCount
    const startOffset = chunkIndex * stride

    return {
      text,
      embedding,
      metadata: {
        ...metadata,
        chunkIndex,
        startOffset,
        endOffset: Math.min(startOffset + chunkSize, content.length),
      },
    }
  })

  // Taken once so the document and its embedding metadata carry the same timestamp.
  const generatedAt = new Date().toISOString()

  return {
    id: documentId,
    title,
    chunks,
    embeddingMetadata: {
      model: provider.model,
      provider: provider.type,
      dimensions: provider.dimensions,
      generatedAt,
    },
    generatedAt,
    contentHash,
  }
}