@opensaas/stack-rag 0.1.6 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +141 -0
  3. package/README.md +82 -6
  4. package/dist/config/index.d.ts.map +1 -1
  5. package/dist/config/index.js +9 -0
  6. package/dist/config/index.js.map +1 -1
  7. package/dist/config/plugin.d.ts.map +1 -1
  8. package/dist/config/plugin.js +61 -1
  9. package/dist/config/plugin.js.map +1 -1
  10. package/dist/config/plugin.test.js +70 -14
  11. package/dist/config/plugin.test.js.map +1 -1
  12. package/dist/config/types.d.ts +186 -0
  13. package/dist/config/types.d.ts.map +1 -1
  14. package/dist/fields/index.d.ts +1 -0
  15. package/dist/fields/index.d.ts.map +1 -1
  16. package/dist/fields/index.js +1 -0
  17. package/dist/fields/index.js.map +1 -1
  18. package/dist/fields/searchable.d.ts +42 -0
  19. package/dist/fields/searchable.d.ts.map +1 -0
  20. package/dist/fields/searchable.js +51 -0
  21. package/dist/fields/searchable.js.map +1 -0
  22. package/dist/fields/searchable.test.d.ts +2 -0
  23. package/dist/fields/searchable.test.d.ts.map +1 -0
  24. package/dist/fields/searchable.test.js +112 -0
  25. package/dist/fields/searchable.test.js.map +1 -0
  26. package/dist/index.d.ts +2 -1
  27. package/dist/index.d.ts.map +1 -1
  28. package/dist/providers/openai.d.ts +2 -0
  29. package/dist/providers/openai.d.ts.map +1 -1
  30. package/dist/providers/openai.js +35 -20
  31. package/dist/providers/openai.js.map +1 -1
  32. package/dist/runtime/batch.test.js +1 -1
  33. package/dist/runtime/build-time.d.ts +100 -0
  34. package/dist/runtime/build-time.d.ts.map +1 -0
  35. package/dist/runtime/build-time.js +185 -0
  36. package/dist/runtime/build-time.js.map +1 -0
  37. package/dist/runtime/index.d.ts +3 -0
  38. package/dist/runtime/index.d.ts.map +1 -1
  39. package/dist/runtime/index.js +6 -0
  40. package/dist/runtime/index.js.map +1 -1
  41. package/dist/runtime/markdown.d.ts +33 -0
  42. package/dist/runtime/markdown.d.ts.map +1 -0
  43. package/dist/runtime/markdown.js +94 -0
  44. package/dist/runtime/markdown.js.map +1 -0
  45. package/dist/runtime/provider-helpers.d.ts +56 -0
  46. package/dist/runtime/provider-helpers.d.ts.map +1 -0
  47. package/dist/runtime/provider-helpers.js +95 -0
  48. package/dist/runtime/provider-helpers.js.map +1 -0
  49. package/dist/runtime/types.d.ts +29 -0
  50. package/dist/runtime/types.d.ts.map +1 -0
  51. package/dist/runtime/types.js +6 -0
  52. package/dist/runtime/types.js.map +1 -0
  53. package/dist/storage/access-filter.d.ts +30 -0
  54. package/dist/storage/access-filter.d.ts.map +1 -0
  55. package/dist/storage/access-filter.js +241 -0
  56. package/dist/storage/access-filter.js.map +1 -0
  57. package/dist/storage/index.d.ts +2 -0
  58. package/dist/storage/index.d.ts.map +1 -1
  59. package/dist/storage/index.js +3 -0
  60. package/dist/storage/index.js.map +1 -1
  61. package/dist/storage/json-file.d.ts +53 -0
  62. package/dist/storage/json-file.d.ts.map +1 -0
  63. package/dist/storage/json-file.js +124 -0
  64. package/dist/storage/json-file.js.map +1 -0
  65. package/dist/storage/pgvector.d.ts.map +1 -1
  66. package/dist/storage/pgvector.js +26 -11
  67. package/dist/storage/pgvector.js.map +1 -1
  68. package/dist/storage/storage.test.js +2 -0
  69. package/dist/storage/storage.test.js.map +1 -1
  70. package/dist/storage/types.d.ts +5 -0
  71. package/dist/storage/types.d.ts.map +1 -1
  72. package/dist/storage/types.js.map +1 -1
  73. package/package.json +6 -5
  74. package/src/config/index.ts +9 -0
  75. package/src/config/plugin.test.ts +70 -14
  76. package/src/config/plugin.ts +72 -2
  77. package/src/config/types.ts +217 -0
  78. package/src/fields/index.ts +2 -0
  79. package/src/fields/searchable.test.ts +136 -0
  80. package/src/fields/searchable.ts +57 -0
  81. package/src/index.ts +6 -0
  82. package/src/providers/openai.ts +37 -22
  83. package/src/runtime/batch.test.ts +1 -1
  84. package/src/runtime/build-time.ts +216 -0
  85. package/src/runtime/index.ts +18 -0
  86. package/src/runtime/markdown.ts +119 -0
  87. package/src/runtime/provider-helpers.ts +115 -0
  88. package/src/runtime/types.ts +30 -0
  89. package/src/storage/access-filter.ts +303 -0
  90. package/src/storage/index.ts +4 -0
  91. package/src/storage/json-file.ts +157 -0
  92. package/src/storage/pgvector.ts +31 -11
  93. package/src/storage/storage.test.ts +2 -0
  94. package/src/storage/types.ts +6 -0
  95. package/tsconfig.tsbuildinfo +1 -1
@@ -0,0 +1,216 @@
1
+ /**
2
+ * Build-time utilities for generating and managing embeddings
3
+ * Used by CLI tools and custom build scripts
4
+ */
5
+
6
+ import { readFileSync, existsSync } from 'node:fs'
7
+ import { createHash } from 'node:crypto'
8
+ import type { EmbeddingProvider } from '../providers/types.js'
9
+ import type { EmbeddingsIndex, EmbeddedDocument, EmbeddingChunk } from '../config/types.js'
10
+
11
/**
 * Simple character-based text chunking for build-time generation
 *
 * Simpler than the runtime chunking strategies, optimized for build-time batch processing.
 * Splits text into fixed-size chunks with overlap: each chunk starts
 * `chunkSize - overlap` characters after the previous one, and the last chunks
 * are truncated at the end of the text. Empty text yields an empty array.
 *
 * @param text - Text to chunk
 * @param chunkSize - Size of each chunk in characters (must be greater than 0)
 * @param overlap - Overlap between chunks in characters (must be smaller than `chunkSize`)
 * @returns Array of text chunks
 * @throws RangeError if `chunkSize` is not positive or `overlap` is not smaller
 *   than `chunkSize` (the chunk window would never advance)
 *
 * @example
 * ```typescript
 * import { simpleChunkText } from '@opensaas/stack-rag/runtime'
 *
 * const chunks = simpleChunkText("Long document...", 500, 50)
 * ```
 */
export function simpleChunkText(text: string, chunkSize: number, overlap: number): string[] {
  // Distance the window moves per chunk. It must be strictly positive, otherwise
  // the loop below would never reach the end of the text. Written as negated
  // comparisons so NaN is rejected as well.
  const step = chunkSize - overlap
  if (!(chunkSize > 0) || !(step > 0)) {
    throw new RangeError(
      `Invalid chunking parameters: chunkSize (${chunkSize}) must be positive and greater than overlap (${overlap})`,
    )
  }

  const chunks: string[] = []
  for (let start = 0; start < text.length; start += step) {
    chunks.push(text.slice(start, Math.min(start + chunkSize, text.length)))
  }

  return chunks
}
41
+
42
/**
 * Fingerprint a piece of content with SHA-256 so changes can be detected
 *
 * @param content - Content to hash
 * @returns The digest encoded as a hexadecimal string
 *
 * @example
 * ```typescript
 * import { hashContent } from '@opensaas/stack-rag/runtime'
 *
 * const hash = hashContent("document content")
 * ```
 */
export function hashContent(content: string): string {
  const hasher = createHash('sha256')
  hasher.update(content)
  return hasher.digest('hex')
}
58
+
59
/**
 * Load existing embeddings index from file
 *
 * Used for differential updates - only regenerate embeddings for changed content.
 *
 * Loading is best-effort: a missing file returns null silently, while an
 * unreadable file, invalid JSON, or a top-level value that is not a JSON object
 * logs a warning (including the reason) and returns null.
 *
 * NOTE(review): only the top-level shape is checked; the individual fields of
 * the index are not validated against EmbeddingsIndex.
 *
 * @param filePath - Path to embeddings JSON file
 * @returns Loaded index or null if file doesn't exist or can't be loaded
 *
 * @example
 * ```typescript
 * import { loadExistingIndex } from '@opensaas/stack-rag/runtime'
 *
 * const existing = loadExistingIndex('.embeddings/docs.json')
 * if (existing) {
 *   console.log(`Found ${Object.keys(existing.documents).length} existing documents`)
 * }
 * ```
 */
export function loadExistingIndex(filePath: string): EmbeddingsIndex | null {
  if (!existsSync(filePath)) {
    return null
  }

  try {
    const parsed: unknown = JSON.parse(readFileSync(filePath, 'utf-8'))

    // An index must be a JSON object; arrays, strings, numbers and null are
    // treated as a corrupt file rather than being cast to EmbeddingsIndex.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      throw new TypeError('expected a JSON object at the top level')
    }

    return parsed as EmbeddingsIndex
  } catch (error: unknown) {
    // Keep the best-effort contract, but surface why loading failed.
    const reason = error instanceof Error ? error.message : String(error)
    console.warn(`Warning: Could not load existing embeddings from ${filePath}: ${reason}`)
    return null
  }
}
90
+
91
/**
 * Generate embeddings for a document with chunking
 *
 * Main utility for build-time embedding generation. Chunks the document,
 * generates embeddings for each chunk in a single batch call, and returns a
 * complete EmbeddedDocument.
 *
 * When a title is given it is embedded as an extra leading chunk marked with
 * `chunkIndex: -1` and `isTitle: true`. Content chunks are numbered from 0 and
 * carry the character offsets they were sliced from. Entries in
 * `options.metadata` are spread last into every chunk's metadata, so they
 * override the computed fields on key collisions.
 *
 * @param documentId - Unique identifier for the document
 * @param content - Document content (plain text)
 * @param provider - Embedding provider instance
 * @param options - Generation options
 * @returns Complete embedded document ready to be added to index
 * @throws Error if the provider returns a different number of embeddings than
 *   the number of texts it was given
 *
 * @example
 * ```typescript
 * import { generateDocumentEmbeddings } from '@opensaas/stack-rag/runtime'
 * import { createEmbeddingProvider } from '@opensaas/stack-rag/providers'
 *
 * const provider = createEmbeddingProvider({
 *   type: 'openai',
 *   apiKey: process.env.OPENAI_API_KEY
 * })
 *
 * const doc = await generateDocumentEmbeddings(
 *   'docs/getting-started',
 *   'Document content here...',
 *   provider,
 *   {
 *     title: 'Getting Started',
 *     chunkSize: 500,
 *     chunkOverlap: 50,
 *     metadata: { section: 'guides' }
 *   }
 * )
 * ```
 */
export async function generateDocumentEmbeddings(
  documentId: string,
  content: string,
  provider: EmbeddingProvider,
  options: {
    title?: string
    chunkSize: number
    chunkOverlap: number
    metadata?: Record<string, unknown>
  },
): Promise<EmbeddedDocument> {
  const { title, chunkSize, chunkOverlap, metadata = {} } = options

  // Hash content for differential updates
  const contentHash = hashContent(content)

  // Texts to embed: optional title first, then the content chunks in order
  const contentChunks = simpleChunkText(content, chunkSize, chunkOverlap)
  const titleCount = title ? 1 : 0
  const texts = title ? [title, ...contentChunks] : contentChunks

  // Generate embeddings in one batch; skip the provider call entirely when
  // there is nothing to embed (empty content and no title).
  const embeddings = texts.length > 0 ? await provider.embedBatch(texts) : []

  // A short or long result would silently attach undefined/misaligned vectors.
  if (embeddings.length !== texts.length) {
    throw new Error(
      `Embedding provider returned ${embeddings.length} embeddings for ${texts.length} chunks of document "${documentId}"`,
    )
  }

  // Distance between the start offsets of consecutive content chunks
  const step = chunkSize - chunkOverlap

  const chunks: EmbeddingChunk[] = texts.map((text, position) => {
    const embedding = embeddings[position]

    if (position < titleCount) {
      // Title chunk
      return {
        text,
        embedding,
        metadata: {
          chunkIndex: -1, // Special index for title
          startOffset: 0,
          endOffset: 0,
          isTitle: true,
          ...metadata,
        },
      }
    }

    // Content chunk: recompute the slice boundaries used by simpleChunkText
    const chunkIndex = position - titleCount
    const startOffset = chunkIndex * step
    return {
      text,
      embedding,
      metadata: {
        chunkIndex,
        startOffset,
        endOffset: Math.min(startOffset + chunkSize, content.length),
        ...metadata,
      },
    }
  })

  // One timestamp so both generatedAt fields always agree
  const generatedAt = new Date().toISOString()

  return {
    id: documentId,
    title,
    chunks,
    embeddingMetadata: {
      model: provider.model,
      provider: provider.type,
      dimensions: provider.dimensions,
      generatedAt,
    },
    generatedAt,
    contentHash,
  }
}
@@ -49,3 +49,21 @@ export {
49
49
  type BatchError,
50
50
  type BatchProcessResult,
51
51
  } from './batch.js'
52
+
53
+ // Build-time utilities
54
+ export {
55
+ simpleChunkText,
56
+ hashContent,
57
+ loadExistingIndex,
58
+ generateDocumentEmbeddings,
59
+ } from './build-time.js'
60
+
61
+ // Markdown processing
62
+ export { stripMarkdown, extractMarkdownText } from './markdown.js'
63
+
64
+ // Provider helpers
65
+ export {
66
+ createProviderFromEnv,
67
+ getProviderConfigFromEnv,
68
+ type ProviderType,
69
+ } from './provider-helpers.js'
@@ -0,0 +1,119 @@
1
+ /**
2
+ * Markdown processing utilities for content preparation
3
+ */
4
+
5
/**
 * Strip markdown formatting for cleaner text suitable for embeddings
 *
 * Removes fenced code blocks, inline code, images and HTML tags entirely, and
 * removes heading, emphasis and link markup while keeping the visible text.
 *
 * @param markdown - Markdown text to process
 * @returns Plain text with markdown removed
 *
 * @example
 * ```typescript
 * import { stripMarkdown } from '@opensaas/stack-rag/runtime'
 *
 * const markdown = '# Hello\n\nThis is **bold** text with a [link](url).'
 * const plain = stripMarkdown(markdown)
 * // Returns: 'Hello\n\nThis is bold text with a link.'
 * ```
 */
export function stripMarkdown(markdown: string): string {
  let text = markdown

  // Remove code blocks
  text = text.replace(/```[\s\S]*?```/g, '')
  text = text.replace(/`[^`]+`/g, '')

  // Remove headings markers but keep text
  text = text.replace(/^#+\s+/gm, '')

  // Remove bold/italic markers
  text = text.replace(/\*\*([^*]+)\*\*/g, '$1')
  text = text.replace(/\*([^*]+)\*/g, '$1')
  text = text.replace(/__([^_]+)__/g, '$1')
  text = text.replace(/_([^_]+)_/g, '$1')

  // Remove images. This must run before link stripping: the link pattern also
  // matches the "[alt](url)" part of "![alt](url)" and would leave "!alt" behind.
  text = text.replace(/!\[([^\]]*)\]\([^)]+\)/g, '')

  // Remove links but keep text
  text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')

  // Remove HTML tags
  text = text.replace(/<[^>]+>/g, '')

  // Normalize whitespace
  text = text.replace(/\n{3,}/g, '\n\n')
  text = text.replace(/[ \t]+/g, ' ')

  return text.trim()
}
54
+
55
/**
 * Extract text content from common markdown structures
 *
 * More aggressive than stripMarkdown - extracts only text content,
 * removes all structural elements: leading YAML frontmatter, code, horizontal
 * rules, blockquote/list/heading markers, emphasis, links, images, HTML tags
 * and named HTML entities. Lines are trimmed and runs of blank lines are
 * collapsed, but paragraph breaks are preserved.
 *
 * @param markdown - Markdown text
 * @returns Extracted plain text
 */
export function extractMarkdownText(markdown: string): string {
  let text = markdown

  // Remove YAML frontmatter. Anchored to the very start of the document (no
  // multiline flag) so a pair of "---" horizontal rules further down is never
  // mistaken for frontmatter and deleted together with the text between them.
  text = text.replace(/^---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/, '')

  // Remove code blocks entirely (including content)
  text = text.replace(/```[\s\S]*?```/g, '')

  // Remove inline code
  text = text.replace(/`[^`]+`/g, '')

  // Remove horizontal rules
  text = text.replace(/^[-*_]{3,}$/gm, '')

  // Remove blockquotes markers
  text = text.replace(/^>\s+/gm, '')

  // Remove list markers. Only horizontal whitespace is allowed around the
  // marker so a blank line before a list is not swallowed along with it.
  text = text.replace(/^[ \t]*[-*+][ \t]+/gm, '')
  text = text.replace(/^[ \t]*\d+\.[ \t]+/gm, '')

  // Remove headings markers
  text = text.replace(/^#+\s+/gm, '')

  // Remove emphasis markers
  text = text.replace(/\*\*([^*]+)\*\*/g, '$1')
  text = text.replace(/\*([^*]+)\*/g, '$1')
  text = text.replace(/__([^_]+)__/g, '$1')
  text = text.replace(/_([^_]+)_/g, '$1')

  // Remove strikethrough
  text = text.replace(/~~([^~]+)~~/g, '$1')

  // Remove images. This must run before link stripping: the link pattern also
  // matches the "[alt](url)" part of "![alt](url)" and would leave "!alt" behind.
  text = text.replace(/!\[([^\]]*)\]\([^)]+\)/g, '')

  // Remove links but keep text
  text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')

  // Remove reference-style links
  text = text.replace(/\[([^\]]+)\]\[[^\]]*\]/g, '$1')

  // Remove HTML tags
  text = text.replace(/<[^>]+>/g, '')

  // Remove HTML entities
  text = text.replace(/&[a-z]+;/gi, '')

  // Normalize whitespace. Lines are trimmed with [ \t] rather than \s: with the
  // multiline flag \s also consumes newlines, which glued paragraphs together
  // ("a\n\nb" became "ab"). Blank-line runs are collapsed last, after trimming
  // may have turned whitespace-only lines into empty ones.
  text = text.replace(/[ \t]+/g, ' ')
  text = text.replace(/^[ \t]+|[ \t]+$/gm, '')
  text = text.replace(/\n{3,}/g, '\n\n')

  return text.trim()
}
@@ -0,0 +1,115 @@
1
+ /**
2
+ * Helper utilities for working with embedding providers
3
+ * Simplifies provider creation from environment variables
4
+ */
5
+
6
+ import { createEmbeddingProvider } from '../providers/index.js'
7
+ import type { EmbeddingProvider } from '../providers/types.js'
8
+ import type { EmbeddingProviderConfig } from '../config/types.js'
9
+ import 'dotenv/config'
10
+
11
/**
 * Provider type from environment or configuration
 */
export type ProviderType = 'openai' | 'ollama'

/**
 * Create an embedding provider from environment variables
 *
 * Reads configuration from environment variables:
 * - EMBEDDING_PROVIDER: 'openai' or 'ollama' (default: 'openai'); surrounding
 *   whitespace and letter case are ignored
 * - OPENAI_API_KEY: Required if using OpenAI
 * - OLLAMA_BASE_URL: Ollama endpoint (default: 'http://localhost:11434')
 *
 * Values in `overrides` take precedence over the environment. Empty strings
 * are treated as "not set" for both overrides and environment variables.
 *
 * @param overrides - Optional overrides for environment config
 * @returns Configured embedding provider
 * @throws Error if the OpenAI provider is selected without an API key, or if
 *   the selected provider is neither 'openai' nor 'ollama'
 *
 * @example
 * ```typescript
 * import { createProviderFromEnv } from '@opensaas/stack-rag/runtime'
 *
 * // Uses EMBEDDING_PROVIDER and OPENAI_API_KEY from env
 * const provider = createProviderFromEnv()
 *
 * // Override provider type
 * const ollamaProvider = createProviderFromEnv({ provider: 'ollama' })
 * ```
 */
export function createProviderFromEnv(overrides?: {
  provider?: ProviderType
  openaiApiKey?: string
  ollamaBaseUrl?: string
  model?: string
}): EmbeddingProvider {
  // The environment value is free-form user input: normalize it instead of
  // casting it to ProviderType, so "OpenAI" or a trailing space still resolves
  // and anything else reaches the explicit "Unknown provider" error below.
  const envProvider = process.env.EMBEDDING_PROVIDER?.trim().toLowerCase()
  const providerType: string = overrides?.provider || envProvider || 'openai'

  let config: EmbeddingProviderConfig

  if (providerType === 'openai') {
    const apiKey = overrides?.openaiApiKey || process.env.OPENAI_API_KEY

    if (!apiKey) {
      throw new Error('OPENAI_API_KEY environment variable is required when using OpenAI provider')
    }

    config = {
      type: 'openai',
      apiKey,
      // NOTE(review): the override is passed through unchecked; the cast only
      // satisfies the config type and does not validate the model name.
      model:
        (overrides?.model as 'text-embedding-3-small' | 'text-embedding-3-large') ||
        'text-embedding-3-small',
    }
  } else if (providerType === 'ollama') {
    config = {
      type: 'ollama',
      baseURL: overrides?.ollamaBaseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434',
      model: overrides?.model || 'nomic-embed-text',
    }
  } else {
    throw new Error(`Unknown provider type: ${providerType}. Supported: openai, ollama`)
  }

  return createEmbeddingProvider(config)
}
75
+
76
/**
 * Get provider configuration from environment
 *
 * Useful for inspecting what provider would be used without creating it.
 *
 * Reads EMBEDDING_PROVIDER (default: 'openai'; surrounding whitespace and
 * letter case are ignored), plus OPENAI_API_KEY for OpenAI or OLLAMA_BASE_URL
 * (default: 'http://localhost:11434') for Ollama. The model is always the
 * built-in default for the selected provider.
 *
 * @returns Provider configuration object
 * @throws Error if OpenAI is selected and OPENAI_API_KEY is not set, or if
 *   EMBEDDING_PROVIDER names an unsupported provider
 *
 * @example
 * ```typescript
 * import { getProviderConfigFromEnv } from '@opensaas/stack-rag/runtime'
 *
 * const config = getProviderConfigFromEnv()
 * console.log(`Using ${config.type} provider`)
 * ```
 */
export function getProviderConfigFromEnv(): EmbeddingProviderConfig {
  // The environment value is free-form user input: normalize it instead of
  // casting it, so "OpenAI" or a trailing space still resolves and anything
  // else reaches the explicit "Unknown provider" error below.
  const providerType = process.env.EMBEDDING_PROVIDER?.trim().toLowerCase() || 'openai'

  if (providerType === 'openai') {
    const apiKey = process.env.OPENAI_API_KEY

    if (!apiKey) {
      throw new Error('OPENAI_API_KEY environment variable is required')
    }

    return {
      type: 'openai',
      apiKey,
      model: 'text-embedding-3-small',
    }
  }

  if (providerType === 'ollama') {
    return {
      type: 'ollama',
      baseURL: process.env.OLLAMA_BASE_URL || 'http://localhost:11434',
      model: 'nomic-embed-text',
    }
  }

  throw new Error(`Unknown provider type: ${providerType}`)
}
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Type definitions for RAG plugin runtime services
3
+ * These types are used for type-safe access to context.plugins.rag
4
+ */
5
+
6
/**
 * Runtime services provided by the RAG plugin
 * Available via context.plugins.rag
 *
 * Both members are asynchronous function properties; this interface only
 * describes their shape, the implementation lives elsewhere in the plugin.
 */
export interface RAGRuntimeServices {
  /**
   * Generate embedding for a given text
   * Uses the configured embedding provider
   *
   * @param text - The text to generate an embedding for
   * @param providerName - Optional provider name if multiple providers are configured
   *   (NOTE(review): which provider is used when omitted is decided by the
   *   implementation - confirm against the plugin)
   * @returns Promise resolving to the vector embedding as array of numbers
   */
  generateEmbedding: (text: string, providerName?: string) => Promise<number[]>

  /**
   * Generate embeddings for multiple texts (batch)
   * More efficient than calling generateEmbedding multiple times
   *
   * @param texts - Array of texts to generate embeddings for
   * @param providerName - Optional provider name if multiple providers are configured
   * @returns Promise resolving to an array of vector embeddings
   *   (presumably one per input text, in input order - confirm against the
   *   implementation)
   */
  generateEmbeddings: (texts: string[], providerName?: string) => Promise<number[][]>
}