@twelvehart/supermemory-runtime 1.0.0-next.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +57 -0
- package/README.md +374 -0
- package/dist/index.js +189 -0
- package/dist/mcp/index.js +1132 -0
- package/docker-compose.prod.yml +91 -0
- package/docker-compose.yml +358 -0
- package/drizzle/0000_dapper_the_professor.sql +159 -0
- package/drizzle/0001_api_keys.sql +51 -0
- package/drizzle/meta/0000_snapshot.json +1532 -0
- package/drizzle/meta/_journal.json +13 -0
- package/drizzle.config.ts +20 -0
- package/package.json +114 -0
- package/scripts/add-extraction-job.ts +122 -0
- package/scripts/benchmark-pgvector.ts +122 -0
- package/scripts/bootstrap.sh +209 -0
- package/scripts/check-runtime-pack.ts +111 -0
- package/scripts/claude-mcp-config.ts +336 -0
- package/scripts/docker-entrypoint.sh +183 -0
- package/scripts/doctor.ts +377 -0
- package/scripts/init-db.sql +33 -0
- package/scripts/install.sh +1110 -0
- package/scripts/mcp-setup.ts +271 -0
- package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
- package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
- package/scripts/migrations/003_create_hnsw_index.sql +94 -0
- package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
- package/scripts/migrations/005_create_chunks_table.sql +95 -0
- package/scripts/migrations/006_create_processing_queue.sql +45 -0
- package/scripts/migrations/generate_test_data.sql +42 -0
- package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
- package/scripts/migrations/run_migrations.sh +286 -0
- package/scripts/migrations/test_hnsw_index.sql +255 -0
- package/scripts/pre-commit-secrets +282 -0
- package/scripts/run-extraction-worker.ts +46 -0
- package/scripts/run-phase1-tests.sh +291 -0
- package/scripts/setup.ts +222 -0
- package/scripts/smoke-install.sh +12 -0
- package/scripts/test-health-endpoint.sh +328 -0
- package/src/api/index.ts +2 -0
- package/src/api/middleware/auth.ts +80 -0
- package/src/api/middleware/csrf.ts +308 -0
- package/src/api/middleware/errorHandler.ts +166 -0
- package/src/api/middleware/rateLimit.ts +360 -0
- package/src/api/middleware/validation.ts +514 -0
- package/src/api/routes/documents.ts +286 -0
- package/src/api/routes/profiles.ts +237 -0
- package/src/api/routes/search.ts +71 -0
- package/src/api/stores/index.ts +58 -0
- package/src/config/bootstrap-env.ts +3 -0
- package/src/config/env.ts +71 -0
- package/src/config/feature-flags.ts +25 -0
- package/src/config/index.ts +140 -0
- package/src/config/secrets.config.ts +291 -0
- package/src/db/client.ts +92 -0
- package/src/db/index.ts +73 -0
- package/src/db/postgres.ts +72 -0
- package/src/db/schema/chunks.schema.ts +31 -0
- package/src/db/schema/containers.schema.ts +46 -0
- package/src/db/schema/documents.schema.ts +49 -0
- package/src/db/schema/embeddings.schema.ts +32 -0
- package/src/db/schema/index.ts +11 -0
- package/src/db/schema/memories.schema.ts +72 -0
- package/src/db/schema/profiles.schema.ts +34 -0
- package/src/db/schema/queue.schema.ts +59 -0
- package/src/db/schema/relationships.schema.ts +42 -0
- package/src/db/schema.ts +223 -0
- package/src/db/worker-connection.ts +47 -0
- package/src/index.ts +235 -0
- package/src/mcp/CLAUDE.md +1 -0
- package/src/mcp/index.ts +1380 -0
- package/src/mcp/legacyState.ts +22 -0
- package/src/mcp/rateLimit.ts +358 -0
- package/src/mcp/resources.ts +309 -0
- package/src/mcp/results.ts +104 -0
- package/src/mcp/tools.ts +401 -0
- package/src/queues/config.ts +119 -0
- package/src/queues/index.ts +289 -0
- package/src/sdk/client.ts +225 -0
- package/src/sdk/errors.ts +266 -0
- package/src/sdk/http.ts +560 -0
- package/src/sdk/index.ts +244 -0
- package/src/sdk/resources/base.ts +65 -0
- package/src/sdk/resources/connections.ts +204 -0
- package/src/sdk/resources/documents.ts +163 -0
- package/src/sdk/resources/index.ts +10 -0
- package/src/sdk/resources/memories.ts +150 -0
- package/src/sdk/resources/search.ts +60 -0
- package/src/sdk/resources/settings.ts +36 -0
- package/src/sdk/types.ts +674 -0
- package/src/services/chunking/index.ts +451 -0
- package/src/services/chunking.service.ts +650 -0
- package/src/services/csrf.service.ts +252 -0
- package/src/services/documents.repository.ts +219 -0
- package/src/services/documents.service.ts +191 -0
- package/src/services/embedding.service.ts +404 -0
- package/src/services/extraction.service.ts +300 -0
- package/src/services/extractors/code.extractor.ts +451 -0
- package/src/services/extractors/index.ts +9 -0
- package/src/services/extractors/markdown.extractor.ts +461 -0
- package/src/services/extractors/pdf.extractor.ts +315 -0
- package/src/services/extractors/text.extractor.ts +118 -0
- package/src/services/extractors/url.extractor.ts +243 -0
- package/src/services/index.ts +235 -0
- package/src/services/ingestion.service.ts +177 -0
- package/src/services/llm/anthropic.ts +400 -0
- package/src/services/llm/base.ts +460 -0
- package/src/services/llm/contradiction-detector.service.ts +526 -0
- package/src/services/llm/heuristics.ts +148 -0
- package/src/services/llm/index.ts +309 -0
- package/src/services/llm/memory-classifier.service.ts +383 -0
- package/src/services/llm/memory-extension-detector.service.ts +523 -0
- package/src/services/llm/mock.ts +470 -0
- package/src/services/llm/openai.ts +398 -0
- package/src/services/llm/prompts.ts +438 -0
- package/src/services/llm/types.ts +373 -0
- package/src/services/memory.repository.ts +1769 -0
- package/src/services/memory.service.ts +1338 -0
- package/src/services/memory.types.ts +234 -0
- package/src/services/persistence/index.ts +295 -0
- package/src/services/pipeline.service.ts +509 -0
- package/src/services/profile.repository.ts +436 -0
- package/src/services/profile.service.ts +560 -0
- package/src/services/profile.types.ts +270 -0
- package/src/services/relationships/detector.ts +1128 -0
- package/src/services/relationships/index.ts +268 -0
- package/src/services/relationships/memory-integration.ts +459 -0
- package/src/services/relationships/strategies.ts +132 -0
- package/src/services/relationships/types.ts +370 -0
- package/src/services/search.service.ts +761 -0
- package/src/services/search.types.ts +220 -0
- package/src/services/secrets.service.ts +384 -0
- package/src/services/vectorstore/base.ts +327 -0
- package/src/services/vectorstore/index.ts +444 -0
- package/src/services/vectorstore/memory.ts +286 -0
- package/src/services/vectorstore/migration.ts +295 -0
- package/src/services/vectorstore/mock.ts +403 -0
- package/src/services/vectorstore/pgvector.ts +695 -0
- package/src/services/vectorstore/types.ts +247 -0
- package/src/startup.ts +389 -0
- package/src/types/api.types.ts +193 -0
- package/src/types/document.types.ts +103 -0
- package/src/types/index.ts +241 -0
- package/src/types/profile.base.ts +133 -0
- package/src/utils/errors.ts +447 -0
- package/src/utils/id.ts +15 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +313 -0
- package/src/utils/sanitization.ts +501 -0
- package/src/utils/secret-validation.ts +273 -0
- package/src/utils/synonyms.ts +188 -0
- package/src/utils/validation.ts +581 -0
- package/src/workers/chunking.worker.ts +242 -0
- package/src/workers/embedding.worker.ts +358 -0
- package/src/workers/extraction.worker.ts +346 -0
- package/src/workers/indexing.worker.ts +505 -0
- package/tsconfig.json +38 -0
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding Service for Supermemory Clone
|
|
3
|
+
*
|
|
4
|
+
* Provides vector embedding generation using OpenAI's text-embedding-3-small
|
|
5
|
+
* with fallback to local embeddings.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { EmbeddingConfig, EmbeddingProvider } from './search.types.js'
|
|
9
|
+
import { ValidationError, EmbeddingError, ExternalServiceError } from '../utils/errors.js'
|
|
10
|
+
|
|
11
|
+
/**
 * Settings for the remote OpenAI embedding model.
 */
const OPENAI_EMBEDDING_CONFIG: EmbeddingConfig = {
  model: 'text-embedding-3-small',
  dimensions: 1536,
  isLocal: false,
  maxTokens: 8191,
  batchSize: 100,
}

/**
 * Settings for the in-process fallback embedding generator.
 */
const LOCAL_EMBEDDING_CONFIG: EmbeddingConfig = {
  model: 'local-tfidf',
  dimensions: 384,
  isLocal: true,
  maxTokens: 512,
  batchSize: 50,
}

/**
 * Embedding model configuration, looked up by provider name.
 */
const EMBEDDING_CONFIGS: Record<EmbeddingProvider, EmbeddingConfig> = {
  openai: OPENAI_EMBEDDING_CONFIG,
  local: LOCAL_EMBEDDING_CONFIG,
}
|
|
30
|
+
|
|
31
|
+
/**
 * Deterministic 32-bit string hash used to seed the local embedding generator.
 *
 * Every UTF-16 code unit is folded in as `hash * 31 + codeUnit` (expressed as a
 * shift and a subtraction), and the running value is wrapped to the signed
 * 32-bit integer range after each step.
 *
 * @param str - Text to hash.
 * @returns The signed 32-bit hash; `0` for the empty string. May be negative.
 */
function hashCode(str: string): number {
  let acc = 0
  let position = 0
  while (position < str.length) {
    // `| 0` truncates to a signed 32-bit integer so the value wraps instead of growing.
    acc = ((acc << 5) - acc + str.charCodeAt(position)) | 0
    position += 1
  }
  return acc
}
|
|
43
|
+
|
|
44
|
+
/**
 * Builds a deterministic pseudo-random number generator from a numeric seed.
 *
 * The returned function advances an internal state with a linear congruential
 * step masked to 31 bits and returns that state divided by `0x7fffffff`, so
 * every value lies between 0 and 1 (both ends possible). Two generators
 * created with the same seed yield the same sequence.
 *
 * @param seed - Initial state; negative values are accepted.
 * @returns A zero-argument function producing the next value on each call.
 */
function seededRandom(seed: number): () => number {
  const MULTIPLIER = 1103515245
  const INCREMENT = 12345
  const MASK = 0x7fffffff

  let state = seed
  return (): number => {
    state = (state * MULTIPLIER + INCREMENT) & MASK
    return state / MASK
  }
}
|
|
53
|
+
|
|
54
|
+
/**
 * Scales a vector to unit Euclidean (L2) length.
 *
 * @param vector - Components to normalize.
 * @returns A new array with each component divided by the vector's magnitude.
 *   When the magnitude is zero (including the empty vector) the input array
 *   itself is returned unchanged, since it cannot be scaled.
 */
function normalizeVector(vector: number[]): number[] {
  let sumOfSquares = 0
  vector.forEach((component) => {
    sumOfSquares += component * component
  })

  const length = Math.sqrt(sumOfSquares)
  return length === 0 ? vector : vector.map((component) => component / length)
}
|
|
62
|
+
|
|
63
|
+
/**
 * Local term-frequency based embedding generator (fallback).
 *
 * Produces a deterministic vector for a text: the same text and dimension
 * count always yield the same result. The vector is built from two parts:
 *
 * 1. Each distinct token contributes a pseudo-random direction (seeded by the
 *    token's hash) weighted by `log(1 + occurrences)`.
 * 2. Each of the first 50 tokens contributes a second, smaller direction seeded
 *    by the token together with its position, weighted down as the position grows.
 *
 * Tokenization is Unicode-aware: letters, combining marks and digits from any
 * script are kept, so non-Latin text is not reduced to an empty token list.
 * Input made only of ASCII characters tokenizes exactly as it would with `\w`.
 *
 * @param text - Text to embed.
 * @param dimensions - Length of the returned vector (defaults to 384).
 * @returns A unit-length vector, or an all-zero vector when the text contains no tokens.
 */
function generateLocalEmbedding(text: string, dimensions: number = 384): number[] {
  // Lowercase, turn every character that is not a letter, mark, digit,
  // underscore or whitespace into a separator, then split on whitespace.
  const tokens = text
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
    .split(/\s+/)
    .filter((t) => t.length > 0)

  const embedding: number[] = new Array<number>(dimensions).fill(0)

  // Term frequency per distinct token (insertion order = first occurrence order).
  const tokenCounts = new Map<string, number>()
  for (const token of tokens) {
    tokenCounts.set(token, (tokenCounts.get(token) ?? 0) + 1)
  }

  // Random projection: each token adds a hash-seeded direction scaled by its TF weight.
  for (const [token, count] of tokenCounts) {
    const tf = Math.log(1 + count)
    const tokenRandom = seededRandom(hashCode(token))

    for (let i = 0; i < dimensions; i++) {
      // Map the generator output from [0, 1] to [-1, 1].
      embedding[i] += tf * (tokenRandom() * 2 - 1)
    }
  }

  // Positional signal: only the first 50 tokens, with weight decaying by position.
  const positionalLimit = Math.min(tokens.length, 50)
  for (let i = 0; i < positionalLimit; i++) {
    const token = tokens[i]
    if (!token) continue

    const posWeight = 1 / (1 + i * 0.1)
    const posRandom = seededRandom(hashCode(token + ':' + i))

    for (let j = 0; j < dimensions; j++) {
      // The 0.1 factor keeps the positional part small relative to the TF part.
      embedding[j] += posWeight * (posRandom() * 2 - 1) * 0.1
    }
  }

  // Normalize to unit length (a zero vector is returned as-is).
  return normalizeVector(embedding)
}
|
|
114
|
+
|
|
115
|
+
/**
 * Embedding Service class
 *
 * Generates vector embeddings either through the OpenAI-compatible
 * `/embeddings` HTTP endpoint or through the local deterministic generator.
 * The local generator is used when it is the selected provider, when no API
 * key is available, and as a fallback whenever a remote request fails.
 *
 * NOTE(review): fallback vectors produced after a failed remote request use the
 * local configuration's dimension count, which differs from the OpenAI one.
 * Callers storing vectors should check the length of what they receive.
 */
export class EmbeddingService {
  private readonly apiKey: string | undefined
  private readonly baseUrl: string
  private readonly config: EmbeddingConfig
  private readonly provider: EmbeddingProvider

  /**
   * @param options - Optional overrides.
   *   - `apiKey`: API key; defaults to the `OPENAI_API_KEY` environment variable.
   *   - `baseUrl`: API base URL; defaults to `https://api.openai.com/v1`.
   *   - `provider`: provider to use; defaults to `openai` when a key is
   *     available and `local` otherwise.
   */
  constructor(options?: { apiKey?: string; baseUrl?: string; provider?: EmbeddingProvider }) {
    this.apiKey = options?.apiKey || process.env.OPENAI_API_KEY
    this.baseUrl = options?.baseUrl || 'https://api.openai.com/v1'

    let provider: EmbeddingProvider = options?.provider || (this.apiKey ? 'openai' : 'local')
    if (!this.apiKey && provider === 'openai') {
      console.warn('[EmbeddingService] No OpenAI API key found, falling back to local embeddings')
      provider = 'local'
    }

    // Resolve the configuration only after the fallback decision so that the
    // config, truncation limits and reported dimensions all describe the
    // provider that is actually in use.
    this.provider = provider
    this.config = EMBEDDING_CONFIGS[provider]
  }

  /**
   * Get the current embedding configuration
   *
   * @returns A shallow copy, so callers cannot mutate the service's settings.
   */
  getConfig(): EmbeddingConfig {
    return { ...this.config }
  }

  /**
   * Get the embedding dimensions of the active provider
   */
  getDimensions(): number {
    return this.config.dimensions
  }

  /**
   * Check if using local fallback
   *
   * @returns `true` when the active provider is `local`.
   */
  isUsingLocalFallback(): boolean {
    return this.provider === 'local'
  }

  /**
   * Generate embedding for a single text
   *
   * The text is truncated to the provider's approximate character budget
   * first. If the remote request fails, a warning is logged and a local
   * embedding is returned instead of throwing.
   *
   * @param text - Text to embed.
   * @returns The embedding vector.
   * @throws ValidationError when the text is empty or only whitespace.
   */
  async generateEmbedding(text: string): Promise<number[]> {
    if (!text || text.trim().length === 0) {
      throw new ValidationError('Text cannot be empty', {
        text: ['Text is required and cannot be empty'],
      })
    }

    const truncatedText = this.truncate(text)

    if (this.provider === 'local') {
      return this.generateLocalEmbedding(truncatedText)
    }

    try {
      return await this.generateOpenAIEmbedding(truncatedText)
    } catch (error) {
      console.warn('[EmbeddingService] OpenAI embedding failed, falling back to local:', error)
      return this.generateLocalEmbedding(truncatedText)
    }
  }

  /**
   * Generate embeddings for multiple texts (batch)
   *
   * Empty or whitespace-only entries are not embedded; their slot in the
   * result holds an empty array so that output positions match input positions.
   * Remote requests are sent sequentially in chunks of the configured batch
   * size. If any chunk fails, a warning is logged and the whole batch is
   * embedded locally instead.
   *
   * @param texts - Texts to embed.
   * @returns One vector per input text, in input order; `[]` for empty input.
   */
  async batchEmbed(texts: string[]): Promise<number[][]> {
    if (!texts || texts.length === 0) {
      return []
    }

    // Keep only non-blank texts, remembering where each one came from.
    const validIndices: number[] = []
    const truncatedTexts: string[] = []
    for (let i = 0; i < texts.length; i++) {
      const text = texts[i]
      if (text && text.trim().length > 0) {
        validIndices.push(i)
        truncatedTexts.push(this.truncate(text))
      }
    }

    if (validIndices.length === 0) {
      return texts.map(() => [])
    }

    const embedLocally = (): number[][] =>
      this.reconstructBatch(
        truncatedTexts.map((text) => this.generateLocalEmbedding(text)),
        validIndices,
        texts.length
      )

    if (this.provider === 'local') {
      return embedLocally()
    }

    try {
      const batchSize = this.config.batchSize || 100
      const allEmbeddings: number[][] = []

      // Process in batches
      for (let i = 0; i < truncatedTexts.length; i += batchSize) {
        const batch = truncatedTexts.slice(i, i + batchSize)
        const batchEmbeddings = await this.generateOpenAIBatchEmbedding(batch)
        allEmbeddings.push(...batchEmbeddings)
      }

      return this.reconstructBatch(allEmbeddings, validIndices, texts.length)
    } catch (error) {
      console.warn('[EmbeddingService] OpenAI batch embedding failed, falling back to local:', error)
      return embedLocally()
    }
  }

  /**
   * Truncate a text to the active provider's approximate character budget
   * (configured max tokens times four characters per token).
   */
  private truncate(text: string): string {
    const maxChars = (this.config.maxTokens || 8191) * 4 // Rough estimate
    return text.length > maxChars ? text.slice(0, maxChars) : text
  }

  /**
   * Reconstruct batch with empty embeddings for filtered entries
   *
   * @param embeddings - Vectors for the valid texts, in the same order as `validIndices`.
   * @param validIndices - Original position of each valid text.
   * @param totalLength - Length of the original input array.
   * @returns An array of `totalLength` vectors; positions without a vector hold `[]`.
   */
  private reconstructBatch(embeddings: number[][], validIndices: number[], totalLength: number): number[][] {
    const result: number[][] = Array.from({ length: totalLength }, (): number[] => [])
    for (let i = 0; i < validIndices.length; i++) {
      const idx = validIndices[i]
      const emb = embeddings[i]
      if (idx !== undefined && emb !== undefined) {
        result[idx] = emb
      }
    }
    return result
  }

  /**
   * Generate local embedding (wrapper for the module-level function), always
   * using the local configuration's dimension count.
   */
  private generateLocalEmbedding(text: string): number[] {
    const dimensions = EMBEDDING_CONFIGS.local.dimensions
    return generateLocalEmbedding(text, dimensions)
  }

  /**
   * Generate embedding using OpenAI API
   *
   * @throws ExternalServiceError when the HTTP response is not OK.
   * @throws EmbeddingError when the response contains no embedding.
   */
  private async generateOpenAIEmbedding(text: string): Promise<number[]> {
    const response = await fetch(`${this.baseUrl}/embeddings`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${this.apiKey}`,
      },
      body: JSON.stringify({
        model: this.config.model,
        input: text,
        encoding_format: 'float',
      }),
    })

    if (!response.ok) {
      const error = await response.text()
      throw new ExternalServiceError('OpenAI', `OpenAI API error: ${error}`, response.status, {
        model: this.config.model,
        endpoint: 'embeddings',
      })
    }

    const data = (await response.json()) as {
      data: Array<{ embedding: number[] }>
    }

    const firstResult = data.data[0]
    if (!firstResult) {
      throw new EmbeddingError('No embedding returned from OpenAI API', 'openai', {
        model: this.config.model,
      })
    }
    return firstResult.embedding
  }

  /**
   * Generate batch embeddings using OpenAI API
   *
   * @param texts - Texts for a single request (at most one configured batch).
   * @returns One vector per text, ordered to match `texts`.
   * @throws ExternalServiceError when the HTTP response is not OK.
   * @throws EmbeddingError when the number of returned embeddings differs from
   *   the number of texts sent.
   */
  private async generateOpenAIBatchEmbedding(texts: string[]): Promise<number[][]> {
    const response = await fetch(`${this.baseUrl}/embeddings`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${this.apiKey}`,
      },
      body: JSON.stringify({
        model: this.config.model,
        input: texts,
        encoding_format: 'float',
      }),
    })

    if (!response.ok) {
      const error = await response.text()
      throw new ExternalServiceError('OpenAI', `OpenAI API batch embedding error: ${error}`, response.status, {
        model: this.config.model,
        batchSize: texts.length,
      })
    }

    const data = (await response.json()) as {
      data: Array<{ embedding: number[]; index: number }>
    }

    // A short or long response would shift every later vector onto the wrong
    // text once batches are concatenated, so treat it as a failure instead.
    if (data.data.length !== texts.length) {
      throw new EmbeddingError('OpenAI API returned an unexpected number of embeddings', 'openai', {
        model: this.config.model,
        expected: texts.length,
        received: data.data.length,
      })
    }

    // Sort a copy by index to maintain input order without mutating the parsed response.
    const sorted = [...data.data].sort((a, b) => a.index - b.index)
    return sorted.map((item) => item.embedding)
  }
}
|
|
339
|
+
|
|
340
|
+
/**
 * Cosine similarity of two equal-length vectors.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns The dot product divided by the product of both magnitudes, or `0`
 *   when either vector has zero magnitude.
 * @throws ValidationError when the vectors have different lengths.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new ValidationError(`Vector dimension mismatch: ${a.length} vs ${b.length}`, {
      vectorA: [`Expected dimension ${b.length}, got ${a.length}`],
    })
  }

  let dot = 0
  let squaredA = 0
  let squaredB = 0

  for (let index = 0; index < a.length; index += 1) {
    // Missing entries count as zero.
    const left = a[index] ?? 0
    const right = b[index] ?? 0
    dot += left * right
    squaredA += left * left
    squaredB += right * right
  }

  const denominator = Math.sqrt(squaredA) * Math.sqrt(squaredB)
  return denominator === 0 ? 0 : dot / denominator
}
|
|
367
|
+
|
|
368
|
+
/**
 * Factory for a new, independent embedding service instance.
 *
 * @param options - Optional API key, base URL and provider, passed unchanged
 *   to the `EmbeddingService` constructor.
 * @returns The newly constructed service.
 */
export function createEmbeddingService(options?: {
  apiKey?: string
  baseUrl?: string
  provider?: EmbeddingProvider
}): EmbeddingService {
  const service = new EmbeddingService(options)
  return service
}
|
|
378
|
+
|
|
379
|
+
// Shared instance, created on first request and cleared by resetEmbeddingService()
let _embeddingService: EmbeddingService | null = null

/**
 * Returns the shared embedding service, constructing it with default options
 * the first time it is requested.
 */
export function getEmbeddingService(): EmbeddingService {
  if (_embeddingService) {
    return _embeddingService
  }

  const created = new EmbeddingService()
  _embeddingService = created
  return created
}
|
|
391
|
+
|
|
392
|
+
/**
 * Drops the shared instance so the next `getEmbeddingService()` call builds a
 * fresh one (useful for testing).
 */
export function resetEmbeddingService(): void {
  _embeddingService = null
  return
}
|
|
398
|
+
|
|
399
|
+
// Default instance kept for backwards compatibility. It is a proxy around an
// empty placeholder: each property read is forwarded to the lazily created
// singleton, so the real service is only constructed on first use.
export const embeddingService = new Proxy({} as EmbeddingService, {
  get: (_target, property) => {
    const instance = getEmbeddingService()
    return instance[property as keyof EmbeddingService]
  },
})
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Main extraction orchestrator - routes documents to appropriate extractors
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { Document, ContentType, ExtractionResult, ExtractorInterface } from '../types/document.types.js'
|
|
6
|
+
import { TextExtractor } from './extractors/text.extractor.js'
|
|
7
|
+
import { UrlExtractor } from './extractors/url.extractor.js'
|
|
8
|
+
import { PdfExtractor } from './extractors/pdf.extractor.js'
|
|
9
|
+
import { MarkdownExtractor } from './extractors/markdown.extractor.js'
|
|
10
|
+
import { CodeExtractor } from './extractors/code.extractor.js'
|
|
11
|
+
|
|
12
|
+
/**
 * Registration record tying an extractor to the content type it produces.
 */
interface ExtractorConfig {
  /** Content type reported when this extractor is selected. */
  contentType: ContentType
  /** Ordering weight; entries with a higher value are checked first. */
  priority: number
  /** The extractor implementation. */
  extractor: ExtractorInterface
}
|
|
17
|
+
|
|
18
|
+
/**
 * Routes documents to the extractor matching their content type and offers
 * helpers for detecting that type from content, file name or MIME type.
 */
export class ExtractionService {
  /** File extensions (lowercase, without the dot) that are classified as `code`. */
  private static readonly CODE_EXTENSIONS: ReadonlySet<string> = new Set([
    'ts',
    'tsx',
    'js',
    'jsx',
    'mjs',
    'cjs',
    'py',
    'pyw',
    'go',
    'java',
    'rs',
    'c',
    'cpp',
    'cc',
    'cxx',
    'h',
    'hpp',
    'cs',
    'rb',
    'php',
    'swift',
    'kt',
    'kts',
    'scala',
    'sh',
    'bash',
    'zsh',
    'sql',
    'json',
    'yaml',
    'yml',
    'toml',
    'xml',
    'css',
    'scss',
    'sass',
    'less',
    'html',
    'htm',
    'vue',
    'svelte',
  ])

  /** MIME types (lowercase, without parameters) that are classified as `code`. */
  private static readonly CODE_MIME_TYPES: ReadonlySet<string> = new Set([
    'text/javascript',
    'application/javascript',
    'text/typescript',
    'text/x-python',
    'text/x-go',
    'text/x-java',
    'text/x-rust',
    'text/x-c',
    'text/x-c++',
    'application/json',
    'text/css',
    'text/xml',
    'application/xml',
  ])

  private readonly extractors: ExtractorConfig[]
  private readonly textExtractor: TextExtractor
  private readonly urlExtractor: UrlExtractor
  private readonly pdfExtractor: PdfExtractor
  private readonly markdownExtractor: MarkdownExtractor
  private readonly codeExtractor: CodeExtractor

  constructor() {
    // Initialize all extractors
    this.textExtractor = new TextExtractor()
    this.urlExtractor = new UrlExtractor()
    this.pdfExtractor = new PdfExtractor()
    this.markdownExtractor = new MarkdownExtractor()
    this.codeExtractor = new CodeExtractor()

    // Configure extractors with priorities (higher = checked first)
    const configs: ExtractorConfig[] = [
      { extractor: this.urlExtractor, priority: 100, contentType: 'url' },
      { extractor: this.pdfExtractor, priority: 90, contentType: 'pdf' },
      { extractor: this.codeExtractor, priority: 80, contentType: 'code' },
      { extractor: this.markdownExtractor, priority: 70, contentType: 'markdown' },
      { extractor: this.textExtractor, priority: 10, contentType: 'text' },
    ]
    this.extractors = configs.sort((a, b) => b.priority - a.priority)
  }

  /**
   * Extract content from a document, routing to the appropriate extractor
   *
   * The document's own `contentType` is used when set; otherwise the type is
   * detected from the content. The result's metadata is extended with the
   * document id, the original content type and the content type actually used.
   *
   * @param document - Document to extract.
   * @returns The extractor's result with the additional metadata fields.
   * @throws Error wrapping the extractor's failure message, including the
   *   document id and content type.
   */
  async extract(document: Document): Promise<ExtractionResult> {
    const contentType = document.contentType || this.detectContentType(document.content)
    const extractor = this.getExtractor(contentType)

    const options: Record<string, unknown> = {
      metadata: document.metadata,
      fileName: document.fileName,
      language: document.language,
    }

    let result: ExtractionResult
    try {
      result = await extractor.extract(document.content, options)
    } catch (error) {
      const message = error instanceof Error ? error.message : 'Unknown extraction error'
      throw new Error(`Extraction failed for document ${document.id} (type: ${contentType}): ${message}`)
    }

    return {
      ...result,
      metadata: {
        ...result.metadata,
        documentId: document.id,
        originalContentType: document.contentType,
        detectedContentType: contentType,
      },
    }
  }

  /**
   * Auto-detect content type from content
   *
   * @param content - Raw content to inspect.
   * @returns The content type of the first extractor, in priority order, whose
   *   `canHandle` accepts the content; `'unknown'` for empty or non-string
   *   content or when no extractor accepts it.
   */
  detectContentType(content: string): ContentType {
    if (!content || typeof content !== 'string') {
      return 'unknown'
    }

    // Check each extractor in priority order
    for (const config of this.extractors) {
      if (config.extractor.canHandle(content)) {
        return config.contentType
      }
    }

    return 'unknown'
  }

  /**
   * Get the appropriate extractor for a content type
   *
   * `'text'`, `'unknown'` and any unrecognized value use the text extractor.
   */
  private getExtractor(contentType: ContentType): ExtractorInterface {
    switch (contentType) {
      case 'url':
        return this.urlExtractor
      case 'pdf':
        return this.pdfExtractor
      case 'code':
        return this.codeExtractor
      case 'markdown':
        return this.markdownExtractor
      case 'text':
      case 'unknown':
      default:
        return this.textExtractor
    }
  }

  /**
   * Detect content type from file extension
   *
   * The extension is the text after the last `.` in the name, compared
   * case-insensitively. A name that contains no `.` has no extension and is
   * reported as `'unknown'`, even if the whole name happens to match a known
   * extension (for example a file literally called `sh` or `md`).
   *
   * @param fileName - File name or path.
   * @returns `'pdf'`, `'markdown'`, `'code'`, `'text'` or `'unknown'`.
   */
  detectFromFileName(fileName: string): ContentType {
    const dotIndex = fileName.lastIndexOf('.')
    if (dotIndex === -1) return 'unknown'

    const ext = fileName.slice(dotIndex + 1).toLowerCase()
    if (!ext) return 'unknown'

    // PDF
    if (ext === 'pdf') return 'pdf'

    // Markdown
    if (['md', 'markdown', 'mdx'].includes(ext)) return 'markdown'

    // Code files
    if (ExtractionService.CODE_EXTENSIONS.has(ext)) return 'code'

    // Plain text
    if (['txt', 'text', 'log'].includes(ext)) return 'text'

    return 'unknown'
  }

  /**
   * Detect content type from MIME type
   *
   * Parameters after `;` (such as a charset) are ignored and the comparison is
   * case-insensitive.
   *
   * @param mimeType - MIME type string, e.g. `text/html; charset=utf-8`.
   * @returns `'pdf'`, `'markdown'`, `'url'` (for `text/html`), `'code'`,
   *   `'text'` or `'unknown'`.
   */
  detectFromMimeType(mimeType: string): ContentType {
    const normalized = mimeType.toLowerCase().split(';')[0]?.trim() ?? ''

    // PDF
    if (normalized === 'application/pdf') return 'pdf'

    // Markdown
    if (normalized === 'text/markdown' || normalized === 'text/x-markdown') {
      return 'markdown'
    }

    // HTML (URL content)
    if (normalized === 'text/html') return 'url'

    // Code types
    if (ExtractionService.CODE_MIME_TYPES.has(normalized)) return 'code'

    // Plain text
    if (normalized === 'text/plain') return 'text'

    return 'unknown'
  }

  /**
   * Extract with all extractors and return the best result
   * Useful for ambiguous content
   *
   * Every extractor whose `canHandle` accepts the content is run in priority
   * order; extractors that throw are skipped. `bestType` is the content type
   * of the highest-scoring successful result (the higher-priority extractor
   * wins ties) and is always a key of `results` when at least one extractor
   * succeeded. It is `'unknown'` only when `results` is empty.
   *
   * @param content - Raw content to extract.
   * @param options - Options forwarded unchanged to each extractor.
   * @returns All successful results keyed by content type, plus the best type.
   */
  async extractWithAllExtractors(
    content: string,
    options?: Record<string, unknown>
  ): Promise<{ results: Map<ContentType, ExtractionResult>; bestType: ContentType }> {
    const results = new Map<ContentType, ExtractionResult>()
    let bestType: ContentType = 'unknown'
    // Scores can be zero or negative (short content is penalized), so start
    // below every possible score; otherwise such results could never be chosen.
    let bestScore = Number.NEGATIVE_INFINITY

    for (const config of this.extractors) {
      if (!config.extractor.canHandle(content)) continue

      try {
        const result = await config.extractor.extract(content, options)
        results.set(config.contentType, result)

        // Score based on metadata richness
        const score = this.scoreExtractionResult(result)
        if (score > bestScore) {
          bestScore = score
          bestType = config.contentType
        }
      } catch {
        // Best-effort: an extractor that fails is simply left out of the results.
      }
    }

    return { results, bestType }
  }

  /**
   * Score an extraction result based on metadata quality
   *
   * Points: title +10, description +5, author +3, non-empty tags +2,
   * non-empty content +1; content shorter than 50 characters costs 5.
   */
  private scoreExtractionResult(result: ExtractionResult): number {
    let score = 0

    if (result.metadata.title) score += 10
    if (result.metadata.description) score += 5
    if (result.metadata.author) score += 3
    if (result.metadata.tags && result.metadata.tags.length > 0) score += 2
    if (result.content.length > 0) score += 1

    // Penalize if content is too short
    if (result.content.length < 50) score -= 5

    return score
  }

  /**
   * Get supported content types, in extractor priority order
   */
  getSupportedContentTypes(): ContentType[] {
    return this.extractors.map((e) => e.contentType)
  }

  /**
   * Check if a content type is supported (i.e. has a registered extractor)
   */
  isContentTypeSupported(contentType: ContentType): boolean {
    return this.extractors.some((e) => e.contentType === contentType)
  }

  /**
   * Get extractor instances for direct access
   */
  getExtractors() {
    return {
      text: this.textExtractor,
      url: this.urlExtractor,
      pdf: this.pdfExtractor,
      markdown: this.markdownExtractor,
      code: this.codeExtractor,
    }
  }
}
|