@twelvehart/supermemory-runtime 1.0.0-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. package/.env.example +57 -0
  2. package/README.md +374 -0
  3. package/dist/index.js +189 -0
  4. package/dist/mcp/index.js +1132 -0
  5. package/docker-compose.prod.yml +91 -0
  6. package/docker-compose.yml +358 -0
  7. package/drizzle/0000_dapper_the_professor.sql +159 -0
  8. package/drizzle/0001_api_keys.sql +51 -0
  9. package/drizzle/meta/0000_snapshot.json +1532 -0
  10. package/drizzle/meta/_journal.json +13 -0
  11. package/drizzle.config.ts +20 -0
  12. package/package.json +114 -0
  13. package/scripts/add-extraction-job.ts +122 -0
  14. package/scripts/benchmark-pgvector.ts +122 -0
  15. package/scripts/bootstrap.sh +209 -0
  16. package/scripts/check-runtime-pack.ts +111 -0
  17. package/scripts/claude-mcp-config.ts +336 -0
  18. package/scripts/docker-entrypoint.sh +183 -0
  19. package/scripts/doctor.ts +377 -0
  20. package/scripts/init-db.sql +33 -0
  21. package/scripts/install.sh +1110 -0
  22. package/scripts/mcp-setup.ts +271 -0
  23. package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
  24. package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
  25. package/scripts/migrations/003_create_hnsw_index.sql +94 -0
  26. package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
  27. package/scripts/migrations/005_create_chunks_table.sql +95 -0
  28. package/scripts/migrations/006_create_processing_queue.sql +45 -0
  29. package/scripts/migrations/generate_test_data.sql +42 -0
  30. package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
  31. package/scripts/migrations/run_migrations.sh +286 -0
  32. package/scripts/migrations/test_hnsw_index.sql +255 -0
  33. package/scripts/pre-commit-secrets +282 -0
  34. package/scripts/run-extraction-worker.ts +46 -0
  35. package/scripts/run-phase1-tests.sh +291 -0
  36. package/scripts/setup.ts +222 -0
  37. package/scripts/smoke-install.sh +12 -0
  38. package/scripts/test-health-endpoint.sh +328 -0
  39. package/src/api/index.ts +2 -0
  40. package/src/api/middleware/auth.ts +80 -0
  41. package/src/api/middleware/csrf.ts +308 -0
  42. package/src/api/middleware/errorHandler.ts +166 -0
  43. package/src/api/middleware/rateLimit.ts +360 -0
  44. package/src/api/middleware/validation.ts +514 -0
  45. package/src/api/routes/documents.ts +286 -0
  46. package/src/api/routes/profiles.ts +237 -0
  47. package/src/api/routes/search.ts +71 -0
  48. package/src/api/stores/index.ts +58 -0
  49. package/src/config/bootstrap-env.ts +3 -0
  50. package/src/config/env.ts +71 -0
  51. package/src/config/feature-flags.ts +25 -0
  52. package/src/config/index.ts +140 -0
  53. package/src/config/secrets.config.ts +291 -0
  54. package/src/db/client.ts +92 -0
  55. package/src/db/index.ts +73 -0
  56. package/src/db/postgres.ts +72 -0
  57. package/src/db/schema/chunks.schema.ts +31 -0
  58. package/src/db/schema/containers.schema.ts +46 -0
  59. package/src/db/schema/documents.schema.ts +49 -0
  60. package/src/db/schema/embeddings.schema.ts +32 -0
  61. package/src/db/schema/index.ts +11 -0
  62. package/src/db/schema/memories.schema.ts +72 -0
  63. package/src/db/schema/profiles.schema.ts +34 -0
  64. package/src/db/schema/queue.schema.ts +59 -0
  65. package/src/db/schema/relationships.schema.ts +42 -0
  66. package/src/db/schema.ts +223 -0
  67. package/src/db/worker-connection.ts +47 -0
  68. package/src/index.ts +235 -0
  69. package/src/mcp/CLAUDE.md +1 -0
  70. package/src/mcp/index.ts +1380 -0
  71. package/src/mcp/legacyState.ts +22 -0
  72. package/src/mcp/rateLimit.ts +358 -0
  73. package/src/mcp/resources.ts +309 -0
  74. package/src/mcp/results.ts +104 -0
  75. package/src/mcp/tools.ts +401 -0
  76. package/src/queues/config.ts +119 -0
  77. package/src/queues/index.ts +289 -0
  78. package/src/sdk/client.ts +225 -0
  79. package/src/sdk/errors.ts +266 -0
  80. package/src/sdk/http.ts +560 -0
  81. package/src/sdk/index.ts +244 -0
  82. package/src/sdk/resources/base.ts +65 -0
  83. package/src/sdk/resources/connections.ts +204 -0
  84. package/src/sdk/resources/documents.ts +163 -0
  85. package/src/sdk/resources/index.ts +10 -0
  86. package/src/sdk/resources/memories.ts +150 -0
  87. package/src/sdk/resources/search.ts +60 -0
  88. package/src/sdk/resources/settings.ts +36 -0
  89. package/src/sdk/types.ts +674 -0
  90. package/src/services/chunking/index.ts +451 -0
  91. package/src/services/chunking.service.ts +650 -0
  92. package/src/services/csrf.service.ts +252 -0
  93. package/src/services/documents.repository.ts +219 -0
  94. package/src/services/documents.service.ts +191 -0
  95. package/src/services/embedding.service.ts +404 -0
  96. package/src/services/extraction.service.ts +300 -0
  97. package/src/services/extractors/code.extractor.ts +451 -0
  98. package/src/services/extractors/index.ts +9 -0
  99. package/src/services/extractors/markdown.extractor.ts +461 -0
  100. package/src/services/extractors/pdf.extractor.ts +315 -0
  101. package/src/services/extractors/text.extractor.ts +118 -0
  102. package/src/services/extractors/url.extractor.ts +243 -0
  103. package/src/services/index.ts +235 -0
  104. package/src/services/ingestion.service.ts +177 -0
  105. package/src/services/llm/anthropic.ts +400 -0
  106. package/src/services/llm/base.ts +460 -0
  107. package/src/services/llm/contradiction-detector.service.ts +526 -0
  108. package/src/services/llm/heuristics.ts +148 -0
  109. package/src/services/llm/index.ts +309 -0
  110. package/src/services/llm/memory-classifier.service.ts +383 -0
  111. package/src/services/llm/memory-extension-detector.service.ts +523 -0
  112. package/src/services/llm/mock.ts +470 -0
  113. package/src/services/llm/openai.ts +398 -0
  114. package/src/services/llm/prompts.ts +438 -0
  115. package/src/services/llm/types.ts +373 -0
  116. package/src/services/memory.repository.ts +1769 -0
  117. package/src/services/memory.service.ts +1338 -0
  118. package/src/services/memory.types.ts +234 -0
  119. package/src/services/persistence/index.ts +295 -0
  120. package/src/services/pipeline.service.ts +509 -0
  121. package/src/services/profile.repository.ts +436 -0
  122. package/src/services/profile.service.ts +560 -0
  123. package/src/services/profile.types.ts +270 -0
  124. package/src/services/relationships/detector.ts +1128 -0
  125. package/src/services/relationships/index.ts +268 -0
  126. package/src/services/relationships/memory-integration.ts +459 -0
  127. package/src/services/relationships/strategies.ts +132 -0
  128. package/src/services/relationships/types.ts +370 -0
  129. package/src/services/search.service.ts +761 -0
  130. package/src/services/search.types.ts +220 -0
  131. package/src/services/secrets.service.ts +384 -0
  132. package/src/services/vectorstore/base.ts +327 -0
  133. package/src/services/vectorstore/index.ts +444 -0
  134. package/src/services/vectorstore/memory.ts +286 -0
  135. package/src/services/vectorstore/migration.ts +295 -0
  136. package/src/services/vectorstore/mock.ts +403 -0
  137. package/src/services/vectorstore/pgvector.ts +695 -0
  138. package/src/services/vectorstore/types.ts +247 -0
  139. package/src/startup.ts +389 -0
  140. package/src/types/api.types.ts +193 -0
  141. package/src/types/document.types.ts +103 -0
  142. package/src/types/index.ts +241 -0
  143. package/src/types/profile.base.ts +133 -0
  144. package/src/utils/errors.ts +447 -0
  145. package/src/utils/id.ts +15 -0
  146. package/src/utils/index.ts +101 -0
  147. package/src/utils/logger.ts +313 -0
  148. package/src/utils/sanitization.ts +501 -0
  149. package/src/utils/secret-validation.ts +273 -0
  150. package/src/utils/synonyms.ts +188 -0
  151. package/src/utils/validation.ts +581 -0
  152. package/src/workers/chunking.worker.ts +242 -0
  153. package/src/workers/embedding.worker.ts +358 -0
  154. package/src/workers/extraction.worker.ts +346 -0
  155. package/src/workers/indexing.worker.ts +505 -0
  156. package/tsconfig.json +38 -0
@@ -0,0 +1,404 @@
1
+ /**
2
+ * Embedding Service for Supermemory Clone
3
+ *
4
+ * Provides vector embedding generation using OpenAI's text-embedding-3-small
5
+ * with fallback to local embeddings.
6
+ */
7
+
8
+ import type { EmbeddingConfig, EmbeddingProvider } from './search.types.js'
9
+ import { ValidationError, EmbeddingError, ExternalServiceError } from '../utils/errors.js'
10
+
11
/**
 * Settings used when embeddings are requested from the OpenAI API.
 */
const OPENAI_EMBEDDING_CONFIG: EmbeddingConfig = {
  model: 'text-embedding-3-small',
  dimensions: 1536,
  isLocal: false,
  maxTokens: 8191,
  batchSize: 100,
}

/**
 * Settings used by the built-in local embedding generator.
 */
const LOCAL_EMBEDDING_CONFIG: EmbeddingConfig = {
  model: 'local-tfidf',
  dimensions: 384,
  isLocal: true,
  maxTokens: 512,
  batchSize: 50,
}

/**
 * Embedding model settings, looked up by provider name.
 */
const EMBEDDING_CONFIGS: Record<EmbeddingProvider, EmbeddingConfig> = {
  openai: OPENAI_EMBEDDING_CONFIG,
  local: LOCAL_EMBEDDING_CONFIG,
}
30
+
31
/**
 * Deterministic string hash used to seed the local embedding generator.
 *
 * Folds each UTF-16 code unit into the accumulator as `31 * acc + unit`,
 * wrapping to a signed 32-bit integer after every step, so the result may be
 * negative. The empty string hashes to 0.
 */
function hashCode(str: string): number {
  let acc = 0
  for (let idx = 0; idx < str.length; idx += 1) {
    // (acc << 5) - acc is 31 * acc; `| 0` truncates to a signed 32-bit integer
    acc = ((acc << 5) - acc + str.charCodeAt(idx)) | 0
  }
  return acc
}
43
+
44
/**
 * Build a deterministic pseudo-random generator from a numeric seed.
 *
 * Each call to the returned function advances an internal state with a
 * multiply-add step masked to 31 bits and returns that state divided by
 * 0x7fffffff, i.e. a value between 0 and 1 inclusive. Two generators created
 * from the same seed produce the same sequence.
 */
function seededRandom(seed: number): () => number {
  const mask = 0x7fffffff
  let state = seed
  return (): number => {
    state = (state * 1103515245 + 12345) & mask
    return state / mask
  }
}
53
+
54
/**
 * Scale a vector to unit Euclidean (L2) length.
 *
 * A vector whose magnitude is 0 is returned as-is (same array instance);
 * otherwise a new array is returned and the input is left untouched.
 */
function normalizeVector(vector: number[]): number[] {
  let sumOfSquares = 0
  vector.forEach((component) => {
    sumOfSquares += component * component
  })

  const length = Math.sqrt(sumOfSquares)
  return length === 0 ? vector : vector.map((component) => component / length)
}
62
+
63
/**
 * Deterministic local embedding generator (fallback when OpenAI is not used).
 *
 * The text is lower-cased, every character that is neither `\w` nor whitespace
 * is replaced by a space, and the result is split on whitespace. Note that
 * `\w` matches only ASCII letters, digits and underscore, so text made up
 * entirely of other characters yields no tokens and an all-zero vector.
 *
 * Two contributions are summed into the vector:
 *  1. one pseudo-random direction per distinct token (seeded by the token's
 *     hash), weighted by log(1 + term frequency);
 *  2. a smaller (x0.1) position-dependent direction for each of the first 50
 *     tokens, weighted by 1 / (1 + 0.1 * position).
 *
 * @param text - Text to embed.
 * @param dimensions - Length of the returned vector (default 384).
 * @returns The L2-normalized vector, or the zero vector when there are no tokens.
 */
function generateLocalEmbedding(text: string, dimensions: number = 384): number[] {
  const lowered = text.toLowerCase()
  const words = lowered
    .replace(/[^\w\s]/g, ' ')
    .split(/\s+/)
    .filter((word) => word.length > 0)

  const vector = new Array(dimensions).fill(0)

  // Term frequency per distinct token, in first-seen order
  const termCounts = new Map<string, number>()
  words.forEach((word) => {
    termCounts.set(word, (termCounts.get(word) ?? 0) + 1)
  })

  // Token contribution: a seeded random direction scaled by the damped term frequency
  termCounts.forEach((count, word) => {
    const weight = Math.log(1 + count)
    const nextRandom = seededRandom(hashCode(word))
    for (let d = 0; d < dimensions; d++) {
      vector[d] += weight * (nextRandom() * 2 - 1)
    }
  })

  // Positional contribution: only the leading tokens, with decaying weight
  const positionalLimit = Math.min(words.length, 50)
  for (let position = 0; position < positionalLimit; position++) {
    const word = words[position]
    if (!word) continue

    const weight = 1 / (1 + position * 0.1)
    const nextRandom = seededRandom(hashCode(word + ':' + position))
    for (let d = 0; d < dimensions; d++) {
      vector[d] += weight * (nextRandom() * 2 - 1) * 0.1
    }
  }

  return normalizeVector(vector)
}
114
+
115
/**
 * Generates text embeddings through an OpenAI-compatible `/embeddings`
 * endpoint, with a deterministic local generator as fallback.
 *
 * Provider selection: an explicit `provider` option wins; otherwise `openai`
 * is used when an API key is available and `local` when it is not. Asking for
 * `openai` without any API key downgrades to `local` with a warning.
 *
 * NOTE: when a request to OpenAI fails at call time, the local generator is
 * used for that call, so the returned vectors have the local dimension count
 * (`EMBEDDING_CONFIGS.local.dimensions`) rather than `getDimensions()`.
 */
export class EmbeddingService {
  private readonly apiKey: string | undefined
  private readonly baseUrl: string
  private readonly config: EmbeddingConfig
  private readonly provider: EmbeddingProvider

  /**
   * @param options.apiKey - OpenAI API key; defaults to `process.env.OPENAI_API_KEY`.
   * @param options.baseUrl - API base URL; defaults to `https://api.openai.com/v1`.
   * @param options.provider - Force a provider instead of inferring it from the key.
   */
  constructor(options?: { apiKey?: string; baseUrl?: string; provider?: EmbeddingProvider }) {
    this.apiKey = options?.apiKey || process.env.OPENAI_API_KEY
    this.baseUrl = options?.baseUrl || 'https://api.openai.com/v1'

    let provider: EmbeddingProvider = options?.provider || (this.apiKey ? 'openai' : 'local')
    if (provider === 'openai' && !this.apiKey) {
      console.warn('[EmbeddingService] No OpenAI API key found, falling back to local embeddings')
      provider = 'local'
    }

    // Resolve the config only after the fallback decision so that config,
    // truncation limits and getDimensions() all describe the same provider.
    this.provider = provider
    this.config = EMBEDDING_CONFIGS[provider]
  }

  /**
   * Get a copy of the active embedding configuration.
   */
  getConfig(): EmbeddingConfig {
    return { ...this.config }
  }

  /**
   * Get the vector length produced by the active provider.
   */
  getDimensions(): number {
    return EMBEDDING_CONFIGS[this.provider].dimensions
  }

  /**
   * Check whether the service is configured to use the local generator.
   */
  isUsingLocalFallback(): boolean {
    return this.provider === 'local'
  }

  /**
   * Generate an embedding for a single text.
   *
   * The text is truncated to roughly `maxTokens * 4` characters first. If the
   * OpenAI request fails, a warning is logged and a local embedding is
   * returned instead of rethrowing.
   *
   * @throws ValidationError when `text` is empty or whitespace only.
   */
  async generateEmbedding(text: string): Promise<number[]> {
    if (!text || text.trim().length === 0) {
      throw new ValidationError('Text cannot be empty', {
        text: ['Text is required and cannot be empty'],
      })
    }

    const truncatedText = this.truncate(text)

    if (this.provider === 'local') {
      return this.generateLocalEmbedding(truncatedText)
    }

    try {
      return await this.generateOpenAIEmbedding(truncatedText)
    } catch (error) {
      console.warn('[EmbeddingService] OpenAI embedding failed, falling back to local:', error)
      return this.generateLocalEmbedding(truncatedText)
    }
  }

  /**
   * Generate embeddings for multiple texts.
   *
   * The result has the same length and order as `texts`; entries that are
   * empty or whitespace only get an empty array. OpenAI requests are sent
   * sequentially in chunks of `batchSize`. If any request fails, a warning is
   * logged and the whole batch is embedded locally.
   */
  async batchEmbed(texts: string[]): Promise<number[][]> {
    if (!texts || texts.length === 0) {
      return []
    }

    // Keep only non-blank texts, remembering where each one came from
    const validIndices: number[] = []
    const validTexts: string[] = []
    texts.forEach((text, index) => {
      if (text && text.trim().length > 0) {
        validIndices.push(index)
        validTexts.push(text)
      }
    })

    if (validTexts.length === 0) {
      return texts.map(() => [])
    }

    const truncatedTexts = validTexts.map((text) => this.truncate(text))
    const embedLocally = (): number[][] =>
      this.reconstructBatch(
        truncatedTexts.map((text) => this.generateLocalEmbedding(text)),
        validIndices,
        texts.length
      )

    if (this.provider === 'local') {
      return embedLocally()
    }

    try {
      const batchSize = this.config.batchSize || 100
      const allEmbeddings: number[][] = []

      for (let start = 0; start < truncatedTexts.length; start += batchSize) {
        const batch = truncatedTexts.slice(start, start + batchSize)
        const batchEmbeddings = await this.generateOpenAIBatchEmbedding(batch)
        allEmbeddings.push(...batchEmbeddings)
      }

      return this.reconstructBatch(allEmbeddings, validIndices, texts.length)
    } catch (error) {
      console.warn('[EmbeddingService] OpenAI batch embedding failed, falling back to local:', error)
      return embedLocally()
    }
  }

  /**
   * Cut a text down to the character budget derived from `maxTokens`
   * (4 characters per token is a rough estimate, not a tokenizer count).
   */
  private truncate(text: string): string {
    const maxChars = (this.config.maxTokens || 8191) * 4
    return text.length > maxChars ? text.slice(0, maxChars) : text
  }

  /**
   * Place each embedding at its original index, leaving an empty array at
   * every position that was filtered out.
   */
  private reconstructBatch(embeddings: number[][], validIndices: number[], totalLength: number): number[][] {
    const result: number[][] = Array.from({ length: totalLength }, (): number[] => [])
    validIndices.forEach((targetIndex, position) => {
      const embedding = embeddings[position]
      if (embedding !== undefined) {
        result[targetIndex] = embedding
      }
    })
    return result
  }

  /**
   * Generate a local embedding using the local provider's dimension count.
   */
  private generateLocalEmbedding(text: string): number[] {
    return generateLocalEmbedding(text, EMBEDDING_CONFIGS.local.dimensions)
  }

  /**
   * POST an embeddings request for one text or a list of texts and return the
   * raw response; status handling is left to the caller.
   */
  private async postEmbeddings(input: string | string[]) {
    return fetch(`${this.baseUrl}/embeddings`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${this.apiKey}`,
      },
      body: JSON.stringify({
        model: this.config.model,
        input,
        encoding_format: 'float',
      }),
    })
  }

  /**
   * Generate one embedding using the OpenAI API.
   *
   * @throws ExternalServiceError on a non-2xx response.
   * @throws EmbeddingError when the response contains no embedding.
   */
  private async generateOpenAIEmbedding(text: string): Promise<number[]> {
    const response = await this.postEmbeddings(text)

    if (!response.ok) {
      const error = await response.text()
      throw new ExternalServiceError('OpenAI', `OpenAI API error: ${error}`, response.status, {
        model: this.config.model,
        endpoint: 'embeddings',
      })
    }

    const data = (await response.json()) as {
      data?: Array<{ embedding: number[] }>
    }

    const firstResult = data.data?.[0]
    if (!firstResult) {
      throw new EmbeddingError('No embedding returned from OpenAI API', 'openai', {
        model: this.config.model,
      })
    }
    return firstResult.embedding
  }

  /**
   * Generate embeddings for a list of texts with a single OpenAI API request.
   *
   * @returns One embedding per input text, in input order.
   * @throws ExternalServiceError on a non-2xx response.
   * @throws EmbeddingError when the number of returned embeddings does not
   *   match the number of inputs (which would otherwise misalign results).
   */
  private async generateOpenAIBatchEmbedding(texts: string[]): Promise<number[][]> {
    const response = await this.postEmbeddings(texts)

    if (!response.ok) {
      const error = await response.text()
      throw new ExternalServiceError('OpenAI', `OpenAI API batch embedding error: ${error}`, response.status, {
        model: this.config.model,
        batchSize: texts.length,
      })
    }

    const data = (await response.json()) as {
      data?: Array<{ embedding: number[]; index: number }>
    }

    const items = data.data
    if (!Array.isArray(items) || items.length !== texts.length) {
      throw new EmbeddingError('Unexpected number of embeddings returned from OpenAI API', 'openai', {
        model: this.config.model,
        expected: texts.length,
        received: Array.isArray(items) ? items.length : 0,
      })
    }

    // Order by the response's index field so results line up with the inputs
    return [...items].sort((a, b) => a.index - b.index).map((item) => item.embedding)
  }
}
339
+
340
/**
 * Cosine similarity of two equally sized vectors.
 *
 * @returns The dot product divided by the product of the two magnitudes, or 0
 *   when either vector has zero magnitude (including two empty vectors).
 * @throws ValidationError when the vectors differ in length.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new ValidationError(`Vector dimension mismatch: ${a.length} vs ${b.length}`, {
      vectorA: [`Expected dimension ${b.length}, got ${a.length}`],
    })
  }

  let dot = 0
  let sumSquaresA = 0
  let sumSquaresB = 0

  for (const [index, rawA] of a.entries()) {
    // Missing entries count as 0
    const left = rawA ?? 0
    const right = b[index] ?? 0
    dot += left * right
    sumSquaresA += left * left
    sumSquaresB += right * right
  }

  const denominator = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB)
  return denominator === 0 ? 0 : dot / denominator
}
367
+
368
/**
 * Factory for a new, independent EmbeddingService.
 *
 * The options are passed to the EmbeddingService constructor unchanged; the
 * shared singleton is not touched.
 */
export function createEmbeddingService(options?: {
  apiKey?: string
  baseUrl?: string
  provider?: EmbeddingProvider
}): EmbeddingService {
  const service = new EmbeddingService(options)
  return service
}
378
+
379
// Shared instance; stays null until getEmbeddingService() is first called
let _embeddingService: EmbeddingService | null = null

/**
 * Return the shared EmbeddingService, constructing it with default options
 * on first use.
 */
export function getEmbeddingService(): EmbeddingService {
  if (_embeddingService === null) {
    _embeddingService = new EmbeddingService()
  }
  return _embeddingService
}
391
+
392
/**
 * Drop the shared EmbeddingService so the next getEmbeddingService() call
 * builds a fresh one (useful for testing).
 */
export function resetEmbeddingService(): void {
  _embeddingService = null
}
398
+
399
// Default export-style instance kept for backwards compatibility. Every
// property read is forwarded to the lazily created singleton, so nothing is
// constructed until the first access.
export const embeddingService = new Proxy({} as EmbeddingService, {
  get(_target, property) {
    const instance = getEmbeddingService()
    return instance[property as keyof EmbeddingService]
  },
})
@@ -0,0 +1,300 @@
1
+ /**
2
+ * Main extraction orchestrator - routes documents to appropriate extractors
3
+ */
4
+
5
+ import { Document, ContentType, ExtractionResult, ExtractorInterface } from '../types/document.types.js'
6
+ import { TextExtractor } from './extractors/text.extractor.js'
7
+ import { UrlExtractor } from './extractors/url.extractor.js'
8
+ import { PdfExtractor } from './extractors/pdf.extractor.js'
9
+ import { MarkdownExtractor } from './extractors/markdown.extractor.js'
10
+ import { CodeExtractor } from './extractors/code.extractor.js'
11
+
12
/**
 * Registration record tying an extractor to the content type it produces and
 * to its position in the detection order.
 */
type ExtractorConfig = {
  /** Extractor instance to run. */
  extractor: ExtractorInterface
  /** Ordering weight used when sorting registered extractors. */
  priority: number
  /** Content type reported when this extractor is selected. */
  contentType: ContentType
}
17
+
18
/**
 * Routes documents to the extractor matching their content type and offers
 * helpers to detect that type from content, file name or MIME type.
 */
export class ExtractionService {
  /** File extensions (lowercase, without dot) classified as markdown. */
  private static readonly MARKDOWN_EXTENSIONS: ReadonlySet<string> = new Set(['md', 'markdown', 'mdx'])

  /** File extensions (lowercase, without dot) classified as code. */
  private static readonly CODE_EXTENSIONS: ReadonlySet<string> = new Set([
    'ts',
    'tsx',
    'js',
    'jsx',
    'mjs',
    'cjs',
    'py',
    'pyw',
    'go',
    'java',
    'rs',
    'c',
    'cpp',
    'cc',
    'cxx',
    'h',
    'hpp',
    'cs',
    'rb',
    'php',
    'swift',
    'kt',
    'kts',
    'scala',
    'sh',
    'bash',
    'zsh',
    'sql',
    'json',
    'yaml',
    'yml',
    'toml',
    'xml',
    'css',
    'scss',
    'sass',
    'less',
    'html',
    'htm',
    'vue',
    'svelte',
  ])

  /** File extensions (lowercase, without dot) classified as plain text. */
  private static readonly TEXT_EXTENSIONS: ReadonlySet<string> = new Set(['txt', 'text', 'log'])

  /** MIME types (lowercase, parameters stripped) classified as code. */
  private static readonly CODE_MIME_TYPES: ReadonlySet<string> = new Set([
    'text/javascript',
    'application/javascript',
    'text/typescript',
    'text/x-python',
    'text/x-go',
    'text/x-java',
    'text/x-rust',
    'text/x-c',
    'text/x-c++',
    'application/json',
    'text/css',
    'text/xml',
    'application/xml',
  ])

  private readonly extractors: ExtractorConfig[]
  private readonly textExtractor: TextExtractor
  private readonly urlExtractor: UrlExtractor
  private readonly pdfExtractor: PdfExtractor
  private readonly markdownExtractor: MarkdownExtractor
  private readonly codeExtractor: CodeExtractor

  constructor() {
    this.textExtractor = new TextExtractor()
    this.urlExtractor = new UrlExtractor()
    this.pdfExtractor = new PdfExtractor()
    this.markdownExtractor = new MarkdownExtractor()
    this.codeExtractor = new CodeExtractor()

    // Higher priority is checked first; text is the lowest-priority entry
    const registered: ExtractorConfig[] = [
      { extractor: this.urlExtractor, priority: 100, contentType: 'url' },
      { extractor: this.pdfExtractor, priority: 90, contentType: 'pdf' },
      { extractor: this.codeExtractor, priority: 80, contentType: 'code' },
      { extractor: this.markdownExtractor, priority: 70, contentType: 'markdown' },
      { extractor: this.textExtractor, priority: 10, contentType: 'text' },
    ]
    this.extractors = registered.sort((a, b) => b.priority - a.priority)
  }

  /**
   * Extract content from a document, routing to the appropriate extractor.
   *
   * Uses `document.contentType` when set, otherwise auto-detects the type from
   * the content. The result's metadata is extended with `documentId`,
   * `originalContentType` and `detectedContentType`.
   *
   * @throws Error wrapping the extractor's failure message together with the
   *   document id and the content type that was used.
   */
  async extract(document: Document): Promise<ExtractionResult> {
    const contentType = document.contentType || this.detectContentType(document.content)
    const extractor = this.getExtractor(contentType)

    const options: Record<string, unknown> = {
      metadata: document.metadata,
      fileName: document.fileName,
      language: document.language,
    }

    let result: ExtractionResult
    try {
      result = await extractor.extract(document.content, options)
    } catch (error) {
      const message = error instanceof Error ? error.message : 'Unknown extraction error'
      throw new Error(`Extraction failed for document ${document.id} (type: ${contentType}): ${message}`)
    }

    return {
      ...result,
      metadata: {
        ...result.metadata,
        documentId: document.id,
        originalContentType: document.contentType,
        detectedContentType: contentType,
      },
    }
  }

  /**
   * Auto-detect the content type by asking each extractor, in priority order,
   * whether it can handle the content.
   *
   * @returns The first matching type, or 'unknown' for empty/non-string
   *   content or when no extractor accepts it.
   */
  detectContentType(content: string): ContentType {
    if (!content || typeof content !== 'string') {
      return 'unknown'
    }

    const match = this.extractors.find((config) => config.extractor.canHandle(content))
    return match ? match.contentType : 'unknown'
  }

  /**
   * Get the extractor for a content type; 'text', 'unknown' and anything
   * unrecognized use the text extractor.
   */
  private getExtractor(contentType: ContentType): ExtractorInterface {
    switch (contentType) {
      case 'url':
        return this.urlExtractor
      case 'pdf':
        return this.pdfExtractor
      case 'code':
        return this.codeExtractor
      case 'markdown':
        return this.markdownExtractor
      case 'text':
      case 'unknown':
      default:
        return this.textExtractor
    }
  }

  /**
   * Detect content type from a file name's extension (case-insensitive).
   *
   * @returns 'unknown' when the name has no dot, ends with a dot, or has an
   *   extension that is not in one of the known lists.
   */
  detectFromFileName(fileName: string): ContentType {
    const lowered = fileName.toLowerCase()
    const dotIndex = lowered.lastIndexOf('.')

    // A name without any dot has no extension; without this guard a file
    // literally named "c" or "json" would be classified by its whole name.
    if (dotIndex === -1) return 'unknown'

    const ext = lowered.slice(dotIndex + 1)
    if (!ext) return 'unknown'

    if (ext === 'pdf') return 'pdf'
    if (ExtractionService.MARKDOWN_EXTENSIONS.has(ext)) return 'markdown'
    if (ExtractionService.CODE_EXTENSIONS.has(ext)) return 'code'
    if (ExtractionService.TEXT_EXTENSIONS.has(ext)) return 'text'

    return 'unknown'
  }

  /**
   * Detect content type from a MIME type. Parameters after ';' (such as a
   * charset) are ignored and matching is case-insensitive.
   */
  detectFromMimeType(mimeType: string): ContentType {
    const normalized = mimeType.toLowerCase().split(';')[0]?.trim() ?? ''

    if (normalized === 'application/pdf') return 'pdf'

    if (normalized === 'text/markdown' || normalized === 'text/x-markdown') {
      return 'markdown'
    }

    // HTML is handled as URL content
    if (normalized === 'text/html') return 'url'

    if (ExtractionService.CODE_MIME_TYPES.has(normalized)) return 'code'

    if (normalized === 'text/plain') return 'text'

    return 'unknown'
  }

  /**
   * Run every extractor that accepts the content and report which result
   * scored best. Useful for ambiguous content.
   *
   * Extractors that throw are skipped. On equal scores the higher-priority
   * extractor wins. `bestType` is 'unknown' only when no extractor produced a
   * result.
   */
  async extractWithAllExtractors(
    content: string,
    options?: Record<string, unknown>
  ): Promise<{ results: Map<ContentType, ExtractionResult>; bestType: ContentType }> {
    const results = new Map<ContentType, ExtractionResult>()
    let bestType: ContentType = 'unknown'
    // Scores can be zero or negative (short content is penalized), so start
    // below every possible score; otherwise such results could never win.
    let bestScore = Number.NEGATIVE_INFINITY

    for (const config of this.extractors) {
      if (!config.extractor.canHandle(content)) continue

      try {
        const result = await config.extractor.extract(content, options)
        results.set(config.contentType, result)

        const score = this.scoreExtractionResult(result)
        if (score > bestScore) {
          bestScore = score
          bestType = config.contentType
        }
      } catch {
        // Best effort: an extractor that fails is simply left out of the results
      }
    }

    return { results, bestType }
  }

  /**
   * Score an extraction result by metadata richness: title +10,
   * description +5, author +3, non-empty tags +2, non-empty content +1, and
   * -5 when the content is shorter than 50 characters.
   */
  private scoreExtractionResult(result: ExtractionResult): number {
    let score = 0

    if (result.metadata.title) score += 10
    if (result.metadata.description) score += 5
    if (result.metadata.author) score += 3
    if (result.metadata.tags && result.metadata.tags.length > 0) score += 2
    if (result.content.length > 0) score += 1

    if (result.content.length < 50) score -= 5

    return score
  }

  /**
   * Get the content types that have a registered extractor, in priority order.
   */
  getSupportedContentTypes(): ContentType[] {
    return this.extractors.map((e) => e.contentType)
  }

  /**
   * Check whether a content type has a registered extractor.
   */
  isContentTypeSupported(contentType: ContentType): boolean {
    return this.extractors.some((e) => e.contentType === contentType)
  }

  /**
   * Get the extractor instances for direct access.
   */
  getExtractors() {
    return {
      text: this.textExtractor,
      url: this.urlExtractor,
      pdf: this.pdfExtractor,
      markdown: this.markdownExtractor,
      code: this.codeExtractor,
    }
  }
}