@twelvehart/supermemory-runtime 1.0.0-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. package/.env.example +57 -0
  2. package/README.md +374 -0
  3. package/dist/index.js +189 -0
  4. package/dist/mcp/index.js +1132 -0
  5. package/docker-compose.prod.yml +91 -0
  6. package/docker-compose.yml +358 -0
  7. package/drizzle/0000_dapper_the_professor.sql +159 -0
  8. package/drizzle/0001_api_keys.sql +51 -0
  9. package/drizzle/meta/0000_snapshot.json +1532 -0
  10. package/drizzle/meta/_journal.json +13 -0
  11. package/drizzle.config.ts +20 -0
  12. package/package.json +114 -0
  13. package/scripts/add-extraction-job.ts +122 -0
  14. package/scripts/benchmark-pgvector.ts +122 -0
  15. package/scripts/bootstrap.sh +209 -0
  16. package/scripts/check-runtime-pack.ts +111 -0
  17. package/scripts/claude-mcp-config.ts +336 -0
  18. package/scripts/docker-entrypoint.sh +183 -0
  19. package/scripts/doctor.ts +377 -0
  20. package/scripts/init-db.sql +33 -0
  21. package/scripts/install.sh +1110 -0
  22. package/scripts/mcp-setup.ts +271 -0
  23. package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
  24. package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
  25. package/scripts/migrations/003_create_hnsw_index.sql +94 -0
  26. package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
  27. package/scripts/migrations/005_create_chunks_table.sql +95 -0
  28. package/scripts/migrations/006_create_processing_queue.sql +45 -0
  29. package/scripts/migrations/generate_test_data.sql +42 -0
  30. package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
  31. package/scripts/migrations/run_migrations.sh +286 -0
  32. package/scripts/migrations/test_hnsw_index.sql +255 -0
  33. package/scripts/pre-commit-secrets +282 -0
  34. package/scripts/run-extraction-worker.ts +46 -0
  35. package/scripts/run-phase1-tests.sh +291 -0
  36. package/scripts/setup.ts +222 -0
  37. package/scripts/smoke-install.sh +12 -0
  38. package/scripts/test-health-endpoint.sh +328 -0
  39. package/src/api/index.ts +2 -0
  40. package/src/api/middleware/auth.ts +80 -0
  41. package/src/api/middleware/csrf.ts +308 -0
  42. package/src/api/middleware/errorHandler.ts +166 -0
  43. package/src/api/middleware/rateLimit.ts +360 -0
  44. package/src/api/middleware/validation.ts +514 -0
  45. package/src/api/routes/documents.ts +286 -0
  46. package/src/api/routes/profiles.ts +237 -0
  47. package/src/api/routes/search.ts +71 -0
  48. package/src/api/stores/index.ts +58 -0
  49. package/src/config/bootstrap-env.ts +3 -0
  50. package/src/config/env.ts +71 -0
  51. package/src/config/feature-flags.ts +25 -0
  52. package/src/config/index.ts +140 -0
  53. package/src/config/secrets.config.ts +291 -0
  54. package/src/db/client.ts +92 -0
  55. package/src/db/index.ts +73 -0
  56. package/src/db/postgres.ts +72 -0
  57. package/src/db/schema/chunks.schema.ts +31 -0
  58. package/src/db/schema/containers.schema.ts +46 -0
  59. package/src/db/schema/documents.schema.ts +49 -0
  60. package/src/db/schema/embeddings.schema.ts +32 -0
  61. package/src/db/schema/index.ts +11 -0
  62. package/src/db/schema/memories.schema.ts +72 -0
  63. package/src/db/schema/profiles.schema.ts +34 -0
  64. package/src/db/schema/queue.schema.ts +59 -0
  65. package/src/db/schema/relationships.schema.ts +42 -0
  66. package/src/db/schema.ts +223 -0
  67. package/src/db/worker-connection.ts +47 -0
  68. package/src/index.ts +235 -0
  69. package/src/mcp/CLAUDE.md +1 -0
  70. package/src/mcp/index.ts +1380 -0
  71. package/src/mcp/legacyState.ts +22 -0
  72. package/src/mcp/rateLimit.ts +358 -0
  73. package/src/mcp/resources.ts +309 -0
  74. package/src/mcp/results.ts +104 -0
  75. package/src/mcp/tools.ts +401 -0
  76. package/src/queues/config.ts +119 -0
  77. package/src/queues/index.ts +289 -0
  78. package/src/sdk/client.ts +225 -0
  79. package/src/sdk/errors.ts +266 -0
  80. package/src/sdk/http.ts +560 -0
  81. package/src/sdk/index.ts +244 -0
  82. package/src/sdk/resources/base.ts +65 -0
  83. package/src/sdk/resources/connections.ts +204 -0
  84. package/src/sdk/resources/documents.ts +163 -0
  85. package/src/sdk/resources/index.ts +10 -0
  86. package/src/sdk/resources/memories.ts +150 -0
  87. package/src/sdk/resources/search.ts +60 -0
  88. package/src/sdk/resources/settings.ts +36 -0
  89. package/src/sdk/types.ts +674 -0
  90. package/src/services/chunking/index.ts +451 -0
  91. package/src/services/chunking.service.ts +650 -0
  92. package/src/services/csrf.service.ts +252 -0
  93. package/src/services/documents.repository.ts +219 -0
  94. package/src/services/documents.service.ts +191 -0
  95. package/src/services/embedding.service.ts +404 -0
  96. package/src/services/extraction.service.ts +300 -0
  97. package/src/services/extractors/code.extractor.ts +451 -0
  98. package/src/services/extractors/index.ts +9 -0
  99. package/src/services/extractors/markdown.extractor.ts +461 -0
  100. package/src/services/extractors/pdf.extractor.ts +315 -0
  101. package/src/services/extractors/text.extractor.ts +118 -0
  102. package/src/services/extractors/url.extractor.ts +243 -0
  103. package/src/services/index.ts +235 -0
  104. package/src/services/ingestion.service.ts +177 -0
  105. package/src/services/llm/anthropic.ts +400 -0
  106. package/src/services/llm/base.ts +460 -0
  107. package/src/services/llm/contradiction-detector.service.ts +526 -0
  108. package/src/services/llm/heuristics.ts +148 -0
  109. package/src/services/llm/index.ts +309 -0
  110. package/src/services/llm/memory-classifier.service.ts +383 -0
  111. package/src/services/llm/memory-extension-detector.service.ts +523 -0
  112. package/src/services/llm/mock.ts +470 -0
  113. package/src/services/llm/openai.ts +398 -0
  114. package/src/services/llm/prompts.ts +438 -0
  115. package/src/services/llm/types.ts +373 -0
  116. package/src/services/memory.repository.ts +1769 -0
  117. package/src/services/memory.service.ts +1338 -0
  118. package/src/services/memory.types.ts +234 -0
  119. package/src/services/persistence/index.ts +295 -0
  120. package/src/services/pipeline.service.ts +509 -0
  121. package/src/services/profile.repository.ts +436 -0
  122. package/src/services/profile.service.ts +560 -0
  123. package/src/services/profile.types.ts +270 -0
  124. package/src/services/relationships/detector.ts +1128 -0
  125. package/src/services/relationships/index.ts +268 -0
  126. package/src/services/relationships/memory-integration.ts +459 -0
  127. package/src/services/relationships/strategies.ts +132 -0
  128. package/src/services/relationships/types.ts +370 -0
  129. package/src/services/search.service.ts +761 -0
  130. package/src/services/search.types.ts +220 -0
  131. package/src/services/secrets.service.ts +384 -0
  132. package/src/services/vectorstore/base.ts +327 -0
  133. package/src/services/vectorstore/index.ts +444 -0
  134. package/src/services/vectorstore/memory.ts +286 -0
  135. package/src/services/vectorstore/migration.ts +295 -0
  136. package/src/services/vectorstore/mock.ts +403 -0
  137. package/src/services/vectorstore/pgvector.ts +695 -0
  138. package/src/services/vectorstore/types.ts +247 -0
  139. package/src/startup.ts +389 -0
  140. package/src/types/api.types.ts +193 -0
  141. package/src/types/document.types.ts +103 -0
  142. package/src/types/index.ts +241 -0
  143. package/src/types/profile.base.ts +133 -0
  144. package/src/utils/errors.ts +447 -0
  145. package/src/utils/id.ts +15 -0
  146. package/src/utils/index.ts +101 -0
  147. package/src/utils/logger.ts +313 -0
  148. package/src/utils/sanitization.ts +501 -0
  149. package/src/utils/secret-validation.ts +273 -0
  150. package/src/utils/synonyms.ts +188 -0
  151. package/src/utils/validation.ts +581 -0
  152. package/src/workers/chunking.worker.ts +242 -0
  153. package/src/workers/embedding.worker.ts +358 -0
  154. package/src/workers/extraction.worker.ts +346 -0
  155. package/src/workers/indexing.worker.ts +505 -0
  156. package/tsconfig.json +38 -0
@@ -0,0 +1,346 @@
1
+ /**
2
+ * Extraction Worker - Processes documents from queue and extracts content
3
+ *
4
+ * Flow:
5
+ * 1. Job Received (0%)
6
+ * 2. Fetch Document from database
7
+ * 3. Detect Content Type (text/url/file)
8
+ * 4. Call Appropriate Extractor (50%)
9
+ * 5. Save Extracted Content to database
10
+ * 6. Chain to Chunking Queue (90%)
11
+ * 7. Mark Job Complete (100%)
12
+ *
13
+ * Error Handling:
14
+ * - Retry with exponential backoff (max 3 attempts)
15
+ * - Move to dead letter queue after max retries
16
+ * - Update processing_queue table status
17
+ */
18
+
19
+ import { Worker, Job, Queue } from 'bullmq'
20
+ import type { ConnectionOptions } from 'bullmq'
21
+ import {
22
+ TextExtractor,
23
+ UrlExtractor,
24
+ PdfExtractor,
25
+ MarkdownExtractor,
26
+ CodeExtractor,
27
+ } from '../services/extractors/index.js'
28
+ import type { ContentType, ExtractionResult } from '../types/document.types.js'
29
+ import { documents, processingQueue } from '../db/schema/index.js'
30
+ import { eq } from 'drizzle-orm'
31
+ import { workerDb as db } from '../db/worker-connection.js'
32
+ import { getLogger } from '../utils/logger.js'
33
+ import { NotFoundError, ErrorCode } from '../utils/errors.js'
34
+
35
/** Logger for this module, obtained under the name 'ExtractionWorker'. */
const logger = getLogger('ExtractionWorker')

/**
 * Handle to the downstream `chunking` queue.
 *
 * Kept at module scope and created at most once so that chaining a job does
 * not open a fresh queue connection every time. Stays `null` until either the
 * worker factory or the job processor needs it.
 */
let sharedChunkingQueue: Queue | null = null
39
+
40
/**
 * Payload carried by each job on the `extraction` queue.
 */
export interface ExtractionJobData {
  /** Id of the `documents` row that is loaded and then overwritten with the extracted content. */
  documentId: string
  /** When set, this value is type-detected and extracted instead of the stored document content. */
  sourceUrl?: string
  /** Origin hint consulted by content-type detection before it falls back to sniffing the content. */
  sourceType?: 'text' | 'url' | 'file'
  /** Path whose extension picks the content type when `sourceType` is `'file'`; also passed to the extractor options. */
  filePath?: string
  /** Forwarded unchanged to the follow-up chunking job. */
  containerTag: string
}
48
+
49
/**
 * Value a successfully processed extraction job resolves with.
 */
export interface ExtractionJobResult {
  /** Id of the document that was processed. */
  documentId: string
  /** Content string produced by the extractor. */
  extractedContent: string
  /** Content type that was detected and used to choose the extractor. */
  contentType: ContentType
  /** Metadata reported by the extractor (not merged with the document's stored metadata). */
  metadata: Record<string, unknown>
  /** Wall-clock milliseconds between entering the processor and building this result. */
  processingTimeMs: number
}
57
+
58
// One instance of each extractor, built when the module loads and reused by
// every job instead of being constructed per call.
const extractors = {
  text: new TextExtractor(),
  url: new UrlExtractor(),
  pdf: new PdfExtractor(),
  markdown: new MarkdownExtractor(),
  code: new CodeExtractor(),
}
66
+
67
/** MIME type persisted on the document row for each detected content type. */
const MIME_TYPE_BY_CONTENT_TYPE: Record<ContentType, string> = {
  text: 'text/plain',
  url: 'text/html',
  pdf: 'application/pdf',
  markdown: 'text/markdown',
  code: 'text/plain',
  unknown: 'application/octet-stream',
}

/**
 * Translate a detected content type into the MIME string stored in the database.
 *
 * @param contentType - Content type produced by detection.
 * @returns The mapped MIME type, or `'text/plain'` when the lookup yields nothing.
 */
function contentTypeToMimeType(contentType: ContentType): string {
  const mapped = MIME_TYPE_BY_CONTENT_TYPE[contentType]
  return mapped ? mapped : 'text/plain'
}
81
+
82
/**
 * Decide which content type an input should be treated as.
 *
 * Checks run in this order and the first match wins:
 *  1. `sourceType` is `'url'` and the URL extractor accepts `content`.
 *  2. `sourceType` is `'file'` with a `filePath`: the lower-cased text after
 *     the last dot is matched against known PDF, markdown and code extensions.
 *  3. The URL extractor accepts `content`.
 *  4. `content` contains a code fence, starts with a markdown heading, or
 *     contains the literal sequence `[](`.
 *  5. `content` contains one of a handful of code keywords.
 *  6. Otherwise plain text.
 *
 * @param content - String to inspect (a URL or the document body).
 * @param sourceType - Optional origin hint (`'url'` and `'file'` are recognised).
 * @param filePath - Optional path used for extension-based detection.
 * @returns The detected content type; never `'unknown'`.
 */
function detectContentType(content: string, sourceType?: string, filePath?: string): ContentType {
  // A declared URL source wins outright when the URL extractor accepts the value.
  if (sourceType === 'url' && extractors.url.canHandle(content)) {
    return 'url'
  }

  // Declared files are classified by extension; unrecognised extensions fall
  // through to the content-based checks below.
  if (sourceType === 'file' && filePath) {
    const loweredPath = filePath.toLowerCase()
    // With no dot in the path, lastIndexOf gives -1 and the whole path is used.
    const extension = loweredPath.slice(loweredPath.lastIndexOf('.') + 1)

    switch (extension) {
      case 'pdf':
        return 'pdf'
      case 'md':
      case 'markdown':
        return 'markdown'
      case 'js':
      case 'ts':
      case 'jsx':
      case 'tsx':
      case 'py':
      case 'java':
      case 'c':
      case 'cpp':
      case 'go':
      case 'rs':
        return 'code'
      default:
        break
    }
  }

  // Content sniffing: URL first.
  if (extractors.url.canHandle(content)) {
    return 'url'
  }

  // Markdown markers. The heading pattern is anchored to the very start of the content.
  const hasMarkdownMarker =
    content.includes('```') || /^#{1,6}\s/.test(content) || content.includes('[](')
  if (hasMarkdownMarker) {
    return 'markdown'
  }

  // Source-code keywords, tried in order until one is found.
  const codeMarkers = ['function ', 'class ', 'import ', 'const ', 'def ', 'public class ']
  if (codeMarkers.some((marker) => content.includes(marker))) {
    return 'code'
  }

  // Nothing matched.
  return 'text'
}
126
+
127
/**
 * Run the extractor that corresponds to `contentType`.
 *
 * @param content - Raw input handed to the extractor.
 * @param contentType - Detected type; anything other than `'url'`, `'pdf'`,
 *   `'markdown'` or `'code'` is handled by the text extractor.
 * @param options - Optional bag passed through to the extractor untouched.
 * @returns Whatever the chosen extractor's `extract` call resolves with.
 */
async function extractContent(
  content: string,
  contentType: ContentType,
  options?: Record<string, unknown>
): Promise<ExtractionResult> {
  if (contentType === 'url') {
    return extractors.url.extract(content, options)
  }
  if (contentType === 'pdf') {
    return extractors.pdf.extract(content, options)
  }
  if (contentType === 'markdown') {
    return extractors.markdown.extract(content, options)
  }
  if (contentType === 'code') {
    return extractors.code.extract(content, options)
  }

  // 'text' and every remaining value share the plain-text extractor.
  return extractors.text.extract(content, options)
}
149
+
150
/** Attempt budget assumed when a job carries no explicit `attempts` option. */
const DEFAULT_MAX_EXTRACTION_ATTEMPTS = 3

/**
 * Persist the outcome of a failed extraction attempt.
 *
 * Writes the error and attempt count to the `processingQueue` rows for the
 * document and marks the document itself as failed. Each write is attempted
 * independently, and a failing write is reported on stderr instead of being
 * thrown, so the caller can always rethrow the error that actually broke the
 * job.
 *
 * @param job - The job whose attempt failed.
 * @param error - The value that was thrown while processing the job.
 */
async function recordExtractionFailure(job: Job<ExtractionJobData>, error: unknown): Promise<void> {
  const { documentId } = job.data
  const errorMessage = error instanceof Error ? error.message : 'Unknown error'
  const attemptNumber = job.attemptsMade + 1
  // Honour the job's own retry budget; fall back to the historical value of 3
  // when the job was enqueued without an `attempts` option.
  const maxAttempts = job.opts.attempts ?? DEFAULT_MAX_EXTRACTION_ATTEMPTS

  const bookkeeping: Array<{ target: string; run: () => Promise<void> }> = [
    {
      target: 'processing queue entry',
      run: async () => {
        await db
          .update(processingQueue)
          .set({
            status: attemptNumber >= maxAttempts ? 'failed' : 'retry',
            error: errorMessage,
            errorCode: 'EXTRACTION_FAILED',
            attempts: attemptNumber,
          })
          .where(eq(processingQueue.documentId, documentId))
      },
    },
    {
      target: 'document status',
      run: async () => {
        await db
          .update(documents)
          .set({
            status: 'failed',
            updatedAt: new Date(),
          })
          .where(eq(documents.id, documentId))
      },
    },
  ]

  for (const step of bookkeeping) {
    try {
      await step.run()
    } catch (bookkeepingError) {
      // Never let a bookkeeping failure replace the original extraction error.
      const reason = bookkeepingError instanceof Error ? bookkeepingError.message : String(bookkeepingError)
      console.error(
        `[ExtractionWorker] Could not update ${step.target} after failed extraction of document ${documentId}: ${reason}`
      )
    }
  }
}

/**
 * Job processor for the `extraction` queue.
 *
 * Steps, with the progress value reported after each:
 *  - mark the document's processing-queue rows as `processing` (0)
 *  - load the document and detect its content type (25)
 *  - run the matching extractor (50)
 *  - write the extracted content, MIME type and merged metadata back to the document (75)
 *  - enqueue a `chunk` job on the chunking queue (90)
 *  - mark the processing-queue rows as `completed` (100)
 *
 * On any failure the attempt is recorded (queue row set to `retry` or
 * `failed`, document set to `failed`) and the original error is rethrown so
 * the queue can apply its retry policy.
 *
 * @param job - Extraction job; `job.data` supplies the document id and source hints.
 * @returns The extracted content, detected type, extractor metadata and elapsed time.
 * @throws NotFoundError when no document row exists for `job.data.documentId`.
 * @throws Any error raised by the extractor, the database, or the chunking queue.
 */
export async function processExtractionJob(job: Job<ExtractionJobData>): Promise<ExtractionJobResult> {
  const startTime = Date.now()
  const { documentId, sourceUrl, sourceType, filePath, containerTag } = job.data

  try {
    // 0% - job received
    await job.updateProgress(0)
    await db
      .update(processingQueue)
      .set({
        status: 'processing',
        startedAt: new Date(),
        workerId: job.id,
      })
      .where(eq(processingQueue.documentId, documentId))

    // Load the document this job refers to.
    const [doc] = await db.select().from(documents).where(eq(documents.id, documentId)).limit(1)

    if (!doc) {
      throw new NotFoundError('Document', documentId, ErrorCode.DOCUMENT_NOT_FOUND)
    }

    // A source URL, when supplied, takes precedence over the stored content
    // for both type detection and extraction.
    const rawInput = sourceUrl || doc.content
    const contentType = detectContentType(rawInput, sourceType, filePath)

    // 25% - content type detected
    await job.updateProgress(25)

    const extractionOptions = {
      metadata: doc.metadata || {},
      sourceUrl,
      filePath,
    }

    const extractionResult = await extractContent(rawInput, contentType, extractionOptions)

    // 50% - content extracted
    await job.updateProgress(50)

    // Persist the result; extractor metadata overrides stored keys of the same name.
    await db
      .update(documents)
      .set({
        content: extractionResult.content,
        contentType: contentTypeToMimeType(contentType),
        metadata: Object.assign({}, doc.metadata || {}, extractionResult.metadata),
        status: 'processing',
        updatedAt: new Date(),
      })
      .where(eq(documents.id, documentId))

    // 75% - saved to database
    await job.updateProgress(75)

    // Chain to the chunking queue through the shared instance (prevents a connection leak).
    if (!sharedChunkingQueue) {
      // Lazy initialization for direct processExtractionJob calls (e.g., in tests)
      const connection = {
        host: process.env.REDIS_HOST || 'localhost',
        port: parseInt(process.env.REDIS_PORT || '6379', 10),
      }
      sharedChunkingQueue = new Queue('chunking', { connection })
    }

    await sharedChunkingQueue.add(
      'chunk',
      {
        documentId,
        content: extractionResult.content,
        contentType,
        containerTag,
      },
      {
        priority: job.opts.priority || 0,
        removeOnComplete: true,
        removeOnFail: false,
      }
    )

    // 90% - chained to chunking
    await job.updateProgress(90)

    await db
      .update(processingQueue)
      .set({
        status: 'completed',
        completedAt: new Date(),
      })
      .where(eq(processingQueue.documentId, documentId))

    // 100% - complete
    await job.updateProgress(100)

    const processingTimeMs = Date.now() - startTime

    return {
      documentId,
      extractedContent: extractionResult.content,
      contentType,
      metadata: extractionResult.metadata,
      processingTimeMs,
    }
  } catch (error) {
    // Record the failure without letting bookkeeping problems mask the real
    // error, then hand the original error back to the queue.
    await recordExtractionFailure(job, error)
    throw error
  }
}
285
+
286
/**
 * Build the BullMQ worker that consumes the `extraction` queue.
 *
 * Also makes sure the shared chunking queue exists, creating it on the given
 * connection if nothing has created it yet. Concurrency comes from
 * `BULLMQ_CONCURRENCY_EXTRACTION` (default 5). The `completed`, `failed`,
 * `error` and `active` events are logged.
 *
 * @param connection - Redis connection options shared by the worker and the chunking queue.
 * @returns The configured worker.
 */
export function createExtractionWorker(connection: ConnectionOptions): Worker<ExtractionJobData, ExtractionJobResult> {
  // Create the shared chunking queue once so job chaining reuses one connection.
  if (sharedChunkingQueue === null) {
    sharedChunkingQueue = new Queue('chunking', { connection })
  }

  const concurrency = parseInt(process.env.BULLMQ_CONCURRENCY_EXTRACTION || '5', 10)

  const worker = new Worker<ExtractionJobData, ExtractionJobResult>('extraction', processExtractionJob, {
    connection,
    concurrency,
    removeOnComplete: { count: 100 },
    removeOnFail: { count: 500 },
    limiter: { max: 10, duration: 1000 },
  })

  // Lifecycle logging
  worker.on('completed', (job: Job<ExtractionJobData, ExtractionJobResult>) => {
    const { documentId } = job.data
    logger.info('Job completed', { jobId: job.id, documentId })
  })

  worker.on('failed', (job: Job<ExtractionJobData> | undefined, err: Error) => {
    // The job may be undefined here; log whatever is available.
    const details = job
      ? { jobId: job.id, documentId: job.data.documentId, error: err.message }
      : { error: err.message }
    logger.error('Job failed', details)
  })

  worker.on('error', (err: Error) => {
    logger.error('Worker error', { error: err.message })
  })

  worker.on('active', (job: Job<ExtractionJobData>) => {
    const { documentId } = job.data
    logger.info('Processing job', { jobId: job.id, documentId })
  })

  return worker
}
329
+
330
/**
 * Build the producer-side handle for the `extraction` queue.
 *
 * Jobs added through it default to three attempts with exponential backoff
 * starting at two seconds; completed jobs are removed and failed jobs are kept.
 *
 * @param connection - Redis connection options for the queue.
 * @returns A queue typed with the extraction job payload and result.
 */
export function createExtractionQueue(connection: ConnectionOptions): Queue<ExtractionJobData, ExtractionJobResult> {
  const defaultJobOptions = {
    attempts: 3,
    backoff: {
      type: 'exponential',
      delay: 2000, // base delay in ms; grows exponentially with each retry
    },
    removeOnComplete: true,
    removeOnFail: false,
  }

  return new Queue<ExtractionJobData, ExtractionJobResult>('extraction', { connection, defaultJobOptions })
}