@twelvehart/supermemory-runtime 1.0.0-next.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +57 -0
- package/README.md +374 -0
- package/dist/index.js +189 -0
- package/dist/mcp/index.js +1132 -0
- package/docker-compose.prod.yml +91 -0
- package/docker-compose.yml +358 -0
- package/drizzle/0000_dapper_the_professor.sql +159 -0
- package/drizzle/0001_api_keys.sql +51 -0
- package/drizzle/meta/0000_snapshot.json +1532 -0
- package/drizzle/meta/_journal.json +13 -0
- package/drizzle.config.ts +20 -0
- package/package.json +114 -0
- package/scripts/add-extraction-job.ts +122 -0
- package/scripts/benchmark-pgvector.ts +122 -0
- package/scripts/bootstrap.sh +209 -0
- package/scripts/check-runtime-pack.ts +111 -0
- package/scripts/claude-mcp-config.ts +336 -0
- package/scripts/docker-entrypoint.sh +183 -0
- package/scripts/doctor.ts +377 -0
- package/scripts/init-db.sql +33 -0
- package/scripts/install.sh +1110 -0
- package/scripts/mcp-setup.ts +271 -0
- package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
- package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
- package/scripts/migrations/003_create_hnsw_index.sql +94 -0
- package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
- package/scripts/migrations/005_create_chunks_table.sql +95 -0
- package/scripts/migrations/006_create_processing_queue.sql +45 -0
- package/scripts/migrations/generate_test_data.sql +42 -0
- package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
- package/scripts/migrations/run_migrations.sh +286 -0
- package/scripts/migrations/test_hnsw_index.sql +255 -0
- package/scripts/pre-commit-secrets +282 -0
- package/scripts/run-extraction-worker.ts +46 -0
- package/scripts/run-phase1-tests.sh +291 -0
- package/scripts/setup.ts +222 -0
- package/scripts/smoke-install.sh +12 -0
- package/scripts/test-health-endpoint.sh +328 -0
- package/src/api/index.ts +2 -0
- package/src/api/middleware/auth.ts +80 -0
- package/src/api/middleware/csrf.ts +308 -0
- package/src/api/middleware/errorHandler.ts +166 -0
- package/src/api/middleware/rateLimit.ts +360 -0
- package/src/api/middleware/validation.ts +514 -0
- package/src/api/routes/documents.ts +286 -0
- package/src/api/routes/profiles.ts +237 -0
- package/src/api/routes/search.ts +71 -0
- package/src/api/stores/index.ts +58 -0
- package/src/config/bootstrap-env.ts +3 -0
- package/src/config/env.ts +71 -0
- package/src/config/feature-flags.ts +25 -0
- package/src/config/index.ts +140 -0
- package/src/config/secrets.config.ts +291 -0
- package/src/db/client.ts +92 -0
- package/src/db/index.ts +73 -0
- package/src/db/postgres.ts +72 -0
- package/src/db/schema/chunks.schema.ts +31 -0
- package/src/db/schema/containers.schema.ts +46 -0
- package/src/db/schema/documents.schema.ts +49 -0
- package/src/db/schema/embeddings.schema.ts +32 -0
- package/src/db/schema/index.ts +11 -0
- package/src/db/schema/memories.schema.ts +72 -0
- package/src/db/schema/profiles.schema.ts +34 -0
- package/src/db/schema/queue.schema.ts +59 -0
- package/src/db/schema/relationships.schema.ts +42 -0
- package/src/db/schema.ts +223 -0
- package/src/db/worker-connection.ts +47 -0
- package/src/index.ts +235 -0
- package/src/mcp/CLAUDE.md +1 -0
- package/src/mcp/index.ts +1380 -0
- package/src/mcp/legacyState.ts +22 -0
- package/src/mcp/rateLimit.ts +358 -0
- package/src/mcp/resources.ts +309 -0
- package/src/mcp/results.ts +104 -0
- package/src/mcp/tools.ts +401 -0
- package/src/queues/config.ts +119 -0
- package/src/queues/index.ts +289 -0
- package/src/sdk/client.ts +225 -0
- package/src/sdk/errors.ts +266 -0
- package/src/sdk/http.ts +560 -0
- package/src/sdk/index.ts +244 -0
- package/src/sdk/resources/base.ts +65 -0
- package/src/sdk/resources/connections.ts +204 -0
- package/src/sdk/resources/documents.ts +163 -0
- package/src/sdk/resources/index.ts +10 -0
- package/src/sdk/resources/memories.ts +150 -0
- package/src/sdk/resources/search.ts +60 -0
- package/src/sdk/resources/settings.ts +36 -0
- package/src/sdk/types.ts +674 -0
- package/src/services/chunking/index.ts +451 -0
- package/src/services/chunking.service.ts +650 -0
- package/src/services/csrf.service.ts +252 -0
- package/src/services/documents.repository.ts +219 -0
- package/src/services/documents.service.ts +191 -0
- package/src/services/embedding.service.ts +404 -0
- package/src/services/extraction.service.ts +300 -0
- package/src/services/extractors/code.extractor.ts +451 -0
- package/src/services/extractors/index.ts +9 -0
- package/src/services/extractors/markdown.extractor.ts +461 -0
- package/src/services/extractors/pdf.extractor.ts +315 -0
- package/src/services/extractors/text.extractor.ts +118 -0
- package/src/services/extractors/url.extractor.ts +243 -0
- package/src/services/index.ts +235 -0
- package/src/services/ingestion.service.ts +177 -0
- package/src/services/llm/anthropic.ts +400 -0
- package/src/services/llm/base.ts +460 -0
- package/src/services/llm/contradiction-detector.service.ts +526 -0
- package/src/services/llm/heuristics.ts +148 -0
- package/src/services/llm/index.ts +309 -0
- package/src/services/llm/memory-classifier.service.ts +383 -0
- package/src/services/llm/memory-extension-detector.service.ts +523 -0
- package/src/services/llm/mock.ts +470 -0
- package/src/services/llm/openai.ts +398 -0
- package/src/services/llm/prompts.ts +438 -0
- package/src/services/llm/types.ts +373 -0
- package/src/services/memory.repository.ts +1769 -0
- package/src/services/memory.service.ts +1338 -0
- package/src/services/memory.types.ts +234 -0
- package/src/services/persistence/index.ts +295 -0
- package/src/services/pipeline.service.ts +509 -0
- package/src/services/profile.repository.ts +436 -0
- package/src/services/profile.service.ts +560 -0
- package/src/services/profile.types.ts +270 -0
- package/src/services/relationships/detector.ts +1128 -0
- package/src/services/relationships/index.ts +268 -0
- package/src/services/relationships/memory-integration.ts +459 -0
- package/src/services/relationships/strategies.ts +132 -0
- package/src/services/relationships/types.ts +370 -0
- package/src/services/search.service.ts +761 -0
- package/src/services/search.types.ts +220 -0
- package/src/services/secrets.service.ts +384 -0
- package/src/services/vectorstore/base.ts +327 -0
- package/src/services/vectorstore/index.ts +444 -0
- package/src/services/vectorstore/memory.ts +286 -0
- package/src/services/vectorstore/migration.ts +295 -0
- package/src/services/vectorstore/mock.ts +403 -0
- package/src/services/vectorstore/pgvector.ts +695 -0
- package/src/services/vectorstore/types.ts +247 -0
- package/src/startup.ts +389 -0
- package/src/types/api.types.ts +193 -0
- package/src/types/document.types.ts +103 -0
- package/src/types/index.ts +241 -0
- package/src/types/profile.base.ts +133 -0
- package/src/utils/errors.ts +447 -0
- package/src/utils/id.ts +15 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +313 -0
- package/src/utils/sanitization.ts +501 -0
- package/src/utils/secret-validation.ts +273 -0
- package/src/utils/synonyms.ts +188 -0
- package/src/utils/validation.ts +581 -0
- package/src/workers/chunking.worker.ts +242 -0
- package/src/workers/embedding.worker.ts +358 -0
- package/src/workers/extraction.worker.ts +346 -0
- package/src/workers/indexing.worker.ts +505 -0
- package/tsconfig.json +38 -0
|
@@ -0,0 +1,509 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document processing pipeline - orchestrates the full extraction workflow
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { v4 as uuidv4 } from 'uuid'
|
|
6
|
+
import { Document, DocumentStatus, Chunk, PipelineResult, ChunkingOptions } from '../types/document.types.js'
|
|
7
|
+
import { ExtractionService } from './extraction.service.js'
|
|
8
|
+
import { ChunkingService } from './chunking.service.js'
|
|
9
|
+
import { NotFoundError, ExtractionError, ErrorCode } from '../utils/errors.js'
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Simple mutex implementation for protecting queue operations
|
|
13
|
+
*/
|
|
14
|
+
class Mutex {
|
|
15
|
+
private locked = false
|
|
16
|
+
private waitQueue: Array<() => void> = []
|
|
17
|
+
|
|
18
|
+
async acquire(): Promise<void> {
|
|
19
|
+
if (!this.locked) {
|
|
20
|
+
this.locked = true
|
|
21
|
+
return
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
return new Promise<void>((resolve) => {
|
|
25
|
+
this.waitQueue.push(resolve)
|
|
26
|
+
})
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
release(): void {
|
|
30
|
+
if (this.waitQueue.length > 0) {
|
|
31
|
+
const next = this.waitQueue.shift()
|
|
32
|
+
next?.()
|
|
33
|
+
} else {
|
|
34
|
+
this.locked = false
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
async withLock<T>(fn: () => T | Promise<T>): Promise<T> {
|
|
39
|
+
await this.acquire()
|
|
40
|
+
try {
|
|
41
|
+
return await fn()
|
|
42
|
+
} finally {
|
|
43
|
+
this.release()
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
/**
|
|
49
|
+
* Thread-safe concurrent queue for document processing
|
|
50
|
+
*/
|
|
51
|
+
class ConcurrentQueue<T> {
|
|
52
|
+
private items: T[] = []
|
|
53
|
+
private mutex = new Mutex()
|
|
54
|
+
|
|
55
|
+
async enqueue(item: T): Promise<void> {
|
|
56
|
+
await this.mutex.withLock(() => {
|
|
57
|
+
this.items.push(item)
|
|
58
|
+
})
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
async enqueueBatch(items: T[]): Promise<void> {
|
|
62
|
+
await this.mutex.withLock(() => {
|
|
63
|
+
this.items.push(...items)
|
|
64
|
+
})
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
async dequeue(): Promise<T | undefined> {
|
|
68
|
+
return this.mutex.withLock(() => {
|
|
69
|
+
return this.items.shift()
|
|
70
|
+
})
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
async size(): Promise<number> {
|
|
74
|
+
return this.mutex.withLock(() => {
|
|
75
|
+
return this.items.length
|
|
76
|
+
})
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
async isEmpty(): Promise<boolean> {
|
|
80
|
+
return this.mutex.withLock(() => {
|
|
81
|
+
return this.items.length === 0
|
|
82
|
+
})
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/**
 * Configuration for PipelineService. Caller-supplied values are spread over
 * DEFAULT_CONFIG in the constructor, so every optional field falls back to
 * its documented default.
 */
interface PipelineConfig {
  /** Maximum retries per stage; a stage runs at most maxRetries + 1 attempts. */
  maxRetries: number
  /** Base retry delay in milliseconds; doubled on each attempt (exponential backoff in withRetry). */
  retryDelayMs: number
  /** Options forwarded to the chunking stage. */
  chunkingOptions?: ChunkingOptions
  /** Timeout for extraction stage in milliseconds (default: 30000) */
  extractionTimeoutMs?: number
  /** Timeout for chunking stage in milliseconds (default: 10000) */
  chunkingTimeoutMs?: number
  /** Timeout for embedding stage in milliseconds (default: 60000) */
  embeddingTimeoutMs?: number
  /** Timeout for indexing stage in milliseconds (default: 30000) */
  indexingTimeoutMs?: number
  /** Invoked after every document status transition. */
  onStatusChange?: (docId: string, status: DocumentStatus) => void
  /** Invoked when a document's processing ends in the 'error' status. */
  onError?: (docId: string, error: Error) => void
}
|
|
101
|
+
|
|
102
|
+
/**
 * Pluggable vector-embedding backend used by the optional embedding stage.
 */
interface EmbeddingProvider {
  /** Embeds a single text into a vector. */
  embed(text: string): Promise<number[]>
  /**
   * Embeds several texts in one call. The pipeline pairs embeddings[i] with
   * chunks[i] (see generateEmbeddings), so implementations must preserve
   * input order.
   */
  embedBatch(texts: string[]): Promise<number[][]>
}
|
|
106
|
+
|
|
107
|
+
/**
 * Pluggable search-index backend used by the optional indexing stage.
 */
interface IndexProvider {
  /** Persists the given chunks in the index. */
  index(chunks: Chunk[]): Promise<void>
  /** Removes indexed data for the given document id (used by deleteDocument). */
  remove(documentId: string): Promise<void>
}
|
|
111
|
+
|
|
112
|
+
/**
 * Fallback configuration. The PipelineService constructor spreads caller
 * overrides on top of these values; the timeouts here also serve as the
 * hard fallback in processDocument's `?? DEFAULT_CONFIG.x!` expressions.
 */
const DEFAULT_CONFIG: PipelineConfig = {
  maxRetries: 3,
  retryDelayMs: 1000, // 1s base delay; doubled per attempt by withRetry
  extractionTimeoutMs: 30000,
  chunkingTimeoutMs: 10000,
  embeddingTimeoutMs: 60000,
  indexingTimeoutMs: 30000,
}
|
|
120
|
+
|
|
121
|
+
/**
|
|
122
|
+
* Timeout error for pipeline stage cancellation
|
|
123
|
+
*/
|
|
124
|
+
class PipelineTimeoutError extends Error {
|
|
125
|
+
constructor(stage: string, timeoutMs: number) {
|
|
126
|
+
super(`Pipeline ${stage} stage timed out after ${timeoutMs}ms`)
|
|
127
|
+
this.name = 'PipelineTimeoutError'
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/**
|
|
132
|
+
* Wrap an operation with a timeout that properly cancels on timeout
|
|
133
|
+
*/
|
|
134
|
+
async function withTimeout<T>(operation: () => Promise<T>, timeoutMs: number, stageName: string): Promise<T> {
|
|
135
|
+
let timeoutId: NodeJS.Timeout | undefined
|
|
136
|
+
|
|
137
|
+
const timeoutPromise = new Promise<never>((_, reject) => {
|
|
138
|
+
timeoutId = setTimeout(() => {
|
|
139
|
+
reject(new PipelineTimeoutError(stageName, timeoutMs))
|
|
140
|
+
}, timeoutMs)
|
|
141
|
+
})
|
|
142
|
+
|
|
143
|
+
try {
|
|
144
|
+
const result = await Promise.race([operation(), timeoutPromise])
|
|
145
|
+
return result
|
|
146
|
+
} finally {
|
|
147
|
+
if (timeoutId) {
|
|
148
|
+
clearTimeout(timeoutId)
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
/**
 * Orchestrates the document processing pipeline:
 * queued -> extracting -> chunking -> [embedding] -> [indexing] -> done | error.
 *
 * Documents and their chunks are held in in-memory Maps, so all state is
 * lost on process exit. The embedding and indexing stages only run when the
 * corresponding provider has been registered via the setters below.
 */
export class PipelineService {
  private readonly extractionService: ExtractionService
  private readonly chunkingService: ChunkingService
  private readonly config: PipelineConfig

  // Document store (in-memory for now, could be replaced with database)
  private readonly documents: Map<string, Document> = new Map()
  private readonly chunks: Map<string, Chunk[]> = new Map()

  // Optional providers
  private embeddingProvider?: EmbeddingProvider
  private indexProvider?: IndexProvider

  constructor(config?: Partial<PipelineConfig>) {
    this.extractionService = new ExtractionService()
    this.chunkingService = new ChunkingService()
    // Caller overrides win over DEFAULT_CONFIG via spread order.
    this.config = { ...DEFAULT_CONFIG, ...config }
  }

  /**
   * Set embedding provider for generating vector embeddings.
   * Until one is set, the embedding stage is skipped entirely.
   */
  setEmbeddingProvider(provider: EmbeddingProvider): void {
    this.embeddingProvider = provider
  }

  /**
   * Set index provider for storing and searching chunks.
   * Until one is set, the indexing stage is skipped entirely.
   */
  setIndexProvider(provider: IndexProvider): void {
    this.indexProvider = provider
  }

  /**
   * Create a new document and add it to the queue.
   *
   * Note: despite the 'queued' status, this only records the document in the
   * in-memory map; processing starts when processDocument (or
   * processDocuments) is called with the returned id.
   *
   * @param content  Raw document content to be extracted and chunked later.
   * @param metadata Optional caller metadata; merged with extraction
   *                 metadata during processing.
   */
  async createDocument(content: string, metadata?: Document['metadata']): Promise<Document> {
    const now = new Date()
    const document: Document = {
      id: uuidv4(),
      content,
      status: 'queued',
      metadata: metadata || {},
      createdAt: now,
      updatedAt: now,
      retryCount: 0,
    }

    this.documents.set(document.id, document)
    return document
  }

  /**
   * Process a document through the full pipeline with configurable timeouts.
   *
   * Each stage is wrapped in withRetry and then withTimeout, so a stage's
   * timeout bounds ALL of its retry attempts plus their backoff delays, not
   * a single attempt. NOTE(review): withTimeout rejects on timeout but does
   * not cancel the underlying work, which may still complete in the
   * background.
   *
   * Pipeline failures do not reject: they are captured in the returned
   * PipelineResult with status 'error' (and reported via config.onError).
   *
   * @throws NotFoundError when docId is not in the in-memory store.
   */
  async processDocument(docId: string): Promise<PipelineResult> {
    const startTime = Date.now()
    const document = this.documents.get(docId)

    if (!document) {
      throw new NotFoundError('Document', docId, ErrorCode.DOCUMENT_NOT_FOUND)
    }

    try {
      // Stage 1: Extracting (with timeout)
      await this.updateStatus(docId, 'extracting')
      const extractionResult = await withTimeout(
        () => this.withRetry(() => this.extractionService.extract(document), 'extraction'),
        this.config.extractionTimeoutMs ?? DEFAULT_CONFIG.extractionTimeoutMs!,
        'extraction'
      )

      // Update document with extraction results
      document.contentType = extractionResult.contentType
      document.metadata = {
        ...document.metadata,
        // Extraction metadata wins on key collisions.
        ...extractionResult.metadata,
      }

      // Stage 2: Chunking (with timeout)
      await this.updateStatus(docId, 'chunking')
      const chunks = await withTimeout(
        () =>
          this.withRetry(
            () =>
              // chunk() is synchronous here; wrapped so withRetry sees a promise.
              Promise.resolve(
                this.chunkingService.chunk(
                  docId,
                  extractionResult.content,
                  extractionResult.contentType,
                  this.config.chunkingOptions
                )
              ),
            'chunking'
          ),
        this.config.chunkingTimeoutMs ?? DEFAULT_CONFIG.chunkingTimeoutMs!,
        'chunking'
      )

      // Stage 3: Embedding (if provider available, with timeout)
      if (this.embeddingProvider) {
        await this.updateStatus(docId, 'embedding')
        await withTimeout(
          () => this.withRetry(() => this.generateEmbeddings(chunks), 'embedding'),
          this.config.embeddingTimeoutMs ?? DEFAULT_CONFIG.embeddingTimeoutMs!,
          'embedding'
        )
      }

      // Stage 4: Indexing (if provider available, with timeout)
      if (this.indexProvider) {
        await this.updateStatus(docId, 'indexing')
        await withTimeout(
          () => this.withRetry(() => this.indexProvider!.index(chunks), 'indexing'),
          this.config.indexingTimeoutMs ?? DEFAULT_CONFIG.indexingTimeoutMs!,
          'indexing'
        )
      }

      // Stage 5: Done — chunks become visible via getChunks only now.
      await this.updateStatus(docId, 'done')
      this.chunks.set(docId, chunks)

      return {
        documentId: docId,
        status: 'done',
        chunks,
        processingTimeMs: Date.now() - startTime,
      }
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error'
      document.errorMessage = errorMessage
      await this.updateStatus(docId, 'error')

      // Notify listener; an exception thrown by the callback is not caught here.
      this.config.onError?.(docId, error as Error)

      return {
        documentId: docId,
        status: 'error',
        chunks: [],
        processingTimeMs: Date.now() - startTime,
        error: errorMessage,
      }
    }
  }

  /**
   * Process multiple documents in parallel with thread-safe queue.
   *
   * Spawns up to `concurrency` workers (capped at docIds.length) that pull
   * ids from a shared ConcurrentQueue until it drains. Results are appended
   * in completion order, not in the order of docIds.
   *
   * @param docIds      Ids previously returned by createDocument.
   * @param concurrency Maximum number of documents processed at once.
   */
  async processDocuments(docIds: string[], concurrency: number = 5): Promise<PipelineResult[]> {
    const results: PipelineResult[] = []
    const resultsMutex = new Mutex()
    const queue = new ConcurrentQueue<string>()

    // Enqueue all document IDs
    await queue.enqueueBatch(docIds)

    const processNext = async (): Promise<void> => {
      while (!(await queue.isEmpty())) {
        // Another worker may drain the queue between the isEmpty check and
        // this dequeue; the `if (docId)` guard covers that window.
        const docId = await queue.dequeue()
        if (docId) {
          const result = await this.processDocument(docId)
          // Thread-safe push to results array
          await resultsMutex.withLock(() => {
            results.push(result)
          })
        }
      }
    }

    // Create concurrent workers
    const workers = Array(Math.min(concurrency, docIds.length))
      .fill(null)
      .map(() => processNext())

    await Promise.all(workers)
    return results
  }

  /**
   * Reprocess a failed document.
   *
   * Clears the previous error state (retryCount, errorMessage, status) and
   * runs the full pipeline again.
   *
   * @throws NotFoundError when docId is not in the store.
   */
  async reprocessDocument(docId: string): Promise<PipelineResult> {
    const document = this.documents.get(docId)

    if (!document) {
      throw new NotFoundError('Document', docId, ErrorCode.DOCUMENT_NOT_FOUND)
    }

    // Reset retry count and clear error
    document.retryCount = 0
    document.errorMessage = undefined
    document.status = 'queued'

    return this.processDocument(docId)
  }

  /**
   * Get document by ID.
   */
  getDocument(docId: string): Document | undefined {
    return this.documents.get(docId)
  }

  /**
   * Get chunks for a document. Only populated after the document reaches
   * 'done' (set at the end of processDocument).
   */
  getChunks(docId: string): Chunk[] | undefined {
    return this.chunks.get(docId)
  }

  /**
   * Get all documents with a specific status.
   */
  getDocumentsByStatus(status: DocumentStatus): Document[] {
    return Array.from(this.documents.values()).filter((doc) => doc.status === status)
  }

  /**
   * Delete a document and its chunks.
   *
   * Also removes the document from the index provider, if one is set; an
   * index-removal failure propagates and leaves the in-memory entries intact.
   */
  async deleteDocument(docId: string): Promise<void> {
    if (this.indexProvider) {
      await this.indexProvider.remove(docId)
    }

    this.documents.delete(docId)
    this.chunks.delete(docId)
  }

  /**
   * Get pipeline statistics: total documents, a per-status breakdown, and
   * the total number of stored chunks.
   */
  getStats(): {
    total: number
    byStatus: Record<DocumentStatus, number>
    totalChunks: number
  } {
    const docs = Array.from(this.documents.values())
    const statuses: DocumentStatus[] = ['queued', 'extracting', 'chunking', 'embedding', 'indexing', 'done', 'error']

    const byStatus = statuses.reduce(
      (acc, status) => {
        acc[status] = docs.filter((d) => d.status === status).length
        return acc
      },
      {} as Record<DocumentStatus, number>
    )

    const totalChunks = Array.from(this.chunks.values()).reduce((sum, chunks) => sum + chunks.length, 0)

    return {
      total: docs.length,
      byStatus,
      totalChunks,
    }
  }

  /**
   * Update document status and notify listeners.
   * Silently a no-op for unknown ids (callers validate existence first).
   */
  private async updateStatus(docId: string, status: DocumentStatus): Promise<void> {
    const document = this.documents.get(docId)
    if (document) {
      document.status = status
      document.updatedAt = new Date()
      this.config.onStatusChange?.(docId, status)
    }
  }

  /**
   * Execute with retry logic.
   *
   * Runs the operation up to maxRetries + 1 times with exponential backoff
   * (retryDelayMs * 2^attempt between attempts). On final failure the last
   * error is wrapped in an ExtractionError regardless of which stage failed.
   *
   * @throws ExtractionError after all attempts are exhausted.
   */
  private async withRetry<T>(operation: () => Promise<T>, stageName: string): Promise<T> {
    let lastError: Error | undefined

    for (let attempt = 0; attempt <= this.config.maxRetries; attempt++) {
      try {
        return await operation()
      } catch (error) {
        lastError = error as Error

        if (attempt < this.config.maxRetries) {
          // Exponential backoff
          const delay = this.config.retryDelayMs * Math.pow(2, attempt)
          await this.delay(delay)
        }
      }
    }

    throw new ExtractionError(
      `${stageName} failed after ${this.config.maxRetries + 1} attempts: ${lastError?.message}`,
      undefined,
      {
        stage: stageName,
        attempts: this.config.maxRetries + 1,
        lastError: lastError?.message,
      }
    )
  }

  /**
   * Generate embeddings for chunks.
   *
   * Embeds all chunk contents in a single batch call and writes the results
   * back onto the chunk objects in place, pairing embeddings[i] with
   * chunks[i].
   */
  private async generateEmbeddings(chunks: Chunk[]): Promise<void> {
    if (!this.embeddingProvider) return

    const texts = chunks.map((c) => c.content)
    const embeddings = await this.embeddingProvider.embedBatch(texts)

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i]
      const embedding = embeddings[i]
      if (chunk && embedding) {
        chunk.embedding = embedding
      }
    }
  }

  /**
   * Delay utility.
   */
  private delay(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms))
  }

  /**
   * Export documents for backup.
   * Returns live references (not deep copies) to the stored objects.
   */
  exportDocuments(): { documents: Document[]; chunks: Record<string, Chunk[]> } {
    return {
      documents: Array.from(this.documents.values()),
      chunks: Object.fromEntries(this.chunks.entries()),
    }
  }

  /**
   * Import documents from backup. Entries with matching ids are
   * overwritten; everything else is left untouched.
   */
  importDocuments(data: { documents: Document[]; chunks: Record<string, Chunk[]> }): void {
    for (const doc of data.documents) {
      this.documents.set(doc.id, doc)
    }

    for (const [docId, docChunks] of Object.entries(data.chunks)) {
      this.chunks.set(docId, docChunks)
    }
  }

  /**
   * Clear all documents.
   */
  clear(): void {
    this.documents.clear()
    this.chunks.clear()
  }
}
|