pdf-brain 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli.ts +35 -0
- package/src/index.ts +6 -0
- package/src/services/Database.ts +74 -0
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -85,6 +85,9 @@ Commands:
|
|
|
85
85
|
|
|
86
86
|
check Check if Ollama is ready
|
|
87
87
|
|
|
88
|
+
repair Fix database integrity issues
|
|
89
|
+
Removes orphaned chunks/embeddings
|
|
90
|
+
|
|
88
91
|
migrate Database migration utilities
|
|
89
92
|
--check Check if migration is needed
|
|
90
93
|
--import <file> Import from SQL dump file
|
|
@@ -322,6 +325,38 @@ const program = Effect.gen(function* () {
|
|
|
322
325
|
break;
|
|
323
326
|
}
|
|
324
327
|
|
|
328
|
+
case "repair": {
  // Run the library's integrity repair, then print a summary of what it
  // reported removing (or a "healthy" message when every count is zero).
  yield* Console.log("Checking database integrity...\n");
  const { orphanedChunks, orphanedEmbeddings, zeroVectorEmbeddings } =
    yield* library.repair();

  const nothingRemoved =
    orphanedChunks === 0 &&
    orphanedEmbeddings === 0 &&
    zeroVectorEmbeddings === 0;
  if (nothingRemoved) {
    yield* Console.log("✓ Database is healthy - no repairs needed");
    break;
  }

  yield* Console.log("Repairs completed:");
  // One entry per repair category; a line is printed only when that
  // category's count is positive.
  const summary = [
    {
      removed: orphanedChunks,
      line: ` • Removed ${orphanedChunks} orphaned chunks`,
    },
    {
      removed: orphanedEmbeddings,
      line: ` • Removed ${orphanedEmbeddings} orphaned embeddings`,
    },
    {
      removed: zeroVectorEmbeddings,
      line: ` • Removed ${zeroVectorEmbeddings} zero-dimension embeddings`,
    },
  ];
  for (const { removed, line } of summary) {
    if (removed > 0) {
      yield* Console.log(line);
    }
  }
  yield* Console.log("\n✓ Database repaired");
  break;
}
|
|
359
|
+
|
|
325
360
|
default:
|
|
326
361
|
yield* Console.error(`Unknown command: ${command}`);
|
|
327
362
|
yield* Console.log(HELP);
|
package/src/index.ts
CHANGED
|
@@ -289,6 +289,12 @@ export class PDFLibrary extends Effect.Service<PDFLibrary>()("PDFLibrary", {
|
|
|
289
289
|
libraryPath: config.libraryPath,
|
|
290
290
|
};
|
|
291
291
|
}),
|
|
292
|
+
|
|
293
|
+
/**
 * Repair database integrity issues.
 *
 * Thin pass-through: calls `db.repair()` and hands its result back
 * unchanged, with no additional processing at this layer.
 */
repair() {
  return db.repair();
},
|
|
292
298
|
};
|
|
293
299
|
}),
|
|
294
300
|
dependencies: [OllamaLive, PDFExtractorLive, DatabaseLive],
|
package/src/services/Database.ts
CHANGED
|
@@ -76,6 +76,16 @@ export class Database extends Context.Tag("Database")<
|
|
|
76
76
|
{ documents: number; chunks: number; embeddings: number },
|
|
77
77
|
DatabaseError
|
|
78
78
|
>;
|
|
79
|
+
|
|
80
|
+
// Maintenance
|
|
81
|
+
readonly repair: () => Effect.Effect<
|
|
82
|
+
{
|
|
83
|
+
orphanedChunks: number;
|
|
84
|
+
orphanedEmbeddings: number;
|
|
85
|
+
zeroVectorEmbeddings: number;
|
|
86
|
+
},
|
|
87
|
+
DatabaseError
|
|
88
|
+
>;
|
|
79
89
|
}
|
|
80
90
|
>() {}
|
|
81
91
|
|
|
@@ -464,6 +474,70 @@ export const DatabaseLive = Layer.scoped(
|
|
|
464
474
|
},
|
|
465
475
|
catch: (e) => new DatabaseError({ reason: String(e) }),
|
|
466
476
|
}),
|
|
477
|
+
|
|
478
|
+
/**
 * Repair database integrity issues.
 *
 * Runs three clean-up passes in dependency order. Each pass counts the
 * offending rows and deletes them only when the count is positive:
 *   1. chunks whose doc_id has no matching row in documents
 *   2. embeddings whose chunk_id has no matching row in chunks
 *   3. embeddings whose vector is NULL or reports zero dimensions
 *
 * Chunks are purged BEFORE embeddings on purpose: removing an orphaned
 * chunk orphans its embeddings, and those must be picked up by pass 2 in
 * the same run. Each count is taken immediately before its delete, so the
 * returned numbers match what each pass removed and a row that fits two
 * categories is reported only once.
 *
 * NOTE(review): the passes are not wrapped in a transaction; if a later
 * query fails, earlier deletions stay applied.
 *
 * @returns Effect yielding `{ orphanedChunks, orphanedEmbeddings,
 *   zeroVectorEmbeddings }`, the per-pass row counts.
 *   Fails with `DatabaseError` (reason = stringified cause) when any query
 *   rejects.
 */
repair: () =>
  Effect.tryPromise({
    try: async () => {
      // Pull the single COUNT(*) value out of a query result. Wrapped in
      // Number() as in the count handling this replaces, so a non-number
      // count value coming back from the driver is still normalised.
      const readCount = (result: { rows: unknown[] }) =>
        Number((result.rows[0] as { count: number }).count);

      // Pass 1: orphaned chunks (doc_id not in documents)
      const orphanedChunks = readCount(
        await db.query(`
          SELECT COUNT(*) as count FROM chunks c
          WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = c.doc_id)
        `),
      );
      if (orphanedChunks > 0) {
        await db.query(`
          DELETE FROM chunks c
          WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = c.doc_id)
        `);
      }

      // Pass 2: orphaned embeddings (chunk_id not in chunks). Counted after
      // pass 1 so embeddings of just-deleted chunks are included.
      const orphanedEmbeddings = readCount(
        await db.query(`
          SELECT COUNT(*) as count FROM embeddings e
          WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.id = e.chunk_id)
        `),
      );
      if (orphanedEmbeddings > 0) {
        await db.query(`
          DELETE FROM embeddings e
          WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.id = e.chunk_id)
        `);
      }

      // Pass 3: malformed embeddings (NULL vector or vector_dims = 0).
      // Counted after pass 2 so rows already removed as orphans are not
      // reported a second time.
      const zeroVectorEmbeddings = readCount(
        await db.query(`
          SELECT COUNT(*) as count FROM embeddings
          WHERE embedding IS NULL OR vector_dims(embedding) = 0
        `),
      );
      if (zeroVectorEmbeddings > 0) {
        await db.query(`
          DELETE FROM embeddings
          WHERE embedding IS NULL OR vector_dims(embedding) = 0
        `);
      }

      return {
        orphanedChunks,
        orphanedEmbeddings,
        zeroVectorEmbeddings,
      };
    },
    catch: (e) => new DatabaseError({ reason: String(e) }),
  }),
|
|
467
541
|
};
|
|
468
542
|
}),
|
|
469
543
|
);
|