pdf-brain 0.1.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pdf-brain",
3
- "version": "0.1.1",
3
+ "version": "0.4.0",
4
4
  "description": "Local PDF knowledge base with vector search",
5
5
  "type": "module",
6
6
  "main": "src/index.ts",
@@ -18,7 +18,8 @@
18
18
  "@effect/platform": "^0.72.0",
19
19
  "@effect/platform-bun": "^0.52.0",
20
20
  "@effect/schema": "^0.75.0",
21
- "@electric-sql/pglite": "^0.2.17",
21
+ "@electric-sql/pglite": "^0.3.14",
22
+ "@electric-sql/pglite-tools": "^0.2.19",
22
23
  "effect": "^3.12.0"
23
24
  },
24
25
  "devDependencies": {
@@ -0,0 +1,150 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Export data from PGlite 0.2.x (PostgreSQL 16) database
4
+ *
5
+ * This script exports your existing database using PGlite 0.2.x before upgrading.
6
+ *
7
+ * Prerequisites:
8
+ * npm install @electric-sql/pglite@0.2.12
9
+ *
10
+ * Usage:
11
+ * node export-pg16.mjs [db-path] [output-dir]
12
+ *
13
+ * Example:
14
+ * node export-pg16.mjs ~/.pdf-library/library ~/.pdf-library
15
+ */
16
+
17
+ import { PGlite } from '@electric-sql/pglite';
18
+ import { writeFileSync, appendFileSync, existsSync, unlinkSync, mkdirSync } from 'fs';
19
+ import { join, dirname } from 'path';
20
+
21
+ const args = process.argv.slice(2);
22
+ const dbPath = args[0] || join(process.env.HOME, 'Documents/.pdf-library/library');
23
+ const outputDir = args[1] || dirname(dbPath);
24
+
25
+ async function main() {
26
+ console.log('=== PGlite 0.2.x Database Export ===\n');
27
+ console.log(`Database: ${dbPath}`);
28
+ console.log(`Output: ${outputDir}\n`);
29
+
30
+ // Check if database exists
31
+ if (!existsSync(dbPath)) {
32
+ console.error(`Error: Database not found at ${dbPath}`);
33
+ process.exit(1);
34
+ }
35
+
36
+ // Check PG_VERSION
37
+ const versionFile = join(dbPath, 'PG_VERSION');
38
+ if (existsSync(versionFile)) {
39
+ const version = (await import('fs')).readFileSync(versionFile, 'utf-8').trim();
40
+ console.log(`PostgreSQL version: ${version}`);
41
+ if (version !== '16') {
42
+ console.warn(`Warning: Expected PG version 16, found ${version}`);
43
+ }
44
+ }
45
+
46
+ console.log('\nOpening database with PGlite 0.2.x...');
47
+
48
+ let db;
49
+ try {
50
+ db = await PGlite.create({ dataDir: dbPath });
51
+ } catch (e) {
52
+ console.error(`Failed to open database: ${e.message}`);
53
+ console.error('\nMake sure you have PGlite 0.2.x installed:');
54
+ console.error(' npm install @electric-sql/pglite@0.2.12');
55
+ process.exit(1);
56
+ }
57
+
58
+ // Ensure output directory exists
59
+ if (!existsSync(outputDir)) {
60
+ mkdirSync(outputDir, { recursive: true });
61
+ }
62
+
63
+ // Export documents
64
+ console.log('\nExporting documents...');
65
+ const docs = await db.query('SELECT * FROM documents');
66
+ console.log(` Found ${docs.rows.length} documents`);
67
+
68
+ const docsFile = join(outputDir, 'backup-documents.json');
69
+ writeFileSync(docsFile, JSON.stringify(docs.rows, null, 2));
70
+ console.log(` Saved to: ${docsFile}`);
71
+
72
+ // Count chunks
73
+ const chunkCount = await db.query('SELECT COUNT(*) as c FROM chunks');
74
+ const totalChunks = parseInt(chunkCount.rows[0].c);
75
+ console.log(`\nExporting ${totalChunks} chunks...`);
76
+
77
+ // Export chunks in batches
78
+ const chunksFile = join(outputDir, 'backup-chunks.jsonl');
79
+ if (existsSync(chunksFile)) unlinkSync(chunksFile);
80
+
81
+ const batchSize = 100;
82
+ let exported = 0;
83
+
84
+ for (let offset = 0; offset < totalChunks; offset += batchSize) {
85
+ const batch = await db.query(
86
+ `SELECT * FROM chunks ORDER BY id LIMIT ${batchSize} OFFSET ${offset}`
87
+ );
88
+
89
+ for (const row of batch.rows) {
90
+ appendFileSync(chunksFile, JSON.stringify(row) + '\n');
91
+ }
92
+
93
+ exported += batch.rows.length;
94
+ if (exported % 10000 === 0 || exported === totalChunks) {
95
+ console.log(` Progress: ${exported}/${totalChunks}`);
96
+ }
97
+ }
98
+ console.log(` Saved to: ${chunksFile}`);
99
+
100
+ // Try to export embeddings (may fail if vector extension issues)
101
+ console.log('\nExporting embeddings...');
102
+ try {
103
+ const embCount = await db.query('SELECT COUNT(*) as c FROM embeddings');
104
+ const totalEmb = parseInt(embCount.rows[0].c);
105
+ console.log(` Found ${totalEmb} embeddings`);
106
+
107
+ if (totalEmb > 0) {
108
+ const embFile = join(outputDir, 'backup-embeddings.jsonl');
109
+ if (existsSync(embFile)) unlinkSync(embFile);
110
+
111
+ let embExported = 0;
112
+ for (let offset = 0; offset < totalEmb; offset += batchSize) {
113
+ const batch = await db.query(
114
+ `SELECT chunk_id, embedding::text as embedding FROM embeddings ORDER BY chunk_id LIMIT ${batchSize} OFFSET ${offset}`
115
+ );
116
+
117
+ for (const row of batch.rows) {
118
+ appendFileSync(embFile, JSON.stringify(row) + '\n');
119
+ }
120
+
121
+ embExported += batch.rows.length;
122
+ if (embExported % 5000 === 0 || embExported === totalEmb) {
123
+ console.log(` Progress: ${embExported}/${totalEmb}`);
124
+ }
125
+ }
126
+ console.log(` Saved to: ${embFile}`);
127
+ }
128
+ } catch (e) {
129
+ console.log(` Skipped: ${e.message}`);
130
+ console.log(' (Embeddings can be regenerated after import)');
131
+ }
132
+
133
+ await db.close();
134
+
135
+ console.log('\n=== Export Complete ===');
136
+ console.log('\nBackup files:');
137
+ console.log(` - ${join(outputDir, 'backup-documents.json')}`);
138
+ console.log(` - ${join(outputDir, 'backup-chunks.jsonl')}`);
139
+ if (existsSync(join(outputDir, 'backup-embeddings.jsonl'))) {
140
+ console.log(` - ${join(outputDir, 'backup-embeddings.jsonl')}`);
141
+ }
142
+ console.log('\nNext steps:');
143
+ console.log(' 1. Upgrade to PGlite 0.3.x: bun install');
144
+ console.log(' 2. Import data: bun run scripts/migration/import-pg17.ts');
145
+ }
146
+
147
+ main().catch(e => {
148
+ console.error('Export failed:', e);
149
+ process.exit(1);
150
+ });
@@ -0,0 +1,284 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * Import backup data into PGlite 0.3.x (PostgreSQL 17) database
4
+ *
5
+ * This script imports data exported from PGlite 0.2.x.
6
+ *
7
+ * Usage:
8
+ * bun run scripts/migration/import-pg17.ts [backup-dir] [db-path]
9
+ *
10
+ * Example:
11
+ * bun run scripts/migration/import-pg17.ts ~/.pdf-library ~/.pdf-library/library
12
+ */
13
+
14
+ import { PGlite } from "@electric-sql/pglite";
15
+ import { vector } from "@electric-sql/pglite/vector";
16
+ import { readFileSync, createReadStream, existsSync } from "fs";
17
+ import { createInterface } from "readline";
18
+ import { join } from "path";
19
+
20
+ const EMBEDDING_DIM = 1024;
21
+
22
+ const args = process.argv.slice(2);
23
+ const backupDir = args[0] || join(process.env.HOME!, "Documents/.pdf-library");
24
+ const dbPath = args[1] || join(backupDir, "library");
25
+
26
+ async function main() {
27
+ console.log("=== PGlite 0.3.x Database Import ===\n");
28
+ console.log(`Backup dir: ${backupDir}`);
29
+ console.log(`Database: ${dbPath}\n`);
30
+
31
+ // Check backup files exist
32
+ const docsFile = join(backupDir, "backup-documents.json");
33
+ const chunksFile = join(backupDir, "backup-chunks.jsonl");
34
+
35
+ if (!existsSync(docsFile)) {
36
+ console.error(`Error: Documents backup not found: ${docsFile}`);
37
+ console.error("\nRun the export script first:");
38
+ console.error(" node scripts/migration/export-pg16.mjs");
39
+ process.exit(1);
40
+ }
41
+
42
+ if (!existsSync(chunksFile)) {
43
+ console.error(`Error: Chunks backup not found: ${chunksFile}`);
44
+ process.exit(1);
45
+ }
46
+
47
+ // Check if database already exists
48
+ if (existsSync(dbPath)) {
49
+ const versionFile = join(dbPath, "PG_VERSION");
50
+ if (existsSync(versionFile)) {
51
+ const version = readFileSync(versionFile, "utf-8").trim();
52
+ if (version === "17") {
53
+ console.error("Error: PG17 database already exists at this location.");
54
+ console.error("Remove it first if you want to re-import:");
55
+ console.error(` rm -r "${dbPath}"`);
56
+ process.exit(1);
57
+ }
58
+ }
59
+ }
60
+
61
+ console.log("Creating new PGlite 0.3.x database...");
62
+ const db = new PGlite(dbPath, { extensions: { vector } });
63
+ await db.waitReady;
64
+
65
+ console.log("Initializing schema...");
66
+ await db.exec("CREATE EXTENSION IF NOT EXISTS vector;");
67
+
68
+ await db.exec(`
69
+ CREATE TABLE IF NOT EXISTS documents (
70
+ id TEXT PRIMARY KEY,
71
+ title TEXT NOT NULL,
72
+ path TEXT NOT NULL UNIQUE,
73
+ added_at TIMESTAMPTZ NOT NULL,
74
+ page_count INTEGER NOT NULL,
75
+ size_bytes INTEGER NOT NULL,
76
+ tags JSONB DEFAULT '[]',
77
+ metadata JSONB DEFAULT '{}'
78
+ )
79
+ `);
80
+
81
+ await db.exec(`
82
+ CREATE TABLE IF NOT EXISTS chunks (
83
+ id TEXT PRIMARY KEY,
84
+ doc_id TEXT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
85
+ page INTEGER NOT NULL,
86
+ chunk_index INTEGER NOT NULL,
87
+ content TEXT NOT NULL
88
+ )
89
+ `);
90
+
91
+ await db.exec(`
92
+ CREATE TABLE IF NOT EXISTS embeddings (
93
+ chunk_id TEXT PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
94
+ embedding vector(${EMBEDDING_DIM}) NOT NULL
95
+ )
96
+ `);
97
+
98
+ // Create indexes
99
+ await db.exec(`
100
+ CREATE INDEX IF NOT EXISTS embeddings_hnsw_idx
101
+ ON embeddings USING hnsw (embedding vector_cosine_ops)
102
+ `);
103
+ await db.exec(`
104
+ CREATE INDEX IF NOT EXISTS chunks_content_idx
105
+ ON chunks USING gin (to_tsvector('english', content))
106
+ `);
107
+ await db.exec(`CREATE INDEX IF NOT EXISTS idx_chunks_doc ON chunks(doc_id)`);
108
+ await db.exec(`CREATE INDEX IF NOT EXISTS idx_docs_path ON documents(path)`);
109
+
110
+ console.log("Schema created!\n");
111
+
112
+ // Import documents
113
+ console.log("Importing documents...");
114
+ const docs = JSON.parse(readFileSync(docsFile, "utf-8"));
115
+ const docIds = new Set<string>();
116
+
117
+ for (const doc of docs) {
118
+ docIds.add(doc.id);
119
+ await db.query(
120
+ `INSERT INTO documents (id, title, path, added_at, page_count, size_bytes, tags, metadata)
121
+ VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
122
+ ON CONFLICT (id) DO NOTHING`,
123
+ [
124
+ doc.id,
125
+ doc.title,
126
+ doc.path,
127
+ doc.added_at,
128
+ doc.page_count,
129
+ doc.size_bytes,
130
+ JSON.stringify(doc.tags || []),
131
+ JSON.stringify(doc.metadata || {}),
132
+ ],
133
+ );
134
+ }
135
+ console.log(` Imported ${docs.length} documents`);
136
+
137
+ // Import chunks
138
+ console.log("\nImporting chunks...");
139
+ const rl = createInterface({
140
+ input: createReadStream(chunksFile),
141
+ crlfDelay: Infinity,
142
+ });
143
+
144
+ let chunkCount = 0;
145
+ let skipped = 0;
146
+ let batch: any[] = [];
147
+ const BATCH_SIZE = 100;
148
+
149
+ for await (const line of rl) {
150
+ if (!line.trim()) continue;
151
+ const chunk = JSON.parse(line);
152
+
153
+ // Skip orphaned chunks
154
+ if (!docIds.has(chunk.doc_id)) {
155
+ skipped++;
156
+ continue;
157
+ }
158
+
159
+ batch.push(chunk);
160
+
161
+ if (batch.length >= BATCH_SIZE) {
162
+ await db.exec("BEGIN");
163
+ for (const c of batch) {
164
+ await db.query(
165
+ `INSERT INTO chunks (id, doc_id, page, chunk_index, content)
166
+ VALUES ($1, $2, $3, $4, $5)
167
+ ON CONFLICT (id) DO NOTHING`,
168
+ [c.id, c.doc_id, c.page, c.chunk_index, c.content],
169
+ );
170
+ }
171
+ await db.exec("COMMIT");
172
+ chunkCount += batch.length;
173
+ batch = [];
174
+
175
+ if (chunkCount % 10000 === 0) {
176
+ console.log(` Progress: ${chunkCount} chunks...`);
177
+ }
178
+ }
179
+ }
180
+
181
+ // Final batch
182
+ if (batch.length > 0) {
183
+ await db.exec("BEGIN");
184
+ for (const c of batch) {
185
+ await db.query(
186
+ `INSERT INTO chunks (id, doc_id, page, chunk_index, content)
187
+ VALUES ($1, $2, $3, $4, $5)
188
+ ON CONFLICT (id) DO NOTHING`,
189
+ [c.id, c.doc_id, c.page, c.chunk_index, c.content],
190
+ );
191
+ }
192
+ await db.exec("COMMIT");
193
+ chunkCount += batch.length;
194
+ }
195
+
196
+ console.log(` Imported ${chunkCount} chunks`);
197
+ if (skipped > 0) {
198
+ console.log(` Skipped ${skipped} orphaned chunks`);
199
+ }
200
+
201
+ // Import embeddings if available
202
+ const embFile = join(backupDir, "backup-embeddings.jsonl");
203
+ if (existsSync(embFile)) {
204
+ console.log("\nImporting embeddings...");
205
+ const embRl = createInterface({
206
+ input: createReadStream(embFile),
207
+ crlfDelay: Infinity,
208
+ });
209
+
210
+ let embCount = 0;
211
+ let embBatch: any[] = [];
212
+
213
+ for await (const line of embRl) {
214
+ if (!line.trim()) continue;
215
+ const emb = JSON.parse(line);
216
+ embBatch.push(emb);
217
+
218
+ if (embBatch.length >= 50) {
219
+ await db.exec("BEGIN");
220
+ for (const e of embBatch) {
221
+ await db.query(
222
+ `INSERT INTO embeddings (chunk_id, embedding)
223
+ VALUES ($1, $2::vector)
224
+ ON CONFLICT (chunk_id) DO NOTHING`,
225
+ [e.chunk_id, e.embedding],
226
+ );
227
+ }
228
+ await db.exec("COMMIT");
229
+ embCount += embBatch.length;
230
+ embBatch = [];
231
+
232
+ if (embCount % 5000 === 0) {
233
+ console.log(` Progress: ${embCount} embeddings...`);
234
+ }
235
+ }
236
+ }
237
+
238
+ if (embBatch.length > 0) {
239
+ await db.exec("BEGIN");
240
+ for (const e of embBatch) {
241
+ await db.query(
242
+ `INSERT INTO embeddings (chunk_id, embedding)
243
+ VALUES ($1, $2::vector)
244
+ ON CONFLICT (chunk_id) DO NOTHING`,
245
+ [e.chunk_id, e.embedding],
246
+ );
247
+ }
248
+ await db.exec("COMMIT");
249
+ embCount += embBatch.length;
250
+ }
251
+
252
+ console.log(` Imported ${embCount} embeddings`);
253
+ }
254
+
255
+ // Verify
256
+ const docResult = await db.query<{ c: number }>(
257
+ "SELECT COUNT(*) as c FROM documents",
258
+ );
259
+ const chunkResult = await db.query<{ c: number }>(
260
+ "SELECT COUNT(*) as c FROM chunks",
261
+ );
262
+ const embResult = await db.query<{ c: number }>(
263
+ "SELECT COUNT(*) as c FROM embeddings",
264
+ );
265
+
266
+ console.log("\n=== Import Complete ===");
267
+ console.log(`Documents: ${docResult.rows[0]?.c}`);
268
+ console.log(`Chunks: ${chunkResult.rows[0]?.c}`);
269
+ console.log(`Embeddings: ${embResult.rows[0]?.c}`);
270
+
271
+ if (Number(embResult.rows[0]?.c) === 0) {
272
+ console.log("\nEmbeddings need to be regenerated.");
273
+ console.log("Run: bun run scripts/migration/regenerate-embeddings.ts");
274
+ }
275
+
276
+ console.log("\nVerify with: pdf-library stats");
277
+
278
+ await db.close();
279
+ }
280
+
281
+ main().catch((e) => {
282
+ console.error("Import failed:", e);
283
+ process.exit(1);
284
+ });
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * Regenerate embeddings for chunks using Ollama
4
+ *
5
+ * This script generates embeddings for chunks that don't have them.
6
+ * Useful after migrating from PGlite 0.2.x where embeddings couldn't be exported.
7
+ *
8
+ * Prerequisites:
9
+ * - Ollama running: ollama serve
10
+ * - Embedding model: ollama pull mxbai-embed-large
11
+ *
12
+ * Usage:
13
+ * bun run scripts/migration/regenerate-embeddings.ts [db-path]
14
+ *
15
+ * Environment:
16
+ * OLLAMA_HOST - Ollama server URL (default: http://localhost:11434)
17
+ * OLLAMA_MODEL - Embedding model (default: mxbai-embed-large)
18
+ */
19
+
20
+ import { PGlite } from "@electric-sql/pglite";
21
+ import { vector } from "@electric-sql/pglite/vector";
22
+ import { join } from "path";
23
+
24
+ const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://localhost:11434";
25
+ const OLLAMA_MODEL = process.env.OLLAMA_MODEL || "mxbai-embed-large";
26
+ const BATCH_SIZE = 5; // Process 5 chunks at a time (balance speed vs memory)
27
+
28
+ const args = process.argv.slice(2);
29
+ const dbPath =
30
+ args[0] || join(process.env.HOME!, "Documents/.pdf-library/library");
31
+
32
+ interface EmbeddingResponse {
33
+ embedding: number[];
34
+ }
35
+
36
+ async function getEmbedding(text: string): Promise<number[]> {
37
+ const response = await fetch(`${OLLAMA_HOST}/api/embeddings`, {
38
+ method: "POST",
39
+ headers: { "Content-Type": "application/json" },
40
+ body: JSON.stringify({ model: OLLAMA_MODEL, prompt: text }),
41
+ });
42
+
43
+ if (!response.ok) {
44
+ throw new Error(`Ollama error: ${response.status} ${response.statusText}`);
45
+ }
46
+
47
+ const data = (await response.json()) as EmbeddingResponse;
48
+ return data.embedding;
49
+ }
50
+
51
+ async function checkOllama(): Promise<boolean> {
52
+ try {
53
+ const response = await fetch(`${OLLAMA_HOST}/api/tags`);
54
+ if (!response.ok) return false;
55
+
56
+ // Test embedding generation
57
+ const testEmb = await getEmbedding("test");
58
+ console.log(
59
+ `Ollama ready! Model: ${OLLAMA_MODEL}, Dimension: ${testEmb.length}`,
60
+ );
61
+ return true;
62
+ } catch {
63
+ return false;
64
+ }
65
+ }
66
+
67
+ async function main() {
68
+ console.log("=== Embedding Regeneration ===\n");
69
+ console.log(`Database: ${dbPath}`);
70
+ console.log(`Ollama: ${OLLAMA_HOST}`);
71
+ console.log(`Model: ${OLLAMA_MODEL}\n`);
72
+
73
+ // Check Ollama
74
+ console.log("Checking Ollama...");
75
+ if (!(await checkOllama())) {
76
+ console.error("\nOllama not available. Make sure it's running:");
77
+ console.error(" ollama serve");
78
+ console.error(` ollama pull ${OLLAMA_MODEL}`);
79
+ process.exit(1);
80
+ }
81
+
82
+ // Connect to database
83
+ console.log("\nConnecting to database...");
84
+ const db = new PGlite(dbPath, { extensions: { vector } });
85
+ await db.waitReady;
86
+
87
+ // Count chunks needing embeddings
88
+ const countResult = await db.query<{ c: string }>(`
89
+ SELECT COUNT(*) as c FROM chunks c
90
+ LEFT JOIN embeddings e ON e.chunk_id = c.id
91
+ WHERE e.chunk_id IS NULL
92
+ `);
93
+ const total = parseInt(countResult.rows[0]?.c || "0");
94
+
95
+ console.log(`Chunks needing embeddings: ${total}`);
96
+
97
+ if (total === 0) {
98
+ console.log("\nAll chunks have embeddings!");
99
+ await db.close();
100
+ return;
101
+ }
102
+
103
+ // Estimate time
104
+ const estimatedMinutes = Math.ceil(total / 15 / 60); // ~15 embeddings/sec
105
+ console.log(`Estimated time: ~${estimatedMinutes} minutes\n`);
106
+
107
+ let processed = 0;
108
+ let errors = 0;
109
+ const startTime = Date.now();
110
+
111
+ while (processed + errors < total) {
112
+ // Get batch of chunks without embeddings
113
+ const batch = await db.query<{ id: string; content: string }>(`
114
+ SELECT c.id, c.content FROM chunks c
115
+ LEFT JOIN embeddings e ON e.chunk_id = c.id
116
+ WHERE e.chunk_id IS NULL
117
+ LIMIT ${BATCH_SIZE}
118
+ `);
119
+
120
+ if (batch.rows.length === 0) break;
121
+
122
+ // Generate embeddings for batch
123
+ for (const row of batch.rows) {
124
+ try {
125
+ const embedding = await getEmbedding(row.content);
126
+ const vectorStr = `[${embedding.join(",")}]`;
127
+
128
+ await db.query(
129
+ `INSERT INTO embeddings (chunk_id, embedding)
130
+ VALUES ($1, $2::vector)
131
+ ON CONFLICT (chunk_id) DO NOTHING`,
132
+ [row.id, vectorStr],
133
+ );
134
+
135
+ processed++;
136
+ } catch (e) {
137
+ errors++;
138
+ console.error(`Error on chunk ${row.id}: ${e}`);
139
+ }
140
+ }
141
+
142
+ // Progress update every 100 chunks
143
+ if (processed % 100 === 0 || processed === total) {
144
+ const elapsed = (Date.now() - startTime) / 1000;
145
+ const rate = processed / elapsed;
146
+ const remaining = (total - processed) / rate;
147
+ const pct = ((processed / total) * 100).toFixed(1);
148
+
149
+ console.log(
150
+ `Progress: ${processed}/${total} (${pct}%) | ` +
151
+ `${rate.toFixed(1)}/s | ` +
152
+ `ETA: ${Math.ceil(remaining / 60)}min`,
153
+ );
154
+ }
155
+ }
156
+
157
+ // Final stats
158
+ const embResult = await db.query<{ c: string }>(
159
+ "SELECT COUNT(*) as c FROM embeddings",
160
+ );
161
+ const finalCount = parseInt(embResult.rows[0]?.c || "0");
162
+
163
+ console.log("\n=== Complete ===");
164
+ console.log(`Embeddings generated: ${processed}`);
165
+ console.log(`Total embeddings: ${finalCount}`);
166
+ console.log(`Errors: ${errors}`);
167
+ console.log(
168
+ `Time: ${((Date.now() - startTime) / 1000 / 60).toFixed(1)} minutes`,
169
+ );
170
+
171
+ await db.close();
172
+ }
173
+
174
+ main().catch((e) => {
175
+ console.error("Regeneration failed:", e);
176
+ process.exit(1);
177
+ });
package/src/cli.ts CHANGED
@@ -14,6 +14,7 @@ import {
14
14
  LibraryConfig,
15
15
  URLFetchError,
16
16
  } from "./index.js";
17
+ import { Migration, MigrationLive } from "./services/Migration.js";
17
18
 
18
19
  /**
19
20
  * Check if a string is a URL
@@ -84,6 +85,14 @@ Commands:
84
85
 
85
86
  check Check if Ollama is ready
86
87
 
88
+ repair Fix database integrity issues
89
+ Removes orphaned chunks/embeddings
90
+
91
+ migrate Database migration utilities
92
+ --check Check if migration is needed
93
+ --import <file> Import from SQL dump file
94
+ --generate-script Generate export script for current database
95
+
87
96
  Options:
88
97
  --help, -h Show this help
89
98
 
@@ -91,6 +100,8 @@ Examples:
91
100
  pdf-library add ./book.pdf --tags "programming,rust"
92
101
  pdf-library add https://example.com/paper.pdf --title "Research Paper"
93
102
  pdf-library search "machine learning" --limit 5
103
+ pdf-library migrate --check
104
+ pdf-library migrate --import backup.sql
94
105
  `;
95
106
 
96
107
  function parseArgs(args: string[]) {
@@ -314,6 +325,38 @@ const program = Effect.gen(function* () {
314
325
  break;
315
326
  }
316
327
 
328
+ case "repair": {
329
+ yield* Console.log("Checking database integrity...\n");
330
+ const result = yield* library.repair();
331
+
332
+ if (
333
+ result.orphanedChunks === 0 &&
334
+ result.orphanedEmbeddings === 0 &&
335
+ result.zeroVectorEmbeddings === 0
336
+ ) {
337
+ yield* Console.log("✓ Database is healthy - no repairs needed");
338
+ } else {
339
+ yield* Console.log("Repairs completed:");
340
+ if (result.orphanedChunks > 0) {
341
+ yield* Console.log(
342
+ ` • Removed ${result.orphanedChunks} orphaned chunks`,
343
+ );
344
+ }
345
+ if (result.orphanedEmbeddings > 0) {
346
+ yield* Console.log(
347
+ ` • Removed ${result.orphanedEmbeddings} orphaned embeddings`,
348
+ );
349
+ }
350
+ if (result.zeroVectorEmbeddings > 0) {
351
+ yield* Console.log(
352
+ ` • Removed ${result.zeroVectorEmbeddings} zero-dimension embeddings`,
353
+ );
354
+ }
355
+ yield* Console.log("\n✓ Database repaired");
356
+ }
357
+ break;
358
+ }
359
+
317
360
  default:
318
361
  yield* Console.error(`Unknown command: ${command}`);
319
362
  yield* Console.log(HELP);
@@ -321,15 +364,89 @@ const program = Effect.gen(function* () {
321
364
  }
322
365
  });
323
366
 
324
- // Run with error handling
325
- Effect.runPromise(
326
- program.pipe(
327
- Effect.provide(PDFLibraryLive),
328
- Effect.catchAll((error) =>
329
- Effect.gen(function* () {
330
- yield* Console.error(`Error: ${error._tag}: ${JSON.stringify(error)}`);
331
- process.exit(1);
332
- }),
367
+ // Handle migrate command separately (doesn't need full PDFLibrary)
368
+ const args = process.argv.slice(2);
369
+
370
+ if (args[0] === "migrate") {
371
+ const migrateProgram = Effect.gen(function* () {
372
+ const opts = parseArgs(args.slice(1));
373
+ const migration = yield* Migration;
374
+ const config = LibraryConfig.fromEnv();
375
+ const dbPath = config.dbPath.replace(".db", "");
376
+
377
+ if (opts.check) {
378
+ const needed = yield* migration.checkMigrationNeeded(dbPath);
379
+ if (needed) {
380
+ yield* Console.log(
381
+ "Migration needed:\n" + migration.getMigrationMessage(),
382
+ );
383
+ } else {
384
+ yield* Console.log("✓ No migration needed - database is compatible");
385
+ }
386
+ } else if (opts.import) {
387
+ yield* migration.importFromDump(opts.import as string, dbPath);
388
+ yield* Console.log("✓ Import complete");
389
+ } else if (opts["generate-script"]) {
390
+ yield* Console.log(migration.generateExportScript(dbPath));
391
+ } else {
392
+ // Default: check and show message
393
+ const needed = yield* migration.checkMigrationNeeded(dbPath);
394
+ if (needed) {
395
+ yield* Console.log(migration.getMigrationMessage());
396
+ } else {
397
+ yield* Console.log("✓ No migration needed - database is compatible");
398
+ }
399
+ }
400
+ });
401
+
402
+ Effect.runPromise(
403
+ migrateProgram.pipe(
404
+ Effect.provide(MigrationLive),
405
+ Effect.catchAll((error) =>
406
+ Effect.gen(function* () {
407
+ if (error._tag === "MigrationError") {
408
+ yield* Console.error(`Migration Error: ${error.message}`);
409
+ } else {
410
+ yield* Console.error(
411
+ `Error: ${error._tag}: ${JSON.stringify(error)}`,
412
+ );
413
+ }
414
+ process.exit(1);
415
+ }),
416
+ ),
333
417
  ),
334
- ),
335
- );
418
+ );
419
+ } else {
420
+ // Run with error handling
421
+ Effect.runPromise(
422
+ program.pipe(
423
+ Effect.provide(PDFLibraryLive),
424
+ Effect.catchAll((error) =>
425
+ Effect.gen(function* () {
426
+ // Check if it's a database initialization error
427
+ const errorStr = JSON.stringify(error);
428
+ if (
429
+ errorStr.includes("PGlite") ||
430
+ errorStr.includes("version") ||
431
+ errorStr.includes("incompatible")
432
+ ) {
433
+ yield* Console.error(
434
+ `Database Error: ${error._tag}: ${JSON.stringify(error)}`,
435
+ );
436
+ yield* Console.error(
437
+ "\nThis may be a database version compatibility issue.",
438
+ );
439
+ yield* Console.error(
440
+ "Run 'pdf-library migrate --check' to diagnose.",
441
+ );
442
+ } else {
443
+ yield* Console.error(
444
+ `Error: ${error._tag}: ${JSON.stringify(error)}`,
445
+ );
446
+ }
447
+ process.exit(1);
448
+ }),
449
+ ),
450
+ ),
451
+ );
452
+ }
package/src/index.ts CHANGED
@@ -289,6 +289,12 @@ export class PDFLibrary extends Effect.Service<PDFLibrary>()("PDFLibrary", {
289
289
  libraryPath: config.libraryPath,
290
290
  };
291
291
  }),
292
+
293
+ /**
294
+ * Repair database integrity issues
295
+ * Removes orphaned chunks and embeddings
296
+ */
297
+ repair: () => db.repair(),
292
298
  };
293
299
  }),
294
300
  dependencies: [OllamaLive, PDFExtractorLive, DatabaseLive],
@@ -76,6 +76,16 @@ export class Database extends Context.Tag("Database")<
76
76
  { documents: number; chunks: number; embeddings: number },
77
77
  DatabaseError
78
78
  >;
79
+
80
+ // Maintenance
81
+ readonly repair: () => Effect.Effect<
82
+ {
83
+ orphanedChunks: number;
84
+ orphanedEmbeddings: number;
85
+ zeroVectorEmbeddings: number;
86
+ },
87
+ DatabaseError
88
+ >;
79
89
  }
80
90
  >() {}
81
91
 
@@ -97,13 +107,10 @@ export const DatabaseLive = Layer.scoped(
97
107
  // PGlite stores data in a directory, not a single file
98
108
  const pgDataDir = config.dbPath.replace(".db", "");
99
109
 
100
- // Initialize PGlite with pgvector extension
101
- const db = yield* Effect.tryPromise({
102
- try: () =>
103
- PGlite.create({
104
- dataDir: pgDataDir,
105
- extensions: { vector },
106
- }),
110
+ // Initialize PGlite with pgvector extension (0.3.x API)
111
+ const db = new PGlite(pgDataDir, { extensions: { vector } });
112
+ yield* Effect.tryPromise({
113
+ try: () => db.waitReady,
107
114
  catch: (e) =>
108
115
  new DatabaseError({ reason: `Failed to init PGlite: ${e}` }),
109
116
  });
@@ -467,6 +474,70 @@ export const DatabaseLive = Layer.scoped(
467
474
  },
468
475
  catch: (e) => new DatabaseError({ reason: String(e) }),
469
476
  }),
477
+
478
+ repair: () =>
479
+ Effect.tryPromise({
480
+ try: async () => {
481
+ // Count orphaned chunks (doc_id not in documents)
482
+ const orphanedChunksResult = await db.query(`
483
+ SELECT COUNT(*) as count FROM chunks c
484
+ WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = c.doc_id)
485
+ `);
486
+ const orphanedChunks = Number(
487
+ (orphanedChunksResult.rows[0] as { count: number }).count,
488
+ );
489
+
490
+ // Count orphaned embeddings (chunk_id not in chunks)
491
+ const orphanedEmbeddingsResult = await db.query(`
492
+ SELECT COUNT(*) as count FROM embeddings e
493
+ WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.id = e.chunk_id)
494
+ `);
495
+ const orphanedEmbeddings = Number(
496
+ (orphanedEmbeddingsResult.rows[0] as { count: number }).count,
497
+ );
498
+
499
+ // Count zero-dimension embeddings (vector_dims returns 0 or null)
500
+ // Note: In pgvector, we check for malformed vectors
501
+ const zeroVectorResult = await db.query(`
502
+ SELECT COUNT(*) as count FROM embeddings
503
+ WHERE embedding IS NULL OR vector_dims(embedding) = 0
504
+ `);
505
+ const zeroVectorEmbeddings = Number(
506
+ (zeroVectorResult.rows[0] as { count: number }).count,
507
+ );
508
+
509
+ // Delete orphaned embeddings first (depends on chunks)
510
+ if (orphanedEmbeddings > 0) {
511
+ await db.query(`
512
+ DELETE FROM embeddings e
513
+ WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.id = e.chunk_id)
514
+ `);
515
+ }
516
+
517
+ // Delete orphaned chunks (depends on documents)
518
+ if (orphanedChunks > 0) {
519
+ await db.query(`
520
+ DELETE FROM chunks c
521
+ WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = c.doc_id)
522
+ `);
523
+ }
524
+
525
+ // Delete zero-dimension embeddings
526
+ if (zeroVectorEmbeddings > 0) {
527
+ await db.query(`
528
+ DELETE FROM embeddings
529
+ WHERE embedding IS NULL OR vector_dims(embedding) = 0
530
+ `);
531
+ }
532
+
533
+ return {
534
+ orphanedChunks,
535
+ orphanedEmbeddings,
536
+ zeroVectorEmbeddings,
537
+ };
538
+ },
539
+ catch: (e) => new DatabaseError({ reason: String(e) }),
540
+ }),
470
541
  };
471
542
  }),
472
543
  );
@@ -0,0 +1,319 @@
1
+ /**
2
+ * PGlite Migration Service
3
+ *
4
+ * Handles migration from PGlite 0.2.x (PostgreSQL 16) to 0.3.x (PostgreSQL 17).
5
+ * Since automatic pg_dump migration isn't possible (0.2.x WASM crashes on current runtimes),
6
+ * provides detection and manual migration paths.
7
+ */
8
+
9
+ import { Effect, Context, Layer, Schema } from "effect";
10
+ import { existsSync, readFileSync } from "fs";
11
+ import { join } from "path";
12
+ import { PGlite } from "@electric-sql/pglite";
13
+ import { vector } from "@electric-sql/pglite/vector";
14
+
15
+ // ============================================================================
16
+ // Errors
17
+ // ============================================================================
18
+
19
+ export class MigrationError extends Schema.TaggedError<MigrationError>()(
20
+ "MigrationError",
21
+ { reason: Schema.String },
22
+ ) {}
23
+
24
+ // ============================================================================
25
+ // Service Definition
26
+ // ============================================================================
27
+
28
+ export class Migration extends Context.Tag("Migration")<
29
+ Migration,
30
+ {
31
+ /**
32
+ * Check if database at given path needs migration from PG16 to PG17.
33
+ * Returns true if migration is needed.
34
+ */
35
+ readonly checkMigrationNeeded: (
36
+ dbPath: string,
37
+ ) => Effect.Effect<boolean, MigrationError>;
38
+
39
+ /**
40
+ * Get helpful error message with migration options.
41
+ */
42
+ readonly getMigrationMessage: () => string;
43
+
44
+ /**
45
+ * Import SQL dump into fresh PG17 database.
46
+ */
47
+ readonly importFromDump: (
48
+ dumpFile: string,
49
+ dbPath: string,
50
+ ) => Effect.Effect<void, MigrationError>;
51
+
52
+ /**
53
+ * Generate shell script for manual pg_dump with old PGlite.
54
+ * User runs this with PGlite 0.2.x installed to export data.
55
+ */
56
+ readonly generateExportScript: (dbPath: string) => string;
57
+ }
58
+ >() {}
59
+
60
+ // ============================================================================
61
+ // Implementation
62
+ // ============================================================================
63
+
64
+ export const MigrationLive = Layer.succeed(
65
+ Migration,
66
+ Migration.of({
67
+ checkMigrationNeeded: (dbPath) =>
68
+ Effect.gen(function* () {
69
+ const pgDataDir = dbPath.replace(".db", "");
70
+
71
+ // If directory doesn't exist, no migration needed (fresh install)
72
+ if (!existsSync(pgDataDir)) {
73
+ return false;
74
+ }
75
+
76
+ // Check for PG_VERSION file
77
+ const versionFile = join(pgDataDir, "PG_VERSION");
78
+ if (!existsSync(versionFile)) {
79
+ // Directory exists but no version file - might be corrupted
80
+ // Attempt to detect by trying to open with PG17
81
+ return yield* Effect.tryPromise({
82
+ try: async () => {
83
+ try {
84
+ const testDb = await PGlite.create({
85
+ dataDir: pgDataDir,
86
+ extensions: { vector },
87
+ });
88
+ await testDb.close();
89
+ return false; // Successfully opened - no migration needed
90
+ } catch (e) {
91
+ // Failed to open - likely needs migration
92
+ return true;
93
+ }
94
+ },
95
+ catch: (e) =>
96
+ new MigrationError({
97
+ reason: `Failed to check database version: ${e}`,
98
+ }),
99
+ });
100
+ }
101
+
102
+ // Read PG_VERSION file
103
+ const version = yield* Effect.try({
104
+ try: () => readFileSync(versionFile, "utf8").trim(),
105
+ catch: (e) =>
106
+ new MigrationError({
107
+ reason: `Failed to read PG_VERSION: ${e}`,
108
+ }),
109
+ });
110
+
111
+ // If version is 16, migration is needed
112
+ return version === "16";
113
+ }),
114
+
115
+ getMigrationMessage: () => `
116
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
117
+ DATABASE MIGRATION REQUIRED
118
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
119
+
120
+ Your existing PDF library database uses PostgreSQL 16 (PGlite 0.2.x).
121
+ This version requires PostgreSQL 17 (PGlite 0.3.x).
122
+
123
+ Automatic migration is not possible because PGlite 0.2.x WASM crashes on
124
+ current Node.js runtimes. You have two options:
125
+
126
+ OPTION 1: Fresh Start (Easiest)
127
+ ────────────────────────────────
128
+ 1. Backup your database directory if desired
129
+ 2. Delete the database directory to start fresh
130
+ 3. Re-add your PDFs
131
+
132
+ OPTION 2: Manual Migration (Preserve Data)
133
+ ───────────────────────────────────────────
134
+ Migration scripts are provided in the repository.
135
+
136
+ 1. Export data using PGlite 0.2.x:
137
+ cd /path/to/pdf-library
138
+ npm install @electric-sql/pglite@0.2.12
139
+ node scripts/migration/export-pg16.mjs
140
+
141
+ 2. Import into PGlite 0.3.x:
142
+ bun run scripts/migration/import-pg17.ts
143
+
144
+ 3. Regenerate embeddings (if needed):
145
+ bun run scripts/migration/regenerate-embeddings.ts
146
+
147
+ For detailed instructions, see:
148
+ https://github.com/joelhooks/pdf-library#migration
149
+
150
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
151
+ `,
152
+
153
+ importFromDump: (dumpFile, dbPath) =>
154
+ Effect.gen(function* () {
155
+ // Verify dump file exists
156
+ if (!existsSync(dumpFile)) {
157
+ return yield* Effect.fail(
158
+ new MigrationError({
159
+ reason: `Dump file not found: ${dumpFile}`,
160
+ }),
161
+ );
162
+ }
163
+
164
+ // Read SQL dump
165
+ const sql = yield* Effect.try({
166
+ try: () => readFileSync(dumpFile, "utf8"),
167
+ catch: (e) =>
168
+ new MigrationError({
169
+ reason: `Failed to read dump file: ${e}`,
170
+ }),
171
+ });
172
+
173
+ // Create fresh PG17 database
174
+ const pgDataDir = dbPath.replace(".db", "");
175
+ const db = yield* Effect.tryPromise({
176
+ try: () =>
177
+ PGlite.create({
178
+ dataDir: pgDataDir,
179
+ extensions: { vector },
180
+ }),
181
+ catch: (e) =>
182
+ new MigrationError({
183
+ reason: `Failed to create new database: ${e}`,
184
+ }),
185
+ });
186
+
187
+ // Execute dump SQL
188
+ yield* Effect.tryPromise({
189
+ try: async () => {
190
+ await db.exec(sql);
191
+ await db.close();
192
+ },
193
+ catch: (e) =>
194
+ new MigrationError({
195
+ reason: `Failed to import dump: ${e}`,
196
+ }),
197
+ });
198
+ }),
199
+
200
+ generateExportScript: (dbPath) => {
201
+ const pgDataDir = dbPath.replace(".db", "");
202
+ const dumpFile = join(process.cwd(), "library-dump.sql");
203
+
204
+ // Generate a Node.js script that uses PGlite 0.2.x to export data
205
+ return `#!/usr/bin/env node
206
+ /**
207
+ * PDF Library Database Export Script
208
+ *
209
+ * This script exports your database using PGlite 0.2.x.
210
+ *
211
+ * Prerequisites:
212
+ * 1. Node.js 18+
213
+ * 2. Install dependencies:
214
+ * npm install @electric-sql/pglite@0.2.12
215
+ *
216
+ * Usage:
217
+ * node export.js
218
+ */
219
+
220
+ const { PGlite } = require('@electric-sql/pglite');
221
+ const { writeFileSync } = require('fs');
222
+
223
+ (async () => {
224
+ console.log('Opening database with PGlite 0.2.x...');
225
+
226
+ const db = new PGlite('${pgDataDir}');
227
+
228
+ console.log('Exporting documents table...');
229
+ const docs = await db.query('SELECT * FROM documents');
230
+
231
+ console.log('Exporting chunks table...');
232
+ const chunks = await db.query('SELECT * FROM chunks');
233
+
234
+ console.log('Exporting embeddings table...');
235
+ const embeddings = await db.query('SELECT * FROM embeddings');
236
+
237
+ // Generate SQL dump
238
+ let sql = '-- PDF Library Database Dump\\n';
239
+ sql += '-- Generated with PGlite 0.2.x\\n\\n';
240
+
241
+ sql += 'BEGIN;\\n\\n';
242
+
243
+ // Enable extensions
244
+ sql += 'CREATE EXTENSION IF NOT EXISTS vector;\\n\\n';
245
+
246
+ // Create tables
247
+ sql += \`CREATE TABLE IF NOT EXISTS documents (
248
+ id TEXT PRIMARY KEY,
249
+ title TEXT NOT NULL,
250
+ path TEXT NOT NULL UNIQUE,
251
+ added_at TIMESTAMPTZ NOT NULL,
252
+ page_count INTEGER NOT NULL,
253
+ size_bytes INTEGER NOT NULL,
254
+ tags JSONB DEFAULT '[]',
255
+ metadata JSONB DEFAULT '{}'
256
+ );\\n\\n\`;
257
+
258
+ sql += \`CREATE TABLE IF NOT EXISTS chunks (
259
+ id TEXT PRIMARY KEY,
260
+ doc_id TEXT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
261
+ page INTEGER NOT NULL,
262
+ chunk_index INTEGER NOT NULL,
263
+ content TEXT NOT NULL
264
+ );\\n\\n\`;
265
+
266
+ sql += \`CREATE TABLE IF NOT EXISTS embeddings (
267
+ chunk_id TEXT PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
268
+ embedding vector(1024) NOT NULL
269
+ );\\n\\n\`;
270
+
271
+ // Insert documents
272
+ for (const row of docs.rows) {
273
+ const values = [
274
+ row.id,
275
+ row.title.replace(/'/g, "''"),
276
+ row.path.replace(/'/g, "''"),
277
+ row.added_at,
278
+ row.page_count,
279
+ row.size_bytes,
280
+ JSON.stringify(row.tags),
281
+ JSON.stringify(row.metadata)
282
+ ];
283
+ sql += \`INSERT INTO documents (id, title, path, added_at, page_count, size_bytes, tags, metadata) VALUES ('\${values[0]}', '\${values[1]}', '\${values[2]}', '\${values[3]}', \${values[4]}, \${values[5]}, '\${values[6]}', '\${values[7]}');\\n\`;
284
+ }
285
+
286
+ sql += '\\n';
287
+
288
+ // Insert chunks
289
+ for (const row of chunks.rows) {
290
+ const content = row.content.replace(/'/g, "''");
291
+ sql += \`INSERT INTO chunks (id, doc_id, page, chunk_index, content) VALUES ('\${row.id}', '\${row.doc_id}', \${row.page}, \${row.chunk_index}, '\${content}');\\n\`;
292
+ }
293
+
294
+ sql += '\\n';
295
+
296
+ // Insert embeddings
297
+ for (const row of embeddings.rows) {
298
+ sql += \`INSERT INTO embeddings (chunk_id, embedding) VALUES ('\${row.chunk_id}', '\${row.embedding}');\\n\`;
299
+ }
300
+
301
+ sql += '\\nCOMMIT;\\n';
302
+
303
+ // Write dump file
304
+ writeFileSync('${dumpFile}', sql);
305
+
306
+ await db.close();
307
+
308
+ console.log(\`\\nExport complete! Dump saved to: ${dumpFile}\`);
309
+ console.log(\`\\nNext steps:\`);
310
+ console.log(\` 1. Upgrade to PGlite 0.3.x\`);
311
+ console.log(\` 2. Import dump: pdf-library migration import ${dumpFile}\`);
312
+ })().catch(err => {
313
+ console.error('Export failed:', err);
314
+ process.exit(1);
315
+ });
316
+ `;
317
+ },
318
+ }),
319
+ );