pdf-brain 0.1.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -2
- package/scripts/migration/export-pg16.mjs +150 -0
- package/scripts/migration/import-pg17.ts +284 -0
- package/scripts/migration/regenerate-embeddings.ts +177 -0
- package/src/cli.ts +128 -11
- package/src/index.ts +6 -0
- package/src/services/Database.ts +78 -7
- package/src/services/Migration.ts +319 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-brain",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Local PDF knowledge base with vector search",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/index.ts",
|
|
@@ -18,7 +18,8 @@
|
|
|
18
18
|
"@effect/platform": "^0.72.0",
|
|
19
19
|
"@effect/platform-bun": "^0.52.0",
|
|
20
20
|
"@effect/schema": "^0.75.0",
|
|
21
|
-
"@electric-sql/pglite": "^0.
|
|
21
|
+
"@electric-sql/pglite": "^0.3.14",
|
|
22
|
+
"@electric-sql/pglite-tools": "^0.2.19",
|
|
22
23
|
"effect": "^3.12.0"
|
|
23
24
|
},
|
|
24
25
|
"devDependencies": {
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Export data from PGlite 0.2.x (PostgreSQL 16) database
|
|
4
|
+
*
|
|
5
|
+
* This script exports your existing database using PGlite 0.2.x before upgrading.
|
|
6
|
+
*
|
|
7
|
+
* Prerequisites:
|
|
8
|
+
* npm install @electric-sql/pglite@0.2.12
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* node export-pg16.mjs [db-path] [output-dir]
|
|
12
|
+
*
|
|
13
|
+
* Example:
|
|
14
|
+
* node export-pg16.mjs ~/.pdf-library/library ~/.pdf-library
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { PGlite } from '@electric-sql/pglite';
|
|
18
|
+
import { writeFileSync, appendFileSync, existsSync, unlinkSync, mkdirSync } from 'fs';
|
|
19
|
+
import { join, dirname } from 'path';
|
|
20
|
+
|
|
21
|
+
const args = process.argv.slice(2);
|
|
22
|
+
const dbPath = args[0] || join(process.env.HOME, 'Documents/.pdf-library/library');
|
|
23
|
+
const outputDir = args[1] || dirname(dbPath);
|
|
24
|
+
|
|
25
|
+
async function main() {
|
|
26
|
+
console.log('=== PGlite 0.2.x Database Export ===\n');
|
|
27
|
+
console.log(`Database: ${dbPath}`);
|
|
28
|
+
console.log(`Output: ${outputDir}\n`);
|
|
29
|
+
|
|
30
|
+
// Check if database exists
|
|
31
|
+
if (!existsSync(dbPath)) {
|
|
32
|
+
console.error(`Error: Database not found at ${dbPath}`);
|
|
33
|
+
process.exit(1);
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
// Check PG_VERSION
|
|
37
|
+
const versionFile = join(dbPath, 'PG_VERSION');
|
|
38
|
+
if (existsSync(versionFile)) {
|
|
39
|
+
const version = require('fs').readFileSync(versionFile, 'utf-8').trim();
|
|
40
|
+
console.log(`PostgreSQL version: ${version}`);
|
|
41
|
+
if (version !== '16') {
|
|
42
|
+
console.warn(`Warning: Expected PG version 16, found ${version}`);
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
console.log('\nOpening database with PGlite 0.2.x...');
|
|
47
|
+
|
|
48
|
+
let db;
|
|
49
|
+
try {
|
|
50
|
+
db = await PGlite.create({ dataDir: dbPath });
|
|
51
|
+
} catch (e) {
|
|
52
|
+
console.error(`Failed to open database: ${e.message}`);
|
|
53
|
+
console.error('\nMake sure you have PGlite 0.2.x installed:');
|
|
54
|
+
console.error(' npm install @electric-sql/pglite@0.2.12');
|
|
55
|
+
process.exit(1);
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
// Ensure output directory exists
|
|
59
|
+
if (!existsSync(outputDir)) {
|
|
60
|
+
mkdirSync(outputDir, { recursive: true });
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// Export documents
|
|
64
|
+
console.log('\nExporting documents...');
|
|
65
|
+
const docs = await db.query('SELECT * FROM documents');
|
|
66
|
+
console.log(` Found ${docs.rows.length} documents`);
|
|
67
|
+
|
|
68
|
+
const docsFile = join(outputDir, 'backup-documents.json');
|
|
69
|
+
writeFileSync(docsFile, JSON.stringify(docs.rows, null, 2));
|
|
70
|
+
console.log(` Saved to: ${docsFile}`);
|
|
71
|
+
|
|
72
|
+
// Count chunks
|
|
73
|
+
const chunkCount = await db.query('SELECT COUNT(*) as c FROM chunks');
|
|
74
|
+
const totalChunks = parseInt(chunkCount.rows[0].c);
|
|
75
|
+
console.log(`\nExporting ${totalChunks} chunks...`);
|
|
76
|
+
|
|
77
|
+
// Export chunks in batches
|
|
78
|
+
const chunksFile = join(outputDir, 'backup-chunks.jsonl');
|
|
79
|
+
if (existsSync(chunksFile)) unlinkSync(chunksFile);
|
|
80
|
+
|
|
81
|
+
const batchSize = 100;
|
|
82
|
+
let exported = 0;
|
|
83
|
+
|
|
84
|
+
for (let offset = 0; offset < totalChunks; offset += batchSize) {
|
|
85
|
+
const batch = await db.query(
|
|
86
|
+
`SELECT * FROM chunks ORDER BY id LIMIT ${batchSize} OFFSET ${offset}`
|
|
87
|
+
);
|
|
88
|
+
|
|
89
|
+
for (const row of batch.rows) {
|
|
90
|
+
appendFileSync(chunksFile, JSON.stringify(row) + '\n');
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
exported += batch.rows.length;
|
|
94
|
+
if (exported % 10000 === 0 || exported === totalChunks) {
|
|
95
|
+
console.log(` Progress: ${exported}/${totalChunks}`);
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
console.log(` Saved to: ${chunksFile}`);
|
|
99
|
+
|
|
100
|
+
// Try to export embeddings (may fail if vector extension issues)
|
|
101
|
+
console.log('\nExporting embeddings...');
|
|
102
|
+
try {
|
|
103
|
+
const embCount = await db.query('SELECT COUNT(*) as c FROM embeddings');
|
|
104
|
+
const totalEmb = parseInt(embCount.rows[0].c);
|
|
105
|
+
console.log(` Found ${totalEmb} embeddings`);
|
|
106
|
+
|
|
107
|
+
if (totalEmb > 0) {
|
|
108
|
+
const embFile = join(outputDir, 'backup-embeddings.jsonl');
|
|
109
|
+
if (existsSync(embFile)) unlinkSync(embFile);
|
|
110
|
+
|
|
111
|
+
let embExported = 0;
|
|
112
|
+
for (let offset = 0; offset < totalEmb; offset += batchSize) {
|
|
113
|
+
const batch = await db.query(
|
|
114
|
+
`SELECT chunk_id, embedding::text as embedding FROM embeddings ORDER BY chunk_id LIMIT ${batchSize} OFFSET ${offset}`
|
|
115
|
+
);
|
|
116
|
+
|
|
117
|
+
for (const row of batch.rows) {
|
|
118
|
+
appendFileSync(embFile, JSON.stringify(row) + '\n');
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
embExported += batch.rows.length;
|
|
122
|
+
if (embExported % 5000 === 0 || embExported === totalEmb) {
|
|
123
|
+
console.log(` Progress: ${embExported}/${totalEmb}`);
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
console.log(` Saved to: ${embFile}`);
|
|
127
|
+
}
|
|
128
|
+
} catch (e) {
|
|
129
|
+
console.log(` Skipped: ${e.message}`);
|
|
130
|
+
console.log(' (Embeddings can be regenerated after import)');
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
await db.close();
|
|
134
|
+
|
|
135
|
+
console.log('\n=== Export Complete ===');
|
|
136
|
+
console.log('\nBackup files:');
|
|
137
|
+
console.log(` - ${join(outputDir, 'backup-documents.json')}`);
|
|
138
|
+
console.log(` - ${join(outputDir, 'backup-chunks.jsonl')}`);
|
|
139
|
+
if (existsSync(join(outputDir, 'backup-embeddings.jsonl'))) {
|
|
140
|
+
console.log(` - ${join(outputDir, 'backup-embeddings.jsonl')}`);
|
|
141
|
+
}
|
|
142
|
+
console.log('\nNext steps:');
|
|
143
|
+
console.log(' 1. Upgrade to PGlite 0.3.x: bun install');
|
|
144
|
+
console.log(' 2. Import data: bun run scripts/migration/import-pg17.ts');
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
main().catch(e => {
|
|
148
|
+
console.error('Export failed:', e);
|
|
149
|
+
process.exit(1);
|
|
150
|
+
});
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Import backup data into PGlite 0.3.x (PostgreSQL 17) database
|
|
4
|
+
*
|
|
5
|
+
* This script imports data exported from PGlite 0.2.x.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* bun run scripts/migration/import-pg17.ts [backup-dir] [db-path]
|
|
9
|
+
*
|
|
10
|
+
* Example:
|
|
11
|
+
* bun run scripts/migration/import-pg17.ts ~/.pdf-library ~/.pdf-library/library
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { PGlite } from "@electric-sql/pglite";
|
|
15
|
+
import { vector } from "@electric-sql/pglite/vector";
|
|
16
|
+
import { readFileSync, createReadStream, existsSync } from "fs";
|
|
17
|
+
import { createInterface } from "readline";
|
|
18
|
+
import { join } from "path";
|
|
19
|
+
|
|
20
|
+
const EMBEDDING_DIM = 1024;
|
|
21
|
+
|
|
22
|
+
const args = process.argv.slice(2);
|
|
23
|
+
const backupDir = args[0] || join(process.env.HOME!, "Documents/.pdf-library");
|
|
24
|
+
const dbPath = args[1] || join(backupDir, "library");
|
|
25
|
+
|
|
26
|
+
async function main() {
|
|
27
|
+
console.log("=== PGlite 0.3.x Database Import ===\n");
|
|
28
|
+
console.log(`Backup dir: ${backupDir}`);
|
|
29
|
+
console.log(`Database: ${dbPath}\n`);
|
|
30
|
+
|
|
31
|
+
// Check backup files exist
|
|
32
|
+
const docsFile = join(backupDir, "backup-documents.json");
|
|
33
|
+
const chunksFile = join(backupDir, "backup-chunks.jsonl");
|
|
34
|
+
|
|
35
|
+
if (!existsSync(docsFile)) {
|
|
36
|
+
console.error(`Error: Documents backup not found: ${docsFile}`);
|
|
37
|
+
console.error("\nRun the export script first:");
|
|
38
|
+
console.error(" node scripts/migration/export-pg16.mjs");
|
|
39
|
+
process.exit(1);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
if (!existsSync(chunksFile)) {
|
|
43
|
+
console.error(`Error: Chunks backup not found: ${chunksFile}`);
|
|
44
|
+
process.exit(1);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// Check if database already exists
|
|
48
|
+
if (existsSync(dbPath)) {
|
|
49
|
+
const versionFile = join(dbPath, "PG_VERSION");
|
|
50
|
+
if (existsSync(versionFile)) {
|
|
51
|
+
const version = readFileSync(versionFile, "utf-8").trim();
|
|
52
|
+
if (version === "17") {
|
|
53
|
+
console.error("Error: PG17 database already exists at this location.");
|
|
54
|
+
console.error("Remove it first if you want to re-import:");
|
|
55
|
+
console.error(` rm -r "${dbPath}"`);
|
|
56
|
+
process.exit(1);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
console.log("Creating new PGlite 0.3.x database...");
|
|
62
|
+
const db = new PGlite(dbPath, { extensions: { vector } });
|
|
63
|
+
await db.waitReady;
|
|
64
|
+
|
|
65
|
+
console.log("Initializing schema...");
|
|
66
|
+
await db.exec("CREATE EXTENSION IF NOT EXISTS vector;");
|
|
67
|
+
|
|
68
|
+
await db.exec(`
|
|
69
|
+
CREATE TABLE IF NOT EXISTS documents (
|
|
70
|
+
id TEXT PRIMARY KEY,
|
|
71
|
+
title TEXT NOT NULL,
|
|
72
|
+
path TEXT NOT NULL UNIQUE,
|
|
73
|
+
added_at TIMESTAMPTZ NOT NULL,
|
|
74
|
+
page_count INTEGER NOT NULL,
|
|
75
|
+
size_bytes INTEGER NOT NULL,
|
|
76
|
+
tags JSONB DEFAULT '[]',
|
|
77
|
+
metadata JSONB DEFAULT '{}'
|
|
78
|
+
)
|
|
79
|
+
`);
|
|
80
|
+
|
|
81
|
+
await db.exec(`
|
|
82
|
+
CREATE TABLE IF NOT EXISTS chunks (
|
|
83
|
+
id TEXT PRIMARY KEY,
|
|
84
|
+
doc_id TEXT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
|
|
85
|
+
page INTEGER NOT NULL,
|
|
86
|
+
chunk_index INTEGER NOT NULL,
|
|
87
|
+
content TEXT NOT NULL
|
|
88
|
+
)
|
|
89
|
+
`);
|
|
90
|
+
|
|
91
|
+
await db.exec(`
|
|
92
|
+
CREATE TABLE IF NOT EXISTS embeddings (
|
|
93
|
+
chunk_id TEXT PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
|
|
94
|
+
embedding vector(${EMBEDDING_DIM}) NOT NULL
|
|
95
|
+
)
|
|
96
|
+
`);
|
|
97
|
+
|
|
98
|
+
// Create indexes
|
|
99
|
+
await db.exec(`
|
|
100
|
+
CREATE INDEX IF NOT EXISTS embeddings_hnsw_idx
|
|
101
|
+
ON embeddings USING hnsw (embedding vector_cosine_ops)
|
|
102
|
+
`);
|
|
103
|
+
await db.exec(`
|
|
104
|
+
CREATE INDEX IF NOT EXISTS chunks_content_idx
|
|
105
|
+
ON chunks USING gin (to_tsvector('english', content))
|
|
106
|
+
`);
|
|
107
|
+
await db.exec(`CREATE INDEX IF NOT EXISTS idx_chunks_doc ON chunks(doc_id)`);
|
|
108
|
+
await db.exec(`CREATE INDEX IF NOT EXISTS idx_docs_path ON documents(path)`);
|
|
109
|
+
|
|
110
|
+
console.log("Schema created!\n");
|
|
111
|
+
|
|
112
|
+
// Import documents
|
|
113
|
+
console.log("Importing documents...");
|
|
114
|
+
const docs = JSON.parse(readFileSync(docsFile, "utf-8"));
|
|
115
|
+
const docIds = new Set<string>();
|
|
116
|
+
|
|
117
|
+
for (const doc of docs) {
|
|
118
|
+
docIds.add(doc.id);
|
|
119
|
+
await db.query(
|
|
120
|
+
`INSERT INTO documents (id, title, path, added_at, page_count, size_bytes, tags, metadata)
|
|
121
|
+
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
|
122
|
+
ON CONFLICT (id) DO NOTHING`,
|
|
123
|
+
[
|
|
124
|
+
doc.id,
|
|
125
|
+
doc.title,
|
|
126
|
+
doc.path,
|
|
127
|
+
doc.added_at,
|
|
128
|
+
doc.page_count,
|
|
129
|
+
doc.size_bytes,
|
|
130
|
+
JSON.stringify(doc.tags || []),
|
|
131
|
+
JSON.stringify(doc.metadata || {}),
|
|
132
|
+
],
|
|
133
|
+
);
|
|
134
|
+
}
|
|
135
|
+
console.log(` Imported ${docs.length} documents`);
|
|
136
|
+
|
|
137
|
+
// Import chunks
|
|
138
|
+
console.log("\nImporting chunks...");
|
|
139
|
+
const rl = createInterface({
|
|
140
|
+
input: createReadStream(chunksFile),
|
|
141
|
+
crlfDelay: Infinity,
|
|
142
|
+
});
|
|
143
|
+
|
|
144
|
+
let chunkCount = 0;
|
|
145
|
+
let skipped = 0;
|
|
146
|
+
let batch: any[] = [];
|
|
147
|
+
const BATCH_SIZE = 100;
|
|
148
|
+
|
|
149
|
+
for await (const line of rl) {
|
|
150
|
+
if (!line.trim()) continue;
|
|
151
|
+
const chunk = JSON.parse(line);
|
|
152
|
+
|
|
153
|
+
// Skip orphaned chunks
|
|
154
|
+
if (!docIds.has(chunk.doc_id)) {
|
|
155
|
+
skipped++;
|
|
156
|
+
continue;
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
batch.push(chunk);
|
|
160
|
+
|
|
161
|
+
if (batch.length >= BATCH_SIZE) {
|
|
162
|
+
await db.exec("BEGIN");
|
|
163
|
+
for (const c of batch) {
|
|
164
|
+
await db.query(
|
|
165
|
+
`INSERT INTO chunks (id, doc_id, page, chunk_index, content)
|
|
166
|
+
VALUES ($1, $2, $3, $4, $5)
|
|
167
|
+
ON CONFLICT (id) DO NOTHING`,
|
|
168
|
+
[c.id, c.doc_id, c.page, c.chunk_index, c.content],
|
|
169
|
+
);
|
|
170
|
+
}
|
|
171
|
+
await db.exec("COMMIT");
|
|
172
|
+
chunkCount += batch.length;
|
|
173
|
+
batch = [];
|
|
174
|
+
|
|
175
|
+
if (chunkCount % 10000 === 0) {
|
|
176
|
+
console.log(` Progress: ${chunkCount} chunks...`);
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
// Final batch
|
|
182
|
+
if (batch.length > 0) {
|
|
183
|
+
await db.exec("BEGIN");
|
|
184
|
+
for (const c of batch) {
|
|
185
|
+
await db.query(
|
|
186
|
+
`INSERT INTO chunks (id, doc_id, page, chunk_index, content)
|
|
187
|
+
VALUES ($1, $2, $3, $4, $5)
|
|
188
|
+
ON CONFLICT (id) DO NOTHING`,
|
|
189
|
+
[c.id, c.doc_id, c.page, c.chunk_index, c.content],
|
|
190
|
+
);
|
|
191
|
+
}
|
|
192
|
+
await db.exec("COMMIT");
|
|
193
|
+
chunkCount += batch.length;
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
console.log(` Imported ${chunkCount} chunks`);
|
|
197
|
+
if (skipped > 0) {
|
|
198
|
+
console.log(` Skipped ${skipped} orphaned chunks`);
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
// Import embeddings if available
|
|
202
|
+
const embFile = join(backupDir, "backup-embeddings.jsonl");
|
|
203
|
+
if (existsSync(embFile)) {
|
|
204
|
+
console.log("\nImporting embeddings...");
|
|
205
|
+
const embRl = createInterface({
|
|
206
|
+
input: createReadStream(embFile),
|
|
207
|
+
crlfDelay: Infinity,
|
|
208
|
+
});
|
|
209
|
+
|
|
210
|
+
let embCount = 0;
|
|
211
|
+
let embBatch: any[] = [];
|
|
212
|
+
|
|
213
|
+
for await (const line of embRl) {
|
|
214
|
+
if (!line.trim()) continue;
|
|
215
|
+
const emb = JSON.parse(line);
|
|
216
|
+
embBatch.push(emb);
|
|
217
|
+
|
|
218
|
+
if (embBatch.length >= 50) {
|
|
219
|
+
await db.exec("BEGIN");
|
|
220
|
+
for (const e of embBatch) {
|
|
221
|
+
await db.query(
|
|
222
|
+
`INSERT INTO embeddings (chunk_id, embedding)
|
|
223
|
+
VALUES ($1, $2::vector)
|
|
224
|
+
ON CONFLICT (chunk_id) DO NOTHING`,
|
|
225
|
+
[e.chunk_id, e.embedding],
|
|
226
|
+
);
|
|
227
|
+
}
|
|
228
|
+
await db.exec("COMMIT");
|
|
229
|
+
embCount += embBatch.length;
|
|
230
|
+
embBatch = [];
|
|
231
|
+
|
|
232
|
+
if (embCount % 5000 === 0) {
|
|
233
|
+
console.log(` Progress: ${embCount} embeddings...`);
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
if (embBatch.length > 0) {
|
|
239
|
+
await db.exec("BEGIN");
|
|
240
|
+
for (const e of embBatch) {
|
|
241
|
+
await db.query(
|
|
242
|
+
`INSERT INTO embeddings (chunk_id, embedding)
|
|
243
|
+
VALUES ($1, $2::vector)
|
|
244
|
+
ON CONFLICT (chunk_id) DO NOTHING`,
|
|
245
|
+
[e.chunk_id, e.embedding],
|
|
246
|
+
);
|
|
247
|
+
}
|
|
248
|
+
await db.exec("COMMIT");
|
|
249
|
+
embCount += embBatch.length;
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
console.log(` Imported ${embCount} embeddings`);
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
// Verify
|
|
256
|
+
const docResult = await db.query<{ c: number }>(
|
|
257
|
+
"SELECT COUNT(*) as c FROM documents",
|
|
258
|
+
);
|
|
259
|
+
const chunkResult = await db.query<{ c: number }>(
|
|
260
|
+
"SELECT COUNT(*) as c FROM chunks",
|
|
261
|
+
);
|
|
262
|
+
const embResult = await db.query<{ c: number }>(
|
|
263
|
+
"SELECT COUNT(*) as c FROM embeddings",
|
|
264
|
+
);
|
|
265
|
+
|
|
266
|
+
console.log("\n=== Import Complete ===");
|
|
267
|
+
console.log(`Documents: ${docResult.rows[0]?.c}`);
|
|
268
|
+
console.log(`Chunks: ${chunkResult.rows[0]?.c}`);
|
|
269
|
+
console.log(`Embeddings: ${embResult.rows[0]?.c}`);
|
|
270
|
+
|
|
271
|
+
if (Number(embResult.rows[0]?.c) === 0) {
|
|
272
|
+
console.log("\nEmbeddings need to be regenerated.");
|
|
273
|
+
console.log("Run: bun run scripts/migration/regenerate-embeddings.ts");
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
console.log("\nVerify with: pdf-library stats");
|
|
277
|
+
|
|
278
|
+
await db.close();
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
main().catch((e) => {
|
|
282
|
+
console.error("Import failed:", e);
|
|
283
|
+
process.exit(1);
|
|
284
|
+
});
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Regenerate embeddings for chunks using Ollama
|
|
4
|
+
*
|
|
5
|
+
* This script generates embeddings for chunks that don't have them.
|
|
6
|
+
* Useful after migrating from PGlite 0.2.x where embeddings couldn't be exported.
|
|
7
|
+
*
|
|
8
|
+
* Prerequisites:
|
|
9
|
+
* - Ollama running: ollama serve
|
|
10
|
+
* - Embedding model: ollama pull mxbai-embed-large
|
|
11
|
+
*
|
|
12
|
+
* Usage:
|
|
13
|
+
* bun run scripts/migration/regenerate-embeddings.ts [db-path]
|
|
14
|
+
*
|
|
15
|
+
* Environment:
|
|
16
|
+
* OLLAMA_HOST - Ollama server URL (default: http://localhost:11434)
|
|
17
|
+
* OLLAMA_MODEL - Embedding model (default: mxbai-embed-large)
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { PGlite } from "@electric-sql/pglite";
|
|
21
|
+
import { vector } from "@electric-sql/pglite/vector";
|
|
22
|
+
import { join } from "path";
|
|
23
|
+
|
|
24
|
+
const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://localhost:11434";
|
|
25
|
+
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || "mxbai-embed-large";
|
|
26
|
+
const BATCH_SIZE = 5; // Process 5 chunks at a time (balance speed vs memory)
|
|
27
|
+
|
|
28
|
+
const args = process.argv.slice(2);
|
|
29
|
+
const dbPath =
|
|
30
|
+
args[0] || join(process.env.HOME!, "Documents/.pdf-library/library");
|
|
31
|
+
|
|
32
|
+
interface EmbeddingResponse {
|
|
33
|
+
embedding: number[];
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
async function getEmbedding(text: string): Promise<number[]> {
|
|
37
|
+
const response = await fetch(`${OLLAMA_HOST}/api/embeddings`, {
|
|
38
|
+
method: "POST",
|
|
39
|
+
headers: { "Content-Type": "application/json" },
|
|
40
|
+
body: JSON.stringify({ model: OLLAMA_MODEL, prompt: text }),
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
if (!response.ok) {
|
|
44
|
+
throw new Error(`Ollama error: ${response.status} ${response.statusText}`);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
const data = (await response.json()) as EmbeddingResponse;
|
|
48
|
+
return data.embedding;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
async function checkOllama(): Promise<boolean> {
|
|
52
|
+
try {
|
|
53
|
+
const response = await fetch(`${OLLAMA_HOST}/api/tags`);
|
|
54
|
+
if (!response.ok) return false;
|
|
55
|
+
|
|
56
|
+
// Test embedding generation
|
|
57
|
+
const testEmb = await getEmbedding("test");
|
|
58
|
+
console.log(
|
|
59
|
+
`Ollama ready! Model: ${OLLAMA_MODEL}, Dimension: ${testEmb.length}`,
|
|
60
|
+
);
|
|
61
|
+
return true;
|
|
62
|
+
} catch {
|
|
63
|
+
return false;
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
async function main() {
|
|
68
|
+
console.log("=== Embedding Regeneration ===\n");
|
|
69
|
+
console.log(`Database: ${dbPath}`);
|
|
70
|
+
console.log(`Ollama: ${OLLAMA_HOST}`);
|
|
71
|
+
console.log(`Model: ${OLLAMA_MODEL}\n`);
|
|
72
|
+
|
|
73
|
+
// Check Ollama
|
|
74
|
+
console.log("Checking Ollama...");
|
|
75
|
+
if (!(await checkOllama())) {
|
|
76
|
+
console.error("\nOllama not available. Make sure it's running:");
|
|
77
|
+
console.error(" ollama serve");
|
|
78
|
+
console.error(` ollama pull ${OLLAMA_MODEL}`);
|
|
79
|
+
process.exit(1);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
// Connect to database
|
|
83
|
+
console.log("\nConnecting to database...");
|
|
84
|
+
const db = new PGlite(dbPath, { extensions: { vector } });
|
|
85
|
+
await db.waitReady;
|
|
86
|
+
|
|
87
|
+
// Count chunks needing embeddings
|
|
88
|
+
const countResult = await db.query<{ c: string }>(`
|
|
89
|
+
SELECT COUNT(*) as c FROM chunks c
|
|
90
|
+
LEFT JOIN embeddings e ON e.chunk_id = c.id
|
|
91
|
+
WHERE e.chunk_id IS NULL
|
|
92
|
+
`);
|
|
93
|
+
const total = parseInt(countResult.rows[0]?.c || "0");
|
|
94
|
+
|
|
95
|
+
console.log(`Chunks needing embeddings: ${total}`);
|
|
96
|
+
|
|
97
|
+
if (total === 0) {
|
|
98
|
+
console.log("\nAll chunks have embeddings!");
|
|
99
|
+
await db.close();
|
|
100
|
+
return;
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
// Estimate time
|
|
104
|
+
const estimatedMinutes = Math.ceil(total / 15 / 60); // ~15 embeddings/sec
|
|
105
|
+
console.log(`Estimated time: ~${estimatedMinutes} minutes\n`);
|
|
106
|
+
|
|
107
|
+
let processed = 0;
|
|
108
|
+
let errors = 0;
|
|
109
|
+
const startTime = Date.now();
|
|
110
|
+
|
|
111
|
+
while (processed < total) {
|
|
112
|
+
// Get batch of chunks without embeddings
|
|
113
|
+
const batch = await db.query<{ id: string; content: string }>(`
|
|
114
|
+
SELECT c.id, c.content FROM chunks c
|
|
115
|
+
LEFT JOIN embeddings e ON e.chunk_id = c.id
|
|
116
|
+
WHERE e.chunk_id IS NULL
|
|
117
|
+
LIMIT ${BATCH_SIZE}
|
|
118
|
+
`);
|
|
119
|
+
|
|
120
|
+
if (batch.rows.length === 0) break;
|
|
121
|
+
|
|
122
|
+
// Generate embeddings for batch
|
|
123
|
+
for (const row of batch.rows) {
|
|
124
|
+
try {
|
|
125
|
+
const embedding = await getEmbedding(row.content);
|
|
126
|
+
const vectorStr = `[${embedding.join(",")}]`;
|
|
127
|
+
|
|
128
|
+
await db.query(
|
|
129
|
+
`INSERT INTO embeddings (chunk_id, embedding)
|
|
130
|
+
VALUES ($1, $2::vector)
|
|
131
|
+
ON CONFLICT (chunk_id) DO NOTHING`,
|
|
132
|
+
[row.id, vectorStr],
|
|
133
|
+
);
|
|
134
|
+
|
|
135
|
+
processed++;
|
|
136
|
+
} catch (e) {
|
|
137
|
+
errors++;
|
|
138
|
+
console.error(`Error on chunk ${row.id}: ${e}`);
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// Progress update every 100 chunks
|
|
143
|
+
if (processed % 100 === 0 || processed === total) {
|
|
144
|
+
const elapsed = (Date.now() - startTime) / 1000;
|
|
145
|
+
const rate = processed / elapsed;
|
|
146
|
+
const remaining = (total - processed) / rate;
|
|
147
|
+
const pct = ((processed / total) * 100).toFixed(1);
|
|
148
|
+
|
|
149
|
+
console.log(
|
|
150
|
+
`Progress: ${processed}/${total} (${pct}%) | ` +
|
|
151
|
+
`${rate.toFixed(1)}/s | ` +
|
|
152
|
+
`ETA: ${Math.ceil(remaining / 60)}min`,
|
|
153
|
+
);
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
// Final stats
|
|
158
|
+
const embResult = await db.query<{ c: string }>(
|
|
159
|
+
"SELECT COUNT(*) as c FROM embeddings",
|
|
160
|
+
);
|
|
161
|
+
const finalCount = parseInt(embResult.rows[0]?.c || "0");
|
|
162
|
+
|
|
163
|
+
console.log("\n=== Complete ===");
|
|
164
|
+
console.log(`Embeddings generated: ${processed}`);
|
|
165
|
+
console.log(`Total embeddings: ${finalCount}`);
|
|
166
|
+
console.log(`Errors: ${errors}`);
|
|
167
|
+
console.log(
|
|
168
|
+
`Time: ${((Date.now() - startTime) / 1000 / 60).toFixed(1)} minutes`,
|
|
169
|
+
);
|
|
170
|
+
|
|
171
|
+
await db.close();
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
main().catch((e) => {
|
|
175
|
+
console.error("Regeneration failed:", e);
|
|
176
|
+
process.exit(1);
|
|
177
|
+
});
|
package/src/cli.ts
CHANGED
|
@@ -14,6 +14,7 @@ import {
|
|
|
14
14
|
LibraryConfig,
|
|
15
15
|
URLFetchError,
|
|
16
16
|
} from "./index.js";
|
|
17
|
+
import { Migration, MigrationLive } from "./services/Migration.js";
|
|
17
18
|
|
|
18
19
|
/**
|
|
19
20
|
* Check if a string is a URL
|
|
@@ -84,6 +85,14 @@ Commands:
|
|
|
84
85
|
|
|
85
86
|
check Check if Ollama is ready
|
|
86
87
|
|
|
88
|
+
repair Fix database integrity issues
|
|
89
|
+
Removes orphaned chunks/embeddings
|
|
90
|
+
|
|
91
|
+
migrate Database migration utilities
|
|
92
|
+
--check Check if migration is needed
|
|
93
|
+
--import <file> Import from SQL dump file
|
|
94
|
+
--generate-script Generate export script for current database
|
|
95
|
+
|
|
87
96
|
Options:
|
|
88
97
|
--help, -h Show this help
|
|
89
98
|
|
|
@@ -91,6 +100,8 @@ Examples:
|
|
|
91
100
|
pdf-library add ./book.pdf --tags "programming,rust"
|
|
92
101
|
pdf-library add https://example.com/paper.pdf --title "Research Paper"
|
|
93
102
|
pdf-library search "machine learning" --limit 5
|
|
103
|
+
pdf-library migrate --check
|
|
104
|
+
pdf-library migrate --import backup.sql
|
|
94
105
|
`;
|
|
95
106
|
|
|
96
107
|
function parseArgs(args: string[]) {
|
|
@@ -314,6 +325,38 @@ const program = Effect.gen(function* () {
|
|
|
314
325
|
break;
|
|
315
326
|
}
|
|
316
327
|
|
|
328
|
+
case "repair": {
|
|
329
|
+
yield* Console.log("Checking database integrity...\n");
|
|
330
|
+
const result = yield* library.repair();
|
|
331
|
+
|
|
332
|
+
if (
|
|
333
|
+
result.orphanedChunks === 0 &&
|
|
334
|
+
result.orphanedEmbeddings === 0 &&
|
|
335
|
+
result.zeroVectorEmbeddings === 0
|
|
336
|
+
) {
|
|
337
|
+
yield* Console.log("✓ Database is healthy - no repairs needed");
|
|
338
|
+
} else {
|
|
339
|
+
yield* Console.log("Repairs completed:");
|
|
340
|
+
if (result.orphanedChunks > 0) {
|
|
341
|
+
yield* Console.log(
|
|
342
|
+
` • Removed ${result.orphanedChunks} orphaned chunks`,
|
|
343
|
+
);
|
|
344
|
+
}
|
|
345
|
+
if (result.orphanedEmbeddings > 0) {
|
|
346
|
+
yield* Console.log(
|
|
347
|
+
` • Removed ${result.orphanedEmbeddings} orphaned embeddings`,
|
|
348
|
+
);
|
|
349
|
+
}
|
|
350
|
+
if (result.zeroVectorEmbeddings > 0) {
|
|
351
|
+
yield* Console.log(
|
|
352
|
+
` • Removed ${result.zeroVectorEmbeddings} zero-dimension embeddings`,
|
|
353
|
+
);
|
|
354
|
+
}
|
|
355
|
+
yield* Console.log("\n✓ Database repaired");
|
|
356
|
+
}
|
|
357
|
+
break;
|
|
358
|
+
}
|
|
359
|
+
|
|
317
360
|
default:
|
|
318
361
|
yield* Console.error(`Unknown command: ${command}`);
|
|
319
362
|
yield* Console.log(HELP);
|
|
@@ -321,15 +364,89 @@ const program = Effect.gen(function* () {
|
|
|
321
364
|
}
|
|
322
365
|
});
|
|
323
366
|
|
|
324
|
-
//
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
367
|
+
// Handle migrate command separately (doesn't need full PDFLibrary)
|
|
368
|
+
const args = process.argv.slice(2);
|
|
369
|
+
|
|
370
|
+
if (args[0] === "migrate") {
|
|
371
|
+
const migrateProgram = Effect.gen(function* () {
|
|
372
|
+
const opts = parseArgs(args.slice(1));
|
|
373
|
+
const migration = yield* Migration;
|
|
374
|
+
const config = LibraryConfig.fromEnv();
|
|
375
|
+
const dbPath = config.dbPath.replace(".db", "");
|
|
376
|
+
|
|
377
|
+
if (opts.check) {
|
|
378
|
+
const needed = yield* migration.checkMigrationNeeded(dbPath);
|
|
379
|
+
if (needed) {
|
|
380
|
+
yield* Console.log(
|
|
381
|
+
"Migration needed:\n" + migration.getMigrationMessage(),
|
|
382
|
+
);
|
|
383
|
+
} else {
|
|
384
|
+
yield* Console.log("✓ No migration needed - database is compatible");
|
|
385
|
+
}
|
|
386
|
+
} else if (opts.import) {
|
|
387
|
+
yield* migration.importFromDump(opts.import as string, dbPath);
|
|
388
|
+
yield* Console.log("✓ Import complete");
|
|
389
|
+
} else if (opts["generate-script"]) {
|
|
390
|
+
yield* Console.log(migration.generateExportScript(dbPath));
|
|
391
|
+
} else {
|
|
392
|
+
// Default: check and show message
|
|
393
|
+
const needed = yield* migration.checkMigrationNeeded(dbPath);
|
|
394
|
+
if (needed) {
|
|
395
|
+
yield* Console.log(migration.getMigrationMessage());
|
|
396
|
+
} else {
|
|
397
|
+
yield* Console.log("✓ No migration needed - database is compatible");
|
|
398
|
+
}
|
|
399
|
+
}
|
|
400
|
+
});
|
|
401
|
+
|
|
402
|
+
Effect.runPromise(
|
|
403
|
+
migrateProgram.pipe(
|
|
404
|
+
Effect.provide(MigrationLive),
|
|
405
|
+
Effect.catchAll((error) =>
|
|
406
|
+
Effect.gen(function* () {
|
|
407
|
+
if (error._tag === "MigrationError") {
|
|
408
|
+
yield* Console.error(`Migration Error: ${error.message}`);
|
|
409
|
+
} else {
|
|
410
|
+
yield* Console.error(
|
|
411
|
+
`Error: ${error._tag}: ${JSON.stringify(error)}`,
|
|
412
|
+
);
|
|
413
|
+
}
|
|
414
|
+
process.exit(1);
|
|
415
|
+
}),
|
|
416
|
+
),
|
|
333
417
|
),
|
|
334
|
-
)
|
|
335
|
-
|
|
418
|
+
);
|
|
419
|
+
} else {
|
|
420
|
+
// Run with error handling
|
|
421
|
+
Effect.runPromise(
|
|
422
|
+
program.pipe(
|
|
423
|
+
Effect.provide(PDFLibraryLive),
|
|
424
|
+
Effect.catchAll((error) =>
|
|
425
|
+
Effect.gen(function* () {
|
|
426
|
+
// Check if it's a database initialization error
|
|
427
|
+
const errorStr = JSON.stringify(error);
|
|
428
|
+
if (
|
|
429
|
+
errorStr.includes("PGlite") ||
|
|
430
|
+
errorStr.includes("version") ||
|
|
431
|
+
errorStr.includes("incompatible")
|
|
432
|
+
) {
|
|
433
|
+
yield* Console.error(
|
|
434
|
+
`Database Error: ${error._tag}: ${JSON.stringify(error)}`,
|
|
435
|
+
);
|
|
436
|
+
yield* Console.error(
|
|
437
|
+
"\nThis may be a database version compatibility issue.",
|
|
438
|
+
);
|
|
439
|
+
yield* Console.error(
|
|
440
|
+
"Run 'pdf-library migrate --check' to diagnose.",
|
|
441
|
+
);
|
|
442
|
+
} else {
|
|
443
|
+
yield* Console.error(
|
|
444
|
+
`Error: ${error._tag}: ${JSON.stringify(error)}`,
|
|
445
|
+
);
|
|
446
|
+
}
|
|
447
|
+
process.exit(1);
|
|
448
|
+
}),
|
|
449
|
+
),
|
|
450
|
+
),
|
|
451
|
+
);
|
|
452
|
+
}
|
package/src/index.ts
CHANGED
|
@@ -289,6 +289,12 @@ export class PDFLibrary extends Effect.Service<PDFLibrary>()("PDFLibrary", {
|
|
|
289
289
|
libraryPath: config.libraryPath,
|
|
290
290
|
};
|
|
291
291
|
}),
|
|
292
|
+
|
|
293
|
+
/**
|
|
294
|
+
* Repair database integrity issues
|
|
295
|
+
* Removes orphaned chunks and embeddings
|
|
296
|
+
*/
|
|
297
|
+
repair: () => db.repair(),
|
|
292
298
|
};
|
|
293
299
|
}),
|
|
294
300
|
dependencies: [OllamaLive, PDFExtractorLive, DatabaseLive],
|
package/src/services/Database.ts
CHANGED
|
@@ -76,6 +76,16 @@ export class Database extends Context.Tag("Database")<
|
|
|
76
76
|
{ documents: number; chunks: number; embeddings: number },
|
|
77
77
|
DatabaseError
|
|
78
78
|
>;
|
|
79
|
+
|
|
80
|
+
// Maintenance
|
|
81
|
+
readonly repair: () => Effect.Effect<
|
|
82
|
+
{
|
|
83
|
+
orphanedChunks: number;
|
|
84
|
+
orphanedEmbeddings: number;
|
|
85
|
+
zeroVectorEmbeddings: number;
|
|
86
|
+
},
|
|
87
|
+
DatabaseError
|
|
88
|
+
>;
|
|
79
89
|
}
|
|
80
90
|
>() {}
|
|
81
91
|
|
|
@@ -97,13 +107,10 @@ export const DatabaseLive = Layer.scoped(
|
|
|
97
107
|
// PGlite stores data in a directory, not a single file
|
|
98
108
|
const pgDataDir = config.dbPath.replace(".db", "");
|
|
99
109
|
|
|
100
|
-
// Initialize PGlite with pgvector extension
|
|
101
|
-
const db =
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
dataDir: pgDataDir,
|
|
105
|
-
extensions: { vector },
|
|
106
|
-
}),
|
|
110
|
+
// Initialize PGlite with pgvector extension (0.3.x API)
|
|
111
|
+
const db = new PGlite(pgDataDir, { extensions: { vector } });
|
|
112
|
+
yield* Effect.tryPromise({
|
|
113
|
+
try: () => db.waitReady,
|
|
107
114
|
catch: (e) =>
|
|
108
115
|
new DatabaseError({ reason: `Failed to init PGlite: ${e}` }),
|
|
109
116
|
});
|
|
@@ -467,6 +474,70 @@ export const DatabaseLive = Layer.scoped(
|
|
|
467
474
|
},
|
|
468
475
|
catch: (e) => new DatabaseError({ reason: String(e) }),
|
|
469
476
|
}),
|
|
477
|
+
|
|
478
|
+
/**
 * Repair database integrity issues.
 *
 * Removes, in this order:
 *   1. chunks whose doc_id has no matching row in documents,
 *   2. embeddings whose chunk_id has no matching row in chunks,
 *   3. embeddings whose vector is NULL or reports zero dimensions.
 *
 * Chunks are purged first so that embeddings left without a chunk by step 1
 * are caught by step 2 in the same run instead of surviving until the next
 * repair. Each step counts its matches immediately before deleting them, so
 * the returned numbers describe the rows that step targeted.
 *
 * Resolves with the per-category counts; fails with DatabaseError when any
 * statement rejects.
 */
repair: () =>
  Effect.tryPromise({
    try: async () => {
      // Count the rows matched by a "FROM ... WHERE ..." clause and delete
      // them when there is at least one. Returns the count.
      const purge = async (fromWhere: string): Promise<number> => {
        const counted = await db.query(
          `SELECT COUNT(*) as count ${fromWhere}`,
        );
        // The driver may hand COUNT(*) back as a non-number; coerce it.
        const matched = Number(
          (counted.rows[0] as { count: number }).count,
        );
        if (matched > 0) {
          await db.query(`DELETE ${fromWhere}`);
        }
        return matched;
      };

      // Orphaned chunks (doc_id not in documents)
      const orphanedChunks = await purge(`FROM chunks c
        WHERE NOT EXISTS (SELECT 1 FROM documents d WHERE d.id = c.doc_id)`);

      // Orphaned embeddings (chunk_id not in chunks), including any that
      // lost their chunk in the step above
      const orphanedEmbeddings = await purge(`FROM embeddings e
        WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.id = e.chunk_id)`);

      // Missing or zero-dimension vectors
      const zeroVectorEmbeddings = await purge(`FROM embeddings
        WHERE embedding IS NULL OR vector_dims(embedding) = 0`);

      return {
        orphanedChunks,
        orphanedEmbeddings,
        zeroVectorEmbeddings,
      };
    },
    catch: (e) => new DatabaseError({ reason: String(e) }),
  }),
|
|
470
541
|
};
|
|
471
542
|
}),
|
|
472
543
|
);
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PGlite Migration Service
|
|
3
|
+
*
|
|
4
|
+
* Handles migration from PGlite 0.2.x (PostgreSQL 16) to 0.3.x (PostgreSQL 17).
|
|
5
|
+
* Since automatic pg_dump migration isn't possible (0.2.x WASM crashes on current runtimes),
|
|
6
|
+
* this service provides detection and manual migration paths instead.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { Effect, Context, Layer, Schema } from "effect";
|
|
10
|
+
import { existsSync, readFileSync } from "fs";
|
|
11
|
+
import { join } from "path";
|
|
12
|
+
import { PGlite } from "@electric-sql/pglite";
|
|
13
|
+
import { vector } from "@electric-sql/pglite/vector";
|
|
14
|
+
|
|
15
|
+
// ============================================================================
|
|
16
|
+
// Errors
|
|
17
|
+
// ============================================================================
|
|
18
|
+
|
|
19
|
+
/**
 * Tagged error raised by the migration service.
 * `reason` carries a human-readable description of what went wrong.
 */
export class MigrationError extends Schema.TaggedError<MigrationError>()("MigrationError", {
  reason: Schema.String,
}) {}
|
|
23
|
+
|
|
24
|
+
// ============================================================================
|
|
25
|
+
// Service Definition
|
|
26
|
+
// ============================================================================
|
|
27
|
+
|
|
28
|
+
/**
 * Context tag for the migration service: detection of databases that still
 * need the PG16 to PG17 migration, plus helpers for the manual migration path.
 */
export class Migration extends Context.Tag("Migration")<
  Migration,
  {
    /**
     * Decide whether the database at `dbPath` has to be migrated from
     * PG16 to PG17. Succeeds with `true` when migration is needed.
     */
    readonly checkMigrationNeeded: (
      dbPath: string,
    ) => Effect.Effect<boolean, MigrationError>;

    /**
     * Produce the user-facing help text listing the migration options.
     */
    readonly getMigrationMessage: () => string;

    /**
     * Load the SQL dump at `dumpFile` into a PG17 database at `dbPath`.
     */
    readonly importFromDump: (
      dumpFile: string,
      dbPath: string,
    ) => Effect.Effect<void, MigrationError>;

    /**
     * Produce the source of a script for a manual export with old PGlite.
     * The user runs it with PGlite 0.2.x installed to export their data.
     */
    readonly generateExportScript: (dbPath: string) => string;
  }
>() {}
|
|
59
|
+
|
|
60
|
+
// ============================================================================
|
|
61
|
+
// Implementation
|
|
62
|
+
// ============================================================================
|
|
63
|
+
|
|
64
|
+
export const MigrationLive = Layer.succeed(
|
|
65
|
+
Migration,
|
|
66
|
+
Migration.of({
|
|
67
|
+
/**
 * Report whether the database behind `dbPath` still needs the PG16 to PG17
 * migration.
 *
 * - data directory missing: `false` (fresh install)
 * - PG_VERSION present: `true` exactly when its trimmed content is "16"
 * - PG_VERSION missing: probe by opening the directory with the bundled
 *   PGlite; `true` when opening or closing it fails
 *
 * NOTE(review): `replace(".db", "")` removes the first ".db" found anywhere
 * in the path, not only a trailing extension. It is kept as-is so this check
 * resolves the same directory that DatabaseLive opens.
 */
checkMigrationNeeded: (dbPath) =>
  Effect.gen(function* () {
    const dataDir = dbPath.replace(".db", "");
    const versionPath = join(dataDir, "PG_VERSION");

    // Nothing on disk yet, so there is nothing to migrate.
    if (!existsSync(dataDir)) {
      return false;
    }

    if (existsSync(versionPath)) {
      const majorVersion = yield* Effect.try({
        try: () => readFileSync(versionPath, "utf8").trim(),
        catch: (e) =>
          new MigrationError({
            reason: `Failed to read PG_VERSION: ${e}`,
          }),
      });
      return majorVersion === "16";
    }

    // Directory without a version marker (possibly corrupted): treat
    // "cannot be opened by the current PGlite" as "needs migration".
    const opensWithCurrentPGlite = async (): Promise<boolean> => {
      try {
        const probe = await PGlite.create({
          dataDir,
          extensions: { vector },
        });
        await probe.close();
        return true;
      } catch {
        return false;
      }
    };

    return yield* Effect.tryPromise({
      try: async () => !(await opensWithCurrentPGlite()),
      catch: (e) =>
        new MigrationError({
          reason: `Failed to check database version: ${e}`,
        }),
    });
  }),
|
|
114
|
+
|
|
115
|
+
/**
 * Build the help text shown when a PostgreSQL 16 (PGlite 0.2.x) database is
 * detected. The text starts and ends with a newline and describes the two
 * options: starting fresh or migrating manually with the bundled scripts.
 */
getMigrationMessage: () =>
  [
    "",
    "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
    "DATABASE MIGRATION REQUIRED",
    "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
    "",
    "Your existing PDF library database uses PostgreSQL 16 (PGlite 0.2.x).",
    "This version requires PostgreSQL 17 (PGlite 0.3.x).",
    "",
    "Automatic migration is not possible because PGlite 0.2.x WASM crashes on",
    "current Node.js runtimes. You have two options:",
    "",
    "OPTION 1: Fresh Start (Easiest)",
    "────────────────────────────────",
    "1. Backup your database directory if desired",
    "2. Delete the database directory to start fresh",
    "3. Re-add your PDFs",
    "",
    "OPTION 2: Manual Migration (Preserve Data)",
    "───────────────────────────────────────────",
    "Migration scripts are provided in the repository.",
    "",
    "1. Export data using PGlite 0.2.x:",
    "cd /path/to/pdf-library",
    "npm install @electric-sql/pglite@0.2.12",
    "node scripts/migration/export-pg16.mjs",
    "",
    "2. Import into PGlite 0.3.x:",
    "bun run scripts/migration/import-pg17.ts",
    "",
    "3. Regenerate embeddings (if needed):",
    "bun run scripts/migration/regenerate-embeddings.ts",
    "",
    "For detailed instructions, see:",
    "https://github.com/joelhooks/pdf-library#migration",
    "",
    "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
    "",
  ].join("\n"),
|
|
152
|
+
|
|
153
|
+
/**
 * Import a SQL dump into the PGlite database directory derived from `dbPath`.
 *
 * Steps: verify the dump exists, read it as UTF-8, open the database with
 * the vector extension, execute the dump, then close the database.
 *
 * The database handle is closed on both the success and the failure path,
 * so a failed import does not leave the data directory held open.
 *
 * Fails with MigrationError when the dump is missing or unreadable, when
 * the database cannot be opened, or when executing the dump or closing the
 * database fails.
 *
 * NOTE(review): nothing here checks that the target directory is empty;
 * the dump is executed against whatever PGlite opens at that path.
 */
importFromDump: (dumpFile, dbPath) =>
  Effect.gen(function* () {
    // Verify dump file exists
    if (!existsSync(dumpFile)) {
      return yield* Effect.fail(
        new MigrationError({
          reason: `Dump file not found: ${dumpFile}`,
        }),
      );
    }

    // Read SQL dump
    const sql = yield* Effect.try({
      try: () => readFileSync(dumpFile, "utf8"),
      catch: (e) =>
        new MigrationError({
          reason: `Failed to read dump file: ${e}`,
        }),
    });

    // Open the target PG17 database
    const pgDataDir = dbPath.replace(".db", "");
    const db = yield* Effect.tryPromise({
      try: () =>
        PGlite.create({
          dataDir: pgDataDir,
          extensions: { vector },
        }),
      catch: (e) =>
        new MigrationError({
          reason: `Failed to create new database: ${e}`,
        }),
    });

    // Execute dump SQL
    yield* Effect.tryPromise({
      try: async () => {
        try {
          await db.exec(sql);
        } catch (importError) {
          // Release the handle before surfacing the failure. A secondary
          // close error is ignored so it cannot mask the import error.
          await db.close().catch(() => undefined);
          throw importError;
        }
        await db.close();
      },
      catch: (e) =>
        new MigrationError({
          reason: `Failed to import dump: ${e}`,
        }),
    });
  }),
|
|
199
|
+
|
|
200
|
+
/**
 * Build the source of a standalone Node.js (CommonJS) script that exports
 * the documents, chunks and embeddings tables to a SQL dump. The user runs
 * the script with PGlite 0.2.x installed.
 *
 * The dump is written to `library-dump.sql` in the working directory of the
 * process that generates the script (resolved here, not when the script runs).
 *
 * Hardening relative to naive string splicing:
 * - both paths are embedded as JSON-escaped string literals, so quotes,
 *   backslashes or backticks in a path cannot break the generated source;
 * - every text value written to the dump goes through one quoting helper
 *   that doubles embedded single quotes (ids and JSON columns included);
 * - Date values are written in ISO-8601 form rather than Date#toString().
 *
 * @param dbPath configured database path; the data directory is derived
 *               from it the same way the other members of this service do
 * @returns the script source as a string
 */
generateExportScript: (dbPath) => {
  const pgDataDir = dbPath.replace(".db", "");
  const dumpFile = join(process.cwd(), "library-dump.sql");

  // JSON.stringify yields a complete, escaped JS string literal.
  const dataDirLiteral = JSON.stringify(pgDataDir);
  const dumpFileLiteral = JSON.stringify(dumpFile);

  // Generate a Node.js script that uses PGlite 0.2.x to export data
  return `#!/usr/bin/env node
/**
 * PDF Library Database Export Script
 *
 * This script exports your database using PGlite 0.2.x.
 *
 * Prerequisites:
 * 1. Node.js 18+
 * 2. Install dependencies:
 * npm install @electric-sql/pglite@0.2.12
 *
 * Usage:
 * node export.js
 */

const { PGlite } = require('@electric-sql/pglite');
const { writeFileSync } = require('fs');

const DATA_DIR = ${dataDirLiteral};
const DUMP_FILE = ${dumpFileLiteral};

// Render a value as a single-quoted SQL literal, doubling embedded quotes.
const quote = (value) => "'" + String(value).replace(/'/g, "''") + "'";

// Dates are written in ISO-8601 form so PostgreSQL can parse them back.
const quoteTimestamp = (value) =>
  quote(value instanceof Date ? value.toISOString() : value);

(async () => {
  console.log('Opening database with PGlite 0.2.x...');

  const db = new PGlite(DATA_DIR);

  console.log('Exporting documents table...');
  const docs = await db.query('SELECT * FROM documents');

  console.log('Exporting chunks table...');
  const chunks = await db.query('SELECT * FROM chunks');

  console.log('Exporting embeddings table...');
  const embeddings = await db.query('SELECT * FROM embeddings');

  // Generate SQL dump
  let sql = '-- PDF Library Database Dump\\n';
  sql += '-- Generated with PGlite 0.2.x\\n\\n';

  sql += 'BEGIN;\\n\\n';

  // Enable extensions
  sql += 'CREATE EXTENSION IF NOT EXISTS vector;\\n\\n';

  // Create tables
  sql += \`CREATE TABLE IF NOT EXISTS documents (
  id TEXT PRIMARY KEY,
  title TEXT NOT NULL,
  path TEXT NOT NULL UNIQUE,
  added_at TIMESTAMPTZ NOT NULL,
  page_count INTEGER NOT NULL,
  size_bytes INTEGER NOT NULL,
  tags JSONB DEFAULT '[]',
  metadata JSONB DEFAULT '{}'
);\\n\\n\`;

  sql += \`CREATE TABLE IF NOT EXISTS chunks (
  id TEXT PRIMARY KEY,
  doc_id TEXT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
  page INTEGER NOT NULL,
  chunk_index INTEGER NOT NULL,
  content TEXT NOT NULL
);\\n\\n\`;

  sql += \`CREATE TABLE IF NOT EXISTS embeddings (
  chunk_id TEXT PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
  embedding vector(1024) NOT NULL
);\\n\\n\`;

  // Insert documents
  for (const row of docs.rows) {
    const values = [
      quote(row.id),
      quote(row.title),
      quote(row.path),
      quoteTimestamp(row.added_at),
      row.page_count,
      row.size_bytes,
      quote(JSON.stringify(row.tags)),
      quote(JSON.stringify(row.metadata)),
    ];
    sql += 'INSERT INTO documents (id, title, path, added_at, page_count, size_bytes, tags, metadata) VALUES (' + values.join(', ') + ');\\n';
  }

  sql += '\\n';

  // Insert chunks
  for (const row of chunks.rows) {
    const values = [
      quote(row.id),
      quote(row.doc_id),
      row.page,
      row.chunk_index,
      quote(row.content),
    ];
    sql += 'INSERT INTO chunks (id, doc_id, page, chunk_index, content) VALUES (' + values.join(', ') + ');\\n';
  }

  sql += '\\n';

  // Insert embeddings
  for (const row of embeddings.rows) {
    sql += 'INSERT INTO embeddings (chunk_id, embedding) VALUES (' + quote(row.chunk_id) + ', ' + quote(row.embedding) + ');\\n';
  }

  sql += '\\nCOMMIT;\\n';

  // Write dump file
  writeFileSync(DUMP_FILE, sql);

  await db.close();

  console.log('\\nExport complete! Dump saved to: ' + DUMP_FILE);
  console.log('\\nNext steps:');
  console.log(' 1. Upgrade to PGlite 0.3.x');
  console.log(' 2. Import dump: pdf-library migration import ' + DUMP_FILE);
})().catch(err => {
  console.error('Export failed:', err);
  process.exit(1);
});
`;
},
|
|
318
|
+
}),
|
|
319
|
+
);
|