pdf-brain 0.9.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pdf-brain",
3
- "version": "0.9.0",
3
+ "version": "1.0.0",
4
4
  "description": "Local PDF knowledge base with vector search",
5
5
  "type": "module",
6
6
  "main": "src/index.ts",
@@ -23,17 +23,13 @@
23
23
  "@effect/platform": "^0.72.0",
24
24
  "@effect/platform-bun": "^0.52.0",
25
25
  "@effect/schema": "^0.75.0",
26
- "@electric-sql/pglite": "^0.3.14",
27
- "@electric-sql/pglite-socket": "^0.0.19",
28
- "@electric-sql/pglite-tools": "^0.2.19",
29
- "@types/pg": "^8.16.0",
26
+ "@libsql/client": "^0.15.15",
30
27
  "ai": "^5.0.115",
31
28
  "effect": "^3.12.0",
32
29
  "gray-matter": "^4.0.3",
33
30
  "ink": "^6.5.1",
34
31
  "ink-spinner": "^5.0.0",
35
32
  "mdast-util-to-string": "^4.0.0",
36
- "pg": "^8.16.3",
37
33
  "react": "^19.2.3",
38
34
  "remark-frontmatter": "^5.0.0",
39
35
  "remark-gfm": "^4.0.1",
@@ -63,8 +59,7 @@
63
59
  "pdf",
64
60
  "vector-search",
65
61
  "embeddings",
66
- "pglite",
67
- "pgvector",
62
+ "libsql",
68
63
  "knowledge-base",
69
64
  "effect-ts"
70
65
  ],
package/scripts/migration/pglite-to-libsql.ts ADDED
@@ -0,0 +1,480 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * Migrate data from PGlite to LibSQL
4
+ *
5
+ * This script migrates an existing PGlite database to LibSQL format.
6
+ * Handles schema translation, vector format conversion (pgvector → F32_BLOB),
7
+ * and data migration.
8
+ *
9
+ * Prerequisites:
10
+ * - Existing PGlite database at specified path
11
+ * - Bun runtime (for @libsql/client)
12
+ *
13
+ * Usage:
14
+ * bun run scripts/migration/pglite-to-libsql.ts [pglite-db-path] [libsql-db-path]
15
+ *
16
+ * Example:
17
+ * bun run scripts/migration/pglite-to-libsql.ts \
18
+ * ~/Documents/.pdf-library/library \
19
+ * ~/Documents/.pdf-library/library-libsql.db
20
+ *
21
+ * Vector Format Conversion:
22
+ * - PGlite: vector(1024) column with '[1.2,3.4,...]' text format
23
+ * - LibSQL: F32_BLOB(1024) column with JSON.stringify([1.2,3.4,...])
24
+ */
25
+
26
+ import { PGlite } from "@electric-sql/pglite";
27
+ import { vector } from "@electric-sql/pglite/vector";
28
+ import { createClient } from "@libsql/client";
29
+ import { existsSync, mkdirSync } from "fs";
30
+ import { dirname } from "path";
31
+
32
+ // Embedding dimension for mxbai-embed-large
33
+ const EMBEDDING_DIM = 1024;
34
+
35
+ const args = process.argv.slice(2);
36
+ const pglitePath =
37
+ args[0] ||
38
+ `${process.env.HOME}/Documents/.pdf-library/library`.replace(".db", "");
39
+ const libsqlPath =
40
+ args[1] || `${process.env.HOME}/Documents/.pdf-library/library-libsql.db`;
41
+
42
+ interface MigrationStats {
43
+ documents: number;
44
+ chunks: number;
45
+ embeddings: number;
46
+ skippedEmbeddings: number;
47
+ errors: string[];
48
+ }
49
+
50
+ /**
51
+ * Parse pgvector text format to number array
52
+ * PGlite returns vectors as '[1.2,3.4,5.6,...]'
53
+ */
54
+ function parseVector(vectorText: string): number[] {
55
+ // Remove brackets and parse
56
+ const cleaned = vectorText.replace(/^\[|\]$/g, "");
57
+ return cleaned.split(",").map((v) => parseFloat(v.trim()));
58
+ }
59
+
60
+ /**
61
+ * Initialize LibSQL schema
62
+ */
63
+ async function initLibSQLSchema(client: ReturnType<typeof createClient>) {
64
+ console.log("Creating LibSQL schema...");
65
+
66
+ // Documents table
67
+ await client.execute(`
68
+ CREATE TABLE IF NOT EXISTS documents (
69
+ id TEXT PRIMARY KEY,
70
+ title TEXT NOT NULL,
71
+ path TEXT NOT NULL UNIQUE,
72
+ added_at TEXT NOT NULL,
73
+ page_count INTEGER NOT NULL,
74
+ size_bytes INTEGER NOT NULL,
75
+ tags TEXT DEFAULT '[]',
76
+ metadata TEXT DEFAULT '{}'
77
+ )
78
+ `);
79
+
80
+ // Chunks table
81
+ await client.execute(`
82
+ CREATE TABLE IF NOT EXISTS chunks (
83
+ id TEXT PRIMARY KEY,
84
+ doc_id TEXT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
85
+ page INTEGER NOT NULL,
86
+ chunk_index INTEGER NOT NULL,
87
+ content TEXT NOT NULL
88
+ )
89
+ `);
90
+
91
+ // Embeddings table with F32_BLOB
92
+ await client.execute(`
93
+ CREATE TABLE IF NOT EXISTS embeddings (
94
+ chunk_id TEXT PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
95
+ embedding F32_BLOB(${EMBEDDING_DIM}) NOT NULL
96
+ )
97
+ `);
98
+
99
+ // Create indexes
100
+ await client.execute(
101
+ `CREATE INDEX IF NOT EXISTS idx_chunks_doc ON chunks(doc_id)`
102
+ );
103
+ await client.execute(
104
+ `CREATE INDEX IF NOT EXISTS idx_docs_path ON documents(path)`
105
+ );
106
+
107
+ console.log("✓ Schema created");
108
+ }
109
+
110
+ /**
111
+ * Migrate documents from PGlite to LibSQL
112
+ */
113
+ async function migrateDocuments(
114
+ pgDb: PGlite,
115
+ libsqlClient: ReturnType<typeof createClient>,
116
+ stats: MigrationStats
117
+ ) {
118
+ console.log("\nMigrating documents...");
119
+
120
+ const result = await pgDb.query("SELECT * FROM documents ORDER BY id");
121
+ const docs = result.rows;
122
+
123
+ console.log(` Found ${docs.length} documents`);
124
+
125
+ for (const doc of docs) {
126
+ try {
127
+ const docRow = doc as {
128
+ id: string;
129
+ title: string;
130
+ path: string;
131
+ added_at: string;
132
+ page_count: number;
133
+ size_bytes: number;
134
+ tags: unknown;
135
+ metadata: unknown;
136
+ };
137
+
138
+ await libsqlClient.execute({
139
+ sql: `INSERT INTO documents (id, title, path, added_at, page_count, size_bytes, tags, metadata)
140
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
141
+ args: [
142
+ docRow.id,
143
+ docRow.title,
144
+ docRow.path,
145
+ docRow.added_at, // Already ISO string in PGlite
146
+ docRow.page_count,
147
+ docRow.size_bytes,
148
+ JSON.stringify(docRow.tags), // JSONB → TEXT
149
+ JSON.stringify(docRow.metadata || {}), // JSONB → TEXT
150
+ ],
151
+ });
152
+ stats.documents++;
153
+ } catch (e) {
154
+ const error = `Document ${(doc as { id: string }).id}: ${e}`;
155
+ stats.errors.push(error);
156
+ console.error(` ✗ ${error}`);
157
+ }
158
+ }
159
+
160
+ console.log(` ✓ Migrated ${stats.documents} documents`);
161
+ }
162
+
163
+ /**
164
+ * Migrate chunks from PGlite to LibSQL
165
+ */
166
+ async function migrateChunks(
167
+ pgDb: PGlite,
168
+ libsqlClient: ReturnType<typeof createClient>,
169
+ stats: MigrationStats
170
+ ) {
171
+ console.log("\nMigrating chunks...");
172
+
173
+ const countResult = await pgDb.query("SELECT COUNT(*) as c FROM chunks");
174
+ const totalChunks = parseInt((countResult.rows[0] as any).c);
175
+
176
+ console.log(` Found ${totalChunks} chunks`);
177
+
178
+ const batchSize = 100;
179
+ let migrated = 0;
180
+
181
+ for (let offset = 0; offset < totalChunks; offset += batchSize) {
182
+ const batch = await pgDb.query(
183
+ `SELECT * FROM chunks ORDER BY id LIMIT ${batchSize} OFFSET ${offset}`
184
+ );
185
+
186
+ // Use libSQL batch for transaction
187
+ const statements = batch.rows.map((chunk) => {
188
+ const chunkRow = chunk as {
189
+ id: string;
190
+ doc_id: string;
191
+ page: number;
192
+ chunk_index: number;
193
+ content: string;
194
+ };
195
+ return {
196
+ sql: "INSERT INTO chunks (id, doc_id, page, chunk_index, content) VALUES (?, ?, ?, ?, ?)",
197
+ args: [
198
+ chunkRow.id,
199
+ chunkRow.doc_id,
200
+ chunkRow.page,
201
+ chunkRow.chunk_index,
202
+ chunkRow.content,
203
+ ],
204
+ };
205
+ });
206
+
207
+ try {
208
+ await libsqlClient.batch(statements, "write");
209
+ migrated += batch.rows.length;
210
+ stats.chunks += batch.rows.length;
211
+ } catch (e) {
212
+ const error = `Chunk batch at offset ${offset}: ${e}`;
213
+ stats.errors.push(error);
214
+ console.error(` ✗ ${error}`);
215
+ }
216
+
217
+ if (migrated % 1000 === 0 || migrated === totalChunks) {
218
+ console.log(` Progress: ${migrated}/${totalChunks}`);
219
+ }
220
+ }
221
+
222
+ console.log(` ✓ Migrated ${stats.chunks} chunks`);
223
+ }
224
+
225
+ /**
226
+ * Migrate embeddings from PGlite to LibSQL
227
+ *
228
+ * KEY CONVERSION:
229
+ * - PGlite: vector(1024) → returns as '[1.2,3.4,...]' text
230
+ * - LibSQL: F32_BLOB(1024) → requires JSON.stringify([1.2,3.4,...])
231
+ */
232
+ async function migrateEmbeddings(
233
+ pgDb: PGlite,
234
+ libsqlClient: ReturnType<typeof createClient>,
235
+ stats: MigrationStats
236
+ ) {
237
+ console.log("\nMigrating embeddings...");
238
+
239
+ try {
240
+ const countResult = await pgDb.query(
241
+ "SELECT COUNT(*) as c FROM embeddings"
242
+ );
243
+ const totalEmb = parseInt((countResult.rows[0] as any).c);
244
+
245
+ console.log(` Found ${totalEmb} embeddings`);
246
+
247
+ const batchSize = 50; // Smaller batches for large embedding data
248
+ let migrated = 0;
249
+
250
+ for (let offset = 0; offset < totalEmb; offset += batchSize) {
251
+ const batch = await pgDb.query(
252
+ `SELECT chunk_id, embedding::text as embedding
253
+ FROM embeddings
254
+ ORDER BY chunk_id
255
+ LIMIT ${batchSize} OFFSET ${offset}`
256
+ );
257
+
258
+ const statements: Array<{ sql: string; args: [string, string] }> = [];
259
+
260
+ for (const row of batch.rows) {
261
+ try {
262
+ const embRow = row as { chunk_id: string; embedding: string };
263
+
264
+ // Parse pgvector text format → number array
265
+ const vectorArray = parseVector(embRow.embedding);
266
+
267
+ // Validate dimension
268
+ if (vectorArray.length !== EMBEDDING_DIM) {
269
+ stats.skippedEmbeddings++;
270
+ stats.errors.push(
271
+ `Embedding ${embRow.chunk_id}: wrong dimension ${vectorArray.length}, expected ${EMBEDDING_DIM}`
272
+ );
273
+ continue;
274
+ }
275
+
276
+ // Convert to LibSQL format: JSON.stringify for F32_BLOB
277
+ statements.push({
278
+ sql: "INSERT INTO embeddings (chunk_id, embedding) VALUES (?, vector(?))",
279
+ args: [embRow.chunk_id, JSON.stringify(vectorArray)],
280
+ });
281
+ } catch (e) {
282
+ const embRow = row as { chunk_id: string };
283
+ stats.skippedEmbeddings++;
284
+ stats.errors.push(`Embedding ${embRow.chunk_id}: ${e}`);
285
+ }
286
+ }
287
+
288
+ if (statements.length > 0) {
289
+ try {
290
+ await libsqlClient.batch(statements, "write");
291
+ migrated += statements.length;
292
+ stats.embeddings += statements.length;
293
+ } catch (e) {
294
+ const error = `Embedding batch at offset ${offset}: ${e}`;
295
+ stats.errors.push(error);
296
+ console.error(` ✗ ${error}`);
297
+ }
298
+ }
299
+
300
+ if (
301
+ migrated % 500 === 0 ||
302
+ migrated + stats.skippedEmbeddings >= totalEmb
303
+ ) {
304
+ console.log(
305
+ ` Progress: ${migrated}/${totalEmb} (${stats.skippedEmbeddings} skipped)`
306
+ );
307
+ }
308
+ }
309
+
310
+ console.log(` ✓ Migrated ${stats.embeddings} embeddings`);
311
+ if (stats.skippedEmbeddings > 0) {
312
+ console.log(
313
+ ` ⚠ Skipped ${stats.skippedEmbeddings} embeddings (see errors)`
314
+ );
315
+ }
316
+ } catch (e) {
317
+ console.error(` ✗ Failed to migrate embeddings: ${e}`);
318
+ console.log(" (Embeddings can be regenerated after migration)");
319
+ stats.errors.push(`Embeddings migration failed: ${e}`);
320
+ }
321
+ }
322
+
323
+ /**
324
+ * Verify migration succeeded
325
+ */
326
+ async function verifyMigration(
327
+ pgDb: PGlite,
328
+ libsqlClient: ReturnType<typeof createClient>,
329
+ stats: MigrationStats
330
+ ): Promise<boolean> {
331
+ console.log("\nVerifying migration...");
332
+
333
+ try {
334
+ // Check counts
335
+ const pgDocs = await pgDb.query("SELECT COUNT(*) as c FROM documents");
336
+ const pgChunks = await pgDb.query("SELECT COUNT(*) as c FROM chunks");
337
+ const pgEmb = await pgDb.query("SELECT COUNT(*) as c FROM embeddings");
338
+
339
+ const libsqlDocs = await libsqlClient.execute(
340
+ "SELECT COUNT(*) as c FROM documents"
341
+ );
342
+ const libsqlChunks = await libsqlClient.execute(
343
+ "SELECT COUNT(*) as c FROM chunks"
344
+ );
345
+ const libsqlEmb = await libsqlClient.execute(
346
+ "SELECT COUNT(*) as c FROM embeddings"
347
+ );
348
+
349
+ const pgDocsCount = parseInt((pgDocs.rows[0] as { c: string }).c);
350
+ const pgChunksCount = parseInt((pgChunks.rows[0] as { c: string }).c);
351
+ const pgEmbCount = parseInt((pgEmb.rows[0] as { c: string }).c);
352
+
353
+ const libsqlDocsCount = Number(
354
+ (libsqlDocs.rows[0] as unknown as { c: number }).c
355
+ );
356
+ const libsqlChunksCount = Number(
357
+ (libsqlChunks.rows[0] as unknown as { c: number }).c
358
+ );
359
+ const libsqlEmbCount = Number(
360
+ (libsqlEmb.rows[0] as unknown as { c: number }).c
361
+ );
362
+
363
+ console.log("\nComparison:");
364
+ console.log(` Documents: ${pgDocsCount} → ${libsqlDocsCount}`);
365
+ console.log(` Chunks: ${pgChunksCount} → ${libsqlChunksCount}`);
366
+ console.log(
367
+ ` Embeddings: ${pgEmbCount} → ${libsqlEmbCount} (${stats.skippedEmbeddings} skipped)`
368
+ );
369
+
370
+ const docsMatch = pgDocsCount === libsqlDocsCount;
371
+ const chunksMatch = pgChunksCount === libsqlChunksCount;
372
+ const embMatch = pgEmbCount === libsqlEmbCount + stats.skippedEmbeddings;
373
+
374
+ if (docsMatch && chunksMatch && embMatch) {
375
+ console.log("\n✓ Verification passed - all counts match!");
376
+ return true;
377
+ } else {
378
+ console.log("\n✗ Verification failed - count mismatch");
379
+ if (!docsMatch) console.log(" - Documents don't match");
380
+ if (!chunksMatch) console.log(" - Chunks don't match");
381
+ if (!embMatch) console.log(" - Embeddings don't match");
382
+ return false;
383
+ }
384
+ } catch (e) {
385
+ console.error(`\n✗ Verification error: ${e}`);
386
+ return false;
387
+ }
388
+ }
389
+
390
+ /**
391
+ * Main migration workflow
392
+ */
393
+ async function main() {
394
+ console.log("=== PGlite to LibSQL Migration ===\n");
395
+ console.log(`PGlite DB: ${pglitePath}`);
396
+ console.log(`LibSQL DB: ${libsqlPath}`);
397
+
398
+ // Check PGlite database exists
399
+ if (!existsSync(pglitePath)) {
400
+ console.error(`\nError: PGlite database not found at ${pglitePath}`);
401
+ console.error("Run with correct path or use default location.");
402
+ process.exit(1);
403
+ }
404
+
405
+ const stats: MigrationStats = {
406
+ documents: 0,
407
+ chunks: 0,
408
+ embeddings: 0,
409
+ skippedEmbeddings: 0,
410
+ errors: [],
411
+ };
412
+
413
+ // Ensure LibSQL directory exists
414
+ const libsqlDir = dirname(libsqlPath);
415
+ if (!existsSync(libsqlDir)) {
416
+ mkdirSync(libsqlDir, { recursive: true });
417
+ }
418
+
419
+ console.log("\nOpening PGlite database...");
420
+ const pgDb = new PGlite(pglitePath, { extensions: { vector } });
421
+ await pgDb.waitReady;
422
+ console.log("✓ PGlite ready");
423
+
424
+ console.log("\nOpening LibSQL database...");
425
+ const libsqlClient = createClient({
426
+ url: `file:${libsqlPath}`,
427
+ });
428
+ console.log("✓ LibSQL ready");
429
+
430
+ try {
431
+ // Initialize LibSQL schema
432
+ await initLibSQLSchema(libsqlClient);
433
+
434
+ // Migrate data
435
+ await migrateDocuments(pgDb, libsqlClient, stats);
436
+ await migrateChunks(pgDb, libsqlClient, stats);
437
+ await migrateEmbeddings(pgDb, libsqlClient, stats);
438
+
439
+ // Verify migration
440
+ const verified = await verifyMigration(pgDb, libsqlClient, stats);
441
+
442
+ // Summary
443
+ console.log("\n=== Migration Summary ===");
444
+ console.log(`Documents migrated: ${stats.documents}`);
445
+ console.log(`Chunks migrated: ${stats.chunks}`);
446
+ console.log(`Embeddings migrated: ${stats.embeddings}`);
447
+ if (stats.skippedEmbeddings > 0) {
448
+ console.log(`Embeddings skipped: ${stats.skippedEmbeddings}`);
449
+ }
450
+ console.log(`Errors: ${stats.errors.length}`);
451
+
452
+ if (stats.errors.length > 0) {
453
+ console.log("\nErrors:");
454
+ stats.errors.slice(0, 10).forEach((err) => console.log(` - ${err}`));
455
+ if (stats.errors.length > 10) {
456
+ console.log(` ... and ${stats.errors.length - 10} more`);
457
+ }
458
+ }
459
+
460
+ if (verified) {
461
+ console.log("\n✓ Migration completed successfully!");
462
+ console.log(`\nLibSQL database created at: ${libsqlPath}`);
463
+ process.exit(0);
464
+ } else {
465
+ console.log("\n⚠ Migration completed with verification warnings");
466
+ process.exit(1);
467
+ }
468
+ } catch (e) {
469
+ console.error(`\nMigration failed: ${e}`);
470
+ process.exit(1);
471
+ } finally {
472
+ await pgDb.close();
473
+ libsqlClient.close();
474
+ }
475
+ }
476
+
477
+ main().catch((e) => {
478
+ console.error("Fatal error:", e);
479
+ process.exit(1);
480
+ });