pdf-brain 1.1.0 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +177 -26
- package/package.json +29 -6
- package/scripts/bulk-ingest.ts +22 -0
- package/scripts/migration/backfill-document-concepts.ts +169 -0
- package/scripts/migration/backfill-summaries.ts +231 -0
- package/src/cli.test.ts +46 -0
- package/src/cli.ts +10 -2
- package/src/services/AutoTagger.ts +29 -1
- package/src/services/ClusterConceptMapper.test.ts +57 -0
- package/src/services/ClusterConceptMapper.ts +120 -0
- package/src/services/ClusterSummarizer.test.ts +198 -0
- package/src/services/ClusterSummarizer.ts +196 -0
- package/src/services/Clustering.test.ts +487 -0
- package/src/services/Clustering.ts +754 -0
- package/src/services/Database.ts +18 -0
- package/src/services/LibSQLDatabase.test.ts +559 -1
- package/src/services/LibSQLDatabase.ts +208 -5
- package/src/types.ts +4 -0
package/README.md
CHANGED
|
@@ -1,25 +1,28 @@
|
|
|
1
1
|
# pdf-brain
|
|
2
2
|
|
|
3
|
-
Local PDF & Markdown knowledge base with semantic search and AI-powered enrichment.
|
|
3
|
+
Local **PDF & Markdown** knowledge base with semantic search and AI-powered enrichment.
|
|
4
|
+
|
|
5
|
+
> **Works with PDFs AND Markdown files** - Index your research papers, books, notes, docs, and any `.md` files in one unified, searchable knowledge base.
|
|
4
6
|
|
|
5
7
|
```
|
|
6
8
|
┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
7
|
-
│
|
|
9
|
+
│ PDF / MD │────▶│ Ollama │────▶│ Ollama │────▶│ libSQL │
|
|
8
10
|
│ (extract) │ │ (LLM) │ │ (embeddings)│ │ (vectors) │
|
|
9
11
|
└─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘
|
|
10
12
|
│ │ │ │
|
|
11
13
|
pdf-parse llama3.2:3b mxbai-embed HNSW index
|
|
12
|
-
markdown
|
|
14
|
+
+ markdown enrichment 1024 dims cosine sim
|
|
13
15
|
```
|
|
14
16
|
|
|
15
17
|
## Features
|
|
16
18
|
|
|
19
|
+
- **PDF + Markdown** - Index `.pdf` and `.md` files with the same workflow
|
|
17
20
|
- **Local-first** - Everything runs on your machine, no API costs
|
|
18
21
|
- **AI enrichment** - LLM extracts titles, summaries, tags, and concepts
|
|
19
22
|
- **SKOS taxonomy** - Organize documents with hierarchical concepts
|
|
20
23
|
- **Vector search** - Semantic search via Ollama embeddings
|
|
21
24
|
- **Hybrid search** - Combine vector similarity with full-text search
|
|
22
|
-
- **
|
|
25
|
+
- **MCP server** - Use with Claude, Cursor, and other AI assistants
|
|
23
26
|
|
|
24
27
|
## Quick Start
|
|
25
28
|
|
|
@@ -105,16 +108,18 @@ pdf-brain init
|
|
|
105
108
|
# Add a PDF
|
|
106
109
|
pdf-brain add /path/to/document.pdf
|
|
107
110
|
|
|
108
|
-
# Add
|
|
111
|
+
# Add a Markdown file
|
|
112
|
+
pdf-brain add /path/to/notes.md
|
|
113
|
+
|
|
114
|
+
# Add from URL (PDF or MD)
|
|
109
115
|
pdf-brain add https://example.com/paper.pdf
|
|
116
|
+
pdf-brain add https://raw.githubusercontent.com/user/repo/main/README.md
|
|
110
117
|
|
|
111
118
|
# Add with manual tags
|
|
112
119
|
pdf-brain add document.pdf --tags "ai,agents,research"
|
|
113
120
|
|
|
114
121
|
# Add with AI enrichment (extracts title, summary, concepts)
|
|
115
122
|
pdf-brain add document.pdf --enrich
|
|
116
|
-
|
|
117
|
-
# Add Markdown file
|
|
118
123
|
pdf-brain add notes.md --enrich
|
|
119
124
|
```
|
|
120
125
|
|
|
@@ -185,11 +190,16 @@ pdf-brain taxonomy seed --file data/taxonomy.json
|
|
|
185
190
|
|
|
186
191
|
### Bulk Ingest
|
|
187
192
|
|
|
193
|
+
Recursively ingest directories containing PDFs and/or Markdown files:
|
|
194
|
+
|
|
188
195
|
```bash
|
|
189
196
|
# Ingest a directory with full LLM enrichment
|
|
190
197
|
pdf-brain ingest ~/Documents/papers --enrich
|
|
191
198
|
|
|
192
|
-
# Ingest
|
|
199
|
+
# Ingest your Obsidian vault or notes folder
|
|
200
|
+
pdf-brain ingest ~/Documents/obsidian --enrich
|
|
201
|
+
|
|
202
|
+
# Ingest multiple directories (PDFs, Markdown, mixed)
|
|
193
203
|
pdf-brain ingest ~/papers ~/books ~/notes --enrich
|
|
194
204
|
|
|
195
205
|
# With manual tags
|
|
@@ -205,6 +215,11 @@ pdf-brain ingest ~/papers --enrich --sample 10
|
|
|
205
215
|
pdf-brain ingest ~/papers --enrich --no-tui
|
|
206
216
|
```
|
|
207
217
|
|
|
218
|
+
**Supported formats:**
|
|
219
|
+
|
|
220
|
+
- `.pdf` - Research papers, books, documents
|
|
221
|
+
- `.md` - Notes, documentation, Obsidian vaults, READMEs
|
|
222
|
+
|
|
208
223
|
## Enrichment
|
|
209
224
|
|
|
210
225
|
When you add documents with `--enrich`, the LLM extracts:
|
|
@@ -222,16 +237,36 @@ When you add documents with `--enrich`, the LLM extracts:
|
|
|
222
237
|
|
|
223
238
|
### LLM Providers
|
|
224
239
|
|
|
225
|
-
Enrichment
|
|
240
|
+
Enrichment supports multiple providers via the config system:
|
|
226
241
|
|
|
227
242
|
```bash
|
|
228
|
-
#
|
|
229
|
-
pdf-brain
|
|
243
|
+
# Check current config
|
|
244
|
+
pdf-brain config show
|
|
245
|
+
|
|
246
|
+
# Use local Ollama (default)
|
|
247
|
+
pdf-brain config set enrichment.provider ollama
|
|
248
|
+
pdf-brain config set enrichment.model llama3.2:3b
|
|
249
|
+
|
|
250
|
+
# Use AI Gateway (Anthropic, OpenAI, etc.)
|
|
251
|
+
pdf-brain config set enrichment.provider gateway
|
|
252
|
+
pdf-brain config set enrichment.model anthropic/claude-haiku-4-5
|
|
253
|
+
export AI_GATEWAY_API_KEY=your-key
|
|
230
254
|
|
|
231
|
-
#
|
|
232
|
-
|
|
255
|
+
# Provider priority: CLI flag > config > auto-detect
|
|
256
|
+
pdf-brain add paper.pdf --enrich # uses config
|
|
257
|
+
pdf-brain add paper.pdf --enrich --provider ollama # override
|
|
233
258
|
```
|
|
234
259
|
|
|
260
|
+
### Enrichment Fallback
|
|
261
|
+
|
|
262
|
+
If LLM enrichment fails (API error, rate limit, malformed response), pdf-brain automatically falls back to heuristic-based enrichment:
|
|
263
|
+
|
|
264
|
+
- **Title**: Cleaned from filename
|
|
265
|
+
- **Tags**: Extracted from path, filename, and content keywords
|
|
266
|
+
- **Category**: Inferred from directory structure
|
|
267
|
+
|
|
268
|
+
The actual error is logged so you can debug provider issues.
|
|
269
|
+
|
|
235
270
|
## Taxonomy
|
|
236
271
|
|
|
237
272
|
The taxonomy is a hierarchical concept system for organizing documents. It ships with a starter taxonomy covering:
|
|
@@ -293,12 +328,78 @@ pdf-brain taxonomy seed --file my-taxonomy.json
|
|
|
293
328
|
|
|
294
329
|
## Configuration
|
|
295
330
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
331
|
+
### Config File
|
|
332
|
+
|
|
333
|
+
pdf-brain stores configuration in `$PDF_LIBRARY_PATH/config.json`:
|
|
334
|
+
|
|
335
|
+
```bash
|
|
336
|
+
# Show all config
|
|
337
|
+
pdf-brain config show
|
|
338
|
+
|
|
339
|
+
# Get a specific value
|
|
340
|
+
pdf-brain config get enrichment.provider
|
|
341
|
+
|
|
342
|
+
# Set a value
|
|
343
|
+
pdf-brain config set enrichment.model anthropic/claude-haiku-4-5
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
### Config Options
|
|
347
|
+
|
|
348
|
+
```json
|
|
349
|
+
{
|
|
350
|
+
"ollama": {
|
|
351
|
+
"host": "http://localhost:11434"
|
|
352
|
+
},
|
|
353
|
+
"embedding": {
|
|
354
|
+
"provider": "ollama",
|
|
355
|
+
"model": "mxbai-embed-large"
|
|
356
|
+
},
|
|
357
|
+
"enrichment": {
|
|
358
|
+
"provider": "gateway",
|
|
359
|
+
"model": "anthropic/claude-haiku-4-5"
|
|
360
|
+
},
|
|
361
|
+
"judge": {
|
|
362
|
+
"provider": "gateway",
|
|
363
|
+
"model": "anthropic/claude-haiku-4-5"
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
| Setting | Default | Description |
|
|
369
|
+
| --------------------- | ------------------------ | ------------------------------------ |
|
|
370
|
+
| `ollama.host` | `http://localhost:11434` | Ollama API endpoint |
|
|
371
|
+
| `embedding.provider` | `ollama` | Embedding provider (ollama only) |
|
|
372
|
+
| `embedding.model` | `mxbai-embed-large` | Embedding model (1024 dims) |
|
|
373
|
+
| `enrichment.provider` | `ollama` | LLM provider: `ollama` or `gateway` |
|
|
374
|
+
| `enrichment.model` | `llama3.2:3b` | Model for document enrichment |
|
|
375
|
+
| `judge.provider` | `ollama` | Provider for concept deduplication |
|
|
376
|
+
| `judge.model` | `llama3.2:3b` | Model for judging duplicate concepts |
|
|
377
|
+
|
|
378
|
+
### Environment Variables
|
|
379
|
+
|
|
380
|
+
| Variable | Default | Description |
|
|
381
|
+
| -------------------- | -------------------------- | ------------------------ |
|
|
382
|
+
| `PDF_LIBRARY_PATH` | `~/Documents/.pdf-library` | Library storage location |
|
|
383
|
+
| `OLLAMA_HOST` | `http://localhost:11434` | Ollama API endpoint |
|
|
384
|
+
| `AI_GATEWAY_API_KEY` | - | API key for AI Gateway |
|
|
385
|
+
|
|
386
|
+
### AI Gateway
|
|
387
|
+
|
|
388
|
+
For cloud LLM providers (Anthropic, OpenAI, etc.), use the AI Gateway:
|
|
389
|
+
|
|
390
|
+
```bash
|
|
391
|
+
# Set your API key
|
|
392
|
+
export AI_GATEWAY_API_KEY=your-key
|
|
393
|
+
|
|
394
|
+
# Configure to use gateway
|
|
395
|
+
pdf-brain config set enrichment.provider gateway
|
|
396
|
+
pdf-brain config set enrichment.model anthropic/claude-haiku-4-5
|
|
397
|
+
|
|
398
|
+
# Other supported models:
|
|
399
|
+
# - anthropic/claude-sonnet-4-20250514
|
|
400
|
+
# - openai/gpt-4o-mini
|
|
401
|
+
# - openai/gpt-4o
|
|
402
|
+
```
|
|
302
403
|
|
|
303
404
|
## Storage
|
|
304
405
|
|
|
@@ -310,6 +411,25 @@ pdf-brain taxonomy seed --file my-taxonomy.json
|
|
|
310
411
|
└── downloads/ # PDFs downloaded from URLs
|
|
311
412
|
```
|
|
312
413
|
|
|
414
|
+
### Database Size
|
|
415
|
+
|
|
416
|
+
The database can get **large** due to vector index overhead. For ~500k chunks:
|
|
417
|
+
|
|
418
|
+
| Component | Size | Notes |
|
|
419
|
+
| ------------ | ------ | --------------------------------- |
|
|
420
|
+
| Text content | ~180MB | Actual chunk text |
|
|
421
|
+
| Embeddings | ~1.9GB | 500k × 1024 dims × 4 bytes |
|
|
422
|
+
| Vector index | ~48GB | HNSW neighbor graphs (~100KB/row) |
|
|
423
|
+
| FTS index | ~200MB | Full-text search |
|
|
424
|
+
|
|
425
|
+
The `*_idx_shadow` tables store HNSW neighbor graphs for approximate nearest neighbor search. Each row averages ~100KB.
|
|
426
|
+
|
|
427
|
+
**libSQL quirk**: `SELECT COUNT(*) FROM embeddings` returns 0. Always count a specific column:
|
|
428
|
+
|
|
429
|
+
```sql
|
|
430
|
+
SELECT COUNT(chunk_id) FROM embeddings -- correct
|
|
431
|
+
```
|
|
432
|
+
|
|
313
433
|
## How It Works
|
|
314
434
|
|
|
315
435
|
1. **Extract** - PDF text via `pdf-parse`, Markdown parsed directly
|
|
@@ -334,13 +454,44 @@ pdf-brain ships as an MCP server for AI coding assistants:
|
|
|
334
454
|
}
|
|
335
455
|
```
|
|
336
456
|
|
|
337
|
-
Tools
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
457
|
+
### Document Tools
|
|
458
|
+
|
|
459
|
+
| Tool | Description |
|
|
460
|
+
| --------------------- | --------------------------------------------- |
|
|
461
|
+
| `pdf-brain_add` | Add PDF/Markdown to library (supports URLs) |
|
|
462
|
+
| `pdf-brain_batch_add` | Bulk ingest from directory |
|
|
463
|
+
| `pdf-brain_search` | Unified semantic search (docs + concepts) |
|
|
464
|
+
| `pdf-brain_list` | List documents, optionally filter by tag |
|
|
465
|
+
| `pdf-brain_read` | Get document details and metadata |
|
|
466
|
+
| `pdf-brain_remove` | Remove document from library |
|
|
467
|
+
| `pdf-brain_tag` | Set tags on a document |
|
|
468
|
+
| `pdf-brain_stats` | Library statistics (docs, chunks, embeddings) |
|
|
469
|
+
|
|
470
|
+
### Taxonomy Tools
|
|
471
|
+
|
|
472
|
+
| Tool | Description |
|
|
473
|
+
| --------------------------- | ---------------------------------------- |
|
|
474
|
+
| `pdf-brain_taxonomy_list` | List all concepts (optional tree format) |
|
|
475
|
+
| `pdf-brain_taxonomy_tree` | Visual concept tree with box-drawing |
|
|
476
|
+
| `pdf-brain_taxonomy_add` | Add new concept to taxonomy |
|
|
477
|
+
| `pdf-brain_taxonomy_assign` | Assign concept to document |
|
|
478
|
+
| `pdf-brain_taxonomy_search` | Search concepts by label |
|
|
479
|
+
| `pdf-brain_taxonomy_seed` | Load taxonomy from JSON file |
|
|
480
|
+
|
|
481
|
+
### Config Tools
|
|
482
|
+
|
|
483
|
+
| Tool | Description |
|
|
484
|
+
| ----------------------- | ------------------------- |
|
|
485
|
+
| `pdf-brain_config_show` | Display all config |
|
|
486
|
+
| `pdf-brain_config_get` | Get specific config value |
|
|
487
|
+
| `pdf-brain_config_set` | Set config value |
|
|
488
|
+
|
|
489
|
+
### Utility Tools
|
|
490
|
+
|
|
491
|
+
| Tool | Description |
|
|
492
|
+
| ------------------ | ----------------------------- |
|
|
493
|
+
| `pdf-brain_check` | Check if Ollama is ready |
|
|
494
|
+
| `pdf-brain_repair` | Fix database integrity issues |
|
|
344
495
|
|
|
345
496
|
## Troubleshooting
|
|
346
497
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-brain",
|
|
3
|
-
"version": "1.1.0",
|
|
4
|
-
"description": "Local PDF knowledge base with
|
|
3
|
+
"version": "1.1.3",
|
|
4
|
+
"description": "Local PDF & Markdown knowledge base with semantic search, AI enrichment, and SKOS taxonomy",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/index.ts",
|
|
7
7
|
"bin": {
|
|
@@ -15,7 +15,8 @@
|
|
|
15
15
|
"setup": "./scripts/setup.sh",
|
|
16
16
|
"changeset": "changeset",
|
|
17
17
|
"version": "changeset version",
|
|
18
|
-
"release": "bun test && bun run typecheck && changeset publish"
|
|
18
|
+
"release": "bun test && bun run typecheck && changeset publish",
|
|
19
|
+
"prepare": "husky"
|
|
19
20
|
},
|
|
20
21
|
"dependencies": {
|
|
21
22
|
"@ai-sdk/anthropic": "^2.0.56",
|
|
@@ -23,6 +24,7 @@
|
|
|
23
24
|
"@effect/platform": "^0.72.0",
|
|
24
25
|
"@effect/platform-bun": "^0.52.0",
|
|
25
26
|
"@effect/schema": "^0.75.0",
|
|
27
|
+
"@electric-sql/pglite": "^0.3.0",
|
|
26
28
|
"@libsql/client": "^0.15.15",
|
|
27
29
|
"ai": "^5.0.115",
|
|
28
30
|
"dotenv": "^17.2.3",
|
|
@@ -40,10 +42,14 @@
|
|
|
40
42
|
},
|
|
41
43
|
"devDependencies": {
|
|
42
44
|
"@changesets/cli": "^2.29.8",
|
|
45
|
+
"@secretlint/secretlint-rule-preset-recommend": "^11.2.5",
|
|
43
46
|
"@types/bun": "latest",
|
|
44
47
|
"@types/mdast": "^4.0.4",
|
|
45
48
|
"@types/react": "^19.2.7",
|
|
49
|
+
"husky": "^9.1.7",
|
|
50
|
+
"lint-staged": "^16.2.7",
|
|
46
51
|
"react-devtools-core": "^7.0.1",
|
|
52
|
+
"secretlint": "^11.2.5",
|
|
47
53
|
"typescript": "^5.7.2"
|
|
48
54
|
},
|
|
49
55
|
"peerDependencies": {
|
|
@@ -58,12 +64,29 @@
|
|
|
58
64
|
],
|
|
59
65
|
"keywords": [
|
|
60
66
|
"pdf",
|
|
67
|
+
"markdown",
|
|
68
|
+
"md",
|
|
69
|
+
"knowledge-base",
|
|
61
70
|
"vector-search",
|
|
71
|
+
"semantic-search",
|
|
62
72
|
"embeddings",
|
|
73
|
+
"rag",
|
|
74
|
+
"ollama",
|
|
75
|
+
"local-first",
|
|
76
|
+
"ai",
|
|
77
|
+
"llm",
|
|
78
|
+
"taxonomy",
|
|
79
|
+
"skos",
|
|
63
80
|
"libsql",
|
|
64
|
-
"
|
|
65
|
-
"effect-ts"
|
|
81
|
+
"mcp",
|
|
82
|
+
"effect-ts",
|
|
83
|
+
"document-management",
|
|
84
|
+
"personal-knowledge-base",
|
|
85
|
+
"pkm"
|
|
66
86
|
],
|
|
67
87
|
"author": "Joel Hooks",
|
|
68
|
-
"license": "MIT"
|
|
88
|
+
"license": "MIT",
|
|
89
|
+
"lint-staged": {
|
|
90
|
+
"*": "secretlint"
|
|
91
|
+
}
|
|
69
92
|
}
|
package/scripts/bulk-ingest.ts
CHANGED
|
@@ -283,6 +283,28 @@ ${
|
|
|
283
283
|
})
|
|
284
284
|
);
|
|
285
285
|
|
|
286
|
+
if (enrichResult._tag === "Left") {
|
|
287
|
+
// Check for fatal errors that should stop the entire batch
|
|
288
|
+
const errMsg = String(enrichResult.left.message || enrichResult.left);
|
|
289
|
+
const isFatalError =
|
|
290
|
+
errMsg.includes("Insufficient funds") ||
|
|
291
|
+
errMsg.includes("rate limit") ||
|
|
292
|
+
errMsg.includes("Rate limit") ||
|
|
293
|
+
errMsg.includes("quota exceeded") ||
|
|
294
|
+
errMsg.includes("API key");
|
|
295
|
+
|
|
296
|
+
if (isFatalError) {
|
|
297
|
+
console.error(`\n\n🛑 FATAL ERROR: ${errMsg}`);
|
|
298
|
+
console.error(
|
|
299
|
+
"Stopping batch import to avoid wasting resources.\n"
|
|
300
|
+
);
|
|
301
|
+
process.exit(1);
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
// Log the actual error so we can debug
|
|
305
|
+
console.log(` ⚠️ Enrichment failed: ${errMsg}`);
|
|
306
|
+
}
|
|
307
|
+
|
|
286
308
|
if (enrichResult._tag === "Right") {
|
|
287
309
|
const r = enrichResult.right;
|
|
288
310
|
title = r.title;
|
|
package/scripts/migration/backfill-document-concepts.ts
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Backfill document_concepts from existing tags
|
|
4
|
+
*
|
|
5
|
+
* Tags are stored as the leaf part of concept IDs (e.g., "instructional-design")
|
|
6
|
+
* Concepts are stored with category prefix (e.g., "education/instructional-design")
|
|
7
|
+
*
|
|
8
|
+
* This script:
|
|
9
|
+
* 1. Builds a mapping from normalized tag -> concept ID
|
|
10
|
+
* 2. For each document, matches its tags to concepts
|
|
11
|
+
* 3. Inserts into document_concepts join table
|
|
12
|
+
*
|
|
13
|
+
* Usage:
|
|
14
|
+
* bun run scripts/migration/backfill-document-concepts.ts
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { createClient } from "@libsql/client";
|
|
18
|
+
import { join } from "path";
|
|
19
|
+
|
|
20
|
+
/**
 * Build the libSQL connection URL for the library database.
 *
 * The library directory is `PDF_LIBRARY_PATH` when that variable is set and
 * non-empty; otherwise it is `$HOME/Documents/.pdf-library`. The database
 * file is `library.db` inside that directory.
 *
 * @param env - Environment to read from; defaults to `process.env`.
 * @returns A `file:` URL for the database file.
 * @throws Error when neither `PDF_LIBRARY_PATH` nor `HOME` is set, so the
 *   failure names the missing variables instead of surfacing as a TypeError
 *   from `join`.
 */
function resolveDbPath(
  env: Record<string, string | undefined> = process.env
): string {
  // `||` (not `??`) on purpose: an empty PDF_LIBRARY_PATH is treated as unset.
  const libraryDir =
    env.PDF_LIBRARY_PATH ||
    (env.HOME ? join(env.HOME, "Documents/.pdf-library") : undefined);

  if (!libraryDir) {
    throw new Error(
      "Cannot locate the pdf-brain library: set PDF_LIBRARY_PATH or HOME"
    );
  }

  return `file:${join(libraryDir, "library.db")}`;
}

const dbPath = resolveDbPath();
|
|
24
|
+
|
|
25
|
+
/**
 * Row shape read from the `concepts` table by this script
 * (`SELECT id, pref_label, alt_labels FROM concepts`).
 */
interface Concept {
  /** Concept ID; may carry a category prefix, e.g. "education/instructional-design". */
  id: string;
  /** Preferred label of the concept. */
  pref_label: string;
  /** JSON-encoded array of alternative labels; the script treats an empty value as "[]". */
  alt_labels: string;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Row shape read from the `documents` table by this script
 * (`SELECT id, title, tags FROM documents`).
 */
interface Document {
  /** Document ID, written to `document_concepts.doc_id`. */
  id: string;
  /** Document title. */
  title: string;
  /** JSON-encoded array of tag strings; the script treats an empty value as "[]". */
  tags: string;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Reduce a tag or label to a lowercase, hyphen-separated slug so that tags
 * and concept labels can be compared by exact match.
 *
 * @param s - Raw tag, label, or concept-ID leaf.
 * @returns The slug, e.g. "Instructional Design" -> "instructional-design".
 */
function normalizeTag(s: string): string {
  const lowered = s.toLowerCase();
  // Each run of characters outside [a-z0-9] collapses into one hyphen.
  const hyphenated = lowered.replace(/[^a-z0-9]+/g, "-");
  // Strip the single leading/trailing hyphen the previous step may leave.
  return hyphenated.replace(/^-|-$/g, "");
}
|
|
43
|
+
|
|
44
|
+
/**
 * Parse a text column that should hold a JSON array of strings.
 *
 * Empty or non-string values yield an empty array. Malformed JSON or a
 * non-array value is reported with a warning and also yields an empty array,
 * so one bad row does not abort the whole backfill. Non-string array entries
 * are dropped.
 *
 * @param raw - Column value as returned by the database.
 * @param what - Short description of the value, used in the warning.
 * @returns The string entries of the parsed array.
 */
function parseJsonStringArray(raw: unknown, what: string): string[] {
  if (typeof raw !== "string" || raw === "") return [];

  try {
    const parsed: unknown = JSON.parse(raw);
    if (Array.isArray(parsed)) {
      return parsed.filter((v): v is string => typeof v === "string");
    }
    console.warn(`  Skipping ${what}: expected a JSON array`);
  } catch {
    console.warn(`  Skipping ${what}: malformed JSON`);
  }
  return [];
}

/**
 * Backfill the `document_concepts` join table from each document's tags.
 *
 * Steps:
 *  1. Build a map from normalized tag -> concept ID using each concept's ID
 *     leaf, preferred label, and alternative labels. When two concepts
 *     normalize to the same key, the one read later wins.
 *  2. For every document with tags, look up each normalized tag and insert
 *     one row per matched concept (confidence 0.8, source "backfill").
 *     Existing (doc_id, concept_id) pairs are left untouched.
 *  3. Print totals and a sample of the resulting links.
 *
 * A failed insert is logged and counted, and the run continues. The database
 * client is closed even when a step throws.
 */
async function main() {
  console.log("=== Backfill document_concepts ===\n");
  console.log(`Database: ${dbPath}\n`);

  const client = createClient({ url: dbPath });

  try {
    // Step 1: Build tag -> concept mapping
    console.log("Building tag -> concept mapping...");

    const conceptsResult = await client.execute(
      "SELECT id, pref_label, alt_labels FROM concepts"
    );

    const tagToConcept = new Map<string, string>();
    let conceptCount = 0;

    for (const row of conceptsResult.rows) {
      const concept = row as unknown as Concept;
      conceptCount++;

      // Extract leaf from concept ID (e.g., "education/instructional-design" -> "instructional-design")
      const leaf = concept.id.includes("/")
        ? concept.id.split("/").pop()!
        : concept.id;

      tagToConcept.set(normalizeTag(leaf), concept.id);
      tagToConcept.set(normalizeTag(concept.pref_label), concept.id);

      const altLabels = parseJsonStringArray(
        concept.alt_labels,
        `alt_labels of concept ${concept.id}`
      );
      for (const alt of altLabels) {
        tagToConcept.set(normalizeTag(alt), concept.id);
      }
    }

    console.log(`  ${conceptCount} concepts loaded`);
    console.log(`  ${tagToConcept.size} tag mappings created\n`);

    // Step 2: Get all documents with tags
    console.log("Processing documents...");

    const docsResult = await client.execute(
      "SELECT id, title, tags FROM documents"
    );

    // Counts only documents that have at least one tag.
    let docsProcessed = 0;
    let linksCreated = 0;
    let linksFailed = 0;
    let docsWithConcepts = 0;

    for (const row of docsResult.rows) {
      const doc = row as unknown as Document;
      const tags = parseJsonStringArray(doc.tags, `tags of document ${doc.id}`);

      if (tags.length === 0) continue;

      // A Set so several tags mapping to one concept produce a single insert.
      const matchedConcepts = new Set<string>();

      for (const tag of tags) {
        const conceptId = tagToConcept.get(normalizeTag(tag));
        if (conceptId) {
          matchedConcepts.add(conceptId);
        }
      }

      if (matchedConcepts.size > 0) {
        docsWithConcepts++;

        for (const conceptId of matchedConcepts) {
          try {
            const result = await client.execute({
              sql: `INSERT INTO document_concepts (doc_id, concept_id, confidence, source)
                    VALUES (?, ?, ?, ?)
                    ON CONFLICT (doc_id, concept_id) DO NOTHING`,
              args: [doc.id, conceptId, 0.8, "backfill"],
            });
            // rowsAffected is 0 when the pair already existed, so only real
            // inserts are counted.
            linksCreated += result.rowsAffected;
          } catch (e) {
            // Keep going, but make the failure visible instead of hiding it.
            linksFailed++;
            const reason = e instanceof Error ? e.message : String(e);
            console.warn(
              `  Failed to link ${doc.id} -> ${conceptId}: ${reason}`
            );
          }
        }
      }

      docsProcessed++;
      if (docsProcessed % 100 === 0) {
        console.log(`  Processed ${docsProcessed} documents...`);
      }
    }

    // Step 3: Summary
    const finalCount = await client.execute(
      "SELECT COUNT(doc_id) as count FROM document_concepts"
    );
    const totalLinks = Number((finalCount.rows[0] as any).count || 0);

    console.log("\n=== Complete ===");
    console.log(`Documents processed: ${docsProcessed}`);
    console.log(`Documents with concepts: ${docsWithConcepts}`);
    console.log(`Links created: ${linksCreated}`);
    if (linksFailed > 0) {
      console.log(`Links failed: ${linksFailed}`);
    }
    console.log(`Total document_concepts: ${totalLinks}`);

    // Show sample
    console.log("\nSample links:");
    const sample = await client.execute(`
      SELECT d.title, c.pref_label
      FROM document_concepts dc
      JOIN documents d ON d.id = dc.doc_id
      JOIN concepts c ON c.id = dc.concept_id
      LIMIT 10
    `);

    for (const row of sample.rows) {
      console.log(`  "${row.title}" -> ${row.pref_label}`);
    }
  } finally {
    // Release the database handle on success and on failure alike.
    client.close();
  }
}

// Script entry point: report any failure and exit non-zero.
main().catch((e) => {
  console.error("Backfill failed:", e);
  process.exit(1);
});
|