pdf-brain 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +354 -103
- package/package.json +2 -1
- package/scripts/analyze-tags.ts +171 -0
- package/scripts/benchmark-vector-search.ts +222 -0
- package/scripts/bulk-ingest.ts +447 -0
- package/scripts/migration/add-compression.ts +162 -0
- package/scripts/migration/add-concept-embeddings.ts +228 -0
- package/scripts/seed-taxonomy.ts +90 -0
- package/src/cli.ts +886 -92
- package/src/services/AutoTagger.test.ts +52 -0
- package/src/services/AutoTagger.ts +692 -157
- package/src/services/LibSQLDatabase.test.ts +229 -0
- package/src/services/LibSQLDatabase.ts +147 -26
- package/src/services/Ollama.ts +68 -15
- package/src/services/TaxonomyService.test.ts +433 -0
- package/src/services/TaxonomyService.ts +670 -0
- package/src/types.test.ts +205 -0
- package/src/types.ts +163 -15
package/README.md
CHANGED
|
@@ -1,159 +1,410 @@
|
|
|
1
|
-
#
|
|
1
|
+
# pdf-brain
|
|
2
2
|
|
|
3
|
-
Local PDF knowledge base with
|
|
3
|
+
Local PDF & Markdown knowledge base with semantic search and AI-powered enrichment.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
```
|
|
6
|
+
┌─────────────┐     ┌─────────────┐     ┌─────────────┐     ┌─────────────┐
|
|
7
|
+
│   PDF/MD    │────▶│   Ollama    │────▶│   Ollama    │────▶│   libSQL    │
|
|
8
|
+
│  (extract)  │     │    (LLM)    │     │ (embeddings)│     │  (vectors)  │
|
|
9
|
+
└─────────────┘     └─────────────┘     └─────────────┘     └─────────────┘
|
|
10
|
+
       │                   │                   │                   │
|
|
11
|
+
pdf-parse llama3.2:3b mxbai-embed HNSW index
|
|
12
|
+
markdown enrichment 1024 dims cosine sim
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
- **Local-first** - Everything runs on your machine, no API costs
|
|
18
|
+
- **AI enrichment** - LLM extracts titles, summaries, tags, and concepts
|
|
19
|
+
- **SKOS taxonomy** - Organize documents with hierarchical concepts
|
|
20
|
+
- **Vector search** - Semantic search via Ollama embeddings
|
|
21
|
+
- **Hybrid search** - Combine vector similarity with full-text search
|
|
22
|
+
- **Markdown support** - Index `.md` files alongside PDFs
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
6
25
|
|
|
7
26
|
```bash
|
|
8
|
-
|
|
27
|
+
# 1. Install
|
|
28
|
+
npm install -g pdf-brain
|
|
9
29
|
|
|
10
|
-
#
|
|
30
|
+
# 2. Install Ollama (macOS)
|
|
11
31
|
brew install ollama
|
|
32
|
+
|
|
33
|
+
# 3. Pull required models
|
|
34
|
+
ollama pull mxbai-embed-large # embeddings (required)
|
|
35
|
+
ollama pull llama3.2:3b # enrichment (optional but recommended)
|
|
36
|
+
|
|
37
|
+
# 4. Start Ollama
|
|
38
|
+
ollama serve
|
|
39
|
+
|
|
40
|
+
# 5. Initialize (creates DB + seeds starter taxonomy)
|
|
41
|
+
pdf-brain init
|
|
42
|
+
|
|
43
|
+
# 6. Add your first document
|
|
44
|
+
pdf-brain add ~/Documents/paper.pdf --enrich
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
### Prerequisites
|
|
50
|
+
|
|
51
|
+
**Ollama** is required for embeddings. The LLM model is optional but recommended for enrichment.
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
# macOS
|
|
55
|
+
brew install ollama
|
|
56
|
+
|
|
57
|
+
# Linux
|
|
58
|
+
curl -fsSL https://ollama.com/install.sh | sh
|
|
59
|
+
|
|
60
|
+
# Windows
|
|
61
|
+
# Download from https://ollama.com/download
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Models
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Required: Embedding model (1024 dimensions)
|
|
12
68
|
ollama pull mxbai-embed-large
|
|
69
|
+
|
|
70
|
+
# Recommended: Local LLM for enrichment
|
|
71
|
+
ollama pull llama3.2:3b
|
|
72
|
+
|
|
73
|
+
# Start Ollama server
|
|
74
|
+
ollama serve
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Install pdf-brain
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# npm
|
|
81
|
+
npm install -g pdf-brain
|
|
82
|
+
|
|
83
|
+
# or run directly
|
|
84
|
+
npx pdf-brain <command>
|
|
13
85
|
```
|
|
14
86
|
|
|
15
|
-
## CLI
|
|
87
|
+
## CLI Reference
|
|
88
|
+
|
|
89
|
+
### Basic Commands
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# Check Ollama status
|
|
93
|
+
pdf-brain check
|
|
94
|
+
|
|
95
|
+
# Show library stats
|
|
96
|
+
pdf-brain stats
|
|
97
|
+
|
|
98
|
+
# Initialize library (creates DB, seeds taxonomy)
|
|
99
|
+
pdf-brain init
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Adding Documents
|
|
16
103
|
|
|
17
104
|
```bash
|
|
18
105
|
# Add a PDF
|
|
19
|
-
|
|
106
|
+
pdf-brain add /path/to/document.pdf
|
|
20
107
|
|
|
21
108
|
# Add from URL
|
|
22
|
-
|
|
109
|
+
pdf-brain add https://example.com/paper.pdf
|
|
110
|
+
|
|
111
|
+
# Add with manual tags
|
|
112
|
+
pdf-brain add document.pdf --tags "ai,agents,research"
|
|
113
|
+
|
|
114
|
+
# Add with AI enrichment (extracts title, summary, concepts)
|
|
115
|
+
pdf-brain add document.pdf --enrich
|
|
116
|
+
|
|
117
|
+
# Add Markdown file
|
|
118
|
+
pdf-brain add notes.md --enrich
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Searching
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
# Semantic search (uses embeddings)
|
|
125
|
+
pdf-brain search "context engineering patterns"
|
|
126
|
+
|
|
127
|
+
# Full-text search only (faster, no embeddings)
|
|
128
|
+
pdf-brain search "context engineering" --fts
|
|
129
|
+
|
|
130
|
+
# Hybrid search (combines both)
|
|
131
|
+
pdf-brain search "machine learning" --hybrid
|
|
23
132
|
|
|
24
|
-
#
|
|
25
|
-
|
|
133
|
+
# Limit results
|
|
134
|
+
pdf-brain search "query" --limit 5
|
|
26
135
|
|
|
27
|
-
#
|
|
28
|
-
|
|
136
|
+
# Expand context around matches
|
|
137
|
+
pdf-brain search "query" --expand 500
|
|
138
|
+
```
|
|
29
139
|
|
|
30
|
-
|
|
31
|
-
npx pdf-brain search "context engineering" --fts
|
|
140
|
+
### Managing Documents
|
|
32
141
|
|
|
142
|
+
```bash
|
|
33
143
|
# List all documents
|
|
34
|
-
|
|
144
|
+
pdf-brain list
|
|
35
145
|
|
|
36
146
|
# List by tag
|
|
37
|
-
|
|
147
|
+
pdf-brain list --tag ai
|
|
38
148
|
|
|
39
149
|
# Get document details
|
|
40
|
-
|
|
150
|
+
pdf-brain read "document-title"
|
|
41
151
|
|
|
42
152
|
# Remove a document
|
|
43
|
-
|
|
153
|
+
pdf-brain remove "document-title"
|
|
44
154
|
|
|
45
155
|
# Update tags
|
|
46
|
-
|
|
156
|
+
pdf-brain tag "document-title" "new,tags,here"
|
|
157
|
+
```
|
|
47
158
|
|
|
48
|
-
|
|
49
|
-
npx pdf-brain stats
|
|
159
|
+
### Taxonomy Commands
|
|
50
160
|
|
|
51
|
-
|
|
52
|
-
|
|
161
|
+
The taxonomy system uses SKOS (Simple Knowledge Organization System) for hierarchical concept organization.
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
# List all concepts
|
|
165
|
+
pdf-brain taxonomy list
|
|
166
|
+
|
|
167
|
+
# Show concept tree
|
|
168
|
+
pdf-brain taxonomy tree
|
|
169
|
+
|
|
170
|
+
# Show subtree from a concept
|
|
171
|
+
pdf-brain taxonomy tree programming
|
|
172
|
+
|
|
173
|
+
# Search concepts
|
|
174
|
+
pdf-brain taxonomy search "machine learning"
|
|
175
|
+
|
|
176
|
+
# Add a new concept
|
|
177
|
+
pdf-brain taxonomy add ai/transformers --label "Transformers" --broader ai-ml
|
|
178
|
+
|
|
179
|
+
# Assign concept to document
|
|
180
|
+
pdf-brain taxonomy assign "doc-id" "programming/typescript"
|
|
181
|
+
|
|
182
|
+
# Seed taxonomy from JSON file
|
|
183
|
+
pdf-brain taxonomy seed --file data/taxonomy.json
|
|
53
184
|
```
|
|
54
185
|
|
|
55
|
-
|
|
186
|
+
### Bulk Ingest
|
|
56
187
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
-
|
|
60
|
-
- **iCloud sync** - Default storage in `~/Documents/.pdf-library/`
|
|
61
|
-
- **PGlite + pgvector** - Real Postgres vector search, no server needed
|
|
188
|
+
```bash
|
|
189
|
+
# Ingest a directory with full LLM enrichment
|
|
190
|
+
pdf-brain ingest ~/Documents/papers --enrich
|
|
62
191
|
|
|
63
|
-
|
|
192
|
+
# Ingest multiple directories
|
|
193
|
+
pdf-brain ingest ~/papers ~/books ~/notes --enrich
|
|
64
194
|
|
|
65
|
-
|
|
195
|
+
# With manual tags
|
|
196
|
+
pdf-brain ingest ~/books --tags "books,reference"
|
|
66
197
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
import { $ } from "bun";
|
|
198
|
+
# Auto-tag only (faster, heuristics + light LLM)
|
|
199
|
+
pdf-brain ingest ~/docs --auto-tag
|
|
70
200
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
return result.trim();
|
|
74
|
-
}
|
|
201
|
+
# Process only first N files (for testing)
|
|
202
|
+
pdf-brain ingest ~/papers --enrich --sample 10
|
|
75
203
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
args: {
|
|
79
|
-
path: tool.schema.string().describe("Path to PDF file or URL"),
|
|
80
|
-
tags: tool.schema.string().optional().describe("Comma-separated tags"),
|
|
81
|
-
},
|
|
82
|
-
async execute({ path, tags }) {
|
|
83
|
-
const args = ["add", path];
|
|
84
|
-
if (tags) args.push("--tags", tags);
|
|
85
|
-
return run(args);
|
|
86
|
-
},
|
|
87
|
-
});
|
|
88
|
-
|
|
89
|
-
export const search = tool({
|
|
90
|
-
description: "Semantic search across all PDFs",
|
|
91
|
-
args: {
|
|
92
|
-
query: tool.schema.string().describe("Natural language search query"),
|
|
93
|
-
limit: tool.schema.number().optional().describe("Max results"),
|
|
94
|
-
fts: tool.schema.boolean().optional().describe("Full-text search only"),
|
|
95
|
-
},
|
|
96
|
-
async execute({ query, limit, fts }) {
|
|
97
|
-
const args = ["search", query];
|
|
98
|
-
if (limit) args.push("--limit", String(limit));
|
|
99
|
-
if (fts) args.push("--fts");
|
|
100
|
-
return run(args);
|
|
101
|
-
},
|
|
102
|
-
});
|
|
103
|
-
|
|
104
|
-
export const list = tool({
|
|
105
|
-
description: "List all PDFs in the library",
|
|
106
|
-
args: { tag: tool.schema.string().optional() },
|
|
107
|
-
async execute({ tag }) {
|
|
108
|
-
const args = ["list"];
|
|
109
|
-
if (tag) args.push("--tag", tag);
|
|
110
|
-
return run(args);
|
|
111
|
-
},
|
|
112
|
-
});
|
|
113
|
-
|
|
114
|
-
export const stats = tool({
|
|
115
|
-
description: "Show library statistics",
|
|
116
|
-
args: {},
|
|
117
|
-
async execute() {
|
|
118
|
-
return run(["stats"]);
|
|
119
|
-
},
|
|
120
|
-
});
|
|
204
|
+
# Disable TUI for simple output
|
|
205
|
+
pdf-brain ingest ~/papers --enrich --no-tui
|
|
121
206
|
```
|
|
122
207
|
|
|
123
|
-
##
|
|
208
|
+
## Enrichment
|
|
124
209
|
|
|
125
|
-
|
|
126
|
-
| ------------------ | -------------------------- | ------------------------ |
|
|
127
|
-
| `PDF_LIBRARY_PATH` | `~/Documents/.pdf-library` | Library storage location |
|
|
128
|
-
| `OLLAMA_HOST` | `http://localhost:11434` | Ollama API endpoint |
|
|
129
|
-
| `OLLAMA_MODEL` | `mxbai-embed-large` | Embedding model |
|
|
210
|
+
When you add documents with `--enrich`, the LLM extracts:
|
|
130
211
|
|
|
131
|
-
|
|
212
|
+
| Field | Description |
|
|
213
|
+
| -------------------- | ------------------------------------------- |
|
|
214
|
+
| **title** | Clean, properly formatted title |
|
|
215
|
+
| **author** | Author name(s) if detectable |
|
|
216
|
+
| **summary** | 2-3 sentence summary |
|
|
217
|
+
| **documentType** | book, paper, tutorial, guide, article, etc. |
|
|
218
|
+
| **category** | Primary category |
|
|
219
|
+
| **tags** | 5-10 descriptive tags |
|
|
220
|
+
| **concepts** | Matched concepts from your taxonomy |
|
|
221
|
+
| **proposedConcepts** | New concepts the LLM suggests adding |
|
|
222
|
+
|
|
223
|
+
### LLM Providers
|
|
224
|
+
|
|
225
|
+
Enrichment tries local Ollama first, falls back to Anthropic if configured:
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
# Uses Ollama by default (llama3.2:3b)
|
|
229
|
+
pdf-brain add paper.pdf --enrich
|
|
230
|
+
|
|
231
|
+
# Set Anthropic API key for fallback
|
|
232
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Taxonomy
|
|
236
|
+
|
|
237
|
+
The taxonomy is a hierarchical concept system for organizing documents. It ships with a starter taxonomy covering:
|
|
238
|
+
|
|
239
|
+
- **Programming** - TypeScript, React, Next.js, Testing, Architecture, DevOps, AI/ML
|
|
240
|
+
- **Education** - Instructional Design, Learning Science, Course Creation, Assessment
|
|
241
|
+
- **Business** - Marketing, Copywriting, Bootstrapping, Product, Sales
|
|
242
|
+
- **Design** - UX, Visual Design, Systems Thinking, Information Architecture
|
|
243
|
+
- **Meta** - Productivity, Note-taking, Knowledge Management, Writing
|
|
244
|
+
|
|
245
|
+
### Growing Your Taxonomy
|
|
246
|
+
|
|
247
|
+
When enriching documents, the LLM may propose new concepts. These are saved for review:
|
|
248
|
+
|
|
249
|
+
```bash
|
|
250
|
+
# See proposed concepts from enrichment
|
|
251
|
+
pdf-brain taxonomy proposed
|
|
132
252
|
|
|
253
|
+
# Accept a specific concept
|
|
254
|
+
pdf-brain taxonomy accept ai/rag --broader ai-ml
|
|
255
|
+
|
|
256
|
+
# Accept all proposed concepts
|
|
257
|
+
pdf-brain taxonomy accept --all
|
|
258
|
+
|
|
259
|
+
# Reject a concept
|
|
260
|
+
pdf-brain taxonomy reject ai/rag
|
|
261
|
+
|
|
262
|
+
# Clear all proposals
|
|
263
|
+
pdf-brain taxonomy clear-proposed
|
|
264
|
+
|
|
265
|
+
# Manually add a concept
|
|
266
|
+
pdf-brain taxonomy add ai/rag --label "RAG" --broader ai-ml
|
|
267
|
+
|
|
268
|
+
# Or edit data/taxonomy.json and re-seed
|
|
269
|
+
pdf-brain taxonomy seed --file data/taxonomy.json
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### Custom Taxonomy
|
|
273
|
+
|
|
274
|
+
Create your own `taxonomy.json`:
|
|
275
|
+
|
|
276
|
+
```json
|
|
277
|
+
{
|
|
278
|
+
"concepts": [
|
|
279
|
+
{ "id": "cooking", "prefLabel": "Cooking" },
|
|
280
|
+
{ "id": "cooking/baking", "prefLabel": "Baking" },
|
|
281
|
+
{ "id": "cooking/grilling", "prefLabel": "Grilling" }
|
|
282
|
+
],
|
|
283
|
+
"hierarchy": [
|
|
284
|
+
{ "conceptId": "cooking/baking", "broaderId": "cooking" },
|
|
285
|
+
{ "conceptId": "cooking/grilling", "broaderId": "cooking" }
|
|
286
|
+
]
|
|
287
|
+
}
|
|
133
288
|
```
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
└─────────────┘     └─────────────┘     └─────────────┘
|
|
138
|
-
       │                   │                   │
|
|
139
|
-
pypdf mxbai-embed HNSW index
|
|
140
|
-
chunks 1024 dims cosine sim
|
|
289
|
+
|
|
290
|
+
```bash
|
|
291
|
+
pdf-brain taxonomy seed --file my-taxonomy.json
|
|
141
292
|
```
|
|
142
293
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
294
|
+
## Configuration
|
|
295
|
+
|
|
296
|
+
| Variable | Default | Description |
|
|
297
|
+
| ------------------- | -------------------------- | ------------------------ |
|
|
298
|
+
| `PDF_LIBRARY_PATH` | `~/Documents/.pdf-library` | Library storage location |
|
|
299
|
+
| `OLLAMA_HOST` | `http://localhost:11434` | Ollama API endpoint |
|
|
300
|
+
| `OLLAMA_MODEL` | `mxbai-embed-large` | Embedding model |
|
|
301
|
+
| `ANTHROPIC_API_KEY` | - | Fallback for enrichment |
|
|
148
302
|
|
|
149
303
|
## Storage
|
|
150
304
|
|
|
151
305
|
```
|
|
152
306
|
~/Documents/.pdf-library/
|
|
153
|
-
├── library.db #
|
|
307
|
+
├── library.db # libSQL database (vectors, FTS, metadata, taxonomy)
|
|
308
|
+
├── library.db-shm # Shared memory (WAL mode)
|
|
309
|
+
├── library.db-wal # Write-ahead log
|
|
154
310
|
└── downloads/ # PDFs downloaded from URLs
|
|
155
311
|
```
|
|
156
312
|
|
|
313
|
+
## How It Works
|
|
314
|
+
|
|
315
|
+
1. **Extract** - PDF text via `pdf-parse`, Markdown parsed directly
|
|
316
|
+
2. **Enrich** (optional) - LLM extracts metadata, matches taxonomy concepts
|
|
317
|
+
3. **Chunk** - Text split into ~512 token chunks with overlap
|
|
318
|
+
4. **Embed** - Each chunk embedded via Ollama (1024 dimensions)
|
|
319
|
+
5. **Store** - libSQL with vector index (HNSW) + FTS5
|
|
320
|
+
6. **Search** - Query embedded, compared via cosine similarity
|
|
321
|
+
|
|
322
|
+
## MCP Integration
|
|
323
|
+
|
|
324
|
+
pdf-brain ships as an MCP server for AI coding assistants:
|
|
325
|
+
|
|
326
|
+
```json
|
|
327
|
+
{
|
|
328
|
+
"mcpServers": {
|
|
329
|
+
"pdf-brain": {
|
|
330
|
+
"command": "npx",
|
|
331
|
+
"args": ["pdf-brain", "mcp"]
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
Tools exposed:
|
|
338
|
+
|
|
339
|
+
- `pdf-brain_add` - Add document to library
|
|
340
|
+
- `pdf-brain_search` - Semantic search
|
|
341
|
+
- `pdf-brain_list` - List documents
|
|
342
|
+
- `pdf-brain_read` - Get document details
|
|
343
|
+
- `pdf-brain_stats` - Library statistics
|
|
344
|
+
|
|
345
|
+
## Troubleshooting
|
|
346
|
+
|
|
347
|
+
### "Ollama not available"
|
|
348
|
+
|
|
349
|
+
```bash
|
|
350
|
+
# Check if Ollama is running
|
|
351
|
+
curl http://localhost:11434/api/tags
|
|
352
|
+
|
|
353
|
+
# Start Ollama
|
|
354
|
+
ollama serve
|
|
355
|
+
|
|
356
|
+
# Check models
|
|
357
|
+
ollama list
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
### "Model not found"
|
|
361
|
+
|
|
362
|
+
```bash
|
|
363
|
+
# Pull required models
|
|
364
|
+
ollama pull mxbai-embed-large
|
|
365
|
+
ollama pull llama3.2:3b
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
### "Database locked"
|
|
369
|
+
|
|
370
|
+
The database uses WAL mode. If you see lock errors:
|
|
371
|
+
|
|
372
|
+
```bash
|
|
373
|
+
# Check for zombie processes
|
|
374
|
+
lsof ~/Documents/.pdf-library/library.db*
|
|
375
|
+
|
|
376
|
+
# Force checkpoint
|
|
377
|
+
sqlite3 ~/Documents/.pdf-library/library.db "PRAGMA wal_checkpoint(TRUNCATE);"
|
|
378
|
+
```
|
|
379
|
+
|
|
380
|
+
### Slow enrichment
|
|
381
|
+
|
|
382
|
+
Enrichment is CPU-intensive. For large batches:
|
|
383
|
+
|
|
384
|
+
- Use `--auto-tag` instead of `--enrich` for faster processing
|
|
385
|
+
- Run overnight for large libraries
|
|
386
|
+
- Consider GPU acceleration for Ollama
|
|
387
|
+
|
|
388
|
+
## Development
|
|
389
|
+
|
|
390
|
+
```bash
|
|
391
|
+
# Clone
|
|
392
|
+
git clone https://github.com/joelhooks/pdf-brain
|
|
393
|
+
cd pdf-brain
|
|
394
|
+
|
|
395
|
+
# Install
|
|
396
|
+
bun install
|
|
397
|
+
|
|
398
|
+
# Run CLI
|
|
399
|
+
bun run src/cli.ts <command>
|
|
400
|
+
|
|
401
|
+
# Run tests
|
|
402
|
+
bun test
|
|
403
|
+
|
|
404
|
+
# Type check
|
|
405
|
+
bun run typecheck
|
|
406
|
+
```
|
|
407
|
+
|
|
157
408
|
## License
|
|
158
409
|
|
|
159
410
|
MIT
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-brain",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Local PDF knowledge base with vector search",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/index.ts",
|
|
@@ -25,6 +25,7 @@
|
|
|
25
25
|
"@effect/schema": "^0.75.0",
|
|
26
26
|
"@libsql/client": "^0.15.15",
|
|
27
27
|
"ai": "^5.0.115",
|
|
28
|
+
"dotenv": "^17.2.3",
|
|
28
29
|
"effect": "^3.12.0",
|
|
29
30
|
"gray-matter": "^4.0.3",
|
|
30
31
|
"ink": "^6.5.1",
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Analyze tag distribution in the library database
|
|
5
|
+
* Outputs statistics useful for designing partial index strategy
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { createClient, type ResultSet } from "@libsql/client";
|
|
9
|
+
import os from "node:os";
|
|
10
|
+
import path from "node:path";
|
|
11
|
+
|
|
12
|
+
const DB_PATH = path.join(
|
|
13
|
+
os.homedir(),
|
|
14
|
+
"Documents",
|
|
15
|
+
".pdf-library",
|
|
16
|
+
"library.db"
|
|
17
|
+
);
|
|
18
|
+
|
|
19
|
+
interface TagStats {
|
|
20
|
+
tag: string;
|
|
21
|
+
documentCount: number;
|
|
22
|
+
percentage: number;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
async function analyzeTagDistribution() {
|
|
26
|
+
const client = createClient({
|
|
27
|
+
url: `file:${DB_PATH}`,
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
try {
|
|
31
|
+
// Get total document count
|
|
32
|
+
const totalResult = await client.execute(
|
|
33
|
+
"SELECT COUNT(*) as count FROM documents"
|
|
34
|
+
);
|
|
35
|
+
const totalDocs = Number(
|
|
36
|
+
(totalResult.rows[0] as unknown as { count: number }).count
|
|
37
|
+
);
|
|
38
|
+
|
|
39
|
+
console.log(`\n📊 Database Statistics`);
|
|
40
|
+
console.log(`──────────────────────────────────────────────\n`);
|
|
41
|
+
console.log(`Total Documents: ${totalDocs}`);
|
|
42
|
+
|
|
43
|
+
// Get all documents with their tags
|
|
44
|
+
const docsResult = await client.execute("SELECT tags FROM documents");
|
|
45
|
+
const allTags = new Map<string, number>();
|
|
46
|
+
|
|
47
|
+
// Count tag occurrences
|
|
48
|
+
for (const row of docsResult.rows) {
|
|
49
|
+
const tags = JSON.parse(
|
|
50
|
+
(row as unknown as { tags: string }).tags
|
|
51
|
+
) as string[];
|
|
52
|
+
for (const tag of tags) {
|
|
53
|
+
allTags.set(tag, (allTags.get(tag) || 0) + 1);
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// Calculate statistics
|
|
58
|
+
const tagStats: TagStats[] = Array.from(allTags.entries())
|
|
59
|
+
.map(([tag, count]) => ({
|
|
60
|
+
tag,
|
|
61
|
+
documentCount: count,
|
|
62
|
+
percentage: (count / totalDocs) * 100,
|
|
63
|
+
}))
|
|
64
|
+
.sort((a, b) => b.documentCount - a.documentCount);
|
|
65
|
+
|
|
66
|
+
// Display tag distribution
|
|
67
|
+
console.log(`\nTotal Unique Tags: ${tagStats.length}`);
|
|
68
|
+
console.log(`\n📈 Tag Distribution (Top 20)\n`);
|
|
69
|
+
|
|
70
|
+
const top20 = tagStats.slice(0, 20);
|
|
71
|
+
for (const stat of top20) {
|
|
72
|
+
const bar = "β".repeat(Math.floor(stat.percentage / 2));
|
|
73
|
+
console.log(
|
|
74
|
+
`${stat.tag.padEnd(30)} ${stat.documentCount
|
|
75
|
+
.toString()
|
|
76
|
+
.padStart(5)} docs ${stat.percentage
|
|
77
|
+
.toFixed(1)
|
|
78
|
+
.padStart(5)}% ${bar}`
|
|
79
|
+
);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
// Analyze tag usage patterns
|
|
83
|
+
console.log(`\n\n📊 Tag Usage Patterns\n`);
|
|
84
|
+
|
|
85
|
+
const high = tagStats.filter((t) => t.percentage >= 20);
|
|
86
|
+
const medium = tagStats.filter(
|
|
87
|
+
(t) => t.percentage >= 5 && t.percentage < 20
|
|
88
|
+
);
|
|
89
|
+
const low = tagStats.filter((t) => t.percentage >= 1 && t.percentage < 5);
|
|
90
|
+
const rare = tagStats.filter((t) => t.percentage < 1);
|
|
91
|
+
|
|
92
|
+
console.log(`High Usage (β₯20%): ${high.length} tags`);
|
|
93
|
+
if (high.length > 0) {
|
|
94
|
+
console.log(` ${high.map((t) => t.tag).join(", ")}`);
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
console.log(`\nMedium Usage (5-20%): ${medium.length} tags`);
|
|
98
|
+
if (medium.length > 0) {
|
|
99
|
+
console.log(` ${medium.map((t) => t.tag).join(", ")}`);
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
console.log(`\nLow Usage (1-5%): ${low.length} tags`);
|
|
103
|
+
if (low.length > 0) {
|
|
104
|
+
console.log(` ${low.map((t) => t.tag).join(", ")}`);
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
console.log(`\nRare Usage (<1%): ${rare.length} tags`);
|
|
108
|
+
|
|
109
|
+
// Documents without tags
|
|
110
|
+
const noTagsResult = await client.execute(
|
|
111
|
+
"SELECT COUNT(*) as count FROM documents WHERE tags = '[]'"
|
|
112
|
+
);
|
|
113
|
+
const noTags = Number(
|
|
114
|
+
(noTagsResult.rows[0] as unknown as { count: number }).count
|
|
115
|
+
);
|
|
116
|
+
console.log(
|
|
117
|
+
`\nDocuments with no tags: ${noTags} (${(
|
|
118
|
+
(noTags / totalDocs) *
|
|
119
|
+
100
|
|
120
|
+
).toFixed(1)}%)`
|
|
121
|
+
);
|
|
122
|
+
|
|
123
|
+
// Multiple tags analysis
|
|
124
|
+
const multiTagDocs = docsResult.rows.filter(
|
|
125
|
+
(row) => JSON.parse((row as unknown as { tags: string }).tags).length > 1
|
|
126
|
+
);
|
|
127
|
+
console.log(
|
|
128
|
+
`Documents with 2+ tags: ${multiTagDocs.length} (${(
|
|
129
|
+
(multiTagDocs.length / totalDocs) *
|
|
130
|
+
100
|
|
131
|
+
).toFixed(1)}%)`
|
|
132
|
+
);
|
|
133
|
+
|
|
134
|
+
// Co-occurrence analysis (top 5 tag pairs)
|
|
135
|
+
const tagPairs = new Map<string, number>();
|
|
136
|
+
for (const row of docsResult.rows) {
|
|
137
|
+
const tags = JSON.parse(
|
|
138
|
+
(row as unknown as { tags: string }).tags
|
|
139
|
+
) as string[];
|
|
140
|
+
if (tags.length < 2) continue;
|
|
141
|
+
|
|
142
|
+
const sortedTags = [...tags].sort();
|
|
143
|
+
for (let i = 0; i < sortedTags.length; i++) {
|
|
144
|
+
for (let j = i + 1; j < sortedTags.length; j++) {
|
|
145
|
+
const pair = `${sortedTags[i]} + ${sortedTags[j]}`;
|
|
146
|
+
tagPairs.set(pair, (tagPairs.get(pair) || 0) + 1);
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
const topPairs = Array.from(tagPairs.entries())
|
|
152
|
+
.sort((a, b) => b[1] - a[1])
|
|
153
|
+
.slice(0, 10);
|
|
154
|
+
|
|
155
|
+
if (topPairs.length > 0) {
|
|
156
|
+
console.log(`\n\n🔗 Most Common Tag Combinations (Top 10)\n`);
|
|
157
|
+
for (const [pair, count] of topPairs) {
|
|
158
|
+
console.log(`${pair.padEnd(50)} ${count} docs`);
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
console.log(`\n`);
|
|
163
|
+
} catch (error) {
|
|
164
|
+
console.error("Error analyzing tags:", error);
|
|
165
|
+
process.exit(1);
|
|
166
|
+
} finally {
|
|
167
|
+
client.close();
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
analyzeTagDistribution();
|