pdf-brain 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,159 +1,410 @@
1
- # 📄 pdf-brain
1
+ # pdf-brain
2
2
 
3
- Local PDF knowledge base with vector search. Extract, embed, and semantically search your PDFs.
3
+ Local PDF & Markdown knowledge base with semantic search and AI-powered enrichment.
4
4
 
5
- ## Install
5
+ ```
6
+ ┌─────────────┐     ┌─────────────┐     ┌─────────────┐     ┌─────────────┐
7
+ │   PDF/MD    │────▶│   Ollama    │────▶│   Ollama    │────▶│   libSQL    │
8
+ │  (extract)  │     │    (LLM)    │     │ (embeddings)│     │  (vectors)  │
9
+ └─────────────┘     └─────────────┘     └─────────────┘     └─────────────┘
10
+        │                   │                   │                   │
11
+    pdf-parse          llama3.2:3b         mxbai-embed          HNSW index
12
+     markdown           enrichment          1024 dims           cosine sim
13
+ ```
14
+
15
+ ## Features
16
+
17
+ - **Local-first** - Everything runs on your machine, no API costs
18
+ - **AI enrichment** - LLM extracts titles, summaries, tags, and concepts
19
+ - **SKOS taxonomy** - Organize documents with hierarchical concepts
20
+ - **Vector search** - Semantic search via Ollama embeddings
21
+ - **Hybrid search** - Combine vector similarity with full-text search
22
+ - **Markdown support** - Index `.md` files alongside PDFs
23
+
24
+ ## Quick Start
6
25
 
7
26
  ```bash
8
- npm install pdf-brain
27
+ # 1. Install
28
+ npm install -g pdf-brain
9
29
 
10
- # Need Ollama for embeddings
30
+ # 2. Install Ollama (macOS)
11
31
  brew install ollama
32
+
33
+ # 3. Pull required models
34
+ ollama pull mxbai-embed-large # embeddings (required)
35
+ ollama pull llama3.2:3b # enrichment (optional but recommended)
36
+
37
+ # 4. Start Ollama
38
+ ollama serve
39
+
40
+ # 5. Initialize (creates DB + seeds starter taxonomy)
41
+ pdf-brain init
42
+
43
+ # 6. Add your first document
44
+ pdf-brain add ~/Documents/paper.pdf --enrich
45
+ ```
46
+
47
+ ## Installation
48
+
49
+ ### Prerequisites
50
+
51
+ **Ollama** is required for embeddings. The LLM model is optional but recommended for enrichment.
52
+
53
+ ```bash
54
+ # macOS
55
+ brew install ollama
56
+
57
+ # Linux
58
+ curl -fsSL https://ollama.com/install.sh | sh
59
+
60
+ # Windows
61
+ # Download from https://ollama.com/download
62
+ ```
63
+
64
+ ### Models
65
+
66
+ ```bash
67
+ # Required: Embedding model (1024 dimensions)
12
68
  ollama pull mxbai-embed-large
69
+
70
+ # Recommended: Local LLM for enrichment
71
+ ollama pull llama3.2:3b
72
+
73
+ # Start Ollama server
74
+ ollama serve
75
+ ```
76
+
77
+ ### Install pdf-brain
78
+
79
+ ```bash
80
+ # npm
81
+ npm install -g pdf-brain
82
+
83
+ # or run directly
84
+ npx pdf-brain <command>
13
85
  ```
14
86
 
15
- ## CLI
87
+ ## CLI Reference
88
+
89
+ ### Basic Commands
90
+
91
+ ```bash
92
+ # Check Ollama status
93
+ pdf-brain check
94
+
95
+ # Show library stats
96
+ pdf-brain stats
97
+
98
+ # Initialize library (creates DB, seeds taxonomy)
99
+ pdf-brain init
100
+ ```
101
+
102
+ ### Adding Documents
16
103
 
17
104
  ```bash
18
105
  # Add a PDF
19
- npx pdf-brain add /path/to/document.pdf
106
+ pdf-brain add /path/to/document.pdf
20
107
 
21
108
  # Add from URL
22
- npx pdf-brain add https://example.com/paper.pdf
109
+ pdf-brain add https://example.com/paper.pdf
110
+
111
+ # Add with manual tags
112
+ pdf-brain add document.pdf --tags "ai,agents,research"
113
+
114
+ # Add with AI enrichment (extracts title, summary, concepts)
115
+ pdf-brain add document.pdf --enrich
116
+
117
+ # Add Markdown file
118
+ pdf-brain add notes.md --enrich
119
+ ```
120
+
121
+ ### Searching
122
+
123
+ ```bash
124
+ # Semantic search (uses embeddings)
125
+ pdf-brain search "context engineering patterns"
126
+
127
+ # Full-text search only (faster, no embeddings)
128
+ pdf-brain search "context engineering" --fts
129
+
130
+ # Hybrid search (combines both)
131
+ pdf-brain search "machine learning" --hybrid
23
132
 
24
- # Add with tags
25
- npx pdf-brain add /path/to/document.pdf --tags "ai,agents"
133
+ # Limit results
134
+ pdf-brain search "query" --limit 5
26
135
 
27
- # Semantic search
28
- npx pdf-brain search "context engineering patterns"
136
+ # Expand context around matches
137
+ pdf-brain search "query" --expand 500
138
+ ```
29
139
 
30
- # Full-text search (no embeddings)
31
- npx pdf-brain search "context engineering" --fts
140
+ ### Managing Documents
32
141
 
142
+ ```bash
33
143
  # List all documents
34
- npx pdf-brain list
144
+ pdf-brain list
35
145
 
36
146
  # List by tag
37
- npx pdf-brain list --tag ai
147
+ pdf-brain list --tag ai
38
148
 
39
149
  # Get document details
40
- npx pdf-brain get "document-title"
150
+ pdf-brain read "document-title"
41
151
 
42
152
  # Remove a document
43
- npx pdf-brain remove "document-title"
153
+ pdf-brain remove "document-title"
44
154
 
45
155
  # Update tags
46
- npx pdf-brain tag "document-title" "new,tags,here"
156
+ pdf-brain tag "document-title" "new,tags,here"
157
+ ```
47
158
 
48
- # Show stats
49
- npx pdf-brain stats
159
+ ### Taxonomy Commands
50
160
 
51
- # Check Ollama status
52
- npx pdf-brain check
161
+ The taxonomy system uses SKOS (Simple Knowledge Organization System) for hierarchical concept organization.
162
+
163
+ ```bash
164
+ # List all concepts
165
+ pdf-brain taxonomy list
166
+
167
+ # Show concept tree
168
+ pdf-brain taxonomy tree
169
+
170
+ # Show subtree from a concept
171
+ pdf-brain taxonomy tree programming
172
+
173
+ # Search concepts
174
+ pdf-brain taxonomy search "machine learning"
175
+
176
+ # Add a new concept
177
+ pdf-brain taxonomy add ai/transformers --label "Transformers" --broader ai-ml
178
+
179
+ # Assign concept to document
180
+ pdf-brain taxonomy assign "doc-id" "programming/typescript"
181
+
182
+ # Seed taxonomy from JSON file
183
+ pdf-brain taxonomy seed --file data/taxonomy.json
53
184
  ```
54
185
 
55
- ## Features
186
+ ### Bulk Ingest
56
187
 
57
- - **Local-first** - Everything runs on your machine, no API costs
58
- - **Vector search** - Semantic search via Ollama embeddings (mxbai-embed-large)
59
- - **Hybrid search** - Combine vector similarity with full-text search
60
- - **iCloud sync** - Default storage in `~/Documents/.pdf-library/`
61
- - **PGlite + pgvector** - Real Postgres vector search, no server needed
188
+ ```bash
189
+ # Ingest a directory with full LLM enrichment
190
+ pdf-brain ingest ~/Documents/papers --enrich
62
191
 
63
- ## OpenCode Integration
192
+ # Ingest multiple directories
193
+ pdf-brain ingest ~/papers ~/books ~/notes --enrich
64
194
 
65
- Drop this in `~/.config/opencode/tool/pdf-brain.ts`:
195
+ # With manual tags
196
+ pdf-brain ingest ~/books --tags "books,reference"
66
197
 
67
- ```typescript
68
- import { tool } from "@opencode-ai/plugin";
69
- import { $ } from "bun";
198
+ # Auto-tag only (faster, heuristics + light LLM)
199
+ pdf-brain ingest ~/docs --auto-tag
70
200
 
71
- async function run(args: string[]): Promise<string> {
72
- const result = await $`npx pdf-brain ${args}`.text();
73
- return result.trim();
74
- }
201
+ # Process only first N files (for testing)
202
+ pdf-brain ingest ~/papers --enrich --sample 10
75
203
 
76
- export const add = tool({
77
- description: "Add a PDF to the library",
78
- args: {
79
- path: tool.schema.string().describe("Path to PDF file or URL"),
80
- tags: tool.schema.string().optional().describe("Comma-separated tags"),
81
- },
82
- async execute({ path, tags }) {
83
- const args = ["add", path];
84
- if (tags) args.push("--tags", tags);
85
- return run(args);
86
- },
87
- });
88
-
89
- export const search = tool({
90
- description: "Semantic search across all PDFs",
91
- args: {
92
- query: tool.schema.string().describe("Natural language search query"),
93
- limit: tool.schema.number().optional().describe("Max results"),
94
- fts: tool.schema.boolean().optional().describe("Full-text search only"),
95
- },
96
- async execute({ query, limit, fts }) {
97
- const args = ["search", query];
98
- if (limit) args.push("--limit", String(limit));
99
- if (fts) args.push("--fts");
100
- return run(args);
101
- },
102
- });
103
-
104
- export const list = tool({
105
- description: "List all PDFs in the library",
106
- args: { tag: tool.schema.string().optional() },
107
- async execute({ tag }) {
108
- const args = ["list"];
109
- if (tag) args.push("--tag", tag);
110
- return run(args);
111
- },
112
- });
113
-
114
- export const stats = tool({
115
- description: "Show library statistics",
116
- args: {},
117
- async execute() {
118
- return run(["stats"]);
119
- },
120
- });
204
+ # Disable TUI for simple output
205
+ pdf-brain ingest ~/papers --enrich --no-tui
121
206
  ```
122
207
 
123
- ## Configuration
208
+ ## Enrichment
124
209
 
125
- | Variable | Default | Description |
126
- | ------------------ | -------------------------- | ------------------------ |
127
- | `PDF_LIBRARY_PATH` | `~/Documents/.pdf-library` | Library storage location |
128
- | `OLLAMA_HOST` | `http://localhost:11434` | Ollama API endpoint |
129
- | `OLLAMA_MODEL` | `mxbai-embed-large` | Embedding model |
210
+ When you add documents with `--enrich`, the LLM extracts:
130
211
 
131
- ## How It Works
212
+ | Field | Description |
213
+ | -------------------- | ------------------------------------------- |
214
+ | **title** | Clean, properly formatted title |
215
+ | **author** | Author name(s) if detectable |
216
+ | **summary** | 2-3 sentence summary |
217
+ | **documentType** | book, paper, tutorial, guide, article, etc. |
218
+ | **category** | Primary category |
219
+ | **tags** | 5-10 descriptive tags |
220
+ | **concepts** | Matched concepts from your taxonomy |
221
+ | **proposedConcepts** | New concepts the LLM suggests adding |
222
+
223
+ ### LLM Providers
224
+
225
+ Enrichment tries local Ollama first, falls back to Anthropic if configured:
226
+
227
+ ```bash
228
+ # Uses Ollama by default (llama3.2:3b)
229
+ pdf-brain add paper.pdf --enrich
230
+
231
+ # Set Anthropic API key for fallback
232
+ export ANTHROPIC_API_KEY=sk-ant-...
233
+ ```
234
+
235
+ ## Taxonomy
236
+
237
+ The taxonomy is a hierarchical concept system for organizing documents. It ships with a starter taxonomy covering:
238
+
239
+ - **Programming** - TypeScript, React, Next.js, Testing, Architecture, DevOps, AI/ML
240
+ - **Education** - Instructional Design, Learning Science, Course Creation, Assessment
241
+ - **Business** - Marketing, Copywriting, Bootstrapping, Product, Sales
242
+ - **Design** - UX, Visual Design, Systems Thinking, Information Architecture
243
+ - **Meta** - Productivity, Note-taking, Knowledge Management, Writing
244
+
245
+ ### Growing Your Taxonomy
246
+
247
+ When enriching documents, the LLM may propose new concepts. These are saved for review:
248
+
249
+ ```bash
250
+ # See proposed concepts from enrichment
251
+ pdf-brain taxonomy proposed
132
252
 
253
+ # Accept a specific concept
254
+ pdf-brain taxonomy accept ai/rag --broader ai-ml
255
+
256
+ # Accept all proposed concepts
257
+ pdf-brain taxonomy accept --all
258
+
259
+ # Reject a concept
260
+ pdf-brain taxonomy reject ai/rag
261
+
262
+ # Clear all proposals
263
+ pdf-brain taxonomy clear-proposed
264
+
265
+ # Manually add a concept
266
+ pdf-brain taxonomy add ai/rag --label "RAG" --broader ai-ml
267
+
268
+ # Or edit data/taxonomy.json and re-seed
269
+ pdf-brain taxonomy seed --file data/taxonomy.json
270
+ ```
271
+
272
+ ### Custom Taxonomy
273
+
274
+ Create your own `taxonomy.json`:
275
+
276
+ ```json
277
+ {
278
+ "concepts": [
279
+ { "id": "cooking", "prefLabel": "Cooking" },
280
+ { "id": "cooking/baking", "prefLabel": "Baking" },
281
+ { "id": "cooking/grilling", "prefLabel": "Grilling" }
282
+ ],
283
+ "hierarchy": [
284
+ { "conceptId": "cooking/baking", "broaderId": "cooking" },
285
+ { "conceptId": "cooking/grilling", "broaderId": "cooking" }
286
+ ]
287
+ }
133
288
  ```
134
- ┌─────────────┐     ┌─────────────┐     ┌─────────────┐
135
- │     PDF     │────▶│   Ollama    │────▶│   PGlite    │
136
- │  (extract)  │     │ (embeddings)│     │  (pgvector) │
137
- └─────────────┘     └─────────────┘     └─────────────┘
138
-        │                   │                   │
139
-      pypdf            mxbai-embed         HNSW index
140
-      chunks            1024 dims          cosine sim
289
+
290
+ ```bash
291
+ pdf-brain taxonomy seed --file my-taxonomy.json
141
292
  ```
142
293
 
143
- 1. **Extract** - PDF text extracted via `pypdf`
144
- 2. **Chunk** - Text split into ~512 token chunks with overlap
145
- 3. **Embed** - Each chunk embedded via Ollama (1024 dimensions)
146
- 4. **Store** - PGlite + pgvector with HNSW index + FTS
147
- 5. **Search** - Query embedded, compared via cosine similarity
294
+ ## Configuration
295
+
296
+ | Variable | Default | Description |
297
+ | ------------------- | -------------------------- | ------------------------ |
298
+ | `PDF_LIBRARY_PATH` | `~/Documents/.pdf-library` | Library storage location |
299
+ | `OLLAMA_HOST` | `http://localhost:11434` | Ollama API endpoint |
300
+ | `OLLAMA_MODEL` | `mxbai-embed-large` | Embedding model |
301
+ | `ANTHROPIC_API_KEY` | - | Fallback for enrichment |
148
302
 
149
303
  ## Storage
150
304
 
151
305
  ```
152
306
  ~/Documents/.pdf-library/
153
- ├── library.db # PGlite database (vectors, FTS, metadata)
307
+ ├── library.db # libSQL database (vectors, FTS, metadata, taxonomy)
308
+ ├── library.db-shm # Shared memory (WAL mode)
309
+ ├── library.db-wal # Write-ahead log
154
310
 └── downloads/ # PDFs downloaded from URLs
155
311
  ```
156
312
 
313
+ ## How It Works
314
+
315
+ 1. **Extract** - PDF text via `pdf-parse`, Markdown parsed directly
316
+ 2. **Enrich** (optional) - LLM extracts metadata, matches taxonomy concepts
317
+ 3. **Chunk** - Text split into ~512 token chunks with overlap
318
+ 4. **Embed** - Each chunk embedded via Ollama (1024 dimensions)
319
+ 5. **Store** - libSQL with vector index (HNSW) + FTS5
320
+ 6. **Search** - Query embedded, compared via cosine similarity
321
+
322
+ ## MCP Integration
323
+
324
+ pdf-brain ships as an MCP server for AI coding assistants:
325
+
326
+ ```json
327
+ {
328
+ "mcpServers": {
329
+ "pdf-brain": {
330
+ "command": "npx",
331
+ "args": ["pdf-brain", "mcp"]
332
+ }
333
+ }
334
+ }
335
+ ```
336
+
337
+ Tools exposed:
338
+
339
+ - `pdf-brain_add` - Add document to library
340
+ - `pdf-brain_search` - Semantic search
341
+ - `pdf-brain_list` - List documents
342
+ - `pdf-brain_read` - Get document details
343
+ - `pdf-brain_stats` - Library statistics
344
+
345
+ ## Troubleshooting
346
+
347
+ ### "Ollama not available"
348
+
349
+ ```bash
350
+ # Check if Ollama is running
351
+ curl http://localhost:11434/api/tags
352
+
353
+ # Start Ollama
354
+ ollama serve
355
+
356
+ # Check models
357
+ ollama list
358
+ ```
359
+
360
+ ### "Model not found"
361
+
362
+ ```bash
363
+ # Pull required models
364
+ ollama pull mxbai-embed-large
365
+ ollama pull llama3.2:3b
366
+ ```
367
+
368
+ ### "Database locked"
369
+
370
+ The database uses WAL mode. If you see lock errors:
371
+
372
+ ```bash
373
+ # Check for zombie processes
374
+ lsof ~/Documents/.pdf-library/library.db*
375
+
376
+ # Force checkpoint
377
+ sqlite3 ~/Documents/.pdf-library/library.db "PRAGMA wal_checkpoint(TRUNCATE);"
378
+ ```
379
+
380
+ ### Slow enrichment
381
+
382
+ Enrichment is CPU-intensive. For large batches:
383
+
384
+ - Use `--auto-tag` instead of `--enrich` for faster processing
385
+ - Run overnight for large libraries
386
+ - Consider GPU acceleration for Ollama
387
+
388
+ ## Development
389
+
390
+ ```bash
391
+ # Clone
392
+ git clone https://github.com/joelhooks/pdf-brain
393
+ cd pdf-brain
394
+
395
+ # Install
396
+ bun install
397
+
398
+ # Run CLI
399
+ bun run src/cli.ts <command>
400
+
401
+ # Run tests
402
+ bun test
403
+
404
+ # Type check
405
+ bun run typecheck
406
+ ```
407
+
157
408
  ## License
158
409
 
159
410
  MIT
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pdf-brain",
3
- "version": "1.0.0",
3
+ "version": "1.1.0",
4
4
  "description": "Local PDF knowledge base with vector search",
5
5
  "type": "module",
6
6
  "main": "src/index.ts",
@@ -25,6 +25,7 @@
25
25
  "@effect/schema": "^0.75.0",
26
26
  "@libsql/client": "^0.15.15",
27
27
  "ai": "^5.0.115",
28
+ "dotenv": "^17.2.3",
28
29
  "effect": "^3.12.0",
29
30
  "gray-matter": "^4.0.3",
30
31
  "ink": "^6.5.1",
@@ -0,0 +1,171 @@
1
+ #!/usr/bin/env bun
2
+
3
+ /**
4
+ * Analyze tag distribution in the library database
5
+ * Outputs statistics useful for designing partial index strategy
6
+ */
7
+
8
+ import { createClient, type ResultSet } from "@libsql/client";
9
+ import os from "node:os";
10
+ import path from "node:path";
11
+
12
+ const DB_PATH = path.join(
13
+ os.homedir(),
14
+ "Documents",
15
+ ".pdf-library",
16
+ "library.db"
17
+ );
18
+
19
+ interface TagStats {
20
+ tag: string;
21
+ documentCount: number;
22
+ percentage: number;
23
+ }
24
+
25
+ async function analyzeTagDistribution() {
26
+ const client = createClient({
27
+ url: `file:${DB_PATH}`,
28
+ });
29
+
30
+ try {
31
+ // Get total document count
32
+ const totalResult = await client.execute(
33
+ "SELECT COUNT(*) as count FROM documents"
34
+ );
35
+ const totalDocs = Number(
36
+ (totalResult.rows[0] as unknown as { count: number }).count
37
+ );
38
+
39
+ console.log(`\nπŸ“Š Database Statistics`);
40
+ console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
41
+ console.log(`Total Documents: ${totalDocs}`);
42
+
43
+ // Get all documents with their tags
44
+ const docsResult = await client.execute("SELECT tags FROM documents");
45
+ const allTags = new Map<string, number>();
46
+
47
+ // Count tag occurrences
48
+ for (const row of docsResult.rows) {
49
+ const tags = JSON.parse(
50
+ (row as unknown as { tags: string }).tags
51
+ ) as string[];
52
+ for (const tag of tags) {
53
+ allTags.set(tag, (allTags.get(tag) || 0) + 1);
54
+ }
55
+ }
56
+
57
+ // Calculate statistics
58
+ const tagStats: TagStats[] = Array.from(allTags.entries())
59
+ .map(([tag, count]) => ({
60
+ tag,
61
+ documentCount: count,
62
+ percentage: (count / totalDocs) * 100,
63
+ }))
64
+ .sort((a, b) => b.documentCount - a.documentCount);
65
+
66
+ // Display tag distribution
67
+ console.log(`\nTotal Unique Tags: ${tagStats.length}`);
68
+ console.log(`\nπŸ“ˆ Tag Distribution (Top 20)\n`);
69
+
70
+ const top20 = tagStats.slice(0, 20);
71
+ for (const stat of top20) {
72
+ const bar = "β–ˆ".repeat(Math.floor(stat.percentage / 2));
73
+ console.log(
74
+ `${stat.tag.padEnd(30)} ${stat.documentCount
75
+ .toString()
76
+ .padStart(5)} docs ${stat.percentage
77
+ .toFixed(1)
78
+ .padStart(5)}% ${bar}`
79
+ );
80
+ }
81
+
82
+ // Analyze tag usage patterns
83
+ console.log(`\n\nπŸ“Š Tag Usage Patterns\n`);
84
+
85
+ const high = tagStats.filter((t) => t.percentage >= 20);
86
+ const medium = tagStats.filter(
87
+ (t) => t.percentage >= 5 && t.percentage < 20
88
+ );
89
+ const low = tagStats.filter((t) => t.percentage >= 1 && t.percentage < 5);
90
+ const rare = tagStats.filter((t) => t.percentage < 1);
91
+
92
+ console.log(`High Usage (β‰₯20%): ${high.length} tags`);
93
+ if (high.length > 0) {
94
+ console.log(` ${high.map((t) => t.tag).join(", ")}`);
95
+ }
96
+
97
+ console.log(`\nMedium Usage (5-20%): ${medium.length} tags`);
98
+ if (medium.length > 0) {
99
+ console.log(` ${medium.map((t) => t.tag).join(", ")}`);
100
+ }
101
+
102
+ console.log(`\nLow Usage (1-5%): ${low.length} tags`);
103
+ if (low.length > 0) {
104
+ console.log(` ${low.map((t) => t.tag).join(", ")}`);
105
+ }
106
+
107
+ console.log(`\nRare Usage (<1%): ${rare.length} tags`);
108
+
109
+ // Documents without tags
110
+ const noTagsResult = await client.execute(
111
+ "SELECT COUNT(*) as count FROM documents WHERE tags = '[]'"
112
+ );
113
+ const noTags = Number(
114
+ (noTagsResult.rows[0] as unknown as { count: number }).count
115
+ );
116
+ console.log(
117
+ `\nDocuments with no tags: ${noTags} (${(
118
+ (noTags / totalDocs) *
119
+ 100
120
+ ).toFixed(1)}%)`
121
+ );
122
+
123
+ // Multiple tags analysis
124
+ const multiTagDocs = docsResult.rows.filter(
125
+ (row) => JSON.parse((row as unknown as { tags: string }).tags).length > 1
126
+ );
127
+ console.log(
128
+ `Documents with 2+ tags: ${multiTagDocs.length} (${(
129
+ (multiTagDocs.length / totalDocs) *
130
+ 100
131
+ ).toFixed(1)}%)`
132
+ );
133
+
134
+ // Co-occurrence analysis (top 5 tag pairs)
135
+ const tagPairs = new Map<string, number>();
136
+ for (const row of docsResult.rows) {
137
+ const tags = JSON.parse(
138
+ (row as unknown as { tags: string }).tags
139
+ ) as string[];
140
+ if (tags.length < 2) continue;
141
+
142
+ const sortedTags = [...tags].sort();
143
+ for (let i = 0; i < sortedTags.length; i++) {
144
+ for (let j = i + 1; j < sortedTags.length; j++) {
145
+ const pair = `${sortedTags[i]} + ${sortedTags[j]}`;
146
+ tagPairs.set(pair, (tagPairs.get(pair) || 0) + 1);
147
+ }
148
+ }
149
+ }
150
+
151
+ const topPairs = Array.from(tagPairs.entries())
152
+ .sort((a, b) => b[1] - a[1])
153
+ .slice(0, 10);
154
+
155
+ if (topPairs.length > 0) {
156
+ console.log(`\n\nπŸ“Š Most Common Tag Combinations (Top 10)\n`);
157
+ for (const [pair, count] of topPairs) {
158
+ console.log(`${pair.padEnd(50)} ${count} docs`);
159
+ }
160
+ }
161
+
162
+ console.log(`\n`);
163
+ } catch (error) {
164
+ console.error("Error analyzing tags:", error);
165
+ process.exit(1);
166
+ } finally {
167
+ client.close();
168
+ }
169
+ }
170
+
171
+ analyzeTagDistribution();