pdf-brain 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +354 -103
- package/package.json +2 -1
- package/scripts/analyze-tags.ts +171 -0
- package/scripts/benchmark-vector-search.ts +222 -0
- package/scripts/bulk-ingest.ts +447 -0
- package/scripts/migration/add-compression.ts +162 -0
- package/scripts/migration/add-concept-embeddings.ts +228 -0
- package/scripts/seed-taxonomy.ts +90 -0
- package/src/cli.ts +886 -92
- package/src/services/AutoTagger.test.ts +52 -0
- package/src/services/AutoTagger.ts +692 -157
- package/src/services/LibSQLDatabase.test.ts +229 -0
- package/src/services/LibSQLDatabase.ts +147 -26
- package/src/services/Ollama.ts +68 -15
- package/src/services/TaxonomyService.test.ts +433 -0
- package/src/services/TaxonomyService.ts +670 -0
- package/src/types.test.ts +205 -0
- package/src/types.ts +163 -15
package/README.md
CHANGED
|
@@ -1,159 +1,410 @@
|
|
|
1
|
-
#
|
|
1
|
+
# pdf-brain
|
|
2
2
|
|
|
3
|
-
Local PDF knowledge base with
|
|
3
|
+
Local PDF & Markdown knowledge base with semantic search and AI-powered enrichment.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
```
|
|
6
|
+
┌─────────────┐     ┌─────────────┐     ┌─────────────┐     ┌─────────────┐
|
|
7
|
+
│   PDF/MD    │────▶│   Ollama    │────▶│   Ollama    │────▶│   libSQL    │
|
|
8
|
+
│  (extract)  │     │    (LLM)    │     │ (embeddings)│     │  (vectors)  │
|
|
9
|
+
└─────────────┘     └─────────────┘     └─────────────┘     └─────────────┘
|
|
10
|
+
       │                   │                   │                   │
|
|
11
|
+
pdf-parse llama3.2:3b mxbai-embed HNSW index
|
|
12
|
+
markdown enrichment 1024 dims cosine sim
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
- **Local-first** - Everything runs on your machine, no API costs
|
|
18
|
+
- **AI enrichment** - LLM extracts titles, summaries, tags, and concepts
|
|
19
|
+
- **SKOS taxonomy** - Organize documents with hierarchical concepts
|
|
20
|
+
- **Vector search** - Semantic search via Ollama embeddings
|
|
21
|
+
- **Hybrid search** - Combine vector similarity with full-text search
|
|
22
|
+
- **Markdown support** - Index `.md` files alongside PDFs
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
6
25
|
|
|
7
26
|
```bash
|
|
8
|
-
|
|
27
|
+
# 1. Install
|
|
28
|
+
npm install -g pdf-brain
|
|
9
29
|
|
|
10
|
-
#
|
|
30
|
+
# 2. Install Ollama (macOS)
|
|
11
31
|
brew install ollama
|
|
32
|
+
|
|
33
|
+
# 3. Pull required models
|
|
34
|
+
ollama pull mxbai-embed-large # embeddings (required)
|
|
35
|
+
ollama pull llama3.2:3b # enrichment (optional but recommended)
|
|
36
|
+
|
|
37
|
+
# 4. Start Ollama
|
|
38
|
+
ollama serve
|
|
39
|
+
|
|
40
|
+
# 5. Initialize (creates DB + seeds starter taxonomy)
|
|
41
|
+
pdf-brain init
|
|
42
|
+
|
|
43
|
+
# 6. Add your first document
|
|
44
|
+
pdf-brain add ~/Documents/paper.pdf --enrich
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
### Prerequisites
|
|
50
|
+
|
|
51
|
+
**Ollama** is required for embeddings. The LLM model is optional but recommended for enrichment.
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
# macOS
|
|
55
|
+
brew install ollama
|
|
56
|
+
|
|
57
|
+
# Linux
|
|
58
|
+
curl -fsSL https://ollama.com/install.sh | sh
|
|
59
|
+
|
|
60
|
+
# Windows
|
|
61
|
+
# Download from https://ollama.com/download
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Models
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Required: Embedding model (1024 dimensions)
|
|
12
68
|
ollama pull mxbai-embed-large
|
|
69
|
+
|
|
70
|
+
# Recommended: Local LLM for enrichment
|
|
71
|
+
ollama pull llama3.2:3b
|
|
72
|
+
|
|
73
|
+
# Start Ollama server
|
|
74
|
+
ollama serve
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Install pdf-brain
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# npm
|
|
81
|
+
npm install -g pdf-brain
|
|
82
|
+
|
|
83
|
+
# or run directly
|
|
84
|
+
npx pdf-brain <command>
|
|
13
85
|
```
|
|
14
86
|
|
|
15
|
-
## CLI
|
|
87
|
+
## CLI Reference
|
|
88
|
+
|
|
89
|
+
### Basic Commands
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# Check Ollama status
|
|
93
|
+
pdf-brain check
|
|
94
|
+
|
|
95
|
+
# Show library stats
|
|
96
|
+
pdf-brain stats
|
|
97
|
+
|
|
98
|
+
# Initialize library (creates DB, seeds taxonomy)
|
|
99
|
+
pdf-brain init
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Adding Documents
|
|
16
103
|
|
|
17
104
|
```bash
|
|
18
105
|
# Add a PDF
|
|
19
|
-
|
|
106
|
+
pdf-brain add /path/to/document.pdf
|
|
20
107
|
|
|
21
108
|
# Add from URL
|
|
22
|
-
|
|
109
|
+
pdf-brain add https://example.com/paper.pdf
|
|
110
|
+
|
|
111
|
+
# Add with manual tags
|
|
112
|
+
pdf-brain add document.pdf --tags "ai,agents,research"
|
|
113
|
+
|
|
114
|
+
# Add with AI enrichment (extracts title, summary, concepts)
|
|
115
|
+
pdf-brain add document.pdf --enrich
|
|
116
|
+
|
|
117
|
+
# Add Markdown file
|
|
118
|
+
pdf-brain add notes.md --enrich
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Searching
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
# Semantic search (uses embeddings)
|
|
125
|
+
pdf-brain search "context engineering patterns"
|
|
126
|
+
|
|
127
|
+
# Full-text search only (faster, no embeddings)
|
|
128
|
+
pdf-brain search "context engineering" --fts
|
|
129
|
+
|
|
130
|
+
# Hybrid search (combines both)
|
|
131
|
+
pdf-brain search "machine learning" --hybrid
|
|
23
132
|
|
|
24
|
-
#
|
|
25
|
-
|
|
133
|
+
# Limit results
|
|
134
|
+
pdf-brain search "query" --limit 5
|
|
26
135
|
|
|
27
|
-
#
|
|
28
|
-
|
|
136
|
+
# Expand context around matches
|
|
137
|
+
pdf-brain search "query" --expand 500
|
|
138
|
+
```
|
|
29
139
|
|
|
30
|
-
|
|
31
|
-
npx pdf-brain search "context engineering" --fts
|
|
140
|
+
### Managing Documents
|
|
32
141
|
|
|
142
|
+
```bash
|
|
33
143
|
# List all documents
|
|
34
|
-
|
|
144
|
+
pdf-brain list
|
|
35
145
|
|
|
36
146
|
# List by tag
|
|
37
|
-
|
|
147
|
+
pdf-brain list --tag ai
|
|
38
148
|
|
|
39
149
|
# Get document details
|
|
40
|
-
|
|
150
|
+
pdf-brain read "document-title"
|
|
41
151
|
|
|
42
152
|
# Remove a document
|
|
43
|
-
|
|
153
|
+
pdf-brain remove "document-title"
|
|
44
154
|
|
|
45
155
|
# Update tags
|
|
46
|
-
|
|
156
|
+
pdf-brain tag "document-title" "new,tags,here"
|
|
157
|
+
```
|
|
47
158
|
|
|
48
|
-
|
|
49
|
-
npx pdf-brain stats
|
|
159
|
+
### Taxonomy Commands
|
|
50
160
|
|
|
51
|
-
|
|
52
|
-
|
|
161
|
+
The taxonomy system uses SKOS (Simple Knowledge Organization System) for hierarchical concept organization.
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
# List all concepts
|
|
165
|
+
pdf-brain taxonomy list
|
|
166
|
+
|
|
167
|
+
# Show concept tree
|
|
168
|
+
pdf-brain taxonomy tree
|
|
169
|
+
|
|
170
|
+
# Show subtree from a concept
|
|
171
|
+
pdf-brain taxonomy tree programming
|
|
172
|
+
|
|
173
|
+
# Search concepts
|
|
174
|
+
pdf-brain taxonomy search "machine learning"
|
|
175
|
+
|
|
176
|
+
# Add a new concept
|
|
177
|
+
pdf-brain taxonomy add ai/transformers --label "Transformers" --broader ai-ml
|
|
178
|
+
|
|
179
|
+
# Assign concept to document
|
|
180
|
+
pdf-brain taxonomy assign "doc-id" "programming/typescript"
|
|
181
|
+
|
|
182
|
+
# Seed taxonomy from JSON file
|
|
183
|
+
pdf-brain taxonomy seed --file data/taxonomy.json
|
|
53
184
|
```
|
|
54
185
|
|
|
55
|
-
|
|
186
|
+
### Bulk Ingest
|
|
56
187
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
-
|
|
60
|
-
- **iCloud sync** - Default storage in `~/Documents/.pdf-library/`
|
|
61
|
-
- **PGlite + pgvector** - Real Postgres vector search, no server needed
|
|
188
|
+
```bash
|
|
189
|
+
# Ingest a directory with full LLM enrichment
|
|
190
|
+
pdf-brain ingest ~/Documents/papers --enrich
|
|
62
191
|
|
|
63
|
-
|
|
192
|
+
# Ingest multiple directories
|
|
193
|
+
pdf-brain ingest ~/papers ~/books ~/notes --enrich
|
|
64
194
|
|
|
65
|
-
|
|
195
|
+
# With manual tags
|
|
196
|
+
pdf-brain ingest ~/books --tags "books,reference"
|
|
66
197
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
import { $ } from "bun";
|
|
198
|
+
# Auto-tag only (faster, heuristics + light LLM)
|
|
199
|
+
pdf-brain ingest ~/docs --auto-tag
|
|
70
200
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
return result.trim();
|
|
74
|
-
}
|
|
201
|
+
# Process only first N files (for testing)
|
|
202
|
+
pdf-brain ingest ~/papers --enrich --sample 10
|
|
75
203
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
args: {
|
|
79
|
-
path: tool.schema.string().describe("Path to PDF file or URL"),
|
|
80
|
-
tags: tool.schema.string().optional().describe("Comma-separated tags"),
|
|
81
|
-
},
|
|
82
|
-
async execute({ path, tags }) {
|
|
83
|
-
const args = ["add", path];
|
|
84
|
-
if (tags) args.push("--tags", tags);
|
|
85
|
-
return run(args);
|
|
86
|
-
},
|
|
87
|
-
});
|
|
88
|
-
|
|
89
|
-
export const search = tool({
|
|
90
|
-
description: "Semantic search across all PDFs",
|
|
91
|
-
args: {
|
|
92
|
-
query: tool.schema.string().describe("Natural language search query"),
|
|
93
|
-
limit: tool.schema.number().optional().describe("Max results"),
|
|
94
|
-
fts: tool.schema.boolean().optional().describe("Full-text search only"),
|
|
95
|
-
},
|
|
96
|
-
async execute({ query, limit, fts }) {
|
|
97
|
-
const args = ["search", query];
|
|
98
|
-
if (limit) args.push("--limit", String(limit));
|
|
99
|
-
if (fts) args.push("--fts");
|
|
100
|
-
return run(args);
|
|
101
|
-
},
|
|
102
|
-
});
|
|
103
|
-
|
|
104
|
-
export const list = tool({
|
|
105
|
-
description: "List all PDFs in the library",
|
|
106
|
-
args: { tag: tool.schema.string().optional() },
|
|
107
|
-
async execute({ tag }) {
|
|
108
|
-
const args = ["list"];
|
|
109
|
-
if (tag) args.push("--tag", tag);
|
|
110
|
-
return run(args);
|
|
111
|
-
},
|
|
112
|
-
});
|
|
113
|
-
|
|
114
|
-
export const stats = tool({
|
|
115
|
-
description: "Show library statistics",
|
|
116
|
-
args: {},
|
|
117
|
-
async execute() {
|
|
118
|
-
return run(["stats"]);
|
|
119
|
-
},
|
|
120
|
-
});
|
|
204
|
+
# Disable TUI for simple output
|
|
205
|
+
pdf-brain ingest ~/papers --enrich --no-tui
|
|
121
206
|
```
|
|
122
207
|
|
|
123
|
-
##
|
|
208
|
+
## Enrichment
|
|
124
209
|
|
|
125
|
-
|
|
126
|
-
| ------------------ | -------------------------- | ------------------------ |
|
|
127
|
-
| `PDF_LIBRARY_PATH` | `~/Documents/.pdf-library` | Library storage location |
|
|
128
|
-
| `OLLAMA_HOST` | `http://localhost:11434` | Ollama API endpoint |
|
|
129
|
-
| `OLLAMA_MODEL` | `mxbai-embed-large` | Embedding model |
|
|
210
|
+
When you add documents with `--enrich`, the LLM extracts:
|
|
130
211
|
|
|
131
|
-
|
|
212
|
+
| Field | Description |
|
|
213
|
+
| -------------------- | ------------------------------------------- |
|
|
214
|
+
| **title** | Clean, properly formatted title |
|
|
215
|
+
| **author** | Author name(s) if detectable |
|
|
216
|
+
| **summary** | 2-3 sentence summary |
|
|
217
|
+
| **documentType** | book, paper, tutorial, guide, article, etc. |
|
|
218
|
+
| **category** | Primary category |
|
|
219
|
+
| **tags** | 5-10 descriptive tags |
|
|
220
|
+
| **concepts** | Matched concepts from your taxonomy |
|
|
221
|
+
| **proposedConcepts** | New concepts the LLM suggests adding |
|
|
222
|
+
|
|
223
|
+
### LLM Providers
|
|
224
|
+
|
|
225
|
+
Enrichment tries local Ollama first, falls back to Anthropic if configured:
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
# Uses Ollama by default (llama3.2:3b)
|
|
229
|
+
pdf-brain add paper.pdf --enrich
|
|
230
|
+
|
|
231
|
+
# Set Anthropic API key for fallback
|
|
232
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Taxonomy
|
|
236
|
+
|
|
237
|
+
The taxonomy is a hierarchical concept system for organizing documents. It ships with a starter taxonomy covering:
|
|
238
|
+
|
|
239
|
+
- **Programming** - TypeScript, React, Next.js, Testing, Architecture, DevOps, AI/ML
|
|
240
|
+
- **Education** - Instructional Design, Learning Science, Course Creation, Assessment
|
|
241
|
+
- **Business** - Marketing, Copywriting, Bootstrapping, Product, Sales
|
|
242
|
+
- **Design** - UX, Visual Design, Systems Thinking, Information Architecture
|
|
243
|
+
- **Meta** - Productivity, Note-taking, Knowledge Management, Writing
|
|
244
|
+
|
|
245
|
+
### Growing Your Taxonomy
|
|
246
|
+
|
|
247
|
+
When enriching documents, the LLM may propose new concepts. These are saved for review:
|
|
248
|
+
|
|
249
|
+
```bash
|
|
250
|
+
# See proposed concepts from enrichment
|
|
251
|
+
pdf-brain taxonomy proposed
|
|
132
252
|
|
|
253
|
+
# Accept a specific concept
|
|
254
|
+
pdf-brain taxonomy accept ai/rag --broader ai-ml
|
|
255
|
+
|
|
256
|
+
# Accept all proposed concepts
|
|
257
|
+
pdf-brain taxonomy accept --all
|
|
258
|
+
|
|
259
|
+
# Reject a concept
|
|
260
|
+
pdf-brain taxonomy reject ai/rag
|
|
261
|
+
|
|
262
|
+
# Clear all proposals
|
|
263
|
+
pdf-brain taxonomy clear-proposed
|
|
264
|
+
|
|
265
|
+
# Manually add a concept
|
|
266
|
+
pdf-brain taxonomy add ai/rag --label "RAG" --broader ai-ml
|
|
267
|
+
|
|
268
|
+
# Or edit data/taxonomy.json and re-seed
|
|
269
|
+
pdf-brain taxonomy seed --file data/taxonomy.json
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### Custom Taxonomy
|
|
273
|
+
|
|
274
|
+
Create your own `taxonomy.json`:
|
|
275
|
+
|
|
276
|
+
```json
|
|
277
|
+
{
|
|
278
|
+
"concepts": [
|
|
279
|
+
{ "id": "cooking", "prefLabel": "Cooking" },
|
|
280
|
+
{ "id": "cooking/baking", "prefLabel": "Baking" },
|
|
281
|
+
{ "id": "cooking/grilling", "prefLabel": "Grilling" }
|
|
282
|
+
],
|
|
283
|
+
"hierarchy": [
|
|
284
|
+
{ "conceptId": "cooking/baking", "broaderId": "cooking" },
|
|
285
|
+
{ "conceptId": "cooking/grilling", "broaderId": "cooking" }
|
|
286
|
+
]
|
|
287
|
+
}
|
|
133
288
|
```
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
└─────────────┘     └─────────────┘     └─────────────┘
|
|
138
|
-
       │                   │                   │
|
|
139
|
-
pypdf mxbai-embed HNSW index
|
|
140
|
-
chunks 1024 dims cosine sim
|
|
289
|
+
|
|
290
|
+
```bash
|
|
291
|
+
pdf-brain taxonomy seed --file my-taxonomy.json
|
|
141
292
|
```
|
|
142
293
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
294
|
+
## Configuration
|
|
295
|
+
|
|
296
|
+
| Variable | Default | Description |
|
|
297
|
+
| ------------------- | -------------------------- | ------------------------ |
|
|
298
|
+
| `PDF_LIBRARY_PATH` | `~/Documents/.pdf-library` | Library storage location |
|
|
299
|
+
| `OLLAMA_HOST` | `http://localhost:11434` | Ollama API endpoint |
|
|
300
|
+
| `OLLAMA_MODEL` | `mxbai-embed-large` | Embedding model |
|
|
301
|
+
| `ANTHROPIC_API_KEY` | - | Fallback for enrichment |
|
|
148
302
|
|
|
149
303
|
## Storage
|
|
150
304
|
|
|
151
305
|
```
|
|
152
306
|
~/Documents/.pdf-library/
|
|
153
|
-
├── library.db #
|
|
307
|
+
├── library.db # libSQL database (vectors, FTS, metadata, taxonomy)
|
|
308
|
+
├── library.db-shm # Shared memory (WAL mode)
|
|
309
|
+
├── library.db-wal # Write-ahead log
|
|
154
310
|
└── downloads/ # PDFs downloaded from URLs
|
|
155
311
|
```
|
|
156
312
|
|
|
313
|
+
## How It Works
|
|
314
|
+
|
|
315
|
+
1. **Extract** - PDF text via `pdf-parse`, Markdown parsed directly
|
|
316
|
+
2. **Enrich** (optional) - LLM extracts metadata, matches taxonomy concepts
|
|
317
|
+
3. **Chunk** - Text split into ~512 token chunks with overlap
|
|
318
|
+
4. **Embed** - Each chunk embedded via Ollama (1024 dimensions)
|
|
319
|
+
5. **Store** - libSQL with vector index (HNSW) + FTS5
|
|
320
|
+
6. **Search** - Query embedded, compared via cosine similarity
|
|
321
|
+
|
|
322
|
+
## MCP Integration
|
|
323
|
+
|
|
324
|
+
pdf-brain ships as an MCP server for AI coding assistants:
|
|
325
|
+
|
|
326
|
+
```json
|
|
327
|
+
{
|
|
328
|
+
"mcpServers": {
|
|
329
|
+
"pdf-brain": {
|
|
330
|
+
"command": "npx",
|
|
331
|
+
"args": ["pdf-brain", "mcp"]
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
Tools exposed:
|
|
338
|
+
|
|
339
|
+
- `pdf-brain_add` - Add document to library
|
|
340
|
+
- `pdf-brain_search` - Semantic search
|
|
341
|
+
- `pdf-brain_list` - List documents
|
|
342
|
+
- `pdf-brain_read` - Get document details
|
|
343
|
+
- `pdf-brain_stats` - Library statistics
|
|
344
|
+
|
|
345
|
+
## Troubleshooting
|
|
346
|
+
|
|
347
|
+
### "Ollama not available"
|
|
348
|
+
|
|
349
|
+
```bash
|
|
350
|
+
# Check if Ollama is running
|
|
351
|
+
curl http://localhost:11434/api/tags
|
|
352
|
+
|
|
353
|
+
# Start Ollama
|
|
354
|
+
ollama serve
|
|
355
|
+
|
|
356
|
+
# Check models
|
|
357
|
+
ollama list
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
### "Model not found"
|
|
361
|
+
|
|
362
|
+
```bash
|
|
363
|
+
# Pull required models
|
|
364
|
+
ollama pull mxbai-embed-large
|
|
365
|
+
ollama pull llama3.2:3b
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
### "Database locked"
|
|
369
|
+
|
|
370
|
+
The database uses WAL mode. If you see lock errors:
|
|
371
|
+
|
|
372
|
+
```bash
|
|
373
|
+
# Check for zombie processes
|
|
374
|
+
lsof ~/Documents/.pdf-library/library.db*
|
|
375
|
+
|
|
376
|
+
# Force checkpoint
|
|
377
|
+
sqlite3 ~/Documents/.pdf-library/library.db "PRAGMA wal_checkpoint(TRUNCATE);"
|
|
378
|
+
```
|
|
379
|
+
|
|
380
|
+
### Slow enrichment
|
|
381
|
+
|
|
382
|
+
Enrichment is CPU-intensive. For large batches:
|
|
383
|
+
|
|
384
|
+
- Use `--auto-tag` instead of `--enrich` for faster processing
|
|
385
|
+
- Run overnight for large libraries
|
|
386
|
+
- Consider GPU acceleration for Ollama
|
|
387
|
+
|
|
388
|
+
## Development
|
|
389
|
+
|
|
390
|
+
```bash
|
|
391
|
+
# Clone
|
|
392
|
+
git clone https://github.com/joelhooks/pdf-brain
|
|
393
|
+
cd pdf-brain
|
|
394
|
+
|
|
395
|
+
# Install
|
|
396
|
+
bun install
|
|
397
|
+
|
|
398
|
+
# Run CLI
|
|
399
|
+
bun run src/cli.ts <command>
|
|
400
|
+
|
|
401
|
+
# Run tests
|
|
402
|
+
bun test
|
|
403
|
+
|
|
404
|
+
# Type check
|
|
405
|
+
bun run typecheck
|
|
406
|
+
```
|
|
407
|
+
|
|
157
408
|
## License
|
|
158
409
|
|
|
159
410
|
MIT
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-brain",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Local PDF knowledge base with vector search",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/index.ts",
|
|
@@ -25,6 +25,7 @@
|
|
|
25
25
|
"@effect/schema": "^0.75.0",
|
|
26
26
|
"@libsql/client": "^0.15.15",
|
|
27
27
|
"ai": "^5.0.115",
|
|
28
|
+
"dotenv": "^17.2.3",
|
|
28
29
|
"effect": "^3.12.0",
|
|
29
30
|
"gray-matter": "^4.0.3",
|
|
30
31
|
"ink": "^6.5.1",
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Analyze tag distribution in the library database
|
|
5
|
+
* Outputs statistics useful for designing partial index strategy
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { createClient, type ResultSet } from "@libsql/client";
|
|
9
|
+
import os from "node:os";
|
|
10
|
+
import path from "node:path";
|
|
11
|
+
|
|
12
|
+
const DB_PATH = path.join(
|
|
13
|
+
os.homedir(),
|
|
14
|
+
"Documents",
|
|
15
|
+
".pdf-library",
|
|
16
|
+
"library.db"
|
|
17
|
+
);
|
|
18
|
+
|
|
19
|
+
interface TagStats {
|
|
20
|
+
tag: string;
|
|
21
|
+
documentCount: number;
|
|
22
|
+
percentage: number;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
async function analyzeTagDistribution() {
|
|
26
|
+
const client = createClient({
|
|
27
|
+
url: `file:${DB_PATH}`,
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
try {
|
|
31
|
+
// Get total document count
|
|
32
|
+
const totalResult = await client.execute(
|
|
33
|
+
"SELECT COUNT(*) as count FROM documents"
|
|
34
|
+
);
|
|
35
|
+
const totalDocs = Number(
|
|
36
|
+
(totalResult.rows[0] as unknown as { count: number }).count
|
|
37
|
+
);
|
|
38
|
+
|
|
39
|
+
console.log(`\n📊 Database Statistics`);
|
|
40
|
+
console.log(`──────────────────────────────────────────────\n`);
|
|
41
|
+
console.log(`Total Documents: ${totalDocs}`);
|
|
42
|
+
|
|
43
|
+
// Get all documents with their tags
|
|
44
|
+
const docsResult = await client.execute("SELECT tags FROM documents");
|
|
45
|
+
const allTags = new Map<string, number>();
|
|
46
|
+
|
|
47
|
+
// Count tag occurrences
|
|
48
|
+
for (const row of docsResult.rows) {
|
|
49
|
+
const tags = JSON.parse(
|
|
50
|
+
(row as unknown as { tags: string }).tags
|
|
51
|
+
) as string[];
|
|
52
|
+
for (const tag of tags) {
|
|
53
|
+
allTags.set(tag, (allTags.get(tag) || 0) + 1);
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// Calculate statistics
|
|
58
|
+
const tagStats: TagStats[] = Array.from(allTags.entries())
|
|
59
|
+
.map(([tag, count]) => ({
|
|
60
|
+
tag,
|
|
61
|
+
documentCount: count,
|
|
62
|
+
percentage: (count / totalDocs) * 100,
|
|
63
|
+
}))
|
|
64
|
+
.sort((a, b) => b.documentCount - a.documentCount);
|
|
65
|
+
|
|
66
|
+
// Display tag distribution
|
|
67
|
+
console.log(`\nTotal Unique Tags: ${tagStats.length}`);
|
|
68
|
+
console.log(`\n📈 Tag Distribution (Top 20)\n`);
|
|
69
|
+
|
|
70
|
+
const top20 = tagStats.slice(0, 20);
|
|
71
|
+
for (const stat of top20) {
|
|
72
|
+
const bar = "β".repeat(Math.floor(stat.percentage / 2));
|
|
73
|
+
console.log(
|
|
74
|
+
`${stat.tag.padEnd(30)} ${stat.documentCount
|
|
75
|
+
.toString()
|
|
76
|
+
.padStart(5)} docs ${stat.percentage
|
|
77
|
+
.toFixed(1)
|
|
78
|
+
.padStart(5)}% ${bar}`
|
|
79
|
+
);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
// Analyze tag usage patterns
|
|
83
|
+
console.log(`\n\n📊 Tag Usage Patterns\n`);
|
|
84
|
+
|
|
85
|
+
const high = tagStats.filter((t) => t.percentage >= 20);
|
|
86
|
+
const medium = tagStats.filter(
|
|
87
|
+
(t) => t.percentage >= 5 && t.percentage < 20
|
|
88
|
+
);
|
|
89
|
+
const low = tagStats.filter((t) => t.percentage >= 1 && t.percentage < 5);
|
|
90
|
+
const rare = tagStats.filter((t) => t.percentage < 1);
|
|
91
|
+
|
|
92
|
+
console.log(`High Usage (β₯20%): ${high.length} tags`);
|
|
93
|
+
if (high.length > 0) {
|
|
94
|
+
console.log(` ${high.map((t) => t.tag).join(", ")}`);
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
console.log(`\nMedium Usage (5-20%): ${medium.length} tags`);
|
|
98
|
+
if (medium.length > 0) {
|
|
99
|
+
console.log(` ${medium.map((t) => t.tag).join(", ")}`);
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
console.log(`\nLow Usage (1-5%): ${low.length} tags`);
|
|
103
|
+
if (low.length > 0) {
|
|
104
|
+
console.log(` ${low.map((t) => t.tag).join(", ")}`);
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
console.log(`\nRare Usage (<1%): ${rare.length} tags`);
|
|
108
|
+
|
|
109
|
+
// Documents without tags
|
|
110
|
+
const noTagsResult = await client.execute(
|
|
111
|
+
"SELECT COUNT(*) as count FROM documents WHERE tags = '[]'"
|
|
112
|
+
);
|
|
113
|
+
const noTags = Number(
|
|
114
|
+
(noTagsResult.rows[0] as unknown as { count: number }).count
|
|
115
|
+
);
|
|
116
|
+
console.log(
|
|
117
|
+
`\nDocuments with no tags: ${noTags} (${(
|
|
118
|
+
(noTags / totalDocs) *
|
|
119
|
+
100
|
|
120
|
+
).toFixed(1)}%)`
|
|
121
|
+
);
|
|
122
|
+
|
|
123
|
+
// Multiple tags analysis
|
|
124
|
+
const multiTagDocs = docsResult.rows.filter(
|
|
125
|
+
(row) => JSON.parse((row as unknown as { tags: string }).tags).length > 1
|
|
126
|
+
);
|
|
127
|
+
console.log(
|
|
128
|
+
`Documents with 2+ tags: ${multiTagDocs.length} (${(
|
|
129
|
+
(multiTagDocs.length / totalDocs) *
|
|
130
|
+
100
|
|
131
|
+
).toFixed(1)}%)`
|
|
132
|
+
);
|
|
133
|
+
|
|
134
|
+
// Co-occurrence analysis (top 5 tag pairs)
|
|
135
|
+
const tagPairs = new Map<string, number>();
|
|
136
|
+
for (const row of docsResult.rows) {
|
|
137
|
+
const tags = JSON.parse(
|
|
138
|
+
(row as unknown as { tags: string }).tags
|
|
139
|
+
) as string[];
|
|
140
|
+
if (tags.length < 2) continue;
|
|
141
|
+
|
|
142
|
+
const sortedTags = [...tags].sort();
|
|
143
|
+
for (let i = 0; i < sortedTags.length; i++) {
|
|
144
|
+
for (let j = i + 1; j < sortedTags.length; j++) {
|
|
145
|
+
const pair = `${sortedTags[i]} + ${sortedTags[j]}`;
|
|
146
|
+
tagPairs.set(pair, (tagPairs.get(pair) || 0) + 1);
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
const topPairs = Array.from(tagPairs.entries())
|
|
152
|
+
.sort((a, b) => b[1] - a[1])
|
|
153
|
+
.slice(0, 10);
|
|
154
|
+
|
|
155
|
+
if (topPairs.length > 0) {
|
|
156
|
+
console.log(`\n\n🔗 Most Common Tag Combinations (Top 10)\n`);
|
|
157
|
+
for (const [pair, count] of topPairs) {
|
|
158
|
+
console.log(`${pair.padEnd(50)} ${count} docs`);
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
console.log(`\n`);
|
|
163
|
+
} catch (error) {
|
|
164
|
+
console.error("Error analyzing tags:", error);
|
|
165
|
+
process.exit(1);
|
|
166
|
+
} finally {
|
|
167
|
+
client.close();
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
analyzeTagDistribution();
|