pdf-brain 1.2.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -2
- package/package.json +2 -1
- package/scripts/install.sh +1 -1
- package/src/agent/hints.ts +426 -3
- package/src/agent/manifest.ts +25 -4
- package/src/agent/protocol.ts +52 -0
- package/src/chunking.ts +130 -0
- package/src/cli.contract.test.ts +239 -0
- package/src/cli.ts +2575 -832
- package/src/index.ts +259 -6
- package/src/logger.ts +53 -0
- package/src/services/AutoTagger.ts +26 -38
- package/src/services/ClusterSummarizer.ts +3 -3
- package/src/services/Clustering.test.ts +20 -5
- package/src/services/Clustering.ts +48 -11
- package/src/services/Database.ts +27 -0
- package/src/services/EmbeddingProvider.ts +77 -7
- package/src/services/Gateway.ts +8 -7
- package/src/services/LibSQLDatabase.test.ts +139 -0
- package/src/services/LibSQLDatabase.ts +228 -15
- package/src/services/Migration.ts +1 -1
- package/src/services/Ollama.ts +22 -7
- package/src/services/PDFExtractor.test.ts +40 -1
- package/src/services/PDFExtractor.ts +37 -6
- package/src/types.test.ts +22 -0
- package/src/types.ts +82 -2
- package/src/updater.ts +189 -0
package/README.md
CHANGED
|
@@ -26,9 +26,12 @@ Local **PDF & Markdown** knowledge base with semantic search and AI-powered enri
|
|
|
26
26
|
|
|
27
27
|
## Quick Start
|
|
28
28
|
|
|
29
|
+
> Note: `pdf-brain` is agent-first and emits a single JSON envelope to stdout by default.
|
|
30
|
+
> Use `--format text` for human-readable output (and TUI/progress rendering), or inspect the machine contract via `pdf-brain capabilities`.
|
|
31
|
+
|
|
29
32
|
```bash
|
|
30
33
|
# 1. Install (standalone binary, no runtime needed)
|
|
31
|
-
curl -fsSL https://raw.githubusercontent.com/joelhooks/pdf-
|
|
34
|
+
curl -fsSL https://raw.githubusercontent.com/joelhooks/pdf-brain/main/scripts/install.sh | bash
|
|
32
35
|
|
|
33
36
|
# 2. Install Ollama (macOS)
|
|
34
37
|
brew install ollama
|
|
@@ -81,7 +84,7 @@ ollama serve
|
|
|
81
84
|
|
|
82
85
|
```bash
|
|
83
86
|
# Standalone binary (no runtime needed)
|
|
84
|
-
curl -fsSL https://raw.githubusercontent.com/joelhooks/pdf-
|
|
87
|
+
curl -fsSL https://raw.githubusercontent.com/joelhooks/pdf-brain/main/scripts/install.sh | bash
|
|
85
88
|
|
|
86
89
|
# or via npm
|
|
87
90
|
npm install -g pdf-brain
|
|
@@ -89,6 +92,21 @@ npm install -g pdf-brain
|
|
|
89
92
|
|
|
90
93
|
## CLI Reference
|
|
91
94
|
|
|
95
|
+
### Agent Output (Default)
|
|
96
|
+
|
|
97
|
+
`pdf-brain` is optimized for agentic workflows: stdout is machine-readable by default.
|
|
98
|
+
|
|
99
|
+
- `--format json|ndjson|text` (default: `json`)
|
|
100
|
+
- `--pretty` pretty-print JSON
|
|
101
|
+
- `--quiet` (alias: `--no-hints`) omit `nextActions`
|
|
102
|
+
- `--log-level silent|error|info|debug` (logs go to stderr)
|
|
103
|
+
|
|
104
|
+
Discover the full command/tool contract (including JSON Schemas) at runtime:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
pdf-brain capabilities
|
|
108
|
+
```
|
|
109
|
+
|
|
92
110
|
### Basic Commands
|
|
93
111
|
|
|
94
112
|
```bash
|
|
@@ -382,6 +400,8 @@ pdf-brain config set enrichment.model anthropic/claude-haiku-4-5
|
|
|
382
400
|
| `PDF_LIBRARY_PATH` | `~/Documents/.pdf-library` | Library storage location |
|
|
383
401
|
| `OLLAMA_HOST` | `http://localhost:11434` | Ollama API endpoint |
|
|
384
402
|
| `AI_GATEWAY_API_KEY` | - | API key for AI Gateway |
|
|
403
|
+
| `PDF_BRAIN_LOG_LEVEL` | `silent` | stderr logging verbosity |
|
|
404
|
+
| `PDF_BRAIN_QUERY_EMBED_CACHE_SIZE` | `256` | Query embedding LRU cache size (0 disables) |
|
|
385
405
|
|
|
386
406
|
### AI Gateway
|
|
387
407
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-brain",
|
|
3
|
-
"version": "1.2.0",
|
|
3
|
+
"version": "2.0.0",
|
|
4
4
|
"description": "Local PDF & Markdown knowledge base with semantic search, AI enrichment, and SKOS taxonomy",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/index.ts",
|
|
@@ -29,6 +29,7 @@
|
|
|
29
29
|
"@effect/schema": "^0.75.0",
|
|
30
30
|
"@electric-sql/pglite": "^0.3.0",
|
|
31
31
|
"@libsql/client": "^0.15.15",
|
|
32
|
+
"@modelcontextprotocol/sdk": "1.26.0",
|
|
32
33
|
"ai": "^5.0.115",
|
|
33
34
|
"dotenv": "^17.2.3",
|
|
34
35
|
"effect": "^3.12.0",
|
package/scripts/install.sh
CHANGED
package/src/agent/hints.ts
CHANGED
|
@@ -4,15 +4,22 @@
|
|
|
4
4
|
* Pure function: CommandResult discriminated union in, string[] hints out.
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
|
+
import type { NextAction } from "./protocol.js";
|
|
8
|
+
|
|
7
9
|
export type CommandResult =
|
|
8
10
|
| {
|
|
9
11
|
_tag: "search";
|
|
10
12
|
query: string;
|
|
11
|
-
results: { title: string; docId: string; score: number }[];
|
|
13
|
+
results: { title: string; docId: string; chunkId?: string; score: number }[];
|
|
12
14
|
concepts: { id: string; prefLabel: string }[];
|
|
13
15
|
hadExpand: boolean;
|
|
14
16
|
wasFts: boolean;
|
|
15
17
|
}
|
|
18
|
+
| {
|
|
19
|
+
_tag: "searchPack";
|
|
20
|
+
queries: string[];
|
|
21
|
+
results: { title: string; docId: string; chunkId?: string; score: number }[];
|
|
22
|
+
}
|
|
16
23
|
| { _tag: "read"; title: string; id: string; tags: string[] }
|
|
17
24
|
| {
|
|
18
25
|
_tag: "list";
|
|
@@ -37,12 +44,29 @@ export type CommandResult =
|
|
|
37
44
|
| { _tag: "remove"; title: string }
|
|
38
45
|
| { _tag: "noResults"; query: string; wasFts: boolean }
|
|
39
46
|
| { _tag: "error"; command: string; message: string }
|
|
40
|
-
| {
|
|
47
|
+
| {
|
|
48
|
+
_tag: "doctor";
|
|
49
|
+
healthy: boolean;
|
|
50
|
+
chunkerOutdated?: number;
|
|
51
|
+
chunkerMissing?: number;
|
|
52
|
+
chunkerMismatch?: number;
|
|
53
|
+
}
|
|
41
54
|
| { _tag: "config"; subcommand: string }
|
|
42
55
|
| { _tag: "tag"; title: string; tags: string[] }
|
|
43
56
|
| { _tag: "check"; reachable: boolean }
|
|
44
57
|
| { _tag: "repair"; orphanedChunks: number; orphanedEmbeddings: number }
|
|
45
|
-
| { _tag: "reindex"; count: number; errors: number }
|
|
58
|
+
| { _tag: "reindex"; count: number; errors: number }
|
|
59
|
+
| {
|
|
60
|
+
_tag: "rechunk";
|
|
61
|
+
dryRun: boolean;
|
|
62
|
+
planned: number;
|
|
63
|
+
succeeded: number;
|
|
64
|
+
failed: number;
|
|
65
|
+
includeMissing?: boolean;
|
|
66
|
+
skippedMissing?: number;
|
|
67
|
+
plannedMissing?: number;
|
|
68
|
+
plannedMismatch?: number;
|
|
69
|
+
};
|
|
46
70
|
|
|
47
71
|
/**
|
|
48
72
|
* Generate contextual next-action hints from a command result.
|
|
@@ -84,6 +108,26 @@ export function generateHints(result: CommandResult): string[] {
|
|
|
84
108
|
return hints;
|
|
85
109
|
}
|
|
86
110
|
|
|
111
|
+
case "searchPack": {
|
|
112
|
+
const hints: string[] = [];
|
|
113
|
+
if (result.results.length > 0) {
|
|
114
|
+
const top = result.results[0];
|
|
115
|
+
hints.push(
|
|
116
|
+
`\`pdf-brain read "${top.title}"\` -- Full metadata for top result`
|
|
117
|
+
);
|
|
118
|
+
if (top.chunkId) {
|
|
119
|
+
hints.push(
|
|
120
|
+
`\`pdf-brain chunk get "${top.chunkId}"\` -- Fetch exact top chunk text`
|
|
121
|
+
);
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
hints.push(
|
|
125
|
+
`\`pdf-brain search "<query>"\` -- Drill into a single query`,
|
|
126
|
+
`\`pdf-brain search-pack --with-content "${result.queries[0] ?? "query"}"\` -- Include chunk text in pack output`,
|
|
127
|
+
);
|
|
128
|
+
return hints;
|
|
129
|
+
}
|
|
130
|
+
|
|
87
131
|
case "noResults": {
|
|
88
132
|
const hints: string[] = [];
|
|
89
133
|
if (!result.wasFts) {
|
|
@@ -219,6 +263,22 @@ export function generateHints(result: CommandResult): string[] {
|
|
|
219
263
|
`\`pdf-brain doctor --fix\` -- Auto-repair detected issues`
|
|
220
264
|
);
|
|
221
265
|
}
|
|
266
|
+
const missing = result.chunkerMissing ?? 0;
|
|
267
|
+
const mismatch = result.chunkerMismatch ?? 0;
|
|
268
|
+
|
|
269
|
+
if (mismatch > 0) {
|
|
270
|
+
hints.push(
|
|
271
|
+
`\`pdf-brain rechunk --dry-run\` -- Preview docs with stale chunker metadata`,
|
|
272
|
+
`\`pdf-brain rechunk\` -- Apply rechunk (rebuild chunks + embeddings)`,
|
|
273
|
+
);
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
if (missing > 0) {
|
|
277
|
+
hints.push(
|
|
278
|
+
`\`pdf-brain rechunk --dry-run --include-missing\` -- Preview docs missing chunker metadata (upgrade sweep)`,
|
|
279
|
+
`\`pdf-brain rechunk --include-missing --max-docs 25\` -- Rechunk a small batch (expensive)`,
|
|
280
|
+
);
|
|
281
|
+
}
|
|
222
282
|
hints.push(
|
|
223
283
|
`\`pdf-brain stats\` -- Check library statistics`,
|
|
224
284
|
`\`pdf-brain search "<query>"\` -- Search documents`
|
|
@@ -257,6 +317,32 @@ export function generateHints(result: CommandResult): string[] {
|
|
|
257
317
|
];
|
|
258
318
|
}
|
|
259
319
|
|
|
320
|
+
case "rechunk": {
|
|
321
|
+
const hints: string[] = [];
|
|
322
|
+
if (result.dryRun) {
|
|
323
|
+
if (result.includeMissing) {
|
|
324
|
+
hints.push(
|
|
325
|
+
`\`pdf-brain rechunk --include-missing --max-docs 25\` -- Rechunk a small batch (rebuild chunks + embeddings)`,
|
|
326
|
+
);
|
|
327
|
+
} else {
|
|
328
|
+
hints.push(
|
|
329
|
+
`\`pdf-brain rechunk\` -- Apply rechunk (rebuild chunks + embeddings)`,
|
|
330
|
+
);
|
|
331
|
+
if ((result.skippedMissing ?? 0) > 0) {
|
|
332
|
+
hints.push(
|
|
333
|
+
`\`pdf-brain rechunk --dry-run --include-missing\` -- Include missing-metadata docs in the plan`,
|
|
334
|
+
);
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
} else {
|
|
338
|
+
hints.push(
|
|
339
|
+
`\`pdf-brain stats\` -- Verify counts after rechunk`,
|
|
340
|
+
`\`pdf-brain search "<query>"\` -- Sanity-check retrieval quality`,
|
|
341
|
+
);
|
|
342
|
+
}
|
|
343
|
+
return hints;
|
|
344
|
+
}
|
|
345
|
+
|
|
260
346
|
case "reindex": {
|
|
261
347
|
return [
|
|
262
348
|
`\`pdf-brain stats\` -- Check updated statistics`,
|
|
@@ -278,3 +364,340 @@ export function generateHints(result: CommandResult): string[] {
|
|
|
278
364
|
}
|
|
279
365
|
}
|
|
280
366
|
}
|
|
367
|
+
|
|
368
|
+
/**
|
|
369
|
+
* Structured follow-up actions for agent workflows.
|
|
370
|
+
* These are equivalent to `generateHints`, but machine-friendly.
|
|
371
|
+
*/
|
|
372
|
+
export function generateNextActions(result: CommandResult): NextAction[] {
|
|
373
|
+
switch (result._tag) {
|
|
374
|
+
case "search": {
|
|
375
|
+
const actions: NextAction[] = [];
|
|
376
|
+
if (result.results.length > 0) {
|
|
377
|
+
const top = result.results[0];
|
|
378
|
+
actions.push({
|
|
379
|
+
kind: "shell",
|
|
380
|
+
argv: ["pdf-brain", "read", top.docId],
|
|
381
|
+
description: "Full metadata for top result",
|
|
382
|
+
});
|
|
383
|
+
|
|
384
|
+
if (top.chunkId) {
|
|
385
|
+
actions.push({
|
|
386
|
+
kind: "shell",
|
|
387
|
+
argv: ["pdf-brain", "chunk", "get", top.chunkId],
|
|
388
|
+
description: "Fetch exact top chunk text",
|
|
389
|
+
});
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
if (!result.hadExpand) {
|
|
393
|
+
actions.push({
|
|
394
|
+
kind: "shell",
|
|
395
|
+
argv: ["pdf-brain", "search", result.query, "--expand", "2000"],
|
|
396
|
+
description: "Get expanded context around matches",
|
|
397
|
+
});
|
|
398
|
+
}
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
if (result.concepts.length > 0) {
|
|
402
|
+
const topConcept = result.concepts[0];
|
|
403
|
+
actions.push({
|
|
404
|
+
kind: "shell",
|
|
405
|
+
argv: ["pdf-brain", "taxonomy", "tree", topConcept.id],
|
|
406
|
+
description: "Navigate concept hierarchy",
|
|
407
|
+
});
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
if (result.results.length > 0 && !result.wasFts) {
|
|
411
|
+
actions.push({
|
|
412
|
+
kind: "shell",
|
|
413
|
+
argv: ["pdf-brain", "search", result.query, "--fts"],
|
|
414
|
+
description: "Try keyword matching instead",
|
|
415
|
+
});
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
if (result.results.length === 0 && result.concepts.length === 0) {
|
|
419
|
+
return generateNextActions({
|
|
420
|
+
_tag: "noResults",
|
|
421
|
+
query: result.query,
|
|
422
|
+
wasFts: result.wasFts,
|
|
423
|
+
});
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
return actions;
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
case "searchPack": {
|
|
430
|
+
const actions: NextAction[] = [];
|
|
431
|
+
if (result.results.length > 0) {
|
|
432
|
+
const top = result.results[0];
|
|
433
|
+
actions.push({
|
|
434
|
+
kind: "shell",
|
|
435
|
+
argv: ["pdf-brain", "read", top.docId],
|
|
436
|
+
description: "Read top document metadata",
|
|
437
|
+
});
|
|
438
|
+
if (top.chunkId) {
|
|
439
|
+
actions.push({
|
|
440
|
+
kind: "shell",
|
|
441
|
+
argv: ["pdf-brain", "chunk", "get", top.chunkId],
|
|
442
|
+
description: "Fetch exact top chunk text",
|
|
443
|
+
});
|
|
444
|
+
}
|
|
445
|
+
}
|
|
446
|
+
actions.push({
|
|
447
|
+
kind: "shell",
|
|
448
|
+
argv: ["pdf-brain", "search", "your query here"],
|
|
449
|
+
description: "Drill into a single query",
|
|
450
|
+
});
|
|
451
|
+
return actions;
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
case "noResults": {
|
|
455
|
+
const actions: NextAction[] = [];
|
|
456
|
+
if (!result.wasFts) {
|
|
457
|
+
actions.push({
|
|
458
|
+
kind: "shell",
|
|
459
|
+
argv: ["pdf-brain", "search", result.query, "--fts"],
|
|
460
|
+
description: "Try full-text keyword search",
|
|
461
|
+
});
|
|
462
|
+
} else {
|
|
463
|
+
actions.push({
|
|
464
|
+
kind: "shell",
|
|
465
|
+
argv: ["pdf-brain", "search", result.query],
|
|
466
|
+
description: "Try semantic vector search",
|
|
467
|
+
});
|
|
468
|
+
}
|
|
469
|
+
actions.push(
|
|
470
|
+
{ kind: "shell", argv: ["pdf-brain", "list"], description: "Browse all documents" },
|
|
471
|
+
{
|
|
472
|
+
kind: "shell",
|
|
473
|
+
argv: ["pdf-brain", "taxonomy", "search", result.query],
|
|
474
|
+
description: "Search taxonomy concepts",
|
|
475
|
+
},
|
|
476
|
+
);
|
|
477
|
+
return actions;
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
case "read": {
|
|
481
|
+
const actions: NextAction[] = [];
|
|
482
|
+
actions.push({
|
|
483
|
+
kind: "shell",
|
|
484
|
+
argv: ["pdf-brain", "search", result.title, "--expand", "2000"],
|
|
485
|
+
description: "Search within this document's content",
|
|
486
|
+
});
|
|
487
|
+
if (result.tags.length > 0) {
|
|
488
|
+
actions.push({
|
|
489
|
+
kind: "shell",
|
|
490
|
+
argv: ["pdf-brain", "list", "--tag", result.tags[0]],
|
|
491
|
+
description: "Browse documents with same tag",
|
|
492
|
+
});
|
|
493
|
+
}
|
|
494
|
+
actions.push({
|
|
495
|
+
kind: "shell",
|
|
496
|
+
argv: ["pdf-brain", "taxonomy", "search", result.title],
|
|
497
|
+
description: "Find related concepts",
|
|
498
|
+
});
|
|
499
|
+
return actions;
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
case "list": {
|
|
503
|
+
const actions: NextAction[] = [];
|
|
504
|
+
if (result.firstDoc) {
|
|
505
|
+
actions.push({
|
|
506
|
+
kind: "shell",
|
|
507
|
+
argv: ["pdf-brain", "read", result.firstDoc.id],
|
|
508
|
+
description: "Read the first listed document",
|
|
509
|
+
});
|
|
510
|
+
}
|
|
511
|
+
actions.push({
|
|
512
|
+
kind: "shell",
|
|
513
|
+
argv: ["pdf-brain", "search", "your query here"],
|
|
514
|
+
description: "Search the library",
|
|
515
|
+
});
|
|
516
|
+
return actions;
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
case "stats": {
|
|
520
|
+
return [
|
|
521
|
+
{ kind: "shell", argv: ["pdf-brain", "search", "your question here"], description: "Search the library" },
|
|
522
|
+
{ kind: "shell", argv: ["pdf-brain", "list"], description: "Browse all documents" },
|
|
523
|
+
{ kind: "shell", argv: ["pdf-brain", "taxonomy", "list"], description: "Browse taxonomy concepts" },
|
|
524
|
+
{ kind: "shell", argv: ["pdf-brain", "doctor"], description: "Check database health" },
|
|
525
|
+
];
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
case "taxonomySearch": {
|
|
529
|
+
const actions: NextAction[] = [];
|
|
530
|
+
if (result.matches.length > 0) {
|
|
531
|
+
actions.push({
|
|
532
|
+
kind: "shell",
|
|
533
|
+
argv: ["pdf-brain", "taxonomy", "tree", result.matches[0].id],
|
|
534
|
+
description: "Navigate concept hierarchy",
|
|
535
|
+
});
|
|
536
|
+
} else {
|
|
537
|
+
actions.push({
|
|
538
|
+
kind: "shell",
|
|
539
|
+
argv: ["pdf-brain", "taxonomy", "list"],
|
|
540
|
+
description: "Browse all concepts",
|
|
541
|
+
});
|
|
542
|
+
}
|
|
543
|
+
actions.push({
|
|
544
|
+
kind: "shell",
|
|
545
|
+
argv: ["pdf-brain", "search", result.query],
|
|
546
|
+
description: "Search documents for this concept",
|
|
547
|
+
});
|
|
548
|
+
return actions;
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
case "taxonomyList": {
|
|
552
|
+
return [
|
|
553
|
+
{ kind: "shell", argv: ["pdf-brain", "taxonomy", "tree"], description: "View full concept tree" },
|
|
554
|
+
{ kind: "shell", argv: ["pdf-brain", "taxonomy", "search", "your query"], description: "Search concepts" },
|
|
555
|
+
];
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
case "taxonomyTree": {
|
|
559
|
+
return [
|
|
560
|
+
{ kind: "shell", argv: ["pdf-brain", "taxonomy", "tree"], description: "View full concept tree" },
|
|
561
|
+
];
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
case "add": {
|
|
565
|
+
return [
|
|
566
|
+
{ kind: "shell", argv: ["pdf-brain", "read", result.id], description: "Read the new document" },
|
|
567
|
+
{ kind: "shell", argv: ["pdf-brain", "search", result.title], description: "Search for related content" },
|
|
568
|
+
{ kind: "shell", argv: ["pdf-brain", "tag", result.id, "tag1,tag2"], description: "Apply tags" },
|
|
569
|
+
];
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
case "remove": {
|
|
573
|
+
return [
|
|
574
|
+
{ kind: "shell", argv: ["pdf-brain", "list"], description: "Browse remaining documents" },
|
|
575
|
+
{ kind: "shell", argv: ["pdf-brain", "stats"], description: "Verify counts" },
|
|
576
|
+
];
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
case "tag": {
|
|
580
|
+
const actions: NextAction[] = [
|
|
581
|
+
{ kind: "shell", argv: ["pdf-brain", "read", result.title], description: "Read document metadata" },
|
|
582
|
+
{ kind: "shell", argv: ["pdf-brain", "list", "--tag", result.tags[0] ?? ""], description: "Browse by tag" },
|
|
583
|
+
];
|
|
584
|
+
return actions.filter((a) => a.argv[a.argv.length - 1] !== "");
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
case "doctor": {
|
|
588
|
+
const actions: NextAction[] = [];
|
|
589
|
+
if (!result.healthy) {
|
|
590
|
+
actions.push({
|
|
591
|
+
kind: "shell",
|
|
592
|
+
argv: ["pdf-brain", "doctor", "--fix"],
|
|
593
|
+
description: "Attempt auto-repair",
|
|
594
|
+
});
|
|
595
|
+
}
|
|
596
|
+
const missing = result.chunkerMissing ?? 0;
|
|
597
|
+
const mismatch = result.chunkerMismatch ?? 0;
|
|
598
|
+
|
|
599
|
+
if (mismatch > 0) {
|
|
600
|
+
actions.push(
|
|
601
|
+
{
|
|
602
|
+
kind: "shell",
|
|
603
|
+
argv: ["pdf-brain", "rechunk", "--dry-run"],
|
|
604
|
+
description: "Preview docs with stale chunker metadata",
|
|
605
|
+
},
|
|
606
|
+
{
|
|
607
|
+
kind: "shell",
|
|
608
|
+
argv: ["pdf-brain", "rechunk"],
|
|
609
|
+
description: "Apply rechunk (rebuild chunks + embeddings)",
|
|
610
|
+
},
|
|
611
|
+
);
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
if (missing > 0) {
|
|
615
|
+
actions.push(
|
|
616
|
+
{
|
|
617
|
+
kind: "shell",
|
|
618
|
+
argv: ["pdf-brain", "rechunk", "--dry-run", "--include-missing"],
|
|
619
|
+
description: "Preview docs missing chunker metadata (upgrade sweep)",
|
|
620
|
+
},
|
|
621
|
+
{
|
|
622
|
+
kind: "shell",
|
|
623
|
+
argv: ["pdf-brain", "rechunk", "--include-missing", "--max-docs", "25"],
|
|
624
|
+
description: "Rechunk a small batch (expensive)",
|
|
625
|
+
},
|
|
626
|
+
);
|
|
627
|
+
}
|
|
628
|
+
actions.push({
|
|
629
|
+
kind: "shell",
|
|
630
|
+
argv: ["pdf-brain", "stats"],
|
|
631
|
+
description: "Verify counts",
|
|
632
|
+
});
|
|
633
|
+
return actions;
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
case "config": {
|
|
637
|
+
return [
|
|
638
|
+
{ kind: "shell", argv: ["pdf-brain", "config", "show"], description: "Show config" },
|
|
639
|
+
];
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
case "check": {
|
|
643
|
+
return [
|
|
644
|
+
{ kind: "shell", argv: ["pdf-brain", "stats"], description: "Check library stats" },
|
|
645
|
+
];
|
|
646
|
+
}
|
|
647
|
+
|
|
648
|
+
case "repair": {
|
|
649
|
+
return [
|
|
650
|
+
{ kind: "shell", argv: ["pdf-brain", "doctor"], description: "Re-run health check" },
|
|
651
|
+
];
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
case "reindex": {
|
|
655
|
+
return [
|
|
656
|
+
{ kind: "shell", argv: ["pdf-brain", "stats"], description: "Verify counts" },
|
|
657
|
+
];
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
case "rechunk": {
|
|
661
|
+
if (result.dryRun) {
|
|
662
|
+
if (result.includeMissing) {
|
|
663
|
+
return [
|
|
664
|
+
{
|
|
665
|
+
kind: "shell",
|
|
666
|
+
argv: ["pdf-brain", "rechunk", "--include-missing", "--max-docs", "25"],
|
|
667
|
+
description: "Rechunk a small batch (rebuild chunks + embeddings)",
|
|
668
|
+
},
|
|
669
|
+
];
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
const actions: NextAction[] = [
|
|
673
|
+
{
|
|
674
|
+
kind: "shell",
|
|
675
|
+
argv: ["pdf-brain", "rechunk"],
|
|
676
|
+
description: "Apply rechunk (rebuild chunks + embeddings)",
|
|
677
|
+
},
|
|
678
|
+
];
|
|
679
|
+
|
|
680
|
+
if ((result.skippedMissing ?? 0) > 0) {
|
|
681
|
+
actions.push({
|
|
682
|
+
kind: "shell",
|
|
683
|
+
argv: ["pdf-brain", "rechunk", "--dry-run", "--include-missing"],
|
|
684
|
+
description: "Include missing-metadata docs in the plan",
|
|
685
|
+
});
|
|
686
|
+
}
|
|
687
|
+
|
|
688
|
+
return actions;
|
|
689
|
+
}
|
|
690
|
+
return [
|
|
691
|
+
{ kind: "shell", argv: ["pdf-brain", "stats"], description: "Verify counts" },
|
|
692
|
+
];
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
case "error": {
|
|
696
|
+
return [
|
|
697
|
+
{ kind: "shell", argv: ["pdf-brain", "doctor"], description: "Check database health" },
|
|
698
|
+
{ kind: "shell", argv: ["pdf-brain", "check"], description: "Check embedding provider connectivity" },
|
|
699
|
+
{ kind: "shell", argv: ["pdf-brain", "--help"], description: "Show available commands" },
|
|
700
|
+
];
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
}
|
package/src/agent/manifest.ts
CHANGED
|
@@ -33,11 +33,23 @@ ${docCount} documents indexed. Every command returns contextual next-action hint
|
|
|
33
33
|
--docs-only Search documents only
|
|
34
34
|
--include-clusters Include multi-scale cluster summaries
|
|
35
35
|
|
|
36
|
+
pdf-brain search-pack "<q1>" "<q2>" ... [options]
|
|
37
|
+
--limit <n> Max results per query (default 10)
|
|
38
|
+
--global-limit <n> Max deduped results across all queries (optional)
|
|
39
|
+
--fts Full-text search only (keyword matching)
|
|
40
|
+
--expand <chars> Surrounding context (up to 4000 chars)
|
|
41
|
+
--with-content Include chunk text in pack output (default: handles only)
|
|
42
|
+
|
|
36
43
|
### Read & Browse
|
|
37
44
|
pdf-brain read "<id|title>" Document metadata (title, pages, tags, path)
|
|
38
45
|
pdf-brain list [--tag <tag>] All documents, optionally filtered by tag
|
|
39
46
|
pdf-brain stats Library statistics (doc/chunk/embedding counts)
|
|
40
47
|
|
|
48
|
+
### Progressive Disclosure (agent primitives)
|
|
49
|
+
pdf-brain chunk get <chunkId> Fetch a single chunk's full text
|
|
50
|
+
pdf-brain doc chunks <docId> [--page N] List chunk IDs for a document (optionally by page)
|
|
51
|
+
pdf-brain page get <docId> <page> Reconstruct full page text by concatenating chunks
|
|
52
|
+
|
|
41
53
|
### Taxonomy (concept navigation)
|
|
42
54
|
pdf-brain taxonomy search "<q>" Find concepts by keyword or semantic similarity
|
|
43
55
|
pdf-brain taxonomy tree [id] Visual hierarchy tree from a concept
|
|
@@ -51,17 +63,23 @@ ${docCount} documents indexed. Every command returns contextual next-action hint
|
|
|
51
63
|
pdf-brain ingest <dir> [--enrich] [--auto-tag] [--recursive]
|
|
52
64
|
|
|
53
65
|
### Maintenance
|
|
66
|
+
pdf-brain capabilities Self-describing command list + JSON Schemas
|
|
67
|
+
pdf-brain mcp Start MCP server (stdio) for tool-based agent access
|
|
68
|
+
pdf-brain update Self-update to latest release
|
|
54
69
|
pdf-brain doctor [--fix] Health check (WAL, orphans, connectivity)
|
|
55
70
|
pdf-brain config show|get|set View/modify configuration
|
|
56
71
|
pdf-brain reindex [--clean] Re-embed all documents
|
|
72
|
+
pdf-brain rechunk [--dry-run] [--include-missing] [--max-docs N] [--max-chunks N] Rebuild chunks + embeddings when the chunker changes
|
|
57
73
|
pdf-brain export / import Backup and restore
|
|
58
74
|
|
|
59
75
|
## Agent Workflow
|
|
60
76
|
1. \`search\` -> find relevant chunks with similarity scores
|
|
61
|
-
2. \`search
|
|
62
|
-
3. \`
|
|
63
|
-
4. \`
|
|
64
|
-
5. \`
|
|
77
|
+
2. Copy chunk IDs from \`search\` output -> \`chunk get\` to pull full text precisely
|
|
78
|
+
3. Use \`doc chunks\` / \`page get\` to expand context only when needed
|
|
79
|
+
4. \`search --expand 2000\` -> get full surrounding context for deeper reading
|
|
80
|
+
5. \`read\` -> get document metadata (title, tags, page count)
|
|
81
|
+
6. \`taxonomy search\` -> find concept categories, then \`taxonomy tree\` to navigate
|
|
82
|
+
7. \`list --tag\` -> discover documents by topic area
|
|
65
83
|
|
|
66
84
|
## Tips
|
|
67
85
|
- Scores closer to 1.0 = stronger semantic match
|
|
@@ -74,5 +92,8 @@ ${docCount} documents indexed. Every command returns contextual next-action hint
|
|
|
74
92
|
## Options
|
|
75
93
|
--help, -h Show this help
|
|
76
94
|
--version, -v Show version
|
|
95
|
+
--format <mode> Output mode: json (default), ndjson, text
|
|
96
|
+
--pretty Pretty-print JSON
|
|
97
|
+
--log-level <level> stderr logs: silent (default), error, info, debug
|
|
77
98
|
--quiet, --no-hints Suppress next-action hints`;
|
|
78
99
|
}
|
|
package/src/agent/protocol.ts
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent-first output protocol for pdf-brain.
|
|
3
|
+
*
|
|
4
|
+
* Design goals:
|
|
5
|
+
* - stdout is machine-readable (JSON by default)
|
|
6
|
+
* - stderr is diagnostics only (opt-in via log-level)
|
|
7
|
+
* - stable envelope so agents can reliably parse responses and chain next actions
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
export const PDF_BRAIN_PROTOCOL_VERSION = 1 as const;
|
|
11
|
+
|
|
12
|
+
export type OutputFormat = "json" | "ndjson" | "text";
|
|
13
|
+
export type LogLevel = "silent" | "error" | "info" | "debug";
|
|
14
|
+
|
|
15
|
+
export interface NextAction {
|
|
16
|
+
kind: "shell";
|
|
17
|
+
argv: string[];
|
|
18
|
+
description?: string;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
export interface AgentErrorShape {
|
|
22
|
+
code: string;
|
|
23
|
+
message: string;
|
|
24
|
+
details?: unknown;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export type AgentEnvelope<T> =
|
|
28
|
+
| {
|
|
29
|
+
ok: true;
|
|
30
|
+
command: string;
|
|
31
|
+
protocolVersion: typeof PDF_BRAIN_PROTOCOL_VERSION;
|
|
32
|
+
result: T;
|
|
33
|
+
nextActions?: NextAction[];
|
|
34
|
+
meta?: Record<string, unknown>;
|
|
35
|
+
}
|
|
36
|
+
| {
|
|
37
|
+
ok: false;
|
|
38
|
+
command: string;
|
|
39
|
+
protocolVersion: typeof PDF_BRAIN_PROTOCOL_VERSION;
|
|
40
|
+
error: AgentErrorShape;
|
|
41
|
+
nextActions?: NextAction[];
|
|
42
|
+
meta?: Record<string, unknown>;
|
|
43
|
+
};
|
|
44
|
+
|
|
45
|
+
export function toJsonLine(
|
|
46
|
+
value: unknown,
|
|
47
|
+
opts?: { pretty?: boolean }
|
|
48
|
+
): string {
|
|
49
|
+
const pretty = opts?.pretty === true;
|
|
50
|
+
return JSON.stringify(value, null, pretty ? 2 : 0) + "\n";
|
|
51
|
+
}
|
|
52
|
+
|