pdf-brain 1.3.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -2
- package/package.json +2 -1
- package/scripts/install.sh +1 -1
- package/src/agent/hints.ts +426 -3
- package/src/agent/manifest.ts +24 -4
- package/src/agent/protocol.ts +52 -0
- package/src/chunking.ts +130 -0
- package/src/cli.contract.test.ts +239 -0
- package/src/cli.ts +2573 -840
- package/src/index.ts +259 -6
- package/src/logger.ts +53 -0
- package/src/services/AutoTagger.ts +26 -38
- package/src/services/ClusterSummarizer.ts +3 -3
- package/src/services/Clustering.test.ts +20 -5
- package/src/services/Clustering.ts +48 -11
- package/src/services/Database.ts +27 -0
- package/src/services/EmbeddingProvider.ts +77 -7
- package/src/services/Gateway.ts +8 -7
- package/src/services/LibSQLDatabase.test.ts +139 -0
- package/src/services/LibSQLDatabase.ts +228 -15
- package/src/services/Migration.ts +1 -1
- package/src/services/Ollama.ts +22 -7
- package/src/services/PDFExtractor.test.ts +40 -1
- package/src/services/PDFExtractor.ts +37 -6
- package/src/types.test.ts +22 -0
- package/src/types.ts +82 -2
- package/src/updater.ts +8 -3
package/src/chunking.ts
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import type { Document, LibraryConfig } from "./types.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Metadata stamp describing how a document's chunks were produced.
 * Persisted with each document so later runs can detect when the chunker
 * implementation or chunking configuration has drifted (see assessDocChunker).
 */
export type ChunkerMetadata = {
  /** Stable identifier for the chunking algorithm implementation */
  id: string;
  /** Monotonic version for breaking chunker changes */
  version: number;
  /** Unit used for chunk sizing and overlap */
  unit: "chars";
  // Target chunk size, measured in `unit`.
  chunkSize: number;
  // Overlap between consecutive chunks, measured in `unit`.
  chunkOverlap: number;
};
|
|
13
|
+
|
|
14
|
+
export const CURRENT_CHUNKER: Record<
|
|
15
|
+
"pdf" | "markdown",
|
|
16
|
+
{ id: string; version: number }
|
|
17
|
+
> = {
|
|
18
|
+
// v2: paragraph-preserving normalization + hyphenation fix
|
|
19
|
+
pdf: { id: "pdf-extractor:paragraphs-v2", version: 2 },
|
|
20
|
+
// v1: section-aware markdown parsing + placeholder-preserving chunking
|
|
21
|
+
markdown: { id: "markdown-extractor:sections+placeholders-v1", version: 1 },
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
export function inferFileTypeFromPath(path: string): "pdf" | "markdown" {
|
|
25
|
+
const lower = path.toLowerCase();
|
|
26
|
+
if (lower.endsWith(".md") || lower.endsWith(".markdown")) return "markdown";
|
|
27
|
+
return "pdf";
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
export function buildChunkerMetadata(
|
|
31
|
+
fileType: "pdf" | "markdown",
|
|
32
|
+
config: Pick<LibraryConfig, "chunkSize" | "chunkOverlap">,
|
|
33
|
+
): ChunkerMetadata {
|
|
34
|
+
const base = CURRENT_CHUNKER[fileType];
|
|
35
|
+
return {
|
|
36
|
+
id: base.id,
|
|
37
|
+
version: base.version,
|
|
38
|
+
unit: "chars",
|
|
39
|
+
chunkSize: config.chunkSize,
|
|
40
|
+
chunkOverlap: config.chunkOverlap,
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
export function parseChunkerMetadata(value: unknown): ChunkerMetadata | null {
|
|
45
|
+
if (!value || typeof value !== "object") return null;
|
|
46
|
+
const v: any = value;
|
|
47
|
+
|
|
48
|
+
const id = typeof v.id === "string" ? v.id : null;
|
|
49
|
+
const version = typeof v.version === "number" ? v.version : null;
|
|
50
|
+
const unit = v.unit === "chars" ? ("chars" as const) : null;
|
|
51
|
+
const chunkSize = typeof v.chunkSize === "number" ? v.chunkSize : null;
|
|
52
|
+
const chunkOverlap = typeof v.chunkOverlap === "number" ? v.chunkOverlap : null;
|
|
53
|
+
|
|
54
|
+
if (!id || version === null || !unit || chunkSize === null || chunkOverlap === null) {
|
|
55
|
+
return null;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
return { id, version, unit, chunkSize, chunkOverlap };
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
export function getDocChunkerMetadata(doc: Document): ChunkerMetadata | null {
|
|
62
|
+
const meta = doc.metadata;
|
|
63
|
+
if (!meta || typeof meta !== "object") return null;
|
|
64
|
+
const chunker = (meta as any).chunker;
|
|
65
|
+
return parseChunkerMetadata(chunker);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
export function assessDocChunker(
|
|
69
|
+
doc: Document,
|
|
70
|
+
config: Pick<LibraryConfig, "chunkSize" | "chunkOverlap">,
|
|
71
|
+
): {
|
|
72
|
+
needsRechunk: boolean;
|
|
73
|
+
code:
|
|
74
|
+
| "ok"
|
|
75
|
+
| "missing_metadata"
|
|
76
|
+
| "id_version_mismatch"
|
|
77
|
+
| "config_mismatch"
|
|
78
|
+
| "unit_mismatch";
|
|
79
|
+
reason: string;
|
|
80
|
+
expected: ChunkerMetadata;
|
|
81
|
+
actual: ChunkerMetadata | null;
|
|
82
|
+
} {
|
|
83
|
+
const fileType =
|
|
84
|
+
doc.fileType ??
|
|
85
|
+
(doc.path ? inferFileTypeFromPath(doc.path) : ("pdf" as const));
|
|
86
|
+
const expected = buildChunkerMetadata(fileType, config);
|
|
87
|
+
const actual = getDocChunkerMetadata(doc);
|
|
88
|
+
|
|
89
|
+
if (!actual) {
|
|
90
|
+
return {
|
|
91
|
+
needsRechunk: true,
|
|
92
|
+
code: "missing_metadata",
|
|
93
|
+
reason: "missing chunker metadata",
|
|
94
|
+
expected,
|
|
95
|
+
actual: null,
|
|
96
|
+
};
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
if (actual.id !== expected.id || actual.version !== expected.version) {
|
|
100
|
+
return {
|
|
101
|
+
needsRechunk: true,
|
|
102
|
+
code: "id_version_mismatch",
|
|
103
|
+
reason: `chunker id/version mismatch (${actual.id}@${actual.version} != ${expected.id}@${expected.version})`,
|
|
104
|
+
expected,
|
|
105
|
+
actual,
|
|
106
|
+
};
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
if (actual.chunkSize !== expected.chunkSize || actual.chunkOverlap !== expected.chunkOverlap) {
|
|
110
|
+
return {
|
|
111
|
+
needsRechunk: true,
|
|
112
|
+
code: "config_mismatch",
|
|
113
|
+
reason: `chunkSize/chunkOverlap mismatch (${actual.chunkSize}/${actual.chunkOverlap} != ${expected.chunkSize}/${expected.chunkOverlap})`,
|
|
114
|
+
expected,
|
|
115
|
+
actual,
|
|
116
|
+
};
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
if (actual.unit !== expected.unit) {
|
|
120
|
+
return {
|
|
121
|
+
needsRechunk: true,
|
|
122
|
+
code: "unit_mismatch",
|
|
123
|
+
reason: `chunk unit mismatch (${actual.unit} != ${expected.unit})`,
|
|
124
|
+
expected,
|
|
125
|
+
actual,
|
|
126
|
+
};
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
return { needsRechunk: false, code: "ok", reason: "ok", expected, actual };
|
|
130
|
+
}
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import { mkdtempSync, rmSync } from "fs";
|
|
3
|
+
import { tmpdir } from "os";
|
|
4
|
+
import { join } from "path";
|
|
5
|
+
import { Client } from "@modelcontextprotocol/sdk/client";
|
|
6
|
+
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
|
|
7
|
+
|
|
8
|
+
function runCli(
|
|
9
|
+
argv: string[],
|
|
10
|
+
opts?: { env?: Record<string, string | undefined> },
|
|
11
|
+
): { exitCode: number; stdout: string; stderr: string } {
|
|
12
|
+
const env: Record<string, string> = {
|
|
13
|
+
...process.env,
|
|
14
|
+
...(opts?.env ?? {}),
|
|
15
|
+
} as any;
|
|
16
|
+
|
|
17
|
+
const proc = Bun.spawnSync([process.execPath, "run", "src/cli.ts", ...argv], {
|
|
18
|
+
env,
|
|
19
|
+
stdout: "pipe",
|
|
20
|
+
stderr: "pipe",
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
return {
|
|
24
|
+
exitCode: proc.exitCode ?? 0,
|
|
25
|
+
stdout: new TextDecoder().decode(proc.stdout),
|
|
26
|
+
stderr: new TextDecoder().decode(proc.stderr),
|
|
27
|
+
};
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
function withTempLibraryPath<T>(fn: (libraryPath: string) => T): T {
|
|
31
|
+
const dir = mkdtempSync(join(tmpdir(), "pdf-brain-cli-contract-"));
|
|
32
|
+
try {
|
|
33
|
+
return fn(dir);
|
|
34
|
+
} finally {
|
|
35
|
+
rmSync(dir, { recursive: true, force: true });
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
async function withTempLibraryPathAsync<T>(
|
|
40
|
+
fn: (libraryPath: string) => Promise<T>,
|
|
41
|
+
): Promise<T> {
|
|
42
|
+
const dir = mkdtempSync(join(tmpdir(), "pdf-brain-cli-contract-"));
|
|
43
|
+
try {
|
|
44
|
+
return await fn(dir);
|
|
45
|
+
} finally {
|
|
46
|
+
rmSync(dir, { recursive: true, force: true });
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// Contract tests for the CLI's machine-readable JSON output: every command
// run with --format json must emit exactly one envelope object with
// ok/command/protocolVersion, a `result` on success or `error` on failure,
// and agent hints (`nextActions`) unless --quiet is passed.
describe("CLI JSON Envelope Contract", () => {
  // Happy path: success envelope fields plus default agent hints.
  test("stats emits a single JSON envelope with nextActions when not --quiet", () =>
    withTempLibraryPath((libraryPath) => {
      const res = runCli(["stats", "--format", "json"], {
        env: {
          PDF_LIBRARY_PATH: libraryPath,
          // Avoid touching any real local Ollama instance during tests.
          OLLAMA_HOST: "http://127.0.0.1:1",
        },
      });

      expect(res.exitCode).toBe(0);
      // Entire stdout must be one JSON object — no banner/log noise before it.
      expect(res.stdout.trim().startsWith("{")).toBe(true);

      const obj = JSON.parse(res.stdout);
      expect(obj.ok).toBe(true);
      expect(obj.command).toBe("stats");
      expect(obj.protocolVersion).toBe(1);
      expect(obj.result).toBeDefined();
      expect(obj.result.libraryPath).toBe(libraryPath);
      // Fresh temp library: all counters start at zero.
      expect(obj.result.documents).toBe(0);
      expect(obj.result.chunks).toBe(0);
      expect(obj.result.embeddings).toBe(0);

      // Agent mode: nextActions should exist by default (unless --quiet)
      expect(Array.isArray(obj.nextActions)).toBe(true);
      expect(obj.nextActions.length).toBeGreaterThan(0);
    }));

  // --quiet must strip agent hints entirely, not just empty them.
  test("stats with --quiet omits nextActions", () =>
    withTempLibraryPath((libraryPath) => {
      const res = runCli(["stats", "--format", "json", "--quiet"], {
        env: {
          PDF_LIBRARY_PATH: libraryPath,
          OLLAMA_HOST: "http://127.0.0.1:1",
        },
      });

      expect(res.exitCode).toBe(0);
      const obj = JSON.parse(res.stdout);
      expect(obj.ok).toBe(true);
      expect(obj.command).toBe("stats");
      expect("nextActions" in obj).toBe(false);
    }));

  // Failure path: bad flag value still produces a parseable error envelope.
  test("invalid --format returns a structured error envelope and non-zero exit code", () =>
    withTempLibraryPath((libraryPath) => {
      const res = runCli(["--format", "wat", "stats"], {
        env: {
          PDF_LIBRARY_PATH: libraryPath,
          OLLAMA_HOST: "http://127.0.0.1:1",
        },
      });

      expect(res.exitCode).not.toBe(0);
      const obj = JSON.parse(res.stdout);
      expect(obj.ok).toBe(false);
      expect(obj.protocolVersion).toBe(1);
      expect(obj.error).toBeDefined();
      expect(obj.error.code).toBe("INVALID_FLAG");
    }));

  // A flag that consumes a value must reject the next flag being swallowed
  // as that value, and the message must name the offending flag.
  test("rechunk flag validation: --max-docs requires a numeric value", () =>
    withTempLibraryPath((libraryPath) => {
      const res = runCli(["rechunk", "--max-docs", "--format", "json"], {
        env: {
          PDF_LIBRARY_PATH: libraryPath,
          OLLAMA_HOST: "http://127.0.0.1:1",
        },
      });

      expect(res.exitCode).not.toBe(0);
      const obj = JSON.parse(res.stdout);
      expect(obj.ok).toBe(false);
      expect(obj.error.code).toBe("INVALID_ARGS");
      expect(String(obj.error.message)).toContain("--max-docs");
    }));

  // Discovery surface: agents bootstrap from `capabilities`, so its command
  // names and schema shapes are part of the public contract.
  test("capabilities is self-describing and includes JSON Schemas", () =>
    withTempLibraryPath((libraryPath) => {
      const res = runCli(["capabilities", "--format", "json", "--quiet"], {
        env: {
          PDF_LIBRARY_PATH: libraryPath,
          OLLAMA_HOST: "http://127.0.0.1:1",
        },
      });

      expect(res.exitCode).toBe(0);
      const obj = JSON.parse(res.stdout);
      expect(obj.ok).toBe(true);
      expect(obj.command).toBe("capabilities");
      expect(obj.protocolVersion).toBe(1);

      const result = obj.result;
      expect(result).toBeDefined();
      expect(result.protocolVersion).toBe(1);
      expect(typeof result.pdfBrainVersion).toBe("string");

      // Command list invariants (agent discovery depends on these names)
      const commandNames = new Set(
        (result.commands as Array<any>).map((c) => String(c.name)),
      );
      expect(commandNames.has("search")).toBe(true);
      expect(commandNames.has("search-pack")).toBe(true);
      expect(commandNames.has("chunk")).toBe(true);
      expect(commandNames.has("doc")).toBe(true);
      expect(commandNames.has("page")).toBe(true);
      expect(commandNames.has("stats")).toBe(true);
      expect(commandNames.has("rechunk")).toBe(true);
      expect(commandNames.has("reindex")).toBe(true);
      expect(commandNames.has("mcp")).toBe(true);

      // Schema invariants (agents can validate/parse these)
      expect(result.schemas).toBeDefined();
      expect(result.schemas.Document).toBeDefined();
      expect(result.schemas.PDFChunk).toBeDefined();
      expect(result.schemas.SearchResult).toBeDefined();
      expect(result.schemas.Config).toBeDefined();

      // Lightweight stability assertions: required field names shouldn't drift.
      const docSchema = result.schemas.Document as any;
      expect(docSchema.type).toBe("object");
      expect(Array.isArray(docSchema.required)).toBe(true);
      expect(docSchema.required).toContain("id");
      expect(docSchema.required).toContain("title");
      expect(docSchema.required).toContain("path");
      expect(docSchema.required).toContain("tags");
    }));
});
|
|
179
|
+
|
|
180
|
+
// Contract test for the MCP server mode: spawn the CLI as an MCP stdio
// server, connect with the official SDK client, and verify that tool calls
// return the same agent envelope (ok/command/protocolVersion/result) in
// `structuredContent` as the plain CLI emits on stdout.
describe("MCP Tool Output Contract", () => {
  test(
    "mcp tools return structuredContent matching the agent envelope schema",
    async () =>
      withTempLibraryPathAsync(async (libraryPath) => {
        // Launch `src/cli.ts mcp --quiet` as a child MCP server over stdio.
        const transport = new StdioClientTransport({
          command: process.execPath,
          args: ["run", "src/cli.ts", "mcp", "--quiet"],
          cwd: process.cwd(),
          stderr: "pipe",
          env: {
            ...process.env,
            PDF_LIBRARY_PATH: libraryPath,
            // Avoid hitting any real local Ollama instance during tests.
            OLLAMA_HOST: "http://127.0.0.1:1",
            // Keep noise down if something logs unexpectedly.
            PDF_BRAIN_LOG_LEVEL: "silent",
          } as any,
        });

        const client = new Client({
          name: "pdf-brain-contract-test",
          version: "0.0.0",
        });

        try {
          await client.connect(transport);

          // Core tools must be advertised for agent discovery.
          const tools = await client.listTools();
          const toolNames = new Set(tools.tools.map((t) => t.name));
          expect(toolNames.has("capabilities")).toBe(true);
          expect(toolNames.has("stats")).toBe(true);
          expect(toolNames.has("search")).toBe(true);

          const call = await client.callTool({ name: "stats", arguments: {} });
          expect(Boolean(call.isError)).toBe(false);
          expect(call.structuredContent).toBeDefined();

          // structuredContent carries the same envelope shape as CLI JSON output.
          const envelope: any = call.structuredContent;
          expect(envelope.ok).toBe(true);
          expect(envelope.command).toBe("stats");
          expect(envelope.protocolVersion).toBe(1);
          expect(envelope.result).toBeDefined();
          expect(envelope.result.libraryPath).toBe(libraryPath);
        } finally {
          // Best-effort teardown: close client then transport, ignoring
          // errors so a failed assertion above still surfaces as the cause.
          try {
            await client.close();
          } catch {
            // ignore
          }
          try {
            await transport.close();
          } catch {
            // ignore
          }
        }
      }),
    // Spawning bun + MCP handshake can be slow on CI; allow extra time.
    { timeout: 20000 },
  );
});
|