llm-kb 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +183 -42
- package/bin/anthropic-5TIU2EED.js +5515 -0
- package/bin/azure-openai-responses-ZVUVMK3G.js +190 -0
- package/bin/chunk-2WV6TQRI.js +4792 -0
- package/bin/chunk-3YMNGUZZ.js +262 -0
- package/bin/chunk-5PYKQQLA.js +14295 -0
- package/bin/chunk-65KFH7OI.js +31 -0
- package/bin/chunk-DHOXVEIR.js +7261 -0
- package/bin/chunk-EAQYK3U2.js +41 -0
- package/bin/chunk-IFS3OKBN.js +428 -0
- package/bin/chunk-LDHOKBJA.js +86 -0
- package/bin/chunk-SLYBG6ZQ.js +32681 -0
- package/bin/chunk-UEODFF7H.js +17 -0
- package/bin/chunk-XCXTZJGO.js +174 -0
- package/bin/chunk-XFV534WU.js +7056 -0
- package/bin/cli.js +30 -4
- package/bin/dist-3YH7P2QF.js +1244 -0
- package/bin/google-JFC43EFJ.js +371 -0
- package/bin/google-gemini-cli-K4XNMYDI.js +712 -0
- package/bin/google-vertex-Y42F254G.js +414 -0
- package/bin/indexer-KSYRIVVN.js +10 -0
- package/bin/mistral-ZU2JS5XZ.js +38406 -0
- package/bin/multipart-parser-CO464TZY.js +371 -0
- package/bin/openai-codex-responses-NW2LELBH.js +712 -0
- package/bin/openai-completions-TW3VKTHO.js +662 -0
- package/bin/openai-responses-VGL522MK.js +198 -0
- package/bin/src-Y22OHE3S.js +1408 -0
- package/package.json +6 -1
- package/PHASE2_SPEC.md +0 -274
- package/PHASE3_SPEC.md +0 -245
- package/PHASE4_SPEC.md +0 -358
- package/SPEC.md +0 -275
- package/plan.md +0 -300
- package/src/auth.ts +0 -55
- package/src/cli.ts +0 -257
- package/src/config.ts +0 -61
- package/src/eval.ts +0 -548
- package/src/indexer.ts +0 -152
- package/src/md-stream.ts +0 -133
- package/src/pdf.ts +0 -119
- package/src/query.ts +0 -408
- package/src/resolve-kb.ts +0 -19
- package/src/scan.ts +0 -59
- package/src/session-store.ts +0 -22
- package/src/session-watcher.ts +0 -89
- package/src/trace-builder.ts +0 -168
- package/src/tui-display.ts +0 -281
- package/src/utils.ts +0 -17
- package/src/watcher.ts +0 -87
- package/src/wiki-updater.ts +0 -136
- package/test/auth.test.ts +0 -65
- package/test/config.test.ts +0 -96
- package/test/md-stream.test.ts +0 -98
- package/test/resolve-kb.test.ts +0 -33
- package/test/scan.test.ts +0 -65
- package/test/trace-builder.test.ts +0 -215
- package/tsconfig.json +0 -14
- package/vitest.config.ts +0 -8
package/src/eval.ts
DELETED
|
@@ -1,548 +0,0 @@
|
|
|
1
|
-
import { getModels, completeSimple } from "@mariozechner/pi-ai";
|
|
2
|
-
import { AuthStorage } from "@mariozechner/pi-coding-agent";
|
|
3
|
-
import { readFile, readdir, writeFile, mkdir } from "node:fs/promises";
|
|
4
|
-
import { existsSync } from "node:fs";
|
|
5
|
-
import { join, basename } from "node:path";
|
|
6
|
-
import { homedir } from "node:os";
|
|
7
|
-
|
|
8
|
-
// ── Types ───────────────────────────────────────────────────────────────────
|
|
9
|
-
|
|
10
|
-
/** One question/answer exchange reconstructed from a session log. */
interface SessionQA {
  /** Name of the session file the exchange was parsed from. */
  sessionFile: string;
  /** Text of the user message that opened the exchange. */
  question: string;
  /** Concatenated "thinking" blocks of the assistant messages. */
  thinking: string;
  /** What the agent actually saw: path plus the (capped) content of each read. */
  filesRead: Array<{ path: string; content: string }>;
  /** All sources that were available to the agent. */
  filesAvailable: Array<string>;
  /** Available sources that were not read during this exchange. */
  filesSkipped: Array<string>;
  /** Concatenated text blocks of the assistant messages. */
  answer: string;
  /** Model id reported on the assistant message ("" when absent). */
  model: string;
  /** Time between the user message and the last assistant message, in ms. */
  durationMs: number;
}
|
|
21
|
-
|
|
22
|
-
/** A single finding produced while evaluating a question/answer exchange. */
interface EvalIssue {
  /** Category of the finding. */
  type:
    | "citation"
    | "contradiction"
    | "wiki-gap"
    | "wasted-read"
    | "index-issue";
  /** How serious the finding is. */
  severity:
    | "error"
    | "warning"
    | "info";
  /** Session file the finding belongs to. */
  sessionFile: string;
  /** Question of the exchange the finding belongs to. */
  question: string;
  /** What is wrong. */
  detail: string;
  /** What to fix. */
  recommendation: string;
}
|
|
30
|
-
|
|
31
|
-
/** Aggregate numbers computed over all evaluated exchanges. */
interface EvalMetrics {
  /** Number of distinct session files that contributed exchanges. */
  totalSessions: number;
  /** Number of question/answer exchanges. */
  totalQAs: number;
  /** Mean exchange duration in milliseconds (0 when there are no exchanges). */
  avgDurationMs: number;
  /** Exchanges answered without reading source files. */
  wikiHits: number;
  /** Exchanges that needed source files. */
  sourceReads: number;
  /** Total number of source-file reads. */
  totalFilesRead: number;
  /** file → read count */
  uniqueFilesRead: Map<string, number>;
  /** Files read but not cited in the answer. */
  wastedReads: number;
}
|
|
41
|
-
|
|
42
|
-
/** Everything an eval run produces. */
export interface EvalResult {
  /** Aggregate numbers over all evaluated exchanges. */
  metrics: EvalMetrics;
  /** Findings reported for the evaluated exchanges. */
  issues: Array<EvalIssue>;
  /** Topics reported as missing from the wiki. */
  wikiGaps: Array<string>;
  /** ISO timestamp of the eval run. */
  timestamp: string;
  /** Markdown block to inject into AGENTS.md. */
  agentsInsights: string;
}
|
|
49
|
-
|
|
50
|
-
// ── Session parser ──────────────────────────────────────────────────────────
|
|
51
|
-
|
|
52
|
-
/**
 * Reconstructs question/answer exchanges from the `.jsonl` session logs in `sessionsDir`.
 *
 * Only sessions whose `session_info` name starts with "query:" are considered.
 * An exchange starts at a user message and collects the assistant text,
 * thinking blocks and successful `read` tool results that follow it; it is kept
 * only when both a question and an answer were found.
 *
 * @param sessionsDir Directory holding the session logs; a missing directory yields `[]`.
 * @param sourcesDir  Directory whose `.md` files are treated as the available sources.
 * @param limit       When truthy, only the first `limit` session files (in reverse
 *                    name order) are parsed.
 * @returns The exchanges of all parsed sessions, in parse order.
 */
async function parseSessionsForEval(sessionsDir: string, sourcesDir: string, limit?: number): Promise<SessionQA[]> {
  if (!existsSync(sessionsDir)) return [];

  // Reverse lexicographic order; this is "newest first" as long as the file
  // names start with a sortable timestamp — TODO confirm against the writer.
  const sessionFiles = (await readdir(sessionsDir))
    .filter((f) => f.endsWith(".jsonl"))
    .sort()
    .reverse();

  const files = limit ? sessionFiles.slice(0, limit) : sessionFiles;
  const qas: SessionQA[] = [];

  // Get available source files
  let filesAvailable: string[] = [];
  try {
    filesAvailable = (await readdir(sourcesDir)).filter((f) => f.endsWith(".md"));
  } catch {
    // Best effort: without a readable sources directory nothing is reported as skipped.
  }

  for (const file of files) {
    try {
      const raw = await readFile(join(sessionsDir, file), "utf-8");

      const entries: any[] = [];
      for (const line of raw.trim().split("\n").filter(Boolean)) {
        try {
          const parsed = JSON.parse(line);
          // A bare `null`/number line is valid JSON but not a log entry; keeping
          // it would make the property accesses below throw and drop the file.
          if (parsed !== null && typeof parsed === "object") entries.push(parsed);
        } catch {
          // Skip lines that are not valid JSON.
        }
      }

      // Check if this is a query session
      const sessionInfo = entries.find((e) => e.type === "session_info");
      const name: string = sessionInfo?.name ?? "";
      if (!name.startsWith("query:")) continue;

      const messages = entries.filter((e) => e.type === "message");

      // Index every `read` tool call by id once, so each tool result is resolved
      // with a lookup instead of a rescan of the whole session.
      const readPathByCallId = new Map<string, string>();
      for (const entry of messages) {
        if (entry.message?.role !== "assistant") continue;
        for (const block of entry.message?.content ?? []) {
          if (block.type === "toolCall" && block.name === "read") {
            readPathByCallId.set(block.id, block.arguments?.path ?? "");
          }
        }
      }

      let currentQuestion = "";
      let currentThinking = "";
      let currentFilesRead: { path: string; content: string }[] = [];
      let currentAnswer = "";
      let currentModel = "";
      let startTs = 0;
      let endTs = 0;

      // Stores the exchange collected so far, if it has both a question and an answer.
      const flush = (): void => {
        if (!currentQuestion || !currentAnswer) return;
        // Compare whole file names: a suffix test would treat "notes.md" as
        // read whenever "meeting-notes.md" was read.
        const readNames = new Set(currentFilesRead.map((r) => basename(r.path)));
        const elapsed = endTs - startTs;
        qas.push({
          sessionFile: file,
          question: currentQuestion,
          thinking: currentThinking,
          filesRead: currentFilesRead,
          filesAvailable,
          filesSkipped: filesAvailable.filter((f) => !readNames.has(f)),
          answer: currentAnswer,
          model: currentModel,
          // Missing or unparsable timestamps give NaN, which would poison the average.
          durationMs: Number.isFinite(elapsed) ? elapsed : 0,
        });
      };

      for (const entry of messages) {
        const msg = entry.message;
        if (!msg) continue;

        if (msg.role === "user") {
          // A new question closes the previous exchange.
          flush();
          currentQuestion = extractText(msg.content);
          currentThinking = "";
          currentFilesRead = [];
          currentAnswer = "";
          startTs = new Date(entry.timestamp).getTime();
        } else if (msg.role === "assistant") {
          currentModel = msg.model ?? "";
          endTs = new Date(entry.timestamp).getTime();

          for (const block of msg.content ?? []) {
            if (block.type === "thinking") currentThinking += block.thinking;
            if (block.type === "text") currentAnswer += block.text;
          }
        } else if (msg.role === "toolResult" && !msg.isError) {
          const path = readPathByCallId.get(msg.toolCallId);
          const content = extractText(msg.content);
          if (path && content) {
            currentFilesRead.push({ path, content: content.slice(0, 2000) }); // cap content size
          }
        }
      }

      // Save the last Q&A
      flush();
    } catch {
      // Skip malformed or unreadable session files
    }
  }

  return qas;
}
|
|
177
|
-
|
|
178
|
-
// ── Metrics calculator ──────────────────────────────────────────────────────
|
|
179
|
-
|
|
180
|
-
/**
 * Aggregates read and timing statistics over all exchanges.
 *
 * Reads of files named `index.md` or `wiki.md` are not counted as source reads.
 * An exchange with no source reads is a wiki hit. A source read is "wasted"
 * when the answer mentions neither the file name nor the name without its
 * `.md` extension.
 *
 * @param qas Exchanges to aggregate; may be empty.
 * @returns The aggregated metrics; `avgDurationMs` is 0 for an empty input.
 */
function calculateMetrics(qas: SessionQA[]): EvalMetrics {
  const uniqueFiles = new Map<string, number>();
  let totalFilesRead = 0;
  let wikiHits = 0;
  let sourceReads = 0;
  let wastedReads = 0;
  let totalDuration = 0;

  const uniqueSessions = new Set(qas.map((q) => q.sessionFile));

  for (const qa of qas) {
    totalDuration += qa.durationMs;

    // Match on the file name itself: a substring test would also exclude real
    // sources such as "company-wiki.md" or "reindex.md".
    const sourceFilesRead = qa.filesRead.filter((f) => {
      const fileName = basename(f.path);
      return fileName !== "index.md" && fileName !== "wiki.md";
    });

    if (sourceFilesRead.length === 0) {
      wikiHits++;
    } else {
      sourceReads++;
    }

    for (const f of sourceFilesRead) {
      totalFilesRead++;
      const name = basename(f.path);
      uniqueFiles.set(name, (uniqueFiles.get(name) ?? 0) + 1);

      // Strip only a trailing ".md". The stem is a prefix of the full name, so
      // an answer that cites the full name necessarily contains the stem too.
      const stem = name.replace(/\.md$/, "");
      if (!qa.answer.includes(stem)) {
        wastedReads++;
      }
    }
  }

  return {
    totalSessions: uniqueSessions.size,
    totalQAs: qas.length,
    avgDurationMs: qas.length > 0 ? totalDuration / qas.length : 0,
    wikiHits,
    sourceReads,
    totalFilesRead,
    uniqueFilesRead: uniqueFiles,
    wastedReads,
  };
}
|
|
226
|
-
|
|
227
|
-
// ── LLM judge ───────────────────────────────────────────────────────────────
|
|
228
|
-
|
|
229
|
-
/**
 * Looks up the "anthropic" API key.
 *
 * Order: the supplied `authStorage`; otherwise a store created from
 * `~/.pi/agent/auth.json` when that file exists; otherwise the
 * `ANTHROPIC_API_KEY` environment variable.
 *
 * @param authStorage Optional store to query instead of the default locations.
 * @returns The key, or `undefined` when the consulted location has none.
 */
async function resolveApiKey(authStorage?: AuthStorage): Promise<string | undefined> {
  // An explicitly supplied store always wins.
  if (authStorage) {
    return authStorage.getApiKey("anthropic");
  }

  const authFile = join(homedir(), ".pi", "agent", "auth.json");
  if (!existsSync(authFile)) {
    return process.env.ANTHROPIC_API_KEY;
  }

  return AuthStorage.create(authFile).getApiKey("anthropic");
}
|
|
238
|
-
|
|
239
|
-
/**
 * Turns one raw finding from the judge's JSON into an `EvalIssue`.
 * Returns `null` for entries that are not objects. Unrecognised or missing
 * `type` / `severity` values fall back to "citation" / "warning".
 */
function toEvalIssue(raw: unknown, qa: SessionQA): EvalIssue | null {
  if (typeof raw !== "object" || raw === null) return null;
  const finding = raw as Record<string, unknown>;

  const knownTypes = ["citation", "contradiction", "wiki-gap", "wasted-read", "index-issue"] as const;
  const knownSeverities = ["error", "warning", "info"] as const;

  const normalise = (value: unknown): string =>
    typeof value === "string" ? value.trim().toLowerCase() : "";
  // Later code calls string methods on these fields, so never let a non-string through.
  const asText = (value: unknown): string => {
    if (typeof value === "string") return value;
    return value == null ? "" : JSON.stringify(value) ?? "";
  };

  const wantedType = normalise(finding.type);
  const wantedSeverity = normalise(finding.severity);

  return {
    type: knownTypes.find((t) => t === wantedType) ?? "citation",
    severity: knownSeverities.find((s) => s === wantedSeverity) ?? "warning",
    sessionFile: qa.sessionFile,
    question: qa.question,
    detail: asText(finding.detail),
    recommendation: asText(finding.recommendation),
  };
}

/**
 * Asks an LLM judge to review one exchange and returns its findings.
 *
 * The judge sees the question, the first 3000 characters of the answer, the
 * files the agent read and the names of the files it skipped. It is asked to
 * reply with a JSON array of findings.
 *
 * @param qa      Exchange to review.
 * @param apiKey  API key passed to the completion call.
 * @param modelId Id of the "anthropic" model to use.
 * @returns The findings; `[]` when the model id is unknown, the call fails, or
 *          the reply contains no parsable JSON array.
 */
async function judgeQA(
  qa: SessionQA,
  apiKey: string,
  modelId: string
): Promise<EvalIssue[]> {
  const issues: EvalIssue[] = [];
  const model = getModels("anthropic").find((m) => m.id === modelId);
  if (!model) return issues;

  // Build context for the judge
  const filesSummary = qa.filesRead
    .map((f) => `File: ${basename(f.path)}\nContent (first 2000 chars):\n${f.content}`)
    .join("\n\n---\n\n");

  const skippedList = qa.filesSkipped.join(", ") || "none";

  const prompt = `You are an eval judge for a knowledge base Q&A system.

QUESTION: ${qa.question}

ANSWER:
${qa.answer.slice(0, 3000)}

FILES READ BY AGENT:
${filesSummary || "None — answered from wiki cache"}

FILES AVAILABLE BUT SKIPPED: ${skippedList}

---

Check for these issues and return a JSON array of findings. Each finding has:
- "type": one of "citation", "contradiction", "wiki-gap", "wasted-read"
- "severity": "error" or "warning"
- "detail": what's wrong (one sentence)
- "recommendation": what to fix (one sentence)

Checks:
1. CITATION: Does the answer cite specific sources? If so, does the file content support the claims?
2. CONTRADICTION: Does the answer say anything that contradicts the file content?
3. WIKI-GAP: If the agent read source files (not just wiki), what topic should be added to the wiki so next time it can answer without reading files?
4. WASTED-READ: Were any files read but not actually used in the answer?

Return ONLY a JSON array. If no issues found, return [].
Example: [{"type":"wiki-gap","severity":"warning","detail":"Electronic evidence topic not in wiki","recommendation":"Add electronic evidence section to wiki"}]`;

  try {
    const result = await completeSimple(
      model,
      {
        systemPrompt: "You are a precise QA evaluator. Return only valid JSON arrays. No explanation.",
        messages: [{ role: "user", content: prompt, timestamp: Date.now() }],
      },
      { apiKey }
    );

    const text = result.content
      .filter((b) => b.type === "text")
      .map((b) => (b as any).text)
      .join("")
      .trim();

    // The reply may wrap the array in prose or a code fence; take the span from
    // the first "[" to the last "]".
    const jsonMatch = text.match(/\[[\s\S]*\]/);
    if (jsonMatch) {
      const findings: unknown = JSON.parse(jsonMatch[0]);
      if (Array.isArray(findings)) {
        for (const raw of findings) {
          // Model output is untrusted: validate each entry so one bad finding
          // does not discard the rest.
          const issue = toEvalIssue(raw, qa);
          if (issue) issues.push(issue);
        }
      }
    }
  } catch {
    // Judge call failed or the reply was not valid JSON — non-fatal, the
    // exchange simply gets no findings.
  }

  return issues;
}
|
|
321
|
-
|
|
322
|
-
// ── Report writer ───────────────────────────────────────────────────────────
|
|
323
|
-
|
|
324
|
-
/**
 * Renders an eval result as a markdown report.
 *
 * Sections, in order: header, performance table, the ten most-read files,
 * errors, warnings, remaining issues (e.g. severity "info"), wiki gaps. A
 * "No issues found" section is written only when there are no issues of any
 * severity and no wiki gaps.
 *
 * @param result Eval result to render.
 * @returns The report as a single markdown string.
 */
function buildReport(result: EvalResult): string {
  const { metrics, issues, wikiGaps } = result;
  const lines: string[] = [];

  const hitRate = metrics.totalQAs > 0 ? Math.round(metrics.wikiHits / metrics.totalQAs * 100) : 0;

  lines.push(
    `# Eval Report`,
    ``,
    `> ${metrics.totalQAs} queries across ${metrics.totalSessions} sessions · ${result.timestamp}`,
    ``
  );

  // Performance
  lines.push(
    `## Performance`,
    ``,
    `| Metric | Value |`,
    `|---|---|`,
    `| Total queries | ${metrics.totalQAs} |`,
    `| Avg duration | ${(metrics.avgDurationMs / 1000).toFixed(1)}s |`,
    `| Wiki hits (no file reads) | ${metrics.wikiHits} (${hitRate}%) |`,
    `| Needed source files | ${metrics.sourceReads} |`,
    `| Total file reads | ${metrics.totalFilesRead} |`,
    `| Wasted reads | ${metrics.wastedReads} |`,
    ``
  );

  // Most read files
  if (metrics.uniqueFilesRead.size > 0) {
    const sorted = [...metrics.uniqueFilesRead.entries()].sort((a, b) => b[1] - a[1]);
    lines.push(`### Most Read Files`, ``, `| File | Times Read |`, `|---|---|`);
    for (const [file, count] of sorted.slice(0, 10)) {
      lines.push(`| ${file} | ${count} |`);
    }
    lines.push(``);
  }

  // Writes one severity section; an empty group writes nothing.
  const pushIssueSection = (heading: string, group: EvalIssue[]): void => {
    if (group.length === 0) return;
    lines.push(`## ${heading} (${group.length})`, ``);
    for (const issue of group) {
      lines.push(
        `### ${issue.type}: ${issue.detail}`,
        `- **Query:** ${issue.question}`,
        `- **Recommendation:** ${issue.recommendation}`,
        ``
      );
    }
  };

  // Issues
  const errors = issues.filter((i) => i.severity === "error");
  const warnings = issues.filter((i) => i.severity === "warning");
  // Everything else (severity "info", or anything unexpected) still has to be
  // shown; otherwise the report could claim "No issues found" while issues exist.
  const others = issues.filter((i) => i.severity !== "error" && i.severity !== "warning");

  pushIssueSection(`🔴 Errors`, errors);
  pushIssueSection(`🟡 Warnings`, warnings);
  pushIssueSection(`🔵 Info`, others);

  // Wiki gaps
  if (wikiGaps.length > 0) {
    lines.push(`## 📝 Wiki Gaps (auto-fixable)`, ``);
    for (const gap of wikiGaps) {
      lines.push(`- ${gap}`);
    }
    lines.push(``);
  }

  if (issues.length === 0 && wikiGaps.length === 0) {
    lines.push(`## ✅ No issues found`, ``);
  }

  return lines.join("\n");
}
|
|
402
|
-
|
|
403
|
-
// ── Agents insights (injected into AGENTS.md) ──────────────────────────────
|
|
404
|
-
|
|
405
|
-
/**
 * Builds the markdown block of eval insights that is injected into AGENTS.md.
 *
 * Sections, in order: up to 15 wiki gaps, behaviour fixes, up to 5 files read
 * three or more times, and a performance note. The behaviour-fixes section is
 * written only when at least one piece of advice applies.
 *
 * @param result Eval result to summarise; `timestamp` is expected to start
 *               with a `YYYY-MM-DD` date.
 * @returns The insights as a single markdown string.
 */
function buildAgentsInsights(result: EvalResult): string {
  const { metrics, issues, wikiGaps } = result;
  const lines: string[] = [];

  lines.push(`## Eval Insights (auto-generated ${result.timestamp.slice(0, 10)})`, ``);

  // Wiki gaps — tell the agent to fill these
  if (wikiGaps.length > 0) {
    lines.push(`### Wiki Gaps — add to wiki when users ask about these topics`);
    for (const gap of wikiGaps.slice(0, 15)) { // cap at 15 to avoid context bloat
      lines.push(`- ${gap}`);
    }
    lines.push(``);
  }

  // Behaviour fixes. Each piece of advice is decided on its own, so the
  // wasted-reads hint is no longer tied to citation errors or contradictions,
  // and the heading is never written without at least one bullet under it.
  const fixes: string[] = [];
  const citationErrors = issues.filter((i) => i.type === "citation" && i.severity === "error");
  if (citationErrors.some((i) => i.detail.includes("wiki cache"))) {
    fixes.push(`- Do NOT claim "I read the actual document" when answering from wiki. Say "Based on the knowledge wiki" instead.`);
  }
  if (issues.some((i) => i.type === "contradiction")) {
    fixes.push(`- Double-check claims against source text before stating them as fact.`);
  }
  if (metrics.wastedReads > 10) {
    fixes.push(`- Be more selective with file reads. Last eval found ${metrics.wastedReads} wasted reads (files read but not cited).`);
  }
  if (fixes.length > 0) {
    lines.push(`### Behaviour Fixes`, ...fixes, ``);
  }

  // Most-read files — tell agent to prefer wiki for these
  const heavy = [...metrics.uniqueFilesRead.entries()]
    .sort((a, b) => b[1] - a[1])
    .filter(([, count]) => count >= 3);
  if (heavy.length > 0) {
    lines.push(`### Heavily-Read Files — prefer wiki knowledge over re-reading these`);
    for (const [file, count] of heavy.slice(0, 5)) {
      lines.push(`- ${file} (read ${count} times)`);
    }
    lines.push(``);
  }

  // Performance note
  const hitRate = metrics.totalQAs > 0 ? Math.round(metrics.wikiHits / metrics.totalQAs * 100) : 0;
  lines.push(
    `### Performance`,
    `- Wiki hit rate: ${hitRate}% (target: 80%+)`,
    `- Avg query time: ${(metrics.avgDurationMs / 1000).toFixed(1)}s`,
    ``
  );

  return lines.join("\n");
}
|
|
460
|
-
|
|
461
|
-
// ── Main eval function ──────────────────────────────────────────────────────
|
|
462
|
-
|
|
463
|
-
/**
 * Evaluates the recorded query sessions of a knowledge base.
 *
 * Steps: parse the sessions under `<kbRoot>/.llm-kb/sessions`, compute
 * metrics, run the LLM judge on every exchange (skipped when no API key can
 * be resolved), then write `eval-report.md` to `.llm-kb/wiki/outputs` and
 * `eval-insights.md` to `.llm-kb`.
 *
 * When no exchanges are found an empty result is returned and nothing is written.
 *
 * @param kbRoot  Root folder of the knowledge base.
 * @param options `authStorage` – store used to resolve the API key;
 *                `last` – limit on the number of session files parsed;
 *                `onProgress` – receives progress messages.
 * @returns The eval result, including the generated insights block.
 */
export async function runEval(
  kbRoot: string,
  options: { authStorage?: AuthStorage; last?: number; onProgress?: (msg: string) => void }
): Promise<EvalResult> {
  const sessionsDir = join(kbRoot, ".llm-kb", "sessions");
  const sourcesDir = join(kbRoot, ".llm-kb", "wiki", "sources");
  const log = options.onProgress ?? (() => {});

  // 1. Parse sessions
  log("Reading sessions...");
  const qas = await parseSessionsForEval(sessionsDir, sourcesDir, options.last);
  log(`Found ${qas.length} Q&A exchanges across sessions`);

  if (qas.length === 0) {
    return {
      metrics: { totalSessions: 0, totalQAs: 0, avgDurationMs: 0, wikiHits: 0, sourceReads: 0, totalFilesRead: 0, uniqueFilesRead: new Map(), wastedReads: 0 },
      issues: [],
      wikiGaps: [],
      timestamp: new Date().toISOString(),
      // Required by EvalResult; without it callers would read `undefined` here.
      agentsInsights: "",
    };
  }

  // 2. Calculate metrics
  log("Calculating metrics...");
  const metrics = calculateMetrics(qas);

  // 3. Run LLM judge on each Q&A (one call at a time)
  const apiKey = await resolveApiKey(options.authStorage);
  const allIssues: EvalIssue[] = [];

  if (apiKey) {
    const modelId = "claude-haiku-4-5";
    for (const [i, qa] of qas.entries()) {
      log(`Judging ${i + 1}/${qas.length}: "${qa.question.slice(0, 50)}..."`);
      allIssues.push(...(await judgeQA(qa, apiKey, modelId)));
    }
  } else {
    log("No API key — skipping LLM judge checks");
  }

  // 4. Extract wiki gaps; they are reported separately from the other issues
  const wikiGaps = allIssues
    .filter((i) => i.type === "wiki-gap")
    .map((i) => i.detail);

  const result: EvalResult = {
    metrics,
    issues: allIssues.filter((i) => i.type !== "wiki-gap"),
    wikiGaps,
    timestamp: new Date().toISOString(),
    agentsInsights: "",
  };

  // 5. Build agents insights (injected into AGENTS.md on next query)
  result.agentsInsights = buildAgentsInsights(result);

  // 6. Write report + insights file
  log("Writing eval report + insights...");
  const outputsDir = join(kbRoot, ".llm-kb", "wiki", "outputs");
  await mkdir(outputsDir, { recursive: true });

  const report = buildReport(result);
  await writeFile(join(outputsDir, "eval-report.md"), report, "utf-8");

  // Save insights to a file that query.ts reads and injects into AGENTS.md
  await writeFile(join(kbRoot, ".llm-kb", "eval-insights.md"), result.agentsInsights, "utf-8");
  log("Insights saved to .llm-kb/eval-insights.md (injected into next query)");

  return result;
}
|
|
534
|
-
|
|
535
|
-
// ── Helpers ─────────────────────────────────────────────────────────────────
|
|
536
|
-
|
|
537
|
-
/**
 * Pulls the plain text out of a message `content` value.
 *
 * A string is returned unchanged. For an array, the `text` of every block
 * whose `type` is "text" is concatenated and the result trimmed. Anything
 * else yields "".
 */
function extractText(content: any): string {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return "";

  const pieces = content.flatMap((block) => (block.type === "text" ? [block.text ?? ""] : []));
  return pieces.join("").trim();
}
|
package/src/indexer.ts
DELETED
|
@@ -1,152 +0,0 @@
|
|
|
1
|
-
import {
|
|
2
|
-
createAgentSession,
|
|
3
|
-
createBashTool,
|
|
4
|
-
createReadTool,
|
|
5
|
-
createWriteTool,
|
|
6
|
-
DefaultResourceLoader,
|
|
7
|
-
SessionManager,
|
|
8
|
-
SettingsManager,
|
|
9
|
-
AuthStorage,
|
|
10
|
-
} from "@mariozechner/pi-coding-agent";
|
|
11
|
-
import { getModels } from "@mariozechner/pi-ai";
|
|
12
|
-
import { readdir, readFile } from "node:fs/promises";
|
|
13
|
-
import { createKBSession } from "./session-store.js";
|
|
14
|
-
import { getNodeModulesPath } from "./utils.js";
|
|
15
|
-
import { join } from "node:path";
|
|
16
|
-
|
|
17
|
-
/**
 * Produces the AGENTS.md text used for an indexing run.
 *
 * The text lists every `.md` entry of `files` as an available parsed source,
 * explains how to read .docx/.xlsx/.pptx files, and describes the expected
 * index file.
 *
 * @param sourcesDir Not used by this function.
 * @param files      Directory entries; only names ending in ".md" are listed.
 * @returns The AGENTS.md content.
 */
function buildAgentsContent(sourcesDir: string, files: string[]): string {
  const bullets: string[] = [];
  for (const entry of files) {
    if (entry.endsWith(".md")) {
      bullets.push(` - ${entry}`);
    }
  }
  const sourceList = bullets.join("\n");

  return `# llm-kb Knowledge Base

## How to access documents

### PDFs (pre-parsed)
PDFs have been parsed to markdown with bounding boxes.
Read the markdown versions in \`.llm-kb/wiki/sources/\` instead of the raw PDFs.

Available parsed sources:
${sourceList}

### Other file types (Excel, Word, PowerPoint)
You have bash and read tools. Use bash to run Node.js scripts.
Libraries are pre-installed via require().

For .docx (structured XML — ZIP containing word/document.xml):
const AdmZip = require('adm-zip');
const zip = new AdmZip('file.docx');
const xml = zip.readAsText('word/document.xml');
// Parse XML to extract headings and first paragraphs for summary

For .xlsx use exceljs:
const ExcelJS = require('exceljs');
const wb = new ExcelJS.Workbook();
await wb.xlsx.readFile('file.xlsx');
const sheet = wb.getWorksheet(1);

For .pptx use officeparser:
const officeparser = require('officeparser');
const text = await officeparser.parseOfficeAsync('file.pptx');

## Index file
Write the index to \`.llm-kb/wiki/index.md\`.

The index should be a markdown file with:
1. A title and last-updated timestamp
2. A summary table with columns: Source, Type, Pages/Size, Summary, Key Topics
3. Each source gets a one-line summary (read the first ~500 chars of each file to generate it)
4. Total word count across all sources
`;
}
|
|
64
|
-
|
|
65
|
-
/**
 * Runs an agent session that writes `.llm-kb/wiki/index.md` and returns the
 * generated index.
 *
 * The agent gets read, bash and write tools rooted at `folder`, plus an
 * in-memory AGENTS.md describing the parsed sources. The index is read back
 * from `<sourcesDir>/../index.md`.
 *
 * Side effect: `process.env.NODE_PATH` is overwritten so the agent's bash
 * scripts can load the bundled libraries.
 *
 * @param folder      Working directory of the agent session.
 * @param sourcesDir  Directory containing the parsed `.md` sources.
 * @param onOutput    Receives the assistant's streamed text deltas.
 * @param authStorage Optional credentials store passed to the session.
 * @param modelId     Optional id of an "anthropic" model; an unknown id is ignored.
 * @returns The content of the generated index file.
 * @throws Error when `sourcesDir` holds no `.md` file, or when the index file
 *         cannot be read after the run.
 */
export async function buildIndex(
  folder: string,
  sourcesDir: string,
  onOutput?: (text: string) => void,
  authStorage?: AuthStorage,
  modelId?: string
): Promise<string> {
  // List source files
  const files = await readdir(sourcesDir);
  if (!files.some((f) => f.endsWith(".md"))) {
    throw new Error("No source files found to index");
  }

  // Build AGENTS.md content
  const agentsContent = buildAgentsContent(sourcesDir, files);

  // Set NODE_PATH so agent's bash scripts can use bundled libraries
  process.env.NODE_PATH = getNodeModulesPath();

  const loader = new DefaultResourceLoader({
    cwd: folder,
    agentsFilesOverride: (current) => ({
      agentsFiles: [
        ...current.agentsFiles,
        { path: ".llm-kb/AGENTS.md", content: agentsContent },
      ],
    }),
  });
  await loader.reload();

  const model = modelId
    ? getModels("anthropic").find((m) => m.id === modelId)
    : undefined;

  const { session } = await createAgentSession({
    cwd: folder,
    resourceLoader: loader,
    tools: [
      createReadTool(folder),
      createBashTool(folder),
      createWriteTool(folder),
    ],
    sessionManager: await createKBSession(folder),
    settingsManager: SettingsManager.inMemory({
      compaction: { enabled: false },
    }),
    ...(authStorage ? { authStorage } : {}),
    ...(model ? { model } : {}),
  });

  // From here on the session must be disposed on every path, including a
  // failing prompt, so the remaining work runs under try/finally.
  try {
    // Subscribe to streaming output
    if (onOutput) {
      session.subscribe((event) => {
        if (
          event.type === "message_update" &&
          event.assistantMessageEvent.type === "text_delta"
        ) {
          onOutput(event.assistantMessageEvent.delta);
        }
      });
    }

    // Tag the session so the session-watcher can identify it as an index run
    session.setSessionName(`index: ${new Date().toISOString()}`);

    // Build the prompt
    const prompt = `Read each file in .llm-kb/wiki/sources/ (one at a time, just the first 500 characters of each).
Then write .llm-kb/wiki/index.md with a summary table of all sources.

Include: Source filename, Type (PDF/Excel/Word/etc), Pages (from the JSON if available), a one-line summary, and key topics.
Add a total word count estimate at the bottom.`;

    await session.prompt(prompt);

    // Read the generated index
    const indexPath = join(sourcesDir, "..", "index.md");
    try {
      return await readFile(indexPath, "utf-8");
    } catch {
      throw new Error("Agent did not create index.md");
    }
  } finally {
    session.dispose();
  }
}
|