@titan-design/brain 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +96 -40
- package/dist/adapters-ZNJ4FL2E.js +14 -0
- package/dist/brain-service-2QO6JM3Z.js +11 -0
- package/dist/chunk-4SD4JRLS.js +840 -0
- package/dist/chunk-BDNH2E2O.js +112 -0
- package/dist/chunk-PO3GJPIC.js +66 -0
- package/dist/chunk-QL2GPXP6.js +1803 -0
- package/dist/chunk-ZVXSW52A.js +307 -0
- package/dist/cli.js +10942 -3829
- package/dist/command-resolution-FJHE2YBQ.js +134 -0
- package/dist/file-scanner-LBBH5I44.js +12 -0
- package/dist/search-HNUALOXQ.js +14 -0
- package/package.json +4 -1
- package/scripts/diagnostic/assemble.ts +384 -0
- package/scripts/diagnostic/quality-gate.sh +60 -0
- package/scripts/diagnostic/run.sh +352 -0
- package/scripts/diagnostic/schema.json +54 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
// src/modules/pm/engine/command-resolution.ts
|
|
2
|
+
// Command names that fuzzyMatch() compares unrecognised input against.
// Order matters: on a distance tie, the earliest entry wins.
var KNOWN_COMMANDS = [
  "task", "workstream", "project", "waves", "next", "dispatch",
  "briefing", "context", "verify", "audit", "check", "onboard",
  "relate", "activity", "import", "capture", "setup", "decision",
  "prompt", "show", "claim", "orchestrate", "list"
];
|
|
27
|
+
// Operation names collected under the "write" label; exported for use by other modules.
var WRITE_OPERATIONS = /* @__PURE__ */ new Set([
  "add", "update", "delete", "claim", "complete",
  "block", "done", "cancel", "start", "release",
  "unblock", "init", "onboard", "import", "relate"
]);
|
|
44
|
+
// Help-menu layout: group label -> names printed on that group's row by buildHelpMenu().
var HELP_GROUPS = {
  Project: [
    "init", "list", "status", "use", "show", "delete"
  ],
  Workstream: [
    "add", "list", "show"
  ],
  Task: [
    "add", "list", "show", "update", "claim", "complete", "block"
  ],
  Planning: [
    "waves", "next", "dispatch", "briefing"
  ],
  Context: [
    "context", "audit", "check"
  ],
  Data: [
    "onboard", "relate", "activity", "import", "capture"
  ]
};
|
|
52
|
+
/**
 * Levenshtein edit distance between two strings, computed with a full
 * (a.length + 1) x (b.length + 1) dynamic-programming table.
 *
 * @param {string} a - first string
 * @param {string} b - second string
 * @returns {number} minimum number of single-element insertions, deletions
 *   and substitutions that turn `a` into `b`
 */
function levenshtein(a, b) {
  const rows = a.length;
  const cols = b.length;

  // First column: distance from each prefix of `a` to the empty string.
  const table = Array.from({ length: rows + 1 }, (_, r) => [r]);
  // First row: distance from the empty string to each prefix of `b`.
  for (let c = 0; c <= cols; c += 1) {
    table[0][c] = c;
  }

  for (let r = 1; r <= rows; r += 1) {
    const above = table[r - 1];
    const here = table[r];
    for (let c = 1; c <= cols; c += 1) {
      const substitution = above[c - 1] + (a[r - 1] === b[c - 1] ? 0 : 1);
      here[c] = Math.min(above[c] + 1, here[c - 1] + 1, substitution);
    }
  }

  return table[rows][cols];
}
|
|
64
|
+
/**
 * Find the entry of KNOWN_COMMANDS closest to `input` by edit distance.
 *
 * @param {string} input - what the user typed; compared case-insensitively
 * @param {number} [maxDistance=2] - largest distance still accepted as a match
 * @returns {string|null} the closest command (the earliest one on ties), or
 *   null when nothing is within `maxDistance`
 */
function fuzzyMatch(input, maxDistance = 2) {
  const needle = input.toLowerCase();
  let winner = null;
  let winnerDist = maxDistance + 1;

  for (const candidate of KNOWN_COMMANDS) {
    const d = levenshtein(needle, candidate);
    // Strict comparison keeps the first candidate when distances tie.
    if (d >= winnerDist) continue;
    winnerDist = d;
    winner = candidate;
  }

  return winnerDist <= maxDistance ? winner : null;
}
|
|
76
|
+
/**
 * Render the fallback help text for an unrecognised command: a header naming
 * the input, an empty line, then one row per HELP_GROUPS entry.
 *
 * @param {string} input - the command text the user typed
 * @returns {string} multi-line help text
 */
function buildHelpMenu(input) {
  // The header carries its own trailing newline, which yields the empty line
  // between the header and the first row once everything is joined.
  const header = `Unknown command "${input}". Available commands:\n`;
  const rows = Object.entries(HELP_GROUPS).map(
    ([group, cmds]) => ` ${group.padEnd(12)} ${cmds.join(", ")}`
  );
  return [header, ...rows].join("\n");
}
|
|
84
|
+
/**
 * Build a response for a command the CLI did not recognise, trying three
 * tiers in order:
 *   1. a close spelling match from fuzzyMatch();
 *   2. when both `db` and `embedder` are supplied, search hits whose note has
 *      type "reference" and module "pm";
 *   3. the grouped help menu from buildHelpMenu().
 *
 * @param {string} input - the unrecognised command text
 * @param {object} [db] - store handed to search(); must provide getNoteById()
 * @param {object} [embedder] - embedder handed to search()
 * @param {object} [fusionWeights] - search weights; defaults to
 *   { bm25: 0.4, vector: 0.6 } when null or undefined
 * @returns {Promise<{tier: number, message: string, suggestion?: string}>}
 */
async function resolveUnknownCommand(input, db, embedder, fusionWeights) {
  const typoFix = fuzzyMatch(input);
  if (typoFix) {
    return {
      tier: 1,
      suggestion: typoFix,
      message: `Unknown command "${input}". Did you mean "${typoFix}"?`
    };
  }

  const helpFallback = () => ({ tier: 3, message: buildHelpMenu(input) });
  if (!(db && embedder)) {
    return helpFallback();
  }

  try {
    const { search } = await import("./search-HNUALOXQ.js");
    const weights = fusionWeights ?? { bm25: 0.4, vector: 0.6 };
    const hits = await search(db, embedder, input, { limit: 3, includePm: true }, weights);

    const pmReferences = hits.filter((hit) => {
      const note = db.getNoteById(hit.noteId);
      return note?.type === "reference" && note?.module === "pm";
    });

    if (pmReferences.length > 0) {
      const bullets = pmReferences.slice(0, 3).map((hit) => {
        const title = hit.heading ?? hit.filePath;
        const excerpt = (hit.excerpt ?? "").slice(0, 80);
        return ` ${title}: ${excerpt}`;
      });
      return {
        tier: 2,
        message: `Unknown command "${input}". Related documentation:\n${bullets.join("\n")}`
      };
    }
  } catch {
    // Search is best-effort: any failure here falls through to the help menu.
  }

  return helpFallback();
}
|
|
127
|
+
export {
|
|
128
|
+
HELP_GROUPS,
|
|
129
|
+
KNOWN_COMMANDS,
|
|
130
|
+
WRITE_OPERATIONS,
|
|
131
|
+
fuzzyMatch,
|
|
132
|
+
levenshtein,
|
|
133
|
+
resolveUnknownCommand
|
|
134
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@titan-design/brain",
|
|
3
|
-
"version": "0.3.0",
|
|
3
|
+
"version": "0.5.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Developer second brain with hybrid RAG search",
|
|
6
6
|
"license": "MIT",
|
|
@@ -21,6 +21,9 @@
|
|
|
21
21
|
"build": "tsup",
|
|
22
22
|
"test": "vitest run",
|
|
23
23
|
"test:watch": "vitest",
|
|
24
|
+
"test:unit": "vitest run --project unit",
|
|
25
|
+
"test:integration": "vitest run --project integration",
|
|
26
|
+
"test:eval": "vitest run --project eval",
|
|
24
27
|
"typecheck": "tsc --noEmit",
|
|
25
28
|
"lint": "eslint src __tests__ --no-error-on-unmatched-pattern",
|
|
26
29
|
"format": "prettier --write \"src/**/*.ts\" \"__tests__/**/*.ts\"",
|
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnostic Results Assembler
|
|
3
|
+
*
|
|
4
|
+
* Reads structured JSON from test bench agents, computes aggregates,
|
|
5
|
+
* and writes a markdown report with scorecard and per-prompt analysis.
|
|
6
|
+
*
|
|
7
|
+
* Usage: npx tsx scripts/diagnostic/assemble.ts --version v4 --previous v3 --results-dir docs/pm-module/diagnostic/v4
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { readFileSync, writeFileSync, existsSync, readdirSync } from 'node:fs';
|
|
11
|
+
import { join } from 'node:path';
|
|
12
|
+
import { parseArgs } from 'node:util';
|
|
13
|
+
|
|
14
|
+
/** Per-prompt counters plus the quality score (rendered as `N/5` in the report). */
interface TestMetrics {
  total_calls: number;
  brain_cli_calls: number;
  non_brain_calls: number;
  file_reads: number;
  quality: number;
}

/** Free-form observations attached to one prompt run. */
interface TestAnalysis {
  what_worked: string;
  friction_points: string;
  known_gaps_hit: string[];
  new_issues: { description: string; severity: string }[];
}

/** One parsed test-bench result (the JSON object extracted from a P-NN.json file). */
interface TestResult {
  id: string;
  version: string;
  category: string;
  prompt: string;
  commands: string[];
  metrics: TestMetrics;
  answer: string;
  analysis: TestAnalysis;
}
|
|
39
|
+
|
|
40
|
+
/** Category names, in listing order. */
const CATEGORIES = [
  'Discovery',
  'Navigation',
  'Context Assembly',
  'Planning',
  'Capabilities',
  'Gap Exercisers',
  'Write Ops',
  'Agent Commands',
  'Cross-System',
  'Filtering',
];

/** Category name -> label used in the scorecard's Category column. */
const CATEGORY_SHORT: Record<string, string> = {
  Discovery: 'Discovery',
  Navigation: 'Navigation',
  'Context Assembly': 'Context',
  Planning: 'Planning',
  Capabilities: 'Capabilities',
  'Gap Exercisers': 'Gap Exercise',
  'Write Ops': 'Write Ops',
  'Agent Commands': 'Agent Cmds',
  'Cross-System': 'Cross-System',
  Filtering: 'Filtering',
};
|
|
58
|
+
|
|
59
|
+
/**
 * Load every `P-NN.json` file from `<dir>/test-bench` and return the agent
 * results they contain, ordered by file name.
 *
 * Each file is expected to be a JSON wrapper whose `result` field is a string
 * holding the agent's own JSON object, possibly inside a markdown code fence
 * and/or surrounded by prose. Files that cannot be used are skipped with a
 * warning on stderr; they never abort the load.
 *
 * A result is accepted only when it has a string `id` and an object `metrics`,
 * because every report section reads those. Missing `commands`, `analysis`,
 * `analysis.known_gaps_hit` and `analysis.new_issues` are defaulted to empty
 * values so one sloppy agent payload cannot crash report generation.
 *
 * @param dir - results directory containing a `test-bench` sub-directory
 * @returns parsed results; empty when the `test-bench` directory does not exist
 */
function loadResults(dir: string): TestResult[] {
  const results: TestResult[] = [];
  const benchDir = join(dir, 'test-bench');

  if (!existsSync(benchDir)) return results;

  const files = readdirSync(benchDir)
    .filter(f => /^P-\d{2}\.json$/.test(f))
    .sort();

  for (const file of files) {
    try {
      const raw = readFileSync(join(benchDir, file), 'utf-8');
      const parsed = JSON.parse(raw);

      // Handle claude -p --output-format json wrapping
      if (!parsed?.result) {
        console.error(` WARN: Skipping ${file}: no result field (${parsed?.subtype ?? 'unknown'})`);
        continue;
      }
      if (typeof parsed.result !== 'string') {
        console.error(` WARN: Skipping ${file}: result field is not a string`);
        continue;
      }

      // Strip markdown code fences if the agent wrapped its JSON
      let resultStr: string = parsed.result;
      resultStr = resultStr.replace(/^```(?:json)?\s*\n?/, '').replace(/\n?```\s*$/, '');

      // Strip any prose before/after the JSON object
      const jsonStart = resultStr.indexOf('{');
      const jsonEnd = resultStr.lastIndexOf('}');
      if (jsonStart === -1 || jsonEnd < jsonStart) {
        console.error(` WARN: Skipping ${file}: no JSON object found in result`);
        continue;
      }
      resultStr = resultStr.slice(jsonStart, jsonEnd + 1);

      const result = JSON.parse(resultStr);

      // Every report section dereferences id and metrics; reject payloads without them.
      if (typeof result?.id !== 'string' || typeof result.metrics !== 'object' || result.metrics === null) {
        console.error(` WARN: Skipping ${file}: result is missing "id" or "metrics"`);
        continue;
      }

      // Default the optional collections the report iterates over.
      result.commands = Array.isArray(result.commands) ? result.commands : [];
      const analysis = typeof result.analysis === 'object' && result.analysis !== null ? result.analysis : {};
      result.analysis = {
        ...analysis,
        known_gaps_hit: Array.isArray(analysis.known_gaps_hit) ? analysis.known_gaps_hit : [],
        new_issues: Array.isArray(analysis.new_issues) ? analysis.new_issues : [],
      };

      results.push(result);
    } catch (e) {
      console.error(` WARN: Failed to parse ${file}: ${(e as Error).message}`);
    }
  }

  return results;
}
|
|
102
|
+
|
|
103
|
+
/**
 * Share of a prompt's tool calls that were brain CLI calls, as a whole-number
 * percentage. Returns 0 when no calls were made, avoiding a division by zero.
 */
function brainPct(r: TestResult): number {
  const { total_calls: total, brain_cli_calls: brain } = r.metrics;
  return total === 0 ? 0 : Math.round((brain / total) * 100);
}
|
|
107
|
+
|
|
108
|
+
/**
 * Render the full scorecard as a markdown table: one row per current result,
 * with the previous run's figures (matched by prompt id) beside the current
 * ones. Cells for prompts with no previous result read `N/A`.
 *
 * @param current - results of the run being reported
 * @param previous - results of the comparison run (may be empty)
 * @param version - label for the current run's columns
 * @param prevVersion - label for the previous run's columns
 * @returns the table as newline-joined markdown
 */
function buildScorecard(
  current: TestResult[],
  previous: TestResult[],
  version: string,
  prevVersion: string,
): string {
  const previousById = new Map(previous.map(r => [r.id, r]));

  const header = `| Prompt | Category | ${prevVersion} Calls | ${version} Calls | ${prevVersion} Brain% | ${version} Brain% | ${prevVersion} Reads | ${version} Reads | ${prevVersion} Q | ${version} Q |`;
  const divider = '|--------|----------|----------|----------|-----------|-----------|----------|----------|------|------|';

  const rows = current.map(r => {
    const before = previousById.get(r.id);
    const cells = [
      r.id,
      CATEGORY_SHORT[r.category] ?? r.category,
      before ? String(before.metrics.total_calls) : 'N/A',
      `**${r.metrics.total_calls}**`,
      before ? `${brainPct(before)}%` : 'N/A',
      `**${brainPct(r)}%**`,
      before ? String(before.metrics.file_reads) : 'N/A',
      `**${r.metrics.file_reads}**`,
      before ? `${before.metrics.quality}/5` : 'N/A',
      `**${r.metrics.quality}/5**`,
    ];
    return `| ${cells.join(' | ')} |`;
  });

  return [header, divider, ...rows].join('\n');
}
|
|
135
|
+
|
|
136
|
+
/**
 * Render the aggregate-metrics markdown table comparing the current run with
 * the previous one.
 *
 * @param current - results of the run being reported
 * @param previous - results of the comparison run; when empty, previous cells
 *   show `?` and deltas show `—`
 * @param version - label for the current run's column
 * @param prevVersion - label for the previous run's column
 * @returns the table as newline-joined markdown, or a one-line notice when
 *   `current` is empty
 */
function buildAggregates(
  current: TestResult[],
  previous: TestResult[],
  version: string,
  prevVersion: string,
): string {
  // Totals and averages over one run; null when the run has no results.
  const calc = (results: TestResult[]) => {
    if (results.length === 0) return null;
    const totalCalls = results.reduce((s, r) => s + r.metrics.total_calls, 0);
    const totalBrain = results.reduce((s, r) => s + r.metrics.brain_cli_calls, 0);
    const totalReads = results.reduce((s, r) => s + r.metrics.file_reads, 0);
    const totalNonBrain = results.reduce((s, r) => s + r.metrics.non_brain_calls, 0);
    const avgQuality = results.reduce((s, r) => s + r.metrics.quality, 0) / results.length;
    const at5 = results.filter(r => r.metrics.quality === 5).length;
    const at3orBelow = results.filter(r => r.metrics.quality <= 3).length;
    return {
      totalCalls,
      avgCalls: totalCalls / results.length,
      brainPct: totalCalls > 0 ? Math.round((totalBrain / totalCalls) * 100) : 0,
      totalReads,
      totalNonBrain,
      avgQuality,
      at5,
      at3orBelow,
      count: results.length,
    };
  };

  // Guard first so nothing below has to assume a non-null aggregate.
  const cur = calc(current);
  if (!cur) {
    return `*No results for ${version} — all test agents failed or produced no output.*`;
  }
  const prev = calc(previous);

  // Format an already-rounded one-decimal string: a value that rounds to zero
  // (including "-0.0") reads as "flat" instead of a bold signed zero.
  const signed = (rounded: string, suffix: string) => {
    const value = Number(rounded);
    if (value === 0) return 'flat';
    return `**${value > 0 ? '+' : ''}${rounded}${suffix}**`;
  };

  // Absolute change, one decimal place.
  const delta = (curVal: number, prevVal: number | undefined, suffix = '') => {
    if (prevVal === undefined) return '—';
    return signed((curVal - prevVal).toFixed(1), suffix);
  };

  // Relative change in percent; undefined for a missing or zero baseline.
  const pctDelta = (curVal: number, prevVal: number | undefined) => {
    if (prevVal === undefined || prevVal === 0) return '—';
    return signed((((curVal - prevVal) / prevVal) * 100).toFixed(1), '%');
  };

  const lines = [
    `| Metric | ${prevVersion} (${prev?.count ?? '?'}p) | ${version} (${cur.count}p) | Delta |`,
    '|--------|----------|----------|-------|',
    `| Total tool calls | ${prev?.totalCalls ?? '?'} | **${cur.totalCalls}** | ${pctDelta(cur.totalCalls, prev?.totalCalls)} |`,
    `| Avg calls per prompt | ${prev ? prev.avgCalls.toFixed(1) : '?'} | **${cur.avgCalls.toFixed(1)}** | ${delta(cur.avgCalls, prev?.avgCalls)} |`,
    `| Brain CLI % | ${prev ? prev.brainPct + '%' : '?'} | **${cur.brainPct}%** | ${delta(cur.brainPct, prev?.brainPct, 'pp')} |`,
    `| Direct file reads | ${prev?.totalReads ?? '?'} | **${cur.totalReads}** | ${delta(cur.totalReads, prev?.totalReads)} |`,
    `| Non-brain calls | ${prev?.totalNonBrain ?? '?'} | **${cur.totalNonBrain}** | ${delta(cur.totalNonBrain, prev?.totalNonBrain)} |`,
    `| Avg quality | ${prev ? (prev.avgQuality).toFixed(1) + '/5' : '?'} | **${cur.avgQuality.toFixed(1)}/5** | ${delta(cur.avgQuality, prev?.avgQuality)} |`,
    `| Prompts at 5/5 | ${prev ? prev.at5 + '/' + prev.count : '?'} | **${cur.at5}/${cur.count}** | ${delta(cur.at5, prev?.at5)} |`,
    `| Prompts at <=3/5 | ${prev ? prev.at3orBelow + '/' + prev.count : '?'} | **${cur.at3orBelow}/${cur.count}** | ${delta(cur.at3orBelow, prev?.at3orBelow)} |`,
  ];

  return lines.join('\n');
}
|
|
201
|
+
|
|
202
|
+
/**
 * Render the per-prompt section of the report: for each current result a
 * heading, a previous-vs-current metrics table, the command log and the
 * analysis notes. A category heading is emitted whenever the category differs
 * from that of the preceding result.
 *
 * @param current - results of the run being reported, in display order
 * @param previous - results of the comparison run, matched by prompt id
 * @returns the section as newline-joined markdown
 */
function buildPerPromptAnalysis(current: TestResult[], previous: TestResult[]): string {
  const previousById = new Map(previous.map(r => [r.id, r]));
  const out: string[] = [];
  let openCategory = '';

  for (const r of current) {
    if (openCategory !== r.category) {
      openCategory = r.category;
      out.push(`\n### ${openCategory}\n`);
    }

    const before = previousById.get(r.id);
    const { metrics, analysis } = r;

    out.push(`#### ${r.id}: "${r.prompt}"\n`);

    // Metrics table
    out.push(
      [
        '| Metric | Previous | Current |',
        '|--------|----------|---------|',
        `| Total calls | ${before?.metrics.total_calls ?? 'N/A'} | **${metrics.total_calls}** |`,
        `| Brain CLI | ${before?.metrics.brain_cli_calls ?? 'N/A'} | **${metrics.brain_cli_calls}** |`,
        `| Non-brain | ${before?.metrics.non_brain_calls ?? 'N/A'} | **${metrics.non_brain_calls}** |`,
        `| File reads | ${before?.metrics.file_reads ?? 'N/A'} | **${metrics.file_reads}** |`,
        `| Quality | ${before ? before.metrics.quality + '/5' : 'N/A'} | **${metrics.quality}/5** |`,
      ].join('\n'),
    );

    // Command log
    if (r.commands.length > 0) {
      const chain = r.commands.map(c => `\`${c}\``).join(' → ');
      out.push(`\n**Commands:** ${chain}\n`);
    }

    // Analysis
    if (analysis.what_worked) out.push(`**What worked:** ${analysis.what_worked}\n`);
    if (analysis.friction_points) out.push(`**Friction:** ${analysis.friction_points}\n`);
    if (analysis.known_gaps_hit.length > 0) {
      out.push(`**Known gaps confirmed:** ${analysis.known_gaps_hit.join(', ')}\n`);
    }
    if (analysis.new_issues.length > 0) {
      out.push(
        '**New issues:**',
        ...analysis.new_issues.map(issue => `- [${issue.severity}] ${issue.description}`),
        '',
      );
    }

    out.push('---\n');
  }

  return out.join('\n');
}
|
|
258
|
+
|
|
259
|
+
/**
 * Render the cross-cutting findings section: average quality and call count
 * per category, the ten most frequently hit known gaps, and every new issue
 * reported across all prompts.
 *
 * Free-text values are escaped before being placed in table cells, because an
 * unescaped `|` or a line break inside a cell breaks the markdown row.
 *
 * @param current - results of the run being reported
 * @returns the section as newline-joined markdown
 */
function buildCrossCutting(current: TestResult[]): string {
  const sections: string[] = [];

  // Make a value safe for a markdown table cell: escape pipes that are not
  // already escaped and collapse line breaks to a single space.
  const cell = (text: unknown): string =>
    String(text).replace(/(?<!\\)\|/g, '\\|').replace(/\r?\n/g, ' ');

  // Quality distribution by category
  const byCategory = new Map<string, TestResult[]>();
  for (const r of current) {
    const cat = r.category;
    if (!byCategory.has(cat)) byCategory.set(cat, []);
    byCategory.get(cat)!.push(r);
  }

  sections.push('### Quality by Category\n');
  sections.push('| Category | Avg Quality | Avg Calls |');
  sections.push('|----------|-------------|-----------|');
  for (const [cat, results] of byCategory) {
    const avgQ = results.reduce((s, r) => s + r.metrics.quality, 0) / results.length;
    const avgC = results.reduce((s, r) => s + r.metrics.total_calls, 0) / results.length;
    sections.push(`| ${cell(cat)} | ${avgQ.toFixed(1)}/5 | ${avgC.toFixed(1)} |`);
  }

  // Most common gaps hit
  const gapCounts = new Map<string, number>();
  for (const r of current) {
    for (const gap of r.analysis.known_gaps_hit) {
      gapCounts.set(gap, (gapCounts.get(gap) ?? 0) + 1);
    }
  }

  if (gapCounts.size > 0) {
    const sorted = [...gapCounts.entries()].sort((a, b) => b[1] - a[1]);
    sections.push('\n### Most Frequent Gaps Hit\n');
    sections.push('| Observation | Prompts Affected |');
    sections.push('|-------------|-----------------|');
    for (const [gap, count] of sorted.slice(0, 10)) {
      sections.push(`| ${cell(gap)} | ${count} |`);
    }
  }

  // New issues across all prompts
  const allIssues: Array<{ description: string; severity: string; source: string }> = [];
  for (const r of current) {
    for (const issue of r.analysis.new_issues) {
      allIssues.push({ ...issue, source: r.id });
    }
  }

  if (allIssues.length > 0) {
    sections.push('\n### New Issues Discovered\n');
    sections.push('| Source | Severity | Description |');
    sections.push('|--------|----------|-------------|');
    for (const issue of allIssues) {
      sections.push(`| ${cell(issue.source)} | ${cell(issue.severity)} | ${cell(issue.description)} |`);
    }
  }

  return sections.join('\n');
}
|
|
316
|
+
|
|
317
|
+
/**
 * CLI entry point: parse `--version`, `--previous` and `--results-dir`, load
 * the current and (when available) previous results, and write
 * `test-bench-results.md` into the results directory.
 *
 * All three options are required; when one is missing a usage line is printed
 * to stderr and the process exit code is set to 1.
 *
 * The previous results directory is derived from `--results-dir` by swapping
 * the LAST occurrence of the version label for the previous label, so a label
 * that also appears earlier in the path is left alone. If no distinct
 * directory can be derived, the report is generated without a comparison
 * rather than comparing the run against itself.
 */
function main() {
  const { values } = parseArgs({
    options: {
      version: { type: 'string' },
      previous: { type: 'string' },
      'results-dir': { type: 'string' },
    },
  });

  const { version, previous } = values;
  const resultsDir = values['results-dir'];
  if (!version || !previous || !resultsDir) {
    console.error(' Usage: npx tsx scripts/diagnostic/assemble.ts --version <label> --previous <label> --results-dir <dir>');
    process.exitCode = 1;
    return;
  }

  console.log(` Loading ${version} results from ${resultsDir}/test-bench/`);
  const current = loadResults(resultsDir);
  console.log(` Found ${current.length} results`);

  // Swap the last occurrence of the version label; plain slicing also avoids
  // the `$`-pattern handling String.replace applies to its replacement text.
  const versionAt = resultsDir.lastIndexOf(version);
  const prevDir = versionAt === -1
    ? resultsDir
    : resultsDir.slice(0, versionAt) + previous + resultsDir.slice(versionAt + version.length);

  let previousResults: TestResult[] = [];
  if (prevDir === resultsDir) {
    console.log(` No previous results directory could be derived from ${resultsDir}`);
  } else if (existsSync(join(prevDir, 'test-bench'))) {
    console.log(` Loading ${previous} results from ${prevDir}/test-bench/`);
    previousResults = loadResults(prevDir);
    console.log(` Found ${previousResults.length} previous results`);
  } else {
    console.log(` No previous results found at ${prevDir}`);
  }

  // Sort by prompt number
  current.sort((a, b) => a.id.localeCompare(b.id));
  previousResults.sort((a, b) => a.id.localeCompare(b.id));

  const today = new Date().toISOString().split('T')[0];

  const report = `# PM Module Test Bench Results — ${version.toUpperCase()}

**Date:** ${today}
**Agent model:** claude-sonnet-4-6
**Prompts run:** ${current.length} of 30

---

## Aggregate Metrics

${buildAggregates(current, previousResults, version, previous)}

---

## Full Scorecard

${buildScorecard(current, previousResults, version, previous)}

---

## Per-Prompt Analysis

${buildPerPromptAnalysis(current, previousResults)}

## Cross-Cutting Findings

${buildCrossCutting(current)}
`;

  const outPath = join(resultsDir, 'test-bench-results.md');
  writeFileSync(outPath, report);
  console.log(` Wrote ${outPath}`);
}
|
|
383
|
+
|
|
384
|
+
main();
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
#!/usr/bin/env bash
set -euo pipefail

# Diagnostic Quality Gate — runs after setup, before test bench
# Validates that setup agent produced high-quality data
# Usage: quality-gate.sh [--skip]
#
# Exit status: 0 when the gate passes or is skipped, 1 when no tasks are found
# or at least one FAIL check trips. WARN checks never affect the exit status.

if [[ "${1:-}" == "--skip" ]]; then
  echo " Quality gate SKIPPED (--skip flag)"
  exit 0
fi

echo " Running data quality gate..."

FAILURES=0

# Check: every task has a non-empty body
# Run from /tmp to avoid local .brain/ instance resolution
# A failing `brain` call is treated as an empty task list.
TASK_JSON=$(cd /tmp && brain pm task list --json --full 2>/dev/null || echo '[]')
TASK_COUNT=$(echo "$TASK_JSON" | jq 'length')

if [[ "$TASK_COUNT" -eq 0 ]]; then
  echo " FAIL: No tasks found"
  exit 1
fi

# Check empty descriptions (null, empty, or shorter than 20 characters)
EMPTY_DESC=$(echo "$TASK_JSON" | jq '[.[] | select(.description == null or .description == "" or (.description | length) < 20)] | length')
if [[ "$EMPTY_DESC" -gt 0 ]]; then
  echo " FAIL: ${EMPTY_DESC}/${TASK_COUNT} tasks have empty or trivial descriptions"
  # Plain assignment, not ((FAILURES++)): the post-increment evaluates to 0 on
  # the first failure, which gives a non-zero status and makes `set -e` abort
  # the script before the remaining checks and the summary run.
  FAILURES=$((FAILURES + 1))
fi

# Check: at least 2 distinct categories
CATEGORY_COUNT=$(echo "$TASK_JSON" | jq '[.[].category] | unique | length')
if [[ "$CATEGORY_COUNT" -lt 2 ]]; then
  echo " FAIL: Only ${CATEGORY_COUNT} distinct category(ies) — need at least 2"
  FAILURES=$((FAILURES + 1))
fi

# Check: at least 1 low priority task (warning only)
LOW_COUNT=$(echo "$TASK_JSON" | jq '[.[] | select(.priority == "low")] | length')
if [[ "$LOW_COUNT" -eq 0 ]]; then
  echo " WARN: No low-priority tasks — priority range underutilized"
fi

# Check: depends_on is always an array (not null/missing)
NULL_DEPS=$(echo "$TASK_JSON" | jq '[.[] | select(.depends_on == null)] | length')
if [[ "$NULL_DEPS" -gt 0 ]]; then
  echo " FAIL: ${NULL_DEPS}/${TASK_COUNT} tasks have null depends_on (should be [])"
  FAILURES=$((FAILURES + 1))
fi

if [[ "$FAILURES" -gt 0 ]]; then
  echo " Quality gate FAILED (${FAILURES} check(s))"
  # NOTE(review): this script itself only accepts --skip; --skip-quality-gate is
  # presumably the calling script's flag — confirm against the caller.
  echo " Use --skip-quality-gate to bypass"
  exit 1
else
  echo " Quality gate PASSED (${TASK_COUNT} tasks, ${CATEGORY_COUNT} categories)"
fi
|