@infinitedusky/indusk-mcp 1.12.1 → 1.12.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/cli.js +22 -0
- package/dist/bin/commands/eval.d.ts +4 -0
- package/dist/bin/commands/eval.js +25 -0
- package/dist/lib/eval/findings.d.ts +23 -0
- package/dist/lib/eval/findings.js +68 -0
- package/dist/lib/eval/judge-runner.js +3 -0
- package/dist/lib/eval/types.js +1 -1
- package/hooks/eval-trigger.js +19 -0
- package/package.json +1 -1
- package/skills/planner.md +6 -6
package/dist/bin/cli.js
CHANGED
|
@@ -259,6 +259,28 @@ eval_
|
|
|
259
259
|
const { evalSummary } = await import("./commands/eval.js");
|
|
260
260
|
await evalSummary(process.cwd(), opts);
|
|
261
261
|
});
|
|
262
|
+
// `eval findings`: list recorded eval findings (unresolved only unless --all is given).
// The command module is imported lazily, inside the action handler.
eval_
    .command("findings")
    .description("List unresolved eval findings")
    .option("--all", "Show all findings including fixed/ignored")
    .action(async (options) => {
        const evalCommands = await import("./commands/eval.js");
        await evalCommands.evalFindings(process.cwd(), options);
    });
// `eval fix <key>`: mark the finding stored under <key> as fixed.
eval_
    .command("fix <key>")
    .description("Mark an eval finding as fixed")
    .action(async (findingKey) => {
        const evalCommands = await import("./commands/eval.js");
        await evalCommands.evalMark(process.cwd(), findingKey, "fixed");
    });
// `eval ignore <key>`: mark the finding stored under <key> as ignored.
eval_
    .command("ignore <key>")
    .description("Mark an eval finding as ignored")
    .action(async (findingKey) => {
        const evalCommands = await import("./commands/eval.js");
        await evalCommands.evalMark(process.cwd(), findingKey, "ignored");
    });
|
|
262
284
|
eval_
|
|
263
285
|
.command("baseline")
|
|
264
286
|
.description("Run baseline evaluation with vanilla agent")
|
|
@@ -9,6 +9,10 @@ export declare function evalSummary(projectRoot: string, opts: {
|
|
|
9
9
|
since?: string;
|
|
10
10
|
json?: boolean;
|
|
11
11
|
}): Promise<void>;
|
|
12
|
+
/**
 * Prints the project's eval findings to the console.
 *
 * @param projectRoot - Root directory of the project whose findings are listed.
 * @param opts - `all`: when true, fixed and ignored findings are listed too;
 *   otherwise only unresolved findings are shown.
 */
export declare function evalFindings(projectRoot: string, opts: { all?: boolean }): Promise<void>;
/**
 * Marks the finding stored under `key` as fixed or ignored.
 *
 * @param projectRoot - Root directory of the project that owns the finding.
 * @param key - Key of the finding to update.
 * @param state - Resolution state to record.
 */
export declare function evalMark(projectRoot: string, key: string, state: "fixed" | "ignored"): Promise<void>;
|
|
12
16
|
export declare function evalBaseline(projectRoot: string, opts: {
|
|
13
17
|
task: string;
|
|
14
18
|
keep?: boolean;
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
import { existsSync } from "node:fs";
|
|
8
8
|
import { join } from "node:path";
|
|
9
|
+
import { getAllFindings, getUnresolvedFindings, markFinding } from "../../lib/eval/findings.js";
|
|
9
10
|
import { readAllEntries } from "../../lib/eval/log-reader.js";
|
|
10
11
|
import { isScorecard } from "../../lib/eval/types.js";
|
|
11
12
|
function getEvalLogPath(projectRoot) {
|
|
@@ -106,6 +107,30 @@ function computeSummary(scorecards) {
|
|
|
106
107
|
trend,
|
|
107
108
|
};
|
|
108
109
|
}
|
|
110
|
+
/**
 * Print eval findings for a project to the console.
 *
 * With `opts.all` every recorded finding is listed; otherwise only the
 * unresolved ones. Each finding takes two lines: a summary line (state icon,
 * severity, question id, finding text) and a detail line (key, first eight
 * characters of the change id, state).
 *
 * @param projectRoot - Project root passed through to the findings store.
 * @param opts - Options object; only `all` is read.
 */
export async function evalFindings(projectRoot, opts) {
    const showAll = Boolean(opts.all);
    const entries = showAll ? getAllFindings(projectRoot) : getUnresolvedFindings(projectRoot);
    if (entries.length === 0) {
        console.info(showAll ? "No eval findings." : "No unresolved findings.");
        return;
    }
    const heading = showAll ? "All" : "Unresolved";
    console.info(`\n${heading} eval findings (${entries.length}):\n`);
    for (const entry of entries) {
        // Unresolved findings get the bullet; fixed and ignored get their own marks.
        let icon = "●";
        if (entry.state === "fixed") {
            icon = "✓";
        }
        else if (entry.state === "ignored") {
            icon = "–";
        }
        console.info(` ${icon} [${entry.severity}] ${entry.questionId}: ${entry.finding}`);
        console.info(` key: ${entry.key} change: ${entry.changeId.slice(0, 8)} state: ${entry.state}`);
    }
    console.info("");
}
|
|
124
|
+
/**
 * Record a new resolution state for one finding and report the outcome.
 *
 * Logs a confirmation when the key exists. When it does not, logs an error
 * and terminates the process with exit code 1.
 *
 * @param projectRoot - Project root passed through to the findings store.
 * @param key - Key of the finding to update.
 * @param state - State to record for the finding.
 */
export async function evalMark(projectRoot, key, state) {
    const found = markFinding(projectRoot, key, state);
    if (!found) {
        console.error(`Finding not found: ${key}`);
        process.exit(1);
        return;
    }
    console.info(`Marked ${key} as ${state}`);
}
|
|
109
134
|
function computePassRates(cards) {
|
|
110
135
|
const counts = {};
|
|
111
136
|
for (const card of cards) {
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tracks eval finding resolution state.
|
|
3
|
+
*
|
|
4
|
+
* Findings persist as "unresolved" until explicitly fixed or ignored.
|
|
5
|
+
* The eval hook surfaces unresolved findings on every jj describe.
|
|
6
|
+
*/
|
|
7
|
+
import type { EvalScorecard } from "./types.js";
|
|
8
|
+
/** Resolution state of a tracked eval finding. */
export type FindingState = "unresolved" | "fixed" | "ignored";
/** One tracked finding, as persisted in the findings store. */
export interface FindingEntry {
    /** Current resolution state. */
    state: FindingState;
    /** Id of the rubric question that produced the finding. */
    questionId: string;
    /** Severity copied from the scorecard question. */
    severity: string;
    /** Finding text copied from the scorecard question. */
    finding: string;
    /** Id of the change the scorecard was produced for. */
    changeId: string;
}
/** Lists the findings whose state is still "unresolved", each with its store key. */
export declare function getUnresolvedFindings(projectRoot: string): ({ key: string } & FindingEntry)[];
/** Lists every stored finding regardless of state, each with its store key. */
export declare function getAllFindings(projectRoot: string): ({ key: string } & FindingEntry)[];
/** Sets the state of the finding stored under `key`; returns false when no such finding exists. */
export declare function markFinding(projectRoot: string, key: string, state: FindingState): boolean;
/** Records a finding for each non-passing question of a scorecard; returns how many were newly added. */
export declare function ingestScorecard(projectRoot: string, scorecard: EvalScorecard): number;
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tracks eval finding resolution state.
|
|
3
|
+
*
|
|
4
|
+
* Findings persist as "unresolved" until explicitly fixed or ignored.
|
|
5
|
+
* The eval hook surfaces unresolved findings on every jj describe.
|
|
6
|
+
*/
|
|
7
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
8
|
+
import { dirname, join } from "node:path";
|
|
9
|
+
/** Path of the per-project findings store: `<projectRoot>/.indusk/eval/findings.json`. */
function getFindingsPath(projectRoot) {
    return join(projectRoot, ".indusk", "eval", "findings.json");
}
/**
 * Load the findings map (key -> entry) from disk.
 *
 * Best-effort: a missing file, unparsable JSON, or JSON that is not a plain
 * object all yield an empty map. Entries that are not objects are dropped.
 *
 * The returned map has no prototype, so a lookup such as `findings[key]` can
 * only ever see stored findings, never inherited members like `constructor`
 * or `__proto__` (keys reach this module from the command line).
 */
function readFindings(projectRoot) {
    const findings = Object.create(null);
    const path = getFindingsPath(projectRoot);
    if (!existsSync(path))
        return findings;
    let parsed;
    try {
        parsed = JSON.parse(readFileSync(path, "utf8"));
    }
    catch {
        return findings;
    }
    // `null`, arrays and primitives are valid JSON but not a findings map.
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed))
        return findings;
    for (const [key, entry] of Object.entries(parsed)) {
        if (entry !== null && typeof entry === "object" && !Array.isArray(entry)) {
            findings[key] = entry;
        }
    }
    return findings;
}
/** Persist the findings map as pretty-printed JSON, creating the directory if needed. */
function writeFindings(projectRoot, findings) {
    const path = getFindingsPath(projectRoot);
    mkdirSync(dirname(path), { recursive: true });
    writeFileSync(path, `${JSON.stringify(findings, null, 2)}\n`);
}
/**
 * List the findings whose state is still "unresolved".
 *
 * @returns One object per finding: the stored entry plus its `key`.
 */
export function getUnresolvedFindings(projectRoot) {
    return getAllFindings(projectRoot).filter((finding) => finding.state === "unresolved");
}
/**
 * List every stored finding regardless of state.
 *
 * @returns One object per finding: the stored entry plus its `key`.
 */
export function getAllFindings(projectRoot) {
    const findings = readFindings(projectRoot);
    return Object.entries(findings).map(([key, entry]) => ({ key, ...entry }));
}
/**
 * Set the state of the finding stored under `key` and persist the change.
 *
 * @returns `true` when the finding existed and was updated, `false` when no
 *   finding is stored under `key` (nothing is written in that case).
 */
export function markFinding(projectRoot, key, state) {
    const findings = readFindings(projectRoot);
    const entry = findings[key];
    if (entry === undefined)
        return false;
    entry.state = state;
    writeFindings(projectRoot, findings);
    return true;
}
/**
 * Record a finding for each question of `scorecard` that was not answered "yes".
 *
 * Findings are keyed by `<changeId>:<questionId>`. A key that is already
 * stored is left untouched, so an earlier fixed/ignored decision survives
 * re-ingesting the same scorecard. The store is only rewritten when at least
 * one finding was added.
 *
 * @returns The number of findings newly added.
 */
export function ingestScorecard(projectRoot, scorecard) {
    const findings = readFindings(projectRoot);
    let added = 0;
    for (const q of scorecard.questions) {
        if (q.answer === "yes")
            continue; // no finding for passing questions
        const key = `${scorecard.changeId}:${q.id}`;
        if (findings[key] !== undefined)
            continue; // already tracked; keep its current state
        findings[key] = {
            state: "unresolved",
            questionId: q.id,
            severity: q.severity,
            finding: q.finding,
            changeId: scorecard.changeId,
        };
        added++;
    }
    if (added > 0) {
        writeFindings(projectRoot, findings);
    }
    return added;
}
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
import { spawn } from "node:child_process";
|
|
9
9
|
import { join } from "node:path";
|
|
10
10
|
import { getProjectGroupId } from "../config.js";
|
|
11
|
+
import { ingestScorecard } from "./findings.js";
|
|
11
12
|
import { EvalLogWriter } from "./log-writer.js";
|
|
12
13
|
import { buildJudgePrompt } from "./prompt-builder.js";
|
|
13
14
|
import { V1_RUBRIC } from "./rubric.js";
|
|
@@ -127,6 +128,7 @@ export function runJudgeBackground(opts) {
|
|
|
127
128
|
scorecard.telemetryPosted = true;
|
|
128
129
|
}
|
|
129
130
|
await logWriter.append(scorecard);
|
|
131
|
+
ingestScorecard(opts.projectRoot, scorecard);
|
|
130
132
|
}
|
|
131
133
|
catch (err) {
|
|
132
134
|
const errorEntry = {
|
|
@@ -230,6 +232,7 @@ export async function runJudgeSync(opts) {
|
|
|
230
232
|
scorecard.telemetryPosted = true;
|
|
231
233
|
}
|
|
232
234
|
await logWriter.append(scorecard);
|
|
235
|
+
ingestScorecard(opts.projectRoot, scorecard);
|
|
233
236
|
resolve(scorecard);
|
|
234
237
|
}
|
|
235
238
|
catch (err) {
|
package/dist/lib/eval/types.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* rubric, defined in rubric.ts and answered by the judge agent.
|
|
6
6
|
*/
|
|
7
7
|
/**
 * Tell whether a log entry is a scorecard: an object without an `error`
 * property whose `questions` property is an array.
 *
 * `null` and primitive entries are rejected up front, because the `in`
 * operator throws a TypeError when its right-hand side is not an object.
 */
export function isScorecard(entry) {
    if (entry === null || typeof entry !== "object")
        return false;
    return !("error" in entry) && Array.isArray(entry.questions);
}
|
|
10
10
|
export function isErrorEntry(entry) {
|
|
11
11
|
return "error" in entry && entry.error === true;
|
package/hooks/eval-trigger.js
CHANGED
|
@@ -141,6 +141,25 @@ if (!judgeRunnerPath) {
|
|
|
141
141
|
process.exit(0);
|
|
142
142
|
}
|
|
143
143
|
|
|
144
|
+
// Surface unresolved findings from previous evals.
// The findings module is looked up next to the judge runner by swapping the file name.
const findingsPath = judgeRunnerPath.replace("judge-runner.js", "findings.js");
if (existsSync(findingsPath)) {
	try {
		const { getUnresolvedFindings } = await import(findingsPath);
		const unresolved = getUnresolvedFindings(projectRoot);
		if (unresolved.length > 0) {
			// Each line includes the finding's key, because the hint below tells
			// the user to pass that key to `indusk eval fix` / `indusk eval ignore`.
			const lines = unresolved.map(
				(f) =>
					` [${f.severity}] ${f.questionId}: ${f.finding} (change ${f.changeId.slice(0, 8)}, key ${f.key})`,
			);
			process.stderr.write(
				`\n📊 Unresolved eval findings (${unresolved.length}):\n${lines.join("\n")}\nUse \`indusk eval fix <key>\` or \`indusk eval ignore <key>\` to resolve.\n\n`,
			);
		}
	} catch {
		// Best-effort banner: if the findings module cannot be loaded or used,
		// skip it silently so the hook still goes on to spawn the judge.
	}
}
|
|
162
|
+
|
|
144
163
|
// Spawn a detached node process that calls runJudgeSync (which awaits completion).
|
|
145
164
|
const judgeScript = `
|
|
146
165
|
import("${judgeRunnerPath}")
|
package/package.json
CHANGED
package/skills/planner.md
CHANGED
|
@@ -25,7 +25,7 @@ Each document builds on the ones before it. Not every plan needs all five — us
|
|
|
25
25
|
|
|
26
26
|
The order is always preserved — never write an ADR before the brief, or an impl before the ADR (when both exist).
|
|
27
27
|
|
|
28
|
-
General-purpose research (insights useful across plans) also lives in
|
|
28
|
+
General-purpose research (insights useful across plans) also lives in `.indusk/research/`.
|
|
29
29
|
|
|
30
30
|
## Workflow Types
|
|
31
31
|
|
|
@@ -62,12 +62,12 @@ Workflow templates are in `templates/workflows/` in the package. They describe w
|
|
|
62
62
|
- **refactor**: start with brief (includes boundary map)
|
|
63
63
|
- **spike**: start with research (and stop there)
|
|
64
64
|
|
|
65
|
-
**Check for existing research first.** Before writing new research, scan
|
|
65
|
+
**Check for existing research first.** Before writing new research, scan `.indusk/research/` for relevant standalone research docs. If one exists (e.g., `.indusk/research/auth-options.md`), ask the user: "I found existing research at `.indusk/research/auth-options.md`. Want to use this as the starting point?" If yes:
|
|
66
66
|
- Copy it to `.indusk/planning/{plan-name}/research.md`
|
|
67
67
|
- Set the frontmatter status to `complete`
|
|
68
68
|
- Move straight to the brief
|
|
69
69
|
|
|
70
|
-
The
|
|
70
|
+
The `.indusk/research/` directory is for standalone exploration that isn't tied to a plan yet. When it becomes a plan, it moves into the planning folder. The original in `.indusk/research/` can be deleted or kept as a reference — user's choice.
|
|
71
71
|
|
|
72
72
|
For feature/spike workflows that need new research: Explore the problem space — read code, search the web, check Context7 for library docs. **Query the code graph before scoping** (see toolbelt "Before Modifying Code") — include structural findings in research.md with concrete numbers.
|
|
73
73
|
Document what you find. The research doc records findings and analysis, but saves the recommendation for the brief.
|
|
@@ -336,7 +336,7 @@ date: {YYYY-MM-DD}
|
|
|
336
336
|
- {Hindsight — decisions that could have been better, steps to skip or add}
|
|
337
337
|
|
|
338
338
|
## Insights Worth Carrying Forward
|
|
339
|
-
{Takeaways for future plans. Save to research/ if broadly useful.}
|
|
339
|
+
{Takeaways for future plans. Save to .indusk/research/ if broadly useful.}
|
|
340
340
|
|
|
341
341
|
## Quality Ratchet
|
|
342
342
|
{Could any mistakes in this plan have been caught automatically by a Biome rule? If yes, add the rule to biome.json and document it in biome-rationale.md. The quality ratchet only gets tighter.}
|
|
@@ -361,7 +361,7 @@ date: {YYYY-MM-DD}
|
|
|
361
361
|
└── archive/
|
|
362
362
|
└── {completed-plan}/
|
|
363
363
|
|
|
364
|
-
research/
|
|
364
|
+
.indusk/research/ # Standalone insights useful across plans
|
|
365
365
|
```
|
|
366
366
|
|
|
367
367
|
- Kebab-case folder names
|
|
@@ -374,6 +374,6 @@ research/ # Standalone insights useful across plans
|
|
|
374
374
|
- **Use the code graph for scoping.** Before writing a brief or impl, query `analyze_code_relationships` to understand what depends on what. "How many files import X?" and "What calls this function?" prevent underscoping.
|
|
375
375
|
- Keep Y-statements concise but complete. Every field filled in.
|
|
376
376
|
- Impl checklists: granular enough to track, not so granular they're busywork.
|
|
377
|
-
- When research produces broadly useful insights, also save to
|
|
377
|
+
- When research produces broadly useful insights, also save to `.indusk/research/`.
|
|
378
378
|
- Cross-reference related plans by path whenever work overlaps between plans.
|
|
379
379
|
- The user's input is: $ARGUMENTS
|