@infinitedusky/indusk-mcp 1.24.4 → 1.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/eval/findings.js +4 -7
- package/dist/lib/eval/persistent-evaluator.js +6 -2
- package/dist/lib/eval/scorecard-extractor.d.ts +20 -0
- package/dist/lib/eval/scorecard-extractor.js +20 -0
- package/dist/lib/trajectory/validator.d.ts +24 -11
- package/dist/lib/trajectory/validator.js +20 -15
- package/hooks/validate-impl-structure.js +12 -7
- package/package.json +1 -1
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
8
8
|
import { dirname, join } from "node:path";
|
|
9
|
+
import { getScorecardQuestions } from "./scorecard-extractor.js";
|
|
9
10
|
function getFindingsPath(projectRoot) {
|
|
10
11
|
return join(projectRoot, ".indusk", "eval", "findings.json");
|
|
11
12
|
}
|
|
@@ -47,13 +48,9 @@ export function ingestScorecard(projectRoot, scorecard) {
|
|
|
47
48
|
const findings = readFindings(projectRoot);
|
|
48
49
|
let added = 0;
|
|
49
50
|
// Defensive: the model occasionally returns a scorecard with a missing,
|
|
50
|
-
// null, or non-array `questions` field (it invents its own schema).
|
|
51
|
-
//
|
|
52
|
-
|
|
53
|
-
// entry lands right after, falsely implying the scorecard was lost.
|
|
54
|
-
// Tolerate the malformed shape silently; downstream consumers can still
|
|
55
|
-
// see the raw scorecard in results.log.
|
|
56
|
-
const questions = Array.isArray(scorecard.questions) ? scorecard.questions : [];
|
|
51
|
+
// null, or non-array `questions` field (it invents its own schema). See
|
|
52
|
+
// `scorecard-extractor.ts` getScorecardQuestions for the central guard.
|
|
53
|
+
const questions = getScorecardQuestions(scorecard);
|
|
57
54
|
for (const q of questions) {
|
|
58
55
|
if (q.answer === "yes")
|
|
59
56
|
continue; // no finding for passing questions
|
|
@@ -16,7 +16,7 @@ import { EvalLogWriter } from "./log-writer.js";
|
|
|
16
16
|
import { initEvalOtel, initEvalOtelLogs, logEvalContent, shutdownEvalOtel, withSpan, } from "./otel.js";
|
|
17
17
|
import { buildEvaluatorPrompt } from "./prompt-builder.js";
|
|
18
18
|
import { V1_RUBRIC } from "./rubric.js";
|
|
19
|
-
import { extractScorecardJson, formatParseError } from "./scorecard-extractor.js";
|
|
19
|
+
import { extractScorecardJson, formatParseError, getScorecardQuestions, } from "./scorecard-extractor.js";
|
|
20
20
|
function getSessionPath(projectRoot) {
|
|
21
21
|
return join(projectRoot, ".indusk", "eval", "evaluator-session.json");
|
|
22
22
|
}
|
|
@@ -263,7 +263,11 @@ Output ONLY the JSON scorecard as before — no commentary.`;
|
|
|
263
263
|
rootSpan.setAttribute("scorecard.output_tokens", scorecard.usage.outputTokens);
|
|
264
264
|
}
|
|
265
265
|
const answerCounts = { yes: 0, no: 0, partial: 0 };
|
|
266
|
-
for (const q of scorecard.questions ?? []) {
|
|
266
|
+
// Use the central guard from scorecard-extractor — `?? []` here was
|
|
267
|
+
// the bug: it only catches null/undefined, not non-array shapes like
|
|
268
|
+
// `{}` (which the model has been observed to return — e.g. on Numero
|
|
269
|
+
// 2026-04-19 19:54 with `questions: { conventions: {...} }` keyed by id).
|
|
270
|
+
for (const q of getScorecardQuestions(scorecard)) {
|
|
267
271
|
if (q.answer in answerCounts)
|
|
268
272
|
answerCounts[q.answer]++;
|
|
269
273
|
}
|
|
@@ -26,6 +26,26 @@
|
|
|
26
26
|
* This function only locates the JSON; it doesn't validate it.
|
|
27
27
|
*/
|
|
28
28
|
export declare function extractScorecardJson(text: string): string | null;
|
|
29
|
+
/**
|
|
30
|
+
* Defensive accessor for `scorecard.questions`. Returns the array if the
|
|
31
|
+
* field is array-shaped; returns `[]` for any other shape (missing, null,
|
|
32
|
+
* boolean, number, object-keyed-by-id, etc.). The model occasionally invents
|
|
33
|
+
* its own scorecard schema and puts non-arrays here — the wrapper must not
|
|
34
|
+
* crash when that happens.
|
|
35
|
+
*
|
|
36
|
+
* Use this everywhere the wrapper iterates `scorecard.questions`. Never
|
|
37
|
+
* iterate the field directly (with `?? []` or otherwise) — `?? []` only
|
|
38
|
+
* catches null/undefined, not falsy-but-not-nullish values like `false`,
|
|
39
|
+
* `0`, `""`, or non-array objects.
|
|
40
|
+
*
|
|
41
|
+
* Surfaced bugs this prevents:
|
|
42
|
+
* - `for (const q of scorecard.questions)` when `questions` is missing
|
|
43
|
+
* - `for (const q of scorecard.questions ?? [])` when `questions` is `{}`
|
|
44
|
+
* (e.g., model returned `questions: { conventions: {...} }` keyed by id)
|
|
45
|
+
*/
|
|
46
|
+
export declare function getScorecardQuestions<T>(scorecard: {
|
|
47
|
+
questions?: unknown;
|
|
48
|
+
}): T[];
|
|
29
49
|
/**
|
|
30
50
|
* Build an error message for the case where scorecard parsing failed.
|
|
31
51
|
* Includes the underlying error and a snippet of the raw stdout so post-
|
|
@@ -118,6 +118,26 @@ function findFirstBalancedJsonObject(text) {
|
|
|
118
118
|
// Walked to end of string without closing the outermost brace.
|
|
119
119
|
return null;
|
|
120
120
|
}
|
|
121
|
+
/**
|
|
122
|
+
* Defensive accessor for `scorecard.questions`. Returns the array if the
|
|
123
|
+
* field is array-shaped; returns `[]` for any other shape (missing, null,
|
|
124
|
+
* boolean, number, object-keyed-by-id, etc.). The model occasionally invents
|
|
125
|
+
* its own scorecard schema and puts non-arrays here — the wrapper must not
|
|
126
|
+
* crash when that happens.
|
|
127
|
+
*
|
|
128
|
+
* Use this everywhere the wrapper iterates `scorecard.questions`. Never
|
|
129
|
+
* iterate the field directly (with `?? []` or otherwise) — `?? []` only
|
|
130
|
+
* catches null/undefined, not falsy-but-not-nullish values like `false`,
|
|
131
|
+
* `0`, `""`, or non-array objects.
|
|
132
|
+
*
|
|
133
|
+
* Surfaced bugs this prevents:
|
|
134
|
+
* - `for (const q of scorecard.questions)` when `questions` is missing
|
|
135
|
+
* - `for (const q of scorecard.questions ?? [])` when `questions` is `{}`
|
|
136
|
+
* (e.g., model returned `questions: { conventions: {...} }` keyed by id)
|
|
137
|
+
*/
|
|
138
|
+
export function getScorecardQuestions(scorecard) {
|
|
139
|
+
return Array.isArray(scorecard.questions) ? scorecard.questions : [];
|
|
140
|
+
}
|
|
121
141
|
/**
|
|
122
142
|
* Build an error message for the case where scorecard parsing failed.
|
|
123
143
|
* Includes the underlying error and a snippet of the raw stdout so post-
|
|
@@ -14,6 +14,15 @@ export interface ValidateTrajectoryOptions {
|
|
|
14
14
|
* check at apps/indusk-mcp/hooks/validate-impl-structure.js.
|
|
15
15
|
*/
|
|
16
16
|
rationaleRequired?: boolean;
|
|
17
|
+
/**
|
|
18
|
+
* The phase number that counts as "writable today against the current stack."
|
|
19
|
+
* Trajectory rows whose `Writable at` is ≤ baseline are exempt from the
|
|
20
|
+
* rationale-completeness rule. Defaults to 0 (the original behavior:
|
|
21
|
+
* Phase 0 rows are exempt). Plans where Phase 1 IS the enabling work
|
|
22
|
+
* (refactors, schema migrations, scaffolding) set this to 1 so rows
|
|
23
|
+
* authored at Phase 1 don't require justification entries.
|
|
24
|
+
*/
|
|
25
|
+
rationaleBaseline?: number;
|
|
17
26
|
}
|
|
18
27
|
/**
|
|
19
28
|
* Rule 1: Every impl document must have a `## Test Trajectory` section.
|
|
@@ -39,23 +48,27 @@ export declare function validateTemporalCoherence(trajectory: Trajectory): Valid
|
|
|
39
48
|
export declare function validateDeferredCompleteness(trajectory: Trajectory): ValidationError[];
|
|
40
49
|
/**
|
|
41
50
|
* Rule 5: When the impl frontmatter sets `rationale: required`, every
|
|
42
|
-
* trajectory row whose `Writable at` is later than
|
|
43
|
-
*
|
|
51
|
+
* trajectory row whose `Writable at` is later than the configured baseline
|
|
52
|
+
* (default Phase 0) must have an entry in the `### Trajectory Rationale`
|
|
53
|
+
* subsection.
|
|
44
54
|
*
|
|
45
|
-
*
|
|
46
|
-
*
|
|
47
|
-
*
|
|
48
|
-
*
|
|
55
|
+
* The baseline names the phase that counts as "writable today against the
|
|
56
|
+
* current stack" for this plan. Default 0 — Phase 0 rows are exempt because
|
|
57
|
+
* they're writable before any plan code lands. Plans where Phase 1 IS the
|
|
58
|
+
* enabling work (refactors, schema migrations, scaffolding) can declare
|
|
59
|
+
* `rationale_baseline: 1` in frontmatter so Phase 1 rows are exempt too.
|
|
49
60
|
*
|
|
50
|
-
* If no row needs a rationale (every row is
|
|
51
|
-
* is optional. If any row is
|
|
52
|
-
* an entry for every
|
|
53
|
-
* trajectory) are always flagged.
|
|
61
|
+
* If no row needs a rationale (every row is ≤ baseline), the subsection
|
|
62
|
+
* itself is optional. If any row is later than baseline, the subsection
|
|
63
|
+
* must exist and contain an entry for every such row. Stale entries
|
|
64
|
+
* (entries for IDs not in the trajectory) are always flagged.
|
|
54
65
|
*
|
|
55
66
|
* Mirrors `validateRationaleCompleteness` in
|
|
56
67
|
* `.claude/hooks/validate-impl-structure.js`.
|
|
57
68
|
*/
|
|
58
|
-
export declare function validateRationaleCompleteness(body: string, trajectory: Trajectory): ValidationError[];
|
|
69
|
+
export declare function validateRationaleCompleteness(body: string, trajectory: Trajectory, options?: {
|
|
70
|
+
baseline?: number;
|
|
71
|
+
}): ValidationError[];
|
|
59
72
|
/**
|
|
60
73
|
* Run all trajectory validation rules against an impl body. The body is the
|
|
61
74
|
* markdown content after the frontmatter — pass the output of `gray-matter`
|
|
@@ -185,31 +185,34 @@ export function validateDeferredCompleteness(trajectory) {
|
|
|
185
185
|
}
|
|
186
186
|
/**
|
|
187
187
|
* Rule 5: When the impl frontmatter sets `rationale: required`, every
|
|
188
|
-
* trajectory row whose `Writable at` is later than
|
|
189
|
-
*
|
|
188
|
+
* trajectory row whose `Writable at` is later than the configured baseline
|
|
189
|
+
* (default Phase 0) must have an entry in the `### Trajectory Rationale`
|
|
190
|
+
* subsection.
|
|
190
191
|
*
|
|
191
|
-
*
|
|
192
|
-
*
|
|
193
|
-
*
|
|
194
|
-
*
|
|
192
|
+
* The baseline names the phase that counts as "writable today against the
|
|
193
|
+
* current stack" for this plan. Default 0 — Phase 0 rows are exempt because
|
|
194
|
+
* they're writable before any plan code lands. Plans where Phase 1 IS the
|
|
195
|
+
* enabling work (refactors, schema migrations, scaffolding) can declare
|
|
196
|
+
* `rationale_baseline: 1` in frontmatter so Phase 1 rows are exempt too.
|
|
195
197
|
*
|
|
196
|
-
* If no row needs a rationale (every row is
|
|
197
|
-
* is optional. If any row is
|
|
198
|
-
* an entry for every
|
|
199
|
-
* trajectory) are always flagged.
|
|
198
|
+
* If no row needs a rationale (every row is ≤ baseline), the subsection
|
|
199
|
+
* itself is optional. If any row is later than baseline, the subsection
|
|
200
|
+
* must exist and contain an entry for every such row. Stale entries
|
|
201
|
+
* (entries for IDs not in the trajectory) are always flagged.
|
|
200
202
|
*
|
|
201
203
|
* Mirrors `validateRationaleCompleteness` in
|
|
202
204
|
* `.claude/hooks/validate-impl-structure.js`.
|
|
203
205
|
*/
|
|
204
|
-
export function validateRationaleCompleteness(body, trajectory) {
|
|
206
|
+
export function validateRationaleCompleteness(body, trajectory, options = {}) {
|
|
205
207
|
const errors = [];
|
|
206
|
-
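const rowsNeedingRationale = trajectory.rows.filter((r) => Number.isFinite(r.writableAt) && r.writableAt > 0);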
|
|
208
|
+
const baseline = Number.isFinite(options.baseline) ? Number(options.baseline) : 0;
|
|
209
|
+
const rowsNeedingRationale = trajectory.rows.filter((r) => Number.isFinite(r.writableAt) && r.writableAt > baseline);
|
|
207
210
|
const hasSubsection = /^###\s+Trajectory Rationale\b/m.test(body);
|
|
208
211
|
const rationaleIds = hasSubsection ? parseRationaleBlock(body) : new Set();
|
|
209
212
|
if (rowsNeedingRationale.length > 0 && !hasSubsection) {
|
|
210
213
|
errors.push({
|
|
211
214
|
rule: "rationale-completeness",
|
|
212
|
-
message: `\`rationale: required\` is set and ${rowsNeedingRationale.length} trajectory row(s) have \`Writable at\` later than Phase
|
|
215
|
+
message: `\`rationale: required\` is set and ${rowsNeedingRationale.length} trajectory row(s) have \`Writable at\` later than Phase ${baseline}, but the impl is missing the \`### Trajectory Rationale\` subsection. Rows at or below the baseline don't need rationale; rows where authoring waits on later plan code do — add an entry for ${rowsNeedingRationale.map((r) => r.id).join(", ")}.`,
|
|
213
216
|
});
|
|
214
217
|
// Even without the subsection, fall through to also check for stale entries
|
|
215
218
|
// (there are none in this case, but the structure is symmetric).
|
|
@@ -222,7 +225,7 @@ export function validateRationaleCompleteness(body, trajectory) {
|
|
|
222
225
|
if (missing.length > 0 && hasSubsection) {
|
|
223
226
|
errors.push({
|
|
224
227
|
rule: "rationale-completeness",
|
|
225
|
-
message: `Trajectory rows with \`Writable at\` later than Phase
|
|
228
|
+
message: `Trajectory rows with \`Writable at\` later than Phase ${baseline} missing from \`### Trajectory Rationale\`: ${missing.join(", ")}. Every row whose authoring waits on later plan code needs a \`- **TN** \`Writable at: Phase N\` — {reason}\` entry. Rows at or below the baseline (Phase ${baseline}) do not need rationale.`,
|
|
226
229
|
});
|
|
227
230
|
}
|
|
228
231
|
const knownIds = new Set(trajectory.rows.map((r) => r.id));
|
|
@@ -278,7 +281,9 @@ export function validateTrajectory(body, options = {}) {
|
|
|
278
281
|
...validateDeferredCompleteness(trajectory),
|
|
279
282
|
];
|
|
280
283
|
if (options.rationaleRequired) {
|
|
281
|
-
errors.push(...validateRationaleCompleteness(body, trajectory));
|
|
284
|
+
errors.push(...validateRationaleCompleteness(body, trajectory, {
|
|
285
|
+
baseline: options.rationaleBaseline,
|
|
286
|
+
}));
|
|
282
287
|
}
|
|
283
288
|
return errors;
|
|
284
289
|
}
|
|
@@ -312,9 +312,13 @@ const trajectoryRequiredFrontmatter = /trajectory:\s*required/.test(frontmatter)
|
|
|
312
312
|
const hasTrajectoryHeading = /^##\s+Test Trajectory\b/m.test(body);
|
|
313
313
|
const trajectoryValidationEnabled = trajectoryRequiredFrontmatter || hasTrajectoryHeading;
|
|
314
314
|
const rationaleRequiredFrontmatter = /rationale:\s*required/.test(frontmatter);
|
|
315
|
+
const rationaleBaselineMatch = frontmatter.match(/rationale_baseline:\s*(\d+)/);
|
|
316
|
+
const rationaleBaseline = rationaleBaselineMatch
|
|
317
|
+
? Number.parseInt(rationaleBaselineMatch[1], 10)
|
|
318
|
+
: 0;
|
|
315
319
|
|
|
316
320
|
if (trajectoryValidationEnabled) {
|
|
317
|
-
const trajectoryErrors = validateTrajectory(body, rationaleRequiredFrontmatter);
|
|
321
|
+
const trajectoryErrors = validateTrajectory(body, rationaleRequiredFrontmatter, rationaleBaseline);
|
|
318
322
|
if (trajectoryErrors.length > 0) {
|
|
319
323
|
process.stderr.write(
|
|
320
324
|
`Test Trajectory validation failed (policy: ${gatePolicy}):\n${trajectoryErrors.map((e) => ` [${e.rule}] ${e.message}`).join("\n")}\n\nSee .indusk/planning/tests-first-planning/adr.md Sections 3-6 for the Test Trajectory shape and validator rules.\n`,
|
|
@@ -347,7 +351,7 @@ process.exit(0);
|
|
|
347
351
|
// apps/indusk-mcp/src/lib/trajectory/validator.ts and parser.ts)
|
|
348
352
|
// ------------------------------------------------------------------
|
|
349
353
|
|
|
350
|
-
function validateTrajectory(implBody, rationaleRequired) {
|
|
354
|
+
function validateTrajectory(implBody, rationaleRequired, rationaleBaseline = 0) {
|
|
351
355
|
const errors = [];
|
|
352
356
|
|
|
353
357
|
// Rule 1: trajectory presence
|
|
@@ -365,7 +369,7 @@ function validateTrajectory(implBody, rationaleRequired) {
|
|
|
365
369
|
errors.push(...validateTemporalCoherence(trajectory));
|
|
366
370
|
errors.push(...validateDeferredCompleteness(trajectory));
|
|
367
371
|
if (rationaleRequired) {
|
|
368
|
-
errors.push(...validateRationaleCompleteness(implBody, trajectory));
|
|
372
|
+
errors.push(...validateRationaleCompleteness(implBody, trajectory, rationaleBaseline));
|
|
369
373
|
}
|
|
370
374
|
return errors;
|
|
371
375
|
}
|
|
@@ -646,11 +650,12 @@ function validateDeferredCompleteness(trajectory) {
|
|
|
646
650
|
// Read the entries together: shared weak excuses signal over-sequencing.
|
|
647
651
|
// ------------------------------------------------------------------
|
|
648
652
|
|
|
649
|
-
function validateRationaleCompleteness(implBody, trajectory) {
|
|
653
|
+
function validateRationaleCompleteness(implBody, trajectory, baseline = 0) {
|
|
650
654
|
const errors = [];
|
|
655
|
+
const baselineNum = Number.isFinite(baseline) ? Number(baseline) : 0;
|
|
651
656
|
|
|
652
657
|
const rowsNeedingRationale = trajectory.rows.filter(
|
|
653
|
-
(r) => Number.isFinite(r.writableAt) && r.writableAt > 0,
|
|
658
|
+
(r) => Number.isFinite(r.writableAt) && r.writableAt > baselineNum,
|
|
654
659
|
);
|
|
655
660
|
const hasSubsection = /^###\s+Trajectory Rationale\b/m.test(implBody);
|
|
656
661
|
const rationaleIds = hasSubsection ? parseRationaleBlock(implBody) : new Set();
|
|
@@ -658,7 +663,7 @@ function validateRationaleCompleteness(implBody, trajectory) {
|
|
|
658
663
|
if (rowsNeedingRationale.length > 0 && !hasSubsection) {
|
|
659
664
|
errors.push({
|
|
660
665
|
rule: "rationale-completeness",
|
|
661
|
-
message: `\`rationale: required\` is set and ${rowsNeedingRationale.length} trajectory row(s) have \`Writable at\` later than Phase
|
|
666
|
+
message: `\`rationale: required\` is set and ${rowsNeedingRationale.length} trajectory row(s) have \`Writable at\` later than Phase ${baselineNum}, but the impl is missing the \`### Trajectory Rationale\` subsection. Rows at or below the baseline don't need rationale; rows where authoring waits on later plan code do — add an entry for ${rowsNeedingRationale.map((r) => r.id).join(", ")}.`,
|
|
662
667
|
});
|
|
663
668
|
}
|
|
664
669
|
|
|
@@ -670,7 +675,7 @@ function validateRationaleCompleteness(implBody, trajectory) {
|
|
|
670
675
|
if (missing.length > 0 && hasSubsection) {
|
|
671
676
|
errors.push({
|
|
672
677
|
rule: "rationale-completeness",
|
|
673
|
-
message: `Trajectory rows with \`Writable at\` later than Phase
|
|
678
|
+
message: `Trajectory rows with \`Writable at\` later than Phase ${baselineNum} missing from \`### Trajectory Rationale\`: ${missing.join(", ")}. Every row whose authoring waits on later plan code needs a \`- **TN** \`Writable at: Phase N\` — {reason}\` entry. Rows at or below the baseline (Phase ${baselineNum}) do not need rationale.`,
|
|
674
679
|
});
|
|
675
680
|
}
|
|
676
681
|
|