gyoshu 0.2.5 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +363 -0
- package/README.md +1 -0
- package/package.json +1 -1
- package/src/agent/baksa.md +81 -0
- package/src/agent/gyoshu.md +180 -0
- package/src/agent/jogyo.md +55 -0
- package/src/lib/goal-gates.ts +753 -0
- package/src/lib/notebook-frontmatter.ts +307 -40
- package/src/tool/gyoshu-completion.ts +53 -0
|
@@ -0,0 +1,753 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Goal Gates Library - Evaluate acceptance criteria against notebook outputs.
|
|
3
|
+
*
|
|
4
|
+
* This module implements the "Goal Gate" component of the Two-Gate system:
|
|
5
|
+
* - Trust Gate: Evidence quality (implemented in quality-gates.ts)
|
|
6
|
+
* - Goal Gate: Acceptance criteria met (THIS MODULE)
|
|
7
|
+
*
|
|
8
|
+
* Goal Gate Rules:
|
|
9
|
+
* - Evaluates acceptance criteria defined in notebook frontmatter
|
|
10
|
+
* - Checks metric thresholds, marker existence, artifact creation, finding counts
|
|
11
|
+
* - Provides pivot recommendations when criteria are not met
|
|
12
|
+
*
|
|
13
|
+
* @module goal-gates
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import {
|
|
17
|
+
GoalContract,
|
|
18
|
+
AcceptanceCriterion,
|
|
19
|
+
ComparisonOperator,
|
|
20
|
+
} from "./notebook-frontmatter";
|
|
21
|
+
import { parseMarkers, getMarkersByType } from "./marker-parser";
|
|
22
|
+
|
|
23
|
+
// =============================================================================
|
|
24
|
+
// TYPES AND INTERFACES
|
|
25
|
+
// =============================================================================
|
|
26
|
+
|
|
27
|
+
/**
 * Outcome of checking one acceptance criterion.
 *
 * Within this module, "MET" / "NOT_MET" are produced by the evaluators and
 * "UNKNOWN" is used for criteria that are malformed or of an unrecognized kind.
 */
export type CriterionStatus =
  | "MET"
  | "NOT_MET"
  | "BLOCKED"
  | "UNKNOWN";
|
|
31
|
+
|
|
32
|
+
/**
 * Outcome record for one evaluated acceptance criterion.
 */
export interface CriterionResult {
  /** Criterion this record refers to. */
  criterion: AcceptanceCriterion;

  /** Evaluation outcome for the criterion. */
  status: CriterionStatus;

  /**
   * Value observed during evaluation, when there is one: the metric value,
   * the finding count, a found/not-found flag, or the first matching artifact path.
   */
  actualValue?: number | string | boolean;

  /** Human-readable explanation of the outcome. */
  message: string;
}
|
|
45
|
+
|
|
46
|
+
/**
 * Aggregated outcome of evaluating every criterion in a goal contract.
 */
export interface GoalGateResult {
  /** True when the gate is considered passed. */
  passed: boolean;

  /** Summary status across all criteria. */
  overallStatus: "MET" | "NOT_MET" | "BLOCKED" | "NO_CONTRACT";

  /** Per-criterion outcomes, one entry per evaluated criterion. */
  criteriaResults: CriterionResult[];

  /** How many criteria ended with status MET. */
  metCount: number;

  /** How many criteria were evaluated in total. */
  totalCount: number;

  /** Messages of criteria that reported BLOCKED; omitted when there are none. */
  blockers?: string[];
}
|
|
63
|
+
|
|
64
|
+
/**
 * Advice on whether and how to retry after the goal gate did not pass.
 */
export interface PivotRecommendation {
  /** True when another attempt with changes is advised. */
  shouldPivot: boolean;

  /** The attempt number this recommendation was computed for. */
  attemptNumber: number;

  /** Upper bound on the number of attempts. */
  maxAttempts: number;

  /** Suggested actions for improving the results. */
  suggestions: string[];
}
|
|
77
|
+
|
|
78
|
+
// =============================================================================
|
|
79
|
+
// HELPER FUNCTIONS
|
|
80
|
+
// =============================================================================
|
|
81
|
+
|
|
82
|
+
/**
 * Extract a numeric metric value from notebook output.
 *
 * Finds the first `[METRIC:name]` marker (matched case-insensitively) and
 * parses the number that follows it. Accepted layouts:
 * - `[METRIC:name] 0.85`
 * - `[METRIC:name] 0.85 (with description)`
 * - `[METRIC:name] = 0.85`
 * - `[METRIC:name] value = 0.85` (a label on the same line, followed by `=`)
 *
 * The number may carry a sign, may start with a decimal point, and may use
 * scientific notation (`-0.5`, `.25`, `1e-3`).
 *
 * @param output - Combined stdout from notebook cells
 * @param metricName - Name of the metric to extract (e.g., "cv_accuracy_mean")
 * @returns Extracted numeric value or undefined if no marker with a number is found
 *
 * @example
 * ```typescript
 * const output = "[METRIC:cv_accuracy_mean] 0.85\n[METRIC:baseline] 0.65";
 * const value = extractMetricValue(output, "cv_accuracy_mean");
 * // value === 0.85
 * ```
 */
export function extractMetricValue(
  output: string,
  metricName: string
): number | undefined {
  // The metric name is user-supplied text, so neutralise regex metacharacters.
  const escapedName = metricName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

  // Optional sign, digits with optional fraction (or a bare fraction), optional exponent.
  const numberPattern = "[-+]?(?:\\d+\\.?\\d*|\\.\\d+)(?:[eE][-+]?\\d+)?";

  // Either a same-line "label =" prefix, or plain whitespace with an optional "=".
  const separatorPattern =
    "(?:[ \\t]*[A-Za-z_][\\w \\t]*=\\s*|\\s*(?:=\\s*)?)";

  const pattern = new RegExp(
    `\\[METRIC:${escapedName}\\]${separatorPattern}(${numberPattern})`,
    "i"
  );

  const captured = pattern.exec(output)?.[1];
  if (captured === undefined) {
    return undefined;
  }

  const value = Number.parseFloat(captured);
  return Number.isNaN(value) ? undefined : value;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Apply a comparison operator to a pair of numbers.
 *
 * @param actual - Left-hand operand (the observed value)
 * @param op - Operator to apply
 * @param target - Right-hand operand (the value to compare against)
 * @returns Result of `actual <op> target`; false for an unrecognized operator
 */
function compareValues(
  actual: number,
  op: ComparisonOperator,
  target: number
): boolean {
  if (op === ">=") {
    return actual >= target;
  }
  if (op === ">") {
    return actual > target;
  }
  if (op === "<=") {
    return actual <= target;
  }
  if (op === "<") {
    return actual < target;
  }
  if (op === "==") {
    return actual === target;
  }
  if (op === "!=") {
    return actual !== target;
  }
  // Anything else is not a supported operator.
  return false;
}
|
|
154
|
+
|
|
155
|
+
/**
 * Produce the display text for a comparison operator.
 *
 * Each supported operator is rendered as its own symbol; any other value is
 * passed through unchanged.
 *
 * @param op - Comparison operator
 * @returns Text to show for the operator
 */
function getComparisonDescription(op: ComparisonOperator): string {
  const displayText: ReadonlyMap<string, string> = new Map([
    [">=", ">="],
    [">", ">"],
    ["<=", "<="],
    ["<", "<"],
    ["==", "=="],
    ["!=", "!="],
  ]);

  return displayText.get(op) ?? op;
}
|
|
179
|
+
|
|
180
|
+
/**
 * Match a glob pattern against a file path.
 *
 * Supported syntax:
 * - `*` matches any run of characters except `/`
 * - `**` matches any run of characters including `/`; a `**` + `/` group at
 *   the start of the pattern or right after a `/` also matches zero directories
 * - `?` matches a single character
 *
 * A pattern without `/` is tested against the basename of `path` only.
 * A pattern containing `/` is tested against the trailing part of `path` and
 * must begin at the start of the path or directly after a `/`, so
 * `models/*.joblib` matches `out/models/a.joblib` but not `xmodels/a.joblib`.
 *
 * The pattern is translated in a single left-to-right pass so that the regex
 * text emitted for one glob token is never re-interpreted as another token.
 *
 * @param pattern - Glob pattern (e.g., "*.pkl", "models/*.joblib")
 * @param path - File path to match (using `/` separators)
 * @returns true if pattern matches
 */
function matchGlobPattern(pattern: string, path: string): boolean {
  let regexStr = "";
  let index = 0;

  while (index < pattern.length) {
    const ch = pattern.charAt(index);

    if (ch === "*") {
      if (pattern.charAt(index + 1) === "*") {
        const atSegmentStart =
          index === 0 || pattern.charAt(index - 1) === "/";
        if (atSegmentStart && pattern.charAt(index + 2) === "/") {
          // Globstar directory group: zero or more leading directories.
          regexStr += "(?:.*/)?";
          index += 3;
        } else {
          // Plain globstar: anything, including path separators.
          regexStr += ".*";
          index += 2;
        }
      } else {
        // Single star stays within one path segment.
        regexStr += "[^/]*";
        index += 1;
      }
    } else if (ch === "?") {
      regexStr += ".";
      index += 1;
    } else {
      // Literal character: escape it if it is a regex metacharacter.
      regexStr += /[.+^${}()|[\]\\]/.test(ch) ? `\\${ch}` : ch;
      index += 1;
    }
  }

  if (!pattern.includes("/")) {
    // Filename-only pattern: compare against the basename.
    const basename = path.split("/").pop() || path;
    return new RegExp(`^${regexStr}$`).test(basename);
  }

  // Path pattern: match the tail of the path, starting on a segment boundary.
  return new RegExp(`(?:^|/)${regexStr}$`).test(path);
}
|
|
215
|
+
|
|
216
|
+
// =============================================================================
|
|
217
|
+
// EVALUATOR FUNCTIONS
|
|
218
|
+
// =============================================================================
|
|
219
|
+
|
|
220
|
+
/**
 * Evaluate a metric threshold criterion.
 *
 * Extracts the metric value from output and compares it against the target.
 *
 * Outcomes:
 * - "UNKNOWN" when the criterion lacks `metric`, `op`, or `target`
 * - "NOT_MET" when the metric marker is absent from the output, or the
 *   comparison fails
 * - "MET" when the comparison succeeds
 *
 * On a failed comparison the message states the relation that actually holds
 * (the logical inverse of the required operator), e.g. a failed `>=` is
 * reported as `<`. An unrecognized operator is reported as "does not meet".
 *
 * @param criterion - The metric_threshold criterion to evaluate
 * @param output - Combined stdout from notebook cells
 * @returns Evaluation result
 *
 * @example
 * ```typescript
 * const criterion = {
 *   id: "AC1",
 *   kind: "metric_threshold",
 *   metric: "cv_accuracy_mean",
 *   op: ">=",
 *   target: 0.90
 * };
 * const result = evaluateMetricThreshold(criterion, "[METRIC:cv_accuracy_mean] 0.85");
 * // result.status === "NOT_MET"
 * // result.actualValue === 0.85
 * // result.message === "cv_accuracy_mean (0.85) < target (0.9)"
 * ```
 */
export function evaluateMetricThreshold(
  criterion: AcceptanceCriterion,
  output: string
): CriterionResult {
  const { metric, op, target } = criterion;

  // Validate criterion has required fields
  if (!metric || !op || target === undefined) {
    return {
      criterion,
      status: "UNKNOWN",
      message: `Invalid metric_threshold criterion: missing metric, op, or target`,
    };
  }

  const actualValue = extractMetricValue(output, metric);

  if (actualValue === undefined) {
    return {
      criterion,
      status: "NOT_MET",
      message: `Metric [METRIC:${metric}] not found in output`,
    };
  }

  if (compareValues(actualValue, op, target)) {
    return {
      criterion,
      status: "MET",
      actualValue,
      message: `${metric} (${actualValue}) ${getComparisonDescription(op)} target (${target})`,
    };
  }

  // Look the inverse up in a table: chaining string replacements would let a
  // later replacement rewrite the output of an earlier one.
  const inverseOperators: ReadonlyMap<string, string> = new Map([
    [">=", "<"],
    [">", "<="],
    ["<=", ">"],
    ["<", ">="],
    ["==", "!="],
    ["!=", "=="],
  ]);
  const observedRelation = inverseOperators.get(op) ?? "does not meet";

  return {
    criterion,
    status: "NOT_MET",
    actualValue,
    message: `${metric} (${actualValue}) ${observedRelation} target (${target})`,
  };
}
|
|
293
|
+
|
|
294
|
+
/**
 * Evaluate a marker_required criterion.
 *
 * Searches the output, case-insensitively, for the criterion's marker text
 * wrapped in square brackets (marker "METRIC:baseline_accuracy" is looked up
 * as "[METRIC:baseline_accuracy]").
 *
 * @param criterion - The marker_required criterion to evaluate
 * @param output - Combined stdout from notebook cells
 * @returns "UNKNOWN" when the criterion has no marker, otherwise "MET" or
 *          "NOT_MET" with `actualValue` set to whether the marker was found
 *
 * @example
 * ```typescript
 * const criterion = {
 *   id: "AC2",
 *   kind: "marker_required",
 *   marker: "METRIC:baseline_accuracy"
 * };
 * const result = evaluateMarkerRequired(criterion, "[METRIC:baseline_accuracy] 0.65");
 * // result.status === "MET"
 * // result.message === "Marker [METRIC:baseline_accuracy] found"
 * ```
 */
export function evaluateMarkerRequired(
  criterion: AcceptanceCriterion,
  output: string
): CriterionResult {
  const marker = criterion.marker;

  // Without a marker there is nothing to look for.
  if (!marker) {
    return {
      criterion,
      status: "UNKNOWN",
      message: `Invalid marker_required criterion: missing marker field`,
    };
  }

  // Treat the marker as literal text inside the bracket pair.
  const literalMarker = marker.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const isPresent = new RegExp(`\\[${literalMarker}\\]`, "i").test(output);

  return {
    criterion,
    status: isPresent ? "MET" : "NOT_MET",
    actualValue: isPresent,
    message: isPresent
      ? `Marker [${marker}] found`
      : `Marker [${marker}] not found in output`,
  };
}
|
|
351
|
+
|
|
352
|
+
/**
 * Evaluate an artifact_exists criterion.
 *
 * The criterion is satisfied when at least one artifact path matches the
 * criterion's glob pattern.
 *
 * @param criterion - The artifact_exists criterion to evaluate
 * @param artifacts - List of artifact file paths
 * @returns "UNKNOWN" when the criterion has no pattern; "MET" with the first
 *          matching path as `actualValue` and the basenames of all matches in
 *          the message; otherwise "NOT_MET"
 *
 * @example
 * ```typescript
 * const criterion = {
 *   id: "AC3",
 *   kind: "artifact_exists",
 *   artifactPattern: "*.pkl"
 * };
 * const result = evaluateArtifactExists(criterion, ["reports/my-research/model.pkl"]);
 * // result.status === "MET"
 * // result.message === "Artifact matching *.pkl found: model.pkl"
 * ```
 */
export function evaluateArtifactExists(
  criterion: AcceptanceCriterion,
  artifacts: string[]
): CriterionResult {
  const globPattern = criterion.artifactPattern;

  // A pattern is mandatory for this kind of criterion.
  if (!globPattern) {
    return {
      criterion,
      status: "UNKNOWN",
      message: `Invalid artifact_exists criterion: missing artifactPattern field`,
    };
  }

  // Collect every artifact path the pattern accepts, in input order.
  const hits: string[] = [];
  for (const candidate of artifacts) {
    if (matchGlobPattern(globPattern, candidate)) {
      hits.push(candidate);
    }
  }

  if (hits.length === 0) {
    return {
      criterion,
      status: "NOT_MET",
      message: `No artifact matching ${globPattern} found`,
    };
  }

  // Report basenames only to keep the message short.
  const hitNames = hits.map((hit) => hit.split("/").pop()).join(", ");

  return {
    criterion,
    status: "MET",
    actualValue: hits[0],
    message: `Artifact matching ${globPattern} found: ${hitNames}`,
  };
}
|
|
409
|
+
|
|
410
|
+
/**
 * Evaluate a finding_count criterion.
 *
 * Parses the output with `parseMarkers`, selects the "FINDING" markers via
 * `getMarkersByType`, and checks their number against the required minimum.
 *
 * @param criterion - The finding_count criterion to evaluate
 * @param output - Combined stdout from notebook cells
 * @returns "UNKNOWN" when `minCount` is missing or negative; otherwise "MET"
 *          or "NOT_MET" with the number of findings as `actualValue`
 *
 * @example
 * ```typescript
 * const criterion = {
 *   id: "AC4",
 *   kind: "finding_count",
 *   minCount: 3
 * };
 * const text = "[FINDING] First\n[FINDING] Second";
 * const result = evaluateFindingCount(criterion, text);
 * // result.status === "NOT_MET"
 * // result.actualValue === 2
 * // result.message === "Found 2 findings, need at least 3"
 * ```
 */
export function evaluateFindingCount(
  criterion: AcceptanceCriterion,
  output: string
): CriterionResult {
  const minimum = criterion.minCount;

  // The minimum must be present and non-negative.
  if (minimum === undefined || minimum < 0) {
    return {
      criterion,
      status: "UNKNOWN",
      message: `Invalid finding_count criterion: missing or invalid minCount field`,
    };
  }

  const { markers } = parseMarkers(output);
  const count = getMarkersByType(markers, "FINDING").length;

  // Shared "N finding(s)" prefix for both outcomes.
  const summary = `Found ${count} finding${count !== 1 ? "s" : ""}`;
  const enough = count >= minimum;

  return {
    criterion,
    status: enough ? "MET" : "NOT_MET",
    actualValue: count,
    message: enough
      ? `${summary}, meets minimum of ${minimum}`
      : `${summary}, need at least ${minimum}`,
  };
}
|
|
467
|
+
|
|
468
|
+
// =============================================================================
|
|
469
|
+
// MAIN EVALUATION FUNCTION
|
|
470
|
+
// =============================================================================
|
|
471
|
+
|
|
472
|
+
/**
 * Evaluate a goal contract against notebook outputs.
 *
 * Dispatches every acceptance criterion to the evaluator for its kind and
 * combines the individual outcomes:
 * - no contract: passes with overall status "NO_CONTRACT"
 * - contract without criteria: passes with overall status "MET"
 * - any criterion "BLOCKED": overall "BLOCKED" (not passed)
 * - every criterion "MET": overall "MET" (passed)
 * - anything else: overall "NOT_MET" (not passed)
 *
 * A criterion whose kind is not recognized is recorded as "UNKNOWN".
 *
 * @param contract - Goal contract with acceptance criteria (or undefined)
 * @param notebookOutput - Combined stdout from all notebook cells
 * @param artifacts - List of artifact file paths
 * @returns Goal gate evaluation result
 *
 * @example
 * ```typescript
 * const contract = {
 *   version: 1,
 *   goal_text: "Build a classifier with 90% accuracy",
 *   acceptance_criteria: [
 *     { id: "AC1", kind: "metric_threshold", metric: "cv_accuracy_mean", op: ">=", target: 0.90 },
 *     { id: "AC2", kind: "marker_required", marker: "METRIC:baseline_accuracy" }
 *   ]
 * };
 * const output = "[METRIC:cv_accuracy_mean] 0.92\n[METRIC:baseline_accuracy] 0.65";
 * const result = evaluateGoalGate(contract, output, []);
 * // result.passed === true
 * // result.overallStatus === "MET"
 * ```
 */
export function evaluateGoalGate(
  contract: GoalContract | undefined,
  notebookOutput: string,
  artifacts: string[]
): GoalGateResult {
  // Nothing to check when no contract was supplied.
  if (!contract) {
    return {
      passed: true,
      overallStatus: "NO_CONTRACT",
      criteriaResults: [],
      metCount: 0,
      totalCount: 0,
    };
  }

  // A contract with no criteria is trivially satisfied.
  const criteria = contract.acceptance_criteria;
  if (!criteria || criteria.length === 0) {
    return {
      passed: true,
      overallStatus: "MET",
      criteriaResults: [],
      metCount: 0,
      totalCount: 0,
    };
  }

  const criteriaResults: CriterionResult[] = [];
  const blockers: string[] = [];
  let metCount = 0;

  for (const criterion of criteria) {
    let outcome: CriterionResult;

    if (criterion.kind === "metric_threshold") {
      outcome = evaluateMetricThreshold(criterion, notebookOutput);
    } else if (criterion.kind === "marker_required") {
      outcome = evaluateMarkerRequired(criterion, notebookOutput);
    } else if (criterion.kind === "artifact_exists") {
      outcome = evaluateArtifactExists(criterion, artifacts);
    } else if (criterion.kind === "finding_count") {
      outcome = evaluateFindingCount(criterion, notebookOutput);
    } else {
      outcome = {
        criterion,
        status: "UNKNOWN",
        message: `Unknown criterion kind: ${(criterion as AcceptanceCriterion).kind}`,
      };
    }

    criteriaResults.push(outcome);

    // Tally while iterating instead of re-scanning the results afterwards.
    if (outcome.status === "MET") {
      metCount += 1;
    } else if (outcome.status === "BLOCKED") {
      blockers.push(outcome.message);
    }
  }

  const totalCount = criteriaResults.length;
  const isBlocked = blockers.length > 0;
  const allMet = metCount === totalCount;

  // BLOCKED takes precedence over the MET / NOT_MET decision.
  const gate: GoalGateResult = {
    passed: !isBlocked && allMet,
    overallStatus: isBlocked ? "BLOCKED" : allMet ? "MET" : "NOT_MET",
    criteriaResults,
    metCount,
    totalCount,
  };

  if (isBlocked) {
    gate.blockers = blockers;
  }

  return gate;
}
|
|
602
|
+
|
|
603
|
+
// =============================================================================
|
|
604
|
+
// PIVOT RECOMMENDATION
|
|
605
|
+
// =============================================================================
|
|
606
|
+
|
|
607
|
+
/**
 * Build the suggestions for a single criterion that was not met.
 *
 * @param criterionResult - Result of a criterion whose status is not "MET"
 * @returns Suggestions for that criterion (never empty)
 */
function suggestCriterionFixes(criterionResult: CriterionResult): string[] {
  const { criterion, status, actualValue, message } = criterionResult;

  // A malformed or unrecognized criterion cannot be satisfied by changing the
  // notebook; point at the definition instead of templating missing fields.
  if (status === "UNKNOWN") {
    return [`Fix acceptance criterion definition: ${message}`];
  }

  const tips: string[] = [];

  switch (criterion.kind) {
    case "metric_threshold": {
      if (typeof actualValue !== "number") {
        tips.push(
          `Add [METRIC:${criterion.metric}] output to track ${criterion.metric}`
        );
        break;
      }

      const target = criterion.target;
      if (target === undefined) {
        tips.push(
          `Improve ${criterion.metric}: current ${actualValue.toFixed(3)} does not satisfy the criterion`
        );
      } else {
        const gap = Math.abs(target - actualValue);
        // Relative gap is undefined for a zero target; report the absolute gap then.
        const gapText =
          target !== 0
            ? `${((gap / Math.abs(target)) * 100).toFixed(1)}%`
            : gap.toFixed(3);
        tips.push(
          `Improve ${criterion.metric}: current ${actualValue.toFixed(3)}, need ${criterion.op} ${target} (gap: ${gapText})`
        );
      }

      // Add specific suggestions based on metric type
      if (criterion.metric?.includes("accuracy")) {
        tips.push(
          "Consider: feature engineering, hyperparameter tuning, or trying different algorithms"
        );
      } else if (criterion.metric?.includes("cv_")) {
        tips.push(
          "Consider: increasing cross-validation folds, stratified sampling, or data augmentation"
        );
      }
      break;
    }

    case "marker_required": {
      tips.push(
        `Add missing marker: [${criterion.marker}] - required for acceptance`
      );
      if (criterion.marker?.startsWith("METRIC:baseline")) {
        tips.push(
          "Tip: Compute baseline model performance before training main model"
        );
      }
      break;
    }

    case "artifact_exists": {
      tips.push(`Create artifact matching: ${criterion.artifactPattern}`);
      if (criterion.artifactPattern?.includes(".pkl")) {
        tips.push(
          "Tip: Use joblib.dump() or pickle.dump() to save model artifacts"
        );
      }
      break;
    }

    case "finding_count": {
      const found = typeof actualValue === "number" ? actualValue : 0;
      const needed = (criterion.minCount ?? 0) - found;
      if (needed > 0) {
        tips.push(
          `Add ${needed} more [FINDING] marker${needed !== 1 ? "s" : ""} with statistical evidence`
        );
      }
      tips.push(
        "Remember: Each finding needs [STAT:ci] and [STAT:effect_size] within 10 lines before it"
      );
      break;
    }

    default: {
      tips.push(`Address unmet criterion: ${message}`);
    }
  }

  return tips;
}

/**
 * Generate pivot recommendations based on goal gate results.
 *
 * Decision order:
 * 1. Gate passed or there is no contract: no pivot, no suggestions.
 * 2. Gate is BLOCKED: no pivot; the blocker messages are returned as
 *    suggestions (a generic message when none were recorded).
 * 3. `attemptNumber >= maxAttempts`: no pivot; suggests revising the goal
 *    criteria or escalating.
 * 4. Otherwise: pivot, with at least one suggestion for every criterion that
 *    is not MET plus a general note for the first and second attempt.
 *
 * @param result - Goal gate evaluation result
 * @param attemptNumber - Current attempt number (1-indexed)
 * @param maxAttempts - Maximum attempts allowed (default: 3)
 * @returns Pivot recommendation with suggestions
 *
 * @example
 * ```typescript
 * const gateResult = evaluateGoalGate(contract, output, artifacts);
 * const pivot = recommendPivot(gateResult, 1, 3);
 * if (pivot.shouldPivot) {
 *   console.log("Suggestions:", pivot.suggestions);
 * }
 * ```
 */
export function recommendPivot(
  result: GoalGateResult,
  attemptNumber: number,
  maxAttempts: number = 3
): PivotRecommendation {
  const noPivot = (suggestions: string[]): PivotRecommendation => ({
    shouldPivot: false,
    attemptNumber,
    maxAttempts,
    suggestions,
  });

  // If all criteria met, no pivot needed
  if (result.passed || result.overallStatus === "NO_CONTRACT") {
    return noPivot([]);
  }

  // If blocked, can't pivot - need to resolve blockers
  if (result.overallStatus === "BLOCKED") {
    const blockers = result.blockers;
    return noPivot(
      blockers && blockers.length > 0
        ? blockers
        : ["Resolve blocking issues before retrying"]
    );
  }

  // If no attempts remaining, can't pivot
  if (attemptNumber >= maxAttempts) {
    return noPivot([
      "Maximum attempts reached. Consider revising goal criteria or escalating.",
    ]);
  }

  // Generate suggestions based on failed criteria
  const suggestions: string[] = [];
  for (const criterionResult of result.criteriaResults) {
    if (criterionResult.status !== "MET") {
      suggestions.push(...suggestCriterionFixes(criterionResult));
    }
  }

  // Add general pivot suggestions based on attempt number
  if (attemptNumber === 1) {
    suggestions.push(
      "First attempt incomplete. Review the specific criteria above and iterate."
    );
  } else if (attemptNumber === 2) {
    suggestions.push(
      "Second attempt. Consider more aggressive changes: different algorithms, data transformations, or feature sets."
    );
  }

  return {
    shouldPivot: true,
    attemptNumber,
    maxAttempts,
    suggestions,
  };
}
|