gyoshu 0.2.5 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,753 @@
1
+ /**
2
+ * Goal Gates Library - Evaluate acceptance criteria against notebook outputs.
3
+ *
4
+ * This module implements the "Goal Gate" component of the Two-Gate system:
5
+ * - Trust Gate: Evidence quality (implemented in quality-gates.ts)
6
+ * - Goal Gate: Acceptance criteria met (THIS MODULE)
7
+ *
8
+ * Goal Gate Rules:
9
+ * - Evaluates acceptance criteria defined in notebook frontmatter
10
+ * - Checks metric thresholds, marker existence, artifact creation, finding counts
11
+ * - Provides pivot recommendations when criteria are not met
12
+ *
13
+ * @module goal-gates
14
+ */
15
+
16
+ import {
17
+ GoalContract,
18
+ AcceptanceCriterion,
19
+ ComparisonOperator,
20
+ } from "./notebook-frontmatter";
21
+ import { parseMarkers, getMarkersByType } from "./marker-parser";
22
+
23
+ // =============================================================================
24
+ // TYPES AND INTERFACES
25
+ // =============================================================================
26
+
27
/**
 * Status of a single acceptance criterion evaluation.
 *
 * - "MET": the criterion was evaluated and passed.
 * - "NOT_MET": the criterion was evaluated and failed.
 * - "BLOCKED": evaluation could not proceed; the result message is collected
 *   into GoalGateResult.blockers by evaluateGoalGate.
 * - "UNKNOWN": the criterion definition was invalid (missing required fields)
 *   or its kind was not recognized.
 */
export type CriterionStatus = "MET" | "NOT_MET" | "BLOCKED" | "UNKNOWN";
31
+
32
/**
 * Result of evaluating a single acceptance criterion.
 */
export interface CriterionResult {
  /** The criterion that was evaluated */
  criterion: AcceptanceCriterion;
  /** Whether the criterion was met */
  status: CriterionStatus;
  /**
   * Actual value observed, when one exists:
   * a number for metric_threshold and finding_count, a boolean for
   * marker_required (marker present or not), and the first matching
   * file path (string) for artifact_exists. Absent when the criterion
   * was invalid or no value could be extracted.
   */
  actualValue?: number | string | boolean;
  /** Human-readable message describing the result */
  message: string;
}
45
+
46
/**
 * Result of evaluating all goal gate criteria.
 */
export interface GoalGateResult {
  /**
   * Whether the gate passed: true when every criterion is MET, or when
   * there was no contract / no criteria to evaluate.
   */
  passed: boolean;
  /** Overall status of the goal gate */
  overallStatus: "MET" | "NOT_MET" | "BLOCKED" | "NO_CONTRACT";
  /** Individual results for each criterion (empty when no contract/criteria) */
  criteriaResults: CriterionResult[];
  /** Number of criteria that were MET */
  metCount: number;
  /** Total number of criteria evaluated */
  totalCount: number;
  /** List of blocker messages; only present when at least one criterion is BLOCKED */
  blockers?: string[];
}
63
+
64
/**
 * Recommendation for pivoting when goal criteria are not met.
 */
export interface PivotRecommendation {
  /**
   * Whether a pivot is recommended. False when the gate passed, when the
   * gate is BLOCKED (blockers must be resolved first), or when attempts
   * are exhausted.
   */
  shouldPivot: boolean;
  /** Current attempt number (1-indexed) */
  attemptNumber: number;
  /** Maximum attempts allowed */
  maxAttempts: number;
  /** List of suggested actions to improve results (empty when no pivot needed) */
  suggestions: string[];
}
77
+
78
+ // =============================================================================
79
+ // HELPER FUNCTIONS
80
+ // =============================================================================
81
+
82
+ /**
83
+ * Extract a metric value from notebook output.
84
+ *
85
+ * Searches for pattern: [METRIC:name] value
86
+ *
87
+ * @param output - Combined stdout from notebook cells
88
+ * @param metricName - Name of the metric to extract (e.g., "cv_accuracy_mean")
89
+ * @returns Extracted numeric value or undefined if not found
90
+ *
91
+ * @example
92
+ * ```typescript
93
+ * const output = "[METRIC:cv_accuracy_mean] 0.85\n[METRIC:baseline] 0.65";
94
+ * const value = extractMetricValue(output, "cv_accuracy_mean");
95
+ * // value === 0.85
96
+ * ```
97
+ */
98
+ export function extractMetricValue(
99
+ output: string,
100
+ metricName: string
101
+ ): number | undefined {
102
+ // Pattern to match [METRIC:name] value
103
+ // Handles various formats:
104
+ // - [METRIC:name] 0.85
105
+ // - [METRIC:name] 0.85 (with description)
106
+ // - [METRIC:name] value = 0.85
107
+ const escapedName = metricName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
108
+ const pattern = new RegExp(
109
+ `\\[METRIC:${escapedName}\\]\\s*(?:=\\s*)?(\\d+\\.?\\d*)`,
110
+ "i"
111
+ );
112
+
113
+ const match = output.match(pattern);
114
+ if (match && match[1]) {
115
+ const value = parseFloat(match[1]);
116
+ if (!isNaN(value)) {
117
+ return value;
118
+ }
119
+ }
120
+
121
+ return undefined;
122
+ }
123
+
124
+ /**
125
+ * Compare two values using a comparison operator.
126
+ *
127
+ * @param actual - The actual value to compare
128
+ * @param op - Comparison operator
129
+ * @param target - Target value to compare against
130
+ * @returns true if comparison passes
131
+ */
132
+ function compareValues(
133
+ actual: number,
134
+ op: ComparisonOperator,
135
+ target: number
136
+ ): boolean {
137
+ switch (op) {
138
+ case ">=":
139
+ return actual >= target;
140
+ case ">":
141
+ return actual > target;
142
+ case "<=":
143
+ return actual <= target;
144
+ case "<":
145
+ return actual < target;
146
+ case "==":
147
+ return actual === target;
148
+ case "!=":
149
+ return actual !== target;
150
+ default:
151
+ return false;
152
+ }
153
+ }
154
+
155
+ /**
156
+ * Get the human-readable comparison description.
157
+ *
158
+ * @param op - Comparison operator
159
+ * @returns Human-readable comparison string
160
+ */
161
+ function getComparisonDescription(op: ComparisonOperator): string {
162
+ switch (op) {
163
+ case ">=":
164
+ return ">=";
165
+ case ">":
166
+ return ">";
167
+ case "<=":
168
+ return "<=";
169
+ case "<":
170
+ return "<";
171
+ case "==":
172
+ return "==";
173
+ case "!=":
174
+ return "!=";
175
+ default:
176
+ return op;
177
+ }
178
+ }
179
+
180
+ /**
181
+ * Match a glob pattern against a file path.
182
+ *
183
+ * Supports basic glob patterns:
184
+ * - * matches any characters except /
185
+ * - ** matches any characters including /
186
+ * - ? matches single character
187
+ *
188
+ * @param pattern - Glob pattern (e.g., "*.pkl", "models/*.joblib")
189
+ * @param path - File path to match
190
+ * @returns true if pattern matches
191
+ */
192
+ function matchGlobPattern(pattern: string, path: string): boolean {
193
+ // Escape regex special chars except * and ?
194
+ let regexStr = pattern
195
+ .replace(/[.+^${}()|[\]\\]/g, "\\$&")
196
+ // Handle ** first (match any including /)
197
+ .replace(/\*\*/g, ".*")
198
+ // Handle * (match any except /)
199
+ .replace(/(?<!\.\*)\*/g, "[^/]*")
200
+ // Handle ?
201
+ .replace(/\?/g, ".");
202
+
203
+ // Ensure pattern matches the end of the path (basename or relative path)
204
+ if (!pattern.includes("/")) {
205
+ // Pattern is just a filename pattern, match against basename
206
+ const basename = path.split("/").pop() || path;
207
+ const regex = new RegExp(`^${regexStr}$`);
208
+ return regex.test(basename);
209
+ }
210
+
211
+ // Pattern includes path separators, match against full path
212
+ const regex = new RegExp(`${regexStr}$`);
213
+ return regex.test(path);
214
+ }
215
+
216
+ // =============================================================================
217
+ // EVALUATOR FUNCTIONS
218
+ // =============================================================================
219
+
220
+ /**
221
+ * Evaluate a metric threshold criterion.
222
+ *
223
+ * Extracts the metric value from output and compares against target.
224
+ *
225
+ * @param criterion - The metric_threshold criterion to evaluate
226
+ * @param output - Combined stdout from notebook cells
227
+ * @returns Evaluation result
228
+ *
229
+ * @example
230
+ * ```typescript
231
+ * const criterion = {
232
+ * id: "AC1",
233
+ * kind: "metric_threshold",
234
+ * metric: "cv_accuracy_mean",
235
+ * op: ">=",
236
+ * target: 0.90
237
+ * };
238
+ * const result = evaluateMetricThreshold(criterion, "[METRIC:cv_accuracy_mean] 0.85");
239
+ * // result.status === "NOT_MET"
240
+ * // result.actualValue === 0.85
241
+ * // result.message === "cv_accuracy_mean (0.85) < target (0.90)"
242
+ * ```
243
+ */
244
+ export function evaluateMetricThreshold(
245
+ criterion: AcceptanceCriterion,
246
+ output: string
247
+ ): CriterionResult {
248
+ // Validate criterion has required fields
249
+ if (!criterion.metric || !criterion.op || criterion.target === undefined) {
250
+ return {
251
+ criterion,
252
+ status: "UNKNOWN",
253
+ message: `Invalid metric_threshold criterion: missing metric, op, or target`,
254
+ };
255
+ }
256
+
257
+ const actualValue = extractMetricValue(output, criterion.metric);
258
+
259
+ if (actualValue === undefined) {
260
+ return {
261
+ criterion,
262
+ status: "NOT_MET",
263
+ message: `Metric [METRIC:${criterion.metric}] not found in output`,
264
+ };
265
+ }
266
+
267
+ const passed = compareValues(actualValue, criterion.op, criterion.target);
268
+ const compDesc = getComparisonDescription(criterion.op);
269
+
270
+ if (passed) {
271
+ return {
272
+ criterion,
273
+ status: "MET",
274
+ actualValue,
275
+ message: `${criterion.metric} (${actualValue}) ${compDesc} target (${criterion.target})`,
276
+ };
277
+ } else {
278
+ // Invert the operator for the failure message
279
+ const failedOp = criterion.op.replace(">=", "<")
280
+ .replace(">", "<=")
281
+ .replace("<=", ">")
282
+ .replace("<", ">=")
283
+ .replace("==", "!=")
284
+ .replace("!=", "==");
285
+ return {
286
+ criterion,
287
+ status: "NOT_MET",
288
+ actualValue,
289
+ message: `${criterion.metric} (${actualValue}) ${failedOp.charAt(0) === criterion.op.charAt(0) ? "does not meet" : failedOp} target (${criterion.target})`,
290
+ };
291
+ }
292
+ }
293
+
294
+ /**
295
+ * Evaluate a marker required criterion.
296
+ *
297
+ * Checks if the specified marker exists in the output.
298
+ *
299
+ * @param criterion - The marker_required criterion to evaluate
300
+ * @param output - Combined stdout from notebook cells
301
+ * @returns Evaluation result
302
+ *
303
+ * @example
304
+ * ```typescript
305
+ * const criterion = {
306
+ * id: "AC2",
307
+ * kind: "marker_required",
308
+ * marker: "METRIC:baseline_accuracy"
309
+ * };
310
+ * const result = evaluateMarkerRequired(criterion, "[METRIC:baseline_accuracy] 0.65");
311
+ * // result.status === "MET"
312
+ * // result.message === "Marker [METRIC:baseline_accuracy] found"
313
+ * ```
314
+ */
315
+ export function evaluateMarkerRequired(
316
+ criterion: AcceptanceCriterion,
317
+ output: string
318
+ ): CriterionResult {
319
+ // Validate criterion has required fields
320
+ if (!criterion.marker) {
321
+ return {
322
+ criterion,
323
+ status: "UNKNOWN",
324
+ message: `Invalid marker_required criterion: missing marker field`,
325
+ };
326
+ }
327
+
328
+ // Build pattern to match the marker
329
+ // Marker format: "METRIC:baseline_accuracy" matches [METRIC:baseline_accuracy]
330
+ const escapedMarker = criterion.marker.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
331
+ const pattern = new RegExp(`\\[${escapedMarker}\\]`, "i");
332
+
333
+ const found = pattern.test(output);
334
+
335
+ if (found) {
336
+ return {
337
+ criterion,
338
+ status: "MET",
339
+ actualValue: true,
340
+ message: `Marker [${criterion.marker}] found`,
341
+ };
342
+ } else {
343
+ return {
344
+ criterion,
345
+ status: "NOT_MET",
346
+ actualValue: false,
347
+ message: `Marker [${criterion.marker}] not found in output`,
348
+ };
349
+ }
350
+ }
351
+
352
+ /**
353
+ * Evaluate an artifact exists criterion.
354
+ *
355
+ * Checks if any artifact matches the specified glob pattern.
356
+ *
357
+ * @param criterion - The artifact_exists criterion to evaluate
358
+ * @param artifacts - List of artifact file paths
359
+ * @returns Evaluation result
360
+ *
361
+ * @example
362
+ * ```typescript
363
+ * const criterion = {
364
+ * id: "AC3",
365
+ * kind: "artifact_exists",
366
+ * artifactPattern: "*.pkl"
367
+ * };
368
+ * const result = evaluateArtifactExists(criterion, ["reports/my-research/model.pkl"]);
369
+ * // result.status === "MET"
370
+ * // result.message === "Artifact matching *.pkl found: model.pkl"
371
+ * ```
372
+ */
373
+ export function evaluateArtifactExists(
374
+ criterion: AcceptanceCriterion,
375
+ artifacts: string[]
376
+ ): CriterionResult {
377
+ // Validate criterion has required fields
378
+ if (!criterion.artifactPattern) {
379
+ return {
380
+ criterion,
381
+ status: "UNKNOWN",
382
+ message: `Invalid artifact_exists criterion: missing artifactPattern field`,
383
+ };
384
+ }
385
+
386
+ // Find matching artifacts
387
+ const matchingArtifacts = artifacts.filter((artifact) =>
388
+ matchGlobPattern(criterion.artifactPattern!, artifact)
389
+ );
390
+
391
+ if (matchingArtifacts.length > 0) {
392
+ const matchedNames = matchingArtifacts
393
+ .map((a) => a.split("/").pop())
394
+ .join(", ");
395
+ return {
396
+ criterion,
397
+ status: "MET",
398
+ actualValue: matchingArtifacts[0],
399
+ message: `Artifact matching ${criterion.artifactPattern} found: ${matchedNames}`,
400
+ };
401
+ } else {
402
+ return {
403
+ criterion,
404
+ status: "NOT_MET",
405
+ message: `No artifact matching ${criterion.artifactPattern} found`,
406
+ };
407
+ }
408
+ }
409
+
410
+ /**
411
+ * Evaluate a finding count criterion.
412
+ *
413
+ * Counts [FINDING] markers in the output and compares against minimum.
414
+ *
415
+ * @param criterion - The finding_count criterion to evaluate
416
+ * @param output - Combined stdout from notebook cells
417
+ * @returns Evaluation result
418
+ *
419
+ * @example
420
+ * ```typescript
421
+ * const criterion = {
422
+ * id: "AC4",
423
+ * kind: "finding_count",
424
+ * minCount: 3
425
+ * };
426
+ * const text = "[FINDING] First\n[FINDING] Second";
427
+ * const result = evaluateFindingCount(criterion, text);
428
+ * // result.status === "NOT_MET"
429
+ * // result.actualValue === 2
430
+ * // result.message === "Found 2 findings, need at least 3"
431
+ * ```
432
+ */
433
+ export function evaluateFindingCount(
434
+ criterion: AcceptanceCriterion,
435
+ output: string
436
+ ): CriterionResult {
437
+ // Validate criterion has required fields
438
+ if (criterion.minCount === undefined || criterion.minCount < 0) {
439
+ return {
440
+ criterion,
441
+ status: "UNKNOWN",
442
+ message: `Invalid finding_count criterion: missing or invalid minCount field`,
443
+ };
444
+ }
445
+
446
+ // Parse markers and count FINDING markers
447
+ const parseResult = parseMarkers(output);
448
+ const findings = getMarkersByType(parseResult.markers, "FINDING");
449
+ const count = findings.length;
450
+
451
+ if (count >= criterion.minCount) {
452
+ return {
453
+ criterion,
454
+ status: "MET",
455
+ actualValue: count,
456
+ message: `Found ${count} finding${count !== 1 ? "s" : ""}, meets minimum of ${criterion.minCount}`,
457
+ };
458
+ } else {
459
+ return {
460
+ criterion,
461
+ status: "NOT_MET",
462
+ actualValue: count,
463
+ message: `Found ${count} finding${count !== 1 ? "s" : ""}, need at least ${criterion.minCount}`,
464
+ };
465
+ }
466
+ }
467
+
468
+ // =============================================================================
469
+ // MAIN EVALUATION FUNCTION
470
+ // =============================================================================
471
+
472
+ /**
473
+ * Evaluate goal contract against notebook outputs.
474
+ *
475
+ * Runs all acceptance criteria evaluators and aggregates results.
476
+ *
477
+ * @param contract - Goal contract with acceptance criteria (or undefined)
478
+ * @param notebookOutput - Combined stdout from all notebook cells
479
+ * @param artifacts - List of artifact file paths
480
+ * @returns Goal gate evaluation result
481
+ *
482
+ * @example
483
+ * ```typescript
484
+ * const contract = {
485
+ * version: 1,
486
+ * goal_text: "Build a classifier with 90% accuracy",
487
+ * acceptance_criteria: [
488
+ * { id: "AC1", kind: "metric_threshold", metric: "cv_accuracy_mean", op: ">=", target: 0.90 },
489
+ * { id: "AC2", kind: "marker_required", marker: "METRIC:baseline_accuracy" }
490
+ * ]
491
+ * };
492
+ * const output = "[METRIC:cv_accuracy_mean] 0.92\n[METRIC:baseline_accuracy] 0.65";
493
+ * const result = evaluateGoalGate(contract, output, []);
494
+ * // result.passed === true
495
+ * // result.overallStatus === "MET"
496
+ * ```
497
+ */
498
+ export function evaluateGoalGate(
499
+ contract: GoalContract | undefined,
500
+ notebookOutput: string,
501
+ artifacts: string[]
502
+ ): GoalGateResult {
503
+ // Handle missing contract
504
+ if (!contract) {
505
+ return {
506
+ passed: true,
507
+ overallStatus: "NO_CONTRACT",
508
+ criteriaResults: [],
509
+ metCount: 0,
510
+ totalCount: 0,
511
+ };
512
+ }
513
+
514
+ // Handle empty criteria
515
+ if (
516
+ !contract.acceptance_criteria ||
517
+ contract.acceptance_criteria.length === 0
518
+ ) {
519
+ return {
520
+ passed: true,
521
+ overallStatus: "MET",
522
+ criteriaResults: [],
523
+ metCount: 0,
524
+ totalCount: 0,
525
+ };
526
+ }
527
+
528
+ const criteriaResults: CriterionResult[] = [];
529
+ const blockers: string[] = [];
530
+
531
+ // Evaluate each criterion
532
+ for (const criterion of contract.acceptance_criteria) {
533
+ let result: CriterionResult;
534
+
535
+ switch (criterion.kind) {
536
+ case "metric_threshold":
537
+ result = evaluateMetricThreshold(criterion, notebookOutput);
538
+ break;
539
+ case "marker_required":
540
+ result = evaluateMarkerRequired(criterion, notebookOutput);
541
+ break;
542
+ case "artifact_exists":
543
+ result = evaluateArtifactExists(criterion, artifacts);
544
+ break;
545
+ case "finding_count":
546
+ result = evaluateFindingCount(criterion, notebookOutput);
547
+ break;
548
+ default:
549
+ result = {
550
+ criterion,
551
+ status: "UNKNOWN",
552
+ message: `Unknown criterion kind: ${(criterion as AcceptanceCriterion).kind}`,
553
+ };
554
+ }
555
+
556
+ criteriaResults.push(result);
557
+
558
+ // Track blockers
559
+ if (result.status === "BLOCKED") {
560
+ blockers.push(result.message);
561
+ }
562
+ }
563
+
564
+ // Calculate aggregate status
565
+ const metCount = criteriaResults.filter((r) => r.status === "MET").length;
566
+ const totalCount = criteriaResults.length;
567
+ const blockedCount = criteriaResults.filter(
568
+ (r) => r.status === "BLOCKED"
569
+ ).length;
570
+ const notMetCount = criteriaResults.filter(
571
+ (r) => r.status === "NOT_MET"
572
+ ).length;
573
+
574
+ let overallStatus: "MET" | "NOT_MET" | "BLOCKED" | "NO_CONTRACT";
575
+ let passed: boolean;
576
+
577
+ if (blockedCount > 0) {
578
+ overallStatus = "BLOCKED";
579
+ passed = false;
580
+ } else if (metCount === totalCount) {
581
+ overallStatus = "MET";
582
+ passed = true;
583
+ } else {
584
+ overallStatus = "NOT_MET";
585
+ passed = false;
586
+ }
587
+
588
+ const result: GoalGateResult = {
589
+ passed,
590
+ overallStatus,
591
+ criteriaResults,
592
+ metCount,
593
+ totalCount,
594
+ };
595
+
596
+ if (blockers.length > 0) {
597
+ result.blockers = blockers;
598
+ }
599
+
600
+ return result;
601
+ }
602
+
603
+ // =============================================================================
604
+ // PIVOT RECOMMENDATION
605
+ // =============================================================================
606
+
607
+ /**
608
+ * Generate pivot recommendations based on goal gate results.
609
+ *
610
+ * Analyzes which criteria failed and suggests actions to improve results.
611
+ *
612
+ * @param result - Goal gate evaluation result
613
+ * @param attemptNumber - Current attempt number (1-indexed)
614
+ * @param maxAttempts - Maximum attempts allowed (default: 3)
615
+ * @returns Pivot recommendation with suggestions
616
+ *
617
+ * @example
618
+ * ```typescript
619
+ * const gateResult = evaluateGoalGate(contract, output, artifacts);
620
+ * const pivot = recommendPivot(gateResult, 1, 3);
621
+ * if (pivot.shouldPivot) {
622
+ * console.log("Suggestions:", pivot.suggestions);
623
+ * }
624
+ * ```
625
+ */
626
+ export function recommendPivot(
627
+ result: GoalGateResult,
628
+ attemptNumber: number,
629
+ maxAttempts: number = 3
630
+ ): PivotRecommendation {
631
+ const suggestions: string[] = [];
632
+
633
+ // If all criteria met, no pivot needed
634
+ if (result.passed || result.overallStatus === "NO_CONTRACT") {
635
+ return {
636
+ shouldPivot: false,
637
+ attemptNumber,
638
+ maxAttempts,
639
+ suggestions: [],
640
+ };
641
+ }
642
+
643
+ // If blocked, can't pivot - need to resolve blockers
644
+ if (result.overallStatus === "BLOCKED") {
645
+ return {
646
+ shouldPivot: false,
647
+ attemptNumber,
648
+ maxAttempts,
649
+ suggestions: result.blockers || ["Resolve blocking issues before retrying"],
650
+ };
651
+ }
652
+
653
+ // If no attempts remaining, can't pivot
654
+ if (attemptNumber >= maxAttempts) {
655
+ return {
656
+ shouldPivot: false,
657
+ attemptNumber,
658
+ maxAttempts,
659
+ suggestions: [
660
+ "Maximum attempts reached. Consider revising goal criteria or escalating.",
661
+ ],
662
+ };
663
+ }
664
+
665
+ // Generate suggestions based on failed criteria
666
+ for (const criterionResult of result.criteriaResults) {
667
+ if (criterionResult.status !== "MET") {
668
+ const criterion = criterionResult.criterion;
669
+
670
+ switch (criterion.kind) {
671
+ case "metric_threshold":
672
+ if (criterionResult.actualValue !== undefined) {
673
+ const actual = criterionResult.actualValue as number;
674
+ const target = criterion.target!;
675
+ const gap = Math.abs(target - actual);
676
+ const percentGap = ((gap / target) * 100).toFixed(1);
677
+
678
+ if (gap > 0) {
679
+ suggestions.push(
680
+ `Improve ${criterion.metric}: current ${actual.toFixed(3)}, need ${criterion.op} ${target} (gap: ${percentGap}%)`
681
+ );
682
+ // Add specific suggestions based on metric type
683
+ if (criterion.metric?.includes("accuracy")) {
684
+ suggestions.push(
685
+ "Consider: feature engineering, hyperparameter tuning, or trying different algorithms"
686
+ );
687
+ } else if (criterion.metric?.includes("cv_")) {
688
+ suggestions.push(
689
+ "Consider: increasing cross-validation folds, stratified sampling, or data augmentation"
690
+ );
691
+ }
692
+ }
693
+ } else {
694
+ suggestions.push(
695
+ `Add [METRIC:${criterion.metric}] output to track ${criterion.metric}`
696
+ );
697
+ }
698
+ break;
699
+
700
+ case "marker_required":
701
+ suggestions.push(
702
+ `Add missing marker: [${criterion.marker}] - required for acceptance`
703
+ );
704
+ if (criterion.marker?.startsWith("METRIC:baseline")) {
705
+ suggestions.push(
706
+ "Tip: Compute baseline model performance before training main model"
707
+ );
708
+ }
709
+ break;
710
+
711
+ case "artifact_exists":
712
+ suggestions.push(
713
+ `Create artifact matching: ${criterion.artifactPattern}`
714
+ );
715
+ if (criterion.artifactPattern?.includes(".pkl")) {
716
+ suggestions.push(
717
+ "Tip: Use joblib.dump() or pickle.dump() to save model artifacts"
718
+ );
719
+ }
720
+ break;
721
+
722
+ case "finding_count":
723
+ const actual = (criterionResult.actualValue as number) || 0;
724
+ const needed = criterion.minCount! - actual;
725
+ suggestions.push(
726
+ `Add ${needed} more [FINDING] marker${needed !== 1 ? "s" : ""} with statistical evidence`
727
+ );
728
+ suggestions.push(
729
+ "Remember: Each finding needs [STAT:ci] and [STAT:effect_size] within 10 lines before it"
730
+ );
731
+ break;
732
+ }
733
+ }
734
+ }
735
+
736
+ // Add general pivot suggestions based on attempt number
737
+ if (attemptNumber === 1) {
738
+ suggestions.push(
739
+ "First attempt incomplete. Review the specific criteria above and iterate."
740
+ );
741
+ } else if (attemptNumber === 2) {
742
+ suggestions.push(
743
+ "Second attempt. Consider more aggressive changes: different algorithms, data transformations, or feature sets."
744
+ );
745
+ }
746
+
747
+ return {
748
+ shouldPivot: true,
749
+ attemptNumber,
750
+ maxAttempts,
751
+ suggestions,
752
+ };
753
+ }