wogiflow 2.4.3 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "wogiflow",
-  "version": "2.4.3",
+  "version": "2.5.0",
   "description": "AI-powered development workflow management system with multi-model support",
   "main": "lib/index.js",
   "bin": {
@@ -573,6 +573,34 @@ const CONFIG_DEFAULTS = {
     failureThresholdForFallback: 3
   },
 
+  // --- Skeptical Evaluator (Anthropic harness design pattern) ---
+  // Spawns a separate sub-agent to evaluate task output before quality gates.
+  // Addresses "confident praise bias" where the implementer always thinks it did well.
+  skepticalEvaluator: {
+    enabled: true,
+    _comment_enabled: 'Spawn a separate evaluator agent between Step 3.5 and Step 4',
+    maxIterations: 3,
+    _comment_maxIterations: 'Max eval→fix cycles before proceeding anyway',
+    model: 'sonnet',
+    _comment_model: 'Use a different model than the implementer for diversity',
+    calibration: true,
+    _comment_calibration: 'Inject few-shot calibration examples into evaluator prompt',
+    skipForL3: true,
+    _comment_skipForL3: 'Skip for trivial L3 subtasks'
+  },
+
+  // --- Sprint-Based Context Reset (Anthropic harness design pattern) ---
+  // For large tasks (5+ criteria), commit and reset context every N criteria.
+  // Fresh context per sprint prevents quality degradation on later criteria.
+  sprintReset: {
+    enabled: true,
+    _comment_enabled: 'Enable sprint-based context resets for large tasks',
+    criteriaPerSprint: 3,
+    _comment_criteriaPerSprint: 'Number of criteria to complete before a context reset',
+    minTaskCriteria: 5,
+    _comment_minTaskCriteria: 'Only activate for tasks with this many or more criteria'
+  },
+
   // --- Session Features ---
   morningBriefing: { enabled: false },
   techDebt: {
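
The two blocks above are pure configuration; the loop that consumes them lives in the harness scripts. As a rough sketch of the intended eval→fix cycle, assuming something like the following runs between Step 3.5 and Step 4 (`spawnEvaluator` and `applyFixes` are invented names for illustration, not wogiflow APIs):

```js
// Illustrative sketch only: spawnEvaluator and applyFixes are hypothetical.
async function runSkepticalEval(taskOutput, config) {
  const { enabled, maxIterations, model, calibration } = config.skepticalEvaluator;
  if (!enabled) return taskOutput;

  for (let i = 0; i < maxIterations; i++) {
    // A separate sub-agent (on a different model) critiques the implementer's
    // work, countering the "confident praise bias" named in the config comments.
    const verdict = await spawnEvaluator({ model, calibration, output: taskOutput });
    if (verdict.pass) return taskOutput; // hand off to the quality gates
    taskOutput = await applyFixes(taskOutput, verdict.critique);
  }
  return taskOutput; // proceed anyway after maxIterations, per _comment_maxIterations
}
```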
@@ -0,0 +1,257 @@
+#!/usr/bin/env node
+
+/**
+ * Wogi Flow - Eval Calibration
+ *
+ * Stores and retrieves calibrated eval examples for anchoring judge scores.
+ * Prevents score drift by providing few-shot examples of what high and low
+ * scores look like in practice.
+ *
+ * Based on Anthropic's harness design research finding that "few-shot examples
+ * with detailed score breakdowns calibrated evaluator judgment, reducing score
+ * drift across iterations."
+ *
+ * Usage:
+ *   node flow-eval-calibration.js save <taskId> <quality> — save as calibration example
+ *   node flow-eval-calibration.js get — get calibration examples for prompt injection
+ *   node flow-eval-calibration.js list — list all calibration examples
+ */
+
+const path = require('node:path');
+const fs = require('node:fs');
+const { PATHS, safeJsonParse, writeJson } = require('./flow-utils');
+
+// ============================================================
+// Constants
+// ============================================================
+
+const CALIBRATION_PATH = path.join(PATHS.state, 'eval-calibration.json');
+const MAX_EXAMPLES_PER_QUALITY = 3; // Keep 3 high, 3 low
+
+// ============================================================
+// Storage
+// ============================================================
+
+/**
+ * Load calibration data
+ * @returns {Object} { high: [], low: [], lastUpdated }
+ */
+function loadCalibration() {
+  return safeJsonParse(CALIBRATION_PATH, {
+    high: [],
+    low: [],
+    lastUpdated: null
+  });
+}
+
+/**
+ * Save a completed eval as a calibration example.
+ * Called after /wogi-eval produces scores.
+ *
+ * @param {Object} params
+ * @param {string} params.taskId — the task that was evaluated
+ * @param {string} params.quality — "high" or "low"
+ * @param {Object} params.scores — { completeness, accuracy, workflowCompliance, tokenEfficiency, quality }
+ * @param {string} params.specSummary — brief spec description (first 500 chars)
+ * @param {string} params.diffSummary — brief diff description (file count, line count)
+ * @param {string} params.notes — judge's justification notes
+ */
+function saveCalibrationExample(params) {
+  const { taskId, quality, scores, specSummary, diffSummary, notes } = params;
+
+  if (quality !== 'high' && quality !== 'low') {
+    throw new Error('Quality must be "high" or "low"');
+  }
+
+  const cal = loadCalibration();
+  const example = {
+    taskId,
+    scores,
+    specSummary: (specSummary || '').slice(0, 500),
+    diffSummary: (diffSummary || '').slice(0, 200),
+    notes: (notes || '').slice(0, 500),
+    savedAt: new Date().toISOString()
+  };
+
+  cal[quality].unshift(example);
+
+  // Keep only MAX_EXAMPLES_PER_QUALITY
+  if (cal[quality].length > MAX_EXAMPLES_PER_QUALITY) {
+    cal[quality] = cal[quality].slice(0, MAX_EXAMPLES_PER_QUALITY);
+  }
+
+  cal.lastUpdated = new Date().toISOString();
+  writeJson(CALIBRATION_PATH, cal);
+
+  return example;
+}
+
+/**
+ * Auto-classify and save an eval result as calibration.
+ * High = average score >= 8. Low = average score <= 4.
+ *
+ * @param {Object} evalResult — from flow-eval.js
+ * @returns {Object|null} saved example or null if score is in the middle range
+ */
+function autoSaveFromEval(evalResult) {
+  if (!evalResult || !evalResult.scores) return null;
+
+  const scores = evalResult.scores;
+  const values = Object.values(scores).filter(v => typeof v === 'number');
+  if (values.length === 0) return null;
+
+  const avg = values.reduce((sum, v) => sum + v, 0) / values.length;
+
+  let quality = null;
+  if (avg >= 8) quality = 'high';
+  else if (avg <= 4) quality = 'low';
+  else return null; // Middle range — not a good calibration anchor
+
+  return saveCalibrationExample({
+    taskId: evalResult.taskId,
+    quality,
+    scores,
+    specSummary: evalResult.specSummary || '',
+    diffSummary: evalResult.diffSummary || '',
+    notes: evalResult.notes || ''
+  });
+}
+
+// ============================================================
+// Retrieval (for prompt injection)
+// ============================================================
+
+/**
+ * Get calibration examples formatted for injection into judge/evaluator prompts.
+ * Returns 1 high + 1 low example (if available).
+ *
+ * @returns {string} formatted calibration text, or empty string if no examples
+ */
+function getCalibrationPrompt() {
+  const cal = loadCalibration();
+  const parts = [];
+
+  if (cal.high.length > 0) {
+    const ex = cal.high[0];
+    parts.push(`## Calibration Example: HIGH QUALITY (reference)
+
+**Task**: ${ex.taskId}
+**Spec**: ${ex.specSummary}
+**Scores**: completeness=${ex.scores.completeness}, accuracy=${ex.scores.accuracy}, workflowCompliance=${ex.scores.workflowCompliance}, tokenEfficiency=${ex.scores.tokenEfficiency}, quality=${ex.scores.quality}
+**Why this scored high**: ${ex.notes}`);
+  }
+
+  if (cal.low.length > 0) {
+    const ex = cal.low[0];
+    parts.push(`## Calibration Example: LOW QUALITY (reference)
+
+**Task**: ${ex.taskId}
+**Spec**: ${ex.specSummary}
+**Scores**: completeness=${ex.scores.completeness}, accuracy=${ex.scores.accuracy}, workflowCompliance=${ex.scores.workflowCompliance}, tokenEfficiency=${ex.scores.tokenEfficiency}, quality=${ex.scores.quality}
+**Why this scored low**: ${ex.notes}`);
+  }
+
+  if (parts.length === 0) return '';
+
+  return `
+## Score Calibration (anchoring examples)
+
+Use these real examples to calibrate your scoring. They represent the extremes of the scale — most tasks should score between these.
+
+${parts.join('\n\n')}
+
+---
+`;
+}
+
+/**
+ * Get calibration examples as structured data
+ * @returns {{ high: Object|null, low: Object|null }}
+ */
+function getCalibrationExamples() {
+  const cal = loadCalibration();
+  return {
+    high: cal.high[0] || null,
+    low: cal.low[0] || null
+  };
+}
+
+// ============================================================
+// CLI
+// ============================================================
+
+function main() {
+  const args = process.argv.slice(2);
+  const command = args[0];
+
+  switch (command) {
+    case 'save': {
+      const taskId = args[1];
+      const quality = args[2];
+      if (!taskId || !quality) {
+        console.error('Usage: flow-eval-calibration.js save <taskId> <high|low>');
+        process.exit(1);
+      }
+      // Read scores from stdin or eval results
+      const evalsDir = path.join(PATHS.workflow, 'evals');
+      const evalFiles = fs.existsSync(evalsDir) ? fs.readdirSync(evalsDir).filter(f => f.includes(taskId)) : [];
+      if (evalFiles.length === 0) {
+        console.error(`No eval results found for task ${taskId}`);
+        process.exit(1);
+      }
+      const evalResult = safeJsonParse(path.join(evalsDir, evalFiles[0]), null);
+      if (evalResult) {
+        const saved = saveCalibrationExample({
+          taskId,
+          quality,
+          scores: evalResult.aggregated || evalResult.scores || {},
+          specSummary: evalResult.spec?.substring(0, 500) || '',
+          diffSummary: `${(evalResult.changedFiles || []).length} files changed`,
+          notes: evalResult.notes || evalResult.aggregated?.notes || ''
+        });
+        console.log(`Saved ${quality} calibration example: ${saved.taskId}`);
+      }
+      break;
+    }
+
+    case 'get':
+      console.log(getCalibrationPrompt() || 'No calibration examples yet.');
+      break;
+
+    case 'list': {
+      const cal = loadCalibration();
+      console.log(`High examples: ${cal.high.length}`);
+      for (const ex of cal.high) {
+        const values = Object.values(ex.scores).filter(v => typeof v === 'number');
+        const avg = values.length > 0 ? values.reduce((s, v) => s + v, 0) / values.length : 0;
+        console.log(`  ${ex.taskId} — avg ${avg.toFixed(1)} (${ex.savedAt})`);
+      }
+      console.log(`Low examples: ${cal.low.length}`);
+      for (const ex of cal.low) {
+        const values = Object.values(ex.scores).filter(v => typeof v === 'number');
+        const avg = values.length > 0 ? values.reduce((s, v) => s + v, 0) / values.length : 0;
+        console.log(`  ${ex.taskId} — avg ${avg.toFixed(1)} (${ex.savedAt})`);
+      }
+      break;
+    }
+
+    default:
+      console.log('Usage: flow-eval-calibration.js <save|get|list>');
+  }
+}
+
+// ============================================================
+// Exports
+// ============================================================
+
+module.exports = {
+  loadCalibration,
+  saveCalibrationExample,
+  autoSaveFromEval,
+  getCalibrationPrompt,
+  getCalibrationExamples
+};
+
+if (require.main === module) {
+  main();
+}
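
Programmatic use of the new module mirrors its CLI. A minimal sketch using only the documented exports (the task id, scores, and summaries below are invented for illustration):

```js
const { saveCalibrationExample, getCalibrationPrompt } = require('./flow-eval-calibration');

// Pin a known-good eval as the "high" anchor (all field values here are made up).
saveCalibrationExample({
  taskId: 'TASK-123',
  quality: 'high',
  scores: { completeness: 9, accuracy: 9, workflowCompliance: 8, tokenEfficiency: 8, quality: 9 },
  specSummary: 'Add retry with backoff to the fetch wrapper',
  diffSummary: '2 files changed',
  notes: 'All criteria met; tests added; no scope creep'
});

// Later, inject the stored anchors into an evaluator prompt.
const calibrationBlock = getCalibrationPrompt(); // '' until at least one example exists
```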
@@ -62,10 +62,19 @@ const DEFAULT_EVAL_CONFIG = {
 function buildJudgePrompt(params) {
   const { taskId, specContent, implementationDiff, iterations, tokenEstimate } = params;
 
+  // Inject calibration examples if available (prevents score drift)
+  let calibrationBlock = '';
+  try {
+    const { getCalibrationPrompt } = require('./flow-eval-calibration');
+    calibrationBlock = getCalibrationPrompt();
+  } catch (_err) {
+    // Calibration module not available — continue without it
+  }
+
   return `You are an expert code reviewer evaluating AI-generated implementation quality.
 
 ## Task: ${taskId}
-
+${calibrationBlock}
 ## Specification
 ${specContent}
 
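Because `getCalibrationPrompt()` returns an empty string when nothing is stored, `${calibrationBlock}` degrades to the blank line it replaced, and the guarded `require` keeps `buildJudgePrompt` working on installs without the calibration module. A quick illustrative check:

```js
const { getCalibrationPrompt } = require('./flow-eval-calibration');

// With an empty eval-calibration.json this logs "", so the rendered
// judge prompt keeps the pre-2.5.0 layout.
console.log(JSON.stringify(getCalibrationPrompt()));
```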
@@ -248,6 +248,15 @@ function saveEvalResult(evalResult) {
 
   try {
     writeJson(filePath, evalResult);
+
+    // Auto-save as calibration example if scores are extreme (high or low)
+    try {
+      const { autoSaveFromEval } = require('./flow-eval-calibration');
+      autoSaveFromEval(evalResult);
+    } catch (_err) {
+      // Calibration module not available — non-critical
+    }
+
     return filePath;
   } catch (err) {
     if (process.env.DEBUG) {
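
With this hook, calibration anchors accumulate without manual curation: `autoSaveFromEval` averages the numeric scores, so (9+8+8+9+9)/5 = 8.6 is stored under "high", (3+2+4+3+3)/5 = 3.0 under "low", and mid-range results are discarded. For example (task ids and scores invented):

```js
const { autoSaveFromEval } = require('./flow-eval-calibration');

// avg 8.6 >= 8, so this is stored as a "high" anchor
autoSaveFromEval({
  taskId: 'TASK-9',
  scores: { completeness: 9, accuracy: 8, workflowCompliance: 8, tokenEfficiency: 9, quality: 9 }
});

// avg 6.0 sits in the middle of the scale: returns null, nothing stored
autoSaveFromEval({
  taskId: 'TASK-10',
  scores: { completeness: 6, accuracy: 6, workflowCompliance: 6, tokenEfficiency: 6, quality: 6 }
});
```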