wogiflow 2.4.3 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/wogi-audit.md +26 -0
- package/.claude/commands/wogi-review.md +29 -0
- package/.claude/commands/wogi-start.md +124 -0
- package/.claude/docs/claude-code-compatibility.md +24 -0
- package/.claude/docs/explore-agents.md +19 -2
- package/.claude/settings.json +11 -0
- package/bin/flow +11 -1
- package/lib/workspace-channel-server.js +364 -0
- package/lib/workspace-contracts.js +599 -0
- package/lib/workspace-intelligence.js +600 -0
- package/lib/workspace-messages.js +441 -0
- package/lib/workspace-routing.js +782 -0
- package/lib/workspace-sync.js +339 -0
- package/lib/workspace.js +1349 -0
- package/package.json +1 -1
- package/scripts/flow-config-defaults.js +28 -0
- package/scripts/flow-eval-calibration.js +257 -0
- package/scripts/flow-eval-judge.js +10 -1
- package/scripts/flow-eval.js +9 -0
- package/scripts/flow-schema-drift.js +837 -0
- package/scripts/hooks/adapters/claude-code.js +29 -0
- package/scripts/hooks/core/task-created.js +83 -0
- package/scripts/hooks/entry/claude-code/task-created.js +15 -0
- package/scripts/postinstall.js +2 -0
package/package.json
CHANGED
|
@@ -573,6 +573,34 @@ const CONFIG_DEFAULTS = {
|
|
|
573
573
|
failureThresholdForFallback: 3
|
|
574
574
|
},
|
|
575
575
|
|
|
576
|
+
// --- Skeptical Evaluator (Anthropic harness design pattern) ---
|
|
577
|
+
// Spawns a separate sub-agent to evaluate task output before quality gates.
|
|
578
|
+
// Addresses "confident praise bias" where the implementer always thinks it did well.
|
|
579
|
+
skepticalEvaluator: {
|
|
580
|
+
enabled: true,
|
|
581
|
+
_comment_enabled: 'Spawn a separate evaluator agent between Step 3.5 and Step 4',
|
|
582
|
+
maxIterations: 3,
|
|
583
|
+
_comment_maxIterations: 'Max eval→fix cycles before proceeding anyway',
|
|
584
|
+
model: 'sonnet',
|
|
585
|
+
_comment_model: 'Use a different model than the implementer for diversity',
|
|
586
|
+
calibration: true,
|
|
587
|
+
_comment_calibration: 'Inject few-shot calibration examples into evaluator prompt',
|
|
588
|
+
skipForL3: true,
|
|
589
|
+
_comment_skipForL3: 'Skip for trivial L3 subtasks'
|
|
590
|
+
},
|
|
591
|
+
|
|
592
|
+
// --- Sprint-Based Context Reset (Anthropic harness design pattern) ---
|
|
593
|
+
// For large tasks (5+ criteria), commit and reset context every N criteria.
|
|
594
|
+
// Fresh context per sprint prevents quality degradation on later criteria.
|
|
595
|
+
sprintReset: {
|
|
596
|
+
enabled: true,
|
|
597
|
+
_comment_enabled: 'Enable sprint-based context resets for large tasks',
|
|
598
|
+
criteriaPerSprint: 3,
|
|
599
|
+
_comment_criteriaPerSprint: 'Number of criteria to complete before a context reset',
|
|
600
|
+
minTaskCriteria: 5,
|
|
601
|
+
_comment_minTaskCriteria: 'Only activate for tasks with this many or more criteria'
|
|
602
|
+
},
|
|
603
|
+
|
|
576
604
|
// --- Session Features ---
|
|
577
605
|
morningBriefing: { enabled: false },
|
|
578
606
|
techDebt: {
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Wogi Flow - Eval Calibration
|
|
5
|
+
*
|
|
6
|
+
* Stores and retrieves calibrated eval examples for anchoring judge scores.
|
|
7
|
+
* Prevents score drift by providing few-shot examples of what high and low
|
|
8
|
+
* scores look like in practice.
|
|
9
|
+
*
|
|
10
|
+
* Based on Anthropic's harness design research finding that "few-shot examples
|
|
11
|
+
* with detailed score breakdowns calibrated evaluator judgment, reducing score
|
|
12
|
+
* drift across iterations."
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* node flow-eval-calibration.js save <taskId> <quality> — save as calibration example
|
|
16
|
+
* node flow-eval-calibration.js get — get calibration examples for prompt injection
|
|
17
|
+
* node flow-eval-calibration.js list — list all calibration examples
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
const path = require('node:path');
|
|
21
|
+
const fs = require('node:fs');
|
|
22
|
+
const { PATHS, safeJsonParse, writeJson } = require('./flow-utils');
|
|
23
|
+
|
|
24
|
+
// ============================================================
|
|
25
|
+
// Constants
|
|
26
|
+
// ============================================================
|
|
27
|
+
|
|
28
|
+
// Location of the persisted calibration store: eval-calibration.json under PATHS.state.
const CALIBRATION_PATH = path.join(PATHS.state, 'eval-calibration.json');
|
|
29
|
+
// Upper bound on stored examples per quality bucket; saveCalibrationExample
// inserts new examples at the front and trims each bucket to this length.
const MAX_EXAMPLES_PER_QUALITY = 3; // Keep 3 high, 3 low
|
|
30
|
+
|
|
31
|
+
// ============================================================
|
|
32
|
+
// Storage
|
|
33
|
+
// ============================================================
|
|
34
|
+
|
|
35
|
+
/**
 * Load the persisted calibration data and normalize its shape.
 *
 * The file at CALIBRATION_PATH is read through safeJsonParse with an empty
 * store as the fallback value. Whatever comes back is normalized so that
 * `high` and `low` are always arrays: the other functions in this module
 * index and mutate those buckets directly, so a hand-edited or partially
 * written file must not be able to crash them.
 *
 * @returns {{ high: Object[], low: Object[], lastUpdated: (string|null) }}
 *   The calibration store; any additional keys found in the file are preserved.
 */
function loadCalibration() {
  // NOTE(review): safeJsonParse is assumed to return the fallback when the
  // file is missing or unparsable — confirm against flow-utils.
  const stored = safeJsonParse(CALIBRATION_PATH, {
    high: [],
    low: [],
    lastUpdated: null
  });

  // Anything that is not a plain object (null, array, primitive) is treated as empty.
  const data = stored && typeof stored === 'object' && !Array.isArray(stored) ? stored : {};

  return {
    ...data,
    high: Array.isArray(data.high) ? data.high : [],
    low: Array.isArray(data.low) ? data.low : [],
    lastUpdated: data.lastUpdated ?? null
  };
}
|
|
46
|
+
|
|
47
|
+
/**
 * Save a completed eval as a calibration example.
 * Called after /wogi-eval produces scores.
 *
 * The new example is placed at the front of its quality bucket and the bucket
 * is trimmed to MAX_EXAMPLES_PER_QUALITY, so the most recent examples win.
 * Text fields are coerced to strings and truncated (spec: 500, diff: 200,
 * notes: 500 characters) before being written to CALIBRATION_PATH.
 *
 * @param {Object} params
 * @param {string} params.taskId — the task that was evaluated
 * @param {string} params.quality — "high" or "low"
 * @param {Object} params.scores — { completeness, accuracy, workflowCompliance, tokenEfficiency, quality }
 * @param {string} params.specSummary — brief spec description (first 500 chars)
 * @param {string} params.diffSummary — brief diff description (file count, line count)
 * @param {string} params.notes — judge's justification notes
 * @returns {Object} the example as it was stored
 * @throws {Error} when `quality` is neither "high" nor "low"
 */
function saveCalibrationExample(params) {
  // A missing params object falls through to the quality check instead of a TypeError.
  const { taskId, quality, scores, specSummary, diffSummary, notes } = params || {};

  if (quality !== 'high' && quality !== 'low') {
    throw new Error('Quality must be "high" or "low"');
  }

  // Falsy values become '', everything else is stringified before truncation,
  // so a non-string field (number, array) cannot throw on `.slice`.
  const clip = (value, max) => (value ? String(value) : '').slice(0, max);

  // One timestamp for both the example and the store keeps them in agreement.
  const now = new Date().toISOString();

  const loaded = loadCalibration();
  const cal = loaded && typeof loaded === 'object' ? loaded : {};

  const example = {
    taskId,
    // Readers call Object.values() on this, so never persist a non-object.
    scores: scores && typeof scores === 'object' ? scores : {},
    specSummary: clip(specSummary, 500),
    diffSummary: clip(diffSummary, 200),
    notes: clip(notes, 500),
    savedAt: now
  };

  // Tolerate a store whose bucket is missing or malformed.
  const existing = Array.isArray(cal[quality]) ? cal[quality] : [];
  cal[quality] = [example, ...existing].slice(0, MAX_EXAMPLES_PER_QUALITY);

  cal.lastUpdated = now;
  writeJson(CALIBRATION_PATH, cal);

  return example;
}
|
|
88
|
+
|
|
89
|
+
/**
 * Auto-classify and save an eval result as calibration.
 * High = average score >= 8. Low = average score <= 4.
 *
 * The average is taken over every finite numeric value in the score object.
 * Scores are read from `evalResult.scores`, falling back to
 * `evalResult.aggregated`; the spec, diff and notes fields fall back to
 * `spec`, `changedFiles` and `aggregated.notes`. These fallbacks mirror the
 * fields the CLI `save` command in this file reads from stored eval results.
 * NOTE(review): confirm `aggregated` carries no numeric fields other than
 * score dimensions, otherwise they would skew the average.
 *
 * @param {Object} evalResult — from flow-eval.js
 * @returns {Object|null} saved example or null if score is in the middle range
 */
function autoSaveFromEval(evalResult) {
  if (!evalResult) return null;

  const scores = evalResult.scores || evalResult.aggregated;
  if (!scores || typeof scores !== 'object') return null;

  const values = Object.values(scores).filter((v) => Number.isFinite(v));
  if (values.length === 0) return null;

  const avg = values.reduce((sum, v) => sum + v, 0) / values.length;

  let quality;
  if (avg >= 8) {
    quality = 'high';
  } else if (avg <= 4) {
    quality = 'low';
  } else {
    return null; // Middle range — not a good calibration anchor
  }

  const specFallback = typeof evalResult.spec === 'string' ? evalResult.spec : '';
  const diffFallback = Array.isArray(evalResult.changedFiles)
    ? `${evalResult.changedFiles.length} files changed`
    : '';

  return saveCalibrationExample({
    taskId: evalResult.taskId,
    quality,
    scores,
    specSummary: evalResult.specSummary || specFallback,
    diffSummary: evalResult.diffSummary || diffFallback,
    notes: evalResult.notes || evalResult.aggregated?.notes || ''
  });
}
|
|
119
|
+
|
|
120
|
+
// ============================================================
|
|
121
|
+
// Retrieval (for prompt injection)
|
|
122
|
+
// ============================================================
|
|
123
|
+
|
|
124
|
+
/**
 * Render one stored calibration example as a markdown section.
 *
 * Missing score dimensions are rendered as "n/a" so that the literal text
 * "undefined" never reaches the judge prompt.
 *
 * @param {Object} ex — stored example (taskId, specSummary, scores, notes)
 * @param {string} label — heading label, e.g. "HIGH" or "LOW"
 * @param {string} verdict — word used in the "Why this scored …" line
 * @returns {string} markdown block for this example
 */
function formatCalibrationExample(ex, label, verdict) {
  const scores = ex.scores && typeof ex.scores === 'object' ? ex.scores : {};
  const scoreLine = ['completeness', 'accuracy', 'workflowCompliance', 'tokenEfficiency', 'quality']
    .map((key) => `${key}=${scores[key] ?? 'n/a'}`)
    .join(', ');

  return [
    `## Calibration Example: ${label} QUALITY (reference)`,
    '',
    `**Task**: ${ex.taskId ?? 'unknown'}`,
    `**Spec**: ${ex.specSummary ?? ''}`,
    `**Scores**: ${scoreLine}`,
    `**Why this scored ${verdict}**: ${ex.notes ?? ''}`
  ].join('\n');
}

/**
 * Get calibration examples formatted for injection into judge/evaluator prompts.
 * Returns 1 high + 1 low example (if available) — the first entry of each bucket.
 *
 * Buckets that are missing or malformed in the stored data are skipped rather
 * than throwing.
 *
 * @returns {string} formatted calibration text, or empty string if no examples
 */
function getCalibrationPrompt() {
  const cal = loadCalibration() || {};
  const parts = [];

  for (const [bucket, label] of [['high', 'HIGH'], ['low', 'LOW']]) {
    const ex = Array.isArray(cal[bucket]) ? cal[bucket][0] : null;
    if (ex && typeof ex === 'object') {
      parts.push(formatCalibrationExample(ex, label, bucket));
    }
  }

  if (parts.length === 0) return '';

  return `
## Score Calibration (anchoring examples)

Use these real examples to calibrate your scoring. They represent the extremes of the scale — most tasks should score between these.

${parts.join('\n\n')}

---
`;
}
|
|
166
|
+
|
|
167
|
+
/**
 * Get calibration examples as structured data.
 *
 * Returns the first stored example of each bucket. A bucket that is missing,
 * empty, or not an array yields null instead of throwing.
 *
 * @returns {{ high: Object|null, low: Object|null }}
 */
function getCalibrationExamples() {
  const cal = loadCalibration() || {};
  const firstOf = (bucket) => (Array.isArray(bucket) ? bucket[0] || null : null);

  return {
    high: firstOf(cal.high),
    low: firstOf(cal.low)
  };
}
|
|
178
|
+
|
|
179
|
+
// ============================================================
|
|
180
|
+
// CLI
|
|
181
|
+
// ============================================================
|
|
182
|
+
|
|
183
|
+
/**
 * CLI entry point.
 *
 *   save <taskId> <high|low> — store the eval result for a task as a calibration example
 *   get                      — print the calibration text used for prompt injection
 *   list                     — print every stored example with its average score
 *
 * `save` exits with status 1 on bad arguments, when no eval file matches the
 * task id, or when the matched file cannot be parsed.
 */
function main() {
  const [command, ...rest] = process.argv.slice(2);

  switch (command) {
    case 'save': {
      const [taskId, quality] = rest;
      // Validate quality here so a typo produces the usage text instead of
      // an uncaught exception from saveCalibrationExample.
      if (!taskId || (quality !== 'high' && quality !== 'low')) {
        console.error('Usage: flow-eval-calibration.js save <taskId> <high|low>');
        process.exit(1);
      }

      // Scores come from a previously saved eval result for this task.
      // NOTE(review): matching is by substring and the first directory entry
      // wins, so ids that prefix other ids (or repeated evals) may pick an
      // unexpected file — confirm the eval file naming scheme.
      const evalsDir = path.join(PATHS.workflow, 'evals');
      const evalFiles = fs.existsSync(evalsDir)
        ? fs.readdirSync(evalsDir).filter((name) => name.includes(taskId))
        : [];
      if (evalFiles.length === 0) {
        console.error(`No eval results found for task ${taskId}`);
        process.exit(1);
      }

      const evalPath = path.join(evalsDir, evalFiles[0]);
      const evalResult = safeJsonParse(evalPath, null);
      if (!evalResult) {
        // Previously this case was silent and exited 0.
        console.error(`Could not read eval result: ${evalPath}`);
        process.exit(1);
      }

      const saved = saveCalibrationExample({
        taskId,
        quality,
        scores: evalResult.aggregated || evalResult.scores || {},
        specSummary: typeof evalResult.spec === 'string' ? evalResult.spec.substring(0, 500) : '',
        diffSummary: `${(evalResult.changedFiles || []).length} files changed`,
        notes: evalResult.notes || evalResult.aggregated?.notes || ''
      });
      console.log(`Saved ${quality} calibration example: ${saved.taskId}`);
      break;
    }

    case 'get':
      console.log(getCalibrationPrompt() || 'No calibration examples yet.');
      break;

    case 'list': {
      const cal = loadCalibration() || {};

      // Print one bucket: a count line followed by one line per example.
      const printBucket = (label, bucket) => {
        const examples = Array.isArray(bucket) ? bucket : [];
        console.log(`${label} examples: ${examples.length}`);
        for (const ex of examples) {
          // Examples stored without scores average to 0 instead of throwing.
          const values = Object.values(ex.scores || {}).filter((v) => typeof v === 'number');
          const avg = values.length > 0 ? values.reduce((s, v) => s + v, 0) / values.length : 0;
          console.log(` ${ex.taskId} — avg ${avg.toFixed(1)} (${ex.savedAt})`);
        }
      };

      printBucket('High', cal.high);
      printBucket('Low', cal.low);
      break;
    }

    default:
      console.log('Usage: flow-eval-calibration.js <save|get|list>');
  }
}
|
|
242
|
+
|
|
243
|
+
// ============================================================
|
|
244
|
+
// Exports
|
|
245
|
+
// ============================================================
|
|
246
|
+
|
|
247
|
+
// Public API: storage helpers plus the prompt/structured retrieval functions.
// main() is intentionally not exported; it is only reachable through the CLI guard below.
module.exports = {
|
|
248
|
+
loadCalibration,
|
|
249
|
+
saveCalibrationExample,
|
|
250
|
+
autoSaveFromEval,
|
|
251
|
+
getCalibrationPrompt,
|
|
252
|
+
getCalibrationExamples
|
|
253
|
+
};
|
|
254
|
+
|
|
255
|
+
// Run the CLI only when this file is executed directly, not when it is require()d.
if (require.main === module) {
|
|
256
|
+
main();
|
|
257
|
+
}
|
|
@@ -62,10 +62,19 @@ const DEFAULT_EVAL_CONFIG = {
|
|
|
62
62
|
function buildJudgePrompt(params) {
|
|
63
63
|
const { taskId, specContent, implementationDiff, iterations, tokenEstimate } = params;
|
|
64
64
|
|
|
65
|
+
// Inject calibration examples if available (prevents score drift)
|
|
66
|
+
let calibrationBlock = '';
|
|
67
|
+
try {
|
|
68
|
+
const { getCalibrationPrompt } = require('./flow-eval-calibration');
|
|
69
|
+
calibrationBlock = getCalibrationPrompt();
|
|
70
|
+
} catch (_err) {
|
|
71
|
+
// Calibration module not available — continue without it
|
|
72
|
+
}
|
|
73
|
+
|
|
65
74
|
return `You are an expert code reviewer evaluating AI-generated implementation quality.
|
|
66
75
|
|
|
67
76
|
## Task: ${taskId}
|
|
68
|
-
|
|
77
|
+
${calibrationBlock}
|
|
69
78
|
## Specification
|
|
70
79
|
${specContent}
|
|
71
80
|
|
package/scripts/flow-eval.js
CHANGED
|
@@ -248,6 +248,15 @@ function saveEvalResult(evalResult) {
|
|
|
248
248
|
|
|
249
249
|
try {
|
|
250
250
|
writeJson(filePath, evalResult);
|
|
251
|
+
|
|
252
|
+
// Auto-save as calibration example if scores are extreme (high or low)
|
|
253
|
+
try {
|
|
254
|
+
const { autoSaveFromEval } = require('./flow-eval-calibration');
|
|
255
|
+
autoSaveFromEval(evalResult);
|
|
256
|
+
} catch (_err) {
|
|
257
|
+
// Calibration module not available — non-critical
|
|
258
|
+
}
|
|
259
|
+
|
|
251
260
|
return filePath;
|
|
252
261
|
} catch (err) {
|
|
253
262
|
if (process.env.DEBUG) {
|