wogiflow 2.4.3 → 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "wogiflow",
3
- "version": "2.4.3",
3
+ "version": "2.4.4",
4
4
  "description": "AI-powered development workflow management system with multi-model support",
5
5
  "main": "lib/index.js",
6
6
  "bin": {
@@ -573,6 +573,34 @@ const CONFIG_DEFAULTS = {
573
573
  failureThresholdForFallback: 3
574
574
  },
575
575
 
576
+ // --- Skeptical Evaluator (Anthropic harness design pattern) ---
577
+ // Spawns a separate sub-agent to evaluate task output before quality gates.
578
+ // Addresses "confident praise bias" where the implementer always thinks it did well.
579
+ skepticalEvaluator: {
580
+ enabled: true,
581
+ _comment_enabled: 'Spawn a separate evaluator agent between Step 3.5 and Step 4',
582
+ maxIterations: 3,
583
+ _comment_maxIterations: 'Max eval→fix cycles before proceeding anyway',
584
+ model: 'sonnet',
585
+ _comment_model: 'Use a different model than the implementer for diversity',
586
+ calibration: true,
587
+ _comment_calibration: 'Inject few-shot calibration examples into evaluator prompt',
588
+ skipForL3: true,
589
+ _comment_skipForL3: 'Skip for trivial L3 subtasks'
590
+ },
591
+
592
+ // --- Sprint-Based Context Reset (Anthropic harness design pattern) ---
593
+ // For large tasks (5+ criteria), commit and reset context every N criteria.
594
+ // Fresh context per sprint prevents quality degradation on later criteria.
595
+ sprintReset: {
596
+ enabled: true,
597
+ _comment_enabled: 'Enable sprint-based context resets for large tasks',
598
+ criteriaPerSprint: 3,
599
+ _comment_criteriaPerSprint: 'Number of criteria to complete before a context reset',
600
+ minTaskCriteria: 5,
601
+ _comment_minTaskCriteria: 'Only activate for tasks with this many or more criteria'
602
+ },
603
+
576
604
  // --- Session Features ---
577
605
  morningBriefing: { enabled: false },
578
606
  techDebt: {
@@ -0,0 +1,257 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Wogi Flow - Eval Calibration
5
+ *
6
+ * Stores and retrieves calibrated eval examples for anchoring judge scores.
7
+ * Prevents score drift by providing few-shot examples of what high and low
8
+ * scores look like in practice.
9
+ *
10
+ * Based on Anthropic's harness design research finding that "few-shot examples
11
+ * with detailed score breakdowns calibrated evaluator judgment, reducing score
12
+ * drift across iterations."
13
+ *
14
+ * Usage:
15
+ * node flow-eval-calibration.js save <taskId> <high|low> — save that task's eval result as a calibration example
16
+ * node flow-eval-calibration.js get — get calibration examples for prompt injection
17
+ * node flow-eval-calibration.js list — list all calibration examples
18
+ */
19
+
20
+ const path = require('node:path');
21
+ const fs = require('node:fs');
22
+ const { PATHS, safeJsonParse, writeJson } = require('./flow-utils');
23
+
24
+ // ============================================================
25
+ // Constants
26
+ // ============================================================
27
+
28
+ const CALIBRATION_PATH = path.join(PATHS.state, 'eval-calibration.json');
29
+ const MAX_EXAMPLES_PER_QUALITY = 3; // Keep 3 high, 3 low
30
+
31
+ // ============================================================
32
+ // Storage
33
+ // ============================================================
34
+
35
+ /**
36
+ * Load calibration data
37
+ * @returns {Object} { high: [], low: [], lastUpdated }
38
+ */
39
+ function loadCalibration() {
40
+ return safeJsonParse(CALIBRATION_PATH, {
41
+ high: [],
42
+ low: [],
43
+ lastUpdated: null
44
+ });
45
+ }
46
+
47
+ /**
48
+ * Save a completed eval as a calibration example.
49
+ * Called after /wogi-eval produces scores.
50
+ *
51
+ * @param {Object} params
52
+ * @param {string} params.taskId — the task that was evaluated
53
+ * @param {string} params.quality — "high" or "low"
54
+ * @param {Object} params.scores — { completeness, accuracy, workflowCompliance, tokenEfficiency, quality }
55
+ * @param {string} params.specSummary — brief spec description (first 500 chars)
56
+ * @param {string} params.diffSummary — brief diff description (file count, line count)
57
+ * @param {string} params.notes — judge's justification notes
58
+ */
59
+ function saveCalibrationExample(params) {
60
+ const { taskId, quality, scores, specSummary, diffSummary, notes } = params;
61
+
62
+ if (quality !== 'high' && quality !== 'low') {
63
+ throw new Error('Quality must be "high" or "low"');
64
+ }
65
+
66
+ const cal = loadCalibration();
67
+ const example = {
68
+ taskId,
69
+ scores,
70
+ specSummary: (specSummary || '').slice(0, 500),
71
+ diffSummary: (diffSummary || '').slice(0, 200),
72
+ notes: (notes || '').slice(0, 500),
73
+ savedAt: new Date().toISOString()
74
+ };
75
+
76
+ cal[quality].unshift(example);
77
+
78
+ // Keep only MAX_EXAMPLES_PER_QUALITY
79
+ if (cal[quality].length > MAX_EXAMPLES_PER_QUALITY) {
80
+ cal[quality] = cal[quality].slice(0, MAX_EXAMPLES_PER_QUALITY);
81
+ }
82
+
83
+ cal.lastUpdated = new Date().toISOString();
84
+ writeJson(CALIBRATION_PATH, cal);
85
+
86
+ return example;
87
+ }
88
+
89
+ /**
90
+ * Auto-classify and save an eval result as calibration.
91
+ * High = average score >= 8. Low = average score <= 4.
92
+ *
93
+ * @param {Object} evalResult — from flow-eval.js
94
+ * @returns {Object|null} saved example, or null when there are no numeric scores or the average falls in the middle range
95
+ */
96
+ function autoSaveFromEval(evalResult) {
97
+ if (!evalResult || !evalResult.scores) return null;
98
+
99
+ const scores = evalResult.scores;
100
+ const values = Object.values(scores).filter(v => typeof v === 'number');
101
+ if (values.length === 0) return null;
102
+
103
+ const avg = values.reduce((sum, v) => sum + v, 0) / values.length;
104
+
105
+ let quality = null;
106
+ if (avg >= 8) quality = 'high';
107
+ else if (avg <= 4) quality = 'low';
108
+ else return null; // Middle range — not a good calibration anchor
109
+
110
+ return saveCalibrationExample({
111
+ taskId: evalResult.taskId,
112
+ quality,
113
+ scores,
114
+ specSummary: evalResult.specSummary || '',
115
+ diffSummary: evalResult.diffSummary || '',
116
+ notes: evalResult.notes || ''
117
+ });
118
+ }
119
+
120
+ // ============================================================
121
+ // Retrieval (for prompt injection)
122
+ // ============================================================
123
+
124
+ /**
125
+ * Get calibration examples formatted for injection into judge/evaluator prompts.
126
+ * Returns the most recently saved high example and the most recently saved low example (each only if available).
127
+ *
128
+ * @returns {string} formatted calibration text, or empty string if no examples
129
+ */
130
+ function getCalibrationPrompt() {
131
+ const cal = loadCalibration();
132
+ const parts = [];
133
+
134
+ if (cal.high.length > 0) {
135
+ const ex = cal.high[0];
136
+ parts.push(`## Calibration Example: HIGH QUALITY (reference)
137
+
138
+ **Task**: ${ex.taskId}
139
+ **Spec**: ${ex.specSummary}
140
+ **Scores**: completeness=${ex.scores.completeness}, accuracy=${ex.scores.accuracy}, workflowCompliance=${ex.scores.workflowCompliance}, tokenEfficiency=${ex.scores.tokenEfficiency}, quality=${ex.scores.quality}
141
+ **Why this scored high**: ${ex.notes}`);
142
+ }
143
+
144
+ if (cal.low.length > 0) {
145
+ const ex = cal.low[0];
146
+ parts.push(`## Calibration Example: LOW QUALITY (reference)
147
+
148
+ **Task**: ${ex.taskId}
149
+ **Spec**: ${ex.specSummary}
150
+ **Scores**: completeness=${ex.scores.completeness}, accuracy=${ex.scores.accuracy}, workflowCompliance=${ex.scores.workflowCompliance}, tokenEfficiency=${ex.scores.tokenEfficiency}, quality=${ex.scores.quality}
151
+ **Why this scored low**: ${ex.notes}`);
152
+ }
153
+
154
+ if (parts.length === 0) return '';
155
+
156
+ return `
157
+ ## Score Calibration (anchoring examples)
158
+
159
+ Use these real examples to calibrate your scoring. They represent the extremes of the scale — most tasks should score between these.
160
+
161
+ ${parts.join('\n\n')}
162
+
163
+ ---
164
+ `;
165
+ }
166
+
167
+ /**
168
+ * Get calibration examples as structured data
169
+ * @returns {{ high: Object|null, low: Object|null }}
170
+ */
171
+ function getCalibrationExamples() {
172
+ const cal = loadCalibration();
173
+ return {
174
+ high: cal.high[0] || null,
175
+ low: cal.low[0] || null
176
+ };
177
+ }
178
+
179
+ // ============================================================
180
+ // CLI
181
+ // ============================================================
182
+
183
+ function main() {
184
+ const args = process.argv.slice(2);
185
+ const command = args[0];
186
+
187
+ switch (command) {
188
+ case 'save': {
189
+ const taskId = args[1];
190
+ const quality = args[2];
191
+ if (!taskId || !quality) {
192
+ console.error('Usage: flow-eval-calibration.js save <taskId> <high|low>');
193
+ process.exit(1);
194
+ }
195
+ // Read scores from the first saved eval result file whose name contains the task ID
196
+ const evalsDir = path.join(PATHS.workflow, 'evals');
197
+ const evalFiles = fs.existsSync(evalsDir) ? fs.readdirSync(evalsDir).filter(f => f.includes(taskId)) : [];
198
+ if (evalFiles.length === 0) {
199
+ console.error(`No eval results found for task ${taskId}`);
200
+ process.exit(1);
201
+ }
202
+ const evalResult = safeJsonParse(path.join(evalsDir, evalFiles[0]), null);
203
+ if (evalResult) {
204
+ const saved = saveCalibrationExample({
205
+ taskId,
206
+ quality,
207
+ scores: evalResult.aggregated || evalResult.scores || {},
208
+ specSummary: evalResult.spec?.substring(0, 500) || '',
209
+ diffSummary: `${(evalResult.changedFiles || []).length} files changed`,
210
+ notes: evalResult.notes || evalResult.aggregated?.notes || ''
211
+ });
212
+ console.log(`Saved ${quality} calibration example: ${saved.taskId}`);
213
+ }
214
+ break;
215
+ }
216
+
217
+ case 'get':
218
+ console.log(getCalibrationPrompt() || 'No calibration examples yet.');
219
+ break;
220
+
221
+ case 'list': {
222
+ const cal = loadCalibration();
223
+ console.log(`High examples: ${cal.high.length}`);
224
+ for (const ex of cal.high) {
225
+ const values = Object.values(ex.scores).filter(v => typeof v === 'number');
226
+ const avg = values.length > 0 ? values.reduce((s, v) => s + v, 0) / values.length : 0;
227
+ console.log(` ${ex.taskId} — avg ${avg.toFixed(1)} (${ex.savedAt})`);
228
+ }
229
+ console.log(`Low examples: ${cal.low.length}`);
230
+ for (const ex of cal.low) {
231
+ const values = Object.values(ex.scores).filter(v => typeof v === 'number');
232
+ const avg = values.length > 0 ? values.reduce((s, v) => s + v, 0) / values.length : 0;
233
+ console.log(` ${ex.taskId} — avg ${avg.toFixed(1)} (${ex.savedAt})`);
234
+ }
235
+ break;
236
+ }
237
+
238
+ default:
239
+ console.log('Usage: flow-eval-calibration.js <save|get|list>');
240
+ }
241
+ }
242
+
243
+ // ============================================================
244
+ // Exports
245
+ // ============================================================
246
+
247
+ module.exports = {
248
+ loadCalibration,
249
+ saveCalibrationExample,
250
+ autoSaveFromEval,
251
+ getCalibrationPrompt,
252
+ getCalibrationExamples
253
+ };
254
+
255
+ if (require.main === module) {
256
+ main();
257
+ }
@@ -62,10 +62,19 @@ const DEFAULT_EVAL_CONFIG = {
62
62
  function buildJudgePrompt(params) {
63
63
  const { taskId, specContent, implementationDiff, iterations, tokenEstimate } = params;
64
64
 
65
+ // Inject calibration examples if available (prevents score drift)
66
+ let calibrationBlock = '';
67
+ try {
68
+ const { getCalibrationPrompt } = require('./flow-eval-calibration');
69
+ calibrationBlock = getCalibrationPrompt();
70
+ } catch (_err) {
71
+ // Calibration module not available — continue without it
72
+ }
73
+
65
74
  return `You are an expert code reviewer evaluating AI-generated implementation quality.
66
75
 
67
76
  ## Task: ${taskId}
68
-
77
+ ${calibrationBlock}
69
78
  ## Specification
70
79
  ${specContent}
71
80
 
@@ -248,6 +248,15 @@ function saveEvalResult(evalResult) {
248
248
 
249
249
  try {
250
250
  writeJson(filePath, evalResult);
251
+
252
+ // Auto-save as calibration example if scores are extreme (high or low)
253
+ try {
254
+ const { autoSaveFromEval } = require('./flow-eval-calibration');
255
+ autoSaveFromEval(evalResult);
256
+ } catch (_err) {
257
+ // Calibration module not available — non-critical
258
+ }
259
+
251
260
  return filePath;
252
261
  } catch (err) {
253
262
  if (process.env.DEBUG) {
@@ -32,6 +32,7 @@ const HOOK_TIMEOUTS = {
32
32
  STOP: 5, // Loop enforcement check
33
33
  SESSION_END: 10, // Session cleanup/logging
34
34
  TASK_COMPLETED: 10, // Post-task cleanup (Claude Code 2.1.33+)
35
+ TASK_CREATED: 5, // Task creation tracking (Claude Code 2.1.84+)
35
36
  TEAMMATE_IDLE: 5, // Task dispatch for idle agents (Claude Code 2.1.33+)
36
37
  CONFIG_CHANGE: 5, // Mid-session config change detection (Claude Code latest)
37
38
  INSTRUCTIONS_LOADED: 5 // Instructions loaded event (Claude Code latest)
@@ -55,6 +56,7 @@ const CLAUDE_CODE_EVENTS = [
55
56
  'ConfigChange', // Claude Code 2.1.63+ — mid-session config change detection
56
57
  'InstructionsLoaded', // Claude Code latest — fires when CLAUDE.md/.claude/rules loaded
57
58
  'PostCompact', // Claude Code 2.1.76+ — fires after context compaction completes
59
+ 'TaskCreated', // Claude Code 2.1.84+ — fires when a task is created via TaskCreate
58
60
  ];
59
61
 
60
62
  /**
@@ -69,6 +71,7 @@ const CLAUDE_CODE_EVENTS = [
69
71
  // 'ElicitationResult', // Claude Code 2.1.76+ — intercept/override elicitation responses before sending
70
72
  // 'CwdChanged', // Claude Code 2.1.83+ — fires when working directory changes (e.g., direnv)
71
73
  // 'FileChanged', // Claude Code 2.1.83+ — fires when watched files change on disk
74
+ // 'WorktreeCreate (http)', // Claude Code 2.1.84+ — WorktreeCreate now supports type:"http" transport
72
75
  // ];
73
76
 
74
77
  /**
@@ -204,6 +207,8 @@ class ClaudeCodeAdapter extends BaseAdapter {
204
207
  return this.transformInstructionsLoaded(coreResult);
205
208
  case 'PostCompact':
206
209
  return this.transformPostCompact(coreResult);
210
+ case 'TaskCreated':
211
+ return this.transformTaskCreated(coreResult);
207
212
  default:
208
213
  return { continue: true };
209
214
  }
@@ -506,6 +511,23 @@ Run: /wogi-start ${coreResult.nextTaskId}`;
506
511
  };
507
512
  }
508
513
 
514
+ /**
515
+ * Transform TaskCreated result (Claude Code 2.1.84+)
516
+ * Fires when a task is created via TaskCreate.
517
+ * Links native tasks to the active WogiFlow task for tracking.
518
+ */
519
+ transformTaskCreated(coreResult) {
520
+ return {
521
+ continue: true,
522
+ ...(coreResult.message && { systemMessage: coreResult.message }),
523
+ hookSpecificOutput: {
524
+ hookEventName: 'TaskCreated',
525
+ linked: coreResult.linked || false,
526
+ wogiTaskId: coreResult.wogiTaskId || null
527
+ }
528
+ };
529
+ }
530
+
509
531
  /**
510
532
  * Transform InstructionsLoaded result
511
533
  * Fires when CLAUDE.md or .claude/rules/*.md files are loaded into context.
@@ -641,6 +663,13 @@ Run: /wogi-start ${coreResult.nextTaskId}`;
641
663
  }];
642
664
  }
643
665
 
666
+ // TaskCreated hook — link native tasks to WogiFlow task (Claude Code 2.1.84+)
667
+ if (rules.taskCreated?.enabled !== false) {
668
+ hooks.TaskCreated = [{
669
+ hooks: [hookEntry('TaskCreated', 'task-created.js', HOOK_TIMEOUTS.TASK_CREATED)]
670
+ }];
671
+ }
672
+
644
673
  // WorktreeCreate hook — copy essential state to new worktree (Claude Code 2.1.50+)
645
674
  if (rules.worktreeLifecycle?.enabled !== false) {
646
675
  hooks.WorktreeCreate = [{
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Wogi Flow - Task Created (Core Module)
5
+ *
6
+ * CLI-agnostic task creation tracking logic.
7
+ * Called when a native task is created via TaskCreate (Claude Code 2.1.84+).
8
+ *
9
+ * Handles:
10
+ * - Linking native Claude Code tasks to the active WogiFlow task
11
+ * - Tracking subtask creation for progress visibility
12
+ * - Logging task creation events to session state
13
+ *
14
+ * Returns a standardized result that adapters transform for specific CLIs.
15
+ */
16
+
17
+ const path = require('node:path');
18
+ const { getConfig, PATHS, safeJsonParse } = require('../../flow-utils');
19
+
20
+ /**
21
+ * Check if task created handling is enabled
22
+ * @returns {boolean}
23
+ */
24
+ function isTaskCreatedEnabled() {
25
+ const config = getConfig();
26
+ return config.hooks?.rules?.taskCreated?.enabled !== false;
27
+ }
28
+
29
+ /**
30
+ * Handle task creation event
31
+ * @param {Object} input - Parsed hook input from Claude Code
32
+ * @returns {Object} Core result
33
+ */
34
+ async function handleTaskCreated(input) {
35
+ if (!isTaskCreatedEnabled()) {
36
+ return { enabled: false, message: null };
37
+ }
38
+
39
+ const result = {
40
+ enabled: true,
41
+ linked: false,
42
+ wogiTaskId: null,
43
+ message: null
44
+ };
45
+
46
+ try {
47
+ // Find the active WogiFlow task to link against
48
+ const readyPath = path.join(PATHS.state, 'ready.json');
49
+ const ready = safeJsonParse(readyPath, { inProgress: [] });
50
+
51
+ if (Array.isArray(ready.inProgress) && ready.inProgress.length > 0) {
52
+ const activeTask = ready.inProgress[0];
53
+ const wogiTaskId = typeof activeTask === 'string' ? activeTask : activeTask?.id;
54
+
55
+ if (wogiTaskId) {
56
+ result.linked = true;
57
+ result.wogiTaskId = wogiTaskId;
58
+ }
59
+ }
60
+
61
+ // Track creation in session state (best-effort synchronous write; failures are ignored)
62
+ try {
63
+ const sessionStatePath = path.join(PATHS.state, 'session-state.json');
64
+ const sessionState = safeJsonParse(sessionStatePath, {});
65
+ if (!sessionState.nativeTasksCreated) {
66
+ sessionState.nativeTasksCreated = 0;
67
+ }
68
+ sessionState.nativeTasksCreated += 1;
69
+ sessionState.lastNativeTaskAt = new Date().toISOString();
70
+
71
+ const fs = require('node:fs');
72
+ fs.writeFileSync(sessionStatePath, JSON.stringify(sessionState, null, 2));
73
+ } catch (_err) {
74
+ // Non-critical — session state tracking is best-effort
75
+ }
76
+ } catch (err) {
77
+ result.message = `Task created handler error: ${err.message}`;
78
+ }
79
+
80
+ return result;
81
+ }
82
+
83
+ module.exports = { handleTaskCreated, isTaskCreatedEnabled };
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Wogi Flow - Claude Code TaskCreated Hook
5
+ *
6
+ * Called when a native task is created via TaskCreate (Claude Code 2.1.84+).
7
+ * Links native tasks to the active WogiFlow task for tracking.
8
+ */
9
+
10
+ const { handleTaskCreated } = require('../../core/task-created');
11
+ const { runHook } = require('../shared/hook-runner');
12
+
13
+ runHook('TaskCreated', async ({ parsedInput }) => {
14
+ return await handleTaskCreated(parsedInput);
15
+ }, { failMode: 'silent' });
@@ -267,6 +267,8 @@ const HOOK_VERSION_MAP = {
267
267
  InstructionsLoaded: { major: 2, minor: 1, patch: 72 },
268
268
  // Hooks added in 2.1.76+
269
269
  PostCompact: { major: 2, minor: 1, patch: 76 },
270
+ // Hooks added in 2.1.84+
271
+ TaskCreated: { major: 2, minor: 1, patch: 84 },
270
272
  };
271
273
 
272
274
  /**