npm - wogiflow - Versions diffs - 2.4.3 → 2.4.4 - Mend

wogiflow 2.4.3 → 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/.claude/commands/wogi-start.md +124 -0
package/.claude/docs/claude-code-compatibility.md +24 -0
package/.claude/docs/explore-agents.md +11 -0
package/.claude/settings.json +11 -0
package/bin/flow +11 -1
package/lib/workspace-contracts.js +599 -0
package/lib/workspace-intelligence.js +600 -0
package/lib/workspace-messages.js +441 -0
package/lib/workspace-routing.js +485 -0
package/lib/workspace-sync.js +339 -0
package/lib/workspace.js +1073 -0
package/package.json +1 -1
package/scripts/flow-config-defaults.js +28 -0
package/scripts/flow-eval-calibration.js +257 -0
package/scripts/flow-eval-judge.js +10 -1
package/scripts/flow-eval.js +9 -0
package/scripts/hooks/adapters/claude-code.js +29 -0
package/scripts/hooks/core/task-created.js +83 -0
package/scripts/hooks/entry/claude-code/task-created.js +15 -0
package/scripts/postinstall.js +2 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "wogiflow",
-  "version": "2.4.3",
+  "version": "2.4.4",
   "description": "AI-powered development workflow management system with multi-model support",
   "main": "lib/index.js",
   "bin": {

package/scripts/flow-config-defaults.js CHANGED Viewed

@@ -573,6 +573,34 @@ const CONFIG_DEFAULTS = {
     failureThresholdForFallback: 3
   },
+  // --- Skeptical Evaluator (Anthropic harness design pattern) ---
+  // Spawns a separate sub-agent to evaluate task output before quality gates.
+  // Addresses "confident praise bias" where the implementer always thinks it did well.
+  skepticalEvaluator: {
+    enabled: true,
+    _comment_enabled: 'Spawn a separate evaluator agent between Step 3.5 and Step 4',
+    maxIterations: 3,
+    _comment_maxIterations: 'Max eval→fix cycles before proceeding anyway',
+    model: 'sonnet',
+    _comment_model: 'Use a different model than the implementer for diversity',
+    calibration: true,
+    _comment_calibration: 'Inject few-shot calibration examples into evaluator prompt',
+    skipForL3: true,
+    _comment_skipForL3: 'Skip for trivial L3 subtasks'
+  },
+  // --- Sprint-Based Context Reset (Anthropic harness design pattern) ---
+  // For large tasks (5+ criteria), commit and reset context every N criteria.
+  // Fresh context per sprint prevents quality degradation on later criteria.
+  sprintReset: {
+    enabled: true,
+    _comment_enabled: 'Enable sprint-based context resets for large tasks',
+    criteriaPerSprint: 3,
+    _comment_criteriaPerSprint: 'Number of criteria to complete before a context reset',
+    minTaskCriteria: 5,
+    _comment_minTaskCriteria: 'Only activate for tasks with this many or more criteria'
+  },
   // --- Session Features ---
   morningBriefing: { enabled: false },
   techDebt: {

package/scripts/flow-eval-calibration.js ADDED Viewed

@@ -0,0 +1,257 @@
+#!/usr/bin/env node
+/**
+ * Wogi Flow - Eval Calibration
+ *
+ * Stores and retrieves calibrated eval examples for anchoring judge scores.
+ * Prevents score drift by providing few-shot examples of what high and low
+ * scores look like in practice.
+ *
+ * Based on Anthropic's harness design research finding that "few-shot examples
+ * with detailed score breakdowns calibrated evaluator judgment, reducing score
+ * drift across iterations."
+ *
+ * Usage:
+ *   node flow-eval-calibration.js save <taskId> <quality>  — save as calibration example
+ *   node flow-eval-calibration.js get                       — get calibration examples for prompt injection
+ *   node flow-eval-calibration.js list                      — list all calibration examples
+ */
+const path = require('node:path');
+const fs = require('node:fs');
+const { PATHS, safeJsonParse, writeJson } = require('./flow-utils');
+// ============================================================
+// Constants
+// ============================================================
+// Calibration store: the file eval-calibration.json inside the PATHS.state directory.
+const CALIBRATION_PATH = path.join(PATHS.state, 'eval-calibration.json');
+// Upper bound on stored examples per quality bucket; saveCalibrationExample
+// inserts the newest example first and truncates the bucket to this length.
+const MAX_EXAMPLES_PER_QUALITY = 3; // Keep 3 high, 3 low
+// ============================================================
+// Storage
+// ============================================================
+/**
+ * Load calibration data
+ * @returns {Object} { high: [], low: [], lastUpdated }
+ */
+function loadCalibration() {
+  return safeJsonParse(CALIBRATION_PATH, {
+    high: [],
+    low: [],
+    lastUpdated: null
+  });
+}
+/**
+ * Save a completed eval as a calibration example.
+ * Called after /wogi-eval produces scores.
+ *
+ * @param {Object} params
+ * @param {string} params.taskId — the task that was evaluated
+ * @param {string} params.quality — "high" or "low"
+ * @param {Object} params.scores — { completeness, accuracy, workflowCompliance, tokenEfficiency, quality }
+ * @param {string} params.specSummary — brief spec description (first 500 chars)
+ * @param {string} params.diffSummary — brief diff description (file count, line count)
+ * @param {string} params.notes — judge's justification notes
+ */
+function saveCalibrationExample(params) {
+  const { taskId, quality, scores, specSummary, diffSummary, notes } = params;
+  if (quality !== 'high' && quality !== 'low') {
+    throw new Error('Quality must be "high" or "low"');
+  }
+  const cal = loadCalibration();
+  const example = {
+    taskId,
+    scores,
+    specSummary: (specSummary || '').slice(0, 500),
+    diffSummary: (diffSummary || '').slice(0, 200),
+    notes: (notes || '').slice(0, 500),
+    savedAt: new Date().toISOString()
+  };
+  cal[quality].unshift(example);
+  // Keep only MAX_EXAMPLES_PER_QUALITY
+  if (cal[quality].length > MAX_EXAMPLES_PER_QUALITY) {
+    cal[quality] = cal[quality].slice(0, MAX_EXAMPLES_PER_QUALITY);
+  }
+  cal.lastUpdated = new Date().toISOString();
+  writeJson(CALIBRATION_PATH, cal);
+  return example;
+}
+/**
+ * Auto-classify and save an eval result as calibration.
+ * High = average score >= 8. Low = average score <= 4.
+ *
+ * @param {Object} evalResult — from flow-eval.js
+ * @returns {Object|null} saved example or null if score is in the middle range
+ */
+function autoSaveFromEval(evalResult) {
+  if (!evalResult || !evalResult.scores) return null;
+  const scores = evalResult.scores;
+  const values = Object.values(scores).filter(v => typeof v === 'number');
+  if (values.length === 0) return null;
+  const avg = values.reduce((sum, v) => sum + v, 0) / values.length;
+  let quality = null;
+  if (avg >= 8) quality = 'high';
+  else if (avg <= 4) quality = 'low';
+  else return null; // Middle range — not a good calibration anchor
+  return saveCalibrationExample({
+    taskId: evalResult.taskId,
+    quality,
+    scores,
+    specSummary: evalResult.specSummary || '',
+    diffSummary: evalResult.diffSummary || '',
+    notes: evalResult.notes || ''
+  });
+}
+// ============================================================
+// Retrieval (for prompt injection)
+// ============================================================
+/**
+ * Get calibration examples formatted for injection into judge/evaluator prompts.
+ * Returns 1 high + 1 low example (if available).
+ *
+ * @returns {string} formatted calibration text, or empty string if no examples
+ */
+function getCalibrationPrompt() {
+  const cal = loadCalibration();
+  const parts = [];
+  if (cal.high.length > 0) {
+    const ex = cal.high[0];
+    parts.push(`## Calibration Example: HIGH QUALITY (reference)
+**Task**: ${ex.taskId}
+**Spec**: ${ex.specSummary}
+**Scores**: completeness=${ex.scores.completeness}, accuracy=${ex.scores.accuracy}, workflowCompliance=${ex.scores.workflowCompliance}, tokenEfficiency=${ex.scores.tokenEfficiency}, quality=${ex.scores.quality}
+**Why this scored high**: ${ex.notes}`);
+  }
+  if (cal.low.length > 0) {
+    const ex = cal.low[0];
+    parts.push(`## Calibration Example: LOW QUALITY (reference)
+**Task**: ${ex.taskId}
+**Spec**: ${ex.specSummary}
+**Scores**: completeness=${ex.scores.completeness}, accuracy=${ex.scores.accuracy}, workflowCompliance=${ex.scores.workflowCompliance}, tokenEfficiency=${ex.scores.tokenEfficiency}, quality=${ex.scores.quality}
+**Why this scored low**: ${ex.notes}`);
+  }
+  if (parts.length === 0) return '';
+  return `
+## Score Calibration (anchoring examples)
+Use these real examples to calibrate your scoring. They represent the extremes of the scale — most tasks should score between these.
+${parts.join('\n\n')}
+---
+`;
+}
+/**
+ * Get calibration examples as structured data
+ * @returns {{ high: Object|null, low: Object|null }}
+ */
+function getCalibrationExamples() {
+  const cal = loadCalibration();
+  return {
+    high: cal.high[0] || null,
+    low: cal.low[0] || null
+  };
+}
+// ============================================================
+// CLI
+// ============================================================
+function main() {
+  const args = process.argv.slice(2);
+  const command = args[0];
+  switch (command) {
+    case 'save': {
+      const taskId = args[1];
+      const quality = args[2];
+      if (!taskId || !quality) {
+        console.error('Usage: flow-eval-calibration.js save <taskId> <high|low>');
+        process.exit(1);
+      }
+      // Read scores from stdin or eval results
+      const evalsDir = path.join(PATHS.workflow, 'evals');
+      const evalFiles = fs.existsSync(evalsDir) ? fs.readdirSync(evalsDir).filter(f => f.includes(taskId)) : [];
+      if (evalFiles.length === 0) {
+        console.error(`No eval results found for task ${taskId}`);
+        process.exit(1);
+      }
+      const evalResult = safeJsonParse(path.join(evalsDir, evalFiles[0]), null);
+      if (evalResult) {
+        const saved = saveCalibrationExample({
+          taskId,
+          quality,
+          scores: evalResult.aggregated || evalResult.scores || {},
+          specSummary: evalResult.spec?.substring(0, 500) || '',
+          diffSummary: `${(evalResult.changedFiles || []).length} files changed`,
+          notes: evalResult.notes || evalResult.aggregated?.notes || ''
+        });
+        console.log(`Saved ${quality} calibration example: ${saved.taskId}`);
+      }
+      break;
+    }
+    case 'get':
+      console.log(getCalibrationPrompt() || 'No calibration examples yet.');
+      break;
+    case 'list': {
+      const cal = loadCalibration();
+      console.log(`High examples: ${cal.high.length}`);
+      for (const ex of cal.high) {
+        const values = Object.values(ex.scores).filter(v => typeof v === 'number');
+        const avg = values.length > 0 ? values.reduce((s, v) => s + v, 0) / values.length : 0;
+        console.log(`  ${ex.taskId} — avg ${avg.toFixed(1)} (${ex.savedAt})`);
+      }
+      console.log(`Low examples: ${cal.low.length}`);
+      for (const ex of cal.low) {
+        const values = Object.values(ex.scores).filter(v => typeof v === 'number');
+        const avg = values.length > 0 ? values.reduce((s, v) => s + v, 0) / values.length : 0;
+        console.log(`  ${ex.taskId} — avg ${avg.toFixed(1)} (${ex.savedAt})`);
+      }
+      break;
+    }
+    default:
+      console.log('Usage: flow-eval-calibration.js <save|get|list>');
+  }
+}
+// ============================================================
+// Exports
+// ============================================================
+// Functions made available to scripts that require this module.
+module.exports = {
+  loadCalibration,
+  saveCalibrationExample,
+  autoSaveFromEval,
+  getCalibrationPrompt,
+  getCalibrationExamples
+};
+// Run the CLI only when this file is executed directly, not when it is required.
+if (require.main === module) {
+  main();
+}

package/scripts/flow-eval-judge.js CHANGED Viewed

@@ -62,10 +62,19 @@ const DEFAULT_EVAL_CONFIG = {
 function buildJudgePrompt(params) {
   const { taskId, specContent, implementationDiff, iterations, tokenEstimate } = params;
+  // Inject calibration examples if available (prevents score drift)
+  let calibrationBlock = '';
+  try {
+    const { getCalibrationPrompt } = require('./flow-eval-calibration');
+    calibrationBlock = getCalibrationPrompt();
+  } catch (_err) {
+    // Calibration module not available — continue without it
+  }
   return `You are an expert code reviewer evaluating AI-generated implementation quality.
 ## Task: ${taskId}
+${calibrationBlock}
 ## Specification
 ${specContent}

package/scripts/flow-eval.js CHANGED Viewed

@@ -248,6 +248,15 @@ function saveEvalResult(evalResult) {
   try {
     writeJson(filePath, evalResult);
+    // Auto-save as calibration example if scores are extreme (high or low)
+    try {
+      const { autoSaveFromEval } = require('./flow-eval-calibration');
+      autoSaveFromEval(evalResult);
+    } catch (_err) {
+      // Calibration module not available — non-critical
+    }
     return filePath;
   } catch (err) {
     if (process.env.DEBUG) {

package/scripts/hooks/adapters/claude-code.js CHANGED Viewed

@@ -32,6 +32,7 @@ const HOOK_TIMEOUTS = {
   STOP: 5,                // Loop enforcement check
   SESSION_END: 10,        // Session cleanup/logging
   TASK_COMPLETED: 10,     // Post-task cleanup (Claude Code 2.1.33+)
+  TASK_CREATED: 5,        // Task creation tracking (Claude Code 2.1.84+)
   TEAMMATE_IDLE: 5,       // Task dispatch for idle agents (Claude Code 2.1.33+)
   CONFIG_CHANGE: 5,       // Mid-session config change detection (Claude Code latest)
   INSTRUCTIONS_LOADED: 5  // Instructions loaded event (Claude Code latest)
@@ -55,6 +56,7 @@ const CLAUDE_CODE_EVENTS = [
   'ConfigChange',         // Claude Code 2.1.63+ — mid-session config change detection
   'InstructionsLoaded',   // Claude Code latest — fires when CLAUDE.md/.claude/rules loaded
   'PostCompact',          // Claude Code 2.1.76+ — fires after context compaction completes
+  'TaskCreated',          // Claude Code 2.1.84+ — fires when a task is created via TaskCreate
 ];
 /**
@@ -69,6 +71,7 @@ const CLAUDE_CODE_EVENTS = [
 //   'ElicitationResult',  // Claude Code 2.1.76+ — intercept/override elicitation responses before sending
 //   'CwdChanged',         // Claude Code 2.1.83+ — fires when working directory changes (e.g., direnv)
 //   'FileChanged',        // Claude Code 2.1.83+ — fires when watched files change on disk
+//   'WorktreeCreate (http)', // Claude Code 2.1.84+ — WorktreeCreate now supports type:"http" transport
 // ];
 /**
@@ -204,6 +207,8 @@ class ClaudeCodeAdapter extends BaseAdapter {
         return this.transformInstructionsLoaded(coreResult);
       case 'PostCompact':
         return this.transformPostCompact(coreResult);
+      case 'TaskCreated':
+        return this.transformTaskCreated(coreResult);
       default:
         return { continue: true };
     }
@@ -506,6 +511,23 @@ Run: /wogi-start ${coreResult.nextTaskId}`;
     };
   }
+  /**
+   * Transform TaskCreated result (Claude Code 2.1.84+)
+   * Fires when a task is created via TaskCreate.
+   * Links native tasks to the active WogiFlow task for tracking.
+   */
+  transformTaskCreated(coreResult) {
+    return {
+      continue: true,
+      ...(coreResult.message && { systemMessage: coreResult.message }),
+      hookSpecificOutput: {
+        hookEventName: 'TaskCreated',
+        linked: coreResult.linked || false,
+        wogiTaskId: coreResult.wogiTaskId || null
+      }
+    };
+  }
   /**
    * Transform InstructionsLoaded result
    * Fires when CLAUDE.md or .claude/rules/*.md files are loaded into context.
@@ -641,6 +663,13 @@ Run: /wogi-start ${coreResult.nextTaskId}`;
       }];
     }
+    // TaskCreated hook — link native tasks to WogiFlow task (Claude Code 2.1.84+)
+    if (rules.taskCreated?.enabled !== false) {
+      hooks.TaskCreated = [{
+        hooks: [hookEntry('TaskCreated', 'task-created.js', HOOK_TIMEOUTS.TASK_CREATED)]
+      }];
+    }
     // WorktreeCreate hook — copy essential state to new worktree (Claude Code 2.1.50+)
     if (rules.worktreeLifecycle?.enabled !== false) {
       hooks.WorktreeCreate = [{

package/scripts/hooks/core/task-created.js ADDED Viewed

@@ -0,0 +1,83 @@
+#!/usr/bin/env node
+/**
+ * Wogi Flow - Task Created (Core Module)
+ *
+ * CLI-agnostic task creation tracking logic.
+ * Called when a native task is created via TaskCreate (Claude Code 2.1.84+).
+ *
+ * Handles:
+ * - Linking native Claude Code tasks to the active WogiFlow task
+ * - Tracking subtask creation for progress visibility
+ * - Logging task creation events to session state
+ *
+ * Returns a standardized result that adapters transform for specific CLIs.
+ */
+const path = require('node:path');
+const { getConfig, PATHS, safeJsonParse } = require('../../flow-utils');
+/**
+ * Check if task created handling is enabled
+ * @returns {boolean}
+ */
+function isTaskCreatedEnabled() {
+  const config = getConfig();
+  return config.hooks?.rules?.taskCreated?.enabled !== false;
+}
+/**
+ * Handle task creation event
+ * @param {Object} input - Parsed hook input from Claude Code
+ * @returns {Object} Core result
+ */
+async function handleTaskCreated(input) {
+  if (!isTaskCreatedEnabled()) {
+    return { enabled: false, message: null };
+  }
+  const result = {
+    enabled: true,
+    linked: false,
+    wogiTaskId: null,
+    message: null
+  };
+  try {
+    // Find the active WogiFlow task to link against
+    const readyPath = path.join(PATHS.state, 'ready.json');
+    const ready = safeJsonParse(readyPath, { inProgress: [] });
+    if (Array.isArray(ready.inProgress) && ready.inProgress.length > 0) {
+      const activeTask = ready.inProgress[0];
+      const wogiTaskId = typeof activeTask === 'string' ? activeTask : activeTask?.id;
+      if (wogiTaskId) {
+        result.linked = true;
+        result.wogiTaskId = wogiTaskId;
+      }
+    }
+    // Track creation in session state (fire-and-forget)
+    try {
+      const sessionStatePath = path.join(PATHS.state, 'session-state.json');
+      const sessionState = safeJsonParse(sessionStatePath, {});
+      if (!sessionState.nativeTasksCreated) {
+        sessionState.nativeTasksCreated = 0;
+      }
+      sessionState.nativeTasksCreated += 1;
+      sessionState.lastNativeTaskAt = new Date().toISOString();
+      const fs = require('node:fs');
+      fs.writeFileSync(sessionStatePath, JSON.stringify(sessionState, null, 2));
+    } catch (_err) {
+      // Non-critical — session state tracking is best-effort
+    }
+  } catch (err) {
+    result.message = `Task created handler error: ${err.message}`;
+  }
+  return result;
+}
+// Exposes the event handler and its enabled-check to modules that require this file.
+module.exports = { handleTaskCreated, isTaskCreatedEnabled };

package/scripts/hooks/entry/claude-code/task-created.js ADDED Viewed

@@ -0,0 +1,15 @@
+#!/usr/bin/env node
+/**
+ * Wogi Flow - Claude Code TaskCreated Hook
+ *
+ * Called when a native task is created via TaskCreate (Claude Code 2.1.84+).
+ * Links native tasks to the active WogiFlow task for tracking.
+ */
+const { handleTaskCreated } = require('../../core/task-created');
+const { runHook } = require('../shared/hook-runner');
+runHook('TaskCreated', async ({ parsedInput }) => {
+  return await handleTaskCreated(parsedInput);
+}, { failMode: 'silent' });

package/scripts/postinstall.js CHANGED Viewed

@@ -267,6 +267,8 @@ const HOOK_VERSION_MAP = {
   InstructionsLoaded: { major: 2, minor: 1, patch: 72 },
   // Hooks added in 2.1.76+
   PostCompact: { major: 2, minor: 1, patch: 76 },
+  // Hooks added in 2.1.84+
+  TaskCreated: { major: 2, minor: 1, patch: 84 },
 };
 /**