hail-hydra-cc 2.3.2 → 2.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +99 -99
- package/bin/cli.js +105 -105
- package/files/SKILL.md +1172 -1217
- package/files/agents/hydra-analyst.md +1 -1
- package/files/agents/hydra-coder.md +2 -2
- package/files/agents/hydra-git.md +1 -1
- package/files/agents/hydra-guard.md +3 -3
- package/files/agents/hydra-runner.md +1 -1
- package/files/agents/hydra-scout.md +1 -1
- package/files/agents/hydra-scribe.md +1 -1
- package/files/agents/hydra-sentinel-scan.md +19 -1
- package/files/agents/hydra-sentinel.md +19 -1
- package/files/commands/hydra/config.md +37 -37
- package/files/commands/hydra/guard.md +71 -71
- package/files/commands/hydra/help.md +47 -47
- package/files/commands/hydra/quiet.md +16 -16
- package/files/commands/hydra/stats.md +31 -0
- package/files/commands/hydra/status.md +85 -85
- package/files/commands/hydra/verbose.md +29 -29
- package/files/hooks/hydra-auto-guard.js +130 -54
- package/files/hooks/hydra-check-update.js +99 -99
- package/files/hooks/hydra-statusline.js +131 -128
- package/files/references/model-capabilities.md +164 -164
- package/files/references/routing-guide.md +303 -303
- package/package.json +1 -1
- package/src/display.js +1 -1
- package/src/files.js +110 -106
- package/src/installer.js +401 -393
- package/src/prompts.js +80 -80
|
@@ -1,128 +1,131 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
// Hydra StatusLine — persistent status bar at bottom of Claude Code
|
|
4
|
-
// Receives session JSON via stdin, outputs one formatted line to stdout.
|
|
5
|
-
//
|
|
6
|
-
// Display format:
|
|
7
|
-
// 🐲 │ Opus │ Ctx: 37% ████░░░░░░ │ $0.42 │ my-project │ ⚡ Update available
|
|
8
|
-
//
|
|
9
|
-
// Context bar is color-coded:
|
|
10
|
-
// Green (0-49%) β Yellow (50-79%) β Red (80%+)
|
|
11
|
-
|
|
12
|
-
const fs = require('fs');
|
|
13
|
-
const path = require('path');
|
|
14
|
-
const os = require('os');
|
|
15
|
-
|
|
16
|
-
const cacheFile = path.join(os.homedir(), '.claude', 'cache', 'hydra-update-check.json');
|
|
17
|
-
|
|
18
|
-
let input = '';
|
|
19
|
-
process.stdin.on('data', (chunk) => (input += chunk));
|
|
20
|
-
process.stdin.on('end', () => {
|
|
21
|
-
try {
|
|
22
|
-
const data = JSON.parse(input);
|
|
23
|
-
|
|
24
|
-
// === Model ===
|
|
25
|
-
const model = data.model?.display_name || 'Unknown';
|
|
26
|
-
|
|
27
|
-
// === Context Usage ===
|
|
28
|
-
// Use precomputed used_percentage from Claude Code (most reliable)
|
|
29
|
-
const ctxPct = Math.round(data.context_window?.used_percentage || 0);
|
|
30
|
-
|
|
31
|
-
// Build visual context bar (10 chars wide)
|
|
32
|
-
const filled = Math.round(ctxPct / 10);
|
|
33
|
-
const empty = 10 - filled;
|
|
34
|
-
const bar = '\u2588'.repeat(filled) + '\u2591'.repeat(empty);
|
|
35
|
-
|
|
36
|
-
// Color-code: Green <50%, Yellow 50-79%, Red 80%+
|
|
37
|
-
let ctxColor;
|
|
38
|
-
if (ctxPct < 50) {
|
|
39
|
-
ctxColor = '\x1b[32m'; // Green
|
|
40
|
-
} else if (ctxPct < 80) {
|
|
41
|
-
ctxColor = '\x1b[33m'; // Yellow
|
|
42
|
-
} else {
|
|
43
|
-
ctxColor = '\x1b[31m'; // Red
|
|
44
|
-
}
|
|
45
|
-
const reset = '\x1b[0m';
|
|
46
|
-
const dim = '\x1b[2m';
|
|
47
|
-
|
|
48
|
-
const ctxDisplay = `${ctxColor}Ctx: ${ctxPct}% ${bar}${reset}`;
|
|
49
|
-
|
|
50
|
-
// === Session Cost ===
|
|
51
|
-
const cost = (data.cost?.total_cost_usd || 0).toFixed(2);
|
|
52
|
-
|
|
53
|
-
// === Savings vs all-Opus baseline (cached, silent on failure) ===
|
|
54
|
-
let savingsStr = '';
|
|
55
|
-
try {
|
|
56
|
-
const tokenMath = require('./hydra-token-math');
|
|
57
|
-
const summary = tokenMath.computeSummaryCached();
|
|
58
|
-
if (summary.available && summary.savedUSD >= 0.01) {
|
|
59
|
-
savingsStr = ` \x1b[32mβ$${summary.savedUSD.toFixed(2)}\x1b[0m`;
|
|
60
|
-
}
|
|
61
|
-
} catch (e) { /* silent fallback */ }
|
|
62
|
-
|
|
63
|
-
// === Working Directory ===
|
|
64
|
-
const dirName = path.basename(data.workspace?.current_dir || data.cwd || '');
|
|
65
|
-
|
|
66
|
-
// === Update Check (read from cache) ===
|
|
67
|
-
let updateNotice = '';
|
|
68
|
-
try {
|
|
69
|
-
const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf8'));
|
|
70
|
-
if (cache.update_available) {
|
|
71
|
-
updateNotice = ` \x1b[33m\u26A1 v${cache.latest} available${reset}`;
|
|
72
|
-
}
|
|
73
|
-
} catch (e) {
|
|
74
|
-
// No cache — skip update notice
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
// === Compose Status Line ===
|
|
78
|
-
const parts = [
|
|
79
|
-
'\x1b[32m\uD83D\uDC32\x1b[0m', // Green dragon emoji (π)
|
|
80
|
-
`${dim}${model}${reset}`, // Dim model name
|
|
81
|
-
ctxDisplay, // Color-coded context bar
|
|
82
|
-
`${dim}$${cost}${reset}${savingsStr}`, // Dim cost + green βsavings
|
|
83
|
-
`${dim}${dirName}${reset}`, // Dim directory
|
|
84
|
-
];
|
|
85
|
-
|
|
86
|
-
// Append update notice if available
|
|
87
|
-
if (updateNotice) {
|
|
88
|
-
parts.push(updateNotice);
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
// Compaction warning — only show at 70%+ context usage
|
|
92
|
-
if (ctxPct >= 80) {
|
|
93
|
-
parts.push(`\x1b[31m\u26A0 Compacting soon!\x1b[0m`);
|
|
94
|
-
} else if (ctxPct >= 70) {
|
|
95
|
-
parts.push(`\x1b[31m\u26A0 Auto-compact at 85%\x1b[0m`);
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
// === Sentinel
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
const
|
|
103
|
-
const
|
|
104
|
-
|
|
105
|
-
const
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
const
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
}
|
|
128
|
-
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// Hydra StatusLine — persistent status bar at bottom of Claude Code
|
|
4
|
+
// Receives session JSON via stdin, outputs one formatted line to stdout.
|
|
5
|
+
//
|
|
6
|
+
// Display format:
|
|
7
|
+
// 🐲 │ Opus │ Ctx: 37% ████░░░░░░ │ $0.42 │ my-project │ ⚡ Update available
|
|
8
|
+
//
|
|
9
|
+
// Context bar is color-coded:
|
|
10
|
+
// Green (0-49%) β Yellow (50-79%) β Red (80%+)
|
|
11
|
+
|
|
12
|
+
const fs = require('fs');
|
|
13
|
+
const path = require('path');
|
|
14
|
+
const os = require('os');
|
|
15
|
+
|
|
16
|
+
const cacheFile = path.join(os.homedir(), '.claude', 'cache', 'hydra-update-check.json');
|
|
17
|
+
|
|
18
|
+
let input = '';
|
|
19
|
+
process.stdin.on('data', (chunk) => (input += chunk));
|
|
20
|
+
process.stdin.on('end', () => {
|
|
21
|
+
try {
|
|
22
|
+
const data = JSON.parse(input);
|
|
23
|
+
|
|
24
|
+
// === Model ===
|
|
25
|
+
const model = data.model?.display_name || 'Unknown';
|
|
26
|
+
|
|
27
|
+
// === Context Usage ===
|
|
28
|
+
// Use precomputed used_percentage from Claude Code (most reliable)
|
|
29
|
+
const ctxPct = Math.round(data.context_window?.used_percentage || 0);
|
|
30
|
+
|
|
31
|
+
// Build visual context bar (10 chars wide)
|
|
32
|
+
const filled = Math.round(ctxPct / 10);
|
|
33
|
+
const empty = 10 - filled;
|
|
34
|
+
const bar = '\u2588'.repeat(filled) + '\u2591'.repeat(empty);
|
|
35
|
+
|
|
36
|
+
// Color-code: Green <50%, Yellow 50-79%, Red 80%+
|
|
37
|
+
let ctxColor;
|
|
38
|
+
if (ctxPct < 50) {
|
|
39
|
+
ctxColor = '\x1b[32m'; // Green
|
|
40
|
+
} else if (ctxPct < 80) {
|
|
41
|
+
ctxColor = '\x1b[33m'; // Yellow
|
|
42
|
+
} else {
|
|
43
|
+
ctxColor = '\x1b[31m'; // Red
|
|
44
|
+
}
|
|
45
|
+
const reset = '\x1b[0m';
|
|
46
|
+
const dim = '\x1b[2m';
|
|
47
|
+
|
|
48
|
+
const ctxDisplay = `${ctxColor}Ctx: ${ctxPct}% ${bar}${reset}`;
|
|
49
|
+
|
|
50
|
+
// === Session Cost ===
|
|
51
|
+
const cost = (data.cost?.total_cost_usd || 0).toFixed(2);
|
|
52
|
+
|
|
53
|
+
// === Savings vs all-Opus baseline (cached, silent on failure) ===
|
|
54
|
+
let savingsStr = '';
|
|
55
|
+
try {
|
|
56
|
+
const tokenMath = require('./hydra-token-math');
|
|
57
|
+
const summary = tokenMath.computeSummaryCached();
|
|
58
|
+
if (summary.available && summary.savedUSD >= 0.01) {
|
|
59
|
+
savingsStr = ` \x1b[32mβ$${summary.savedUSD.toFixed(2)}\x1b[0m`;
|
|
60
|
+
}
|
|
61
|
+
} catch (e) { /* silent fallback */ }
|
|
62
|
+
|
|
63
|
+
// === Working Directory ===
|
|
64
|
+
const dirName = path.basename(data.workspace?.current_dir || data.cwd || '');
|
|
65
|
+
|
|
66
|
+
// === Update Check (read from cache) ===
|
|
67
|
+
let updateNotice = '';
|
|
68
|
+
try {
|
|
69
|
+
const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf8'));
|
|
70
|
+
if (cache.update_available) {
|
|
71
|
+
updateNotice = ` \x1b[33m\u26A1 v${cache.latest} available${reset}`;
|
|
72
|
+
}
|
|
73
|
+
} catch (e) {
|
|
74
|
+
// No cache — skip update notice
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
// === Compose Status Line ===
|
|
78
|
+
const parts = [
|
|
79
|
+
'\x1b[32m\uD83D\uDC32\x1b[0m', // Green dragon emoji (π)
|
|
80
|
+
`${dim}${model}${reset}`, // Dim model name
|
|
81
|
+
ctxDisplay, // Color-coded context bar
|
|
82
|
+
`${dim}$${cost}${reset}${savingsStr}`, // Dim cost + green βsavings
|
|
83
|
+
`${dim}${dirName}${reset}`, // Dim directory
|
|
84
|
+
];
|
|
85
|
+
|
|
86
|
+
// Append update notice if available
|
|
87
|
+
if (updateNotice) {
|
|
88
|
+
parts.push(updateNotice);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Compaction warning — only show at 70%+ context usage
|
|
92
|
+
if (ctxPct >= 80) {
|
|
93
|
+
parts.push(`\x1b[31m\u26A0 Compacting soon!\x1b[0m`);
|
|
94
|
+
} else if (ctxPct >= 70) {
|
|
95
|
+
parts.push(`\x1b[31m\u26A0 Auto-compact at 85%\x1b[0m`);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// === Sentinel Indicator: 3 states (pending / clean / quiet) ===
|
|
99
|
+
try {
|
|
100
|
+
const sessionId = data.session_id || 'unknown';
|
|
101
|
+
const sentinelDir = path.join(os.tmpdir(), 'hydra-sentinel');
|
|
102
|
+
const pendingFile = path.join(sentinelDir, `${sessionId}-pending.json`);
|
|
103
|
+
const scanMarker = path.join(sentinelDir, `${sessionId}-last-scan`);
|
|
104
|
+
|
|
105
|
+
const pendingExists = fs.existsSync(pendingFile);
|
|
106
|
+
const markerExists = fs.existsSync(scanMarker);
|
|
107
|
+
|
|
108
|
+
if (pendingExists) {
|
|
109
|
+
const pendingData = JSON.parse(fs.readFileSync(pendingFile, 'utf8'));
|
|
110
|
+
const count = pendingData.files?.length || 0;
|
|
111
|
+
const age = Date.now() - (pendingData.updated_at || 0);
|
|
112
|
+
if (count > 0 && age < 600000) {
|
|
113
|
+
parts.push(`\x1b[33m\u26A0 Sentinel pending (${count} file${count === 1 ? '' : 's'})\x1b[0m`);
|
|
114
|
+
}
|
|
115
|
+
} else if (markerExists) {
|
|
116
|
+
const markerMs = parseInt(fs.readFileSync(scanMarker, 'utf8').trim(), 10) * 1000;
|
|
117
|
+
if (Date.now() - markerMs < 60000) {
|
|
118
|
+
parts.push(`\x1b[32m\u2705 Sentinel clean\x1b[0m`);
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
} catch (e) {
|
|
122
|
+
// No flag — silent quiet state
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
process.stdout.write(parts.join(' \u2502 '));
|
|
126
|
+
|
|
127
|
+
} catch (e) {
|
|
128
|
+
// Fallback if JSON parse fails
|
|
129
|
+
process.stdout.write('\uD83D\uDC32 Hydra');
|
|
130
|
+
}
|
|
131
|
+
});
|
|
@@ -1,164 +1,164 @@
|
|
|
1
|
-
# Model Capabilities Reference
|
|
2
|
-
|
|
3
|
-
Understanding what each model does well (and where it struggles) is key to effective routing.
|
|
4
|
-
This reference helps calibrate delegation decisions.
|
|
5
|
-
|
|
6
|
-
## Claude Haiku 4.5
|
|
7
|
-
|
|
8
|
-
### Strengths
|
|
9
|
-
- Extremely fast response times (~10× faster than Opus)
|
|
10
|
-
- Very low cost per token (~5× cheaper than Opus 4.6 — $1/$5 vs $5/$25 per MTok)
|
|
11
|
-
- Excellent at following clear, well-defined instructions
|
|
12
|
-
- Strong at text extraction, search, and pattern matching
|
|
13
|
-
- Good at generating code from templates and clear patterns
|
|
14
|
-
- Reliable for mechanical tasks with unambiguous specifications
|
|
15
|
-
- Great at summarization and information retrieval
|
|
16
|
-
|
|
17
|
-
### Limitations
|
|
18
|
-
- Weaker at multi-step reasoning chains
|
|
19
|
-
- Can miss subtle bugs or edge cases in code review
|
|
20
|
-
- Less reliable with complex architectural decisions
|
|
21
|
-
- May produce simpler solutions when a nuanced approach is needed
|
|
22
|
-
- Can struggle with ambiguous or underspecified requirements
|
|
23
|
-
- Less creative in problem-solving approaches
|
|
24
|
-
|
|
25
|
-
### Ideal Task Profile
|
|
26
|
-
Short context, clear instructions, well-defined output, no judgment calls needed.
|
|
27
|
-
|
|
28
|
-
### Auto-Accept Thresholds
|
|
29
|
-
Haiku outputs qualify for auto-accept when they are raw, factual, and unambiguous:
|
|
30
|
-
- **hydra-scout**: File paths, grep results, directory listings, code snippets with location markers
|
|
31
|
-
- **hydra-runner**: All-pass results, clean build/lint output, git status output
|
|
32
|
-
- **hydra-scribe**: Internal docstrings, inline comments, changelog entries
|
|
33
|
-
- **Requires verify**: Any analysis, interpretation, or user-facing documentation
|
|
34
|
-
|
|
35
|
-
### hydra-scout (Haiku 4.5) — Updated in v2.1.0
|
|
36
|
-
- **Strengths**: Codebase exploration, file search, reading, AND codebase
|
|
37
|
-
map building/maintenance
|
|
38
|
-
- **New capability**: Builds and incrementally updates the codebase dependency
|
|
39
|
-
map using grep-based import extraction. No external parsers required.
|
|
40
|
-
- **Memory focus**: Codebase structure, key file locations, module boundaries,
|
|
41
|
-
map build history, files that failed to parse
|
|
42
|
-
|
|
43
|
-
### hydra-sentinel-scan (Haiku 4.5) — Updated in v2.1.0
|
|
44
|
-
- **Strengths**: Pattern matching, grep-level analysis, import tracing,
|
|
45
|
-
fast structural checks, AND map-based instant blast-radius lookups
|
|
46
|
-
- **New capability**: Reads codebase map for instant dependency lookups
|
|
47
|
-
instead of grepping. Falls back to grep if map doesn't exist.
|
|
48
|
-
- **Map-aware checks**: Risk-based severity, test coverage warnings,
|
|
49
|
-
env var index lookups, blast radius reporting
|
|
50
|
-
- **Limitations**: Cannot understand semantic meaning of data shapes,
|
|
51
|
-
may produce false positives on complex contract changes
|
|
52
|
-
- **Memory focus**: Codebase dependency graph, coupling patterns,
|
|
53
|
-
false positive history
|
|
54
|
-
|
|
55
|
-
---
|
|
56
|
-
|
|
57
|
-
## Claude Sonnet 4.6
|
|
58
|
-
|
|
59
|
-
### Strengths
|
|
60
|
-
- Strong code generation across most languages and frameworks
|
|
61
|
-
- Good reasoning about code structure and patterns
|
|
62
|
-
- Reliable bug fixing when errors are identifiable
|
|
63
|
-
- Effective code review for common issues
|
|
64
|
-
- Good at test writing with understanding of business logic
|
|
65
|
-
- Handles refactoring with awareness of dependencies
|
|
66
|
-
- Balances speed and capability well
|
|
67
|
-
|
|
68
|
-
### Limitations
|
|
69
|
-
- May not catch the most subtle architectural issues
|
|
70
|
-
- Less reliable than Opus for novel algorithm design
|
|
71
|
-
- Can sometimes miss non-obvious security implications
|
|
72
|
-
- May not fully optimize complex performance bottlenecks
|
|
73
|
-
- Less effective at synthesizing large amounts of disparate information
|
|
74
|
-
|
|
75
|
-
### Ideal Task Profile
|
|
76
|
-
Standard software engineering tasks: implementation, testing, debugging, review. Tasks where
|
|
77
|
-
the approach is established even if the specific implementation requires thought.
|
|
78
|
-
|
|
79
|
-
### Auto-Accept Thresholds
|
|
80
|
-
Sonnet outputs always require orchestrator review — code changes and analysis are never auto-accepted:
|
|
81
|
-
- **hydra-coder**: ALWAYS verify — scan for correctness, edge cases, project pattern alignment
|
|
82
|
-
- **hydra-analyst**: ALWAYS verify — validate reasoning, check suggested fix against actual code
|
|
83
|
-
|
|
84
|
-
### hydra-sentinel (Sonnet 4.6)
|
|
85
|
-
- **Strengths**: Semantic understanding of data flow, contract validation
|
|
86
|
-
across component boundaries, accurate false positive filtering,
|
|
87
|
-
specific fix suggestions
|
|
88
|
-
- **Limitations**: Slower and more expensive — only triggered when needed
|
|
89
|
-
- **Memory focus**: API patterns, architectural boundaries, historical
|
|
90
|
-
breakage patterns, component communication flows
|
|
91
|
-
|
|
92
|
-
---
|
|
93
|
-
|
|
94
|
-
## Claude Opus 4.6
|
|
95
|
-
|
|
96
|
-
### Strengths
|
|
97
|
-
- Deepest reasoning and analysis capability
|
|
98
|
-
- Best at novel problem-solving and architecture design
|
|
99
|
-
- Most reliable for subtle bug detection
|
|
100
|
-
- Strongest at synthesizing complex, multi-source information
|
|
101
|
-
- Best judgment on ambiguous tradeoffs
|
|
102
|
-
- Most creative in approach selection
|
|
103
|
-
- Highest accuracy on edge cases
|
|
104
|
-
|
|
105
|
-
### Limitations
|
|
106
|
-
- Slowest response time
|
|
107
|
-
- Highest cost per token
|
|
108
|
-
- Overkill for routine tasks (same quality as Sonnet on standard work)
|
|
109
|
-
|
|
110
|
-
### Ideal Task Profile
|
|
111
|
-
Hard problems: architecture design, subtle debugging, complex tradeoffs, novel implementations,
|
|
112
|
-
security analysis, anything where getting it wrong is costly.
|
|
113
|
-
|
|
114
|
-
### Auto-Accept Thresholds
|
|
115
|
-
N/A — Opus is the orchestrator, not a delegated head. Opus output goes directly to the user.
|
|
116
|
-
|
|
117
|
-
---
|
|
118
|
-
|
|
119
|
-
## Cost and Speed Comparison (February 2026 Pricing)
|
|
120
|
-
|
|
121
|
-
| Model | Input Cost | Output Cost | Relative Speed | Input Cost vs Opus 4.6 | Output Cost vs Opus 4.6 |
|
|
122
|
-
|-------|-----------|-------------|----------------|----------------------|------------------------|
|
|
123
|
-
| Haiku 4.5 | $1 / MTok | $5 / MTok | ~10× faster | 5× cheaper | 5× cheaper |
|
|
124
|
-
| Sonnet 4.6 | $3 / MTok | $15 / MTok | ~3× faster | ~1.7× cheaper | ~1.7× cheaper |
|
|
125
|
-
| Opus 4.6 | $5 / MTok | $25 / MTok | 1× (baseline) | 1× (baseline) | 1× (baseline) |
|
|
126
|
-
|
|
127
|
-
Source: https://platform.claude.com/docs/en/about-claude/pricing
|
|
128
|
-
|
|
129
|
-
### Blended Cost with Hydra (typical 50/30/20 task split)
|
|
130
|
-
|
|
131
|
-
| Metric | All Opus 4.6 | With Hydra | Savings |
|
|
132
|
-
|--------|-------------|------------|---------|
|
|
133
|
-
| Input cost / MTok | $5.00 | $2.40 | 52% |
|
|
134
|
-
| Output cost / MTok | $25.00 | $12.00 | 52% |
|
|
135
|
-
| Blended effective cost | $30.00 / MTok | $14.40 / MTok | ~50% |
|
|
136
|
-
|
|
137
|
-
Note: Savings calculated against Opus 4.6 pricing ($5/$25 per MTok) as of February 2026.
|
|
138
|
-
Savings would be significantly higher when compared to Opus 4.1/4.0 pricing ($15/$75 per MTok).
|
|
139
|
-
|
|
140
|
-
These are approximate ratios. The key insight: for 60-70% of coding tasks, Haiku 4.5 or
|
|
141
|
-
Sonnet 4.6 produces output identical in quality to what Opus 4.6 would produce, but
|
|
142
|
-
dramatically faster and cheaper. The skill is in identifying the 30-40% where Opus 4.6
|
|
143
|
-
is genuinely needed.
|
|
144
|
-
|
|
145
|
-
---
|
|
146
|
-
|
|
147
|
-
## Acceptance Rate Expectations
|
|
148
|
-
|
|
149
|
-
Drawing from speculative decoding theory, track these metrics mentally:
|
|
150
|
-
|
|
151
|
-
| Draft Model | Expected Acceptance Rate | Notes |
|
|
152
|
-
|-------------|------------------------|-------|
|
|
153
|
-
| Haiku → Opus verification | ~85-90% | For well-classified Tier 1 tasks |
|
|
154
|
-
| Sonnet → Opus verification | ~90-95% | For well-classified Tier 2 tasks |
|
|
155
|
-
| sentinel-scan → sentinel escalation | ~20% | ~80%+ of scans return clean — only ~20% escalate to deep analysis |
|
|
156
|
-
| sentinel → Opus verification | ~95% | Sonnet's deep analysis is highly accurate; Opus rarely overrides |
|
|
157
|
-
|
|
158
|
-
If your acceptance rate drops below 80%, you're likely misclassifying tasks — shift borderline
|
|
159
|
-
tasks to a higher tier. If it's consistently above 95%, you might be too conservative.
|
|
160
|
-
|
|
161
|
-
The analogy to speculative decoding is direct: just as the paper found acceptance rates of
|
|
162
|
-
~0.7-0.9 for draft tokens depending on domain, our task-level acceptance rates should be
|
|
163
|
-
similar or better, since we have more context for classification than a draft model has for
|
|
164
|
-
next-token prediction.
|
|
1
|
+
# Model Capabilities Reference
|
|
2
|
+
|
|
3
|
+
Understanding what each model does well (and where it struggles) is key to effective routing.
|
|
4
|
+
This reference helps calibrate delegation decisions.
|
|
5
|
+
|
|
6
|
+
## Claude Haiku 4.5
|
|
7
|
+
|
|
8
|
+
### Strengths
|
|
9
|
+
- Extremely fast response times (~10× faster than Opus)
|
|
10
|
+
- Very low cost per token (~5× cheaper than Opus 4.6 — $1/$5 vs $5/$25 per MTok)
|
|
11
|
+
- Excellent at following clear, well-defined instructions
|
|
12
|
+
- Strong at text extraction, search, and pattern matching
|
|
13
|
+
- Good at generating code from templates and clear patterns
|
|
14
|
+
- Reliable for mechanical tasks with unambiguous specifications
|
|
15
|
+
- Great at summarization and information retrieval
|
|
16
|
+
|
|
17
|
+
### Limitations
|
|
18
|
+
- Weaker at multi-step reasoning chains
|
|
19
|
+
- Can miss subtle bugs or edge cases in code review
|
|
20
|
+
- Less reliable with complex architectural decisions
|
|
21
|
+
- May produce simpler solutions when a nuanced approach is needed
|
|
22
|
+
- Can struggle with ambiguous or underspecified requirements
|
|
23
|
+
- Less creative in problem-solving approaches
|
|
24
|
+
|
|
25
|
+
### Ideal Task Profile
|
|
26
|
+
Short context, clear instructions, well-defined output, no judgment calls needed.
|
|
27
|
+
|
|
28
|
+
### Auto-Accept Thresholds
|
|
29
|
+
Haiku outputs qualify for auto-accept when they are raw, factual, and unambiguous:
|
|
30
|
+
- **hydra-scout**: File paths, grep results, directory listings, code snippets with location markers
|
|
31
|
+
- **hydra-runner**: All-pass results, clean build/lint output, git status output
|
|
32
|
+
- **hydra-scribe**: Internal docstrings, inline comments, changelog entries
|
|
33
|
+
- **Requires verify**: Any analysis, interpretation, or user-facing documentation
|
|
34
|
+
|
|
35
|
+
### hydra-scout (Haiku 4.5) — Updated in v2.1.0
|
|
36
|
+
- **Strengths**: Codebase exploration, file search, reading, AND codebase
|
|
37
|
+
map building/maintenance
|
|
38
|
+
- **New capability**: Builds and incrementally updates the codebase dependency
|
|
39
|
+
map using grep-based import extraction. No external parsers required.
|
|
40
|
+
- **Memory focus**: Codebase structure, key file locations, module boundaries,
|
|
41
|
+
map build history, files that failed to parse
|
|
42
|
+
|
|
43
|
+
### hydra-sentinel-scan (Haiku 4.5) — Updated in v2.1.0
|
|
44
|
+
- **Strengths**: Pattern matching, grep-level analysis, import tracing,
|
|
45
|
+
fast structural checks, AND map-based instant blast-radius lookups
|
|
46
|
+
- **New capability**: Reads codebase map for instant dependency lookups
|
|
47
|
+
instead of grepping. Falls back to grep if map doesn't exist.
|
|
48
|
+
- **Map-aware checks**: Risk-based severity, test coverage warnings,
|
|
49
|
+
env var index lookups, blast radius reporting
|
|
50
|
+
- **Limitations**: Cannot understand semantic meaning of data shapes,
|
|
51
|
+
may produce false positives on complex contract changes
|
|
52
|
+
- **Memory focus**: Codebase dependency graph, coupling patterns,
|
|
53
|
+
false positive history
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Claude Sonnet 4.6
|
|
58
|
+
|
|
59
|
+
### Strengths
|
|
60
|
+
- Strong code generation across most languages and frameworks
|
|
61
|
+
- Good reasoning about code structure and patterns
|
|
62
|
+
- Reliable bug fixing when errors are identifiable
|
|
63
|
+
- Effective code review for common issues
|
|
64
|
+
- Good at test writing with understanding of business logic
|
|
65
|
+
- Handles refactoring with awareness of dependencies
|
|
66
|
+
- Balances speed and capability well
|
|
67
|
+
|
|
68
|
+
### Limitations
|
|
69
|
+
- May not catch the most subtle architectural issues
|
|
70
|
+
- Less reliable than Opus for novel algorithm design
|
|
71
|
+
- Can sometimes miss non-obvious security implications
|
|
72
|
+
- May not fully optimize complex performance bottlenecks
|
|
73
|
+
- Less effective at synthesizing large amounts of disparate information
|
|
74
|
+
|
|
75
|
+
### Ideal Task Profile
|
|
76
|
+
Standard software engineering tasks: implementation, testing, debugging, review. Tasks where
|
|
77
|
+
the approach is established even if the specific implementation requires thought.
|
|
78
|
+
|
|
79
|
+
### Auto-Accept Thresholds
|
|
80
|
+
Sonnet outputs always require orchestrator review — code changes and analysis are never auto-accepted:
|
|
81
|
+
- **hydra-coder**: ALWAYS verify — scan for correctness, edge cases, project pattern alignment
|
|
82
|
+
- **hydra-analyst**: ALWAYS verify — validate reasoning, check suggested fix against actual code
|
|
83
|
+
|
|
84
|
+
### hydra-sentinel (Sonnet 4.6)
|
|
85
|
+
- **Strengths**: Semantic understanding of data flow, contract validation
|
|
86
|
+
across component boundaries, accurate false positive filtering,
|
|
87
|
+
specific fix suggestions
|
|
88
|
+
- **Limitations**: Slower and more expensive — only triggered when needed
|
|
89
|
+
- **Memory focus**: API patterns, architectural boundaries, historical
|
|
90
|
+
breakage patterns, component communication flows
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Claude Opus 4.6
|
|
95
|
+
|
|
96
|
+
### Strengths
|
|
97
|
+
- Deepest reasoning and analysis capability
|
|
98
|
+
- Best at novel problem-solving and architecture design
|
|
99
|
+
- Most reliable for subtle bug detection
|
|
100
|
+
- Strongest at synthesizing complex, multi-source information
|
|
101
|
+
- Best judgment on ambiguous tradeoffs
|
|
102
|
+
- Most creative in approach selection
|
|
103
|
+
- Highest accuracy on edge cases
|
|
104
|
+
|
|
105
|
+
### Limitations
|
|
106
|
+
- Slowest response time
|
|
107
|
+
- Highest cost per token
|
|
108
|
+
- Overkill for routine tasks (same quality as Sonnet on standard work)
|
|
109
|
+
|
|
110
|
+
### Ideal Task Profile
|
|
111
|
+
Hard problems: architecture design, subtle debugging, complex tradeoffs, novel implementations,
|
|
112
|
+
security analysis, anything where getting it wrong is costly.
|
|
113
|
+
|
|
114
|
+
### Auto-Accept Thresholds
|
|
115
|
+
N/A — Opus is the orchestrator, not a delegated head. Opus output goes directly to the user.
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Cost and Speed Comparison (February 2026 Pricing)
|
|
120
|
+
|
|
121
|
+
| Model | Input Cost | Output Cost | Relative Speed | Input Cost vs Opus 4.6 | Output Cost vs Opus 4.6 |
|
|
122
|
+
|-------|-----------|-------------|----------------|----------------------|------------------------|
|
|
123
|
+
| Haiku 4.5 | $1 / MTok | $5 / MTok | ~10× faster | 5× cheaper | 5× cheaper |
|
|
124
|
+
| Sonnet 4.6 | $3 / MTok | $15 / MTok | ~3× faster | ~1.7× cheaper | ~1.7× cheaper |
|
|
125
|
+
| Opus 4.6 | $5 / MTok | $25 / MTok | 1× (baseline) | 1× (baseline) | 1× (baseline) |
|
|
126
|
+
|
|
127
|
+
Source: https://platform.claude.com/docs/en/about-claude/pricing
|
|
128
|
+
|
|
129
|
+
### Blended Cost with Hydra (typical 50/30/20 task split)
|
|
130
|
+
|
|
131
|
+
| Metric | All Opus 4.6 | With Hydra | Savings |
|
|
132
|
+
|--------|-------------|------------|---------|
|
|
133
|
+
| Input cost / MTok | $5.00 | $2.40 | 52% |
|
|
134
|
+
| Output cost / MTok | $25.00 | $12.00 | 52% |
|
|
135
|
+
| Blended effective cost | $30.00 / MTok | $14.40 / MTok | ~50% |
|
|
136
|
+
|
|
137
|
+
Note: Savings calculated against Opus 4.6 pricing ($5/$25 per MTok) as of February 2026.
|
|
138
|
+
Savings would be significantly higher when compared to Opus 4.1/4.0 pricing ($15/$75 per MTok).
|
|
139
|
+
|
|
140
|
+
These are approximate ratios. The key insight: for 60-70% of coding tasks, Haiku 4.5 or
|
|
141
|
+
Sonnet 4.6 produces output identical in quality to what Opus 4.6 would produce, but
|
|
142
|
+
dramatically faster and cheaper. The skill is in identifying the 30-40% where Opus 4.6
|
|
143
|
+
is genuinely needed.
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Acceptance Rate Expectations
|
|
148
|
+
|
|
149
|
+
Drawing from speculative decoding theory, track these metrics mentally:
|
|
150
|
+
|
|
151
|
+
| Draft Model | Expected Acceptance Rate | Notes |
|
|
152
|
+
|-------------|------------------------|-------|
|
|
153
|
+
| Haiku → Opus verification | ~85-90% | For well-classified Tier 1 tasks |
|
|
154
|
+
| Sonnet → Opus verification | ~90-95% | For well-classified Tier 2 tasks |
|
|
155
|
+
| sentinel-scan → sentinel escalation | ~20% | ~80%+ of scans return clean — only ~20% escalate to deep analysis |
|
|
156
|
+
| sentinel → Opus verification | ~95% | Sonnet's deep analysis is highly accurate; Opus rarely overrides |
|
|
157
|
+
|
|
158
|
+
If your acceptance rate drops below 80%, you're likely misclassifying tasks — shift borderline
|
|
159
|
+
tasks to a higher tier. If it's consistently above 95%, you might be too conservative.
|
|
160
|
+
|
|
161
|
+
The analogy to speculative decoding is direct: just as the paper found acceptance rates of
|
|
162
|
+
~0.7-0.9 for draft tokens depending on domain, our task-level acceptance rates should be
|
|
163
|
+
similar or better, since we have more context for classification than a draft model has for
|
|
164
|
+
next-token prediction.
|