ninja-terminals 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -17
- package/cli.js +23 -0
- package/lib/auth.js +195 -0
- package/lib/hypothesis-validator.js +346 -0
- package/lib/post-session.js +426 -0
- package/lib/pre-dispatch.js +265 -0
- package/lib/prompt-delivery.js +127 -0
- package/lib/settings-gen.js +82 -23
- package/package.json +8 -6
- package/public/app.js +282 -13
- package/public/index.html +45 -0
- package/public/style.css +300 -0
- package/server.js +358 -33
- package/ORCHESTRATOR-PROMPT.md +0 -295
- package/orchestrator/evolution-log.md +0 -33
- package/orchestrator/identity.md +0 -60
- package/orchestrator/metrics/.gitkeep +0 -0
- package/orchestrator/metrics/raw/.gitkeep +0 -0
- package/orchestrator/metrics/session-2026-03-23-setup.md +0 -54
- package/orchestrator/metrics/session-2026-03-24-appcast-build.md +0 -55
- package/orchestrator/playbooks.md +0 -71
- package/orchestrator/security-protocol.md +0 -69
- package/orchestrator/tool-registry.md +0 -96
package/CLAUDE.md
CHANGED
|
@@ -54,25 +54,10 @@ These status lines are CRITICAL — the orchestrator parses them to know your st
|
|
|
54
54
|
- The orchestrator relays between terminals
|
|
55
55
|
|
|
56
56
|
## MCP Tools
|
|
57
|
-
|
|
58
|
-
- **postforme**: Video rendering, social publishing, Meta ads, content management, brand profiles, asset management, insights/analytics
|
|
59
|
-
- **studychat**: RAG knowledge base, DMs, C2C messaging, document upload/query
|
|
60
|
-
- **chrome-devtools**: Browser automation — navigate, click, type, screenshot, forms, network monitoring
|
|
61
|
-
- **gmail**: Search emails, read messages, download attachments
|
|
62
|
-
- **netlify-billing / render-billing**: Deployment status, billing, service health
|
|
63
|
-
- **builder-pro**: Code review, security scan, auto-fix, architecture validation
|
|
64
|
-
- **gkchatty**: Knowledge base queries, uploads — DO NOT USE unless explicitly instructed
|
|
65
|
-
|
|
66
|
-
### PostForMe Publishing — Use the Right Tool
|
|
67
|
-
| Content Type | Correct Tool | Wrong Tool (will fail) |
|
|
68
|
-
|---|---|---|
|
|
69
|
-
| Video → IG Reel | `publish_meta(contentId, platform: "instagram")` | — |
|
|
70
|
-
| Video → FB | `publish_meta(contentId, platform: "facebook")` | — |
|
|
71
|
-
| Video → Story | `publish_story(contentId, imageUrl/videoUrl)` | publish_meta |
|
|
72
|
-
| Carousel (multi-image) | `publish_carousel(imageUrls: [...], caption)` | publish_meta |
|
|
57
|
+
Ninja Terminals works with any MCP tools you have configured. The orchestrator and workers will automatically detect and use your installed MCP servers.
|
|
73
58
|
|
|
74
59
|
### Tool Selection Priority
|
|
75
|
-
1. Check
|
|
60
|
+
1. Check your available tool list first — verify it accepts the parameters you need
|
|
76
61
|
2. Use the most direct tool available (MCP > browser automation > manual)
|
|
77
62
|
3. If an MCP tool exists for the task, prefer it over browser-driving
|
|
78
63
|
4. Use browser automation for websites without an MCP/API
|
package/cli.js
CHANGED
|
@@ -33,13 +33,23 @@ OPTIONS
|
|
|
33
33
|
--port <number> Port to listen on (default: 3300)
|
|
34
34
|
--terminals <number> Number of terminals to spawn (default: 4)
|
|
35
35
|
--cwd <path> Working directory for terminals (default: current dir)
|
|
36
|
+
--token <jwt> Auth token for Pro users / CI (skips browser login)
|
|
37
|
+
--offline Offline mode for Pro users (skips backend validation)
|
|
36
38
|
--version, -v Print version and exit
|
|
37
39
|
--help, -h Show this help message
|
|
38
40
|
|
|
41
|
+
AUTHENTICATION
|
|
42
|
+
Pro users can authenticate via:
|
|
43
|
+
1. Browser login (default) - sign in at the web UI
|
|
44
|
+
2. --token flag - pass JWT directly (useful for CI/scripts)
|
|
45
|
+
3. --offline flag - skip validation (requires downloaded Pro package)
|
|
46
|
+
|
|
39
47
|
EXAMPLES
|
|
40
48
|
npx ninja-terminals
|
|
41
49
|
npx ninja-terminals --port 3301 --terminals 2
|
|
42
50
|
npx ninja-terminals --cwd /path/to/my-project
|
|
51
|
+
npx ninja-terminals --token eyJhbGciOiJIUzI1NiIs...
|
|
52
|
+
npx ninja-terminals --offline
|
|
43
53
|
`);
|
|
44
54
|
process.exit(0);
|
|
45
55
|
}
|
|
@@ -52,6 +62,8 @@ if (hasFlag('--version') || hasFlag('-v')) {
|
|
|
52
62
|
const port = parseInt(getArg('--port', '3300'), 10);
|
|
53
63
|
const terminals = parseInt(getArg('--terminals', '4'), 10);
|
|
54
64
|
const cwd = getArg('--cwd', process.cwd());
|
|
65
|
+
const token = getArg('--token', null);
|
|
66
|
+
const offline = hasFlag('--offline');
|
|
55
67
|
|
|
56
68
|
if (isNaN(port) || port < 1 || port > 65535) {
|
|
57
69
|
console.error(`Error: --port must be a number between 1 and 65535`);
|
|
@@ -65,6 +77,8 @@ if (isNaN(terminals) || terminals < 1 || terminals > 16) {
|
|
|
65
77
|
|
|
66
78
|
// ── Startup banner ───────────────────────────────────────────
|
|
67
79
|
|
|
80
|
+
const authMode = offline ? 'offline' : (token ? 'token' : 'browser');
|
|
81
|
+
|
|
68
82
|
console.log(`
|
|
69
83
|
╔═══════════════════════════════════════╗
|
|
70
84
|
║ NINJA TERMINALS v${pkg.version} ║
|
|
@@ -73,6 +87,7 @@ console.log(`
|
|
|
73
87
|
║ Port : ${String(port).padEnd(24)} ║
|
|
74
88
|
║ Terminals : ${String(terminals).padEnd(24)} ║
|
|
75
89
|
║ CWD : ${cwd.length > 24 ? '...' + cwd.slice(-21) : cwd.padEnd(24)} ║
|
|
90
|
+
║ Auth : ${authMode.padEnd(24)} ║
|
|
76
91
|
╚═══════════════════════════════════════╝
|
|
77
92
|
`);
|
|
78
93
|
|
|
@@ -84,6 +99,14 @@ process.env.PORT = String(port);
|
|
|
84
99
|
process.env.DEFAULT_TERMINALS = String(terminals);
|
|
85
100
|
process.env.DEFAULT_CWD = cwd;
|
|
86
101
|
|
|
102
|
+
// Auth env vars
|
|
103
|
+
if (token) {
|
|
104
|
+
process.env.NINJA_AUTH_TOKEN = token;
|
|
105
|
+
}
|
|
106
|
+
if (offline) {
|
|
107
|
+
process.env.NINJA_OFFLINE = '1';
|
|
108
|
+
}
|
|
109
|
+
|
|
87
110
|
// ── Auto-open browser ────────────────────────────────────────
|
|
88
111
|
|
|
89
112
|
function openBrowser(url) {
|
package/lib/auth.js
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
'use strict';

// ---------------------------------------------------------------------------
// Auth module — Token validation and session middleware for Ninja Terminals
// ---------------------------------------------------------------------------

// Backend base URL; NINJA_BACKEND_URL overrides the default.
// (Note: `||` means an empty-string env var also falls back to the default.)
const BACKEND_URL = process.env.NINJA_BACKEND_URL || 'https://emtchat-backend.onrender.com';

// In-memory cache for validated sessions (token -> session data)
// Used as fallback when network is unavailable
// NOTE(review): entries are never evicted here, so this grows with the number
// of distinct tokens seen over the process lifetime.
const validationCache = new Map();
|
|
12
|
+
|
|
13
|
+
/**
 * Validate a token against the backend.
 *
 * Successful validations are cached in `validationCache`. On a network error
 * OR a backend 5xx, a previously-successful validation is reused so a
 * transient backend outage does not invalidate every user. Only a definitive
 * 4xx response (token rejected) returns null when a cached session exists.
 *
 * @param {string} token - Bearer token to validate
 * @returns {Promise<{valid: boolean, tier: string, terminalsMax: number, features: string[], validatedAt: number}|null>}
 *          Session data when valid, otherwise null.
 */
async function validateToken(token) {
  if (!token) return null;

  try {
    const response = await fetch(`${BACKEND_URL}/api/ninja/validate`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${token}`,
        'Content-Type': 'application/json',
      },
      signal: AbortSignal.timeout(10000), // 10s timeout
    });

    if (response.status >= 500) {
      // Backend outage — treat like a network error and fall back to cache
      // rather than reporting the token as invalid.
      const cached = validationCache.get(token);
      if (cached && cached.valid) {
        console.warn(`[auth] Backend error ${response.status} validating token, using cache`);
        return cached;
      }
      return null;
    }

    if (!response.ok) {
      // 4xx — token invalid or expired
      return null;
    }

    const data = await response.json();

    const result = {
      valid: true,
      tier: data.tier || 'free',
      terminalsMax: data.terminalsMax || 1,
      features: data.features || [],
      validatedAt: Date.now(),
    };

    // Cache the result for offline/outage fallback
    validationCache.set(token, result);
    return result;

  } catch (err) {
    // Network error — check cache for fallback
    const cached = validationCache.get(token);
    if (cached && cached.valid) {
      console.warn(`[auth] Network error validating token, using cache: ${err.message}`);
      return cached;
    }
    return null;
  }
}
|
|
61
|
+
|
|
62
|
+
/**
 * Create Express middleware that validates Authorization Bearer tokens.
 *
 * A `sessionCache` entry validated within the last five minutes is accepted
 * without a backend round-trip; otherwise the token is re-validated via
 * validateToken(). On success, `req.ninjaUser` is populated with
 * { tier, terminalsMax, features, token }.
 *
 * @param {Map} sessionCache - Shared session cache (token -> session data)
 * @returns {import('express').RequestHandler}
 */
function createAuthMiddleware(sessionCache) {
  const CACHE_TTL = 5 * 60 * 1000; // 5 minutes — hoisted; was re-declared per request

  // Attach authenticated session info to the request (single definition for
  // both the cache-hit and fresh-validation paths).
  const attachUser = (req, token, session) => {
    req.ninjaUser = {
      tier: session.tier,
      terminalsMax: session.terminalsMax,
      features: session.features,
      token,
    };
  };

  return async function authMiddleware(req, res, next) {
    // Extract Bearer token from Authorization header
    const authHeader = req.headers.authorization;
    if (!authHeader || !authHeader.startsWith('Bearer ')) {
      return res.status(401).json({ error: 'Missing or invalid Authorization header' });
    }

    const token = authHeader.slice(7); // Remove 'Bearer ' prefix
    if (!token) {
      return res.status(401).json({ error: 'Empty token' });
    }

    // Check session cache first
    const cached = sessionCache.get(token);
    const now = Date.now();

    if (cached && (now - cached.validatedAt) < CACHE_TTL) {
      // Cache hit and fresh
      attachUser(req, token, cached);
      return next();
    }

    // Cache miss or stale — validate against backend
    const result = await validateToken(token);

    if (!result || !result.valid) {
      return res.status(401).json({ error: 'Invalid or expired token' });
    }

    // Update cache
    sessionCache.set(token, {
      tier: result.tier,
      terminalsMax: result.terminalsMax,
      features: result.features,
      validatedAt: now,
    });

    attachUser(req, token, result);
    next();
  };
}
|
|
122
|
+
|
|
123
|
+
/**
 * WebSocket token validation for upgrade requests.
 *
 * Mirrors the HTTP middleware: a cache entry validated within the last five
 * minutes is trusted as-is; anything else is re-checked against the backend
 * via validateToken().
 *
 * @param {string} token - Token from query param
 * @param {Map} sessionCache - Shared session cache
 * @returns {Promise<{valid: boolean, tier?: string, terminalsMax?: number, features?: string[]}>}
 */
async function validateWebSocketToken(token, sessionCache) {
  if (!token) {
    return { valid: false };
  }

  const CACHE_TTL = 5 * 60 * 1000;
  const now = Date.now();

  // Trust a sufficiently fresh cache entry without a backend round-trip.
  const cached = sessionCache.get(token);
  const isFresh = cached != null && (now - cached.validatedAt) < CACHE_TTL;
  if (isFresh) {
    return { valid: true, ...cached };
  }

  // Stale or unknown token — ask the backend.
  const result = await validateToken(token);
  if (!result || !result.valid) {
    return { valid: false };
  }

  sessionCache.set(token, {
    tier: result.tier,
    terminalsMax: result.terminalsMax,
    features: result.features,
    validatedAt: now,
  });

  return { valid: true, ...result };
}
|
|
160
|
+
|
|
161
|
+
/**
 * Start heartbeat that re-validates stored sessions.
 * If a session becomes invalid, the callback is invoked to clean up.
 *
 * Sessions are re-validated sequentially (one backend call at a time). A tick
 * that is still awaiting validations when the next interval fires is skipped,
 * so slow backends cannot cause overlapping sweeps over the same cache.
 *
 * @param {Map} sessionCache - Shared session cache
 * @param {(token: string) => void} onInvalid - Callback when a session becomes invalid
 * @param {number} [intervalMs=300000] - Heartbeat interval (default 5 min)
 * @returns {NodeJS.Timeout} Interval handle
 */
function startSessionHeartbeat(sessionCache, onInvalid, intervalMs = 5 * 60 * 1000) {
  let inFlight = false; // guard: the async callback may outlive one interval

  return setInterval(async () => {
    if (inFlight) return; // previous sweep still running — skip this tick
    inFlight = true;
    try {
      for (const [token, session] of sessionCache.entries()) {
        const result = await validateToken(token);
        if (!result || !result.valid) {
          console.log(`[auth] Session invalidated during heartbeat`);
          sessionCache.delete(token);
          onInvalid(token);
        } else {
          // Refresh cached entitlements in place
          session.tier = result.tier;
          session.terminalsMax = result.terminalsMax;
          session.features = result.features;
          session.validatedAt = Date.now();
        }
      }
    } finally {
      inFlight = false;
    }
  }, intervalMs);
}
|
|
188
|
+
|
|
189
|
+
// Public surface of the auth module.
module.exports = {
  validateToken,
  createAuthMiddleware,
  validateWebSocketToken,
  startSessionHeartbeat,
  BACKEND_URL,
};
|
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
'use strict';

// ---------------------------------------------------------------------------
// Hypothesis validator — compares baseline vs test session metrics to decide
// whether playbook hypotheses should be promoted, rejected, or kept running.
// ---------------------------------------------------------------------------

const fs = require('fs');
const path = require('path');
const { parsePlaybooks } = require('./playbook-tracker');
const { SUMMARIES_PATH } = require('./analyze-session');

// Where hypothesis tracking state (first-seen session counts) is persisted.
const STATE_PATH = path.join(__dirname, '..', 'orchestrator', 'metrics', 'hypothesis-state.json');

// Decision thresholds from Phase 4 spec
const MIN_TEST_SESSIONS = 3; // minimum test sessions before a promote/reject verdict
const IMPROVEMENT_THRESHOLD = 0.10; // 10%
|
|
13
|
+
|
|
14
|
+
/**
 * Extract metric targets from hypothesis text.
 * Maps hypothesis claims to measurable metrics.
 *
 * NOTE(review): the rating pattern accepts any single letter `[a-s]` —
 * presumably letter grades; confirm the intended grade range.
 *
 * @param {string} hypothesisText - The full hypothesis section text
 * @returns {object} { type: 'tool'|'session'|'pattern', target: string, metric: string }
 */
function extractMetricTarget(hypothesisText) {
  // Tool-specific hypotheses: "Edit has C rating", "prefer Write over Edit", "Glob is reliable"
  const toolPatterns = [
    { regex: /\b(edit|write|read|bash|glob|grep|agent)\b.*\b(rating|reliable|failure|prefer)/i, metric: 'success_rate' },
    { regex: /prefer\s+(\w+)\s+over\s+(\w+)/i, metric: 'success_rate' },
    { regex: /\b(\w+)\s+has\s+[a-s]\s+rating/i, metric: 'success_rate' },
  ];

  for (const { regex, metric } of toolPatterns) {
    const match = hypothesisText.match(regex);
    if (!match) continue;
    // Normalize the captured tool name to TitleCase (e.g. "edit" -> "Edit").
    const raw = match[1];
    const toolName = raw.charAt(0).toUpperCase() + raw.slice(1).toLowerCase();
    return { type: 'tool', target: toolName, metric };
  }

  // Session-level hypotheses: "staggered dispatch", "session time"
  const text = hypothesisText.toLowerCase();
  const sessionKeywords = ['staggered', 'dispatch', 'session time'];
  if (sessionKeywords.some((keyword) => text.includes(keyword))) {
    return { type: 'session', target: 'duration', metric: 'duration_min' };
  }

  // Default: overall tool success rate
  return { type: 'aggregate', target: 'all_tools', metric: 'success_rate' };
}
|
|
47
|
+
|
|
48
|
+
/**
 * Load all session summaries from an NDJSON file.
 * Malformed lines are skipped; entries sharing a session_id are collapsed,
 * keeping the last occurrence in file order.
 *
 * @param {string} summariesPath - Path to the NDJSON file (defaults to SUMMARIES_PATH)
 * @returns {Array<object>} Parsed, deduplicated session summaries
 */
function loadSummaries(summariesPath) {
  const filePath = summariesPath || SUMMARIES_PATH;
  if (!fs.existsSync(filePath)) return [];

  const raw = fs.readFileSync(filePath, 'utf8');
  const parsed = [];
  for (const line of raw.trim().split('\n').filter(Boolean)) {
    try {
      parsed.push(JSON.parse(line));
    } catch {
      // skip malformed line
    }
  }

  // Last write wins per session_id; Map preserves first-insertion order.
  const bySessionId = new Map();
  for (const entry of parsed) {
    bySessionId.set(entry.session_id, entry);
  }

  return [...bySessionId.values()];
}
|
|
74
|
+
|
|
75
|
+
/**
 * Load hypothesis tracking state from disk.
 * Tracks when each hypothesis was first observed (by session count).
 * A missing, unreadable, or corrupt state file degrades to an empty state.
 *
 * @param {string} [statePath=STATE_PATH] - Override for the state file location
 * @returns {object} { hypotheses: { [name]: { firstSeenAt: number, sessionCountAtStart: number } } }
 */
function loadState(statePath = STATE_PATH) {
  if (!fs.existsSync(statePath)) {
    return { hypotheses: {} };
  }
  try {
    return JSON.parse(fs.readFileSync(statePath, 'utf8'));
  } catch {
    return { hypotheses: {} };
  }
}

/**
 * Save hypothesis tracking state, creating parent directories as needed.
 *
 * @param {object} state
 * @param {string} [statePath=STATE_PATH] - Override for the state file location
 */
function saveState(state, statePath = STATE_PATH) {
  fs.mkdirSync(path.dirname(statePath), { recursive: true });
  fs.writeFileSync(statePath, JSON.stringify(state, null, 2), 'utf8');
}
|
|
100
|
+
|
|
101
|
+
/**
 * Compute aggregate metrics for a set of sessions.
 *
 * - type 'tool':    success rate of the single named tool across all sessions
 * - type 'session': mean of a positive per-session numeric field (e.g. duration_min)
 * - otherwise:      success rate of every tool in every session combined
 *
 * @param {Array<object>} sessions
 * @param {object} target - { type, target, metric }
 * @returns {object} { value: number|null, sampleSize: number }
 */
function computeMetric(sessions, target) {
  if (sessions.length === 0) {
    return { value: null, sampleSize: 0 };
  }

  // Fold invocation/success tallies into a success-rate result.
  const toRate = (invocations, successes) => {
    if (invocations === 0) {
      return { value: null, sampleSize: 0 };
    }
    return { value: successes / invocations, sampleSize: invocations };
  };

  if (target.type === 'tool') {
    // Tally the one named tool only
    let invocations = 0;
    let successes = 0;
    for (const session of sessions) {
      const toolData = session.tools?.[target.target];
      if (toolData) {
        invocations += toolData.invocations || 0;
        successes += toolData.successes || 0;
      }
    }
    return toRate(invocations, successes);
  }

  if (target.type === 'session') {
    // Mean of a per-session numeric field; null/zero/negative values excluded
    const values = sessions
      .map((session) => session[target.metric])
      .filter((v) => v != null && v > 0);
    if (values.length === 0) {
      return { value: null, sampleSize: 0 };
    }
    const total = values.reduce((sum, v) => sum + v, 0);
    return { value: total / values.length, sampleSize: values.length };
  }

  // Aggregate: every tool in every session
  let invocations = 0;
  let successes = 0;
  for (const session of sessions) {
    for (const toolData of Object.values(session.tools || {})) {
      invocations += toolData.invocations || 0;
      successes += toolData.successes || 0;
    }
  }
  return toRate(invocations, successes);
}
|
|
166
|
+
|
|
167
|
+
/**
 * Calculate percentage change between baseline and test.
 * For success rates: positive = improvement.
 * For duration: lower is better, so the sign is flipped — the returned value
 * always reads as "positive = better".
 *
 * @param {number} baseline
 * @param {number} test
 * @param {string} metric
 * @returns {number|null} Change as decimal (-0.15 = 15% worse, 0.15 = 15% better),
 *                        or null when either value is missing or baseline is 0.
 */
function calculateChange(baseline, test, metric) {
  const comparable = baseline != null && baseline !== 0 && test != null;
  if (!comparable) {
    return null;
  }

  const relative = (test - baseline) / baseline;

  // Lower duration is better: invert so positive always means improvement.
  return metric === 'duration_min' ? -relative : relative;
}
|
|
190
|
+
|
|
191
|
+
/**
 * Validate all hypotheses in playbooks.md against session metrics.
 * Compares baseline (before hypothesis) vs test (after hypothesis) periods.
 *
 * Decision rules (from Phase 4 spec):
 * - 3+ test sessions AND metric improved by >10% -> promote
 * - 3+ test sessions AND metric worsened by >10% -> reject
 * - Otherwise -> continue (need more data)
 *
 * Side effects: persists updated tracking state to STATE_PATH via saveState().
 *
 * @param {string} playbooksPath - Path to playbooks.md
 * @param {string} summariesPath - Path to summaries.ndjson
 * @returns {Array<{ hypothesis: string, decision: 'promote'|'reject'|'continue', evidence: string, metrics: object }>}
 */
function validateHypotheses(playbooksPath, summariesPath) {
  const playbooks = parsePlaybooks(playbooksPath);
  // loadSummaries preserves file (append) order — the index-based
  // baseline/test split below relies on that order being chronological.
  const summaries = loadSummaries(summariesPath);
  const state = loadState();
  const results = [];

  // Filter for hypothesis/testing entries (status may contain extra text)
  // NOTE(review): assumes parsePlaybooks entries carry string `status`,
  // `name`, and `section` fields — confirm against playbook-tracker.
  const hypotheses = playbooks.filter(p =>
    p.status.includes('hypothesis') || p.status.includes('testing')
  );

  const currentSessionCount = summaries.length;

  for (const hyp of hypotheses) {
    // Initialize tracking if new hypothesis; firstSeenAt is the number of
    // sessions that existed when the hypothesis first appeared.
    if (!state.hypotheses[hyp.name]) {
      state.hypotheses[hyp.name] = {
        firstSeenAt: currentSessionCount,
        sessionCountAtStart: currentSessionCount,
      };
    }

    const tracking = state.hypotheses[hyp.name];
    const target = extractMetricTarget(hyp.section);

    // Split sessions into baseline (before hypothesis) and test (after)
    const baselineSessions = summaries.slice(0, tracking.firstSeenAt);
    const testSessions = summaries.slice(tracking.firstSeenAt);

    const baselineMetric = computeMetric(baselineSessions, target);
    const testMetric = computeMetric(testSessions, target);

    // null when either period lacks data or the baseline is zero
    const change = calculateChange(baselineMetric.value, testMetric.value, target.metric);

    // Build evidence string
    const evidenceParts = [];
    evidenceParts.push(`Target: ${target.target} (${target.metric})`);
    evidenceParts.push(`Baseline: ${baselineMetric.value?.toFixed(3) ?? 'N/A'} (${baselineMetric.sampleSize} samples)`);
    evidenceParts.push(`Test: ${testMetric.value?.toFixed(3) ?? 'N/A'} (${testMetric.sampleSize} samples)`);
    if (change != null) {
      const changePercent = (change * 100).toFixed(1);
      evidenceParts.push(`Change: ${change > 0 ? '+' : ''}${changePercent}%`);
    }
    evidenceParts.push(`Test sessions: ${testSessions.length}`);

    // Decision logic — thresholds are module constants MIN_TEST_SESSIONS
    // and IMPROVEMENT_THRESHOLD
    let decision = 'continue';
    let reason = '';

    if (testSessions.length < MIN_TEST_SESSIONS) {
      reason = `Need ${MIN_TEST_SESSIONS}+ test sessions, have ${testSessions.length}`;
    } else if (change == null) {
      reason = 'Insufficient metric data for comparison';
    } else if (change >= IMPROVEMENT_THRESHOLD) {
      decision = 'promote';
      reason = `Improved by ${(change * 100).toFixed(1)}% (>${IMPROVEMENT_THRESHOLD * 100}% threshold)`;
    } else if (change <= -IMPROVEMENT_THRESHOLD) {
      decision = 'reject';
      reason = `Worsened by ${(-change * 100).toFixed(1)}% (>${IMPROVEMENT_THRESHOLD * 100}% threshold)`;
    } else {
      reason = `Change of ${(change * 100).toFixed(1)}% within neutral zone (±${IMPROVEMENT_THRESHOLD * 100}%)`;
    }

    results.push({
      hypothesis: hyp.name,
      status: hyp.status,
      decision,
      evidence: evidenceParts.join(' | ') + ` | ${reason}`,
      metrics: {
        target,
        baseline: baselineMetric,
        test: testMetric,
        change,
        testSessionCount: testSessions.length,
      },
    });
  }

  // Save updated state
  saveState(state);

  return results;
}
|
|
287
|
+
|
|
288
|
+
/**
 * Get a summary of hypothesis validation status.
 * @param {Array} results - Output from validateHypotheses
 * @returns {object} { total, promote, reject, continue, summary: string }
 */
function summarizeResults(results) {
  // Count results carrying a given decision.
  const tally = (decision) => results.filter((r) => r.decision === decision).length;

  const counts = {
    total: results.length,
    promote: tally('promote'),
    reject: tally('reject'),
    continue: tally('continue'),
  };

  const lines = [
    `Hypothesis validation: ${counts.total} total`,
    `  Promote: ${counts.promote}`,
    `  Reject: ${counts.reject}`,
    `  Continue: ${counts.continue}`,
    ...results.map((r) => `  - [${r.decision.toUpperCase()}] ${r.hypothesis}`),
  ];

  return { ...counts, summary: lines.join('\n') };
}
|
|
314
|
+
|
|
315
|
+
// CLI mode — run directly:
//   node lib/hypothesis-validator.js [playbooksPath] [summariesPath]
// Both arguments are optional and fall back to the repo defaults.
if (require.main === module) {
  const playbooksPath = process.argv[2] || path.join(__dirname, '..', 'orchestrator', 'playbooks.md');
  const summariesPath = process.argv[3] || SUMMARIES_PATH;

  console.log('Validating hypotheses...');
  console.log('  Playbooks:', playbooksPath);
  console.log('  Summaries:', summariesPath);
  console.log('');

  // Note: this also persists updated tracking state (see validateHypotheses).
  const results = validateHypotheses(playbooksPath, summariesPath);
  const summary = summarizeResults(results);

  console.log(summary.summary);
  console.log('');
  console.log('Details:');
  for (const r of results) {
    console.log(`\n${r.hypothesis}:`);
    console.log(`  Decision: ${r.decision}`);
    console.log(`  Evidence: ${r.evidence}`);
  }
}
|
|
337
|
+
|
|
338
|
+
// Public surface: main entry points plus internal helpers and the state path.
module.exports = {
  validateHypotheses,
  summarizeResults,
  extractMetricTarget,
  loadSummaries,
  computeMetric,
  calculateChange,
  STATE_PATH,
};
|