@dotsetlabs/dotclaw 2.4.0 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +9 -10
- package/README.md +8 -4
- package/config-examples/runtime.json +34 -8
- package/config-examples/tool-policy.json +12 -2
- package/container/agent-runner/package-lock.json +2 -2
- package/container/agent-runner/package.json +1 -1
- package/container/agent-runner/src/agent-config.ts +19 -3
- package/container/agent-runner/src/container-protocol.ts +11 -0
- package/container/agent-runner/src/context-overflow-recovery.ts +39 -0
- package/container/agent-runner/src/index.ts +603 -165
- package/container/agent-runner/src/openrouter-input.ts +159 -0
- package/container/agent-runner/src/system-prompt.ts +13 -3
- package/container/agent-runner/src/tool-loop-policy.ts +741 -0
- package/container/agent-runner/src/tools.ts +211 -8
- package/dist/agent-context.d.ts +1 -0
- package/dist/agent-context.d.ts.map +1 -1
- package/dist/agent-context.js +21 -9
- package/dist/agent-context.js.map +1 -1
- package/dist/agent-execution.d.ts +2 -0
- package/dist/agent-execution.d.ts.map +1 -1
- package/dist/agent-execution.js +164 -15
- package/dist/agent-execution.js.map +1 -1
- package/dist/agent-semaphore.d.ts +24 -1
- package/dist/agent-semaphore.d.ts.map +1 -1
- package/dist/agent-semaphore.js +109 -20
- package/dist/agent-semaphore.js.map +1 -1
- package/dist/cli.js +3 -11
- package/dist/cli.js.map +1 -1
- package/dist/config.d.ts +2 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +2 -0
- package/dist/config.js.map +1 -1
- package/dist/container-protocol.d.ts +22 -0
- package/dist/container-protocol.d.ts.map +1 -1
- package/dist/container-protocol.js.map +1 -1
- package/dist/container-runner.d.ts +7 -0
- package/dist/container-runner.d.ts.map +1 -1
- package/dist/container-runner.js +417 -143
- package/dist/container-runner.js.map +1 -1
- package/dist/db.d.ts.map +1 -1
- package/dist/db.js +46 -12
- package/dist/db.js.map +1 -1
- package/dist/error-messages.d.ts.map +1 -1
- package/dist/error-messages.js +18 -4
- package/dist/error-messages.js.map +1 -1
- package/dist/failover-policy.d.ts +41 -0
- package/dist/failover-policy.d.ts.map +1 -0
- package/dist/failover-policy.js +261 -0
- package/dist/failover-policy.js.map +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/ipc-dispatcher.d.ts.map +1 -1
- package/dist/ipc-dispatcher.js +27 -43
- package/dist/ipc-dispatcher.js.map +1 -1
- package/dist/mcp-config.d.ts +22 -0
- package/dist/mcp-config.d.ts.map +1 -0
- package/dist/mcp-config.js +94 -0
- package/dist/mcp-config.js.map +1 -0
- package/dist/memory-backend.d.ts +27 -0
- package/dist/memory-backend.d.ts.map +1 -0
- package/dist/memory-backend.js +112 -0
- package/dist/memory-backend.js.map +1 -0
- package/dist/memory-recall.d.ts.map +1 -1
- package/dist/memory-recall.js +135 -22
- package/dist/memory-recall.js.map +1 -1
- package/dist/memory-store.d.ts +1 -0
- package/dist/memory-store.d.ts.map +1 -1
- package/dist/memory-store.js +55 -7
- package/dist/memory-store.js.map +1 -1
- package/dist/message-pipeline.d.ts +24 -0
- package/dist/message-pipeline.d.ts.map +1 -1
- package/dist/message-pipeline.js +131 -27
- package/dist/message-pipeline.js.map +1 -1
- package/dist/metrics.d.ts +1 -0
- package/dist/metrics.d.ts.map +1 -1
- package/dist/metrics.js +9 -0
- package/dist/metrics.js.map +1 -1
- package/dist/providers/discord/discord-provider.d.ts.map +1 -1
- package/dist/providers/discord/discord-provider.js +72 -4
- package/dist/providers/discord/discord-provider.js.map +1 -1
- package/dist/providers/telegram/telegram-provider.d.ts.map +1 -1
- package/dist/providers/telegram/telegram-provider.js +65 -3
- package/dist/providers/telegram/telegram-provider.js.map +1 -1
- package/dist/recall-policy.d.ts +12 -0
- package/dist/recall-policy.d.ts.map +1 -0
- package/dist/recall-policy.js +89 -0
- package/dist/recall-policy.js.map +1 -0
- package/dist/runtime-config.d.ts +33 -0
- package/dist/runtime-config.d.ts.map +1 -1
- package/dist/runtime-config.js +109 -9
- package/dist/runtime-config.js.map +1 -1
- package/dist/streaming.d.ts.map +1 -1
- package/dist/streaming.js +125 -33
- package/dist/streaming.js.map +1 -1
- package/dist/task-scheduler.d.ts.map +1 -1
- package/dist/task-scheduler.js +4 -2
- package/dist/task-scheduler.js.map +1 -1
- package/dist/tool-policy.d.ts.map +1 -1
- package/dist/tool-policy.js +26 -4
- package/dist/tool-policy.js.map +1 -1
- package/dist/trace-writer.d.ts +12 -0
- package/dist/trace-writer.d.ts.map +1 -1
- package/dist/trace-writer.js.map +1 -1
- package/dist/turn-hygiene.d.ts +14 -0
- package/dist/turn-hygiene.d.ts.map +1 -0
- package/dist/turn-hygiene.js +214 -0
- package/dist/turn-hygiene.js.map +1 -0
- package/dist/webhook.d.ts.map +1 -1
- package/dist/webhook.js +1 -0
- package/dist/webhook.js.map +1 -1
- package/package.json +15 -1
- package/scripts/benchmark-baseline.js +365 -0
- package/scripts/benchmark-harness.js +1413 -0
- package/scripts/benchmark-scenarios.js +301 -0
- package/scripts/canary-suite.js +123 -0
- package/scripts/generate-controlled-traces.js +230 -0
- package/scripts/release-slo-check.js +214 -0
- package/scripts/run-live-canary.js +339 -0
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import fs from 'node:fs';
|
|
4
|
+
import os from 'node:os';
|
|
5
|
+
import path from 'node:path';
|
|
6
|
+
|
|
7
|
+
const DEFAULT_THRESHOLDS = {
|
|
8
|
+
memory_carryover: { min_candidates: 5, min_pass_rate: 0.9 },
|
|
9
|
+
tool_heavy: { min_candidates: 5, min_pass_rate: 0.95 },
|
|
10
|
+
transient_recovery: { min_candidates: 3, min_pass_rate: 0.8 },
|
|
11
|
+
context_recovery: { min_candidates: 2, min_pass_rate: 0.75 },
|
|
12
|
+
empty_success_rate: { min_success: 20, max_rate: 0.02 }
|
|
13
|
+
};
|
|
14
|
+
|
|
15
|
+
/**
 * Parse command-line flags for the scenario benchmark.
 *
 * Recognised flags: `--days <n>`, `--dir <path>`, `--input <file>`,
 * `--enforce` and `--recovery-window-ms <ms>`. Unknown arguments are ignored,
 * and a value flag that appears last (with no value after it) is ignored too.
 *
 * @param {string[]} argv - Arguments without the node/script prefix.
 * @returns {{days: number, dir: string, input: string, enforce: boolean, recoveryWindowMs: number}}
 */
function parseArgs(argv) {
  const args = {
    days: 7,
    dir: '',
    input: '',
    enforce: false,
    recoveryWindowMs: 10 * 60 * 1000,
  };

  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];
    const hasValue = i + 1 < argv.length;

    switch (arg) {
      case '--enforce':
        args.enforce = true;
        break;
      case '--days':
        if (hasValue) {
          // Validate AFTER flooring: a fractional value such as 0.5 used to
          // pass a `> 0` check and then collapse to a zero-day window.
          const days = Math.floor(Number(argv[i + 1]));
          if (Number.isFinite(days) && days >= 1) args.days = days;
          i += 1;
        }
        break;
      case '--dir':
        if (hasValue) {
          args.dir = argv[i + 1];
          i += 1;
        }
        break;
      case '--input':
        if (hasValue) {
          args.input = argv[i + 1];
          i += 1;
        }
        break;
      case '--recovery-window-ms':
        if (hasValue) {
          const windowMs = Number(argv[i + 1]);
          if (Number.isFinite(windowMs) && windowMs > 0) args.recoveryWindowMs = windowMs;
          i += 1;
        }
        break;
      default:
        break;
    }
  }

  return args;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Parse a JSON string without throwing.
 *
 * @param {string} line - Text to parse.
 * @returns {*} The parsed value, or `null` when the text is not valid JSON.
 */
function safeJsonParse(line) {
  let value = null;
  try {
    value = JSON.parse(line);
  } catch {
    value = null;
  }
  return value;
}
|
|
61
|
+
|
|
62
|
+
/**
 * Read a JSON Lines file and return its object records.
 *
 * A missing or unreadable file yields an empty array. Blank lines, lines that
 * are not valid JSON, and lines whose value is not an object (or is null) are
 * skipped.
 *
 * @param {string} filePath - Path of the JSONL file.
 * @returns {object[]} Parsed records in file order.
 */
function loadJsonlFile(filePath) {
  if (!fs.existsSync(filePath)) return [];

  let text;
  try {
    text = fs.readFileSync(filePath, 'utf-8');
  } catch {
    return [];
  }

  return text.split('\n').reduce((records, rawLine) => {
    if (rawLine.trim()) {
      let value = null;
      try {
        value = JSON.parse(rawLine);
      } catch {
        value = null;
      }
      if (value && typeof value === 'object') records.push(value);
    }
    return records;
  }, []);
}
|
|
78
|
+
|
|
79
|
+
/**
 * Collect trace records from every `trace-YYYY-MM-DD.jsonl` file in a directory.
 *
 * Files are visited in sorted name order. A record is kept only when its
 * `timestamp` parses to a finite time that is not earlier than `sinceMs`.
 *
 * @param {string} traceDir - Directory holding the trace files.
 * @param {number} sinceMs - Lower bound (epoch milliseconds) for kept records.
 * @returns {object[]} Matching records; empty when the directory does not exist.
 */
function loadTracesFromDir(traceDir, sinceMs) {
  if (!fs.existsSync(traceDir)) return [];

  const tracePattern = /^trace-\d{4}-\d{2}-\d{2}\.jsonl$/;
  const traceFiles = fs.readdirSync(traceDir).filter((entry) => tracePattern.test(entry));
  traceFiles.sort();

  return traceFiles.flatMap((entry) =>
    loadJsonlFile(path.join(traceDir, entry)).filter((record) => {
      const stamp = Date.parse(String(record.timestamp || ''));
      return Number.isFinite(stamp) && !(stamp < sinceMs);
    })
  );
}
|
|
95
|
+
|
|
96
|
+
/**
 * Resolve a record's time in epoch milliseconds.
 *
 * Uses the parsed `timestamp` string when it is a valid date, otherwise the
 * numeric `created_at`, otherwise 0.
 *
 * @param {object} row - Trace record.
 * @returns {number} Epoch milliseconds, or 0 when no usable time is present.
 */
function getTimestampMs(row) {
  const fromTimestamp = Date.parse(String(row.timestamp || ''));
  if (!Number.isFinite(fromTimestamp)) {
    const fromCreatedAt = Number(row.created_at);
    return Number.isFinite(fromCreatedAt) ? fromCreatedAt : 0;
  }
  return fromTimestamp;
}
|
|
102
|
+
|
|
103
|
+
/**
 * Tell whether a record counts as successful.
 *
 * @param {object} row - Trace record.
 * @returns {boolean} `false` only when `error_code` is a string with
 *   non-whitespace content; `true` otherwise.
 */
function isSuccess(row) {
  const code = row.error_code;
  if (typeof code !== 'string') return true;
  return code.trim().length === 0;
}
|
|
106
|
+
|
|
107
|
+
/**
 * Check whether an error text matches one of the transient-failure keywords
 * (rate limiting, timeouts, overload/unavailability, network errors, ...).
 *
 * @param {*} message - Error text; falsy values are treated as an empty string.
 * @returns {boolean} `true` when the lower-cased text matches the pattern.
 */
function isTransientErrorMessage(message) {
  const transientPattern = /rate.?limit|too many requests|429|timeout|timed out|deadline|overloaded|unavailable|bad gateway|server error|econnrefused|econnreset|eai_again|enotfound|provider error|model not available/;
  const normalized = String(message || '').toLowerCase();
  return transientPattern.test(normalized);
}
|
|
111
|
+
|
|
112
|
+
/**
 * Check whether an error text matches one of the context/token-limit keywords.
 *
 * @param {*} message - Error text; falsy values are treated as an empty string.
 * @returns {boolean} `true` when the lower-cased text matches the pattern.
 */
function isContextErrorMessage(message) {
  const contextPattern = /context.?length|maximum.?context|too many tokens|token.?limit/;
  const normalized = String(message || '').toLowerCase();
  return contextPattern.test(normalized);
}
|
|
116
|
+
|
|
117
|
+
/**
 * Detect a bracketed scenario marker such as `[memory]` or `[scenario:memory]`
 * inside an input text (case-insensitive).
 *
 * @param {*} inputText - Text to search; falsy values never match.
 * @param {string} tag - Tag name; regex metacharacters are matched literally.
 * @returns {boolean} `true` when the marker is present.
 */
function hasScenarioTag(inputText, tag) {
  const haystack = String(inputText || '').toLowerCase();
  if (haystack.length === 0) return false;

  // Escape the tag so it is matched as plain text inside the pattern.
  const literalTag = tag.replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&');
  const markerPattern = new RegExp(`\\[(?:scenario:)?${literalTag}\\]`, 'i');
  return markerPattern.test(haystack);
}
|
|
123
|
+
|
|
124
|
+
/**
 * Measure how often a matching record is followed by a successful record in
 * the same chat within a time window.
 *
 * Records are ordered by time and grouped by `chat_id` (missing ids share the
 * group `'unknown'`). Each record accepted by `matcher` is a candidate; it
 * passes when a later record of the same chat is successful and lies no more
 * than `recoveryWindowMs` after it.
 *
 * @param {object[]} rows - Trace records.
 * @param {(row: object) => boolean} matcher - Selects candidate records.
 * @param {number} recoveryWindowMs - Maximum delay for a recovery to count.
 * @returns {{candidates: number, passed: number, pass_rate: (number|null)}}
 *   `pass_rate` is rounded to four decimals, or `null` without candidates.
 */
function evaluateRecoveryScenario(rows, matcher, recoveryWindowMs) {
  const chronological = [...rows].sort(
    (left, right) => getTimestampMs(left) - getTimestampMs(right)
  );

  const timelines = new Map();
  for (const entry of chronological) {
    const chatKey = String(entry.chat_id || 'unknown');
    const timeline = timelines.get(chatKey);
    if (timeline) {
      timeline.push(entry);
    } else {
      timelines.set(chatKey, [entry]);
    }
  }

  let candidates = 0;
  let passed = 0;
  for (const timeline of timelines.values()) {
    timeline.forEach((candidate, index) => {
      if (!matcher(candidate)) return;
      candidates += 1;

      const failedAt = getTimestampMs(candidate);
      for (let k = index + 1; k < timeline.length; k += 1) {
        const later = timeline[k];
        const elapsed = getTimestampMs(later) - failedAt;
        if (elapsed < 0) continue;
        // The timeline is time-ordered, so nothing further can be in range.
        if (elapsed > recoveryWindowMs) break;
        if (isSuccess(later)) {
          passed += 1;
          break;
        }
      }
    });
  }

  const passRate = candidates > 0 ? Number((passed / candidates).toFixed(4)) : null;
  return { candidates, passed, pass_rate: passRate };
}
|
|
161
|
+
|
|
162
|
+
/**
 * Compute benchmark scenario metrics from trace records.
 *
 * Scenarios:
 * - memory_carryover: records with a positive `memory_recall_count` or a
 *   `[memory]` / `[memory_carryover]` tag; a record passes when it succeeded,
 *   recalled at least one memory and produced non-blank output.
 * - tool_heavy: records tagged `[tool_heavy]` or with at least three tool
 *   calls; a record passes when it has enough calls (1 if tagged, else 3),
 *   succeeded, produced output, and at most 20% (rounded down) of its calls
 *   were not ok.
 * - transient_recovery / context_recovery: failed records whose `error_code`
 *   matches the respective pattern, evaluated for recovery within the window.
 *
 * @param {object[]} rows - Trace records.
 * @param {{recoveryWindowMs?: number}} [options] - Window override; defaults
 *   to ten minutes when not a finite number.
 * @returns {{totals: object, scenarios: object}} Counts and pass rates
 *   (rates rounded to four decimals, `null` when the denominator is zero).
 */
export function evaluateScenarioMetrics(rows, options = {}) {
  const recoveryWindowMs = Number.isFinite(options.recoveryWindowMs)
    ? Number(options.recoveryWindowMs)
    : (10 * 60 * 1000);

  // Small local helpers shared by several scenarios below.
  const trimmedOutput = (row) => (typeof row.output_text === 'string' ? row.output_text.trim() : '');
  const toolCallsOf = (row) => (Array.isArray(row.tool_calls) ? row.tool_calls : []);
  const toRate = (count, total) => (total > 0 ? Number((count / total).toFixed(4)) : null);

  const successRows = rows.filter((row) => isSuccess(row));

  const memoryCandidates = rows.filter((row) => {
    const recalls = Number(row.memory_recall_count);
    if (Number.isFinite(recalls) && recalls > 0) return true;
    return hasScenarioTag(row.input_text, 'memory')
      || hasScenarioTag(row.input_text, 'memory_carryover');
  });
  const memoryPassed = memoryCandidates.filter((row) => {
    const recalls = Number(row.memory_recall_count) || 0;
    return isSuccess(row) && recalls > 0 && Boolean(trimmedOutput(row));
  }).length;

  const toolCandidates = rows.filter(
    (row) => hasScenarioTag(row.input_text, 'tool_heavy') || toolCallsOf(row).length >= 3
  );
  const toolPassed = toolCandidates.filter((row) => {
    const calls = toolCallsOf(row);
    // A tagged record only needs one call; untagged ones qualified with three.
    const requiredCalls = hasScenarioTag(row.input_text, 'tool_heavy') ? 1 : 3;
    if (calls.length < requiredCalls) return false;
    const failedCalls = calls.reduce((count, call) => (call?.ok ? count : count + 1), 0);
    const allowedFailures = Math.floor(calls.length * 0.2);
    return isSuccess(row) && Boolean(trimmedOutput(row)) && failedCalls <= allowedFailures;
  }).length;

  const transientRecovery = evaluateRecoveryScenario(
    rows,
    (row) => !isSuccess(row) && isTransientErrorMessage(row.error_code),
    recoveryWindowMs
  );
  const contextRecovery = evaluateRecoveryScenario(
    rows,
    (row) => !isSuccess(row) && isContextErrorMessage(row.error_code),
    recoveryWindowMs
  );

  const emptySuccess = successRows.filter((row) => !trimmedOutput(row)).length;

  return {
    totals: {
      records: rows.length,
      success: successRows.length,
      empty_success: emptySuccess,
      empty_success_rate: toRate(emptySuccess, successRows.length)
    },
    scenarios: {
      memory_carryover: {
        candidates: memoryCandidates.length,
        passed: memoryPassed,
        pass_rate: toRate(memoryPassed, memoryCandidates.length)
      },
      tool_heavy: {
        candidates: toolCandidates.length,
        passed: toolPassed,
        pass_rate: toRate(toolPassed, toolCandidates.length)
      },
      transient_recovery: transientRecovery,
      context_recovery: contextRecovery
    }
  };
}
|
|
239
|
+
|
|
240
|
+
/**
 * Compare scenario metrics against thresholds and describe every violation.
 *
 * A scenario is only judged once it has at least `min_candidates` candidates;
 * a missing pass rate counts as 0. The empty-success rate is only judged once
 * `totals.success` reaches `min_success`. Missing `scenarios` / `totals`
 * sections (or a nullish `metrics`) are treated as "nothing to check" instead
 * of throwing, and a `null` threshold object falls back to the defaults.
 *
 * @param {{totals?: object, scenarios?: object}} metrics - Result of
 *   `evaluateScenarioMetrics`.
 * @param {object} [thresholds=DEFAULT_THRESHOLDS] - Per-scenario limits.
 * @returns {string[]} Human-readable failure messages; empty when all pass.
 */
export function evaluateScenarioThresholds(metrics, thresholds = DEFAULT_THRESHOLDS) {
  const limits = thresholds ?? DEFAULT_THRESHOLDS;
  const scenarios = metrics?.scenarios ?? {};
  const totals = metrics?.totals ?? {};
  const failures = [];

  const scenarioKeys = ['memory_carryover', 'tool_heavy', 'transient_recovery', 'context_recovery'];
  for (const scenarioKey of scenarioKeys) {
    const scenario = scenarios[scenarioKey];
    const config = limits[scenarioKey];
    if (!scenario || !config) continue;
    // Too few samples to draw a conclusion: skip rather than fail.
    if (scenario.candidates < config.min_candidates) continue;
    if ((scenario.pass_rate ?? 0) < config.min_pass_rate) {
      failures.push(
        `${scenarioKey} pass_rate ${scenario.pass_rate} below ${config.min_pass_rate} ` +
        `(candidates=${scenario.candidates})`
      );
    }
  }

  const emptyCfg = limits.empty_success_rate;
  if (emptyCfg && totals.success >= emptyCfg.min_success) {
    if ((totals.empty_success_rate ?? 0) > emptyCfg.max_rate) {
      failures.push(
        `empty_success_rate ${totals.empty_success_rate} above ${emptyCfg.max_rate} ` +
        `(success=${totals.success})`
      );
    }
  }

  return failures;
}
|
|
273
|
+
|
|
274
|
+
/**
 * CLI entry point: load trace records, compute scenario metrics, print a JSON
 * report, and set a non-zero exit code when `--enforce` is given and any
 * threshold is violated.
 *
 * Records come from `--input` when provided; otherwise from the trace
 * directory (`--dir`, or `traces` under `DOTCLAW_HOME` / `~/.dotclaw`),
 * limited to the last `--days` days.
 */
function main() {
  const { days, dir, input, enforce, recoveryWindowMs } = parseArgs(process.argv.slice(2));

  const homeDir = process.env.DOTCLAW_HOME || path.join(os.homedir(), '.dotclaw');
  const traceDir = dir || path.join(homeDir, 'traces');
  const sinceMs = Date.now() - (days * 24 * 60 * 60 * 1000);

  let rows;
  if (input) {
    rows = loadJsonlFile(input);
  } else {
    rows = loadTracesFromDir(traceDir, sinceMs);
  }

  const metrics = evaluateScenarioMetrics(rows, { recoveryWindowMs });
  const failures = evaluateScenarioThresholds(metrics, DEFAULT_THRESHOLDS);

  const report = {
    source: input || traceDir,
    // The day window does not apply when an explicit input file is read.
    window_days: input ? null : days,
    recovery_window_ms: recoveryWindowMs,
    thresholds: DEFAULT_THRESHOLDS,
    ...metrics,
    failures
  };
  console.log(JSON.stringify(report, null, 2));

  if (enforce && failures.length > 0) {
    process.exitCode = 1;
  }
}
|
|
298
|
+
|
|
299
|
+
if (import.meta.url === `file://${process.argv[1]}`) {
|
|
300
|
+
main();
|
|
301
|
+
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import fs from 'node:fs';
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
import { evaluateScenarioMetrics } from './benchmark-scenarios.js';
|
|
6
|
+
|
|
7
|
+
const DEFAULT_INPUT = path.join(process.cwd(), 'test', 'fixtures', 'benchmark', 'scenario-traces.jsonl');
|
|
8
|
+
const DEFAULT_EXPECTED = path.join(process.cwd(), 'test', 'fixtures', 'benchmark', 'canary-expected.json');
|
|
9
|
+
|
|
10
|
+
/**
 * Parse command-line flags for the canary suite.
 *
 * Recognised flags: `--input <file>`, `--expected <file>`,
 * `--recovery-window-ms <ms>` and `--enforce`. Unknown arguments and value
 * flags without a following value are ignored.
 *
 * @param {string[]} argv - Arguments without the node/script prefix.
 * @returns {{input: string, expected: string, enforce: boolean, recoveryWindowMs: number}}
 */
function parseArgs(argv) {
  const args = {
    input: DEFAULT_INPUT,
    expected: DEFAULT_EXPECTED,
    enforce: false,
    recoveryWindowMs: 10 * 60 * 1000
  };

  let index = 0;
  while (index < argv.length) {
    const flag = argv[index];
    const nextIndex = index + 1;
    const hasValue = nextIndex < argv.length;

    if (flag === '--enforce') {
      args.enforce = true;
    } else if (hasValue && flag === '--input') {
      args.input = argv[nextIndex];
      index = nextIndex;
    } else if (hasValue && flag === '--expected') {
      args.expected = argv[nextIndex];
      index = nextIndex;
    } else if (hasValue && flag === '--recovery-window-ms') {
      const windowMs = Number(argv[nextIndex]);
      if (Number.isFinite(windowMs) && windowMs > 0) args.recoveryWindowMs = windowMs;
      index = nextIndex;
    }
    index += 1;
  }

  return args;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Read and parse a JSON file.
 *
 * @param {string} filePath - Path of the JSON file.
 * @returns {*} The parsed value.
 * @throws When the file cannot be read or does not contain valid JSON.
 */
function loadJson(filePath) {
  const text = fs.readFileSync(filePath, 'utf-8');
  return JSON.parse(text);
}
|
|
45
|
+
|
|
46
|
+
/**
 * Read a JSON Lines file and return its object records.
 *
 * Blank and malformed lines are skipped. Lines that parse to a non-object
 * value (`null`, numbers, strings, booleans) are skipped as well: downstream
 * metric code reads properties off every row and would throw on `null`.
 *
 * @param {string} filePath - Path of the JSONL file.
 * @returns {object[]} Parsed records in file order; empty when the file is missing.
 */
function loadJsonl(filePath) {
  const rows = [];
  if (!fs.existsSync(filePath)) return rows;
  const raw = fs.readFileSync(filePath, 'utf-8');
  for (const line of raw.split('\n')) {
    if (!line.trim()) continue;
    try {
      const parsed = JSON.parse(line);
      if (parsed && typeof parsed === 'object') rows.push(parsed);
    } catch {
      // ignore malformed lines
    }
  }
  return rows;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Check scenario metrics against canary expectations.
 *
 * For every scenario listed in `expected.scenarios` the metrics must contain
 * that scenario, reach `min_candidates` and reach `min_pass_rate` (both
 * default to 0; a missing pass rate counts as 0). A `null` scenario entry
 * means "must exist, no limits". Thresholds that are present but not numeric
 * are reported as failures instead of silently disabling the check. When
 * `expected.totals.max_empty_success_rate` is set, the measured
 * empty-success rate must not exceed it.
 *
 * @param {{totals?: object, scenarios?: object}} metrics - Computed metrics.
 * @param {{totals?: object, scenarios?: object}} expected - Expectations.
 * @returns {{checks: object[], failures: string[]}} One check entry per
 *   scenario found in the metrics, plus all failure messages.
 */
export function evaluateCanaryExpectations(metrics, expected) {
  const failures = [];
  const checks = [];

  const scenarioExpectations = expected?.scenarios || {};
  for (const [scenarioKey, config] of Object.entries(scenarioExpectations)) {
    const scenario = metrics?.scenarios?.[scenarioKey];
    if (!scenario) {
      failures.push(`missing scenario metrics: ${scenarioKey}`);
      continue;
    }
    const minCandidates = Number(config?.min_candidates ?? 0);
    const minPassRate = Number(config?.min_pass_rate ?? 0);
    checks.push({
      scenario: scenarioKey,
      candidates: scenario.candidates,
      pass_rate: scenario.pass_rate,
      min_candidates: minCandidates,
      min_pass_rate: minPassRate
    });
    // Comparisons against NaN are always false, so an invalid threshold would
    // otherwise make this scenario pass unconditionally.
    if (Number.isNaN(minCandidates) || Number.isNaN(minPassRate)) {
      failures.push(
        `${scenarioKey} has invalid expectation ` +
        `(min_candidates=${config?.min_candidates}, min_pass_rate=${config?.min_pass_rate})`
      );
      continue;
    }
    if (scenario.candidates < minCandidates) {
      failures.push(`${scenarioKey} candidates ${scenario.candidates} below ${minCandidates}`);
      continue;
    }
    if ((scenario.pass_rate ?? 0) < minPassRate) {
      failures.push(`${scenarioKey} pass_rate ${scenario.pass_rate} below ${minPassRate}`);
    }
  }

  const rawMaxEmpty = expected?.totals?.max_empty_success_rate;
  const maxEmptySuccessRate = Number(rawMaxEmpty ?? Number.POSITIVE_INFINITY);
  const emptyRate = metrics?.totals?.empty_success_rate;
  if (Number.isNaN(maxEmptySuccessRate)) {
    failures.push(`invalid max_empty_success_rate: ${rawMaxEmpty}`);
  } else if (
    Number.isFinite(maxEmptySuccessRate)
    && Number.isFinite(emptyRate)
    && emptyRate > maxEmptySuccessRate
  ) {
    failures.push(`empty_success_rate ${emptyRate} above ${maxEmptySuccessRate}`);
  }

  return { checks, failures };
}
|
|
98
|
+
|
|
99
|
+
/**
 * CLI entry point: evaluate the canary fixture traces against the expected
 * thresholds, print a JSON report, and set a non-zero exit code when
 * `--enforce` is given and at least one expectation failed.
 */
function main() {
  const { input, expected: expectedPath, enforce, recoveryWindowMs } = parseArgs(process.argv.slice(2));

  const rows = loadJsonl(input);
  const expected = loadJson(expectedPath);
  const metrics = evaluateScenarioMetrics(rows, { recoveryWindowMs });
  const { checks, failures } = evaluateCanaryExpectations(metrics, expected);

  const report = {
    input,
    expected: expectedPath,
    recovery_window_ms: recoveryWindowMs,
    metrics,
    checks,
    failures
  };
  console.log(JSON.stringify(report, null, 2));

  if (enforce && failures.length > 0) {
    process.exitCode = 1;
  }
}
|
|
120
|
+
|
|
121
|
+
if (import.meta.url === `file://${process.argv[1]}`) {
|
|
122
|
+
main();
|
|
123
|
+
}
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import fs from 'node:fs';
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
|
|
6
|
+
/**
 * Parse command-line flags for the controlled trace generator.
 *
 * Value flags: `--dir`, `--fixture`, `--repeat`, `--step-ms`, `--start`,
 * `--seed-dir`, `--seed-days`, `--seed-limit`, `--chat-prefix`.
 * Switches: `--reset`, `--skip-error-rows`.
 * Unknown arguments, value flags without a following value, and invalid
 * values are ignored (the default is kept).
 *
 * @param {string[]} argv - Arguments without the node/script prefix.
 * @returns {object} Parsed options (`dir`, `fixture`, `repeat`, `stepMs`,
 *   `startMs`, `reset`, `seedDir`, `seedDays`, `seedLimit`, `chatPrefix`,
 *   `skipErrorRows`).
 */
function parseArgs(argv) {
  const args = {
    dir: '',
    fixture: path.join(process.cwd(), 'test', 'fixtures', 'benchmark', 'scenario-traces.jsonl'),
    repeat: 8,
    stepMs: 30_000,
    startMs: Date.now(),
    reset: false,
    seedDir: '',
    seedDays: 14,
    seedLimit: 400,
    chatPrefix: 'controlled',
    skipErrorRows: false,
  };

  // Floor first, then validate: a fractional value such as 0.4 must not be
  // accepted as "positive" and then collapse to 0 (a zero step, an empty seed
  // window, or a seed limit of 0, which the loader treats as "no limit").
  const toWholeNumber = (raw, minimum) => {
    const whole = Math.floor(Number(raw));
    return Number.isFinite(whole) && whole >= minimum ? whole : null;
  };

  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];

    if (arg === '--reset') {
      args.reset = true;
      continue;
    }
    if (arg === '--skip-error-rows') {
      args.skipErrorRows = true;
      continue;
    }

    // Everything below needs a value in the next position.
    if (i + 1 >= argv.length) continue;
    const raw = argv[i + 1];

    switch (arg) {
      case '--dir':
        args.dir = raw;
        break;
      case '--fixture':
        args.fixture = raw;
        break;
      case '--seed-dir':
        args.seedDir = raw;
        break;
      case '--repeat':
        args.repeat = toWholeNumber(raw, 0) ?? args.repeat;
        break;
      case '--step-ms':
        args.stepMs = toWholeNumber(raw, 1) ?? args.stepMs;
        break;
      case '--seed-days':
        args.seedDays = toWholeNumber(raw, 1) ?? args.seedDays;
        break;
      case '--seed-limit':
        args.seedLimit = toWholeNumber(raw, 1) ?? args.seedLimit;
        break;
      case '--start': {
        const parsedStart = Date.parse(raw);
        if (Number.isFinite(parsedStart)) args.startMs = parsedStart;
        break;
      }
      case '--chat-prefix':
        args.chatPrefix = String(raw || '').trim() || args.chatPrefix;
        break;
      default:
        // Not a known flag: do not consume the following argument.
        continue;
    }
    i += 1;
  }

  return args;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Read a JSON Lines file and return its object records.
 *
 * Blank lines, malformed lines and lines whose value is not an object (or is
 * null) are skipped.
 *
 * @param {string} filePath - Path of the JSONL file.
 * @returns {object[]} Parsed records in file order; empty when the file is missing.
 */
function readJsonl(filePath) {
  if (!fs.existsSync(filePath)) return [];

  const records = [];
  const lines = fs.readFileSync(filePath, 'utf-8').split('\n');
  lines.forEach((text) => {
    if (text.trim() === '') return;
    let value;
    try {
      value = JSON.parse(text);
    } catch {
      // ignore malformed lines
      return;
    }
    if (value && typeof value === 'object') records.push(value);
  });
  return records;
}
|
|
101
|
+
|
|
102
|
+
/**
 * Build the trace file name for the UTC calendar day containing a time.
 *
 * @param {number} ms - Epoch milliseconds.
 * @returns {string} Name of the form `trace-YYYY-MM-DD.jsonl`.
 */
function traceFileName(ms) {
  const when = new Date(ms);
  const twoDigits = (value) => String(value).padStart(2, '0');
  const parts = [
    when.getUTCFullYear(),
    twoDigits(when.getUTCMonth() + 1), // months are zero-based
    twoDigits(when.getUTCDate()),
  ];
  return `trace-${parts.join('-')}.jsonl`;
}
|
|
109
|
+
|
|
110
|
+
/**
 * Load recent records from an existing trace directory to use as seed data.
 *
 * Reads every `trace-YYYY-MM-DD.jsonl` file in sorted name order and keeps
 * records whose `timestamp` parses to a time within the last `seedDays` days.
 * When `seedLimit` is a positive finite number and more records were found,
 * only the last `seedLimit` records are returned.
 *
 * @param {string} seedDir - Directory to read; empty or missing yields `[]`.
 * @param {number} seedDays - Size of the look-back window in days.
 * @param {number} seedLimit - Maximum number of records to return.
 * @returns {object[]} Seed records.
 */
function loadSeedRows(seedDir, seedDays, seedLimit) {
  if (!seedDir || !fs.existsSync(seedDir)) return [];

  const cutoffMs = Date.now() - (seedDays * 24 * 60 * 60 * 1000);
  const isTraceFile = (entry) => /^trace-\d{4}-\d{2}-\d{2}\.jsonl$/.test(entry);
  const traceFiles = fs.readdirSync(seedDir).filter(isTraceFile).sort();

  const recent = [];
  for (const entry of traceFiles) {
    for (const record of readJsonl(path.join(seedDir, entry))) {
      const stamp = Date.parse(String(record.timestamp || ''));
      if (Number.isFinite(stamp) && !(stamp < cutoffMs)) recent.push(record);
    }
  }

  const overLimit = Number.isFinite(seedLimit) && seedLimit > 0 && recent.length > seedLimit;
  return overLimit ? recent.slice(recent.length - seedLimit) : recent;
}
|
|
128
|
+
|
|
129
|
+
/**
 * Replay the fixture records `repeat` times on a synthetic timeline.
 *
 * Every emitted record is a copy of its fixture record with a fresh
 * `timestamp` (ISO string) and `created_at` (epoch ms), advancing by `stepMs`
 * per record starting at `startMs`, and with `chat_id` rewritten to
 * `<chatPrefix>:<round>:<original chat id or chat-<round>>`.
 *
 * @param {object[]} fixtureRows - Template records.
 * @param {{repeat: number, startMs: number, stepMs: number, chatPrefix: string}} params
 * @returns {object[]} Synthesized records in emission order.
 */
function synthesizeScenarioRows(fixtureRows, params) {
  const synthesized = [];
  let clockMs = params.startMs;

  for (let round = 0; round < params.repeat; round += 1) {
    for (const template of fixtureRows) {
      const originalChat = String(template.chat_id || `chat-${round}`);
      const record = {
        ...template,
        timestamp: new Date(clockMs).toISOString(),
        created_at: clockMs,
        chat_id: `${params.chatPrefix}:${round}:${originalChat}`
      };
      synthesized.push(record);
      clockMs += params.stepMs;
    }
  }

  return synthesized;
}
|
|
154
|
+
|
|
155
|
+
/**
 * Append records to per-day trace files inside a directory.
 *
 * The directory is created when missing. Each record goes to the file named
 * after its `timestamp` (the current time is used when the timestamp does not
 * parse); records for the same file are appended in one write, one JSON
 * document per line.
 *
 * @param {string} traceDir - Target directory.
 * @param {Iterable<object>} rows - Records to write.
 */
function writeRows(traceDir, rows) {
  fs.mkdirSync(traceDir, { recursive: true });

  const buckets = new Map();
  for (const record of rows) {
    const stamp = Date.parse(String(record.timestamp || ''));
    const target = traceFileName(Number.isFinite(stamp) ? stamp : Date.now());
    const bucket = buckets.get(target);
    if (bucket) {
      bucket.push(record);
    } else {
      buckets.set(target, [record]);
    }
  }

  buckets.forEach((bucket, target) => {
    const payload = bucket.map((record) => JSON.stringify(record)).join('\n');
    fs.appendFileSync(path.join(traceDir, target), `${payload}\n`, 'utf-8');
  });
}
|
|
171
|
+
|
|
172
|
+
/**
 * Delete every `trace-YYYY-MM-DD.jsonl` file in a directory.
 *
 * Other entries are left untouched.
 *
 * @param {string} traceDir - Directory to clean.
 * @returns {number} Number of trace files removed (0 when the directory is missing).
 */
function clearTraceDir(traceDir) {
  if (!fs.existsSync(traceDir)) return 0;

  const tracePattern = /^trace-\d{4}-\d{2}-\d{2}\.jsonl$/;
  const doomed = fs.readdirSync(traceDir).filter((entry) => tracePattern.test(entry));
  doomed.forEach((entry) => {
    fs.rmSync(path.join(traceDir, entry), { force: true });
  });
  return doomed.length;
}
|
|
182
|
+
|
|
183
|
+
/**
 * CLI entry point: write a controlled set of trace files.
 *
 * Requires `--dir` and an existing fixture file; exits with status 1
 * otherwise. With `--reset` the existing trace files in the target directory
 * are removed first. Optional seed records are then loaded, the fixture is
 * replayed (optionally without its error records), both sets are appended to
 * the target directory, and a JSON summary is printed.
 */
function main() {
  const args = parseArgs(process.argv.slice(2));
  if (!args.dir) {
    console.error('Usage: node scripts/generate-controlled-traces.js --dir <trace-dir> [options]');
    process.exit(1);
  }

  const traceDir = path.resolve(args.dir);
  const fixturePath = path.resolve(args.fixture);
  if (!fs.existsSync(fixturePath)) {
    console.error(`Fixture not found: ${fixturePath}`);
    process.exit(1);
  }

  const clearedFiles = args.reset ? clearTraceDir(traceDir) : 0;

  const seedSource = args.seedDir ? path.resolve(args.seedDir) : '';
  const seededRows = loadSeedRows(seedSource, args.seedDays, args.seedLimit);

  const hasErrorCode = (row) => typeof row.error_code === 'string' && row.error_code.trim().length > 0;
  const allFixtureRows = readJsonl(fixturePath);
  const usableFixtureRows = args.skipErrorRows
    ? allFixtureRows.filter((row) => !hasErrorCode(row))
    : allFixtureRows;
  const scenarioRows = synthesizeScenarioRows(usableFixtureRows, args);

  // Seed records go first so the synthetic scenario records follow them.
  writeRows(traceDir, seededRows);
  writeRows(traceDir, scenarioRows);

  const summary = {
    trace_dir: traceDir,
    fixture: fixturePath,
    cleared_files: clearedFiles,
    seeded_rows: seededRows.length,
    scenario_rows: scenarioRows.length,
    total_rows_written: seededRows.length + scenarioRows.length,
    repeat: args.repeat,
    step_ms: args.stepMs,
    chat_prefix: args.chatPrefix,
    skip_error_rows: args.skipErrorRows
  };
  console.log(JSON.stringify(summary, null, 2));
}
|
|
227
|
+
|
|
228
|
+
if (import.meta.url === `file://${process.argv[1]}`) {
|
|
229
|
+
main();
|
|
230
|
+
}
|