@dotsetlabs/dotclaw 2.4.0 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +9 -10
- package/README.md +8 -4
- package/config-examples/runtime.json +34 -8
- package/config-examples/tool-policy.json +12 -2
- package/container/agent-runner/package-lock.json +2 -2
- package/container/agent-runner/package.json +1 -1
- package/container/agent-runner/src/agent-config.ts +19 -3
- package/container/agent-runner/src/container-protocol.ts +11 -0
- package/container/agent-runner/src/context-overflow-recovery.ts +39 -0
- package/container/agent-runner/src/index.ts +603 -165
- package/container/agent-runner/src/openrouter-input.ts +159 -0
- package/container/agent-runner/src/system-prompt.ts +13 -3
- package/container/agent-runner/src/tool-loop-policy.ts +741 -0
- package/container/agent-runner/src/tools.ts +211 -8
- package/dist/agent-context.d.ts +1 -0
- package/dist/agent-context.d.ts.map +1 -1
- package/dist/agent-context.js +21 -9
- package/dist/agent-context.js.map +1 -1
- package/dist/agent-execution.d.ts +2 -0
- package/dist/agent-execution.d.ts.map +1 -1
- package/dist/agent-execution.js +164 -15
- package/dist/agent-execution.js.map +1 -1
- package/dist/agent-semaphore.d.ts +24 -1
- package/dist/agent-semaphore.d.ts.map +1 -1
- package/dist/agent-semaphore.js +109 -20
- package/dist/agent-semaphore.js.map +1 -1
- package/dist/cli.js +3 -11
- package/dist/cli.js.map +1 -1
- package/dist/config.d.ts +2 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +2 -0
- package/dist/config.js.map +1 -1
- package/dist/container-protocol.d.ts +22 -0
- package/dist/container-protocol.d.ts.map +1 -1
- package/dist/container-protocol.js.map +1 -1
- package/dist/container-runner.d.ts +7 -0
- package/dist/container-runner.d.ts.map +1 -1
- package/dist/container-runner.js +417 -143
- package/dist/container-runner.js.map +1 -1
- package/dist/db.d.ts.map +1 -1
- package/dist/db.js +46 -12
- package/dist/db.js.map +1 -1
- package/dist/error-messages.d.ts.map +1 -1
- package/dist/error-messages.js +18 -4
- package/dist/error-messages.js.map +1 -1
- package/dist/failover-policy.d.ts +41 -0
- package/dist/failover-policy.d.ts.map +1 -0
- package/dist/failover-policy.js +261 -0
- package/dist/failover-policy.js.map +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/ipc-dispatcher.d.ts.map +1 -1
- package/dist/ipc-dispatcher.js +27 -43
- package/dist/ipc-dispatcher.js.map +1 -1
- package/dist/mcp-config.d.ts +22 -0
- package/dist/mcp-config.d.ts.map +1 -0
- package/dist/mcp-config.js +94 -0
- package/dist/mcp-config.js.map +1 -0
- package/dist/memory-backend.d.ts +27 -0
- package/dist/memory-backend.d.ts.map +1 -0
- package/dist/memory-backend.js +112 -0
- package/dist/memory-backend.js.map +1 -0
- package/dist/memory-recall.d.ts.map +1 -1
- package/dist/memory-recall.js +135 -22
- package/dist/memory-recall.js.map +1 -1
- package/dist/memory-store.d.ts +1 -0
- package/dist/memory-store.d.ts.map +1 -1
- package/dist/memory-store.js +55 -7
- package/dist/memory-store.js.map +1 -1
- package/dist/message-pipeline.d.ts +24 -0
- package/dist/message-pipeline.d.ts.map +1 -1
- package/dist/message-pipeline.js +131 -27
- package/dist/message-pipeline.js.map +1 -1
- package/dist/metrics.d.ts +1 -0
- package/dist/metrics.d.ts.map +1 -1
- package/dist/metrics.js +9 -0
- package/dist/metrics.js.map +1 -1
- package/dist/providers/discord/discord-provider.d.ts.map +1 -1
- package/dist/providers/discord/discord-provider.js +72 -4
- package/dist/providers/discord/discord-provider.js.map +1 -1
- package/dist/providers/telegram/telegram-provider.d.ts.map +1 -1
- package/dist/providers/telegram/telegram-provider.js +65 -3
- package/dist/providers/telegram/telegram-provider.js.map +1 -1
- package/dist/recall-policy.d.ts +12 -0
- package/dist/recall-policy.d.ts.map +1 -0
- package/dist/recall-policy.js +89 -0
- package/dist/recall-policy.js.map +1 -0
- package/dist/runtime-config.d.ts +33 -0
- package/dist/runtime-config.d.ts.map +1 -1
- package/dist/runtime-config.js +109 -9
- package/dist/runtime-config.js.map +1 -1
- package/dist/streaming.d.ts.map +1 -1
- package/dist/streaming.js +125 -33
- package/dist/streaming.js.map +1 -1
- package/dist/task-scheduler.d.ts.map +1 -1
- package/dist/task-scheduler.js +4 -2
- package/dist/task-scheduler.js.map +1 -1
- package/dist/tool-policy.d.ts.map +1 -1
- package/dist/tool-policy.js +26 -4
- package/dist/tool-policy.js.map +1 -1
- package/dist/trace-writer.d.ts +12 -0
- package/dist/trace-writer.d.ts.map +1 -1
- package/dist/trace-writer.js.map +1 -1
- package/dist/turn-hygiene.d.ts +14 -0
- package/dist/turn-hygiene.d.ts.map +1 -0
- package/dist/turn-hygiene.js +214 -0
- package/dist/turn-hygiene.js.map +1 -0
- package/dist/webhook.d.ts.map +1 -1
- package/dist/webhook.js +1 -0
- package/dist/webhook.js.map +1 -1
- package/package.json +15 -1
- package/scripts/benchmark-baseline.js +365 -0
- package/scripts/benchmark-harness.js +1413 -0
- package/scripts/benchmark-scenarios.js +301 -0
- package/scripts/canary-suite.js +123 -0
- package/scripts/generate-controlled-traces.js +230 -0
- package/scripts/release-slo-check.js +214 -0
- package/scripts/run-live-canary.js +339 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import fs from 'node:fs';
import path from 'node:path';
import { execFileSync } from 'node:child_process';
import { pathToFileURL } from 'node:url';
|
|
6
|
+
|
|
7
|
+
/**
 * Built-in release SLO thresholds, used when no threshold override file is
 * given. evaluateReleaseSlo treats `min_*` entries as lower bounds (`>=`) and
 * `max_*` entries as upper bounds (`<=`).
 */
const DEFAULT_SLOS = {
  // Lower bound on report.records_total.
  min_records: 20,
  // Lower bound on report.success_rate.
  min_success_rate: 0.95,
  // Upper bound on records_error / records_total.
  max_error_rate: 0.05,
  // Upper bound on empty_success_responses / records_success.
  max_empty_success_rate: 0.01,
  // Lower bound on report.tool_calls.success_rate.
  min_tool_success_rate: 0.95,
  // Upper bound on report.latency_ms.p95.
  max_p95_latency_ms: 120_000,
  // Upper bound per error class, as a fraction of records_total.
  max_error_class_rate: {
    auth: 0.02,
    rate_limit: 0.1,
    timeout: 0.1,
    context_overflow: 0.05,
    unknown: 0.1,
  },
};
|
|
22
|
+
|
|
23
|
+
/**
 * Parse the command-line flags of the SLO check.
 *
 * Recognized flags: `--days <n>`, `--dir <path>`, `--input <path>`,
 * `--thresholds <path>` and the boolean `--enforce`. A value-taking flag that
 * is the last argument is ignored; an invalid `--days` value keeps the default
 * but still consumes its argument.
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {{days: number, dir: string, input: string, thresholds: string, enforce: boolean}}
 */
function parseArgs(argv) {
  const parsed = {
    days: 7,
    dir: '',
    input: '',
    thresholds: '',
    enforce: false,
  };
  // Flags whose value is stored verbatim under the mapped option name.
  const verbatimFlags = new Map([
    ['--dir', 'dir'],
    ['--input', 'input'],
    ['--thresholds', 'thresholds'],
  ]);

  let index = 0;
  while (index < argv.length) {
    const flag = argv[index];
    const hasValue = index + 1 < argv.length;

    if (hasValue && flag === '--days') {
      const days = Number(argv[index + 1]);
      if (Number.isFinite(days) && days > 0) parsed.days = Math.floor(days);
      index += 2;
    } else if (hasValue && verbatimFlags.has(flag)) {
      parsed[verbatimFlags.get(flag)] = argv[index + 1];
      index += 2;
    } else {
      if (flag === '--enforce') parsed.enforce = true;
      index += 1;
    }
  }
  return parsed;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Read a UTF-8 file and parse its contents as JSON.
 *
 * @param {string} filePath - Path of the JSON file.
 * @returns {*} The parsed value.
 * @throws If the file cannot be read or its contents are not valid JSON.
 */
function readJson(filePath) {
  const rawText = fs.readFileSync(filePath, 'utf-8');
  return JSON.parse(rawText);
}
|
|
64
|
+
|
|
65
|
+
/**
 * Map an error message to one of the SLO error classes.
 *
 * Rules are tried in order and the first match wins; anything that matches no
 * rule is `'unknown'`.
 *
 * @param {*} message - Error text; falsy values are treated as an empty string.
 * @returns {'auth'|'rate_limit'|'timeout'|'context_overflow'|'unknown'}
 */
function classifyErrorMessage(message) {
  const lower = String(message || '').toLowerCase();
  const rules = [
    // The bare word "required" is deliberately not an auth signal: it appears
    // in unrelated messages ("field is required", "retry required") and used
    // to shadow every later rule. "payment required" still matches `payment`.
    ['auth', /invalid.?api.?key|unauthorized|forbidden|payment|insufficient.?credit|\b401\b|\b402\b|\b403\b/],
    ['rate_limit', /rate.?limit|too many requests|\b429\b/],
    ['timeout', /timeout|timed out|deadline|econnreset|econnrefused|enotfound|eai_again/],
    ['context_overflow', /context.?length|maximum.?context|too many tokens|token.?limit/],
  ];
  const matched = rules.find(([, pattern]) => pattern.test(lower));
  return matched ? matched[0] : 'unknown';
}
|
|
81
|
+
|
|
82
|
+
/**
 * Recursively overlay `overrides` on a JSON clone of `base`.
 *
 * A key is merged recursively only when the override value is a non-array
 * object and the current value is itself an object; otherwise the override
 * value replaces the current one. `base` is never mutated.
 *
 * @param {object} base - Threshold defaults (must be JSON-serializable).
 * @param {*} overrides - Override values; non-objects leave `base` as is.
 * @returns {object} `base` itself when `overrides` is not an object,
 *   otherwise a new merged object.
 */
function mergeThresholds(base, overrides) {
  if (overrides === null || typeof overrides !== 'object') return base;

  const isObject = (candidate) => Boolean(candidate) && typeof candidate === 'object';
  const result = JSON.parse(JSON.stringify(base));

  Object.entries(overrides).forEach(([name, incoming]) => {
    const current = result[name];
    const mergeable = isObject(incoming) && !Array.isArray(incoming) && isObject(current);
    result[name] = mergeable ? mergeThresholds(current, incoming) : incoming;
  });

  return result;
}
|
|
94
|
+
|
|
95
|
+
/**
 * Fraction of successful records whose response was empty.
 *
 * @param {object} report - Reads `records_success` and `empty_success_responses`.
 * @returns {number|null} `empty / success`, or null when the success count is
 *   not a positive finite number or the empty count is not a finite number >= 0.
 */
function computeEmptySuccessRate(report) {
  const successCount = Number(report.records_success || 0);
  const emptyCount = Number(report.empty_success_responses || 0);

  const successUsable = Number.isFinite(successCount) && successCount > 0;
  const emptyUsable = Number.isFinite(emptyCount) && emptyCount >= 0;

  return successUsable && emptyUsable ? emptyCount / successCount : null;
}
|
|
102
|
+
|
|
103
|
+
/**
 * Evaluate a baseline report against release SLO thresholds.
 *
 * Every threshold produces one entry in `checks`; every violated threshold
 * also produces a human-readable entry in `failures`. Missing report metrics
 * are read as 0. Per-class error rates are derived from `report.top_errors`
 * (each entry's `key` is classified, its `count` accumulated) and divided by
 * `records_total`.
 *
 * @param {object} report - Baseline report (`records_total`, `success_rate`,
 *   `records_error`, `records_success`, `empty_success_responses`,
 *   `tool_calls.success_rate`, `latency_ms.p95`, `top_errors`).
 * @param {object} [thresholds=DEFAULT_SLOS] - Threshold set to enforce.
 * @returns {{checks: Array<{name: string, actual: number, threshold: number, comparator: string}>, failures: string[], passed: boolean}}
 */
export function evaluateReleaseSlo(report, thresholds = DEFAULT_SLOS) {
  const failures = [];
  const checks = [];

  // Records one check and, when the bound is violated, the matching failure.
  // `rounded` metrics are reported with 4 decimals but compared unrounded.
  const addCheck = (name, value, threshold, comparator, rounded = false) => {
    checks.push({
      name,
      actual: rounded ? Number(value.toFixed(4)) : value,
      threshold,
      comparator
    });
    const violated = comparator === '>=' ? value < threshold : value > threshold;
    if (violated) {
      const direction = comparator === '>=' ? 'below' : 'above';
      failures.push(`${name} ${rounded ? value.toFixed(4) : value} ${direction} ${threshold}`);
    }
  };

  const recordsTotal = Number(report.records_total || 0);
  const successRate = Number(report.success_rate || 0);
  const errorRate = recordsTotal > 0 ? Number(report.records_error || 0) / recordsTotal : 0;
  const emptySuccessRate = computeEmptySuccessRate(report) ?? 0;
  const toolSuccessRate = Number(report?.tool_calls?.success_rate ?? 0);
  const p95Latency = Number(report?.latency_ms?.p95 ?? 0);

  addCheck('records_total', recordsTotal, thresholds.min_records, '>=');
  addCheck('success_rate', successRate, thresholds.min_success_rate, '>=');
  addCheck('error_rate', errorRate, thresholds.max_error_rate, '<=', true);
  addCheck('empty_success_rate', emptySuccessRate, thresholds.max_empty_success_rate, '<=', true);
  addCheck('tool_success_rate', toolSuccessRate, thresholds.min_tool_success_rate, '>=');
  addCheck('latency_p95_ms', p95Latency, thresholds.max_p95_latency_ms, '<=');

  const classCounts = {
    auth: 0,
    rate_limit: 0,
    timeout: 0,
    context_overflow: 0,
    unknown: 0
  };
  const topErrors = Array.isArray(report.top_errors) ? report.top_errors : [];
  for (const entry of topErrors) {
    const errorClass = classifyErrorMessage(entry.key);
    classCounts[errorClass] += Number(entry.count || 0);
  }

  for (const [errorClass, maxRate] of Object.entries(thresholds.max_error_class_rate || {})) {
    // A threshold may name a class the classifier never produces (for example
    // through an override file). Count it as 0 instead of letting
    // `undefined / n` turn the check into NaN, which would silently pass.
    const classCount = Object.prototype.hasOwnProperty.call(classCounts, errorClass)
      ? classCounts[errorClass]
      : 0;
    const rate = recordsTotal > 0 ? classCount / recordsTotal : 0;
    addCheck(`error_class_${errorClass}_rate`, rate, maxRate, '<=', true);
  }

  return {
    checks,
    failures,
    passed: failures.length === 0
  };
}
|
|
176
|
+
|
|
177
|
+
/**
 * Obtain the baseline report to evaluate.
 *
 * When `args.input` is set the report is read from that JSON file. Otherwise
 * `scripts/benchmark-baseline.js` (resolved against the current working
 * directory) is run with the current Node executable, and its stdout is parsed
 * as JSON.
 *
 * @param {{input: string, days: number, dir: string}} args - Parsed CLI options.
 * @returns {*} The parsed report.
 */
function buildBaselineReport(args) {
  if (args.input) return readJson(args.input);

  const scriptPath = path.join(process.cwd(), 'scripts', 'benchmark-baseline.js');
  // `--dir` is forwarded only when a directory was given.
  const dirArgs = args.dir ? ['--dir', args.dir] : [];
  const childArgs = [scriptPath, '--days', String(args.days), ...dirArgs];

  const rawReport = execFileSync(process.execPath, childArgs, { encoding: 'utf-8' });
  return JSON.parse(rawReport);
}
|
|
189
|
+
|
|
190
|
+
/**
 * CLI entry point: build or load the baseline report, evaluate it against the
 * (optionally overridden) thresholds and print the result as JSON on stdout.
 * With `--enforce`, a failed evaluation sets the process exit code to 1.
 */
function main() {
  const args = parseArgs(process.argv.slice(2));
  const report = buildBaselineReport(args);

  let thresholds = DEFAULT_SLOS;
  if (args.thresholds) {
    thresholds = mergeThresholds(DEFAULT_SLOS, readJson(args.thresholds));
  }

  const { checks, failures, passed } = evaluateReleaseSlo(report, thresholds);
  const summary = {
    source: args.input || 'benchmark-baseline',
    thresholds,
    report,
    checks,
    failures,
    passed,
  };

  console.log(JSON.stringify(summary, null, 2));
  if (args.enforce && !passed) process.exitCode = 1;
}
|
|
211
|
+
|
|
212
|
+
// Run the CLI only when this module is the process entry point, not when it is
// imported for evaluateReleaseSlo. The script path is converted with
// pathToFileURL so that paths needing percent-encoding (spaces, '#', non-ASCII)
// and Windows drive paths compare equal to import.meta.url; a hand-built
// `file://${path}` string does not match in those cases.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  main();
}
|
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
|
|
5
|
+
import { DATA_DIR } from '../dist/config.js';
|
|
6
|
+
import { initDatabase } from '../dist/db.js';
|
|
7
|
+
import { loadJson } from '../dist/utils.js';
|
|
8
|
+
import {
|
|
9
|
+
AgentExecutionError,
|
|
10
|
+
createTraceBase,
|
|
11
|
+
executeAgentRun,
|
|
12
|
+
recordAgentTelemetry,
|
|
13
|
+
} from '../dist/agent-execution.js';
|
|
14
|
+
import { routeRequest } from '../dist/request-router.js';
|
|
15
|
+
import { writeTrace } from '../dist/trace-writer.js';
|
|
16
|
+
import { percentile } from './benchmark-baseline.js';
|
|
17
|
+
|
|
18
|
+
/**
 * Parse the command-line flags of the live canary runner.
 *
 * Every recognized flag takes one value; a flag given as the last argument is
 * ignored. Invalid values keep the default but still consume their argument.
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {object} Options: rounds, toolAllow, chatJid, groupFolder, userId,
 *   userName, source, reasoningEffort, maxToolSteps, timeoutMs, promptPrefix.
 */
function parseArgs(argv) {
  const args = {
    rounds: 8,
    toolAllow: ['Read', 'Write', 'Edit', 'Glob', 'Grep', 'Bash'],
    chatJid: '',
    groupFolder: 'main',
    userId: 'canary-live-user',
    userName: 'Canary',
    source: 'live-canary',
    reasoningEffort: 'low',
    maxToolSteps: 40,
    timeoutMs: 180_000,
    promptPrefix: '[CANARY:LIVE]',
  };

  // Positive finite numbers are floored; anything else keeps the fallback.
  const positiveInt = (raw, fallback) => {
    const parsed = Number(raw);
    return Number.isFinite(parsed) && parsed > 0 ? Math.floor(parsed) : fallback;
  };
  // Trimmed text, or the fallback when nothing is left after trimming.
  const nonEmptyText = (raw, fallback) => String(raw || '').trim() || fallback;
  const effortLevels = ['off', 'low', 'medium', 'high'];

  const handlers = new Map([
    ['--rounds', (raw) => { args.rounds = positiveInt(raw, args.rounds); }],
    ['--chat-jid', (raw) => { args.chatJid = raw; }],
    ['--group-folder', (raw) => { args.groupFolder = raw; }],
    ['--max-tool-steps', (raw) => { args.maxToolSteps = positiveInt(raw, args.maxToolSteps); }],
    ['--timeout-ms', (raw) => { args.timeoutMs = positiveInt(raw, args.timeoutMs); }],
    ['--reasoning-effort', (raw) => {
      const level = String(raw).trim().toLowerCase();
      if (effortLevels.includes(level)) args.reasoningEffort = level;
    }],
    ['--prompt-prefix', (raw) => { args.promptPrefix = nonEmptyText(raw, args.promptPrefix); }],
    ['--source', (raw) => { args.source = nonEmptyText(raw, args.source); }],
    ['--tool-allow', (raw) => {
      const tools = String(raw || '')
        .split(',')
        .map((item) => item.trim())
        .filter(Boolean);
      if (tools.length > 0) args.toolAllow = tools;
    }],
  ]);

  for (let i = 0; i < argv.length; i += 1) {
    const applyValue = handlers.get(argv[i]);
    if (!applyValue || i + 1 >= argv.length) continue;
    applyValue(argv[i + 1]);
    i += 1; // skip the value that was just consumed
  }

  return args;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Load `registered_groups.json` from DATA_DIR through loadJson, passing an
 * empty object as the second (fallback) argument.
 *
 * @returns {*} Whatever loadJson returns for the registry file.
 */
function loadRegisteredGroups() {
  const registryPath = path.join(DATA_DIR, 'registered_groups.json');
  const emptyRegistry = {};
  return loadJson(registryPath, emptyRegistry);
}
|
|
98
|
+
|
|
99
|
+
/**
 * Pick the chat and group the canary will run against.
 *
 * Resolution order: the explicitly requested `args.chatJid` (which must be
 * registered), then the first registered group whose `folder` equals
 * `args.groupFolder`, then the first registered group.
 *
 * @param {{chatJid: string, groupFolder: string}} args - Parsed CLI options.
 * @returns {{chatJid: string, group: object}}
 * @throws {Error} When no groups are registered or `args.chatJid` is unknown.
 */
function resolveChatAndGroup(args) {
  const groups = loadRegisteredGroups();
  const entries = Object.entries(groups);
  if (entries.length === 0) {
    throw new Error(`No registered groups found in ${path.join(DATA_DIR, 'registered_groups.json')}`);
  }

  if (args.chatJid) {
    const group = groups[args.chatJid];
    if (group && typeof group === 'object') {
      return { chatJid: args.chatJid, group };
    }
    throw new Error(`Chat not registered: ${args.chatJid}`);
  }

  // Prefer a folder match; otherwise fall back to the first registered group.
  const folderMatch = entries.find(([, candidate]) => candidate?.folder === args.groupFolder);
  const [chatJid, group] = folderMatch ?? entries[0];
  return { chatJid, group };
}
|
|
118
|
+
|
|
119
|
+
/**
 * Build the three prompts sent in one canary round: create-and-read a file,
 * recall that file from the session, and inspect the newest inbox files.
 *
 * @param {string} prefix - Marker placed at the start of every prompt.
 * @param {number} round - Round number; zero-padded to 2 digits in the filename.
 * @param {string} stamp - Run stamp embedded in the canary filename.
 * @returns {string[]} Prompts in execution order.
 */
function buildRoundPrompts(prefix, round, stamp) {
  const roundTag = String(round).padStart(2, '0');
  const canaryFile = `inbox/live-canary-${stamp}-r${roundTag}.txt`;
  // Shared "<prefix> [SCENARIO:<name>] Round <n>: " lead-in.
  const lead = (scenario) => `${prefix} [SCENARIO:${scenario}] Round ${round}: `;

  const createAndRead = `${lead('tool_heavy')}Create file "${canaryFile}" with 3 lines: alpha-${round}, beta-${round}, gamma-${round}. Then read it back and return a 1-sentence summary with exact filename.`;
  const recall = `${lead('memory')}From this same conversation session, what exact filename did you just create and what was line 2? Answer in one concise sentence.`;
  const inspectInbox = `${lead('tool_heavy')}List the 5 newest files under inbox/, read the newest one, and return exactly 2 bullet points with key details.`;

  return [createAndRead, recall, inspectInbox];
}
|
|
127
|
+
|
|
128
|
+
/**
 * Aggregate canary executions into summary metrics.
 *
 * Success rows are those whose output status is 'success'; error rows are
 * those whose output status is 'error' or that carry an `error` value.
 * Latency percentiles and tool-call counts use success rows only. Scenario
 * membership is detected from tags in each execution's prompt.
 *
 * @param {Array<{prompt: string, output: ?object, error: *}>} executions
 * @returns {object} Totals, rates (null when the denominator is 0), latency
 *   percentiles, tool-call stats, the top 8 errors and per-scenario pass rates.
 */
function summarizeResults(executions) {
  // Small accessors shared by the aggregations below.
  const trimmedResult = (item) => (typeof item.output?.result === 'string' ? item.output.result.trim() : '');
  const toolCallsOf = (item) => (Array.isArray(item.output?.tool_calls) ? item.output.tool_calls : []);
  const countFailed = (calls) => calls.filter((call) => !call?.ok).length;
  const ratio = (part, whole) => (whole > 0 ? Number((part / whole).toFixed(4)) : null);
  const isSuccess = (item) => item.output?.status === 'success';

  const total = executions.length;
  const succeeded = executions.filter(isSuccess);
  const errored = executions.filter((item) => item.output?.status === 'error' || item.error);

  const emptySuccessCount = succeeded.filter((item) => !trimmedResult(item)).length;
  const latencies = succeeded
    .map((item) => Number(item.output?.latency_ms))
    .filter((value) => Number.isFinite(value) && value >= 0);

  const toolCalls = succeeded.flatMap((item) => toolCallsOf(item));
  const failedToolCalls = countFailed(toolCalls);

  // Tally identical error messages, then keep the 8 most frequent.
  const errorTally = new Map();
  errored.forEach((item) => {
    const label = String(item.error || item.output?.error || 'unknown error').trim() || 'unknown error';
    errorTally.set(label, (errorTally.get(label) || 0) + 1);
  });
  const topErrors = [...errorTally.entries()]
    .sort((left, right) => right[1] - left[1])
    .slice(0, 8)
    .map(([error, count]) => ({ error, count }));

  // Memory scenario passes on any non-empty successful answer.
  const memoryCandidates = executions.filter((item) => /\[(?:scenario:)?memory(?:_carryover)?\]/i.test(item.prompt));
  const memoryPassed = memoryCandidates.filter((item) => isSuccess(item) && trimmedResult(item).length > 0).length;

  // Tool-heavy scenario passes with at least 2 tool calls, at most 20% failed.
  const toolHeavyCandidates = executions.filter((item) => /\[(?:scenario:)?tool_heavy\]/i.test(item.prompt));
  const toolHeavyPassed = toolHeavyCandidates.filter((item) => {
    if (!isSuccess(item)) return false;
    const calls = toolCallsOf(item);
    return calls.length >= 2 && countFailed(calls) <= Math.floor(calls.length * 0.2);
  }).length;

  return {
    rows_total: total,
    rows_success: succeeded.length,
    rows_error: errored.length,
    success_rate: ratio(succeeded.length, total),
    empty_success_rate: ratio(emptySuccessCount, succeeded.length),
    latency_ms: {
      p50: percentile(latencies, 50),
      p90: percentile(latencies, 90),
      p95: percentile(latencies, 95),
      p99: percentile(latencies, 99),
    },
    tool_calls: {
      total: toolCalls.length,
      failed: failedToolCalls,
      success_rate: ratio(toolCalls.length - failedToolCalls, toolCalls.length),
    },
    top_errors: topErrors,
    scenarios: {
      memory_carryover: {
        candidates: memoryCandidates.length,
        passed: memoryPassed,
        pass_rate: ratio(memoryPassed, memoryCandidates.length),
      },
      tool_heavy: {
        candidates: toolHeavyCandidates.length,
        passed: toolHeavyPassed,
        pass_rate: ratio(toolHeavyPassed, toolHeavyCandidates.length),
      },
    },
  };
}
|
|
199
|
+
|
|
200
|
+
/**
 * Execute a single canary prompt and record its telemetry.
 *
 * Failures never reject: a thrown error is converted into `errorMessage`, and
 * an output with status 'error' reports `output.error` (or 'Unknown error').
 * After the run, telemetry is recorded through recordAgentTelemetry when an
 * execution context is available, otherwise a bare trace row is written.
 *
 * @param {object} params - Chat/group identity, prompt, routing and run limits.
 * @returns {Promise<{output: ?object, context: ?object, errorMessage: ?string, nextSessionId: *}>}
 *   `nextSessionId` is the output's `newSessionId` when present, else the
 *   session id that was passed in.
 */
async function runOne(params) {
  const traceBase = createTraceBase({
    chatId: params.chatJid,
    groupFolder: params.group.folder,
    userId: params.userId,
    inputText: params.prompt,
    source: params.source,
  });

  let output = null;
  let context = null;
  let errorMessage = null;
  let nextSessionId = params.sessionId;

  try {
    const request = {
      group: params.group,
      prompt: params.prompt,
      chatJid: params.chatJid,
      userId: params.userId,
      userName: params.userName,
      recallQuery: params.prompt,
      recallMaxResults: params.routing.recallMaxResults,
      recallMaxTokens: params.routing.recallMaxTokens,
      sessionId: params.sessionId,
      persistSession: true,
      useGroupLock: true,
      useSemaphore: true,
      modelFallbacks: params.routing.fallbacks,
      reasoningEffort: params.reasoningEffort,
      modelMaxOutputTokens: params.routing.maxOutputTokens || undefined,
      maxToolSteps: params.maxToolSteps,
      lane: 'maintenance',
      toolAllow: params.toolAllow,
      timeoutMs: params.timeoutMs,
    };
    ({ output, context } = await executeAgentRun(request));
    if (output.status === 'error') {
      errorMessage = output.error || 'Unknown error';
    }
    nextSessionId = output?.newSessionId || params.sessionId;
  } catch (err) {
    if (err instanceof AgentExecutionError) {
      // This error type carries the execution context; keep it for telemetry.
      context = err.context;
      errorMessage = err.message;
    } else {
      errorMessage = err instanceof Error ? err.message : String(err);
    }
    nextSessionId = params.sessionId;
  } finally {
    if (context) {
      recordAgentTelemetry({
        traceBase,
        output,
        context,
        metricsSource: 'live_canary',
        toolAuditSource: 'heartbeat',
        errorMessage: errorMessage || undefined,
      });
    } else {
      // No context available: write a minimal trace row instead.
      writeTrace({
        ...traceBase,
        output_text: output?.result ?? null,
        model_id: output?.model || 'unknown',
        memory_recall: [],
        error_code: errorMessage || undefined,
        source: params.source,
      });
    }
  }

  return { output, context, errorMessage, nextSessionId };
}
|
|
281
|
+
|
|
282
|
+
/**
 * Run the live canary: execute every prompt of every round one after another
 * and print a JSON summary on stdout.
 *
 * Prompts run strictly in sequence because each run receives the session id
 * returned by the previous one.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  const startedAt = new Date().toISOString();
  initDatabase();
  const { chatJid, group } = resolveChatAndGroup(args);
  const routing = routeRequest();
  const stamp = new Date().toISOString().replace(/[:.]/g, '-');

  // Everything that stays constant across runs.
  const sharedParams = {
    chatJid,
    group,
    userId: args.userId,
    userName: args.userName,
    source: args.source,
    routing,
    reasoningEffort: args.reasoningEffort,
    maxToolSteps: args.maxToolSteps,
    timeoutMs: args.timeoutMs,
    toolAllow: args.toolAllow,
  };

  const executions = [];
  let sessionId;

  for (let round = 1; round <= args.rounds; round += 1) {
    for (const prompt of buildRoundPrompts(args.promptPrefix, round, stamp)) {
      const { output, errorMessage, nextSessionId } = await runOne({ ...sharedParams, prompt, sessionId });
      sessionId = nextSessionId;
      executions.push({ round, prompt, output, error: errorMessage });
    }
  }

  const report = {
    generated_at: new Date().toISOString(),
    started_at: startedAt,
    chat_jid: chatJid,
    group_folder: group.folder,
    source: args.source,
    rounds: args.rounds,
    prompts_executed: executions.length,
    model: routing.model,
    fallbacks: routing.fallbacks,
    metrics: summarizeResults(executions),
  };
  console.log(JSON.stringify(report, null, 2));
}
|
|
335
|
+
|
|
336
|
+
// Start the canary; on an unhandled failure print its message to stderr and
// mark the process as failed through the exit code.
main().catch((failure) => {
  const message = failure instanceof Error ? failure.message : String(failure);
  console.error(message);
  process.exitCode = 1;
});
|