@dotsetlabs/dotclaw 2.4.0 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/.env.example +9 -10
  2. package/README.md +8 -4
  3. package/config-examples/runtime.json +34 -8
  4. package/config-examples/tool-policy.json +12 -2
  5. package/container/agent-runner/package-lock.json +2 -2
  6. package/container/agent-runner/package.json +1 -1
  7. package/container/agent-runner/src/agent-config.ts +19 -3
  8. package/container/agent-runner/src/container-protocol.ts +11 -0
  9. package/container/agent-runner/src/context-overflow-recovery.ts +39 -0
  10. package/container/agent-runner/src/index.ts +603 -165
  11. package/container/agent-runner/src/openrouter-input.ts +159 -0
  12. package/container/agent-runner/src/system-prompt.ts +13 -3
  13. package/container/agent-runner/src/tool-loop-policy.ts +741 -0
  14. package/container/agent-runner/src/tools.ts +211 -8
  15. package/dist/agent-context.d.ts +1 -0
  16. package/dist/agent-context.d.ts.map +1 -1
  17. package/dist/agent-context.js +21 -9
  18. package/dist/agent-context.js.map +1 -1
  19. package/dist/agent-execution.d.ts +2 -0
  20. package/dist/agent-execution.d.ts.map +1 -1
  21. package/dist/agent-execution.js +164 -15
  22. package/dist/agent-execution.js.map +1 -1
  23. package/dist/agent-semaphore.d.ts +24 -1
  24. package/dist/agent-semaphore.d.ts.map +1 -1
  25. package/dist/agent-semaphore.js +109 -20
  26. package/dist/agent-semaphore.js.map +1 -1
  27. package/dist/cli.js +3 -11
  28. package/dist/cli.js.map +1 -1
  29. package/dist/config.d.ts +2 -0
  30. package/dist/config.d.ts.map +1 -1
  31. package/dist/config.js +2 -0
  32. package/dist/config.js.map +1 -1
  33. package/dist/container-protocol.d.ts +22 -0
  34. package/dist/container-protocol.d.ts.map +1 -1
  35. package/dist/container-protocol.js.map +1 -1
  36. package/dist/container-runner.d.ts +7 -0
  37. package/dist/container-runner.d.ts.map +1 -1
  38. package/dist/container-runner.js +417 -143
  39. package/dist/container-runner.js.map +1 -1
  40. package/dist/db.d.ts.map +1 -1
  41. package/dist/db.js +46 -12
  42. package/dist/db.js.map +1 -1
  43. package/dist/error-messages.d.ts.map +1 -1
  44. package/dist/error-messages.js +18 -4
  45. package/dist/error-messages.js.map +1 -1
  46. package/dist/failover-policy.d.ts +41 -0
  47. package/dist/failover-policy.d.ts.map +1 -0
  48. package/dist/failover-policy.js +261 -0
  49. package/dist/failover-policy.js.map +1 -0
  50. package/dist/index.js +1 -0
  51. package/dist/index.js.map +1 -1
  52. package/dist/ipc-dispatcher.d.ts.map +1 -1
  53. package/dist/ipc-dispatcher.js +27 -43
  54. package/dist/ipc-dispatcher.js.map +1 -1
  55. package/dist/mcp-config.d.ts +22 -0
  56. package/dist/mcp-config.d.ts.map +1 -0
  57. package/dist/mcp-config.js +94 -0
  58. package/dist/mcp-config.js.map +1 -0
  59. package/dist/memory-backend.d.ts +27 -0
  60. package/dist/memory-backend.d.ts.map +1 -0
  61. package/dist/memory-backend.js +112 -0
  62. package/dist/memory-backend.js.map +1 -0
  63. package/dist/memory-recall.d.ts.map +1 -1
  64. package/dist/memory-recall.js +135 -22
  65. package/dist/memory-recall.js.map +1 -1
  66. package/dist/memory-store.d.ts +1 -0
  67. package/dist/memory-store.d.ts.map +1 -1
  68. package/dist/memory-store.js +55 -7
  69. package/dist/memory-store.js.map +1 -1
  70. package/dist/message-pipeline.d.ts +24 -0
  71. package/dist/message-pipeline.d.ts.map +1 -1
  72. package/dist/message-pipeline.js +131 -27
  73. package/dist/message-pipeline.js.map +1 -1
  74. package/dist/metrics.d.ts +1 -0
  75. package/dist/metrics.d.ts.map +1 -1
  76. package/dist/metrics.js +9 -0
  77. package/dist/metrics.js.map +1 -1
  78. package/dist/providers/discord/discord-provider.d.ts.map +1 -1
  79. package/dist/providers/discord/discord-provider.js +72 -4
  80. package/dist/providers/discord/discord-provider.js.map +1 -1
  81. package/dist/providers/telegram/telegram-provider.d.ts.map +1 -1
  82. package/dist/providers/telegram/telegram-provider.js +65 -3
  83. package/dist/providers/telegram/telegram-provider.js.map +1 -1
  84. package/dist/recall-policy.d.ts +12 -0
  85. package/dist/recall-policy.d.ts.map +1 -0
  86. package/dist/recall-policy.js +89 -0
  87. package/dist/recall-policy.js.map +1 -0
  88. package/dist/runtime-config.d.ts +33 -0
  89. package/dist/runtime-config.d.ts.map +1 -1
  90. package/dist/runtime-config.js +109 -9
  91. package/dist/runtime-config.js.map +1 -1
  92. package/dist/streaming.d.ts.map +1 -1
  93. package/dist/streaming.js +125 -33
  94. package/dist/streaming.js.map +1 -1
  95. package/dist/task-scheduler.d.ts.map +1 -1
  96. package/dist/task-scheduler.js +4 -2
  97. package/dist/task-scheduler.js.map +1 -1
  98. package/dist/tool-policy.d.ts.map +1 -1
  99. package/dist/tool-policy.js +26 -4
  100. package/dist/tool-policy.js.map +1 -1
  101. package/dist/trace-writer.d.ts +12 -0
  102. package/dist/trace-writer.d.ts.map +1 -1
  103. package/dist/trace-writer.js.map +1 -1
  104. package/dist/turn-hygiene.d.ts +14 -0
  105. package/dist/turn-hygiene.d.ts.map +1 -0
  106. package/dist/turn-hygiene.js +214 -0
  107. package/dist/turn-hygiene.js.map +1 -0
  108. package/dist/webhook.d.ts.map +1 -1
  109. package/dist/webhook.js +1 -0
  110. package/dist/webhook.js.map +1 -1
  111. package/package.json +15 -1
  112. package/scripts/benchmark-baseline.js +365 -0
  113. package/scripts/benchmark-harness.js +1413 -0
  114. package/scripts/benchmark-scenarios.js +301 -0
  115. package/scripts/canary-suite.js +123 -0
  116. package/scripts/generate-controlled-traces.js +230 -0
  117. package/scripts/release-slo-check.js +214 -0
  118. package/scripts/run-live-canary.js +339 -0
@@ -0,0 +1,214 @@
1
+ #!/usr/bin/env node
2
+
3
import fs from 'node:fs';
import path from 'node:path';
import { execFileSync } from 'node:child_process';
import { fileURLToPath, pathToFileURL } from 'node:url';
6
+
7
/**
 * Built-in release gates used when no thresholds file is supplied.
 *
 * `min_*` entries are lower bounds (checked with `>=`) and `max_*` entries are
 * upper bounds (checked with `<=`) by `evaluateReleaseSlo`.
 * `max_error_class_rate` maps an error class name to the largest tolerated
 * share of all records; its key order determines the order of the emitted
 * per-class checks.
 */
const DEFAULT_SLOS = {
  min_records: 20,
  min_success_rate: 0.95,
  max_error_rate: 0.05,
  max_empty_success_rate: 0.01,
  min_tool_success_rate: 0.95,
  max_p95_latency_ms: 120_000,
  max_error_class_rate: {
    auth: 0.02,
    rate_limit: 0.1,
    timeout: 0.1,
    context_overflow: 0.05,
    unknown: 0.1,
  },
};
22
+
23
/**
 * Parses the command-line flags of the release SLO checker.
 *
 * Recognized flags: `--days <n>`, `--dir <path>`, `--input <path>`,
 * `--thresholds <path>` and the boolean `--enforce`. A value-taking flag that
 * appears as the very last token (no value after it) is ignored. Unknown
 * tokens are skipped.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{days: number, dir: string, input: string, thresholds: string, enforce: boolean}}
 */
function parseArgs(argv) {
  const args = {
    days: 7,
    dir: '',
    input: '',
    thresholds: '',
    enforce: false,
  };
  // Flags whose value is copied verbatim into the option of the same name.
  const stringFlags = new Map([
    ['--dir', 'dir'],
    ['--input', 'input'],
    ['--thresholds', 'thresholds'],
  ]);

  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];
    const hasValue = i + 1 < argv.length;

    if (arg === '--days' && hasValue) {
      // Floor BEFORE validating: a fractional value below 1 (e.g. 0.5) used to
      // pass the "> 0" test and then collapse to a zero-day window.
      const days = Math.floor(Number(argv[i + 1]));
      if (Number.isFinite(days) && days >= 1) args.days = days;
      i += 1; // the value token is consumed even when it is rejected
      continue;
    }
    if (hasValue && stringFlags.has(arg)) {
      args[stringFlags.get(arg)] = argv[i + 1];
      i += 1;
      continue;
    }
    if (arg === '--enforce') {
      args.enforce = true;
    }
  }
  return args;
}
60
+
61
/**
 * Loads a UTF-8 encoded file from disk and parses it as JSON.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {*} The parsed JSON value.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function readJson(filePath) {
  const rawText = fs.readFileSync(filePath, 'utf-8');
  return JSON.parse(rawText);
}
64
+
65
/**
 * Buckets a free-form error message into one of the SLO error classes.
 *
 * Patterns are tested against the lower-cased message in a fixed order
 * (auth, rate_limit, timeout, context_overflow); the first match wins and
 * anything else is `unknown`.
 *
 * @param {*} message - Error text; falsy values are treated as an empty string.
 * @returns {'auth'|'rate_limit'|'timeout'|'context_overflow'|'unknown'}
 */
function classifyErrorMessage(message) {
  const lower = String(message || '').toLowerCase();
  // "payment.?required" (HTTP 402) must be ONE alternative. It was previously
  // split into "payment|required", which classified every message containing
  // the word "required" (e.g. "missing required parameter") as an auth error.
  if (/invalid.?api.?key|unauthorized|forbidden|payment.?required|insufficient.?credit|\b401\b|\b402\b|\b403\b/.test(lower)) {
    return 'auth';
  }
  if (/rate.?limit|too many requests|\b429\b/.test(lower)) {
    return 'rate_limit';
  }
  // Connection-level failures are grouped with timeouts.
  if (/timeout|timed out|deadline|econnreset|econnrefused|enotfound|eai_again/.test(lower)) {
    return 'timeout';
  }
  if (/context.?length|maximum.?context|too many tokens|token.?limit/.test(lower)) {
    return 'context_overflow';
  }
  return 'unknown';
}
81
+
82
/**
 * Deep-merges threshold overrides onto a base threshold object.
 *
 * The base is never mutated: a JSON deep copy is made and override values are
 * applied on top. When both sides hold a plain object for a key the merge
 * recurses; otherwise the override value replaces the base value.
 *
 * @param {object} base - Baseline thresholds (must be JSON-serializable).
 * @param {*} overrides - Parsed override document. Anything that is not a
 *   plain object (null, primitives, arrays) is ignored and `base` itself is
 *   returned.
 * @returns {object} `base` when there is nothing to merge, else a new object.
 */
function mergeThresholds(base, overrides) {
  const isPlainObject = (candidate) =>
    Boolean(candidate) && typeof candidate === 'object' && !Array.isArray(candidate);

  if (!isPlainObject(overrides)) return base;

  const merged = JSON.parse(JSON.stringify(base));
  for (const [key, value] of Object.entries(overrides)) {
    // JSON.parse creates "__proto__" as an own key. Assigning it would replace
    // the prototype of `merged` and smuggle in inherited thresholds, so drop it.
    if (key === '__proto__') continue;

    const current = merged[key];
    merged[key] = isPlainObject(value) && isPlainObject(current)
      ? mergeThresholds(current, value)
      : value;
  }
  return merged;
}
94
+
95
/**
 * Derives the share of successful records whose response was empty.
 *
 * @param {object} report - Report exposing `records_success` and
 *   `empty_success_responses`; missing or falsy values count as 0.
 * @returns {number|null} `empty / success`, or null when the success count is
 *   not a positive finite number or the empty count is negative or non-finite.
 */
function computeEmptySuccessRate(report) {
  const successCount = Number(report.records_success || 0);
  const emptyCount = Number(report.empty_success_responses || 0);

  const successUsable = Number.isFinite(successCount) && successCount > 0;
  const emptyUsable = Number.isFinite(emptyCount) && emptyCount >= 0;

  return successUsable && emptyUsable ? emptyCount / successCount : null;
}
102
+
103
/**
 * Evaluates a benchmark report against release SLO thresholds.
 *
 * Six fixed gates are emitted in order (records_total, success_rate,
 * error_rate, empty_success_rate, tool_success_rate, latency_p95_ms), followed
 * by one `error_class_<name>_rate` gate per key of
 * `thresholds.max_error_class_rate`. Missing report metrics default to 0.
 *
 * Error-class counts are built from `report.top_errors`. Each entry's message
 * is read from `key`, falling back to `error` (the shape the live-canary
 * summary in this package emits). Malformed entries and non-numeric counts are
 * skipped, and a threshold class with no matching errors has rate 0 — it
 * previously evaluated to NaN, which could never fail the gate.
 *
 * @param {object} report - Baseline report to evaluate.
 * @param {object} [thresholds=DEFAULT_SLOS] - Threshold set to check against.
 * @returns {{checks: Array<{name: string, actual: number, threshold: number, comparator: string}>, failures: string[], passed: boolean}}
 */
export function evaluateReleaseSlo(report, thresholds = DEFAULT_SLOS) {
  const failures = [];
  const checks = [];

  // Records one gate. `rounded` gates report the value at 4 decimals in both
  // the check and the failure text, but always compare the unrounded value.
  const gate = ({ name, value, threshold, comparator, rounded = false }) => {
    checks.push({
      name,
      actual: rounded ? Number(value.toFixed(4)) : value,
      threshold,
      comparator,
    });
    const isLowerBound = comparator === '>=';
    const violated = isLowerBound ? value < threshold : value > threshold;
    if (violated) {
      const shown = rounded ? value.toFixed(4) : value;
      failures.push(`${name} ${shown} ${isLowerBound ? 'below' : 'above'} ${threshold}`);
    }
  };

  const recordsTotal = Number(report.records_total || 0);
  const successRate = Number(report.success_rate || 0);
  const errorRate = recordsTotal > 0 ? Number(report.records_error || 0) / recordsTotal : 0;
  const emptySuccessRate = computeEmptySuccessRate(report) ?? 0;
  const toolSuccessRate = Number(report?.tool_calls?.success_rate ?? 0);
  const p95Latency = Number(report?.latency_ms?.p95 ?? 0);

  gate({ name: 'records_total', value: recordsTotal, threshold: thresholds.min_records, comparator: '>=' });
  gate({ name: 'success_rate', value: successRate, threshold: thresholds.min_success_rate, comparator: '>=' });
  gate({ name: 'error_rate', value: errorRate, threshold: thresholds.max_error_rate, comparator: '<=', rounded: true });
  gate({ name: 'empty_success_rate', value: emptySuccessRate, threshold: thresholds.max_empty_success_rate, comparator: '<=', rounded: true });
  gate({ name: 'tool_success_rate', value: toolSuccessRate, threshold: thresholds.min_tool_success_rate, comparator: '>=' });
  gate({ name: 'latency_p95_ms', value: p95Latency, threshold: thresholds.max_p95_latency_ms, comparator: '<=' });

  // A Map keeps lookups for arbitrary threshold keys free of inherited
  // Object.prototype members.
  const classCounts = new Map();
  const topErrors = Array.isArray(report.top_errors) ? report.top_errors : [];
  for (const entry of topErrors) {
    if (!entry || typeof entry !== 'object') continue;
    const count = Number(entry.count || 0);
    if (!Number.isFinite(count)) continue;
    const errorClass = classifyErrorMessage(entry.key ?? entry.error);
    classCounts.set(errorClass, (classCounts.get(errorClass) ?? 0) + count);
  }

  for (const [errorClass, maxRate] of Object.entries(thresholds.max_error_class_rate || {})) {
    const count = classCounts.get(errorClass) ?? 0;
    gate({
      name: `error_class_${errorClass}_rate`,
      value: recordsTotal > 0 ? count / recordsTotal : 0,
      threshold: maxRate,
      comparator: '<=',
      rounded: true,
    });
  }

  return {
    checks,
    failures,
    passed: failures.length === 0,
  };
}
176
+
177
/**
 * Produces the baseline report that the SLO gates are evaluated against.
 *
 * With `args.input` set, the report is read from that JSON file. Otherwise the
 * sibling `benchmark-baseline.js` script is run in a child Node process and
 * its stdout is parsed as JSON.
 *
 * @param {{input: string, days: number, dir: string}} args - Parsed CLI options.
 * @returns {object} The parsed baseline report.
 * @throws If the input file or the child's stdout is not valid JSON, or the
 *   child process fails.
 */
function buildBaselineReport(args) {
  if (args.input) {
    return readJson(args.input);
  }
  // Locate the baseline script next to this file rather than under
  // process.cwd(), so the checker also works when it is launched from a
  // directory other than the package root.
  const scriptDir = path.dirname(fileURLToPath(import.meta.url));
  const baselineScript = path.join(scriptDir, 'benchmark-baseline.js');
  const baselineArgs = [baselineScript, '--days', String(args.days)];
  if (args.dir) {
    baselineArgs.push('--dir', args.dir);
  }
  // process.execPath reuses the Node binary that is running this script.
  const stdout = execFileSync(process.execPath, baselineArgs, { encoding: 'utf-8' });
  return JSON.parse(stdout);
}
189
+
190
/**
 * CLI entry point: builds the baseline report, evaluates it against the
 * (optionally overridden) thresholds and prints the full result as
 * pretty-printed JSON on stdout.
 *
 * The process exit code is set to 1 only when `--enforce` was given and at
 * least one gate failed.
 */
function main() {
  const cliArgs = parseArgs(process.argv.slice(2));
  const report = buildBaselineReport(cliArgs);

  let thresholds = DEFAULT_SLOS;
  if (cliArgs.thresholds) {
    thresholds = mergeThresholds(DEFAULT_SLOS, readJson(cliArgs.thresholds));
  }

  const { checks, failures, passed } = evaluateReleaseSlo(report, thresholds);
  const summary = {
    source: cliArgs.input || 'benchmark-baseline',
    thresholds,
    report,
    checks,
    failures,
    passed,
  };
  console.log(JSON.stringify(summary, null, 2));

  const shouldFailProcess = cliArgs.enforce && !passed;
  if (shouldFailProcess) {
    process.exitCode = 1;
  }
}
211
+
212
// Run main() only when this file is the script Node was started with, so that
// importing evaluateReleaseSlo from another module has no side effects.
// pathToFileURL() applies the same encoding Node uses for import.meta.url;
// the previous hand-built `file://${process.argv[1]}` string never matched for
// paths containing spaces or other escaped characters, nor for Windows paths.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  main();
}
@@ -0,0 +1,339 @@
1
+ #!/usr/bin/env node
2
+
3
+ import path from 'node:path';
4
+
5
+ import { DATA_DIR } from '../dist/config.js';
6
+ import { initDatabase } from '../dist/db.js';
7
+ import { loadJson } from '../dist/utils.js';
8
+ import {
9
+ AgentExecutionError,
10
+ createTraceBase,
11
+ executeAgentRun,
12
+ recordAgentTelemetry,
13
+ } from '../dist/agent-execution.js';
14
+ import { routeRequest } from '../dist/request-router.js';
15
+ import { writeTrace } from '../dist/trace-writer.js';
16
+ import { percentile } from './benchmark-baseline.js';
17
+
18
/**
 * Parses the command-line flags of the live canary runner.
 *
 * Every recognized flag takes a value: `--rounds`, `--chat-jid`,
 * `--group-folder`, `--max-tool-steps`, `--timeout-ms`, `--reasoning-effort`
 * (off|low|medium|high), `--prompt-prefix`, `--source` and `--tool-allow`
 * (comma-separated list). An invalid value leaves the default in place but
 * still consumes the value token. A flag given as the very last token (no
 * value after it) and unknown tokens are ignored.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {object} Options with defaults applied.
 */
function parseArgs(argv) {
  const args = {
    rounds: 8,
    toolAllow: ['Read', 'Write', 'Edit', 'Glob', 'Grep', 'Bash'],
    chatJid: '',
    groupFolder: 'main',
    userId: 'canary-live-user',
    userName: 'Canary',
    source: 'live-canary',
    reasoningEffort: 'low',
    maxToolSteps: 40,
    timeoutMs: 180_000,
    promptPrefix: '[CANARY:LIVE]',
  };

  // Floor BEFORE validating: a fractional value below 1 (e.g. "0.5") used to
  // pass the "> 0" test and then collapse to 0 rounds / steps / milliseconds.
  const toPositiveInt = (raw) => {
    const whole = Math.floor(Number(raw));
    return Number.isFinite(whole) && whole >= 1 ? whole : null;
  };
  const integerFlags = new Map([
    ['--rounds', 'rounds'],
    ['--max-tool-steps', 'maxToolSteps'],
    ['--timeout-ms', 'timeoutMs'],
  ]);
  const efforts = ['off', 'low', 'medium', 'high'];

  for (let i = 0; i < argv.length; i += 1) {
    // All flags need a value, so a trailing token can never match anything.
    if (i + 1 >= argv.length) break;
    const arg = argv[i];
    const raw = argv[i + 1];

    if (integerFlags.has(arg)) {
      const parsed = toPositiveInt(raw);
      if (parsed !== null) args[integerFlags.get(arg)] = parsed;
    } else if (arg === '--chat-jid') {
      args.chatJid = raw;
    } else if (arg === '--group-folder') {
      args.groupFolder = raw;
    } else if (arg === '--reasoning-effort') {
      const effort = String(raw).trim().toLowerCase();
      if (efforts.includes(effort)) args.reasoningEffort = effort;
    } else if (arg === '--prompt-prefix') {
      args.promptPrefix = String(raw || '').trim() || args.promptPrefix;
    } else if (arg === '--source') {
      args.source = String(raw || '').trim() || args.source;
    } else if (arg === '--tool-allow') {
      const tools = String(raw || '')
        .split(',')
        .map((item) => item.trim())
        .filter(Boolean);
      if (tools.length > 0) args.toolAllow = tools;
    } else {
      continue; // unknown token: do not swallow the one after it
    }
    i += 1; // skip the value token that was just consumed
  }

  return args;
}
93
+
94
/**
 * Loads `registered_groups.json` from the data directory via `loadJson`,
 * passing an empty object as the fallback value.
 *
 * @returns {object} Whatever `loadJson` yields for that file.
 */
function loadRegisteredGroups() {
  return loadJson(path.join(DATA_DIR, 'registered_groups.json'), {});
}
98
+
99
/**
 * Chooses the chat and group the canary will run against.
 *
 * Resolution order: an explicit `args.chatJid` (which must be registered),
 * then the first registered group whose `folder` equals `args.groupFolder`,
 * and finally the first registered group of any kind.
 *
 * @param {{chatJid: string, groupFolder: string}} args - Parsed CLI options.
 * @returns {{chatJid: string, group: object}} The selected registration.
 * @throws {Error} When no groups are registered or `args.chatJid` is unknown.
 */
function resolveChatAndGroup(args) {
  const registry = loadRegisteredGroups();
  const registrations = Object.entries(registry);
  if (registrations.length === 0) {
    throw new Error(`No registered groups found in ${path.join(DATA_DIR, 'registered_groups.json')}`);
  }

  const { chatJid } = args;
  if (chatJid) {
    const group = registry[chatJid];
    const isRegistered = Boolean(group) && typeof group === 'object';
    if (!isRegistered) {
      throw new Error(`Chat not registered: ${chatJid}`);
    }
    return { chatJid, group };
  }

  // Prefer the requested folder; otherwise fall back to the first registration.
  const byFolder = registrations.find(([, candidate]) => candidate?.folder === args.groupFolder);
  const [selectedJid, selectedGroup] = byFolder || registrations[0];
  return { chatJid: selectedJid, group: selectedGroup };
}
118
+
119
/**
 * Builds the three prompts sent during one canary round: a tool-heavy
 * create-and-read task, a memory question about that task, and a tool-heavy
 * directory listing task.
 *
 * @param {string} prefix - Marker placed at the start of every prompt.
 * @param {number} round - 1-based round number; zero-padded to two digits in
 *   the canary file name.
 * @param {string} stamp - Run identifier embedded in the canary file name.
 * @returns {string[]} The prompts, in execution order.
 */
function buildRoundPrompts(prefix, round, stamp) {
  const paddedRound = String(round).padStart(2, '0');
  const canaryFile = `inbox/live-canary-${stamp}-r${paddedRound}.txt`;

  const createAndRead = `${prefix} [SCENARIO:tool_heavy] Round ${round}: Create file "${canaryFile}" with 3 lines: alpha-${round}, beta-${round}, gamma-${round}. Then read it back and return a 1-sentence summary with exact filename.`;
  const recallQuestion = `${prefix} [SCENARIO:memory] Round ${round}: From this same conversation session, what exact filename did you just create and what was line 2? Answer in one concise sentence.`;
  const listAndRead = `${prefix} [SCENARIO:tool_heavy] Round ${round}: List the 5 newest files under inbox/, read the newest one, and return exactly 2 bullet points with key details.`;

  return [createAndRead, recallQuestion, listAndRead];
}
127
+
128
/**
 * Aggregates per-prompt canary executions into summary metrics.
 *
 * An execution is a success when `output.status === 'success'` and an error
 * when `output.status === 'error'` or it carries a truthy `error`. Latency and
 * tool-call statistics are computed from successful executions only. Ratios
 * are rounded to 4 decimals and are null when their denominator is 0.
 *
 * Scenario scoring, selected by tags in the prompt text:
 * - memory: passes on success with a non-blank result.
 * - tool_heavy: passes on success with at least 2 tool calls, of which at most
 *   floor(20%) failed.
 *
 * @param {Array<{prompt: string, output: ?object, error: ?string}>} executions
 * @returns {object} Summary with counts, rates, latency percentiles, tool-call
 *   stats, the 8 most frequent errors and per-scenario pass rates.
 */
function summarizeResults(executions) {
  const ratio = (numerator, denominator) =>
    (denominator > 0 ? Number((numerator / denominator).toFixed(4)) : null);
  const isSuccess = (item) => item.output?.status === 'success';
  const resultText = (item) =>
    (typeof item.output?.result === 'string' ? item.output.result.trim() : '');
  const toolCallsOf = (item) =>
    (Array.isArray(item.output?.tool_calls) ? item.output.tool_calls : []);
  const countFailed = (calls) => calls.filter((call) => !call?.ok).length;

  const total = executions.length;
  const succeeded = executions.filter(isSuccess);
  const errored = executions.filter((item) => item.output?.status === 'error' || item.error);

  const blankSuccesses = succeeded.filter((item) => !resultText(item)).length;

  const latencies = [];
  for (const item of succeeded) {
    const latency = Number(item.output?.latency_ms);
    if (Number.isFinite(latency) && latency >= 0) latencies.push(latency);
  }

  const allToolCalls = succeeded.flatMap(toolCallsOf);
  const failedToolCalls = countFailed(allToolCalls);

  // Tally errors by trimmed message; blank messages share one bucket.
  const errorTally = new Map();
  errored.forEach((item) => {
    const message = String(item.error || item.output?.error || 'unknown error').trim() || 'unknown error';
    errorTally.set(message, (errorTally.get(message) || 0) + 1);
  });
  const topErrors = [...errorTally.entries()]
    .sort((left, right) => right[1] - left[1])
    .slice(0, 8)
    .map(([error, count]) => ({ error, count }));

  const memoryRuns = executions.filter((item) => /\[(?:scenario:)?memory(?:_carryover)?\]/i.test(item.prompt));
  const memoryPassed = memoryRuns.filter((item) => isSuccess(item) && resultText(item).length > 0).length;

  const toolHeavyRuns = executions.filter((item) => /\[(?:scenario:)?tool_heavy\]/i.test(item.prompt));
  const toolHeavyPassed = toolHeavyRuns.filter((item) => {
    if (!isSuccess(item)) return false;
    const calls = toolCallsOf(item);
    return calls.length >= 2 && countFailed(calls) <= Math.floor(calls.length * 0.2);
  }).length;

  return {
    rows_total: total,
    rows_success: succeeded.length,
    rows_error: errored.length,
    success_rate: ratio(succeeded.length, total),
    empty_success_rate: ratio(blankSuccesses, succeeded.length),
    latency_ms: {
      p50: percentile(latencies, 50),
      p90: percentile(latencies, 90),
      p95: percentile(latencies, 95),
      p99: percentile(latencies, 99),
    },
    tool_calls: {
      total: allToolCalls.length,
      failed: failedToolCalls,
      success_rate: ratio(allToolCalls.length - failedToolCalls, allToolCalls.length),
    },
    top_errors: topErrors,
    scenarios: {
      memory_carryover: {
        candidates: memoryRuns.length,
        passed: memoryPassed,
        pass_rate: ratio(memoryPassed, memoryRuns.length),
      },
      tool_heavy: {
        candidates: toolHeavyRuns.length,
        passed: toolHeavyPassed,
        pass_rate: ratio(toolHeavyPassed, toolHeavyRuns.length),
      },
    },
  };
}
199
+
200
/**
 * Executes a single canary prompt through `executeAgentRun` and records
 * telemetry for it, whatever the outcome.
 *
 * Failures never reject from the agent call itself: they are captured in
 * `errorMessage`. When an execution context is available (from the run, or
 * from an `AgentExecutionError`) telemetry goes through
 * `recordAgentTelemetry`; otherwise a bare trace is written with `writeTrace`.
 *
 * @param {object} params - Chat, group, prompt, user, routing, session and
 *   execution-limit settings for this prompt.
 * @returns {Promise<{output: ?object, context: ?object, errorMessage: ?string, nextSessionId: *}>}
 *   `nextSessionId` is the run's `newSessionId` when it reported one, else the
 *   incoming `params.sessionId`.
 */
async function runOne(params) {
  const { routing } = params;
  const traceBase = createTraceBase({
    chatId: params.chatJid,
    groupFolder: params.group.folder,
    userId: params.userId,
    inputText: params.prompt,
    source: params.source,
  });

  let output = null;
  let context = null;
  let errorMessage = null;
  let nextSessionId = params.sessionId;

  try {
    const run = await executeAgentRun({
      group: params.group,
      prompt: params.prompt,
      chatJid: params.chatJid,
      userId: params.userId,
      userName: params.userName,
      recallQuery: params.prompt,
      recallMaxResults: routing.recallMaxResults,
      recallMaxTokens: routing.recallMaxTokens,
      sessionId: params.sessionId,
      persistSession: true,
      useGroupLock: true,
      useSemaphore: true,
      modelFallbacks: routing.fallbacks,
      reasoningEffort: params.reasoningEffort,
      modelMaxOutputTokens: routing.maxOutputTokens || undefined,
      maxToolSteps: params.maxToolSteps,
      lane: 'maintenance',
      toolAllow: params.toolAllow,
      timeoutMs: params.timeoutMs,
    });
    // Capture output and context before inspecting them, so the telemetry
    // step below still sees both if the inspection itself throws.
    output = run.output;
    context = run.context;
    if (output.status === 'error') {
      errorMessage = output.error || 'Unknown error';
    }
    nextSessionId = output?.newSessionId || params.sessionId;
  } catch (err) {
    const isAgentError = err instanceof AgentExecutionError;
    if (isAgentError) {
      context = err.context;
    }
    errorMessage = isAgentError || err instanceof Error ? err.message : String(err);
    nextSessionId = params.sessionId;
  } finally {
    if (context) {
      recordAgentTelemetry({
        traceBase,
        output,
        context,
        metricsSource: 'live_canary',
        toolAuditSource: 'heartbeat',
        errorMessage: errorMessage || undefined,
      });
    } else {
      writeTrace({
        ...traceBase,
        output_text: output?.result ?? null,
        model_id: output?.model || 'unknown',
        memory_recall: [],
        error_code: errorMessage || undefined,
        source: params.source,
      });
    }
  }

  return { output, context, errorMessage, nextSessionId };
}
281
+
282
/**
 * Runs the live canary: for each round, executes that round's prompts one at
 * a time (threading the session id from one prompt to the next), then prints
 * a JSON report with run metadata and aggregated metrics on stdout.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  const startedAt = new Date().toISOString();
  initDatabase();
  const { chatJid, group } = resolveChatAndGroup(args);
  const routing = routeRequest();
  // ':' and '.' are replaced so the stamp can be embedded in a file name.
  const stamp = new Date().toISOString().replace(/[:.]/g, '-');

  // Settings shared by every prompt of the run.
  const sharedParams = {
    chatJid,
    group,
    userId: args.userId,
    userName: args.userName,
    source: args.source,
    routing,
    reasoningEffort: args.reasoningEffort,
    maxToolSteps: args.maxToolSteps,
    timeoutMs: args.timeoutMs,
    toolAllow: args.toolAllow,
  };

  const executions = [];
  let sessionId;

  for (let round = 1; round <= args.rounds; round += 1) {
    for (const prompt of buildRoundPrompts(args.promptPrefix, round, stamp)) {
      // Prompts run strictly in sequence: each one continues the session
      // returned by the previous one.
      const { output, errorMessage, nextSessionId } = await runOne({
        ...sharedParams,
        prompt,
        sessionId,
      });
      sessionId = nextSessionId;
      executions.push({ round, prompt, output, error: errorMessage });
    }
  }

  const report = {
    generated_at: new Date().toISOString(),
    started_at: startedAt,
    chat_jid: chatJid,
    group_folder: group.folder,
    source: args.source,
    rounds: args.rounds,
    prompts_executed: executions.length,
    model: routing.model,
    fallbacks: routing.fallbacks,
    metrics: summarizeResults(executions),
  };
  console.log(JSON.stringify(report, null, 2));
}
335
+
336
// Start the canary; on any unhandled failure print just the message and mark
// the process as failed.
main().catch((failure) => {
  const message = failure instanceof Error ? failure.message : String(failure);
  console.error(message);
  process.exitCode = 1;
});