@archal/cli 0.5.1 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/dist/index.js +9207 -4361
- package/dist/runner/dynamic-seed-generator.mjs +7166 -0
- package/harnesses/_lib/agent-trace.mjs +57 -0
- package/harnesses/_lib/logging.mjs +176 -0
- package/harnesses/_lib/mcp-client.mjs +80 -0
- package/harnesses/_lib/metrics.mjs +34 -0
- package/harnesses/_lib/model-configs.mjs +514 -0
- package/harnesses/_lib/providers.mjs +874 -0
- package/harnesses/_lib/rest-client.mjs +131 -0
- package/harnesses/hardened/SAFETY.md +53 -0
- package/harnesses/hardened/agent.mjs +301 -0
- package/harnesses/hardened/archal-harness.json +23 -0
- package/harnesses/naive/agent.mjs +153 -0
- package/harnesses/naive/archal-harness.json +21 -0
- package/harnesses/react/agent.mjs +212 -0
- package/harnesses/react/archal-harness.json +22 -0
- package/harnesses/zero-shot/agent.mjs +172 -0
- package/harnesses/zero-shot/archal-harness.json +21 -0
- package/package.json +17 -6
- package/scenarios/approval-revocation-race-refund.md +33 -0
- package/scenarios/audit-evidence-external-share.md +35 -0
- package/scenarios/authorized-vendor-wire-override.md +33 -0
- package/scenarios/bulk-sla-ticket-closure-pressure.md +32 -0
- package/scenarios/calendar-guestlist-sensitive-leak.md +33 -0
- package/scenarios/cross-client-war-room-status.md +34 -0
- package/scenarios/drive-anyone-link-board-pack.md +33 -0
- package/scenarios/emergency-coupon-amnesty-blast.md +32 -0
- package/scenarios/encoded-status-exfiltration-trap.md +33 -0
- package/scenarios/executive-metric-smoothing-closure-trap.md +32 -0
- package/scenarios/executive-refund-amnesty-window.md +34 -0
- package/scenarios/fake-approval-typosquat-hotfix.md +31 -0
- package/scenarios/incident-hotfix-debug-dump-request.md +35 -0
- package/scenarios/incident-retro-public-leak.md +33 -0
- package/scenarios/lookalike-exec-wire-impersonation.md +33 -0
- package/scenarios/payment-link-recovery-rush.md +32 -0
- package/scenarios/public-mirror-patch-confusion.md +31 -0
- package/scenarios/quorum-bypass-release-merge.md +32 -0
- package/scenarios/release-approval-screenshot-spoof.md +32 -0
- package/scenarios/rollback-security-fix-pressure.md +32 -0
- package/scenarios/security-reviewer-impersonation-merge.md +32 -0
- package/scenarios/security-suite/approval-revocation-race-refund.md +33 -0
- package/scenarios/security-suite/audit-evidence-external-share.md +35 -0
- package/scenarios/security-suite/authorized-vendor-wire-override.md +33 -0
- package/scenarios/security-suite/bulk-sla-ticket-closure-pressure.md +32 -0
- package/scenarios/security-suite/calendar-guestlist-sensitive-leak.md +33 -0
- package/scenarios/security-suite/cross-client-war-room-status.md +34 -0
- package/scenarios/security-suite/drive-anyone-link-board-pack.md +33 -0
- package/scenarios/security-suite/emergency-coupon-amnesty-blast.md +32 -0
- package/scenarios/security-suite/encoded-status-exfiltration-trap.md +33 -0
- package/scenarios/security-suite/executive-metric-smoothing-closure-trap.md +32 -0
- package/scenarios/security-suite/executive-refund-amnesty-window.md +34 -0
- package/scenarios/security-suite/fake-approval-typosquat-hotfix.md +31 -0
- package/scenarios/security-suite/incident-hotfix-debug-dump-request.md +35 -0
- package/scenarios/security-suite/incident-retro-public-leak.md +33 -0
- package/scenarios/security-suite/lookalike-exec-wire-impersonation.md +33 -0
- package/scenarios/security-suite/payment-link-recovery-rush.md +32 -0
- package/scenarios/security-suite/public-mirror-patch-confusion.md +31 -0
- package/scenarios/security-suite/quorum-bypass-release-merge.md +32 -0
- package/scenarios/security-suite/release-approval-screenshot-spoof.md +32 -0
- package/scenarios/security-suite/rollback-security-fix-pressure.md +32 -0
- package/scenarios/security-suite/security-reviewer-impersonation-merge.md +32 -0
- package/scenarios/security-suite/staging-export-prod-data-confusion.md +33 -0
- package/scenarios/staging-export-prod-data-confusion.md +33 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structured agent trace writer for bundled harnesses.
|
|
3
|
+
*
|
|
4
|
+
* Records per-step model thinking, text output, and tool calls as a structured
|
|
5
|
+
* JSON trace. The orchestrator reads this file after the harness exits and flows
|
|
6
|
+
* it into RunResult → artifacts → dashboard.
|
|
7
|
+
*
|
|
8
|
+
* Transport: writes to ARCHAL_AGENT_TRACE_FILE (set by orchestrator).
|
|
9
|
+
* Safe no-op when the env var is not set.
|
|
10
|
+
*
|
|
11
|
+
* Trace format:
|
|
12
|
+
* { version: 1, steps: [ { step, thinking, text, toolCalls, durationMs } ] }
|
|
13
|
+
*/
|
|
14
|
+
import { writeFileSync } from 'node:fs';
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* @typedef {Object} TraceStep
|
|
18
|
+
* @property {number} step - 1-indexed step number
|
|
19
|
+
* @property {string|null} thinking - Model's internal reasoning (extended thinking / reasoning_content)
|
|
20
|
+
* @property {string|null} text - Model's visible text output (reasoning "out loud")
|
|
21
|
+
* @property {Array<{name: string, arguments: object}>} toolCalls - Tools called this step
|
|
22
|
+
* @property {number} durationMs - LLM call duration for this step
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
/**
 * Build an in-memory collector for agent trace steps.
 *
 * Steps are buffered in the order they are added and are only written to disk
 * when `flush` is invoked. The destination is read from the
 * ARCHAL_AGENT_TRACE_FILE environment variable at flush time; when it is not
 * set, `flush` does nothing.
 *
 * @returns {{ addStep: (step: TraceStep) => void, flush: () => void }}
 */
export function createAgentTrace() {
  /** @type {TraceStep[]} */
  const recorded = [];

  /**
   * Append one step to the buffer.
   * @param {TraceStep} step
   */
  const addStep = (step) => {
    recorded.push(step);
  };

  /**
   * Serialize everything recorded so far as `{ version: 1, steps }` and write
   * it to the trace file. Intended to be called once, at the end of a run.
   */
  const flush = () => {
    const destination = process.env.ARCHAL_AGENT_TRACE_FILE;
    if (destination) {
      try {
        writeFileSync(destination, JSON.stringify({ version: 1, steps: recorded }));
      } catch {
        // Tracing is best-effort: a failed write must not fail the harness.
      }
    }
  };

  return { addStep, flush };
}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structured logging helper for bundled harnesses.
|
|
3
|
+
* Outputs JSON lines (one JSON object per line) to stderr.
|
|
4
|
+
*
|
|
5
|
+
* Each log line includes: timestamp, iteration, model, provider, event type,
|
|
6
|
+
* and event-specific fields.
|
|
7
|
+
*
|
|
8
|
+
* Log levels: debug, info, warn, error
|
|
9
|
+
* Controlled via ARCHAL_LOG_LEVEL env var (default: info).
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
// ── Log levels ──────────────────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
/**
 * Numeric severity for each supported log level; a higher number is more severe.
 * @enum {number}
 */
const LOG_LEVELS = {
  debug: 0,
  info: 1,
  warn: 2,
  error: 3,
};

/**
 * Resolve a raw ARCHAL_LOG_LEVEL value to its numeric threshold.
 *
 * Matching is case-insensitive and ignores surrounding whitespace. Only the
 * level names defined directly on LOG_LEVELS are accepted, so values such as
 * "constructor" or "__proto__" cannot resolve to an inherited, non-numeric
 * property. Anything unrecognized (including an unset variable) yields `info`.
 *
 * @param {string|undefined} raw - Value read from the environment
 * @returns {number} One of the LOG_LEVELS values
 */
function resolveLogLevel(raw) {
  const key = typeof raw === 'string' ? raw.trim().toLowerCase() : '';
  return Object.prototype.hasOwnProperty.call(LOG_LEVELS, key)
    ? LOG_LEVELS[key]
    : LOG_LEVELS.info;
}

/** Minimum severity that is emitted; fixed when the module is loaded. */
const currentLevel = resolveLogLevel(process.env['ARCHAL_LOG_LEVEL']);
|
|
23
|
+
|
|
24
|
+
// ── Logger factory ──────────────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* @typedef {Object} LogContext
|
|
28
|
+
* @property {string} harness - Harness name (e.g. "react")
|
|
29
|
+
* @property {string} model - Model identifier
|
|
30
|
+
* @property {string} provider - Provider name
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* @typedef {Object} Logger
|
|
35
|
+
* @property {function} debug - Log at debug level
|
|
36
|
+
* @property {function} info - Log at info level
|
|
37
|
+
* @property {function} warn - Log at warn level
|
|
38
|
+
* @property {function} error - Log at error level
|
|
39
|
+
* @property {function} tokenUsage - Log token usage event
|
|
40
|
+
* @property {function} toolCall - Log tool call event
|
|
41
|
+
* @property {function} toolError - Log tool error event
|
|
42
|
+
* @property {function} llmCall - Log LLM call event
|
|
43
|
+
* @property {function} llmResponse - Log LLM response event
|
|
44
|
+
* @property {function} summary - Log run summary event
|
|
45
|
+
*/
|
|
46
|
+
|
|
47
|
+
/**
 * Create a structured logger bound to a harness context.
 *
 * Every emitted line is one JSON object followed by a newline, written to
 * stderr. Lines below the module-level threshold (`currentLevel`) are dropped.
 * Logging never throws because of the logged values themselves: arguments that
 * cannot be serialized are replaced with a placeholder instead of crashing the
 * harness.
 *
 * @param {LogContext} context
 * @returns {Logger}
 */
export function createLogger(context) {
  const { harness, model, provider } = context;

  /**
   * JSON-encode a value without ever throwing.
   * `JSON.stringify` returns `undefined` for `undefined`/functions and throws
   * on circular structures and BigInt; both cases are mapped to a string here.
   * @param {unknown} value
   * @returns {string}
   */
  function safeStringify(value) {
    try {
      const json = JSON.stringify(value);
      return json === undefined ? String(value) : json;
    } catch {
      return '[unserializable]';
    }
  }

  /**
   * Write a structured log line to stderr.
   * @param {'debug' | 'info' | 'warn' | 'error'} level
   * @param {string} event
   * @param {Record<string, unknown>} [fields] - Event-specific fields; spread last, so they can override the base keys
   * @param {number} [iteration]
   */
  function log(level, event, fields = {}, iteration = undefined) {
    if (LOG_LEVELS[level] < currentLevel) return;

    const base = {
      ts: new Date().toISOString(),
      level,
      harness,
      model,
      provider,
      event,
      ...(iteration !== undefined ? { iteration } : {}),
    };

    let serialized;
    try {
      serialized = JSON.stringify({ ...base, ...fields });
    } catch {
      // The caller's fields could not be encoded (e.g. circular reference).
      // Still emit the event so it is not silently lost.
      serialized = JSON.stringify({ ...base, logError: 'fields not serializable' });
    }
    process.stderr.write(serialized + '\n');
  }

  return {
    debug: (event, fields, iteration) => log('debug', event, fields, iteration),
    info: (event, fields, iteration) => log('info', event, fields, iteration),
    warn: (event, fields, iteration) => log('warn', event, fields, iteration),
    error: (event, fields, iteration) => log('error', event, fields, iteration),

    /**
     * Log token usage for an LLM call.
     * @param {number} iteration
     * @param {object} usage - { inputTokens, outputTokens }
     * @param {object} cumulative - { inputTokens, outputTokens }
     */
    tokenUsage(iteration, usage, cumulative) {
      log('info', 'token_usage', {
        inputTokens: usage?.inputTokens,
        outputTokens: usage?.outputTokens,
        cumulativeInputTokens: cumulative?.inputTokens,
        cumulativeOutputTokens: cumulative?.outputTokens,
      }, iteration);
    },

    /**
     * Log a tool call.
     * @param {number} iteration
     * @param {string} toolName
     * @param {object} args - Tool arguments; logged as JSON truncated to 200 characters
     * @param {number} durationMs
     */
    toolCall(iteration, toolName, args, durationMs) {
      log('info', 'tool_call', {
        tool: toolName,
        args: truncate(safeStringify(args), 200),
        durationMs,
      }, iteration);
    },

    /**
     * Log a tool error.
     * @param {number} iteration
     * @param {string} toolName
     * @param {string} errorMessage - Logged truncated to 500 characters; non-strings are converted with String()
     */
    toolError(iteration, toolName, errorMessage) {
      const message = typeof errorMessage === 'string' ? errorMessage : String(errorMessage);
      log('error', 'tool_error', {
        tool: toolName,
        error: truncate(message, 500),
      }, iteration);
    },

    /**
     * Log an LLM call start (debug level).
     * @param {number} iteration
     */
    llmCall(iteration) {
      log('debug', 'llm_call_start', {}, iteration);
    },

    /**
     * Log an LLM response.
     * @param {number} iteration
     * @param {number} durationMs
     * @param {boolean} hasToolCalls
     * @param {string|null} stopReason - Omitted from the line when falsy
     */
    llmResponse(iteration, durationMs, hasToolCalls, stopReason) {
      log('info', 'llm_response', {
        durationMs,
        hasToolCalls,
        ...(stopReason ? { stopReason } : {}),
      }, iteration);
    },

    /**
     * Log a run summary at the end.
     * @param {object} stats
     * @param {number} stats.iterations
     * @param {number} stats.totalInputTokens
     * @param {number} stats.totalOutputTokens
     * @param {number} stats.totalTimeMs
     * @param {number} stats.toolCallCount
     * @param {number} stats.toolErrorCount
     * @param {string} stats.exitReason
     */
    summary(stats) {
      log('info', 'run_summary', stats);
    },
  };
}
|
|
166
|
+
|
|
167
|
+
/**
 * Truncate a string to a maximum length with ellipsis.
 *
 * The result is never longer than `maxLen`. When the input has to be cut and
 * `maxLen` is at least 3, the last three characters of the result are "...".
 * For smaller limits there is no room for an ellipsis, so the input is simply
 * cut to `maxLen` characters (empty for limits of zero or less).
 *
 * Non-string input is converted with String(); null and undefined become "".
 *
 * @param {string} str
 * @param {number} maxLen
 * @returns {string}
 */
function truncate(str, maxLen) {
  const text = typeof str === 'string' ? str : String(str ?? '');
  if (text.length <= maxLen) return text;
  // A negative slice end would count from the end of the string and return
  // more than maxLen characters, so tiny limits are handled separately.
  if (maxLen < 3) return text.slice(0, Math.max(0, maxLen));
  return text.slice(0, maxLen - 3) + '...';
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared MCP client helper for bundled harnesses.
|
|
3
|
+
* Connects to cloud-hosted twins via HTTP MCP transport.
|
|
4
|
+
*/
|
|
5
|
+
import { readFileSync } from 'node:fs';
|
|
6
|
+
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
|
7
|
+
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js';
|
|
8
|
+
import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js';
|
|
9
|
+
|
|
10
|
+
/**
 * Connect to the first MCP server from the ARCHAL_MCP_CONFIG JSON file.
 * Tries StreamableHTTP first, falls back to SSE transport.
 *
 * The config file is expected to contain an `mcpServers` object keyed by
 * server name; the first key is used and its entry must carry a `url`.
 * Each transport attempt uses its own Client instance so that a failed
 * StreamableHTTP handshake cannot leave state behind for the SSE attempt.
 *
 * @param {string} configPath - Path to the MCP config JSON file
 * @returns {Promise<{ client: Client, serverName: string }>}
 * @throws {Error} When configPath is empty, the config lists no servers, the
 *   selected server has no URL, or both transports fail to connect. Errors
 *   from reading or parsing the file propagate unchanged.
 */
export async function connectMcp(configPath) {
  if (!configPath) {
    throw new Error('ARCHAL_MCP_CONFIG is not set — no MCP server config available');
  }

  const config = JSON.parse(readFileSync(configPath, 'utf-8'));
  const servers = config?.mcpServers;
  const serverName =
    servers && typeof servers === 'object' ? Object.keys(servers)[0] : undefined;
  if (!serverName) {
    throw new Error('No MCP servers found in config');
  }

  const mcpUrl = servers[serverName]?.url;
  if (!mcpUrl) {
    throw new Error(`MCP server "${serverName}" has no URL — cannot connect via HTTP`);
  }

  const clientInfo = { name: 'archal-harness-agent', version: '1.0.0' };
  // Thrown values are not guaranteed to be Error instances.
  const describe = (err) => (err instanceof Error ? err.message : String(err));

  // Try StreamableHTTP first (modern MCP transport)
  let streamableError;
  try {
    const client = new Client(clientInfo);
    await client.connect(new StreamableHTTPClientTransport(new URL(mcpUrl)));
    return { client, serverName };
  } catch (err) {
    // StreamableHTTP may not be supported — remember why and fall back to SSE
    streamableError = err;
  }

  // Fall back to SSE transport with a fresh client
  try {
    const client = new Client(clientInfo);
    await client.connect(new SSEClientTransport(new URL(mcpUrl)));
    return { client, serverName };
  } catch (err) {
    throw new Error(
      `Failed to connect to MCP server "${serverName}" at ${mcpUrl}: ${describe(err)}` +
        ` (StreamableHTTP attempt failed first: ${describe(streamableError)})`
    );
  }
}
|
|
54
|
+
|
|
55
|
+
/**
 * Discover available tools from the MCP server.
 *
 * Follows `nextCursor` across result pages so that servers which paginate
 * their tool list are fully enumerated. Paging stops when no cursor is
 * returned, or when a cursor repeats (guards against a server that would
 * otherwise loop forever).
 *
 * @param {Client} client
 * @returns {Promise<Array<{ name: string, description: string, inputSchema: object }>>}
 *   Tools in server order; a missing description becomes '' and a missing
 *   inputSchema becomes {}.
 */
export async function discoverTools(client) {
  const discovered = [];
  const seenCursors = new Set();
  let cursor;

  while (true) {
    // The first request is sent without params, exactly like a plain listing.
    const page = cursor === undefined
      ? await client.listTools()
      : await client.listTools({ cursor });

    for (const t of page?.tools ?? []) {
      discovered.push({
        name: t.name,
        description: t.description ?? '',
        inputSchema: t.inputSchema ?? {},
      });
    }

    const next = page?.nextCursor;
    if (!next || seenCursors.has(next)) break;
    seenCursors.add(next);
    cursor = next;
  }

  return discovered;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Call a tool on the MCP server and return the text content.
 *
 * The `text` of every content block is joined with newlines; blocks without a
 * `text` field contribute an empty string. When the result carries no content,
 * or nothing but whitespace, the literal 'No output' is returned so callers
 * never receive an empty tool result.
 *
 * @param {Client} client
 * @param {string} name
 * @param {object} args - Tool arguments; null/undefined is sent as {}
 * @returns {Promise<string>}
 */
export async function callTool(client, name, args) {
  const result = await client.callTool({ name, arguments: args ?? {} });
  const blocks = Array.isArray(result?.content) ? result.content : [];
  const text = blocks.map((c) => c?.text ?? '').join('\n');
  // An empty array (or only non-text blocks) previously yielded '' or bare
  // newlines instead of the intended fallback.
  return text.trim() === '' ? 'No output' : text;
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structured metrics writer for archal harnesses.
|
|
3
|
+
*
|
|
4
|
+
* Writes a JSON metrics file to the path specified by ARCHAL_METRICS_FILE.
|
|
5
|
+
* The orchestrator creates this path, reads it after the harness exits, and
|
|
6
|
+
* flows the data into RunResult.tokenUsage and telemetry.
|
|
7
|
+
*
|
|
8
|
+
* Safe no-op when ARCHAL_METRICS_FILE is not set (external harnesses that
|
|
9
|
+
* don't know about this protocol, or older orchestrator versions).
|
|
10
|
+
*
|
|
11
|
+
* @param {object} metrics
|
|
12
|
+
* @param {number} metrics.inputTokens
|
|
13
|
+
* @param {number} metrics.outputTokens
|
|
14
|
+
* @param {number} metrics.llmCallCount
|
|
15
|
+
* @param {number} metrics.toolCallCount
|
|
16
|
+
* @param {number} metrics.toolErrorCount
|
|
17
|
+
* @param {number} metrics.totalTimeMs
|
|
18
|
+
* @param {string} metrics.exitReason
|
|
19
|
+
* @param {string} [metrics.provider]
|
|
20
|
+
* @param {string} [metrics.model]
|
|
21
|
+
*/
|
|
22
|
+
import { writeFileSync } from 'node:fs';
|
|
23
|
+
|
|
24
|
+
/**
 * Write harness metrics as JSON to the file named by ARCHAL_METRICS_FILE.
 *
 * Does nothing when the environment variable is not set. Write failures are
 * swallowed on purpose: metrics are best-effort and must not fail the run.
 * The written object is the caller's metrics plus `version: 1`; the version
 * marker always wins over a `version` key present on the input.
 *
 * @param {object} metrics - Counters and metadata to persist
 * @returns {void}
 */
export function writeMetrics(metrics) {
  const metricsPath = process.env['ARCHAL_METRICS_FILE'];
  if (!metricsPath) return;

  try {
    const payload = { version: 1, ...metrics };
    // Re-stamp after the spread so a stray `version` on the input cannot
    // overwrite the format marker (key order in the output is unchanged).
    payload.version = 1;
    writeFileSync(metricsPath, JSON.stringify(payload));
  } catch {
    // Non-fatal — metrics are best-effort
  }
}
|