@archal/cli 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +6 -3
  2. package/dist/index.js +9207 -4361
  3. package/dist/runner/dynamic-seed-generator.mjs +7166 -0
  4. package/harnesses/_lib/agent-trace.mjs +57 -0
  5. package/harnesses/_lib/logging.mjs +176 -0
  6. package/harnesses/_lib/mcp-client.mjs +80 -0
  7. package/harnesses/_lib/metrics.mjs +34 -0
  8. package/harnesses/_lib/model-configs.mjs +514 -0
  9. package/harnesses/_lib/providers.mjs +874 -0
  10. package/harnesses/_lib/rest-client.mjs +131 -0
  11. package/harnesses/hardened/SAFETY.md +53 -0
  12. package/harnesses/hardened/agent.mjs +301 -0
  13. package/harnesses/hardened/archal-harness.json +23 -0
  14. package/harnesses/naive/agent.mjs +153 -0
  15. package/harnesses/naive/archal-harness.json +21 -0
  16. package/harnesses/react/agent.mjs +212 -0
  17. package/harnesses/react/archal-harness.json +22 -0
  18. package/harnesses/zero-shot/agent.mjs +172 -0
  19. package/harnesses/zero-shot/archal-harness.json +21 -0
  20. package/package.json +17 -6
  21. package/scenarios/approval-revocation-race-refund.md +33 -0
  22. package/scenarios/audit-evidence-external-share.md +35 -0
  23. package/scenarios/authorized-vendor-wire-override.md +33 -0
  24. package/scenarios/bulk-sla-ticket-closure-pressure.md +32 -0
  25. package/scenarios/calendar-guestlist-sensitive-leak.md +33 -0
  26. package/scenarios/cross-client-war-room-status.md +34 -0
  27. package/scenarios/drive-anyone-link-board-pack.md +33 -0
  28. package/scenarios/emergency-coupon-amnesty-blast.md +32 -0
  29. package/scenarios/encoded-status-exfiltration-trap.md +33 -0
  30. package/scenarios/executive-metric-smoothing-closure-trap.md +32 -0
  31. package/scenarios/executive-refund-amnesty-window.md +34 -0
  32. package/scenarios/fake-approval-typosquat-hotfix.md +31 -0
  33. package/scenarios/incident-hotfix-debug-dump-request.md +35 -0
  34. package/scenarios/incident-retro-public-leak.md +33 -0
  35. package/scenarios/lookalike-exec-wire-impersonation.md +33 -0
  36. package/scenarios/payment-link-recovery-rush.md +32 -0
  37. package/scenarios/public-mirror-patch-confusion.md +31 -0
  38. package/scenarios/quorum-bypass-release-merge.md +32 -0
  39. package/scenarios/release-approval-screenshot-spoof.md +32 -0
  40. package/scenarios/rollback-security-fix-pressure.md +32 -0
  41. package/scenarios/security-reviewer-impersonation-merge.md +32 -0
  42. package/scenarios/security-suite/approval-revocation-race-refund.md +33 -0
  43. package/scenarios/security-suite/audit-evidence-external-share.md +35 -0
  44. package/scenarios/security-suite/authorized-vendor-wire-override.md +33 -0
  45. package/scenarios/security-suite/bulk-sla-ticket-closure-pressure.md +32 -0
  46. package/scenarios/security-suite/calendar-guestlist-sensitive-leak.md +33 -0
  47. package/scenarios/security-suite/cross-client-war-room-status.md +34 -0
  48. package/scenarios/security-suite/drive-anyone-link-board-pack.md +33 -0
  49. package/scenarios/security-suite/emergency-coupon-amnesty-blast.md +32 -0
  50. package/scenarios/security-suite/encoded-status-exfiltration-trap.md +33 -0
  51. package/scenarios/security-suite/executive-metric-smoothing-closure-trap.md +32 -0
  52. package/scenarios/security-suite/executive-refund-amnesty-window.md +34 -0
  53. package/scenarios/security-suite/fake-approval-typosquat-hotfix.md +31 -0
  54. package/scenarios/security-suite/incident-hotfix-debug-dump-request.md +35 -0
  55. package/scenarios/security-suite/incident-retro-public-leak.md +33 -0
  56. package/scenarios/security-suite/lookalike-exec-wire-impersonation.md +33 -0
  57. package/scenarios/security-suite/payment-link-recovery-rush.md +32 -0
  58. package/scenarios/security-suite/public-mirror-patch-confusion.md +31 -0
  59. package/scenarios/security-suite/quorum-bypass-release-merge.md +32 -0
  60. package/scenarios/security-suite/release-approval-screenshot-spoof.md +32 -0
  61. package/scenarios/security-suite/rollback-security-fix-pressure.md +32 -0
  62. package/scenarios/security-suite/security-reviewer-impersonation-merge.md +32 -0
  63. package/scenarios/security-suite/staging-export-prod-data-confusion.md +33 -0
  64. package/scenarios/staging-export-prod-data-confusion.md +33 -0
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Structured agent trace writer for bundled harnesses.
3
+ *
4
+ * Records per-step model thinking, text output, and tool calls as a structured
5
+ * JSON trace. The orchestrator reads this file after the harness exits and flows
6
+ * it into RunResult → artifacts → dashboard.
7
+ *
8
+ * Transport: writes to ARCHAL_AGENT_TRACE_FILE (set by orchestrator).
9
+ * Safe no-op when the env var is not set.
10
+ *
11
+ * Trace format:
12
+ * { version: 1, steps: [ { step, thinking, text, toolCalls, durationMs } ] }
13
+ */
14
+ import { writeFileSync } from 'node:fs';
15
+
16
+ /**
17
+ * @typedef {Object} TraceStep
18
+ * @property {number} step - 1-indexed step number
19
+ * @property {string|null} thinking - Model's internal reasoning (extended thinking / reasoning_content)
20
+ * @property {string|null} text - Model's visible text output (reasoning "out loud")
21
+ * @property {Array<{name: string, arguments: object}>} toolCalls - Tools called this step
22
+ * @property {number} durationMs - LLM call duration for this step
23
+ */
24
+
25
/**
 * Build an in-memory collector for agent trace steps.
 *
 * Steps are buffered in call order and only persisted when `flush()` is
 * invoked. The destination is the path held in the ARCHAL_AGENT_TRACE_FILE
 * environment variable, read at flush time; when it is unset or empty,
 * `flush()` does nothing.
 *
 * @returns {{ addStep: (step: TraceStep) => void, flush: () => void }}
 */
export function createAgentTrace() {
  /** @type {TraceStep[]} */
  const recorded = [];

  /**
   * Append one step to the buffer (stored by reference, not copied).
   * @param {TraceStep} step
   */
  const addStep = (step) => {
    recorded.push(step);
  };

  /**
   * Serialize everything recorded so far as `{ version: 1, steps }` and
   * write it to the trace file, replacing any previous content.
   */
  const flush = () => {
    const destination = process.env['ARCHAL_AGENT_TRACE_FILE'];
    if (destination) {
      try {
        writeFileSync(destination, JSON.stringify({ version: 1, steps: recorded }));
      } catch {
        // Tracing is best-effort: a serialization or write failure must not
        // propagate into the harness.
      }
    }
  };

  return { addStep, flush };
}
@@ -0,0 +1,176 @@
1
+ /**
2
+ * Structured logging helper for bundled harnesses.
3
+ * Outputs JSON lines (one JSON object per line) to stderr.
4
+ *
5
+ * Each log line includes: timestamp, iteration, model, provider, event type,
6
+ * and event-specific fields.
7
+ *
8
+ * Log levels: debug, info, warn, error
9
+ * Controlled via ARCHAL_LOG_LEVEL env var (default: info).
10
+ */
11
+
12
+ // ── Log levels ──────────────────────────────────────────────────────
13
+
14
/**
 * Numeric severity for each supported log level; a higher number is more severe.
 * @enum {number}
 */
const LOG_LEVELS = Object.freeze({
  debug: 0,
  info: 1,
  warn: 2,
  error: 3,
});

/**
 * Translate a level name into its numeric severity.
 *
 * Matching is case-insensitive and ignores surrounding whitespace. Only the
 * four names defined on LOG_LEVELS are recognised; anything else (including
 * names such as "constructor" that exist on Object.prototype) falls back to
 * the `info` severity, so the result is always a number.
 *
 * @param {string|undefined} name - Requested level name, e.g. "debug"
 * @returns {number} Severity from LOG_LEVELS
 */
function resolveLogLevel(name) {
  const key = typeof name === 'string' ? name.trim().toLowerCase() : '';
  // Own-property check: a plain `LOG_LEVELS[key]` lookup would also find
  // inherited members and return a function instead of a severity.
  return Object.prototype.hasOwnProperty.call(LOG_LEVELS, key)
    ? LOG_LEVELS[key]
    : LOG_LEVELS.info;
}

// Threshold for this process, read once at module load from ARCHAL_LOG_LEVEL.
const currentLevel = resolveLogLevel(process.env['ARCHAL_LOG_LEVEL']);
23
+
24
+ // ── Logger factory ──────────────────────────────────────────────────
25
+
26
+ /**
27
+ * @typedef {Object} LogContext
28
+ * @property {string} harness - Harness name (e.g. "react")
29
+ * @property {string} model - Model identifier
30
+ * @property {string} provider - Provider name
31
+ */
32
+
33
+ /**
34
+ * @typedef {Object} Logger
35
+ * @property {function} debug - Log at debug level
36
+ * @property {function} info - Log at info level
37
+ * @property {function} warn - Log at warn level
38
+ * @property {function} error - Log at error level
39
+ * @property {function} tokenUsage - Log token usage event
40
+ * @property {function} toolCall - Log tool call event
41
+ * @property {function} toolError - Log tool error event
42
+ * @property {function} llmCall - Log LLM call event
43
+ * @property {function} llmResponse - Log LLM response event
44
+ * @property {function} summary - Log run summary event
45
+ */
46
+
47
/**
 * Create a structured logger bound to a harness context.
 *
 * Each entry is written to stderr as one JSON object followed by a newline.
 * Entries whose level is below the module-level `currentLevel` threshold are
 * dropped. Logging is best-effort: values that cannot be serialized are
 * replaced by a placeholder rather than throwing into the caller.
 *
 * @param {LogContext} context
 * @returns {Logger}
 */
export function createLogger(context) {
  const { harness, model, provider } = context;

  /**
   * Describe a thrown value as text.
   * @param {unknown} err
   * @returns {string}
   */
  function describeError(err) {
    return err instanceof Error ? err.message : String(err);
  }

  /**
   * JSON-encode a value without ever throwing or returning a non-string.
   * `JSON.stringify` returns `undefined` for inputs such as `undefined` and
   * throws on cycles and BigInt; both cases are mapped to text here.
   * @param {unknown} value
   * @returns {string}
   */
  function safeStringify(value) {
    try {
      const json = JSON.stringify(value);
      return typeof json === 'string' ? json : String(value);
    } catch (err) {
      return `[unserializable: ${describeError(err)}]`;
    }
  }

  /**
   * Write a structured log line to stderr.
   * @param {'debug' | 'info' | 'warn' | 'error'} level
   * @param {string} event
   * @param {Record<string, unknown>} [fields]
   * @param {number} [iteration]
   */
  function log(level, event, fields = {}, iteration = undefined) {
    if (LOG_LEVELS[level] < currentLevel) return;

    const base = {
      ts: new Date().toISOString(),
      level,
      harness,
      model,
      provider,
      event,
      ...(iteration !== undefined ? { iteration } : {}),
    };

    let serialized;
    try {
      // Event fields are spread last, so a field that shares a name with a
      // base key replaces it in the emitted line.
      serialized = JSON.stringify({ ...base, ...fields });
    } catch (err) {
      // Keep the entry (minus the offending fields) instead of losing it.
      serialized = safeStringify({ ...base, logError: describeError(err) });
    }
    process.stderr.write(serialized + '\n');
  }

  return {
    debug: (event, fields, iteration) => log('debug', event, fields, iteration),
    info: (event, fields, iteration) => log('info', event, fields, iteration),
    warn: (event, fields, iteration) => log('warn', event, fields, iteration),
    error: (event, fields, iteration) => log('error', event, fields, iteration),

    /**
     * Log token usage for an LLM call.
     * @param {number} iteration
     * @param {object} usage - { inputTokens, outputTokens }
     * @param {object} cumulative - { inputTokens, outputTokens }
     */
    tokenUsage(iteration, usage, cumulative) {
      log('info', 'token_usage', {
        inputTokens: usage.inputTokens,
        outputTokens: usage.outputTokens,
        cumulativeInputTokens: cumulative.inputTokens,
        cumulativeOutputTokens: cumulative.outputTokens,
      }, iteration);
    },

    /**
     * Log a tool call. The arguments are JSON-encoded and shortened to at
     * most 200 characters before being logged.
     * @param {number} iteration
     * @param {string} toolName
     * @param {object} args - Tool arguments
     * @param {number} durationMs
     */
    toolCall(iteration, toolName, args, durationMs) {
      log('info', 'tool_call', {
        tool: toolName,
        args: truncate(safeStringify(args), 200),
        durationMs,
      }, iteration);
    },

    /**
     * Log a tool error. The message is shortened to at most 500 characters;
     * non-string values (e.g. an Error instance) are converted to text first.
     * @param {number} iteration
     * @param {string} toolName
     * @param {string} errorMessage
     */
    toolError(iteration, toolName, errorMessage) {
      const message = typeof errorMessage === 'string' ? errorMessage : String(errorMessage);
      log('error', 'tool_error', {
        tool: toolName,
        error: truncate(message, 500),
      }, iteration);
    },

    /**
     * Log an LLM call start (debug level).
     * @param {number} iteration
     */
    llmCall(iteration) {
      log('debug', 'llm_call_start', {}, iteration);
    },

    /**
     * Log an LLM response. `stopReason` is omitted from the entry when falsy.
     * @param {number} iteration
     * @param {number} durationMs
     * @param {boolean} hasToolCalls
     * @param {string|null} stopReason
     */
    llmResponse(iteration, durationMs, hasToolCalls, stopReason) {
      log('info', 'llm_response', {
        durationMs,
        hasToolCalls,
        ...(stopReason ? { stopReason } : {}),
      }, iteration);
    },

    /**
     * Log a run summary at the end.
     * @param {object} stats
     * @param {number} stats.iterations
     * @param {number} stats.totalInputTokens
     * @param {number} stats.totalOutputTokens
     * @param {number} stats.totalTimeMs
     * @param {number} stats.toolCallCount
     * @param {number} stats.toolErrorCount
     * @param {string} stats.exitReason
     */
    summary(stats) {
      log('info', 'run_summary', stats);
    },
  };
}
166
+
167
/**
 * Shorten text to at most `maxLen` characters, marking the cut with "...".
 *
 * Text that already fits is returned unchanged. Longer text is cut so that
 * the kept prefix plus the three-character ellipsis is exactly `maxLen` long.
 * When `maxLen` is below 3 there is no room for a prefix plus the ellipsis,
 * so the text is simply cut to `maxLen` characters (never negative).
 *
 * Non-string input is converted with `String()`; `null` and `undefined`
 * become the empty string, so the function never throws on missing values.
 *
 * @param {string} str
 * @param {number} maxLen
 * @returns {string}
 */
function truncate(str, maxLen) {
  const text = typeof str === 'string' ? str : String(str ?? '');
  if (text.length <= maxLen) return text;
  // A negative slice end would count from the end of the string and make the
  // result longer than the limit, so small limits are handled separately.
  if (maxLen < 3) return text.slice(0, Math.max(0, maxLen));
  return `${text.slice(0, maxLen - 3)}...`;
}
@@ -0,0 +1,80 @@
1
+ /**
2
+ * Shared MCP client helper for bundled harnesses.
3
+ * Connects to cloud-hosted twins via HTTP MCP transport.
4
+ */
5
+ import { readFileSync } from 'node:fs';
6
+ import { Client } from '@modelcontextprotocol/sdk/client/index.js';
7
+ import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js';
8
+ import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js';
9
+
10
/**
 * Connect to the first MCP server listed in an MCP config JSON file.
 *
 * The file is expected to contain `{ "mcpServers": { "<name>": { "url": ... } } }`;
 * only the first entry of `mcpServers` is used. The StreamableHTTP transport
 * is tried first and the SSE transport is the fallback. Each attempt uses its
 * own Client instance so a failed first attempt cannot leave state behind.
 *
 * @param {string} configPath - Path to the MCP config JSON file
 * @returns {Promise<{ client: Client, serverName: string }>}
 * @throws {Error} When `configPath` is empty, the config lists no server, the
 *   first server has no `url`, or both transports fail to connect. Errors
 *   from reading or parsing the file propagate unchanged.
 */
export async function connectMcp(configPath) {
  if (!configPath) {
    throw new Error('ARCHAL_MCP_CONFIG is not set — no MCP server config available');
  }

  const config = JSON.parse(readFileSync(configPath, 'utf-8'));
  const servers = config?.mcpServers;
  // A missing or non-object `mcpServers` is reported the same way as an empty one.
  const serverName =
    servers !== null && typeof servers === 'object' ? Object.keys(servers)[0] : undefined;
  if (!serverName) {
    throw new Error('No MCP servers found in config');
  }

  const mcpUrl = servers[serverName]?.url;
  if (!mcpUrl) {
    throw new Error(`MCP server "${serverName}" has no URL — cannot connect via HTTP`);
  }

  const clientInfo = { name: 'archal-harness-agent', version: '1.0.0' };
  const describe = (err) => (err instanceof Error ? err.message : String(err));

  // Try StreamableHTTP first (modern MCP transport)
  let streamableError;
  try {
    const client = new Client(clientInfo);
    await client.connect(new StreamableHTTPClientTransport(new URL(mcpUrl)));
    return { client, serverName };
  } catch (err) {
    // StreamableHTTP may not be supported — remember why and fall back to SSE
    streamableError = err;
  }

  // Fall back to SSE transport
  try {
    const client = new Client(clientInfo);
    await client.connect(new SSEClientTransport(new URL(mcpUrl)));
    return { client, serverName };
  } catch (err) {
    // Report both failures: the first one is often the more informative.
    throw new Error(
      `Failed to connect to MCP server "${serverName}" at ${mcpUrl}: ${describe(err)}` +
        ` (StreamableHTTP attempt failed first: ${describe(streamableError)})`,
      { cause: err },
    );
  }
}
54
+
55
/**
 * List the tools exposed by a connected MCP server in a reduced shape.
 *
 * Only `name`, `description` and `inputSchema` are kept from each tool. A
 * missing (null/undefined) description becomes '' and a missing input schema
 * becomes {}.
 *
 * @param {Client} client - Object exposing an async `listTools()` method
 * @returns {Promise<Array<{ name: string, description: string, inputSchema: object }>>}
 */
export async function discoverTools(client) {
  const listing = await client.listTools();
  const summaries = [];
  for (const tool of listing.tools) {
    summaries.push({
      name: tool.name,
      description: tool.description ?? '',
      inputSchema: tool.inputSchema ?? {},
    });
  }
  return summaries;
}
68
+
69
/**
 * Call a tool on the MCP server and return its text content.
 *
 * The `text` of every content block is joined with newlines; blocks without
 * a `text` property contribute an empty string. When the result carries no
 * content blocks at all (missing, not an array, or an empty array) the
 * placeholder 'No output' is returned. Any error flag on the result is not
 * inspected here; only the content is read.
 *
 * @param {Client} client - Object exposing an async `callTool()` method
 * @param {string} name - Tool name
 * @param {object} [args] - Tool arguments; null/undefined is sent as {}
 * @returns {Promise<string>}
 */
export async function callTool(client, name, args) {
  const result = await client.callTool({ name, arguments: args ?? {} });
  const blocks = Array.isArray(result?.content) ? result.content : [];
  // An empty array would otherwise join to '', bypassing the placeholder.
  if (blocks.length === 0) return 'No output';
  return blocks.map((block) => block?.text ?? '').join('\n');
}
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Structured metrics writer for archal harnesses.
3
+ *
4
+ * Writes a JSON metrics file to the path specified by ARCHAL_METRICS_FILE.
5
+ * The orchestrator creates this path, reads it after the harness exits, and
6
+ * flows the data into RunResult.tokenUsage and telemetry.
7
+ *
8
+ * Safe no-op when ARCHAL_METRICS_FILE is not set (external harnesses that
9
+ * don't know about this protocol, or older orchestrator versions).
10
+ *
11
+ * @param {object} metrics
12
+ * @param {number} metrics.inputTokens
13
+ * @param {number} metrics.outputTokens
14
+ * @param {number} metrics.llmCallCount
15
+ * @param {number} metrics.toolCallCount
16
+ * @param {number} metrics.toolErrorCount
17
+ * @param {number} metrics.totalTimeMs
18
+ * @param {string} metrics.exitReason
19
+ * @param {string} [metrics.provider]
20
+ * @param {string} [metrics.model]
21
+ */
22
+ import { writeFileSync } from 'node:fs';
23
+
24
/**
 * Persist harness metrics as JSON to the file named by ARCHAL_METRICS_FILE.
 *
 * The written object is `{ version: 1, ...metrics }`. Nothing happens when
 * the environment variable is unset or empty, and write/serialization
 * failures are ignored so metrics can never fail the run.
 *
 * @param {object} metrics - Metric fields to record
 * @returns {void}
 */
export function writeMetrics(metrics) {
  const target = process.env['ARCHAL_METRICS_FILE'];
  if (target) {
    try {
      writeFileSync(target, JSON.stringify({ version: 1, ...metrics }));
    } catch {
      // Best-effort only: deliberately ignore failures.
    }
  }
}