archal 0.9.18 → 0.9.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -1
- package/agents/github-octokit/.archal.json +8 -0
- package/agents/github-octokit/Dockerfile +8 -0
- package/agents/github-octokit/README.md +113 -0
- package/agents/github-octokit/agent.mjs +54 -0
- package/agents/github-octokit/package.json +9 -0
- package/agents/github-octokit/scenarios/test-repo-access.md +27 -0
- package/agents/google-workspace-local-tools/Dockerfile +6 -0
- package/agents/google-workspace-local-tools/README.md +58 -0
- package/agents/google-workspace-local-tools/agent.mjs +196 -0
- package/agents/google-workspace-local-tools/archal-harness.json +7 -0
- package/agents/google-workspace-local-tools/run-input.yaml +16 -0
- package/agents/google-workspace-local-tools/scenario.md +29 -0
- package/agents/hermes/.archal.json +8 -0
- package/agents/hermes/Dockerfile +46 -0
- package/agents/hermes/README.md +87 -0
- package/agents/hermes/SOUL.md +27 -0
- package/agents/hermes/config.yaml +34 -0
- package/agents/hermes/drive.mjs +113 -0
- package/agents/hermes/scenarios/stripe-customers-read-only.md +32 -0
- package/agents/openclaw/.archal.json +8 -0
- package/agents/openclaw/Dockerfile +96 -0
- package/agents/openclaw/README.md +120 -0
- package/agents/openclaw/drive.mjs +311 -0
- package/agents/openclaw/package.json +9 -0
- package/agents/openclaw/scenarios/github-issue-triage-read-only.md +44 -0
- package/agents/openclaw/workspace/AGENTS.md +23 -0
- package/agents/openclaw/workspace/IDENTITY.md +8 -0
- package/agents/openclaw/workspace/SOUL.md +14 -0
- package/agents/openclaw/workspace/TOOLS.md +35 -0
- package/agents/pagination-test/README.md +24 -0
- package/agents/pagination-test/scenario.md +24 -0
- package/agents/replay-capsule-harness/README.md +29 -0
- package/agents/replay-capsule-harness/observability-install-offline-e2e.mts +1517 -0
- package/agents/replay-capsule-harness/replay-capsule-e2e.mjs +104 -0
- package/clone-assets/apify/tools.json +213 -13
- package/clone-assets/calcom/tools.json +510 -0
- package/clone-assets/clickup/tools.json +1258 -0
- package/clone-assets/customerio/tools.json +386 -0
- package/clone-assets/datadog/tools.json +734 -0
- package/clone-assets/github/tools.json +312 -25
- package/clone-assets/gitlab/tools.json +999 -0
- package/clone-assets/google-workspace/tools.json +18 -6
- package/clone-assets/hubspot/tools.json +1406 -0
- package/clone-assets/jira/fidelity.json +1 -1
- package/clone-assets/jira/tools.json +266 -543
- package/clone-assets/linear/tools.json +238 -40
- package/clone-assets/ownerrez/tools.json +548 -0
- package/clone-assets/pricelabs/tools.json +343 -0
- package/clone-assets/sentry/tools.json +745 -0
- package/clone-assets/slack/tools.json +1 -2
- package/clone-assets/stripe/tools.json +185 -46
- package/clone-assets/supabase/tools.json +511 -14
- package/clone-assets/unipile/tools.json +408 -0
- package/clone-assets/webflow/tools.json +415 -0
- package/dist/autoloop-worker-types-BEb_E44z.d.cts +196 -0
- package/dist/cli.cjs +151033 -75282
- package/dist/commands/autoloop-hosted-worker.cjs +43942 -0
- package/dist/commands/autoloop-hosted-worker.d.cts +143 -0
- package/dist/commands/autoloop-pr-verification.cjs +4227 -0
- package/dist/commands/autoloop-pr-verification.d.cts +17 -0
- package/dist/{vitest/chunk-IVXSSEYS.js → commands/autoloop-result-parser.cjs} +16515 -18857
- package/dist/commands/autoloop-result-parser.d.cts +39 -0
- package/dist/commands/autoloop-worker.cjs +36163 -0
- package/dist/commands/autoloop-worker.d.cts +97 -0
- package/dist/harness.cjs +1 -0
- package/dist/index.cjs +1 -1
- package/dist/replay.cjs +49624 -0
- package/dist/replay.d.cts +4625 -0
- package/dist/scenarios.cjs +80343 -0
- package/dist/scenarios.d.cts +562 -0
- package/dist/vitest/chunk-6CBYFCFK.js +4667 -0
- package/dist/vitest/chunk-ARVS45PP.js +2764 -0
- package/dist/vitest/index.cjs +6079 -75089
- package/dist/vitest/index.d.ts +7 -6
- package/dist/vitest/index.js +8 -8
- package/dist/vitest/runtime/hosted-session-reaper.cjs +801 -34187
- package/dist/vitest/runtime/hosted-session-reaper.js +1 -1
- package/dist/vitest/runtime/setup-files.js +2 -2
- package/package.json +14 -9
- package/skills/archal-agent/SKILL.md +87 -0
- package/skills/autoloop/SKILL.md +376 -0
- package/skills/autoloop/references/hosted-sources.md +62 -0
- package/skills/autoloop/references/trace-schema-mapping.md +73 -0
- package/skills/eval/SKILL.md +35 -1
- package/skills/install-agent/SKILL.md +221 -0
- package/skills/onboard/SKILL.md +80 -0
- package/skills/scenario/SKILL.md +19 -4
- package/skills/seed/SKILL.md +237 -0
- package/dist/seed/dynamic-generator.cjs +0 -45564
- package/dist/seed/dynamic-generator.d.cts +0 -106
- package/dist/vitest/chunk-CTSN67QR.js +0 -47188
|
@@ -0,0 +1,562 @@
|
|
|
1
|
+
/** A single field-level change: the field's name plus its value before and after. */
interface FieldChange {
  field: string;
  before: unknown;
  after: unknown;
}
|
|
6
|
+
/** The field changes recorded for one entity, identified by a numeric id and a type name. */
interface EntityChange {
  entityId: number;
  entityType: string;
  fields: FieldChange[];
}
|
|
11
|
+
/**
 * A state diff split into `added`, `modified` and `removed` buckets.
 *
 * Every bucket is a string-keyed map (presumably keyed by collection name —
 * confirm against the producer). `removed` holds string or number values
 * (presumably entity ids) rather than whole records.
 */
interface StateDiff {
  added: Record<string, unknown[]>;
  modified: Record<string, unknown[]>;
  removed: Record<string, (string | number)[]>;
  /** Optional per-entity field-level changes. */
  fieldChanges?: EntityChange[];
}
|
|
17
|
+
/** An event record: a type tag, an opaque payload, a timestamp string and an optional source rule. */
interface CloneEvent {
  type: string;
  payload: unknown;
  timestamp: string;
  sourceRule?: string;
}
|
|
23
|
+
|
|
24
|
+
/** A typed reference to another span, addressed by trace id and span id. */
interface TraceLink {
  traceId: string;
  spanId: string;
  type: 'retry' | 'read_after_write' | 'write_after_write' | 'fan_out' | 'async' | 'manual';
}
|
|
29
|
+
/** Optional retry parameters: attempt cap, backoff in milliseconds and backoff strategy. */
interface RetryPolicy {
  maxAttempts?: number;
  backoffMs?: number;
  strategy?: 'fixed' | 'linear' | 'exponential' | 'unknown';
}
|
|
34
|
+
/**
 * Optional details of an outbound HTTP request: request line parts, status,
 * latency in milliseconds and request/response sizes in bytes.
 */
interface OutboundHttpTrace {
  method?: string;
  scheme?: 'http' | 'https';
  host?: string;
  route?: string;
  path?: string;
  statusCode?: number;
  latencyMs?: number;
  requestBytes?: number;
  responseBytes?: number;
}
|
|
45
|
+
/**
 * A typed causal reference to another span. Same shape as `TraceLink`, with
 * `type` narrowed to the retry / read-after-write / write-after-write kinds.
 */
interface CausalLink {
  traceId: string;
  spanId: string;
  type: 'retry' | 'read_after_write' | 'write_after_write';
}
|
|
50
|
+
/** Closed set of error kinds a `TraceError` may carry in its `kind` field. */
type TraceErrorKind =
  | 'validation_error'
  | 'not_found'
  | 'authentication_error'
  | 'permission_denied'
  | 'rate_limited'
  | 'timeout'
  | 'network_error'
  | 'server_error'
  | 'agent_error'
  | 'unknown';
|
|
51
|
+
/**
 * Error attached to a trace entry. Only `code` and `message` are required;
 * everything else is optional enrichment (classification, HTTP status, retry
 * hints, stack information and a chain of causes).
 */
interface TraceError {
  code: string;
  message: string;
  kind?: TraceErrorKind;
  class?: string;
  normalizedCode?: string;
  name?: string;
  statusCode?: number;
  retryable?: boolean;
  retryAfterSeconds?: number;
  stack?: string;
  stackHash?: string;
  /** Causes of this error; every field of a cause is optional. */
  causeChain?: Array<{
    class?: string;
    code?: string;
    message?: string;
  }>;
  retryPolicy?: RetryPolicy;
  details?: unknown;
}
|
|
71
|
+
/**
 * One recorded tool call. The required core is the entry `id`, the tool name,
 * its `input`/`output`, a `timestamp`, `durationMs`, and the nullable
 * `stateMutations` and `error`; all other fields are optional metadata.
 */
interface TraceEntry {
  id: string;
  toolName: string;
  traceId?: string;
  spanId?: string;
  parentSpanId?: string | null;
  links?: TraceLink[];
  runIndex?: number;
  twinName?: string;
  sessionId?: string;
  input: Record<string, unknown>;
  output: unknown;
  startTimestamp?: string;
  endTimestamp?: string;
  queuedAt?: string;
  dequeuedAt?: string;
  startedAt?: string;
  endedAt?: string;
  uploadedAt?: string;
  timestamp: string;
  durationMs: number;
  /** State diff for this call, or `null`. */
  stateMutations: StateDiff | null;
  /** Error for this call, or `null`. */
  error: TraceError | null;
  sequenceIndex?: number;
  requestBytes?: number;
  responseBytes?: number;
  entitiesRead?: Array<{
    collection: string;
    id: string | number;
  }>;
  entitiesWritten?: Array<{
    collection: string;
    id: string | number;
    action: 'create' | 'update' | 'delete';
  }>;
  isRetry?: boolean;
  retriedFromId?: string;
  retryCount?: number;
  rateLimitRemaining?: number;
  triggeredRateLimit?: boolean;
  errorCategory?: 'agent_error' | 'validation_error' | 'not_found' | 'permission_denied' | 'rate_limited' | 'server_error' | 'none';
  causalLinks?: string[];
  causalLinkDetails?: CausalLink[];
  outboundHttp?: OutboundHttpTrace;
  /** Optional agent-side details: provider request id, rationale summary, token usage, cost and latency. */
  agentInternals?: {
    providerRequestId?: string;
    toolChoiceRationaleSummary?: string;
    tokenUsage?: {
      input?: number;
      output?: number;
      total?: number;
    };
    costUsd?: number;
    latencyMs?: number;
  };
}
|
|
127
|
+
|
|
128
|
+
/**
 * Reasons why an LLM-judged criterion could not be evaluated and must be
 * surfaced as `status: 'error'` (i.e. excluded from the weighted satisfaction
 * score and called out in the report). See #2507.
 *
 *  `rate_limited`   — Upstream 429 (Gemini RPM/TPM exceeded, or a direct
 *                     provider rate limit). Retry in a later run.
 *  `upstream_5xx`   — 5xx from upstream, including a proxy 502 that wraps
 *                     "Gemini API error: 429 Resource exhausted" (our proxy
 *                     translates 429 to 502 on the wire).
 *  `auth_error`     — 401/403 from upstream (misconfigured or revoked key).
 *  `provider_error` — Generic/unknown provider failure (kept for back-compat;
 *                     no longer silently degrades to a 0% score).
 */
type LlmEvalErrorReason = 'rate_limited' | 'upstream_5xx' | 'auth_error' | 'provider_error';
|
|
143
|
+
/** Outcome of evaluating a single criterion, identified by `criterionId`. */
interface EvaluationResult {
  criterionId: string;
  /**
   * `error` means the criterion could NOT be evaluated because of an upstream
   * LLM failure (429/5xx/auth). It must not be treated as a pass or a fail for
   * the satisfaction calculation — the final report should surface "N criteria
   * unevaluated, results incomplete" rather than silently under-reporting.
   *
   * `skipped` means evaluation was deliberately short-circuited as a cost
   * optimization — currently only when every deterministic criterion already
   * failed, so spending judge calls on the probabilistic criteria is wasteful.
   * Like `error`, it is EXCLUDED from the weighted satisfaction average (it is
   * not evidence of failure). Unlike `error`, it does NOT mark the run
   * incomplete/untrustworthy: the deterministic verdict is conclusive, so the
   * score stands. See `calculateOverallScore` and `score-trust.ts`.
   */
  status: 'pass' | 'fail' | 'partial' | 'error' | 'skipped';
  confidence: number;
  explanation: string;
  fallbackRecommended?: boolean;
  fallbackFailureReason?: 'missing_credentials' | 'rate_limited' | 'context_too_large' | 'provider_error';
  /** Structured reason when `status === 'error'`. Surfaced in the report summary. */
  errorReason?: LlmEvalErrorReason;
  /**
   * Continuous [0,1] grade for criteria that admit partial credit (currently
   * the count assertions: exact/min/max_count). 1 means fully satisfied; a
   * fraction expresses how close the run came (e.g. 3 of 5 expected = 0.6).
   *
   * This is INERT metadata for the satisfaction calculation: `getCriterionScore`
   * and `calculateOverallScore` switch on `status` only and never read this
   * field, so `overallScore` is byte-identical whether or not it is present.
   * Only the RL reward path (`shapedScore` / `denseReward`, gated behind the
   * `--rl-reward` flag) consumes it, to give an RL agent a gradient instead of
   * a binary pass/fail. It is always computed on count results because it
   * costs nothing, and gating it would couple the pure assertion evaluators to
   * run config.
   */
  partialScore?: number;
}
|
|
182
|
+
/** Token counts (input and output), with optional call count, provider and model. */
interface TokenUsageSummary {
  inputTokens: number;
  outputTokens: number;
  llmCallCount?: number;
  provider?: string;
  model?: string;
}
|
|
189
|
+
/** Whether agent token usage was reported, or was not reported by the harness. */
type AgentTokenUsageStatus =
  | 'reported'
  | 'not_reported_by_harness';
|
|
190
|
+
/**
 * Trust summary for a score: a complete/incomplete status plus the counters
 * (evaluator errors, unscored run failures, tool errors, scored and total runs).
 */
interface ScoreTrustSummary {
  status: 'complete' | 'incomplete';
  evaluatorErrorCount: number;
  unscoredRunFailureCount: number;
  toolErrorCount: number;
  scoredRunCount: number;
  totalRunCount: number;
}
|
|
198
|
+
/** Sub-classification of a capability miss; also the discriminant of `CapabilityMissDetails`. */
type CapabilityMissSubkind =
  | 'missing_handler'
  | 'unknown_tool'
  | 'unimplemented_route'
  | 'unsupported_method'
  | 'shim_target_missing';
|
|
199
|
+
/** The surface on which a capability miss was observed. */
type CapabilityMissSurface =
  | 'mcp'
  | 'tools_call'
  | 'rest'
  | 'graphql_shim'
  | 'alias_shim';
|
|
200
|
+
/** Two-level severity of a capability miss. */
type CapabilityMissSeverity =
  | 'high'
  | 'low';
|
|
201
|
+
/** Alert channel recorded in a capability miss event's `handling` section. */
type CapabilityMissAlertChannel =
  | 'none'
  | 'derived';
|
|
202
|
+
/**
 * Details of a capability miss, as a union discriminated by `subkind`.
 * Each variant carries only the fields relevant to that subkind.
 */
type CapabilityMissDetails =
  | {
      subkind: 'unknown_tool';
      requestedTool: string;
    }
  | {
      subkind: 'missing_handler';
      resolvedTool: string;
    }
  | {
      subkind: 'unsupported_method';
      method: string;
      route: string;
    }
  | {
      subkind: 'unimplemented_route';
      method: string;
      route: string;
    }
  | {
      subkind: 'shim_target_missing';
      shim: string;
      resolvedTool: string;
    };
|
|
221
|
+
/**
 * Version-1 `capability.miss` event: when it occurred, which run it belongs
 * to, how the miss is classified, what was targeted, subkind-specific details
 * and how it was handled.
 *
 * NOTE(review): `miss.subkind` and `details.subkind` are typed independently,
 * so the type system does not force them to agree — confirm the producer
 * keeps them in sync.
 */
interface CapabilityMissEvent {
  version: 1;
  event: 'capability.miss';
  occurredAt: string;
  run: {
    sessionId?: string;
    runId?: string;
    scenarioId?: string;
  };
  miss: {
    subkind: CapabilityMissSubkind;
    surface: CapabilityMissSurface;
    severity: CapabilityMissSeverity;
    fingerprint: string;
  };
  target: {
    twin: string;
    toolName?: string;
    method?: string;
    route?: string;
  };
  details: CapabilityMissDetails;
  handling: {
    abortedRun: boolean;
    alerted: boolean;
    alertChannel: CapabilityMissAlertChannel;
  };
}
|
|
249
|
+
/**
 * One row of the capability-miss shadow report: a fingerprint with its
 * classification, affected-run and occurrence counts, first/last seen
 * timestamps and a preventability class.
 */
interface CapabilityMissShadowReportRow {
  fingerprint: string;
  twin: string;
  surface: CapabilityMissSurface;
  subkind: CapabilityMissSubkind;
  severity: CapabilityMissSeverity;
  runsAffected: number;
  occurrenceCount: number;
  firstSeenAt: string;
  lastSeenAt: string;
  preventableClass: 'tools_surface_parity' | 'rest_surface_parity' | 'shim_wiring' | 'runtime_only_unknown_tool' | 'not_preventable_yet';
}
|
|
261
|
+
/** Closed set of outcomes a run may report in `RunResult.outcome`. */
type RunOutcome =
  | 'completed'
  | 'degraded'
  | 'failed_agent'
  | 'no_tool_calls'
  | 'insufficient_action'
  | 'inconclusive_infrastructure'
  | 'inconclusive_seed'
  | 'twin_auth_failed';
|
|
262
|
+
/**
 * One step of an agent trace: step number, nullable thinking and text, the
 * tool calls made (name plus opaque arguments) and the step duration in ms.
 */
interface AgentTraceStep {
  step: number;
  thinking: string | null;
  text: string | null;
  toolCalls: Array<{
    name: string;
    arguments: unknown;
  }>;
  durationMs: number;
}
|
|
272
|
+
/** Closed set of failure reason codes a run may report in `RunResult.failureReasonCode`. */
type FailureReasonCode =
  | 'capability_miss'
  | 'agent_no_tool_calls'
  | 'managed_llm_unavailable'
  | 'twin_auth_failed'
  | 'evaluator_unavailable'
  | 'agent_process_failed'
  | 'unknown';
|
|
273
|
+
/**
 * Result of a single run: its index, per-criterion evaluations, overall
 * score, tool-call trace and duration, plus optional diagnostics (error,
 * outcome, state snapshots, logs, token usage, events, capability misses)
 * and optional RL reward signals.
 */
interface RunResult {
  runIndex: number;
  evaluations: EvaluationResult[];
  overallScore: number;
  trace: TraceEntry[];
  durationMs: number;
  error?: string;
  outcome?: RunOutcome;
  failureReasonCode?: FailureReasonCode;
  stateBefore?: Record<string, unknown>;
  stateAfter?: Record<string, unknown>;
  stateDiff?: StateDiff;
  agentLog?: string;
  agentTrace?: AgentTraceStep[];
  tokenUsage?: TokenUsageSummary;
  judgeTokenUsage?: TokenUsageSummary;
  events?: Record<string, CloneEvent[]>;
  capabilityMisses?: CapabilityMissEvent[];
  agentResponseText?: string;
  /**
   * Terminal shaped reward in [0,100]: the same weighted criterion average as
   * `overallScore` but WITHOUT the critical-failure short-circuit (and with
   * partial credit from `EvaluationResult.partialScore`). It gives an RL agent
   * a gradient even when a critical gate trips `overallScore` to 0.
   * `shapedScore >= overallScore` always. Only populated when the run was
   * evaluated with the `--rl-reward` flag; `overallScore` is unaffected.
   */
  shapedScore?: number;
  /**
   * Step-indexed reward in [0,1], one scalar per trace entry, produced by
   * re-running the DETERMINISTIC criteria against successive trace prefixes
   * (trace[0..1], trace[0..2], ...). `denseReward[i]` is the graded weighted
   * average after step `i+1`. Deterministic-only (never invokes the LLM judge).
   * Only populated under the `--rl-reward` flag.
   *
   * NOTE: the state snapshot is terminal (held constant across prefixes), so
   * the gradient comes from trace-sensitive criteria (no_errors,
   * trace_contains, trace counts); pure state-assertion criteria contribute
   * their terminal value at every step. Per-step state reconstruction is a
   * documented follow-up.
   */
  denseReward?: number[];
}
|
|
314
|
+
/**
 * Report for one scenario: title, satisfaction score, the individual runs, a
 * summary and a timestamp, plus optional trust, criterion, token-usage, judge
 * and capability-miss metadata.
 */
interface ScenarioReport {
  scenarioTitle: string;
  satisfactionScore: number;
  scoreTrust?: ScoreTrustSummary;
  criterionDescriptions?: Record<string, string>;
  criterionTypes?: Record<string, 'deterministic' | 'probabilistic'>;
  /**
   * Canonical list of clones exercised by this scenario. Prefer this field
   * over `twinNames` for new code; both fields are populated during the
   * twin→clone rename deprecation window.
   */
  cloneNames?: string[];
  /** Legacy alias for `cloneNames`. Will be removed once external consumers migrate. */
  twinNames?: string[];
  runs: RunResult[];
  summary: string;
  failureAnalysis?: string;
  agentTokenUsageStatus?: AgentTokenUsageStatus;
  agentTokenUsage?: TokenUsageSummary;
  judgeTokenUsage?: TokenUsageSummary;
  /**
   * User-supplied evaluator model name. Managed evaluator model names are
   * intentionally omitted from public reports.
   */
  evaluatorModel?: string;
  judgeMode?: 'managed' | 'user_supplied';
  capabilityMissShadowReport?: CapabilityMissShadowReportRow[];
  rootTraceId?: string;
  timestamp: string;
}
|
|
344
|
+
|
|
345
|
+
/**
 * Follow-up to #2520. Failure analysis is the most operator-valuable output
 * when a run under-performs — the reviewer on #2507 called it out explicitly.
 * Before the fix, the catch block silently dropped the analysis on upstream
 * LLM errors (429/5xx/auth), so an operator would see a run with unexplained
 * `status: 'error'` criteria AND no analysis block, with no way to tell an
 * "infra outage" apart from "analysis suppressed because all criteria passed".
 * This type surfaces the same error taxonomy the primary evaluator uses
 * (#2520's classifyLlmError).
 */
interface FailureAnalysisError {
  reason: LlmEvalErrorReason;
  message: string;
}
|
|
359
|
+
|
|
360
|
+
/**
 * A `ScenarioReport` extended with managed-run identifiers and an optional
 * failure-analysis error.
 */
type ManagedScenarioReport = ScenarioReport & {
  scenarioSlug?: string;
  traceId?: string;
  // Also declared on ScenarioReport with the same type; repeating it here is redundant but harmless.
  rootTraceId?: string;
  /** Managed run id (`run-<uuid>`) — feed to `archal export run` / `inspect run`. */
  runId?: string;
  /**
   * #2520 follow-up — set when the failure-analysis LLM call was suppressed
   * by an upstream error (429/5xx/auth). When present, `failureAnalysis` will
   * be undefined; callers that render the run summary should surface a
   * "Failure analysis unavailable: <reason>" line so operators aren't left
   * guessing.
   */
  failureAnalysisError?: FailureAnalysisError;
};
|
|
375
|
+
|
|
376
|
+
/** Value of one `RuntimeConfig.env` entry: a string, or `undefined`. */
type EnvValue =
  | string
  | undefined;
|
|
377
|
+
/**
 * An extra host path mounted into the agent container under the Docker
 * harness; the mount is read-only by default. Structurally identical to the
 * engine's `DockerHarnessMount`, but kept local so the public SDK surface
 * does not depend on engine internals.
 */
interface RuntimeMount {
  source: string;
  target: string;
  readonly?: boolean;
}
|
|
387
|
+
/**
 * Input accepted by `runtime()`. `command` is a non-empty tuple of strings
 * (at least one element); every other field is optional.
 */
interface RuntimeConfig {
  command: [string, ...string[]];
  cwd?: string;
  /** Environment entries; a value may be `undefined` (see `EnvValue`). */
  env?: Record<string, EnvValue>;
  dockerfile?: string;
  mounts?: ReadonlyArray<RuntimeMount>;
}
|
|
394
|
+
/**
 * Runtime definition returned by `runtime()`, tagged `kind: 'runtime'`.
 * Unlike `RuntimeConfig`, the command is a single string with separate
 * `args`, `cwd` is required, and `env` values are always strings.
 */
interface RuntimeDefinition {
  kind: 'runtime';
  command: string;
  args: string[];
  env?: Record<string, string>;
  cwd: string;
  dockerfile?: string;
  mounts?: ReadonlyArray<RuntimeMount>;
}
|
|
403
|
+
/** A seed that points at a file path, tagged `kind: 'file'`. */
interface FileSeed {
  kind: 'file';
  path: string;
}
|
|
407
|
+
/** A clone identified by name, with an optional seed given as a string or a `FileSeed`. */
interface CloneDefinition {
  name: string;
  seed?: string | FileSeed;
}
|
|
411
|
+
/** What `clones()` accepts per argument: a plain string or a full `CloneDefinition`. */
type CloneSpec =
  | string
  | CloneDefinition;
|
|
412
|
+
/** Options for `file()`. */
interface FileSeedOptions {
  /** Optional string or URL; presumably the base for resolving a relative path — confirm against `file()`. */
  relativeTo?: string | URL;
}
|
|
415
|
+
/**
 * Optional run settings: run count, timeout (string or number), pass
 * threshold and judge model. Usable on both suites and scenarios.
 */
interface RunDefaults {
  runs?: number;
  timeout?: string | number;
  passThreshold?: number;
  judge?: {
    model?: string;
  };
}
|
|
423
|
+
/** Matcher over trace entries: a subject string plus an optional list of tool names. */
interface TraceMatcher {
  subject: string;
  toolNames?: string[];
}
|
|
427
|
+
/**
 * Count-based assertion: compares a count for `subject` against `value` as
 * an exact, minimum or maximum bound depending on `type`. The remaining
 * fields are optional filters and flags.
 */
interface CountScenarioAssertion {
  type: 'exact_count' | 'min_count' | 'max_count';
  subject: string;
  value: number;
  predicate?: string;
  targetService?: string;
  requireDiff?: boolean;
  titleIncludes?: string;
  labelFilter?: string;
  channelFilter?: string;
}
|
|
438
|
+
/**
 * Every structured assertion shape, as a union discriminated by `type`:
 * the count assertions, plus `exists`, `github_issue`, `output_contains`,
 * `trace_contains` and `trace_order`.
 */
type StructuredScenarioAssertion =
  | CountScenarioAssertion
  | {
      type: 'exists';
      subject: string;
      requireDiff?: boolean;
    }
  | {
      type: 'github_issue';
      subject: 'github issue';
      titleIncludes?: string;
      labelFilter?: string;
      requireDiff?: boolean;
    }
  | {
      type: 'output_contains';
      subject: string;
      contentPatterns: string[];
      negated?: boolean;
    }
  | {
      type: 'trace_contains';
      subject: string;
      toolNames: string[];
      negated?: boolean;
    }
  | {
      type: 'trace_order';
      subject: string;
      first: TraceMatcher;
      second: TraceMatcher;
    };
|
|
464
|
+
/**
 * One check in a scenario: a description, whether it is deterministic or
 * probabilistic, and an optional structured assertion and trace matcher.
 */
interface ScenarioCheck {
  description: string;
  type: 'deterministic' | 'probabilistic';
  structuredAssertion?: StructuredScenarioAssertion;
  traceMatcher?: TraceMatcher;
}
|
|
470
|
+
/** A scenario: name, task text and checks, with optional clones and run settings. */
interface ScenarioDefinition {
  name: string;
  task: string;
  checks: ScenarioCheck[];
  clones?: CloneDefinition[];
  run?: RunDefaults;
}
|
|
477
|
+
/**
 * A suite: a name, the agent runtime and the scenarios, with optional
 * suite-level clones, run settings and tags.
 */
interface SuiteDefinition {
  name: string;
  agent: RuntimeDefinition;
  clones?: CloneDefinition[];
  run?: RunDefaults;
  tags?: string[];
  scenarios: ScenarioDefinition[];
}
|
|
485
|
+
/**
 * Result for one scenario: name, pass flag, satisfaction value, the managed
 * report and the runs, plus optional trace identifiers.
 */
interface ScenarioRunResult {
  name: string;
  passed: boolean;
  satisfaction: number;
  report: ManagedScenarioReport;
  runs: RunResult[];
  traceId?: string;
  rootTraceId?: string;
}
|
|
494
|
+
/**
 * Result for a whole suite: name, pass flag, satisfaction value, the
 * per-scenario results and the managed reports.
 */
interface SuiteRunResult {
  name: string;
  passed: boolean;
  satisfaction: number;
  scenarios: ScenarioRunResult[];
  reports: ManagedScenarioReport[];
}
|
|
501
|
+
/** A service action, named by its tool name with optional aliases. */
interface ServiceAction {
  toolName: string;
  aliases?: string[];
}
|
|
505
|
+
/**
 * Optional count bounds for check helpers: `count`, `atLeast` and `atMost`
 * (presumably exact, minimum and maximum, matching the exact/min/max count
 * assertions — confirm against the helpers).
 */
interface CountOptions {
  count?: number;
  atLeast?: number;
  atMost?: number;
}
|
|
510
|
+
|
|
511
|
+
/**
 * Combines two checks into a single `ScenarioCheck`.
 * NOTE(review): judging by the name and the `trace_order` assertion's
 * `first`/`second` fields, this presumably asserts that `first` happened
 * before `second` — confirm against the implementation.
 */
declare function before(first: ScenarioCheck, second: ScenarioCheck): ScenarioCheck;
|
|
512
|
+
/**
 * Check builders that each take a text and return a `ScenarioCheck`.
 * NOTE(review): presumably these assert on the agent's response text
 * containing / not containing `text` — confirm against the implementation.
 */
declare const response: {
  includes(text: string): ScenarioCheck;
  doesNotInclude(text: string): ScenarioCheck;
};
|
|
516
|
+
/**
 * Check builders that each take a `ServiceAction` and return a `ScenarioCheck`.
 * NOTE(review): presumably these assert that the action was / was not called
 * during the run — confirm against the implementation.
 */
declare const activity: {
  didCall(action: ServiceAction): ScenarioCheck;
  didNotCall(action: ServiceAction): ScenarioCheck;
};
|
|
520
|
+
/**
 * Builds a `ScenarioCheck` from a free-text description.
 * NOTE(review): presumably yields a `probabilistic` (LLM-judged) check —
 * confirm against the implementation.
 */
declare function judge(description: string): ScenarioCheck;
|
|
521
|
+
/**
 * Type of the `github` export. It is callable — an optional `seed` in,
 * a `CloneDefinition` out — and also carries:
 *  - `issue`: builders that return a `ScenarioCheck`;
 *  - `issues`: one `ServiceAction` per key (`create`, `delete`, `close`, `list`).
 */
type GithubFactory = ((options?: {
  seed?: string | FileSeed;
}) => CloneDefinition) & {
  issue: {
    created(options?: CountOptions & {
      titleIncludes?: string;
    }): ScenarioCheck;
    closed(options?: CountOptions): ScenarioCheck;
    notClosed(): ScenarioCheck;
    exists(options?: {
      titleIncludes?: string;
    }): ScenarioCheck;
    /** Unlike the other builders, `options` (and its `label`) is required here. */
    hasLabel(options: {
      titleIncludes?: string;
      label: string;
    }): ScenarioCheck;
  };
  issues: Record<'create' | 'delete' | 'close' | 'list', ServiceAction>;
};
|
|
540
|
+
/** GitHub factory value; see `GithubFactory` for its call signature and members. */
declare const github: GithubFactory;
|
|
541
|
+
/**
 * Type of the `slack` export. It is callable — an optional `seed` in,
 * a `CloneDefinition` out — and also carries:
 *  - `message`: builders that return a `ScenarioCheck`, optionally scoped by `channel`;
 *  - `messages`: one `ServiceAction` per key (`post`, `history`).
 */
type SlackFactory = ((options?: {
  seed?: string | FileSeed;
}) => CloneDefinition) & {
  message: {
    posted(options?: CountOptions & {
      channel?: string;
    }): ScenarioCheck;
    notPosted(options?: {
      channel?: string;
    }): ScenarioCheck;
  };
  messages: Record<'post' | 'history', ServiceAction>;
};
|
|
554
|
+
/** Slack factory value; see `SlackFactory` for its call signature and members. */
declare const slack: SlackFactory;
|
|
555
|
+
|
|
556
|
+
/**
 * Turns a `RuntimeConfig` into a `RuntimeDefinition`.
 * NOTE(review): the config's `command` tuple presumably becomes the
 * definition's `command` + `args`, and `cwd` gets a default — confirm
 * against the implementation.
 */
declare function runtime(config: RuntimeConfig): RuntimeDefinition;
|
|
557
|
+
/**
 * Builds a `SuiteDefinition` from a name and the rest of the definition
 * (every `SuiteDefinition` field except `name`).
 */
declare function suite(name: string, definition: Omit<SuiteDefinition, 'name'>): SuiteDefinition;
|
|
558
|
+
/** Takes a suite definition and returns a promise that resolves to its `SuiteRunResult`. */
declare function run(definition: SuiteDefinition): Promise<SuiteRunResult>;
|
|
559
|
+
/**
 * Creates a `FileSeed` from a path given as a string or URL.
 * NOTE(review): `options.relativeTo` presumably sets the base used to
 * resolve a relative `path` — confirm against the implementation.
 */
declare function file(path: string | URL, options?: FileSeedOptions): FileSeed;
|
|
560
|
+
/**
 * Accepts any number of clone specs (plain strings or `CloneDefinition`
 * objects) and returns them as `CloneDefinition[]`.
 * NOTE(review): a string spec is presumably treated as the clone name — confirm.
 */
declare function clones(...specs: CloneSpec[]): CloneDefinition[];
|
|
561
|
+
|
|
562
|
+
export { type CloneDefinition, type CloneSpec, type CountOptions, type CountScenarioAssertion, type EnvValue, type FileSeed, type FileSeedOptions, type RunDefaults, type RuntimeConfig, type RuntimeDefinition, type RuntimeMount, type ScenarioCheck, type ScenarioDefinition, type ScenarioRunResult, type ServiceAction, type StructuredScenarioAssertion, type SuiteDefinition, type SuiteRunResult, type TraceMatcher, activity, before, clones, file, github, judge, response, run, runtime, slack, suite };
|