archal 0.9.19 → 0.9.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/README.md +9 -1
  2. package/agents/github-octokit/.archal.json +8 -0
  3. package/agents/github-octokit/Dockerfile +8 -0
  4. package/agents/github-octokit/README.md +113 -0
  5. package/agents/github-octokit/agent.mjs +54 -0
  6. package/agents/github-octokit/package.json +9 -0
  7. package/agents/github-octokit/scenarios/test-repo-access.md +27 -0
  8. package/agents/google-workspace-local-tools/Dockerfile +6 -0
  9. package/agents/google-workspace-local-tools/README.md +58 -0
  10. package/agents/google-workspace-local-tools/agent.mjs +196 -0
  11. package/agents/google-workspace-local-tools/archal-harness.json +7 -0
  12. package/agents/google-workspace-local-tools/run-input.yaml +16 -0
  13. package/agents/google-workspace-local-tools/scenario.md +29 -0
  14. package/agents/hermes/.archal.json +8 -0
  15. package/agents/hermes/Dockerfile +46 -0
  16. package/agents/hermes/README.md +87 -0
  17. package/agents/hermes/SOUL.md +27 -0
  18. package/agents/hermes/config.yaml +34 -0
  19. package/agents/hermes/drive.mjs +113 -0
  20. package/agents/hermes/scenarios/stripe-customers-read-only.md +32 -0
  21. package/agents/openclaw/.archal.json +8 -0
  22. package/agents/openclaw/Dockerfile +96 -0
  23. package/agents/openclaw/README.md +120 -0
  24. package/agents/openclaw/drive.mjs +311 -0
  25. package/agents/openclaw/package.json +9 -0
  26. package/agents/openclaw/scenarios/github-issue-triage-read-only.md +44 -0
  27. package/agents/openclaw/workspace/AGENTS.md +23 -0
  28. package/agents/openclaw/workspace/IDENTITY.md +8 -0
  29. package/agents/openclaw/workspace/SOUL.md +14 -0
  30. package/agents/openclaw/workspace/TOOLS.md +35 -0
  31. package/agents/pagination-test/README.md +24 -0
  32. package/agents/pagination-test/scenario.md +24 -0
  33. package/agents/replay-capsule-harness/README.md +29 -0
  34. package/agents/replay-capsule-harness/observability-install-offline-e2e.mts +1517 -0
  35. package/agents/replay-capsule-harness/replay-capsule-e2e.mjs +104 -0
  36. package/clone-assets/apify/tools.json +256 -22
  37. package/clone-assets/calcom/tools.json +510 -0
  38. package/clone-assets/clickup/tools.json +1258 -0
  39. package/clone-assets/customerio/tools.json +386 -0
  40. package/clone-assets/datadog/tools.json +734 -0
  41. package/clone-assets/github/tools.json +306 -25
  42. package/clone-assets/gitlab/tools.json +999 -0
  43. package/clone-assets/google-workspace/tools.json +18 -6
  44. package/clone-assets/hubspot/tools.json +1406 -0
  45. package/clone-assets/jira/fidelity.json +1 -1
  46. package/clone-assets/jira/tools.json +266 -543
  47. package/clone-assets/linear/tools.json +238 -40
  48. package/clone-assets/ownerrez/tools.json +548 -0
  49. package/clone-assets/pricelabs/tools.json +343 -0
  50. package/clone-assets/sentry/tools.json +745 -0
  51. package/clone-assets/slack/tools.json +1 -2
  52. package/clone-assets/stripe/tools.json +185 -46
  53. package/clone-assets/supabase/tools.json +437 -0
  54. package/clone-assets/unipile/tools.json +408 -0
  55. package/clone-assets/webflow/tools.json +415 -0
  56. package/dist/autoloop-worker-types-BEb_E44z.d.cts +196 -0
  57. package/dist/cli.cjs +150299 -87430
  58. package/dist/commands/autoloop-hosted-worker.cjs +43942 -0
  59. package/dist/commands/autoloop-hosted-worker.d.cts +143 -0
  60. package/dist/commands/autoloop-pr-verification.cjs +4227 -0
  61. package/dist/commands/autoloop-pr-verification.d.cts +17 -0
  62. package/dist/{vitest/chunk-L36NXAU6.js → commands/autoloop-result-parser.cjs} +16445 -18852
  63. package/dist/commands/autoloop-result-parser.d.cts +39 -0
  64. package/dist/commands/autoloop-worker.cjs +36163 -0
  65. package/dist/commands/autoloop-worker.d.cts +97 -0
  66. package/dist/harness.cjs +1 -0
  67. package/dist/index.cjs +1 -1
  68. package/dist/replay.cjs +49624 -0
  69. package/dist/replay.d.cts +4625 -0
  70. package/dist/scenarios.cjs +80343 -0
  71. package/dist/scenarios.d.cts +562 -0
  72. package/dist/vitest/chunk-6CBYFCFK.js +4667 -0
  73. package/dist/vitest/chunk-ARVS45PP.js +2764 -0
  74. package/dist/vitest/index.cjs +6011 -75261
  75. package/dist/vitest/index.d.ts +7 -6
  76. package/dist/vitest/index.js +8 -8
  77. package/dist/vitest/runtime/hosted-session-reaper.cjs +792 -34359
  78. package/dist/vitest/runtime/hosted-session-reaper.js +1 -1
  79. package/dist/vitest/runtime/setup-files.js +2 -2
  80. package/package.json +8 -3
  81. package/skills/archal-agent/SKILL.md +87 -0
  82. package/skills/{attach → autoloop}/SKILL.md +94 -120
  83. package/skills/autoloop/references/hosted-sources.md +62 -0
  84. package/skills/autoloop/references/trace-schema-mapping.md +73 -0
  85. package/skills/eval/SKILL.md +35 -1
  86. package/skills/install-agent/SKILL.md +221 -0
  87. package/skills/onboard/SKILL.md +73 -5
  88. package/skills/scenario/SKILL.md +19 -4
  89. package/skills/seed/SKILL.md +237 -0
  90. package/dist/seed/dynamic-generator.cjs +0 -45687
  91. package/dist/seed/dynamic-generator.d.cts +0 -106
  92. package/dist/vitest/chunk-WZ7SA4CK.js +0 -47369
@@ -0,0 +1,562 @@
/** One named field together with the untyped values recorded on each side of a change. */
interface FieldChange {
    field: string;
    before: unknown;
    after: unknown;
}
/** The per-field changes recorded for a single entity, identified by a numeric id and a type label. */
interface EntityChange {
    entityId: number;
    entityType: string;
    fields: FieldChange[];
}
/**
 * Added / modified / removed entries, each bucketed under a string key.
 * `added` and `modified` hold untyped values, while `removed` holds only
 * identifiers (string or number). `fieldChanges` is optional.
 * NOTE(review): the bucket keys look like collection names — confirm.
 */
interface StateDiff {
    added: Record<string, unknown[]>;
    modified: Record<string, unknown[]>;
    removed: Record<string, (string | number)[]>;
    fieldChanges?: EntityChange[];
}
/**
 * An event record: a type tag, an untyped payload and a string timestamp.
 * `sourceRule` is optional.
 * NOTE(review): the timestamp format (ISO-8601?) is not visible here — confirm.
 */
interface CloneEvent {
    type: string;
    payload: unknown;
    timestamp: string;
    sourceRule?: string;
}
/** A reference to another span (trace id + span id) tagged with the kind of relationship. */
interface TraceLink {
    traceId: string;
    spanId: string;
    type: 'retry' | 'read_after_write' | 'write_after_write' | 'fan_out' | 'async' | 'manual';
}
/** Optional retry parameters: attempt cap, backoff in milliseconds and the backoff strategy label. */
interface RetryPolicy {
    maxAttempts?: number;
    backoffMs?: number;
    strategy?: 'fixed' | 'linear' | 'exponential' | 'unknown';
}
/**
 * Optional details of an outbound HTTP exchange: request line parts
 * (method, scheme, host, route, path), the status code, latency in
 * milliseconds and request/response sizes in bytes. Every field is optional.
 */
interface OutboundHttpTrace {
    method?: string;
    scheme?: 'http' | 'https';
    host?: string;
    route?: string;
    path?: string;
    statusCode?: number;
    latencyMs?: number;
    requestBytes?: number;
    responseBytes?: number;
}
/**
 * A span reference (trace id + span id) whose relationship kind is limited to
 * `retry`, `read_after_write` or `write_after_write`.
 */
interface CausalLink {
    traceId: string;
    spanId: string;
    type: 'retry' | 'read_after_write' | 'write_after_write';
}
/** Closed set of error classifications; `unknown` is the catch-all member. */
type TraceErrorKind =
    | 'validation_error'
    | 'not_found'
    | 'authentication_error'
    | 'permission_denied'
    | 'rate_limited'
    | 'timeout'
    | 'network_error'
    | 'server_error'
    | 'agent_error'
    | 'unknown';
/**
 * Structured error record. `code` and `message` are required; every other
 * field is optional detail: classification, status code, retry hints, stack
 * information, a chain of causes and an untyped `details` payload.
 */
interface TraceError {
    code: string;
    message: string;
    kind?: TraceErrorKind;
    class?: string;
    normalizedCode?: string;
    name?: string;
    statusCode?: number;
    retryable?: boolean;
    retryAfterSeconds?: number;
    stack?: string;
    stackHash?: string;
    causeChain?: Array<{
        class?: string;
        code?: string;
        message?: string;
    }>;
    retryPolicy?: RetryPolicy;
    details?: unknown;
}
/**
 * One recorded trace entry. Required fields are the entry id, tool name,
 * input/output, a timestamp, a duration in milliseconds, and the nullable
 * `stateMutations` / `error` slots. Everything else is optional metadata:
 * tracing identifiers and links, queue/start/end/upload timestamps, byte
 * counts, entities read and written, retry and rate-limit flags, causal
 * links, outbound HTTP details and agent-side usage figures.
 * NOTE(review): presumably one entry per tool call — confirm with the producer.
 */
interface TraceEntry {
    id: string;
    toolName: string;
    traceId?: string;
    spanId?: string;
    parentSpanId?: string | null;
    links?: TraceLink[];
    runIndex?: number;
    twinName?: string;
    sessionId?: string;
    input: Record<string, unknown>;
    output: unknown;
    startTimestamp?: string;
    endTimestamp?: string;
    queuedAt?: string;
    dequeuedAt?: string;
    startedAt?: string;
    endedAt?: string;
    uploadedAt?: string;
    timestamp: string;
    durationMs: number;
    stateMutations: StateDiff | null;
    error: TraceError | null;
    sequenceIndex?: number;
    requestBytes?: number;
    responseBytes?: number;
    entitiesRead?: Array<{
        collection: string;
        id: string | number;
    }>;
    entitiesWritten?: Array<{
        collection: string;
        id: string | number;
        action: 'create' | 'update' | 'delete';
    }>;
    isRetry?: boolean;
    retriedFromId?: string;
    retryCount?: number;
    rateLimitRemaining?: number;
    triggeredRateLimit?: boolean;
    errorCategory?: 'agent_error' | 'validation_error' | 'not_found' | 'permission_denied' | 'rate_limited' | 'server_error' | 'none';
    causalLinks?: string[];
    causalLinkDetails?: CausalLink[];
    outboundHttp?: OutboundHttpTrace;
    agentInternals?: {
        providerRequestId?: string;
        toolChoiceRationaleSummary?: string;
        tokenUsage?: {
            input?: number;
            output?: number;
            total?: number;
        };
        costUsd?: number;
        latencyMs?: number;
    };
}
/**
 * Reasons why an LLM-judged criterion could not be evaluated and must be
 * surfaced as `status: 'error'` (i.e. excluded from the weighted satisfaction
 * score and called out in the report). See #2507.
 *
 * - `rate_limited`   — Upstream 429 (Gemini RPM/TPM blown, or direct provider
 *                      rate limit). Keep retrying in a later run.
 * - `upstream_5xx`   — 5xx from upstream including proxy 502 that wraps
 *                      "Gemini API error: 429 Resource exhausted" (our proxy
 *                      translates 429 to 502 on the wire).
 * - `auth_error`     — 401/403 from upstream (misconfigured or revoked key).
 * - `provider_error` — Generic/unknown provider failure (kept for back-compat,
 *                      no longer silently degrades to 0% score).
 */
type LlmEvalErrorReason = 'rate_limited' | 'upstream_5xx' | 'auth_error' | 'provider_error';
/**
 * Outcome of evaluating a single criterion: its id, a status, a numeric
 * confidence and a free-text explanation, plus optional fallback hints, a
 * structured error reason and a partial-credit grade.
 */
interface EvaluationResult {
    criterionId: string;
    /**
     * `error` means the criterion could NOT be evaluated due to upstream LLM
     * failure (429/5xx/auth). It must not be treated as a pass or a fail for the
     * satisfaction calculation — the final report should surface "N criteria
     * unevaluated, results incomplete" rather than silently under-reporting.
     *
     * `skipped` means evaluation was deliberately short-circuited as a cost
     * optimization — currently only when every deterministic criterion already
     * failed, so spending judge calls on the probabilistic criteria is wasteful.
     * Like `error`, it is EXCLUDED from the weighted satisfaction average (it is
     * not evidence of failure). Unlike `error`, it does NOT mark the run
     * incomplete/untrustworthy: the deterministic verdict is conclusive, so the
     * score stands. See `calculateOverallScore` and `score-trust.ts`.
     */
    status: 'pass' | 'fail' | 'partial' | 'error' | 'skipped';
    confidence: number;
    explanation: string;
    fallbackRecommended?: boolean;
    fallbackFailureReason?: 'missing_credentials' | 'rate_limited' | 'context_too_large' | 'provider_error';
    /** Structured reason when `status === 'error'`. Surfaced in the report summary. */
    errorReason?: LlmEvalErrorReason;
    /**
     * Continuous [0,1] grade for criteria that admit partial credit (currently
     * the count assertions: exact/min/max_count). 1 means fully satisfied; a
     * fraction expresses how close the run came (e.g. 3 of 5 expected = 0.6).
     *
     * This is INERT metadata for the satisfaction calculation: `getCriterionScore`
     * and `calculateOverallScore` switch on `status` only and never read this
     * field, so `overallScore` is byte-identical whether or not it is present.
     * Only the RL reward path (`shapedScore` / `denseReward`, gated behind the
     * `--rl-reward` flag) consumes it, to give an RL agent a gradient instead of
     * a binary pass/fail. Always computed on count results because it costs
     * nothing and gating it would couple the pure assertion evaluators to run
     * config.
     */
    partialScore?: number;
}
/**
 * Token accounting: required input and output token counts, with an optional
 * LLM call count and optional provider / model labels.
 */
interface TokenUsageSummary {
    inputTokens: number;
    outputTokens: number;
    llmCallCount?: number;
    provider?: string;
    model?: string;
}
/** Whether agent token usage was reported, or was not reported by the harness. */
type AgentTokenUsageStatus = 'reported' | 'not_reported_by_harness';
/**
 * Completeness verdict for a score (`complete` / `incomplete`) together with
 * the counters it is reported alongside: evaluator errors, unscored run
 * failures, tool errors, and scored versus total run counts.
 */
interface ScoreTrustSummary {
    status: 'complete' | 'incomplete';
    evaluatorErrorCount: number;
    unscoredRunFailureCount: number;
    toolErrorCount: number;
    scoredRunCount: number;
    totalRunCount: number;
}
/** Kind of capability miss; also the discriminant used by `CapabilityMissDetails`. */
type CapabilityMissSubkind = 'missing_handler' | 'unknown_tool' | 'unimplemented_route' | 'unsupported_method' | 'shim_target_missing';
/** Surface on which a capability miss was observed. */
type CapabilityMissSurface = 'mcp' | 'tools_call' | 'rest' | 'graphql_shim' | 'alias_shim';
/** Two-level severity for a capability miss. */
type CapabilityMissSeverity = 'high' | 'low';
/** Alert channel recorded for a capability miss. */
type CapabilityMissAlertChannel = 'none' | 'derived';
/**
 * Discriminated union keyed on `subkind`. Each variant carries only the
 * fields relevant to that kind of miss: the requested or resolved tool name,
 * the method + route pair, or the shim and the tool it resolved to.
 */
type CapabilityMissDetails =
    | {
        subkind: 'unknown_tool';
        requestedTool: string;
    }
    | {
        subkind: 'missing_handler';
        resolvedTool: string;
    }
    | {
        subkind: 'unsupported_method';
        method: string;
        route: string;
    }
    | {
        subkind: 'unimplemented_route';
        method: string;
        route: string;
    }
    | {
        subkind: 'shim_target_missing';
        shim: string;
        resolvedTool: string;
    };
/**
 * Versioned `capability.miss` event (schema `version: 1`). It groups the run
 * identifiers, the classification of the miss, the target (twin plus optional
 * tool / method / route), the subkind-specific details and how the miss was
 * handled (run aborted, alert raised, alert channel).
 */
interface CapabilityMissEvent {
    version: 1;
    event: 'capability.miss';
    occurredAt: string;
    run: {
        sessionId?: string;
        runId?: string;
        scenarioId?: string;
    };
    miss: {
        subkind: CapabilityMissSubkind;
        surface: CapabilityMissSurface;
        severity: CapabilityMissSeverity;
        fingerprint: string;
    };
    target: {
        twin: string;
        toolName?: string;
        method?: string;
        route?: string;
    };
    details: CapabilityMissDetails;
    handling: {
        abortedRun: boolean;
        alerted: boolean;
        alertChannel: CapabilityMissAlertChannel;
    };
}
/**
 * One row of the capability-miss shadow report: a fingerprint with its twin,
 * surface, subkind and severity, occurrence statistics (runs affected,
 * occurrence count, first/last seen timestamps) and a `preventableClass` label.
 * NOTE(review): presumably rows aggregate events sharing a fingerprint — confirm.
 */
interface CapabilityMissShadowReportRow {
    fingerprint: string;
    twin: string;
    surface: CapabilityMissSurface;
    subkind: CapabilityMissSubkind;
    severity: CapabilityMissSeverity;
    runsAffected: number;
    occurrenceCount: number;
    firstSeenAt: string;
    lastSeenAt: string;
    preventableClass: 'tools_surface_parity' | 'rest_surface_parity' | 'shim_wiring' | 'runtime_only_unknown_tool' | 'not_preventable_yet';
}
/** Closed set of run outcome labels. */
type RunOutcome =
    | 'completed'
    | 'degraded'
    | 'failed_agent'
    | 'no_tool_calls'
    | 'insufficient_action'
    | 'inconclusive_infrastructure'
    | 'inconclusive_seed'
    | 'twin_auth_failed';
/**
 * One numbered step of an agent trace: nullable `thinking` and `text`
 * strings, the tool calls issued in the step (name plus untyped arguments)
 * and the step duration in milliseconds.
 */
interface AgentTraceStep {
    step: number;
    thinking: string | null;
    text: string | null;
    toolCalls: Array<{
        name: string;
        arguments: unknown;
    }>;
    durationMs: number;
}
/** Closed set of failure reason codes; `unknown` is the catch-all member. */
type FailureReasonCode =
    | 'capability_miss'
    | 'agent_no_tool_calls'
    | 'managed_llm_unavailable'
    | 'twin_auth_failed'
    | 'evaluator_unavailable'
    | 'agent_process_failed'
    | 'unknown';
/**
 * Result of a single run: its index, the per-criterion evaluations, the
 * overall score, the recorded trace and the duration in milliseconds.
 * Optional fields add the error / outcome classification, state snapshots
 * and diff, agent log and trace, token usage, events, capability misses, the
 * agent's response text and the RL reward signals documented below.
 */
interface RunResult {
    runIndex: number;
    evaluations: EvaluationResult[];
    overallScore: number;
    trace: TraceEntry[];
    durationMs: number;
    error?: string;
    outcome?: RunOutcome;
    failureReasonCode?: FailureReasonCode;
    stateBefore?: Record<string, unknown>;
    stateAfter?: Record<string, unknown>;
    stateDiff?: StateDiff;
    agentLog?: string;
    agentTrace?: AgentTraceStep[];
    tokenUsage?: TokenUsageSummary;
    judgeTokenUsage?: TokenUsageSummary;
    events?: Record<string, CloneEvent[]>;
    capabilityMisses?: CapabilityMissEvent[];
    agentResponseText?: string;
    /**
     * Terminal shaped reward in [0,100]: the same weighted criterion average as
     * `overallScore` but WITHOUT the critical-failure short-circuit (and with
     * partial credit from `EvaluationResult.partialScore`). It gives an RL agent
     * a gradient even when a critical gate trips `overallScore` to 0.
     * `shapedScore >= overallScore` always. Only populated when the run was
     * evaluated with the `--rl-reward` flag; `overallScore` is unaffected.
     */
    shapedScore?: number;
    /**
     * Step-indexed reward in [0,1], one scalar per trace entry, produced by
     * re-running the DETERMINISTIC criteria against successive trace prefixes
     * (trace[0..1], trace[0..2], ...). `denseReward[i]` is the graded weighted
     * average after step `i+1`. Deterministic-only (never invokes the LLM judge).
     * Only populated under the `--rl-reward` flag. NOTE: the state snapshot is
     * terminal (held constant across prefixes), so the gradient comes from
     * trace-sensitive criteria (no_errors, trace_contains, trace counts);
     * pure state-assertion criteria contribute their terminal value at every
     * step. Per-step state reconstruction is a documented follow-up.
     */
    denseReward?: number[];
}
/**
 * Report for one scenario: title, satisfaction score, the individual runs, a
 * summary string and a timestamp, plus optional score-trust information,
 * criterion metadata, clone names, token usage, judge configuration and the
 * capability-miss shadow report.
 */
interface ScenarioReport {
    scenarioTitle: string;
    satisfactionScore: number;
    scoreTrust?: ScoreTrustSummary;
    criterionDescriptions?: Record<string, string>;
    criterionTypes?: Record<string, 'deterministic' | 'probabilistic'>;
    /**
     * Canonical list of clones exercised by this scenario. Prefer this field
     * over `twinNames` for new code; both fields are populated during the
     * twin→clone rename deprecation window.
     */
    cloneNames?: string[];
    /** Legacy alias for `cloneNames`. Will be removed once external consumers migrate. */
    twinNames?: string[];
    runs: RunResult[];
    summary: string;
    failureAnalysis?: string;
    agentTokenUsageStatus?: AgentTokenUsageStatus;
    agentTokenUsage?: TokenUsageSummary;
    judgeTokenUsage?: TokenUsageSummary;
    /**
     * User-supplied evaluator model name. Managed evaluator model names are
     * intentionally omitted from public reports.
     */
    evaluatorModel?: string;
    judgeMode?: 'managed' | 'user_supplied';
    capabilityMissShadowReport?: CapabilityMissShadowReportRow[];
    rootTraceId?: string;
    timestamp: string;
}
/**
 * Follow-up to #2520. Failure analysis is the most operator-valuable
 * output when a run under-performs — the reviewer on #2507 called it
 * out explicitly. The pre-fix catch block here silently dropped on
 * upstream LLM errors (429/5xx/auth), so an operator would see a run
 * with unexplained `status: 'error'` criteria AND no analysis block,
 * and have no way to distinguish "infra outage" from "analysis
 * suppressed because all criteria passed". Surface the same error
 * taxonomy the primary evaluator uses (#2520's classifyLlmError).
 */
interface FailureAnalysisError {
    reason: LlmEvalErrorReason;
    message: string;
}
/**
 * A `ScenarioReport` extended with optional managed-run identifiers (scenario
 * slug, trace ids, run id) and an optional failure-analysis error.
 */
type ManagedScenarioReport = ScenarioReport & {
    scenarioSlug?: string;
    traceId?: string;
    rootTraceId?: string;
    /** Managed run id (`run-<uuid>`) — feed to `archal export run` / `inspect run`. */
    runId?: string;
    /**
     * #2520 follow-up — set when the failure-analysis LLM call was
     * suppressed by an upstream error (429/5xx/auth). When present,
     * `failureAnalysis` will be undefined; callers that render the run
     * summary should surface a "Failure analysis unavailable: <reason>"
     * line so operators aren't left guessing.
     */
    failureAnalysisError?: FailureAnalysisError;
};
/** An environment variable value; `undefined` is permitted alongside strings. */
type EnvValue = string | undefined;
/**
 * Extra host path mounted read-only (default) into the agent container under the
 * Docker harness. Structurally identical to the engine's `DockerHarnessMount`;
 * kept local so the public SDK surface does not depend on engine internals.
 */
interface RuntimeMount {
    source: string;
    target: string;
    readonly?: boolean;
}
/**
 * Input accepted by `runtime()`. `command` is a non-empty tuple (at least
 * one string); `env` values may be `undefined`.
 */
interface RuntimeConfig {
    command: [string, ...string[]];
    cwd?: string;
    env?: Record<string, EnvValue>;
    dockerfile?: string;
    mounts?: ReadonlyArray<RuntimeMount>;
}
/**
 * Value returned by `runtime()`, tagged `kind: 'runtime'`. Unlike
 * `RuntimeConfig`, the command is split into `command` + `args`, `cwd` is
 * required and `env` values are always strings.
 */
interface RuntimeDefinition {
    kind: 'runtime';
    command: string;
    args: string[];
    env?: Record<string, string>;
    cwd: string;
    dockerfile?: string;
    mounts?: ReadonlyArray<RuntimeMount>;
}
/** A seed referenced by file path, tagged `kind: 'file'`. */
interface FileSeed {
    kind: 'file';
    path: string;
}
/** A named clone with an optional seed, given either as a string or as a `FileSeed`. */
interface CloneDefinition {
    name: string;
    seed?: string | FileSeed;
}
/** Either a bare string or a full `CloneDefinition`; the parameter type of `clones()`. */
type CloneSpec = string | CloneDefinition;
/**
 * Options accepted by `file()`.
 * NOTE(review): `relativeTo` presumably sets the base for resolving the seed path — confirm.
 */
interface FileSeedOptions {
    relativeTo?: string | URL;
}
/**
 * Optional run settings: number of runs, a timeout (string or number), a
 * pass threshold and the judge model name.
 * NOTE(review): units of a numeric `timeout` and the scale of `passThreshold` are not visible here — confirm.
 */
interface RunDefaults {
    runs?: number;
    timeout?: string | number;
    passThreshold?: number;
    judge?: {
        model?: string;
    };
}
/** A subject label with an optional list of tool names to match. */
interface TraceMatcher {
    subject: string;
    toolNames?: string[];
}
/**
 * Count assertion (`exact_count`, `min_count` or `max_count`) over a subject,
 * compared against `value`. The remaining fields are optional narrowing
 * filters (predicate, target service, title / label / channel) and a
 * `requireDiff` flag.
 */
interface CountScenarioAssertion {
    type: 'exact_count' | 'min_count' | 'max_count';
    subject: string;
    value: number;
    predicate?: string;
    targetService?: string;
    requireDiff?: boolean;
    titleIncludes?: string;
    labelFilter?: string;
    channelFilter?: string;
}
/**
 * Discriminated union (on `type`) of the structured assertions: the count
 * assertions, `exists`, `github_issue`, `output_contains`, `trace_contains`
 * and `trace_order`.
 */
type StructuredScenarioAssertion =
    | CountScenarioAssertion
    | {
        type: 'exists';
        subject: string;
        requireDiff?: boolean;
    }
    | {
        type: 'github_issue';
        subject: 'github issue';
        titleIncludes?: string;
        labelFilter?: string;
        requireDiff?: boolean;
    }
    | {
        type: 'output_contains';
        subject: string;
        contentPatterns: string[];
        negated?: boolean;
    }
    | {
        type: 'trace_contains';
        subject: string;
        toolNames: string[];
        negated?: boolean;
    }
    | {
        type: 'trace_order';
        subject: string;
        first: TraceMatcher;
        second: TraceMatcher;
    };
/**
 * A single scenario check: a description, whether it is `deterministic` or
 * `probabilistic`, and optionally a structured assertion and/or trace matcher.
 */
interface ScenarioCheck {
    description: string;
    type: 'deterministic' | 'probabilistic';
    structuredAssertion?: StructuredScenarioAssertion;
    traceMatcher?: TraceMatcher;
}
/** A named scenario: the task text and its checks, with optional clones and run settings. */
interface ScenarioDefinition {
    name: string;
    task: string;
    checks: ScenarioCheck[];
    clones?: CloneDefinition[];
    run?: RunDefaults;
}
/**
 * A named suite: the agent runtime and the scenarios to run, with optional
 * clones, run settings and tags.
 * NOTE(review): how suite-level `clones` / `run` combine with the per-scenario ones is not visible here.
 */
interface SuiteDefinition {
    name: string;
    agent: RuntimeDefinition;
    clones?: CloneDefinition[];
    run?: RunDefaults;
    tags?: string[];
    scenarios: ScenarioDefinition[];
}
/**
 * Result for one scenario: its name, a pass flag, a satisfaction score, the
 * managed report and the individual runs, with optional trace identifiers.
 */
interface ScenarioRunResult {
    name: string;
    passed: boolean;
    satisfaction: number;
    report: ManagedScenarioReport;
    runs: RunResult[];
    traceId?: string;
    rootTraceId?: string;
}
/**
 * Result for a whole suite: its name, a pass flag, a satisfaction score, the
 * per-scenario results and the managed reports. `run()` resolves to this.
 */
interface SuiteRunResult {
    name: string;
    passed: boolean;
    satisfaction: number;
    scenarios: ScenarioRunResult[];
    reports: ManagedScenarioReport[];
}
/** A tool name with optional alias names; the argument type of `activity.didCall` / `didNotCall`. */
interface ServiceAction {
    toolName: string;
    aliases?: string[];
}
/**
 * Optional count constraints: `count`, `atLeast` and `atMost`.
 * NOTE(review): these presumably correspond to the exact / min / max count assertions — confirm.
 */
interface CountOptions {
    count?: number;
    atLeast?: number;
    atMost?: number;
}
/**
 * Combines two checks into one.
 * NOTE(review): presumably asserts that `first` is observed before `second`
 * (compare the `trace_order` assertion's `first` / `second` matchers) — confirm.
 */
declare function before(first: ScenarioCheck, second: ScenarioCheck): ScenarioCheck;
/**
 * Check builders that take a text string.
 * NOTE(review): presumably matched against the agent's response text — confirm.
 */
declare const response: {
    includes(text: string): ScenarioCheck;
    doesNotInclude(text: string): ScenarioCheck;
};
/** Check builders that take a `ServiceAction` and yield a did-call / did-not-call check. */
declare const activity: {
    didCall(action: ServiceAction): ScenarioCheck;
    didNotCall(action: ServiceAction): ScenarioCheck;
};
/**
 * Builds a check from a free-text description.
 * NOTE(review): presumably a `probabilistic` (LLM-judged) check — confirm.
 */
declare function judge(description: string): ScenarioCheck;
/**
 * Callable factory: calling it (optionally with a seed) yields a
 * `CloneDefinition`. It also carries `issue` check builders and an `issues`
 * record of `ServiceAction`s keyed by `create`, `delete`, `close` and `list`.
 */
type GithubFactory = ((options?: {
    seed?: string | FileSeed;
}) => CloneDefinition) & {
    issue: {
        created(options?: CountOptions & {
            titleIncludes?: string;
        }): ScenarioCheck;
        closed(options?: CountOptions): ScenarioCheck;
        notClosed(): ScenarioCheck;
        exists(options?: {
            titleIncludes?: string;
        }): ScenarioCheck;
        hasLabel(options: {
            titleIncludes?: string;
            label: string;
        }): ScenarioCheck;
    };
    issues: Record<'create' | 'delete' | 'close' | 'list', ServiceAction>;
};
/** The GitHub factory instance. */
declare const github: GithubFactory;
/**
 * Callable factory: calling it (optionally with a seed) yields a
 * `CloneDefinition`. It also carries `message` check builders and a
 * `messages` record of `ServiceAction`s keyed by `post` and `history`.
 */
type SlackFactory = ((options?: {
    seed?: string | FileSeed;
}) => CloneDefinition) & {
    message: {
        posted(options?: CountOptions & {
            channel?: string;
        }): ScenarioCheck;
        notPosted(options?: {
            channel?: string;
        }): ScenarioCheck;
    };
    messages: Record<'post' | 'history', ServiceAction>;
};
/** The Slack factory instance. */
declare const slack: SlackFactory;
/** Turns a `RuntimeConfig` into a `RuntimeDefinition`. */
declare function runtime(config: RuntimeConfig): RuntimeDefinition;
/** Builds a `SuiteDefinition` from a name plus the remaining suite fields. */
declare function suite(name: string, definition: Omit<SuiteDefinition, 'name'>): SuiteDefinition;
/** Takes a suite definition and resolves to its `SuiteRunResult`. */
declare function run(definition: SuiteDefinition): Promise<SuiteRunResult>;
/** Builds a `FileSeed` from a path (string or URL) and optional `FileSeedOptions`. */
declare function file(path: string | URL, options?: FileSeedOptions): FileSeed;
/** Converts any number of clone specs (names or definitions) into `CloneDefinition`s. */
declare function clones(...specs: CloneSpec[]): CloneDefinition[];
// Public surface of the module: type-only exports first, then runtime values.
export {
    type CloneDefinition,
    type CloneSpec,
    type CountOptions,
    type CountScenarioAssertion,
    type EnvValue,
    type FileSeed,
    type FileSeedOptions,
    type RunDefaults,
    type RuntimeConfig,
    type RuntimeDefinition,
    type RuntimeMount,
    type ScenarioCheck,
    type ScenarioDefinition,
    type ScenarioRunResult,
    type ServiceAction,
    type StructuredScenarioAssertion,
    type SuiteDefinition,
    type SuiteRunResult,
    type TraceMatcher,
    activity,
    before,
    clones,
    file,
    github,
    judge,
    response,
    run,
    runtime,
    slack,
    suite,
};