archal 0.9.13 → 0.9.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +81 -73
- package/bin/archal.cjs +1 -1
- package/clone-assets/apify/tools.json +668 -0
- package/{twin-assets → clone-assets}/discord/fidelity.json +1 -1
- package/{twin-assets → clone-assets}/discord/tools.json +510 -510
- package/clone-assets/github/fidelity.json +31 -0
- package/{twin-assets → clone-assets}/github/tools.json +113 -3
- package/{twin-assets → clone-assets}/google-workspace/fidelity.json +2 -2
- package/{twin-assets → clone-assets}/google-workspace/tools.json +10 -10
- package/{twin-assets → clone-assets}/jira/fidelity.json +44 -4
- package/{twin-assets → clone-assets}/jira/tools.json +1 -1
- package/clone-assets/linear/fidelity.json +36 -0
- package/{twin-assets → clone-assets}/linear/tools.json +1 -1
- package/{twin-assets → clone-assets}/ramp/fidelity.json +1 -1
- package/{twin-assets → clone-assets}/ramp/tools.json +1 -1
- package/clone-assets/slack/fidelity.json +38 -0
- package/{twin-assets → clone-assets}/slack/tools.json +1 -1
- package/clone-assets/stripe/fidelity.json +67 -0
- package/{twin-assets → clone-assets}/stripe/tools.json +42 -11
- package/clone-assets/supabase/fidelity.json +31 -0
- package/{twin-assets → clone-assets}/supabase/tools.json +1 -1
- package/clone-assets/tavily/tools.json +115 -0
- package/dist/cli.cjs +97917 -0
- package/dist/cli.d.cts +1 -0
- package/dist/harness.cjs +62 -0
- package/dist/harness.d.cts +20 -0
- package/dist/index.cjs +5 -87878
- package/dist/index.d.cts +3 -1
- package/dist/seed/dynamic-generator.cjs +8796 -9201
- package/dist/seed/dynamic-generator.d.cts +39 -0
- package/dist/vitest/chunk-2GY4SFKE.js +29279 -0
- package/dist/vitest/{chunk-KTMNDJFB.js → chunk-WVRVNHAX.js} +45255 -44440
- package/dist/vitest/index.cjs +56408 -31519
- package/dist/vitest/index.d.ts +61 -27
- package/dist/vitest/index.js +145 -1807
- package/dist/vitest/runtime/hosted-session-reaper.cjs +34766 -28922
- package/dist/vitest/runtime/hosted-session-reaper.js +1 -2
- package/dist/vitest/runtime/setup-files.js +2 -3
- package/package.json +19 -10
- package/skills/eval/SKILL.md +113 -0
- package/skills/onboard/SKILL.md +67 -36
- package/skills/scenario/SKILL.md +22 -20
- package/skills/vitest/SKILL.md +25 -24
- package/dist/vitest/chunk-L6HSMJ3F.js +0 -2216
- package/dist/vitest/chunk-YJICENME.js +0 -1230
- package/dist/vitest/src-JGHX6UKK.js +0 -94
- package/skills/audit/SKILL.md +0 -55
- package/skills/test/SKILL.md +0 -109
- package/twin-assets/github/fidelity.json +0 -13
- package/twin-assets/linear/fidelity.json +0 -18
- package/twin-assets/slack/fidelity.json +0 -20
- package/twin-assets/stripe/fidelity.json +0 -22
- package/twin-assets/supabase/fidelity.json +0 -13
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
bootstrapArchalVitestRouting
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
5
|
-
import "../chunk-YJICENME.js";
|
|
3
|
+
} from "../chunk-2GY4SFKE.js";
|
|
4
|
+
import "../chunk-WVRVNHAX.js";
|
|
6
5
|
|
|
7
6
|
// src/runtime/setup-files.ts
|
|
8
7
|
import { existsSync, rmSync } from "fs";
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "archal",
|
|
3
|
-
"version": "0.9.
|
|
4
|
-
"description": "Test your agents & integrations against
|
|
3
|
+
"version": "0.9.15",
|
|
4
|
+
"description": "Test your agents & integrations against service clones",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.cjs",
|
|
7
7
|
"types": "dist/index.d.cts",
|
|
@@ -13,6 +13,10 @@
|
|
|
13
13
|
"types": "./dist/index.d.cts",
|
|
14
14
|
"default": "./dist/index.cjs"
|
|
15
15
|
},
|
|
16
|
+
"./harness": {
|
|
17
|
+
"types": "./dist/harness.d.cts",
|
|
18
|
+
"default": "./dist/harness.cjs"
|
|
19
|
+
},
|
|
16
20
|
"./vitest": {
|
|
17
21
|
"types": "./dist/vitest/index.d.ts",
|
|
18
22
|
"import": "./dist/vitest/index.js",
|
|
@@ -31,20 +35,27 @@
|
|
|
31
35
|
"agent",
|
|
32
36
|
"testing",
|
|
33
37
|
"mcp",
|
|
34
|
-
"
|
|
38
|
+
"service-clone",
|
|
35
39
|
"archal",
|
|
36
40
|
"vitest"
|
|
37
41
|
],
|
|
38
42
|
"engines": {
|
|
39
|
-
"node": ">=
|
|
43
|
+
"node": ">=22"
|
|
40
44
|
},
|
|
41
45
|
"files": [
|
|
42
46
|
"bin",
|
|
43
47
|
"dist",
|
|
44
48
|
"skills",
|
|
45
|
-
"
|
|
49
|
+
"clone-assets",
|
|
46
50
|
"LICENSE"
|
|
47
51
|
],
|
|
52
|
+
"scripts": {
|
|
53
|
+
"verify:artifacts": "node scripts/assert-artifacts.mjs",
|
|
54
|
+
"prepack": "pnpm run verify:artifacts",
|
|
55
|
+
"prepare": "node scripts/prepare.cjs",
|
|
56
|
+
"typecheck:raw": "node --check bin/archal.cjs && node --check scripts/assert-artifacts.mjs && node --check scripts/prepare.cjs",
|
|
57
|
+
"typecheck": "pnpm run typecheck:raw"
|
|
58
|
+
},
|
|
48
59
|
"peerDependencies": {
|
|
49
60
|
"vitest": ">=2.1.0"
|
|
50
61
|
},
|
|
@@ -53,9 +64,7 @@
|
|
|
53
64
|
"optional": true
|
|
54
65
|
}
|
|
55
66
|
},
|
|
56
|
-
"
|
|
57
|
-
"
|
|
58
|
-
"typecheck:raw": "node --check bin/archal.cjs && node --check scripts/assert-artifacts.mjs && node --check scripts/prepare.cjs",
|
|
59
|
-
"typecheck": "pnpm run typecheck:raw"
|
|
67
|
+
"dependencies": {
|
|
68
|
+
"picomatch": "^4.0.4"
|
|
60
69
|
}
|
|
61
|
-
}
|
|
70
|
+
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: eval
|
|
3
|
+
description: Run Archal scenarios or inline tasks against hosted clones, diagnose failed runs, and interpret satisfaction scores. Triggers on "run my scenario", "evaluate my agent", "archal run X", "debug this failing run", "what does this satisfaction score mean".
|
|
4
|
+
user-invocable: true
|
|
5
|
+
argument-hint: "[scenario.md or task description]"
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Archal Eval Runner
|
|
9
|
+
|
|
10
|
+
You run Archal scenarios and inline tasks, then help the user interpret the results. For setting up the agent path or `.archal.json` in a fresh repo, hand off to the `onboard` skill.
|
|
11
|
+
|
|
12
|
+
## What only you know (product mental model)
|
|
13
|
+
|
|
14
|
+
- `archal run` spawns the user's agent as a child process. The agent needs:
|
|
15
|
+
- A **runnable agent path**. Three ways to supply it: explicit `--harness <path>` (e.g. `./.archal/harness.mjs` from `archal init`), `.archal.json` with an `agent` command, or repo-local auto-discovery, which walks up from cwd for a top-level `harness.{ts,js,mjs,cjs}`.
|
|
16
|
+
- A **headless boundary** - no UI, no browser OAuth. The process is spawned without a shell, so interactive auth hangs forever.
|
|
17
|
+
- Env vars - auto-injected. `AGENT_TASK` is the prompt; service clones are reached through normal service URLs in a controlled runtime.
|
|
18
|
+
- Every `archal run` writes local artifacts under `.archal/cache/last-run.json` and `.archal/cache/runs/*.json` **regardless** of `--output`. `--output json` is only for machine-readable stdout; it's not needed for local persistence.
|
|
19
|
+
- **Satisfaction score** = (runs passing all criteria) / (total runs). `[D]` criteria are deterministic state checks; `[P]` criteria are LLM-judged from trace + final state.
|
|
20
|
+
|
|
21
|
+
## Preflight the harness before a run
|
|
22
|
+
|
|
23
|
+
When the agent path is uncertain, or after any change to the harness file, smoke-test the harness directly before `archal run`:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
AGENT_TASK="Reply with OK and do not use tools." node ./.archal/harness.mjs
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
A wired harness that exits cleanly with no service calls is ready. If it says the starter harness is still a stub, edit `.archal/harness.mjs` to call the user's Cursor, Codex, Claude Code, or custom agent first. Other failures this smoke test catches: no runnable entrypoint, UI-boot assumptions, missing provider keys, service bridge misconfiguration.
|
|
30
|
+
|
|
31
|
+
If a local harness returns text but records zero clone-observed calls, check for real-service auth signals such as `api.github.com` plus `401`/`Bad credentials`. That means the harness did not reach the clone. Fix by running the SDK under sandbox/Docker routing, or by wiring the SDK to the clone REST URL pattern shown by `archal clone start <service>` / `archal clone status` and authenticating with `ARCHAL_TOKEN`.
|
|
32
|
+
|
|
33
|
+
## Running
|
|
34
|
+
|
|
35
|
+
Scenario from a file:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
archal run scenario.md
|
|
39
|
+
archal run scenario.md --runs 5 --seed enterprise-repo # N runs -> satisfaction score
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Inline task (no scenario file):
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
archal run --task "Create an issue titled hello" --harness ./.archal/harness.mjs --clone github
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
`--task` only replaces the scenario file - it still needs a runnable agent path. `--clone` is required with `--task`; repeat or comma-separate for multiple clones.
|
|
49
|
+
|
|
50
|
+
When `.archal.json` exists in cwd, bare `archal run` uses it. If the user doesn't have one yet, that's setup - hand off to the `onboard` skill, which owns harness creation and `.archal.json` scaffolding.
|
|
51
|
+
|
|
52
|
+
## Interpret results
|
|
53
|
+
|
|
54
|
+
Score breakdown:
|
|
55
|
+
- `100%` = every run passed every criterion
|
|
56
|
+
- `80%` = 4/5 runs passed
|
|
57
|
+
- `0%` = none passed
|
|
58
|
+
|
|
59
|
+
Criterion types:
|
|
60
|
+
- `[D]` - deterministic state check. A failure is real; never a model variance artifact.
|
|
61
|
+
- `[P]` - LLM judge reads trace + final state. A single failure can be variance; re-run with `--runs 3+` to confirm before acting on it.
|
|
62
|
+
|
|
63
|
+
## Diagnose failures
|
|
64
|
+
|
|
65
|
+
Re-run with `-v` for the full trace, then classify with these signals:
|
|
66
|
+
|
|
67
|
+
- **Agent bug** - wrong tool called, wrong arguments, stopped early.
|
|
68
|
+
*Signals:* trace shows the correct tool was available but the agent chose another; or arguments are malformed.
|
|
69
|
+
*Fix:* agent prompt, tool wiring, or underlying model.
|
|
70
|
+
|
|
71
|
+
- **Scenario bug** - criteria are too strict, ambiguous, or contradict the Setup.
|
|
72
|
+
*Signals:* agent clearly did the right thing but a `[D]` criterion expects an exact count the Setup didn't guarantee; or two criteria contradict each other.
|
|
73
|
+
*Fix:* make Setup more specific, or relax the criterion. Use the `scenario` skill.
|
|
74
|
+
|
|
75
|
+
- **Seed mismatch** - clone state doesn't match what Setup describes.
|
|
76
|
+
*Signals:* agent's first introspection tool call returns unexpected state (e.g. Setup says "4 stale issues" but the seed has 3).
|
|
77
|
+
*Fix:* different seed, or adjust Setup to match. `archal seed list <clone>` to browse.
|
|
78
|
+
|
|
79
|
+
- **Harness bug** - agent process never started, crashed immediately, or hung.
|
|
80
|
+
*Signals:* no tool calls in the trace, stderr shows a boot error, or the run times out at the configured `--timeout`.
|
|
81
|
+
*Fix:* smoke-test the harness directly with `AGENT_TASK="Reply with OK and do not use tools." node ./.archal/harness.mjs`, then look for an untouched starter stub, UI-only imports, missing provider keys, or interactive auth.
|
|
82
|
+
|
|
83
|
+
## CI mode
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
archal run scenario.md --runs 3 --pass-threshold 80 -o json -q
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Exit codes: `0` pass, `1` fail or score < threshold, `2` validation error. For GitHub Actions, inject `ARCHAL_TOKEN` as a secret. Use a workspace API key (`archal_ws_...`) for CI, not a personal token.
|
|
90
|
+
|
|
91
|
+
Workspace API keys are runtime and CI credentials bound to one workspace. They can run clones, upload and read traces, and read usage for that workspace. They cannot manage audit events or workspace API keys. Use an owner/admin user credential, either `archal login` or a dashboard-issued user API key, for workspace administration.
|
|
92
|
+
|
|
93
|
+
## Artifacts + dashboard
|
|
94
|
+
|
|
95
|
+
- **Local (always written):** `.archal/cache/last-run.json` (summary), `.archal/cache/runs/*.json` (full redacted trace).
|
|
96
|
+
- **Hosted:** every run also uploads to https://www.archal.ai/dashboard - useful for sharing a failing trace with a colleague or comparing across agent model versions.
|
|
97
|
+
|
|
98
|
+
Don't tell users they need `-o json` to save artifacts locally - that's only for stdout.
|
|
99
|
+
|
|
100
|
+
## Anti-patterns
|
|
101
|
+
|
|
102
|
+
- Don't re-document the `archal run` flag list here. `archal run --help` and https://docs.archal.ai/cli/run own that - they'll drift if duplicated.
|
|
103
|
+
- Don't guess the agent path. If the user doesn't have `--harness`, a repo-local harness, or `.archal.json`, hand off to `onboard` - it owns setup.
|
|
104
|
+
- Don't promote local proxy or Archal-owned route env as normal service simulation. Scored runs use a controlled runtime with transparent TLS interception against real service domains; uncontainerized proxy routing is low-fidelity debug only.
|
|
105
|
+
- Don't classify a single `[P]` failure as an agent bug without re-running. Probabilistic criteria need sample size.
|
|
106
|
+
- Don't treat a `[D]` failure as model variance. Deterministic failures are real bugs.
|
|
107
|
+
|
|
108
|
+
## Docs
|
|
109
|
+
|
|
110
|
+
- Running with an agent: https://docs.archal.ai/guides/run-with-agent
|
|
111
|
+
- Existing repo playbook: https://docs.archal.ai/guides/existing-agent-repo
|
|
112
|
+
- Scenario authoring: hand off to the `scenario` skill
|
|
113
|
+
- Clone sessions: https://docs.archal.ai/guides/clone-sessions
|
package/skills/onboard/SKILL.md
CHANGED
|
@@ -6,35 +6,40 @@ user-invocable: true
|
|
|
6
6
|
|
|
7
7
|
# Archal Onboard
|
|
8
8
|
|
|
9
|
-
You are setting up Archal in this project. Archal tests AI agents against
|
|
9
|
+
You are setting up Archal in this project. Archal tests AI agents against service clones of real services (GitHub, Slack, Stripe, etc.). Handle installation and auth yourself; delegate the workflow-specific setup to the matching sub-skill.
|
|
10
10
|
|
|
11
11
|
## If this is a cold-start
|
|
12
12
|
|
|
13
13
|
The user may have landed here without running `npx archal init` first. If the
|
|
14
14
|
CLI is missing (see "Install + auth" below) AND no `.archal-manifest.json`
|
|
15
|
-
exists in `.claude/skills/`,
|
|
15
|
+
exists in any skill directory (`.claude/skills/`, `.codex/skills/`,
|
|
16
|
+
`.cursor/skills/`, `.windsurf/skills/`), the canonical first command is:
|
|
16
17
|
|
|
17
18
|
```bash
|
|
18
19
|
npx archal init
|
|
19
20
|
```
|
|
20
21
|
|
|
21
|
-
That adds `archal` as a devDependency
|
|
22
|
-
|
|
22
|
+
That adds `archal` as a devDependency, installs skills for every detected
|
|
23
|
+
agent platform (Claude Code, Codex, Cursor, Windsurf), and creates a starter
|
|
24
|
+
`.archal.json`, `.archal/harness.mjs`, and `scenarios/first-run.md`. Re-invoke
|
|
25
|
+
the onboard skill after it completes.
|
|
23
26
|
|
|
24
27
|
## Discover first
|
|
25
28
|
|
|
26
29
|
Before asking anything, read the repo:
|
|
27
30
|
|
|
28
|
-
1. `package.json` deps
|
|
29
|
-
- `@octokit/rest`, `octokit`
|
|
30
|
-
- `stripe`
|
|
31
|
-
- `@slack/web-api`, `@slack/bolt`
|
|
32
|
-
- `@linear/sdk`
|
|
33
|
-
- `@supabase/supabase-js`
|
|
34
|
-
- `googleapis`, `@google-cloud/*`
|
|
35
|
-
- `jira-client`, `jira.js`
|
|
31
|
+
1. `package.json` deps -> infer likely clones:
|
|
32
|
+
- `@octokit/rest`, `octokit` -> `github`
|
|
33
|
+
- `stripe` -> `stripe`
|
|
34
|
+
- `@slack/web-api`, `@slack/bolt` -> `slack`
|
|
35
|
+
- `@linear/sdk` -> `linear`
|
|
36
|
+
- `@supabase/supabase-js` -> `supabase`
|
|
37
|
+
- `googleapis`, `@google-cloud/*` -> `google-workspace`
|
|
38
|
+
- `jira-client`, `jira.js` -> `jira`
|
|
39
|
+
- Apify SDK or `api.apify.com` usage -> `apify`
|
|
40
|
+
- Tavily SDK or `api.tavily.com` usage -> `tavily`
|
|
36
41
|
2. Existing vitest config? Existing scenarios? Existing `.archal.json`? Those change which workflow makes sense.
|
|
37
|
-
3. If no `package.json` or no matching deps: ask "Which services does your agent interact with?" and
|
|
42
|
+
3. If no `package.json` or no matching deps: ask "Which services does your agent interact with?" and point them to the clone catalog (`archal clone --json`) rather than maintaining a separate list here.
|
|
38
43
|
|
|
39
44
|
## Install + auth
|
|
40
45
|
|
|
@@ -46,10 +51,21 @@ archal login # OAuth browser flow, or: archal login --token <toke
|
|
|
46
51
|
archal usage # verify auth + plan
|
|
47
52
|
```
|
|
48
53
|
|
|
49
|
-
In CI, set `ARCHAL_TOKEN`
|
|
54
|
+
In CI, set `ARCHAL_TOKEN` to a **workspace API key** (`archal_ws_...`)
|
|
55
|
+
instead of running `archal login`. Workspace keys are bound to one workspace,
|
|
56
|
+
do not expire when a team member leaves, and are the recommended auth for CI.
|
|
57
|
+
Create one with `archal workspace api-key create <label> --scope sessions:write`
|
|
58
|
+
(requires owner or admin role) or from the dashboard under Settings > API Keys.
|
|
59
|
+
Personal tokens (`arc_...`) are fine for local dev but should not be used in CI.
|
|
60
|
+
|
|
61
|
+
Treat workspace API keys as runtime and CI credentials, not governance
|
|
62
|
+
credentials. They can run clones, upload and read traces, and read usage for
|
|
63
|
+
their bound workspace. They cannot manage workspace API keys or audit events.
|
|
64
|
+
Use an owner/admin user credential, either `archal login` or a dashboard-issued
|
|
65
|
+
user API key, for workspace administration.
|
|
50
66
|
|
|
51
67
|
If something feels wrong (missing CLI, stale skills), these are the
|
|
52
|
-
recovery commands
|
|
68
|
+
recovery commands - don't run them otherwise:
|
|
53
69
|
|
|
54
70
|
```bash
|
|
55
71
|
npx archal --version # CLI reachable? prints e.g. 0.9.12
|
|
@@ -58,54 +74,69 @@ npx archal init --skills-only # re-stage skills if they drifted
|
|
|
58
74
|
|
|
59
75
|
## Pick a workflow
|
|
60
76
|
|
|
61
|
-
Confirm detected
|
|
77
|
+
Confirm detected clones, then ask which of these the user wants. Each delegates to a sub-skill where appropriate - don't inline those flows.
|
|
78
|
+
|
|
79
|
+
### The `agent` command (Options A and B both need this)
|
|
80
|
+
|
|
81
|
+
`archal run` spawns the agent as a child process, headlessly - no UI, no browser auth. The `agent` field in `.archal.json` is the command that invokes it, written as an object with a `command` and optional `args` (not a single shell string). Typical shapes:
|
|
82
|
+
|
|
83
|
+
- `"agent": { "command": "node", "args": ["./.archal/harness.mjs"] }` - scaffolded by `archal init`
|
|
84
|
+
- `"agent": { "command": "npx", "args": ["tsx", "./.archal/harness.ts"] }` - custom TS entrypoint
|
|
85
|
+
- `"agent": { "command": "node", "args": ["./agent.js"] }` - plain Node script
|
|
86
|
+
- `"agent": { "command": "python", "args": ["agent.py"] }` - Python agent
|
|
62
87
|
|
|
63
|
-
|
|
88
|
+
If the user doesn't have a harness yet, prefer `npx archal init`; it creates `./.archal/harness.mjs`, points `.archal.json` at it, and adds a starter scenario without overwriting existing files. The generated harness is a guarded stub: Archal refuses to score it until the user edits it to call their Cursor, Codex, Claude Code, or custom agent. A custom harness should read `AGENT_TASK` from env, call the agent runtime, print `{ "text": "..." }` to stdout, and call `reportAgentMetrics()` from `archal/harness` with accumulated `{ inputTokens, outputTokens, llmCallCount }` before exit. Service clients need one explicit routing mode: use sandbox/Docker routing when the harness calls normal service URLs such as `https://api.github.com`, or configure SDK base URLs to the clone REST URL pattern shown by `archal clone start <service>` / `archal clone status` and authenticate those requests with `ARCHAL_TOKEN`. Alternative: skip `agent` in `.archal.json` and pass `--harness <path>` per-run.
|
|
64
89
|
|
|
65
|
-
|
|
90
|
+
### Option A - Evaluate an agent with scenarios
|
|
91
|
+
|
|
92
|
+
Write markdown scenario files that describe setup, prompt, and success criteria; `archal run` executes them against clones.
|
|
66
93
|
|
|
67
94
|
1. Create `.archal.json`:
|
|
68
95
|
```json
|
|
69
|
-
{ "agent": "<agent command>", "
|
|
96
|
+
{ "agent": { "command": "<agent command>", "args": ["<arg1>", "..."] }, "clones": ["<detected clones>"], "scenarios": ["scenarios/first-run.md"] }
|
|
70
97
|
```
|
|
71
|
-
2. **Delegate to the `scenario` skill** to author a starter scenario. Don't paste a canned example here
|
|
72
|
-
3. Run: `archal run scenarios/<first>.md`.
|
|
98
|
+
2. **Delegate to the `scenario` skill** to author a starter scenario. Don't paste a canned example here - the skill knows the markdown format and success-criteria syntax.
|
|
99
|
+
3. Run: `archal run scenarios/<first>.md`. **Hand off to the `eval` skill** for result interpretation and failure diagnosis.
|
|
100
|
+
|
|
101
|
+
### Option B - Run quick inline tasks
|
|
73
102
|
|
|
74
|
-
|
|
103
|
+
Same `.archal.json` as Option A (inline `--task` still needs an agent). Use this when the user wants ad-hoc runs before committing to scenario files.
|
|
75
104
|
|
|
76
|
-
1. `.archal.json
|
|
105
|
+
1. `.archal.json`:
|
|
77
106
|
```json
|
|
78
|
-
{
|
|
107
|
+
{
|
|
108
|
+
"agent": { "command": "node", "args": ["./.archal/harness.mjs"] },
|
|
109
|
+
"clones": ["<detected clones>"]
|
|
110
|
+
}
|
|
79
111
|
```
|
|
80
|
-
2. Demo: `archal run --task "Create an issue titled hello" --
|
|
81
|
-
|
|
82
|
-
No sub-skill needed — this is a one-shot.
|
|
112
|
+
2. Demo: `archal run --task "Create an issue titled hello" --clone github`.
|
|
113
|
+
3. For the generated first-run project, use bare `archal run` after wiring `.archal/harness.mjs`.
|
|
83
114
|
|
|
84
|
-
### Option C
|
|
115
|
+
### Option C - Clones in a Vitest suite
|
|
85
116
|
|
|
86
|
-
**Delegate to the `vitest` skill.** It handles reading the existing vitest config, identifying which tests should route, picking the right composition pattern, and seeding the
|
|
117
|
+
**Delegate to the `vitest` skill.** It handles reading the existing vitest config, identifying which tests should route, picking the right composition pattern, and seeding the clones.
|
|
87
118
|
|
|
88
119
|
Do not paste a sample config here. The right shape depends on what's already in the repo.
|
|
89
120
|
|
|
90
|
-
### Option D
|
|
121
|
+
### Option D - Persistent clones to develop against
|
|
91
122
|
|
|
92
|
-
Run: `archal
|
|
123
|
+
Run: `archal clone start <detected clones>` - gives live clone URLs the user's SDK clients can point at. `archal clone status` shows the active session; `archal clone stop` tears down.
|
|
93
124
|
|
|
94
125
|
## Verify
|
|
95
126
|
|
|
96
|
-
Run the first
|
|
127
|
+
Run the first scenario or task. For Options A and B, hand off to the `eval` skill to interpret the satisfaction score and diagnose failures - that skill owns the runtime mental model (`[D]` vs `[P]` criteria, trace inspection, harness execution diagnostics).
|
|
97
128
|
|
|
98
129
|
## `.archal.json` schema
|
|
99
130
|
|
|
100
131
|
| Field | Type | Required | Default | Description |
|
|
101
132
|
|-------|------|----------|---------|-------------|
|
|
102
|
-
| `agent` |
|
|
133
|
+
| `agent` | `{ command, args?, env? }` | yes (for scenarios) | | Agent command as an object (not a plain string) |
|
|
103
134
|
| `title` | string | no | | Display name for reports |
|
|
104
|
-
| `
|
|
135
|
+
| `clones` | string[] | no | inferred | Which clones to provision |
|
|
105
136
|
| `scenarios` | string[] | no | | Scenario file paths relative to config |
|
|
106
|
-
| `seeds` | `Record<string, string>` | no | | Per-
|
|
137
|
+
| `seeds` | `Record<string, string>` | no | | Per-clone seed overrides |
|
|
107
138
|
| `agentModel` | string | no | | LLM model the agent uses |
|
|
108
|
-
| `
|
|
139
|
+
| `evaluatorModel` | string | no | Archal LLM judge | Evaluator/judge model; set this only when bringing your own judge key |
|
|
109
140
|
| `runs` | number | no | `1` | Runs per scenario |
|
|
110
141
|
| `timeout` | number | no | `180` | Timeout per run in seconds |
|
|
111
142
|
|
package/skills/scenario/SKILL.md
CHANGED
|
@@ -7,7 +7,7 @@ argument-hint: "[scenario description or file path]"
|
|
|
7
7
|
|
|
8
8
|
# Archal Scenario Writer
|
|
9
9
|
|
|
10
|
-
You write and edit Archal scenario files. Scenarios are markdown files that define a test for an AI agent running against
|
|
10
|
+
You write and edit Archal scenario files. Scenarios are markdown files that define a test for an AI agent running against service clones.
|
|
11
11
|
|
|
12
12
|
## Scenario format
|
|
13
13
|
|
|
@@ -24,11 +24,11 @@ The task instruction given to the agent.
|
|
|
24
24
|
Answer key for the evaluator. Never shown to the agent.
|
|
25
25
|
|
|
26
26
|
## Success Criteria
|
|
27
|
-
- [D] Deterministic criterion checked against
|
|
27
|
+
- [D] Deterministic criterion checked against clone state
|
|
28
28
|
- [P] Probabilistic criterion judged by LLM
|
|
29
29
|
|
|
30
30
|
## Config
|
|
31
|
-
|
|
31
|
+
clones: github
|
|
32
32
|
timeout: 90
|
|
33
33
|
runs: 3
|
|
34
34
|
```
|
|
@@ -49,7 +49,7 @@ runs: 3
|
|
|
49
49
|
|
|
50
50
|
Each criterion is a bullet point. Tag with `[D]` or `[P]`:
|
|
51
51
|
|
|
52
|
-
- `[D]` = **Deterministic**. Checked against
|
|
52
|
+
- `[D]` = **Deterministic**. Checked against clone state programmatically. Use for counts, existence checks, state assertions. No LLM cost.
|
|
53
53
|
- `[P]` = **Probabilistic**. Judged by LLM evaluator from the trace and final state. Use for tone, quality, correctness, reasoning.
|
|
54
54
|
|
|
55
55
|
If no tag is provided, Archal infers the type:
|
|
@@ -78,19 +78,23 @@ If no tag is provided, Archal infers the type:
|
|
|
78
78
|
|
|
79
79
|
| Key | Type | Default | Description |
|
|
80
80
|
|-----|------|---------|-------------|
|
|
81
|
-
| `
|
|
81
|
+
| `clones` | comma-separated | inferred from content | Which clones to use |
|
|
82
82
|
| `seed` | string | | Named seed to load |
|
|
83
83
|
| `timeout` | integer | `180` | Seconds per run |
|
|
84
84
|
| `runs` | integer | `1` | Number of runs |
|
|
85
|
-
| `evaluator-model` | string |
|
|
85
|
+
| `evaluator-model` | string | Archal LLM judge | LLM for `[P]` criteria; set this only when bringing your own judge key |
|
|
86
86
|
| `tags` | comma-separated | | Scenario tags |
|
|
87
87
|
|
|
88
88
|
Aliases for `evaluator-model`: `evaluator`, `evaluatormodel`, `model`.
|
|
89
89
|
|
|
90
|
-
## Available
|
|
90
|
+
## Available clones and general-purpose seeds
|
|
91
91
|
|
|
92
|
-
|
|
92
|
+
The full clone and seed surface is manifest-backed. Prefer `archal clone --json`
|
|
93
|
+
and `archal seed list` over maintaining a separate list in this skill.
|
|
94
|
+
|
|
95
|
+
| Clone | Seeds |
|
|
93
96
|
|------|-------|
|
|
97
|
+
| `apify` | `empty` |
|
|
94
98
|
| `github` | `empty`, `small-project`, `enterprise-repo`, `ci-cd-pipeline`, `stale-issues`, `large-backlog` |
|
|
95
99
|
| `slack` | `empty`, `engineering-team`, `busy-workspace`, `incident-active` |
|
|
96
100
|
| `stripe` | `empty`, `small-business`, `checkout-flow`, `subscription-lifecycle`, `subscription-heavy` |
|
|
@@ -98,13 +102,13 @@ Aliases for `evaluator-model`: `evaluator`, `evaluatormodel`, `model`.
|
|
|
98
102
|
| `linear` | `empty`, `small-team`, `engineering-org`, `multi-team`, `busy-backlog` |
|
|
99
103
|
| `supabase` | `empty`, `small-project`, `saas-starter`, `ecommerce` |
|
|
100
104
|
| `google-workspace` | `empty`, `assistant-baseline`, `gmail-busy-inbox`, `calendar-packed-week` |
|
|
105
|
+
| `tavily` | `empty` |
|
|
101
106
|
| `ramp` | `empty`, `default` |
|
|
102
107
|
| `discord` | `empty`, `small-server`, `harvested` |
|
|
103
|
-
| `telegram` | `empty`, `harvested` |
|
|
104
108
|
|
|
105
|
-
##
|
|
109
|
+
## Clone auto-detection from content
|
|
106
110
|
|
|
107
|
-
If no `twins:` config is set, Archal infers twins from keywords in Setup, Expected Behavior, and Prompt:
|
|
111
|
+
If no `clones:` config is set, Archal infers clones from keywords in Setup, Expected Behavior, and Prompt:
|
|
108
112
|
|
|
109
113
|
- `github`, `repository`, `pull request`, `create_issue` -> `github`
|
|
110
114
|
- `slack`, `slack channel`, `send_message` -> `slack`
|
|
@@ -115,21 +119,19 @@ If no `twins:` config is set, Archal infers twins from keywords in Setup, Expect
|
|
|
115
119
|
- `google workspace`, `gmail`, `calendar event`, `inbox` -> `google-workspace`
|
|
116
120
|
- `discord`, `guild`, `text channel` -> `discord`
|
|
117
121
|
|
|
118
|
-
Not every
|
|
119
|
-
none. If your scenario uses `telegram`, set `twins: telegram` in the
|
|
120
|
-
Config block or in `.archal.json`. `ramp` auto-detects on `ramp`,
|
|
122
|
+
Not every clone has auto-detect keywords; for one that isn't inferred, set `clones:` explicitly in the Config block or in `.archal.json`. `ramp` auto-detects on `ramp`,
|
|
121
123
|
`bill`, `expense`, `reimbursement`, `fund`, `card spend`.
|
|
122
124
|
|
|
123
125
|
## Multi-service scenarios
|
|
124
126
|
|
|
125
|
-
Use multiple
|
|
127
|
+
Use multiple clones by listing them in config:
|
|
126
128
|
|
|
127
129
|
```markdown
|
|
128
130
|
## Config
|
|
129
|
-
|
|
131
|
+
clones: github, slack
|
|
130
132
|
```
|
|
131
133
|
|
|
132
|
-
The Setup section can describe state across both services. Each
|
|
134
|
+
The Setup section can describe state across both services. Each clone gets its own seed.
|
|
133
135
|
|
|
134
136
|
## Validation
|
|
135
137
|
|
|
@@ -137,18 +139,18 @@ Run `archal scenario list` to verify scenarios parse correctly. A valid scenario
|
|
|
137
139
|
- A title (H1 heading)
|
|
138
140
|
- A Prompt section
|
|
139
141
|
- At least one success criterion
|
|
140
|
-
- At least one referenced
|
|
142
|
+
- At least one referenced clone (explicit or inferred)
|
|
141
143
|
- Positive timeout and runs values
|
|
142
144
|
|
|
143
145
|
## Common mistakes to avoid
|
|
144
146
|
|
|
145
147
|
1. Writing `[D]` criteria that require subjective judgment
|
|
146
148
|
2. Writing `[P]` criteria that could be checked deterministically
|
|
147
|
-
3. Forgetting to specify which
|
|
149
|
+
3. Forgetting to specify which clone the scenario uses
|
|
148
150
|
4. Writing Setup descriptions that are too vague for seed generation
|
|
149
151
|
5. Using seed names that don't exist (check the seed table above)
|
|
150
152
|
|
|
151
153
|
## Documentation
|
|
152
154
|
|
|
153
155
|
- Writing scenarios: https://docs.archal.ai/guides/writing-scenarios
|
|
154
|
-
-
|
|
156
|
+
- Clones and seeds: https://docs.archal.ai/clones/overview
|
package/skills/vitest/SKILL.md
CHANGED
|
@@ -1,27 +1,27 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: vitest
|
|
3
|
-
description: Wire `archal/vitest` into a user's existing Vitest suite so integration tests hit hosted
|
|
3
|
+
description: Wire `archal/vitest` into a user's existing Vitest suite so integration tests hit hosted clones instead of real SaaS. Use when the user asks to "add archal to vitest", "wire up vitest with clones", "test against clones in vitest", or when invoked from Option C of the `onboard` skill.
|
|
4
4
|
user-invocable: true
|
|
5
5
|
---
|
|
6
6
|
|
|
7
7
|
# Archal Vitest Integration
|
|
8
8
|
|
|
9
|
-
Wire `archal/vitest` into the user's existing Vitest suite. Don't paste a canned config
|
|
9
|
+
Wire `archal/vitest` into the user's existing Vitest suite. Don't paste a canned config - inspect what's already there, surface the right choices, and compose on top of it.
|
|
10
10
|
|
|
11
11
|
## What only you know
|
|
12
12
|
|
|
13
13
|
Claude already knows what Vitest is and how a fetch interceptor works. These are the Archal-specific facts that determine your choices:
|
|
14
14
|
|
|
15
15
|
- `archal/vitest` is a **subpath export of the `archal` npm package**. Users do `pnpm add -D archal`, not `@archal/vitest`.
|
|
16
|
-
- Route mode installs a setup file that rewrites `fetch()` calls to hosted
|
|
17
|
-
-
|
|
16
|
+
- Route mode installs a setup file that rewrites `fetch()` calls to hosted clones. **Test code stays unchanged** - same SDKs, same URLs.
|
|
17
|
+
- Clones are hosted on **ECS Fargate** in Archal's AWS. First run = ~30s cold start. Subsequent runs within the 30-min idle TTL = ~2s. Tell the user; they'll think it's hung otherwise.
|
|
18
18
|
- Session cache key = `(projectName, services, seeds)` hash. Change any of those and the cache misses.
|
|
19
|
-
- **Seeds = starting state.** Omit to get the
|
|
20
|
-
- Route-mode
|
|
19
|
+
- **Seeds = starting state.** Omit to get the clone's default. Named seeds give fixtures (e.g. `small-project` for GitHub, `small-business` for Stripe). Never ask "what seed?" open-ended - the user doesn't know the catalog.
|
|
20
|
+
- Route-mode clone availability is defined by `SHARED_ROUTE_MANIFESTS` in `packages/route-runtime-core/src/manifests.ts`; use `archal clone --json` / `archal seed list` before naming supported services.
|
|
21
21
|
|
|
22
22
|
## Discover before you ask
|
|
23
23
|
|
|
24
|
-
1. `package.json` deps
|
|
24
|
+
1. `package.json` deps -> infer likely clones (`@octokit/rest` -> github, `stripe` -> stripe, `@slack/web-api` -> slack, `@supabase/supabase-js` -> supabase, `googleapis` -> google-workspace, `jira.js` -> jira).
|
|
25
25
|
2. Read any existing `vitest.config.ts` / `vitest.config.js` / `vitest.workspace.ts`. Note `setupFiles`, `include`/`exclude`, `reporters`, `projects`.
|
|
26
26
|
3. Grep test files (`__tests__/`, `tests/`, `*.test.ts`) for outbound calls: `fetch(`, `Octokit`, `new Stripe`, `WebClient`, `createClient`. These are the routing candidates.
|
|
27
27
|
4. Auth: `archal usage` tells you if they're logged in. `archal login` or `ARCHAL_TOKEN` in CI.
|
|
@@ -31,19 +31,19 @@ Claude already knows what Vitest is and how a fetch interceptor works. These are
|
|
|
31
31
|
Offer your inferred answer as the default.
|
|
32
32
|
|
|
33
33
|
1. **Scope.** "I found these N test files making outbound HTTP calls: [list]. All of them? Or a specific subset (by folder, glob, or file list)?"
|
|
34
|
-
2. **
|
|
35
|
-
3. **Seeds (per
|
|
36
|
-
> "For `github`: (a) default empty
|
|
34
|
+
2. **Clone set.** "From deps I see `[github, stripe]`. Complete, or am I missing/over-including?"
|
|
35
|
+
3. **Seeds (per clone, with inline catalog).** For each clone, present three choices:
|
|
36
|
+
> "For `github`: (a) default empty clone, (b) `small-project` seed (one repo, few issues/PRs - good starting point), (c) custom seed name. Which?"
|
|
37
37
|
|
|
38
38
|
## Pick a config pattern
|
|
39
39
|
|
|
40
40
|
Three patterns. The right one depends on what you saw in discovery.
|
|
41
41
|
|
|
42
|
-
### Pattern A
|
|
42
|
+
### Pattern A - wrap existing `vitest.config.ts` with `withArchal` (all tests hit clones)
|
|
43
43
|
|
|
44
44
|
For dedicated integration-test packages where every test should route. `withArchal` is a merge helper: it preserves everything in the existing `test` block (`coverage`, `alias`, `globalSetup`, `poolOptions`, custom reporters, etc.) and additively composes Archal's setup file, reporter, and session env on top.
|
|
45
45
|
|
|
46
|
-
Edit their existing file in place
|
|
46
|
+
Edit their existing file in place - the change is one line on the `test:` value:
|
|
47
47
|
|
|
48
48
|
```ts
|
|
49
49
|
import { defineConfig } from 'vitest/config';
|
|
@@ -71,9 +71,9 @@ Merge behavior: `setupFiles` and `reporters` are concatenated, `env` is merged (
|
|
|
71
71
|
|
|
72
72
|
If the user is starting from scratch (no existing `test` block), pass `{}` as the first argument: `withArchal({}, { services })`.
|
|
73
73
|
|
|
74
|
-
### Pattern B
|
|
74
|
+
### Pattern B - workspace with a separate Archal project (subset of tests hit clones)
|
|
75
75
|
|
|
76
|
-
Most common shape. Unit tests stay fast; only the routed subset provisions
|
|
76
|
+
Most common shape. Unit tests stay fast; only the routed subset provisions clones.
|
|
77
77
|
|
|
78
78
|
```ts
|
|
79
79
|
import { archalVitestProject } from 'archal/vitest';
|
|
@@ -82,7 +82,7 @@ export default [
|
|
|
82
82
|
'./vitest.config.ts', // their existing unit project untouched
|
|
83
83
|
archalVitestProject(
|
|
84
84
|
{
|
|
85
|
-
name: 'hosted-
|
|
85
|
+
name: 'hosted-clones',
|
|
86
86
|
services: {
|
|
87
87
|
github: { mode: 'route', seed: 'small-project' },
|
|
88
88
|
stripe: { mode: 'route' },
|
|
@@ -93,11 +93,11 @@ export default [
|
|
|
93
93
|
];
|
|
94
94
|
```
|
|
95
95
|
|
|
96
|
-
### Pattern C
|
|
96
|
+
### Pattern C - separate config + npm script (strict isolation)
|
|
97
97
|
|
|
98
98
|
Create a `vitest.integration.config.ts` using Pattern A, plus a `"test:integration": "vitest -c vitest.integration.config.ts"` script in `package.json`. Use when `pnpm test` must stay unit-only.
|
|
99
99
|
|
|
100
|
-
## Apply
|
|
100
|
+
## Apply -> verify
|
|
101
101
|
|
|
102
102
|
1. Install `archal` if missing.
|
|
103
103
|
2. Write/edit the config.
|
|
@@ -105,6 +105,7 @@ export default [
|
|
|
105
105
|
4. Run one routed test: `pnpm vitest run <path>`.
|
|
106
106
|
|
|
107
107
|
To confirm routing is live from inside a test:
|
|
108
|
+
|
|
108
109
|
```ts
|
|
109
110
|
import { getInstalledArchalVitestSession } from 'archal/vitest';
|
|
110
111
|
console.log(getInstalledArchalVitestSession()?.resolvedRuntime.resolvedServices);
|
|
@@ -112,18 +113,18 @@ console.log(getInstalledArchalVitestSession()?.resolvedRuntime.resolvedServices)
|
|
|
112
113
|
|
|
113
114
|
## Failure modes
|
|
114
115
|
|
|
115
|
-
- **Real API response instead of
|
|
116
|
-
- **401/auth at setup**
|
|
117
|
-
- **First run takes 30+ seconds**
|
|
118
|
-
- **Seed state unexpected**
|
|
119
|
-
- **`
|
|
120
|
-
- **CI credential race** (parallel jobs corrupting `~/.archal/credentials.json`)
|
|
116
|
+
- **Real API response instead of clone response** - test file isn't in the routed project's `include` glob.
|
|
117
|
+
- **401/auth at setup** - `ARCHAL_TOKEN` unset or `archal login` not run.
|
|
118
|
+
- **First run takes 30+ seconds** - ECS cold-start, expected. Warn the user up front.
|
|
119
|
+
- **Seed state unexpected** - inspect via `getInstalledArchalVitestSession()`; confirm resolved seed matches intent.
|
|
120
|
+
- **`resetArchalClones()` not restoring** - call in `beforeEach`, not `beforeAll`.
|
|
121
|
+
- **CI credential race** (parallel jobs corrupting `~/.archal/credentials.json`) - export `ARCHAL_TOKEN` directly; don't rely on the credential file.
|
|
121
122
|
|
|
122
123
|
## Anti-patterns
|
|
123
124
|
|
|
124
125
|
- Don't route `localhost` or the user's own backend. Route mode is for external SaaS.
|
|
125
126
|
- Don't set `testIsolation: 'serial'` preemptively. Only when you've observed cross-test state leaks.
|
|
126
|
-
- Don't add route mode to tests that don't make outbound HTTP calls
|
|
127
|
+
- Don't add route mode to tests that don't make outbound HTTP calls - the interceptor install has overhead.
|
|
127
128
|
- Don't drive vitest through `.archal.json`. That file is for the CLI `archal run` flow; the vitest integration is self-contained.
|
|
128
129
|
- Don't paste a canonical config without reading what's already in the repo.
|
|
129
130
|
|