archal 0.9.13 → 0.9.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/README.md +81 -73
  2. package/bin/archal.cjs +1 -1
  3. package/clone-assets/apify/tools.json +668 -0
  4. package/{twin-assets → clone-assets}/discord/fidelity.json +1 -1
  5. package/{twin-assets → clone-assets}/discord/tools.json +510 -510
  6. package/clone-assets/github/fidelity.json +31 -0
  7. package/{twin-assets → clone-assets}/github/tools.json +113 -3
  8. package/{twin-assets → clone-assets}/google-workspace/fidelity.json +2 -2
  9. package/{twin-assets → clone-assets}/google-workspace/tools.json +10 -10
  10. package/{twin-assets → clone-assets}/jira/fidelity.json +44 -4
  11. package/{twin-assets → clone-assets}/jira/tools.json +1 -1
  12. package/clone-assets/linear/fidelity.json +36 -0
  13. package/{twin-assets → clone-assets}/linear/tools.json +1 -1
  14. package/{twin-assets → clone-assets}/ramp/fidelity.json +1 -1
  15. package/{twin-assets → clone-assets}/ramp/tools.json +1 -1
  16. package/clone-assets/slack/fidelity.json +38 -0
  17. package/{twin-assets → clone-assets}/slack/tools.json +1 -1
  18. package/clone-assets/stripe/fidelity.json +67 -0
  19. package/{twin-assets → clone-assets}/stripe/tools.json +42 -11
  20. package/clone-assets/supabase/fidelity.json +31 -0
  21. package/{twin-assets → clone-assets}/supabase/tools.json +1 -1
  22. package/clone-assets/tavily/tools.json +115 -0
  23. package/dist/cli.cjs +97917 -0
  24. package/dist/cli.d.cts +1 -0
  25. package/dist/harness.cjs +62 -0
  26. package/dist/harness.d.cts +20 -0
  27. package/dist/index.cjs +5 -87878
  28. package/dist/index.d.cts +3 -1
  29. package/dist/seed/dynamic-generator.cjs +8796 -9201
  30. package/dist/seed/dynamic-generator.d.cts +39 -0
  31. package/dist/vitest/chunk-2GY4SFKE.js +29279 -0
  32. package/dist/vitest/{chunk-KTMNDJFB.js → chunk-WVRVNHAX.js} +45255 -44440
  33. package/dist/vitest/index.cjs +56408 -31519
  34. package/dist/vitest/index.d.ts +61 -27
  35. package/dist/vitest/index.js +145 -1807
  36. package/dist/vitest/runtime/hosted-session-reaper.cjs +34766 -28922
  37. package/dist/vitest/runtime/hosted-session-reaper.js +1 -2
  38. package/dist/vitest/runtime/setup-files.js +2 -3
  39. package/package.json +19 -10
  40. package/skills/eval/SKILL.md +113 -0
  41. package/skills/onboard/SKILL.md +67 -36
  42. package/skills/scenario/SKILL.md +22 -20
  43. package/skills/vitest/SKILL.md +25 -24
  44. package/dist/vitest/chunk-L6HSMJ3F.js +0 -2216
  45. package/dist/vitest/chunk-YJICENME.js +0 -1230
  46. package/dist/vitest/src-JGHX6UKK.js +0 -94
  47. package/skills/audit/SKILL.md +0 -55
  48. package/skills/test/SKILL.md +0 -109
  49. package/twin-assets/github/fidelity.json +0 -13
  50. package/twin-assets/linear/fidelity.json +0 -18
  51. package/twin-assets/slack/fidelity.json +0 -20
  52. package/twin-assets/stripe/fidelity.json +0 -22
  53. package/twin-assets/supabase/fidelity.json +0 -13
@@ -4,8 +4,7 @@ import {
4
4
  createHostedAuthLease,
5
5
  parsePositiveInteger,
6
6
  runHostedSessionReaper
7
- } from "../chunk-KTMNDJFB.js";
8
- import "../chunk-YJICENME.js";
7
+ } from "../chunk-WVRVNHAX.js";
9
8
 
10
9
  // src/runtime/hosted-session-reaper.ts
11
10
  var VITEST_AUTH_LEASE_OPTIONS = {
@@ -1,8 +1,7 @@
1
1
  import {
2
2
  bootstrapArchalVitestRouting
3
- } from "../chunk-L6HSMJ3F.js";
4
- import "../chunk-KTMNDJFB.js";
5
- import "../chunk-YJICENME.js";
3
+ } from "../chunk-2GY4SFKE.js";
4
+ import "../chunk-WVRVNHAX.js";
6
5
 
7
6
  // src/runtime/setup-files.ts
8
7
  import { existsSync, rmSync } from "fs";
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "archal",
3
- "version": "0.9.13",
4
- "description": "Test your agents & integrations against digital twins",
3
+ "version": "0.9.15",
4
+ "description": "Test your agents & integrations against service clones",
5
5
  "type": "module",
6
6
  "main": "dist/index.cjs",
7
7
  "types": "dist/index.d.cts",
@@ -13,6 +13,10 @@
13
13
  "types": "./dist/index.d.cts",
14
14
  "default": "./dist/index.cjs"
15
15
  },
16
+ "./harness": {
17
+ "types": "./dist/harness.d.cts",
18
+ "default": "./dist/harness.cjs"
19
+ },
16
20
  "./vitest": {
17
21
  "types": "./dist/vitest/index.d.ts",
18
22
  "import": "./dist/vitest/index.js",
@@ -31,20 +35,27 @@
31
35
  "agent",
32
36
  "testing",
33
37
  "mcp",
34
- "digital-twin",
38
+ "service-clone",
35
39
  "archal",
36
40
  "vitest"
37
41
  ],
38
42
  "engines": {
39
- "node": ">=20"
43
+ "node": ">=22"
40
44
  },
41
45
  "files": [
42
46
  "bin",
43
47
  "dist",
44
48
  "skills",
45
- "twin-assets",
49
+ "clone-assets",
46
50
  "LICENSE"
47
51
  ],
52
+ "scripts": {
53
+ "verify:artifacts": "node scripts/assert-artifacts.mjs",
54
+ "prepack": "pnpm run verify:artifacts",
55
+ "prepare": "node scripts/prepare.cjs",
56
+ "typecheck:raw": "node --check bin/archal.cjs && node --check scripts/assert-artifacts.mjs && node --check scripts/prepare.cjs",
57
+ "typecheck": "pnpm run typecheck:raw"
58
+ },
48
59
  "peerDependencies": {
49
60
  "vitest": ">=2.1.0"
50
61
  },
@@ -53,9 +64,7 @@
53
64
  "optional": true
54
65
  }
55
66
  },
56
- "scripts": {
57
- "verify:artifacts": "node scripts/assert-artifacts.mjs",
58
- "typecheck:raw": "node --check bin/archal.cjs && node --check scripts/assert-artifacts.mjs && node --check scripts/prepare.cjs",
59
- "typecheck": "pnpm run typecheck:raw"
67
+ "dependencies": {
68
+ "picomatch": "^4.0.4"
60
69
  }
61
- }
70
+ }
@@ -0,0 +1,113 @@
1
+ ---
2
+ name: eval
3
+ description: Run Archal scenarios or inline tasks against hosted clones, diagnose failed runs, and interpret satisfaction scores. Triggers on "run my scenario", "evaluate my agent", "archal run X", "debug this failing run", "what does this satisfaction score mean".
4
+ user-invocable: true
5
+ argument-hint: "[scenario.md or task description]"
6
+ ---
7
+
8
+ # Archal Eval Runner
9
+
10
+ You run Archal scenarios and inline tasks, then help the user interpret the results. For setting up the agent path or `.archal.json` in a fresh repo, hand off to the `onboard` skill.
11
+
12
+ ## What only you know (product mental model)
13
+
14
+ - `archal run` spawns the user's agent as a child process. The agent needs:
15
+ - A **runnable agent path**. Two ways to supply it: explicit `--harness <path>` (e.g. `./.archal/harness.mjs` from `archal init`), or `.archal.json` with an `agent` command. Repo-local auto-discovery also walks up from cwd for a top-level `harness.{ts,js,mjs,cjs}`.
16
+ - A **headless boundary** - no UI, no browser OAuth. The process is spawned without a shell, so interactive auth hangs forever.
17
+ - Env vars - auto-injected. `AGENT_TASK` is the prompt; service clones are reached through normal service URLs in a controlled runtime.
18
+ - Every `archal run` writes local artifacts under `.archal/cache/last-run.json` and `.archal/cache/runs/*.json` **regardless** of `--output`. `--output json` is only for machine-readable stdout; it's not needed for local persistence.
19
+ - **Satisfaction score** = (runs passing all criteria) / (total runs). `[D]` criteria are deterministic state checks; `[P]` criteria are LLM-judged from trace + final state.
20
+
21
+ ## Preflight the harness before a run
22
+
23
+ When the agent path is uncertain, or after any change to the harness file, smoke-test the harness directly before `archal run`:
24
+
25
+ ```bash
26
+ AGENT_TASK="Reply with OK and do not use tools." node ./.archal/harness.mjs
27
+ ```
28
+
29
+ A wired harness that exits cleanly with no service calls is ready. If it says the starter harness is still a stub, edit `.archal/harness.mjs` to call the user's Cursor, Codex, Claude Code, or custom agent first. Other failures this smoke test catches: no runnable entrypoint, UI-boot assumptions, missing provider keys, service bridge misconfig.
30
+
31
+ If a local harness returns text but records zero clone-observed calls, check for real-service auth signals such as `api.github.com` plus `401`/`Bad credentials`. That means the harness did not reach the clone. Fix by running the SDK under sandbox/Docker routing, or by wiring the SDK to the clone REST URL pattern shown by `archal clone start <service>` / `archal clone status` and authenticating with `ARCHAL_TOKEN`.
32
+
33
+ ## Running
34
+
35
+ Scenario from a file:
36
+
37
+ ```bash
38
+ archal run scenario.md
39
+ archal run scenario.md --runs 5 --seed enterprise-repo # N runs -> satisfaction score
40
+ ```
41
+
42
+ Inline task (no scenario file):
43
+
44
+ ```bash
45
+ archal run --task "Create an issue titled hello" --harness ./.archal/harness.mjs --clone github
46
+ ```
47
+
48
+ `--task` only replaces the scenario file - it still needs a runnable agent path. `--clone` is required with `--task`; repeat or comma-separate for multiple clones.
49
+
50
+ When `.archal.json` exists in cwd, bare `archal run` uses it. If the user doesn't have one yet, that's setup - hand off to the `onboard` skill, which owns harness creation and `.archal.json` scaffolding.
51
+
52
+ ## Interpret results
53
+
54
+ Score breakdown:
55
+ - `100%` = every run passed every criterion
56
+ - `80%` = 4/5 runs passed
57
+ - `0%` = none passed
58
+
59
+ Criterion types:
60
+ - `[D]` - deterministic state check. A failure is real; never a model variance artifact.
61
+ - `[P]` - LLM judge reads trace + final state. A single failure can be variance; re-run with `--runs 3` or higher to confirm before acting on it.
62
+
63
+ ## Diagnose failures
64
+
65
+ Re-run with `-v` for the full trace, then classify with these signals:
66
+
67
+ - **Agent bug** - wrong tool called, wrong arguments, stopped early.
68
+ *Signals:* trace shows the correct tool was available but the agent chose another; or arguments are malformed.
69
+ *Fix:* agent prompt, tool wiring, or underlying model.
70
+
71
+ - **Scenario bug** - criteria are too strict, ambiguous, or contradict the Setup.
72
+ *Signals:* agent clearly did the right thing but a `[D]` criterion expects an exact count the Setup didn't guarantee; or two criteria contradict each other.
73
+ *Fix:* make Setup more specific, or relax the criterion. Use the `scenario` skill.
74
+
75
+ - **Seed mismatch** - clone state doesn't match what Setup describes.
76
+ *Signals:* agent's first introspection tool call returns unexpected state (e.g. Setup says "4 stale issues" but the seed has 3).
77
+ *Fix:* different seed, or adjust Setup to match. `archal seed list <clone>` to browse.
78
+
79
+ - **Harness bug** - agent process never started, crashed immediately, or hung.
80
+ *Signals:* no tool calls in the trace, stderr shows a boot error, or the run times out at the configured `--timeout`.
81
+ *Fix:* smoke-test the harness directly with `AGENT_TASK="Reply with OK." node ./.archal/harness.mjs`, then look for an untouched starter stub, UI-only imports, missing provider keys, or interactive auth.
82
+
83
+ ## CI mode
84
+
85
+ ```bash
86
+ archal run scenario.md --runs 3 --pass-threshold 80 -o json -q
87
+ ```
88
+
89
+ Exit codes: `0` pass, `1` fail or score < threshold, `2` validation error. For GitHub Actions, inject `ARCHAL_TOKEN` as a secret. Use a workspace API key (`archal_ws_...`) for CI, not a personal token.
90
+
91
+ Workspace API keys are runtime and CI credentials bound to one workspace. They can run clones, upload and read traces, and read usage for that workspace. They cannot manage audit events or workspace API keys. Use an owner/admin user credential, either `archal login` or a dashboard-issued user API key, for workspace administration.
92
+
93
+ ## Artifacts + dashboard
94
+
95
+ - **Local (always written):** `.archal/cache/last-run.json` (summary), `.archal/cache/runs/*.json` (full redacted trace).
96
+ - **Hosted:** every run also uploads to https://www.archal.ai/dashboard - useful for sharing a failing trace with a colleague or comparing across agent model versions.
97
+
98
+ Don't tell users they need `-o json` to save artifacts locally - that's only for stdout.
99
+
100
+ ## Anti-patterns
101
+
102
+ - Don't re-document the `archal run` flag list here. `archal run --help` and https://docs.archal.ai/cli/run own that - they'll drift if duplicated.
103
+ - Don't guess the agent path. If the user doesn't have `--harness`, a repo-local harness, or `.archal.json`, hand off to `onboard` - it owns setup.
104
+ - Don't promote local proxy or Archal-owned route env as normal service simulation. Scored runs use a controlled runtime with transparent TLS interception against real service domains; uncontainerized proxy routing is low-fidelity debug only.
105
+ - Don't classify a single `[P]` failure as an agent bug without re-running. Probabilistic criteria need sample size.
106
+ - Don't treat a `[D]` failure as model variance. Deterministic failures are real bugs.
107
+
108
+ ## Docs
109
+
110
+ - Running with an agent: https://docs.archal.ai/guides/run-with-agent
111
+ - Existing repo playbook: https://docs.archal.ai/guides/existing-agent-repo
112
+ - Scenario authoring: hand off to the `scenario` skill
113
+ - Clone sessions: https://docs.archal.ai/guides/clone-sessions
@@ -6,35 +6,40 @@ user-invocable: true
6
6
 
7
7
  # Archal Onboard
8
8
 
9
- You are setting up Archal in this project. Archal tests AI agents against digital twins of real services (GitHub, Slack, Stripe, etc.). Handle installation and auth yourself; delegate the workflow-specific setup to the matching sub-skill.
9
+ You are setting up Archal in this project. Archal tests AI agents against clones of real services (GitHub, Slack, Stripe, etc.). Handle installation and auth yourself; delegate the workflow-specific setup to the matching sub-skill.
10
10
 
11
11
  ## If this is a cold-start
12
12
 
13
13
  The user may have landed here without running `npx archal init` first. If the
14
14
  CLI is missing (see "Install + auth" below) AND no `.archal-manifest.json`
15
- exists in `.claude/skills/`, the canonical first command is:
15
+ exists in any skill directory (`.claude/skills/`, `.codex/skills/`,
16
+ `.cursor/skills/`, `.windsurf/skills/`), the canonical first command is:
16
17
 
17
18
  ```bash
18
19
  npx archal init
19
20
  ```
20
21
 
21
- That adds `archal` as a devDependency and reinstalls these skills at the
22
- right version. Re-invoke the onboard skill after it completes.
22
+ That adds `archal` as a devDependency, installs skills for every detected
23
+ agent platform (Claude Code, Codex, Cursor, Windsurf), and creates a starter
24
+ `.archal.json`, `.archal/harness.mjs`, and `scenarios/first-run.md`. Re-invoke
25
+ the onboard skill after it completes.
23
26
 
24
27
  ## Discover first
25
28
 
26
29
  Before asking anything, read the repo:
27
30
 
28
- 1. `package.json` deps infer likely twins:
29
- - `@octokit/rest`, `octokit` `github`
30
- - `stripe` `stripe`
31
- - `@slack/web-api`, `@slack/bolt` `slack`
32
- - `@linear/sdk` `linear`
33
- - `@supabase/supabase-js` `supabase`
34
- - `googleapis`, `@google-cloud/*` `google-workspace`
35
- - `jira-client`, `jira.js` `jira`
31
+ 1. `package.json` deps -> infer likely clones:
32
+ - `@octokit/rest`, `octokit` -> `github`
33
+ - `stripe` -> `stripe`
34
+ - `@slack/web-api`, `@slack/bolt` -> `slack`
35
+ - `@linear/sdk` -> `linear`
36
+ - `@supabase/supabase-js` -> `supabase`
37
+ - `googleapis`, `@google-cloud/*` -> `google-workspace`
38
+ - `jira-client`, `jira.js` -> `jira`
39
+ - Apify SDK or `api.apify.com` usage -> `apify`
40
+ - Tavily SDK or `api.tavily.com` usage -> `tavily`
36
41
  2. Existing vitest config? Existing scenarios? Existing `.archal.json`? Those change which workflow makes sense.
37
- 3. If no `package.json` or no matching deps: ask "Which services does your agent interact with?" and show the full list: `github`, `slack`, `stripe`, `linear`, `jira`, `supabase`, `google-workspace`, `ramp`.
42
+ 3. If no `package.json` or no matching deps: ask "Which services does your agent interact with?" and point them to the clone catalog (`archal clone --json`) rather than maintaining a separate list here.
38
43
 
39
44
  ## Install + auth
40
45
 
@@ -46,10 +51,21 @@ archal login # OAuth browser flow, or: archal login --token <toke
46
51
  archal usage # verify auth + plan
47
52
  ```
48
53
 
49
- In CI, set `ARCHAL_TOKEN` instead of running `archal login`.
54
+ In CI, set `ARCHAL_TOKEN` to a **workspace API key** (`archal_ws_...`)
55
+ instead of running `archal login`. Workspace keys are bound to one workspace,
56
+ do not expire when a team member leaves, and are the recommended auth for CI.
57
+ Create one with `archal workspace api-key create <label> --scope sessions:write`
58
+ (requires owner or admin role) or from the dashboard under Settings > API Keys.
59
+ Personal tokens (`arc_...`) are fine for local dev but should not be used in CI.
60
+
61
+ Treat workspace API keys as runtime and CI credentials, not governance
62
+ credentials. They can run clones, upload and read traces, and read usage for
63
+ their bound workspace. They cannot manage workspace API keys or audit events.
64
+ Use an owner/admin user credential, either `archal login` or a dashboard-issued
65
+ user API key, for workspace administration.
50
66
 
51
67
  If something feels wrong (missing CLI, stale skills), these are the
52
- recovery commands don't run them otherwise:
68
+ recovery commands - don't run them otherwise:
53
69
 
54
70
  ```bash
55
71
  npx archal --version # CLI reachable? prints e.g. 0.9.12
@@ -58,54 +74,69 @@ npx archal init --skills-only # re-stage skills if they drifted
58
74
 
59
75
  ## Pick a workflow
60
76
 
61
- Confirm detected twins, then ask which of these the user wants. Each delegates to a sub-skill where appropriate don't inline those flows.
77
+ Confirm detected clones, then ask which of these the user wants. Each delegates to a sub-skill where appropriate - don't inline those flows.
78
+
79
+ ### The `agent` command (Options A and B both need this)
80
+
81
+ `archal run` spawns the agent as a child process, headlessly - no UI, no browser auth. The `agent` field in `.archal.json` is the command (plus arguments) that invokes it; it is an object, not a shell string. Typical shapes:
82
+
83
+ - `"agent": { "command": "node", "args": ["./.archal/harness.mjs"] }` - scaffolded by `archal init`
84
+ - `"agent": { "command": "npx", "args": ["tsx", "./.archal/harness.ts"] }` - custom TS entrypoint
85
+ - `"agent": { "command": "node", "args": ["./agent.js"] }` - plain Node script
86
+ - `"agent": { "command": "python", "args": ["agent.py"] }` - Python agent
62
87
 
63
- ### Option A Test an agent with scenarios
88
+ If the user doesn't have a harness yet, prefer `npx archal init`; it creates `./.archal/harness.mjs`, points `.archal.json` at it, and adds a starter scenario without overwriting existing files. The generated harness is a guarded stub: Archal refuses to score it until the user edits it to call their Cursor, Codex, Claude Code, or custom agent. A custom harness should read `AGENT_TASK` from env, call the agent runtime, print `{ "text": "..." }` to stdout, and call `reportAgentMetrics()` from `archal/harness` with accumulated `{ inputTokens, outputTokens, llmCallCount }` before exit. Service clients need one explicit routing mode: use sandbox/Docker routing when the harness calls normal service URLs such as `https://api.github.com`, or configure SDK base URLs to the clone REST URL pattern shown by `archal clone start <service>` / `archal clone status` and authenticate those requests with `ARCHAL_TOKEN`. Alternative: skip `agent` in `.archal.json` and pass `--harness <path>` per-run.
64
89
 
65
- Write markdown scenario files that describe setup, prompt, and success criteria; `archal run` executes them against twins.
90
+ ### Option A - Evaluate an agent with scenarios
91
+
92
+ Write markdown scenario files that describe setup, prompt, and success criteria; `archal run` executes them against clones.
66
93
 
67
94
  1. Create `.archal.json`:
68
95
  ```json
69
- { "agent": "<agent command>", "twins": ["<detected twins>"] }
96
+ { "agent": { "command": "<agent command>", "args": ["<arg1>", "..."] }, "clones": ["<detected clones>"], "scenarios": ["scenarios/first-run.md"] }
70
97
  ```
71
- 2. **Delegate to the `scenario` skill** to author a starter scenario. Don't paste a canned example here the skill knows the markdown format and success-criteria syntax.
72
- 3. Run: `archal run scenarios/<first>.md`.
98
+ 2. **Delegate to the `scenario` skill** to author a starter scenario. Don't paste a canned example here - the skill knows the markdown format and success-criteria syntax.
99
+ 3. Run: `archal run scenarios/<first>.md`. **Hand off to the `eval` skill** for result interpretation and failure diagnosis.
100
+
101
+ ### Option B - Run quick inline tasks
73
102
 
74
- ### Option B Run quick inline tasks
103
+ Same `.archal.json` as Option A (inline `--task` still needs an agent). Use this when the user wants ad-hoc runs before committing to scenario files.
75
104
 
76
- 1. `.archal.json` with just twins:
105
+ 1. `.archal.json`:
77
106
  ```json
78
- { "twins": ["<detected twins>"] }
107
+ {
108
+ "agent": { "command": "node", "args": ["./.archal/harness.mjs"] },
109
+ "clones": ["<detected clones>"]
110
+ }
79
111
  ```
80
- 2. Demo: `archal run --task "Create an issue titled hello" --twin github`.
81
-
82
- No sub-skill needed — this is a one-shot.
112
+ 2. Demo: `archal run --task "Create an issue titled hello" --clone github`.
113
+ 3. For the generated first-run project, use bare `archal run` after wiring `.archal/harness.mjs`.
83
114
 
84
- ### Option C Twins in a Vitest suite
115
+ ### Option C - Clones in a Vitest suite
85
116
 
86
- **Delegate to the `vitest` skill.** It handles reading the existing vitest config, identifying which tests should route, picking the right composition pattern, and seeding the twins.
117
+ **Delegate to the `vitest` skill.** It handles reading the existing vitest config, identifying which tests should route, picking the right composition pattern, and seeding the clones.
87
118
 
88
119
  Do not paste a sample config here. The right shape depends on what's already in the repo.
89
120
 
90
- ### Option D Persistent twins to develop against
121
+ ### Option D - Persistent clones to develop against
91
122
 
92
- Run: `archal twin start <detected twins>` gives live twin URLs the user's SDK clients can point at.
123
+ Run: `archal clone start <detected clones>` - gives live clone URLs the user's SDK clients can point at. `archal clone status` shows the active session; `archal clone stop` tears down.
93
124
 
94
125
  ## Verify
95
126
 
96
- Run the first test or task and show the result.
127
+ Run the first scenario or task. For Options A and B, hand off to the `eval` skill to interpret the satisfaction score and diagnose failures - that skill owns the runtime mental model (`[D]` vs `[P]` criteria, trace inspection, harness execution diagnostics).
97
128
 
98
129
  ## `.archal.json` schema
99
130
 
100
131
  | Field | Type | Required | Default | Description |
101
132
  |-------|------|----------|---------|-------------|
102
- | `agent` | string or `{ command, args }` | yes (for scenarios) | | Shell command to run the agent |
133
+ | `agent` | `{ command, args?, env? }` | yes (for scenarios) | | Agent command as an object (not a plain string) |
103
134
  | `title` | string | no | | Display name for reports |
104
- | `twins` | string[] | no | inferred | Which twins to provision |
135
+ | `clones` | string[] | no | inferred | Which clones to provision |
105
136
  | `scenarios` | string[] | no | | Scenario file paths relative to config |
106
- | `seeds` | `Record<string, string>` | no | | Per-twin seed overrides |
137
+ | `seeds` | `Record<string, string>` | no | | Per-clone seed overrides |
107
138
  | `agentModel` | string | no | | LLM model the agent uses |
108
- | `model` | string | no | `gemini-2.5-pro` | Evaluator model |
139
+ | `evaluatorModel` | string | no | Archal LLM judge | Evaluator/judge model; set this only when bringing your own judge key |
109
140
  | `runs` | number | no | `1` | Runs per scenario |
110
141
  | `timeout` | number | no | `180` | Timeout per run in seconds |
111
142
 
@@ -7,7 +7,7 @@ argument-hint: "[scenario description or file path]"
7
7
 
8
8
  # Archal Scenario Writer
9
9
 
10
- You write and edit Archal scenario files. Scenarios are markdown files that define a test for an AI agent running against digital twins.
10
+ You write and edit Archal scenario files. Scenarios are markdown files that define a test for an AI agent running against service clones.
11
11
 
12
12
  ## Scenario format
13
13
 
@@ -24,11 +24,11 @@ The task instruction given to the agent.
24
24
  Answer key for the evaluator. Never shown to the agent.
25
25
 
26
26
  ## Success Criteria
27
- - [D] Deterministic criterion checked against twin state
27
+ - [D] Deterministic criterion checked against clone state
28
28
  - [P] Probabilistic criterion judged by LLM
29
29
 
30
30
  ## Config
31
- twins: github
31
+ clones: github
32
32
  timeout: 90
33
33
  runs: 3
34
34
  ```
@@ -49,7 +49,7 @@ runs: 3
49
49
 
50
50
  Each criterion is a bullet point. Tag with `[D]` or `[P]`:
51
51
 
52
- - `[D]` = **Deterministic**. Checked against twin state programmatically. Use for counts, existence checks, state assertions. No LLM cost.
52
+ - `[D]` = **Deterministic**. Checked against clone state programmatically. Use for counts, existence checks, state assertions. No LLM cost.
53
53
  - `[P]` = **Probabilistic**. Judged by LLM evaluator from the trace and final state. Use for tone, quality, correctness, reasoning.
54
54
 
55
55
  If no tag is provided, Archal infers the type:
@@ -78,19 +78,23 @@ If no tag is provided, Archal infers the type:
78
78
 
79
79
  | Key | Type | Default | Description |
80
80
  |-----|------|---------|-------------|
81
- | `twins` | comma-separated | inferred from content | Which twins to use |
81
+ | `clones` | comma-separated | inferred from content | Which clones to use |
82
82
  | `seed` | string | | Named seed to load |
83
83
  | `timeout` | integer | `180` | Seconds per run |
84
84
  | `runs` | integer | `1` | Number of runs |
85
- | `evaluator-model` | string | `gemini-2.5-pro` | LLM for `[P]` criteria |
85
+ | `evaluator-model` | string | Archal LLM judge | LLM for `[P]` criteria; set this only when bringing your own judge key |
86
86
  | `tags` | comma-separated | | Scenario tags |
87
87
 
88
88
  Aliases for `evaluator-model`: `evaluator`, `evaluatormodel`, `model`.
89
89
 
90
- ## Available twins and general-purpose seeds
90
+ ## Available clones and general-purpose seeds
91
91
 
92
- | Twin | Seeds |
92
+ The full clone and seed surface is manifest-backed. Prefer `archal clone --json`
93
+ and `archal seed list` over maintaining a separate list in this skill.
94
+
95
+ | Clone | Seeds |
93
96
  |------|-------|
97
+ | `apify` | `empty` |
94
98
  | `github` | `empty`, `small-project`, `enterprise-repo`, `ci-cd-pipeline`, `stale-issues`, `large-backlog` |
95
99
  | `slack` | `empty`, `engineering-team`, `busy-workspace`, `incident-active` |
96
100
  | `stripe` | `empty`, `small-business`, `checkout-flow`, `subscription-lifecycle`, `subscription-heavy` |
@@ -98,13 +102,13 @@ Aliases for `evaluator-model`: `evaluator`, `evaluatormodel`, `model`.
98
102
  | `linear` | `empty`, `small-team`, `engineering-org`, `multi-team`, `busy-backlog` |
99
103
  | `supabase` | `empty`, `small-project`, `saas-starter`, `ecommerce` |
100
104
  | `google-workspace` | `empty`, `assistant-baseline`, `gmail-busy-inbox`, `calendar-packed-week` |
105
+ | `tavily` | `empty` |
101
106
  | `ramp` | `empty`, `default` |
102
107
  | `discord` | `empty`, `small-server`, `harvested` |
103
- | `telegram` | `empty`, `harvested` |
104
108
 
105
- ## Twin auto-detection from content
109
+ ## Clone auto-detection from content
106
110
 
107
- If no `twins:` config is set, Archal infers twins from keywords in Setup, Expected Behavior, and Prompt:
111
+ If no `clones:` config is set, Archal infers clones from keywords in Setup, Expected Behavior, and Prompt:
108
112
 
109
113
  - `github`, `repository`, `pull request`, `create_issue` -> `github`
110
114
  - `slack`, `slack channel`, `send_message` -> `slack`
@@ -115,21 +119,19 @@ If no `twins:` config is set, Archal infers twins from keywords in Setup, Expect
115
119
  - `google workspace`, `gmail`, `calendar event`, `inbox` -> `google-workspace`
116
120
  - `discord`, `guild`, `text channel` -> `discord`
117
121
 
118
- Not every twin has auto-detect keywords `telegram` in particular has
119
- none. If your scenario uses `telegram`, set `twins: telegram` in the
120
- Config block or in `.archal.json`. `ramp` auto-detects on `ramp`,
122
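+ Not every clone has auto-detect keywords; when a scenario's clone is not inferred, set `clones:` explicitly in the Config block or in `.archal.json`. `ramp` auto-detects on `ramp`,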
121
123
  `bill`, `expense`, `reimbursement`, `fund`, `card spend`.
122
124
 
123
125
  ## Multi-service scenarios
124
126
 
125
- Use multiple twins by listing them in config:
127
+ Use multiple clones by listing them in config:
126
128
 
127
129
  ```markdown
128
130
  ## Config
129
- twins: github, slack
131
+ clones: github, slack
130
132
  ```
131
133
 
132
- The Setup section can describe state across both services. Each twin gets its own seed.
134
+ The Setup section can describe state across both services. Each clone gets its own seed.
133
135
 
134
136
  ## Validation
135
137
 
@@ -137,18 +139,18 @@ Run `archal scenario list` to verify scenarios parse correctly. A valid scenario
137
139
  - A title (H1 heading)
138
140
  - A Prompt section
139
141
  - At least one success criterion
140
- - At least one referenced twin (explicit or inferred)
142
+ - At least one referenced clone (explicit or inferred)
141
143
  - Positive timeout and runs values
142
144
 
143
145
  ## Common mistakes to avoid
144
146
 
145
147
  1. Writing `[D]` criteria that require subjective judgment
146
148
  2. Writing `[P]` criteria that could be checked deterministically
147
- 3. Forgetting to specify which twin the scenario uses
149
+ 3. Forgetting to specify which clone the scenario uses
148
150
  4. Writing Setup descriptions that are too vague for seed generation
149
151
  5. Using seed names that don't exist (check the seed table above)
150
152
 
151
153
  ## Documentation
152
154
 
153
155
  - Writing scenarios: https://docs.archal.ai/guides/writing-scenarios
154
- - Twins and seeds: https://docs.archal.ai/twins/overview
156
+ - Clones and seeds: https://docs.archal.ai/clones/overview
@@ -1,27 +1,27 @@
1
1
  ---
2
2
  name: vitest
3
- description: Wire `archal/vitest` into a user's existing Vitest suite so integration tests hit hosted twins instead of real SaaS. Use when the user asks to "add archal to vitest", "wire up vitest with twins", "test against twins in vitest", or when invoked from `archal-onboard` Option C.
3
+ description: Wire `archal/vitest` into a user's existing Vitest suite so integration tests hit hosted clones instead of real SaaS. Use when the user asks to "add archal to vitest", "wire up vitest with clones", "test against clones in vitest", or when invoked from `archal-onboard` Option C.
4
4
  user-invocable: true
5
5
  ---
6
6
 
7
7
  # Archal Vitest Integration
8
8
 
9
- Wire `archal/vitest` into the user's existing Vitest suite. Don't paste a canned config — inspect what's already there, surface the right choices, and compose on top of it.
9
+ Wire `archal/vitest` into the user's existing Vitest suite. Don't paste a canned config - inspect what's already there, surface the right choices, and compose on top of it.
10
10
 
11
11
  ## What only you know
12
12
 
13
13
  Claude already knows what Vitest is and how a fetch interceptor works. These are the Archal-specific facts that determine your choices:
14
14
 
15
15
  - `archal/vitest` is a **subpath export of the `archal` npm package**. Users do `pnpm add -D archal`, not `@archal/vitest`.
16
- - Route mode installs a setup file that rewrites `fetch()` calls to hosted twins. **Test code stays unchanged** — same SDKs, same URLs.
17
- - Twins are hosted on **ECS Fargate** in Archal's AWS. First run = ~30s cold start. Subsequent runs within the 30-min idle TTL = ~2s. Tell the user; they'll think it's hung otherwise.
16
+ - Route mode installs a setup file that rewrites `fetch()` calls to hosted clones. **Test code stays unchanged** - same SDKs, same URLs.
17
+ - Clones are hosted on **ECS Fargate** in Archal's AWS. First run = ~30s cold start. Subsequent runs within the 30-min idle TTL = ~2s. Tell the user; they'll think it's hung otherwise.
18
18
  - Session cache key = `(projectName, services, seeds)` hash. Change any of those and the cache misses.
19
- - **Seeds = starting state.** Omit to get the twin's default. Named seeds give fixtures (e.g. `small-project` for GitHub, `small-business` for Stripe). Never ask "what seed?" open-ended — the user doesn't know the catalog.
20
- - Route-mode twins available: `github`, `slack`, `stripe`, `jira`, `supabase`, `google-workspace`. Not yet: `linear`, `ramp`.
19
+ - **Seeds = starting state.** Omit to get the clone's default. Named seeds give fixtures (e.g. `small-project` for GitHub, `small-business` for Stripe). Never ask "what seed?" open-ended - the user doesn't know the catalog.
20
+ - Route-mode clone availability is defined by `SHARED_ROUTE_MANIFESTS` in `packages/route-runtime-core/src/manifests.ts`; use `archal clone --json` / `archal seed list` before naming supported services.
21
21
 
22
22
  ## Discover before you ask
23
23
 
24
- 1. `package.json` deps → infer likely twins (`@octokit/rest` → github, `stripe` → stripe, `@slack/web-api` → slack, `@supabase/supabase-js` → supabase, `googleapis` → google-workspace, `jira.js` → jira).
24
+ 1. `package.json` deps -> infer likely clones (`@octokit/rest` -> github, `stripe` -> stripe, `@slack/web-api` -> slack, `@supabase/supabase-js` -> supabase, `googleapis` -> google-workspace, `jira.js` -> jira).
25
25
  2. Read any existing `vitest.config.ts` / `vitest.config.js` / `vitest.workspace.ts`. Note `setupFiles`, `include`/`exclude`, `reporters`, `projects`.
26
26
  3. Grep test files (`__tests__/`, `tests/`, `*.test.ts`) for outbound calls: `fetch(`, `Octokit`, `new Stripe`, `WebClient`, `createClient`. These are the routing candidates.
27
27
  4. Auth: `archal usage` tells you if they're logged in. `archal login` or `ARCHAL_TOKEN` in CI.
@@ -31,19 +31,19 @@ Claude already knows what Vitest is and how a fetch interceptor works. These are
31
31
  Offer your inferred answer as the default.
32
32
 
33
33
  1. **Scope.** "I found these N test files making outbound HTTP calls: [list]. All of them? Or a specific subset (by folder, glob, or file list)?"
34
- 2. **Twin set.** "From deps I see `[github, stripe]`. Complete, or am I missing/over-including?"
35
- 3. **Seeds (per twin, with inline catalog).** For each twin, present three choices:
36
- > "For `github`: (a) default empty twin, (b) `small-project` seed (one repo, few issues/PRs — good starting point), (c) custom seed name. Which?"
34
+ 2. **Clone set.** "From deps I see `[github, stripe]`. Complete, or am I missing/over-including?"
35
+ 3. **Seeds (per clone, with inline catalog).** For each clone, present three choices:
36
+ > "For `github`: (a) default empty clone, (b) `small-project` seed (one repo, few issues/PRs - good starting point), (c) custom seed name. Which?"
37
37
 
38
38
  ## Pick a config pattern
39
39
 
40
40
  Three patterns. The right one depends on what you saw in discovery.
41
41
 
42
- ### Pattern A — wrap existing `vitest.config.ts` with `withArchal` (all tests hit twins)
42
+ ### Pattern A - wrap existing `vitest.config.ts` with `withArchal` (all tests hit clones)
43
43
 
44
44
  For dedicated integration-test packages where every test should route. `withArchal` is a merge helper: it preserves everything in the existing `test` block (`coverage`, `alias`, `globalSetup`, `poolOptions`, custom reporters, etc.) and additively composes Archal's setup file, reporter, and session env on top.
45
45
 
46
- Edit their existing file in place — the change is one line on the `test:` value:
46
+ Edit their existing file in place - the change is one line on the `test:` value:
47
47
 
48
48
  ```ts
49
49
  import { defineConfig } from 'vitest/config';
@@ -71,9 +71,9 @@ Merge behavior: `setupFiles` and `reporters` are concatenated, `env` is merged (
71
71
 
72
72
  If the user is starting from scratch (no existing `test` block), pass `{}` as the first argument: `withArchal({}, { services })`.
73
73
 
74
- ### Pattern B — workspace with a separate Archal project (subset of tests hit twins)
74
+ ### Pattern B - workspace with a separate Archal project (subset of tests hit clones)
75
75
 
76
- Most common shape. Unit tests stay fast; only the routed subset provisions twins.
76
+ Most common shape. Unit tests stay fast; only the routed subset provisions clones.
77
77
 
78
78
  ```ts
79
79
  import { archalVitestProject } from 'archal/vitest';
@@ -82,7 +82,7 @@ export default [
82
82
  './vitest.config.ts', // their existing unit project — untouched
83
83
  archalVitestProject(
84
84
  {
85
- name: 'hosted-twins',
85
+ name: 'hosted-clones',
86
86
  services: {
87
87
  github: { mode: 'route', seed: 'small-project' },
88
88
  stripe: { mode: 'route' },
@@ -93,11 +93,11 @@ export default [
93
93
  ];
94
94
  ```
95
95
 
96
- ### Pattern C — separate config + npm script (strict isolation)
96
+ ### Pattern C - separate config + npm script (strict isolation)
97
97
 
98
98
  `vitest.integration.config.ts` using Pattern A, plus `"test:integration": "vitest -c vitest.integration.config.ts"`. Use when `pnpm test` must stay unit-only.
99
99
 
100
- ## Apply → verify
100
+ ## Apply -> verify
101
101
 
102
102
  1. Install `archal` if missing.
103
103
  2. Write/edit the config.
@@ -105,6 +105,7 @@ export default [
105
105
  4. Run one routed test: `pnpm vitest run <path>`.
106
106
 
107
107
  If confirming routing is live from inside a test:
108
+
108
109
  ```ts
109
110
  import { getInstalledArchalVitestSession } from 'archal/vitest';
110
111
  console.log(getInstalledArchalVitestSession()?.resolvedRuntime.resolvedServices);
@@ -112,18 +113,18 @@ console.log(getInstalledArchalVitestSession()?.resolvedRuntime.resolvedServices)
112
113
 
113
114
  ## Failure modes
114
115
 
115
- - **Real API response instead of twin response** — test file isn't in the routed project's `include` glob.
116
- - **401/auth at setup** — `ARCHAL_TOKEN` unset or `archal login` not run.
117
- - **First run takes 30+ seconds** — ECS cold-start, expected. Warn the user up front.
118
- - **Seed state unexpected** — inspect via `getInstalledArchalVitestSession()`; confirm resolved seed matches intent.
119
- - **`resetArchalTwins()` not restoring** — call in `beforeEach`, not `beforeAll`.
120
- - **CI credential race** (parallel jobs corrupting `~/.archal/credentials.json`) — export `ARCHAL_TOKEN` directly; don't rely on the credential file.
116
+ - **Real API response instead of clone response** - test file isn't in the routed project's `include` glob.
117
+ - **401/auth at setup** - `ARCHAL_TOKEN` unset or `archal login` not run.
118
+ - **First run takes 30+ seconds** - ECS cold-start, expected. Warn the user up front.
119
+ - **Seed state unexpected** - inspect via `getInstalledArchalVitestSession()`; confirm resolved seed matches intent.
120
+ - **`resetArchalClones()` not restoring** - call in `beforeEach`, not `beforeAll`.
121
+ - **CI credential race** (parallel jobs corrupting `~/.archal/credentials.json`) - export `ARCHAL_TOKEN` directly; don't rely on the credential file.
121
122
 
122
123
  ## Anti-patterns
123
124
 
124
125
  - Don't route `localhost` or the user's own backend. Route mode is for external SaaS.
125
126
  - Don't set `testIsolation: 'serial'` preemptively. Only when you've observed cross-test state leaks.
126
- - Don't add route mode to tests that don't make outbound HTTP calls — the interceptor install has overhead.
127
+ - Don't add route mode to tests that don't make outbound HTTP calls - the interceptor install has overhead.
127
128
  - Don't drive vitest through `.archal.json`. That file is for the CLI `archal run` flow; the vitest integration is self-contained.
128
129
  - Don't paste a canonical config without reading what's already in the repo.
129
130