archal 0.9.13 → 0.9.14

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -76567,6 +76567,19 @@ function isNoAgentModelSentinel(value) {
76567
76567
 
76568
76568
  // src/run/auth-seeds.ts
76569
76569
  init_config_merger();
76570
+ function singleQuote(arg) {
76571
+ return `'${arg.replace(/'/g, `'\\''`)}'`;
76572
+ }
76573
+ function buildRerunCommand(scenarioArg, opts) {
76574
+ if (opts.task) {
76575
+ const parts = ["archal", "run", "--task", singleQuote(opts.task)];
76576
+ for (const t of opts.twin ?? []) {
76577
+ parts.push("--twin", singleQuote(t));
76578
+ }
76579
+ return parts.join(" ");
76580
+ }
76581
+ return `archal run ${singleQuote(scenarioArg)}`;
76582
+ }
76570
76583
  async function resolveHostedScenarioPath(scenarioArg) {
76571
76584
  const credentials = getCredentials2();
76572
76585
  if (!credentials) {
@@ -76633,7 +76646,7 @@ async function resolveCredentialsAndEntitlements(scenarioArg, scenario, opts) {
76633
76646
  if (!opts.preflightOnly) {
76634
76647
  const required2 = requireAuth({
76635
76648
  action: "run a scenario",
76636
- nextCommand: `archal run ${scenarioArg}`
76649
+ nextCommand: buildRerunCommand(scenarioArg, opts)
76637
76650
  });
76638
76651
  credentials = required2 ?? getCredentials2();
76639
76652
  if (!credentials) {
@@ -83480,6 +83493,13 @@ async function resolveRunCommandScenarios(scenarioArg, opts, command) {
83480
83493
  info('Generated inline scenario for task: "' + opts.task + '"');
83481
83494
  }
83482
83495
  if (scenariosToRun.length === 0) {
83496
+ if (archalFile) {
83497
+ const configHasTwins = (archalFile.config.twins?.length ?? 0) > 0 || Object.keys(archalFile.config.seeds ?? {}).length > 0;
83498
+ const taskHint = configHasTwins ? ' Or run an inline task: archal run --task "Create an issue"' : ' Or run an inline task with a twin: archal run --task "Create an issue" --twin github';
83499
+ throw new CliUsageError(
83500
+ 'Found .archal.json but no scenarios to run.\n Add scenarios: { "scenarios": ["scenarios/foo.md"] }\n Or pass a scenario directly: archal run scenario.md\n' + taskHint
83501
+ );
83502
+ }
83483
83503
  throw new CliUsageError(
83484
83504
  'No .archal.json config found and no scenario specified.\n Create .archal.json with your twins: { "twins": ["github"] }\n Or pass a scenario directly: archal run scenario.md\n Or run an inline task with a twin: archal run --task "Create an issue" --twin github'
83485
83505
  );
@@ -87167,9 +87187,7 @@ ${GREEN5}${BOLD7}Archal skills installed${RESET8} ${DIM8}(v${version3})${RESET8}
87167
87187
  `);
87168
87188
  }
87169
87189
  log3(`
87170
- ${DIM8}Skills: ${skillDirs.join(", ")}${RESET8}
87171
- `);
87172
- log3(`${DIM8}Try: "/archal-onboard" or "/archal-test"${RESET8}
87190
+ ${DIM8}Next: /archal-onboard${RESET8}
87173
87191
 
87174
87192
  `);
87175
87193
  return {
@@ -87373,8 +87391,6 @@ function createInitCommand() {
87373
87391
  `);
87374
87392
  } else {
87375
87393
  const pm = detectPackageManager2(cwd);
87376
- process.stdout.write(`Installing archal@${CLI_VERSION} with ${pm}\u2026
87377
- `);
87378
87394
  runPmAdd(pm, cwd, `archal@${CLI_VERSION}`);
87379
87395
  }
87380
87396
  } else {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "archal",
3
- "version": "0.9.13",
3
+ "version": "0.9.14",
4
4
  "description": "Test your agents & integrations against digital twins",
5
5
  "type": "module",
6
6
  "main": "dist/index.cjs",
@@ -1,11 +1,11 @@
1
1
  ---
2
- name: test
3
- description: Run Archal scenarios or inline tasks against hosted twins, diagnose failed runs, and interpret satisfaction scores. Triggers on "run my scenario", "test my agent", "archal run X", "debug this failing run", "what does this satisfaction score mean".
2
+ name: eval
3
+ description: Run Archal scenarios or inline tasks against hosted twins, diagnose failed runs, and interpret satisfaction scores. Triggers on "run my scenario", "evaluate my agent", "archal run X", "debug this failing run", "what does this satisfaction score mean".
4
4
  user-invocable: true
5
5
  argument-hint: "[scenario.md or task description]"
6
6
  ---
7
7
 
8
- # Archal Test Runner
8
+ # Archal Eval Runner
9
9
 
10
10
  You run Archal scenarios and inline tasks, then help the user interpret the results. For setting up the agent path or `.archal.json` in a fresh repo, hand off to the `onboard` skill.
11
11
 
@@ -60,27 +60,43 @@ npx archal init --skills-only # re-stage skills if they drifted
60
60
 
61
61
  Confirm detected twins, then ask which of these the user wants. Each delegates to a sub-skill where appropriate — don't inline those flows.
62
62
 
63
- ### Option A Test an agent with scenarios
63
+ ### The `agent` command (Options A and B both need this)
64
+
65
+ `archal run` spawns the agent as a child process, headlessly — no UI, no browser auth. The `agent` field in `.archal.json` is the shell command that invokes it. Typical shapes:
66
+
67
+ - `"agent": "npx tsx ./.archal/harness.ts"` — custom TS entrypoint, most common
68
+ - `"agent": "node ./agent.js"` — plain Node script
69
+ - `"agent": "python agent.py"` — Python agent
70
+
71
+ If the user doesn't have a harness yet, scaffold one at `./.archal/harness.ts` that reads `ARCHAL_ENGINE_TASK` from the environment and calls their agent's runtime. Alternatively, omit `agent` from `.archal.json` and pass `--harness <path>` on each run.
72
+
73
+ ### Option A — Evaluate an agent with scenarios
64
74
 
65
75
  Write markdown scenario files that describe setup, prompt, and success criteria; `archal run` executes them against twins.
66
76
 
67
77
  1. Create `.archal.json`:
68
78
  ```json
69
- { "agent": "<agent command>", "twins": ["<detected twins>"] }
79
+ {
80
+ "agent": "npx tsx ./.archal/harness.ts",
81
+ "twins": ["<detected twins>"]
82
+ }
70
83
  ```
71
84
  2. **Delegate to the `scenario` skill** to author a starter scenario. Don't paste a canned example here — the skill knows the markdown format and success-criteria syntax.
72
- 3. Run: `archal run scenarios/<first>.md`.
85
+ 3. Run: `archal run scenarios/<first>.md`. **Hand off to the `eval` skill** for result interpretation and failure diagnosis.
73
86
 
74
87
  ### Option B — Run quick inline tasks
75
88
 
76
- 1. `.archal.json` with just twins:
89
+ Use the same `.archal.json` as Option A (an inline `--task` still needs an agent). Choose this option when the user wants ad-hoc runs before committing to scenario files.
90
+
91
+ 1. `.archal.json`:
77
92
  ```json
78
- { "twins": ["<detected twins>"] }
93
+ {
94
+ "agent": "npx tsx ./.archal/harness.ts",
95
+ "twins": ["<detected twins>"]
96
+ }
79
97
  ```
80
98
  2. Demo: `archal run --task "Create an issue titled hello" --twin github`.
81
99
 
82
- No sub-skill needed — this is a one-shot.
83
-
84
100
  ### Option C — Twins in a Vitest suite
85
101
 
86
102
  **Delegate to the `vitest` skill.** It handles reading the existing vitest config, identifying which tests should route, picking the right composition pattern, and seeding the twins.
@@ -89,11 +105,11 @@ Do not paste a sample config here. The right shape depends on what's already in
89
105
 
90
106
  ### Option D — Persistent twins to develop against
91
107
 
92
- Run: `archal twin start <detected twins>` — gives live twin URLs the user's SDK clients can point at.
108
+ Run: `archal twin start <detected twins>` — gives live twin URLs the user's SDK clients can point at. `archal twin status` shows the active session; `archal twin stop` tears it down.
93
109
 
94
110
  ## Verify
95
111
 
96
- Run the first test or task and show the result.
112
+ Run the first scenario or task. For Options A and B, hand off to the `eval` skill to interpret the satisfaction score and diagnose failures — that skill owns the runtime mental model (`[D]` vs `[P]` criteria, trace inspection, harness preflight).
97
113
 
98
114
  ## `.archal.json` schema
99
115
 
@@ -17,7 +17,7 @@ Claude already knows what Vitest is and how a fetch interceptor works. These are
17
17
  - Twins are hosted on **ECS Fargate** in Archal's AWS. First run = ~30s cold start. Subsequent runs within the 30-min idle TTL = ~2s. Tell the user; they'll think it's hung otherwise.
18
18
  - Session cache key = `(projectName, services, seeds)` hash. Change any of those and the cache misses.
19
19
  - **Seeds = starting state.** Omit to get the twin's default. Named seeds give fixtures (e.g. `small-project` for GitHub, `small-business` for Stripe). Never ask "what seed?" open-ended — the user doesn't know the catalog.
20
- - Route-mode twins available: `github`, `slack`, `stripe`, `jira`, `supabase`, `google-workspace`. Not yet: `linear`, `ramp`.
20
+ - Route-mode twins available: `discord`, `github`, `google-workspace`, `jira`, `linear`, `ramp`, `slack`, `stripe`, `supabase`. Not yet: `telegram`. (Source of truth: `SHARED_ROUTE_MANIFESTS` in `packages/route-runtime-core/src/manifests.ts` — don't invent services that aren't in that array.)
21
21
 
22
22
  ## Discover before you ask
23
23
 
@@ -1,55 +0,0 @@
1
- ---
2
- name: audit
3
- description: Audit an Archal repository thoroughly. Trace real execution paths, identify concrete bugs and design flaws, distinguish root-cause fixes from architecture problems, and add regression tests for every confirmed issue.
4
- user-invocable: true
5
- argument-hint: "[repo path or scope]"
6
- ---
7
-
8
- # Archal Repository Audit
9
-
10
- Use this skill when the goal is to inspect an Archal repository deeply, find problems worth fixing, and avoid shallow or local-only patches.
11
-
12
- ## Audit standard
13
-
14
- - Trace real execution paths from entrypoints before proposing fixes.
15
- - Prefer root-cause fixes over guards, silencing, or narrow special cases.
16
- - If the real problem is architectural, report it instead of applying a monkey patch.
17
- - For every confirmed bug you fix, add the narrowest regression test that would have caught it earlier.
18
- - Always include at least one regression test that covers a stale-data row or pre-migration row when the touched path has compatibility logic.
19
-
20
- ## Working pattern
21
-
22
- 1. Map the hot paths first.
23
- - Identify the actual entrypoints: CLI commands, web routes, background jobs, and core runtime/session flows.
24
- - Ignore dead-looking surfaces until the primary paths are understood.
25
- 2. Read the execution path end to end.
26
- - Follow inputs through parsing, validation, persistence, normalization, and response shaping.
27
- - Inspect nearby invariants and adjacent edge cases before deciding on a fix.
28
- 3. Separate findings into two buckets.
29
- - **Fix now**: clear bug, contained scope, root cause understood, regression test is obvious.
30
- - **Escalate**: the defect comes from a bad abstraction or architectural boundary and a local patch would hide the real problem.
31
- 4. Validate narrowly, then broadly.
32
- - Run the smallest meaningful tests for the changed path first.
33
- - If code changed, also run the relevant package build/typecheck before concluding.
34
-
35
- ## What to look for
36
-
37
- - Compatibility shims that silently drop data from old rows or partially migrated schemas
38
- - Session lifecycle bugs around start, ready, teardown, stale state, and idempotency
39
- - Projection code that derives canonical state from stale denormalized fields
40
- - Fallback behavior that changes semantics instead of preserving them
41
- - Query builders that filter on derived fields inconsistently across list/count paths
42
- - Evidence, trace, or normalization code that double-counts, hides, or misattributes records
43
-
44
- ## Output format
45
-
46
- For each finding, report:
47
-
48
- - Problem
49
- - Technical cause
50
- - Simple explanation
51
- - Optimal fix
52
- - Why that fix is better than narrower alternatives
53
- - Regression test to add
54
-
55
- If no actionable problems are found in a slice, say that explicitly and note any remaining coverage gaps.