archal 0.9.12 → 0.9.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +40 -7
- package/package.json +1 -1
- package/skills/{test → eval}/SKILL.md +3 -3
- package/skills/onboard/SKILL.md +37 -15
- package/skills/scenario/SKILL.md +8 -0
- package/skills/vitest/SKILL.md +1 -1
- package/twin-assets/google-workspace/tools.json +2 -1
- package/skills/audit/SKILL.md +0 -55
package/dist/index.cjs
CHANGED
|
@@ -76567,6 +76567,19 @@ function isNoAgentModelSentinel(value) {
|
|
|
76567
76567
|
|
|
76568
76568
|
// src/run/auth-seeds.ts
|
|
76569
76569
|
init_config_merger();
|
|
76570
|
+
function singleQuote(arg) {
|
|
76571
|
+
return `'${arg.replace(/'/g, `'\\''`)}'`;
|
|
76572
|
+
}
|
|
76573
|
+
function buildRerunCommand(scenarioArg, opts) {
|
|
76574
|
+
if (opts.task) {
|
|
76575
|
+
const parts = ["archal", "run", "--task", singleQuote(opts.task)];
|
|
76576
|
+
for (const t of opts.twin ?? []) {
|
|
76577
|
+
parts.push("--twin", singleQuote(t));
|
|
76578
|
+
}
|
|
76579
|
+
return parts.join(" ");
|
|
76580
|
+
}
|
|
76581
|
+
return `archal run ${singleQuote(scenarioArg)}`;
|
|
76582
|
+
}
|
|
76570
76583
|
async function resolveHostedScenarioPath(scenarioArg) {
|
|
76571
76584
|
const credentials = getCredentials2();
|
|
76572
76585
|
if (!credentials) {
|
|
@@ -76633,7 +76646,7 @@ async function resolveCredentialsAndEntitlements(scenarioArg, scenario, opts) {
|
|
|
76633
76646
|
if (!opts.preflightOnly) {
|
|
76634
76647
|
const required2 = requireAuth({
|
|
76635
76648
|
action: "run a scenario",
|
|
76636
|
-
nextCommand:
|
|
76649
|
+
nextCommand: buildRerunCommand(scenarioArg, opts)
|
|
76637
76650
|
});
|
|
76638
76651
|
credentials = required2 ?? getCredentials2();
|
|
76639
76652
|
if (!credentials) {
|
|
@@ -83480,6 +83493,13 @@ async function resolveRunCommandScenarios(scenarioArg, opts, command) {
|
|
|
83480
83493
|
info('Generated inline scenario for task: "' + opts.task + '"');
|
|
83481
83494
|
}
|
|
83482
83495
|
if (scenariosToRun.length === 0) {
|
|
83496
|
+
if (archalFile) {
|
|
83497
|
+
const configHasTwins = (archalFile.config.twins?.length ?? 0) > 0 || Object.keys(archalFile.config.seeds ?? {}).length > 0;
|
|
83498
|
+
const taskHint = configHasTwins ? ' Or run an inline task: archal run --task "Create an issue"' : ' Or run an inline task with a twin: archal run --task "Create an issue" --twin github';
|
|
83499
|
+
throw new CliUsageError(
|
|
83500
|
+
'Found .archal.json but no scenarios to run.\n Add scenarios: { "scenarios": ["scenarios/foo.md"] }\n Or pass a scenario directly: archal run scenario.md\n' + taskHint
|
|
83501
|
+
);
|
|
83502
|
+
}
|
|
83483
83503
|
throw new CliUsageError(
|
|
83484
83504
|
'No .archal.json config found and no scenario specified.\n Create .archal.json with your twins: { "twins": ["github"] }\n Or pass a scenario directly: archal run scenario.md\n Or run an inline task with a twin: archal run --task "Create an issue" --twin github'
|
|
83485
83505
|
);
|
|
@@ -87167,9 +87187,7 @@ ${GREEN5}${BOLD7}Archal skills installed${RESET8} ${DIM8}(v${version3})${RESET8}
|
|
|
87167
87187
|
`);
|
|
87168
87188
|
}
|
|
87169
87189
|
log3(`
|
|
87170
|
-
${DIM8}
|
|
87171
|
-
`);
|
|
87172
|
-
log3(`${DIM8}Try: "/archal-onboard" or "/archal-test"${RESET8}
|
|
87190
|
+
${DIM8}Next: /archal-onboard${RESET8}
|
|
87173
87191
|
|
|
87174
87192
|
`);
|
|
87175
87193
|
return {
|
|
@@ -87281,6 +87299,16 @@ function looksLikeSkillsDir(candidate) {
|
|
|
87281
87299
|
}
|
|
87282
87300
|
return false;
|
|
87283
87301
|
}
|
|
87302
|
+
function findPnpmWorkspaceRoot(start) {
|
|
87303
|
+
let dir = start;
|
|
87304
|
+
for (let i = 0; i < 10; i++) {
|
|
87305
|
+
if ((0, import_node_fs53.existsSync)((0, import_node_path53.join)(dir, "pnpm-workspace.yaml"))) return dir;
|
|
87306
|
+
const parent = (0, import_node_path53.dirname)(dir);
|
|
87307
|
+
if (parent === dir) return null;
|
|
87308
|
+
dir = parent;
|
|
87309
|
+
}
|
|
87310
|
+
return null;
|
|
87311
|
+
}
|
|
87284
87312
|
function findRepoRoot(start) {
|
|
87285
87313
|
let dir = start;
|
|
87286
87314
|
for (let i = 0; i < 10; i++) {
|
|
@@ -87308,7 +87336,14 @@ function resolveSkillsDir() {
|
|
|
87308
87336
|
);
|
|
87309
87337
|
}
|
|
87310
87338
|
function runPmAdd(pm, cwd, spec) {
|
|
87311
|
-
|
|
87339
|
+
let args;
|
|
87340
|
+
if (pm === "npm") {
|
|
87341
|
+
args = ["install", "--save-dev", "--no-audit", "--no-fund", "--loglevel=error", spec];
|
|
87342
|
+
} else if (pm === "pnpm" && findPnpmWorkspaceRoot(cwd) === cwd) {
|
|
87343
|
+
args = ["add", "-D", "-w", spec];
|
|
87344
|
+
} else {
|
|
87345
|
+
args = ["add", "-D", spec];
|
|
87346
|
+
}
|
|
87312
87347
|
const result = (0, import_node_child_process10.spawnSync)(pm, args, { cwd, stdio: "inherit" });
|
|
87313
87348
|
if (result.error) {
|
|
87314
87349
|
throw new CliRuntimeError(
|
|
@@ -87356,8 +87391,6 @@ function createInitCommand() {
|
|
|
87356
87391
|
`);
|
|
87357
87392
|
} else {
|
|
87358
87393
|
const pm = detectPackageManager2(cwd);
|
|
87359
|
-
process.stdout.write(`Installing archal@${CLI_VERSION} with ${pm}\u2026
|
|
87360
|
-
`);
|
|
87361
87394
|
runPmAdd(pm, cwd, `archal@${CLI_VERSION}`);
|
|
87362
87395
|
}
|
|
87363
87396
|
} else {
|
package/package.json
CHANGED
package/skills/{test → eval}/SKILL.md
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
---
|
|
2
|
-
name: test
|
|
3
|
-
description: Run Archal scenarios or inline tasks against hosted twins, diagnose failed runs, and interpret satisfaction scores. Triggers on "run my scenario", "
|
|
2
|
+
name: eval
|
|
3
|
+
description: Run Archal scenarios or inline tasks against hosted twins, diagnose failed runs, and interpret satisfaction scores. Triggers on "run my scenario", "evaluate my agent", "archal run X", "debug this failing run", "what does this satisfaction score mean".
|
|
4
4
|
user-invocable: true
|
|
5
5
|
argument-hint: "[scenario.md or task description]"
|
|
6
6
|
---
|
|
7
7
|
|
|
8
|
-
# Archal
|
|
8
|
+
# Archal Eval Runner
|
|
9
9
|
|
|
10
10
|
You run Archal scenarios and inline tasks, then help the user interpret the results. For setting up the agent path or `.archal.json` in a fresh repo, hand off to the `onboard` skill.
|
|
11
11
|
|
package/skills/onboard/SKILL.md
CHANGED
|
@@ -38,43 +38,65 @@ Before asking anything, read the repo:
|
|
|
38
38
|
|
|
39
39
|
## Install + auth
|
|
40
40
|
|
|
41
|
-
If you're here
|
|
42
|
-
|
|
41
|
+
If you're here via `npx archal init`, archal is already a devDependency
|
|
42
|
+
and the skills are already in place. Go straight to login:
|
|
43
43
|
|
|
44
44
|
```bash
|
|
45
|
-
npx archal --version # verify CLI is on PATH / in node_modules
|
|
46
|
-
npx archal init --skills-only # re-stage skills if they drifted
|
|
47
|
-
archal usage # check auth
|
|
48
45
|
archal login # OAuth browser flow, or: archal login --token <token>
|
|
46
|
+
archal usage # verify auth + plan
|
|
49
47
|
```
|
|
50
48
|
|
|
51
|
-
In CI,
|
|
49
|
+
In CI, set `ARCHAL_TOKEN` instead of running `archal login`.
|
|
50
|
+
|
|
51
|
+
If something feels wrong (missing CLI, stale skills), these are the
|
|
52
|
+
recovery commands — don't run them otherwise:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
npx archal --version # CLI reachable? prints e.g. 0.9.12
|
|
56
|
+
npx archal init --skills-only # re-stage skills if they drifted
|
|
57
|
+
```
|
|
52
58
|
|
|
53
59
|
## Pick a workflow
|
|
54
60
|
|
|
55
61
|
Confirm detected twins, then ask which of these the user wants. Each delegates to a sub-skill where appropriate — don't inline those flows.
|
|
56
62
|
|
|
57
|
-
###
|
|
63
|
+
### The `agent` command (Options A and B both need this)
|
|
64
|
+
|
|
65
|
+
`archal run` spawns the agent as a child process, headlessly — no UI, no browser auth. The `agent` field in `.archal.json` is the shell command that invokes it. Typical shapes:
|
|
66
|
+
|
|
67
|
+
- `"agent": "npx tsx ./.archal/harness.ts"` — custom TS entrypoint, most common
|
|
68
|
+
- `"agent": "node ./agent.js"` — plain Node script
|
|
69
|
+
- `"agent": "python agent.py"` — Python agent
|
|
70
|
+
|
|
71
|
+
If the user doesn't have a harness yet, scaffold one at `./.archal/harness.ts` that reads `ARCHAL_ENGINE_TASK` from env and calls their agent's runtime. Alternative: skip `agent` in `.archal.json` and pass `--harness <path>` per-run.
|
|
72
|
+
|
|
73
|
+
### Option A — Evaluate an agent with scenarios
|
|
58
74
|
|
|
59
75
|
Write markdown scenario files that describe setup, prompt, and success criteria; `archal run` executes them against twins.
|
|
60
76
|
|
|
61
77
|
1. Create `.archal.json`:
|
|
62
78
|
```json
|
|
63
|
-
{
|
|
79
|
+
{
|
|
80
|
+
"agent": "npx tsx ./.archal/harness.ts",
|
|
81
|
+
"twins": ["<detected twins>"]
|
|
82
|
+
}
|
|
64
83
|
```
|
|
65
84
|
2. **Delegate to the `scenario` skill** to author a starter scenario. Don't paste a canned example here — the skill knows the markdown format and success-criteria syntax.
|
|
66
|
-
3. Run: `archal run scenarios/<first>.md`.
|
|
85
|
+
3. Run: `archal run scenarios/<first>.md`. **Hand off to the `eval` skill** for result interpretation and failure diagnosis.
|
|
67
86
|
|
|
68
87
|
### Option B — Run quick inline tasks
|
|
69
88
|
|
|
70
|
-
|
|
89
|
+
Same `.archal.json` as Option A (inline `--task` still needs an agent). Use this when the user wants ad-hoc runs before committing to scenario files.
|
|
90
|
+
|
|
91
|
+
1. `.archal.json`:
|
|
71
92
|
```json
|
|
72
|
-
{
|
|
93
|
+
{
|
|
94
|
+
"agent": "npx tsx ./.archal/harness.ts",
|
|
95
|
+
"twins": ["<detected twins>"]
|
|
96
|
+
}
|
|
73
97
|
```
|
|
74
98
|
2. Demo: `archal run --task "Create an issue titled hello" --twin github`.
|
|
75
99
|
|
|
76
|
-
No sub-skill needed — this is a one-shot.
|
|
77
|
-
|
|
78
100
|
### Option C — Twins in a Vitest suite
|
|
79
101
|
|
|
80
102
|
**Delegate to the `vitest` skill.** It handles reading the existing vitest config, identifying which tests should route, picking the right composition pattern, and seeding the twins.
|
|
@@ -83,11 +105,11 @@ Do not paste a sample config here. The right shape depends on what's already in
|
|
|
83
105
|
|
|
84
106
|
### Option D — Persistent twins to develop against
|
|
85
107
|
|
|
86
|
-
Run: `archal twin start <detected twins>` — gives live twin URLs the user's SDK clients can point at.
|
|
108
|
+
Run: `archal twin start <detected twins>` — gives live twin URLs the user's SDK clients can point at. `archal twin status` shows the active session; `archal twin stop` tears down.
|
|
87
109
|
|
|
88
110
|
## Verify
|
|
89
111
|
|
|
90
|
-
Run the first
|
|
112
|
+
Run the first scenario or task. For Options A and B, hand off to the `eval` skill to interpret the satisfaction score and diagnose failures — that skill owns the runtime mental model (`[D]` vs `[P]` criteria, trace inspection, harness preflight).
|
|
91
113
|
|
|
92
114
|
## `.archal.json` schema
|
|
93
115
|
|
package/skills/scenario/SKILL.md
CHANGED
|
@@ -99,6 +99,8 @@ Aliases for `evaluator-model`: `evaluator`, `evaluatormodel`, `model`.
|
|
|
99
99
|
| `supabase` | `empty`, `small-project`, `saas-starter`, `ecommerce` |
|
|
100
100
|
| `google-workspace` | `empty`, `assistant-baseline`, `gmail-busy-inbox`, `calendar-packed-week` |
|
|
101
101
|
| `ramp` | `empty`, `default` |
|
|
102
|
+
| `discord` | `empty`, `small-server`, `harvested` |
|
|
103
|
+
| `telegram` | `empty`, `harvested` |
|
|
102
104
|
|
|
103
105
|
## Twin auto-detection from content
|
|
104
106
|
|
|
@@ -111,6 +113,12 @@ If no `twins:` config is set, Archal infers twins from keywords in Setup, Expect
|
|
|
111
113
|
- `stripe`, `payment`, `refund`, `subscription`, `invoice` -> `stripe`
|
|
112
114
|
- `supabase`, `database`, `sql query` -> `supabase`
|
|
113
115
|
- `google workspace`, `gmail`, `calendar event`, `inbox` -> `google-workspace`
|
|
116
|
+
- `discord`, `guild`, `text channel` -> `discord`
|
|
117
|
+
|
|
118
|
+
Not every twin has auto-detect keywords — `telegram` in particular has
|
|
119
|
+
none. If your scenario uses `telegram`, set `twins: telegram` in the
|
|
120
|
+
Config block or in `.archal.json`. `ramp` auto-detects on `ramp`,
|
|
121
|
+
`bill`, `expense`, `reimbursement`, `fund`, `card spend`.
|
|
114
122
|
|
|
115
123
|
## Multi-service scenarios
|
|
116
124
|
|
package/skills/vitest/SKILL.md
CHANGED
|
@@ -17,7 +17,7 @@ Claude already knows what Vitest is and how a fetch interceptor works. These are
|
|
|
17
17
|
- Twins are hosted on **ECS Fargate** in Archal's AWS. First run = ~30s cold start. Subsequent runs within the 30-min idle TTL = ~2s. Tell the user; they'll think it's hung otherwise.
|
|
18
18
|
- Session cache key = `(projectName, services, seeds)` hash. Change any of those and the cache misses.
|
|
19
19
|
- **Seeds = starting state.** Omit to get the twin's default. Named seeds give fixtures (e.g. `small-project` for GitHub, `small-business` for Stripe). Never ask "what seed?" open-ended — the user doesn't know the catalog.
|
|
20
|
-
- Route-mode twins available: `
|
|
20
|
+
- Route-mode twins available: `discord`, `github`, `google-workspace`, `jira`, `linear`, `ramp`, `slack`, `stripe`, `supabase`. Not yet: `telegram`. (Source of truth: `SHARED_ROUTE_MANIFESTS` in `packages/route-runtime-core/src/manifests.ts` — don't invent services that aren't in that array.)
|
|
21
21
|
|
|
22
22
|
## Discover before you ask
|
|
23
23
|
|
package/skills/audit/SKILL.md
DELETED
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: audit
|
|
3
|
-
description: Audit an Archal repository thoroughly. Trace real execution paths, identify concrete bugs and design flaws, distinguish root-cause fixes from architecture problems, and add regression tests for every confirmed issue.
|
|
4
|
-
user-invocable: true
|
|
5
|
-
argument-hint: "[repo path or scope]"
|
|
6
|
-
---
|
|
7
|
-
|
|
8
|
-
# Archal Repository Audit
|
|
9
|
-
|
|
10
|
-
Use this skill when the goal is to inspect an Archal repository deeply, find problems worth fixing, and avoid shallow or local-only patches.
|
|
11
|
-
|
|
12
|
-
## Audit standard
|
|
13
|
-
|
|
14
|
-
- Trace real execution paths from entrypoints before proposing fixes.
|
|
15
|
-
- Prefer root-cause fixes over guards, silencing, or narrow special cases.
|
|
16
|
-
- If the real problem is architectural, report it instead of applying a monkey patch.
|
|
17
|
-
- For every confirmed bug you fix, add the narrowest regression test that would have caught it earlier.
|
|
18
|
-
- Always include at least one regression test that covers a stale-data row or pre-migration row when the touched path has compatibility logic.
|
|
19
|
-
|
|
20
|
-
## Working pattern
|
|
21
|
-
|
|
22
|
-
1. Map the hot paths first.
|
|
23
|
-
- Identify the actual entrypoints: CLI commands, web routes, background jobs, and core runtime/session flows.
|
|
24
|
-
- Ignore dead-looking surfaces until the primary paths are understood.
|
|
25
|
-
2. Read the execution path end to end.
|
|
26
|
-
- Follow inputs through parsing, validation, persistence, normalization, and response shaping.
|
|
27
|
-
- Inspect nearby invariants and adjacent edge cases before deciding on a fix.
|
|
28
|
-
3. Separate findings into two buckets.
|
|
29
|
-
- **Fix now**: clear bug, contained scope, root cause understood, regression test is obvious.
|
|
30
|
-
- **Escalate**: the defect comes from a bad abstraction or architectural boundary and a local patch would hide the real problem.
|
|
31
|
-
4. Validate narrowly, then broadly.
|
|
32
|
-
- Run the smallest meaningful tests for the changed path first.
|
|
33
|
-
- If code changed, also run the relevant package build/typecheck before concluding.
|
|
34
|
-
|
|
35
|
-
## What to look for
|
|
36
|
-
|
|
37
|
-
- Compatibility shims that silently drop data from old rows or partially migrated schemas
|
|
38
|
-
- Session lifecycle bugs around start, ready, teardown, stale state, and idempotency
|
|
39
|
-
- Projection code that derives canonical state from stale denormalized fields
|
|
40
|
-
- Fallback behavior that changes semantics instead of preserving them
|
|
41
|
-
- Query builders that filter on derived fields inconsistently across list/count paths
|
|
42
|
-
- Evidence, trace, or normalization code that double-counts, hides, or misattributes records
|
|
43
|
-
|
|
44
|
-
## Output format
|
|
45
|
-
|
|
46
|
-
For each finding, report:
|
|
47
|
-
|
|
48
|
-
- Problem
|
|
49
|
-
- Technical cause
|
|
50
|
-
- Simple explanation
|
|
51
|
-
- Optimal fix
|
|
52
|
-
- Why that fix is better than narrower alternatives
|
|
53
|
-
- Regression test to add
|
|
54
|
-
|
|
55
|
-
If no actionable problems are found in a slice, say that explicitly and note any remaining coverage gaps.
|