@archal/skills 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +8 -0
- package/README.md +33 -0
- package/bin/install.js +57 -0
- package/package.json +37 -0
- package/skills/onboard/SKILL.md +147 -0
- package/skills/scenario/SKILL.md +147 -0
- package/skills/test/SKILL.md +195 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
Copyright (c) 2026 Archal Labs, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
This software is proprietary and confidential. No part of this software may be
|
|
4
|
+
reproduced, distributed, or transmitted in any form or by any means, including
|
|
5
|
+
photocopying, recording, or other electronic or mechanical methods, without the
|
|
6
|
+
prior written permission of Archal Labs, Inc.
|
|
7
|
+
|
|
8
|
+
For licensing inquiries, visit https://archal.ai or contact support@archal.ai.
|
package/README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# @archal/skills
|
|
2
|
+
|
|
3
|
+
Install [Archal](https://archal.ai) coding agent skills into your project. Works with Claude Code, Cursor, and Windsurf.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npx @archal/skills
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
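This copies the skill files into `.claude/skills/`, `.cursor/skills/`, and `.windsurf/skills/` — one copy for each of those agent directories (`.claude`, `.cursor`, `.windsurf`) found in the directory you run it from. If none is found, it creates `.claude/` and installs there. Each skill lands in its own `archal-<skill>` folder. No config, no prompts.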
|
|
12
|
+
|
|
13
|
+
To update:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npx @archal/skills@latest
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Skills
|
|
20
|
+
|
|
21
|
+
| Skill | Description |
|
|
22
|
+
|-------|-------------|
|
|
23
|
+
| **onboard** | Set up Archal in your project — detects dependencies, scaffolds config, runs a first test |
|
|
24
|
+
| **scenario** | Write and edit scenario markdown files with the correct format and criteria syntax |
|
|
25
|
+
| **test** | Run scenarios and inline tasks, interpret results, debug failures |
|
|
26
|
+
|
|
27
|
+
## What is Archal?
|
|
28
|
+
|
|
29
|
+
Archal tests AI agents against digital twins of real services (GitHub, Slack, Stripe, Linear, Jira, Supabase, Google Workspace) before they touch production.
|
|
30
|
+
|
|
31
|
+
- [Documentation](https://archal.ai/docs)
|
|
32
|
+
- [Quickstart](https://archal.ai/docs/quickstart)
|
|
33
|
+
- [GitHub](https://github.com/Archal-Labs/archal)
|
package/bin/install.js
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { existsSync, mkdirSync, cpSync, readdirSync } from 'node:fs';
|
|
4
|
+
import { resolve, join, dirname } from 'node:path';
|
|
5
|
+
import { fileURLToPath } from 'node:url';
|
|
6
|
+
|
|
7
|
+
const __dirname = dirname(fileURLToPath(import.meta.url)); // directory containing this script, derived from the module URL
|
|
8
|
+
const skillsSource = resolve(__dirname, '..', 'skills'); // the skills/ directory one level above this script's directory
|
|
9
|
+
const projectRoot = process.cwd(); // skills are installed relative to the current working directory
|
|
10
|
+
|
|
11
|
+
const RESET = '\x1b[0m'; // ANSI SGR escape: reset all text attributes
|
|
12
|
+
const GREEN = '\x1b[32m'; // ANSI SGR escape: green foreground
|
|
13
|
+
const DIM = '\x1b[2m'; // ANSI SGR escape: dim/faint text
|
|
14
|
+
const BOLD = '\x1b[1m'; // ANSI SGR escape: bold text
|
|
15
|
+
|
|
16
|
+
const targets = [ // supported agents: dir is the marker directory used for detection, label is printed in the summary, skillsPath is where skills are copied
|
|
17
|
+
{ dir: '.claude', label: 'Claude Code', skillsPath: '.claude/skills' }, // first entry doubles as the fallback when nothing is detected
|
|
18
|
+
{ dir: '.cursor', label: 'Cursor', skillsPath: '.cursor/skills' },
|
|
19
|
+
{ dir: '.windsurf', label: 'Windsurf', skillsPath: '.windsurf/skills' },
|
|
20
|
+
];
|
|
21
|
+
|
|
22
|
+
const detected = targets.filter((t) => existsSync(join(projectRoot, t.dir))); // keep only targets whose marker directory exists under projectRoot
|
|
23
|
+
|
|
24
|
+
if (detected.length === 0) { // no supported agent directory was found
|
|
25
|
+
// Default to Claude Code if no agent directory exists
|
|
26
|
+
detected.push(targets[0]); // targets[0] is the Claude Code entry
|
|
27
|
+
mkdirSync(join(projectRoot, '.claude'), { recursive: true }); // create the .claude marker directory under projectRoot
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
const skillDirs = readdirSync(skillsSource, { withFileTypes: true }) // list the bundled skills/ directory as Dirent entries
|
|
31
|
+
.filter((d) => d.isDirectory()) // each subdirectory is treated as one skill; plain files are ignored
|
|
32
|
+
.map((d) => d.name); // keep just the directory names
|
|
33
|
+
|
|
34
|
+
let installed = 0; // incremented once per (target, skill) copy, so it totals skills x targets
|
|
35
|
+
|
|
36
|
+
for (const target of detected) { // every detected agent gets its own copy of each skill
|
|
37
|
+
for (const skill of skillDirs) {
|
|
38
|
+
const src = join(skillsSource, skill); // the skill's directory inside this package
|
|
39
|
+
const dest = join(projectRoot, target.skillsPath, `archal-${skill}`); // installed under an "archal-"-prefixed directory name
|
|
40
|
+
mkdirSync(dest, { recursive: true }); // ensure the destination and its parent directories exist
|
|
41
|
+
cpSync(src, dest, { recursive: true }); // recursively copy the skill directory's contents into dest
|
|
42
|
+
installed++;
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
console.log(
|
|
47
|
+
`\n${GREEN}${BOLD}archal skills installed${RESET}`
|
|
48
|
+
);
|
|
49
|
+
console.log(
|
|
50
|
+
`${DIM}${installed} skill(s) -> ${detected.map((t) => t.label).join(', ')}${RESET}`
|
|
51
|
+
);
|
|
52
|
+
console.log(
|
|
53
|
+
`\n${DIM}Skills: onboard, scenario, test${RESET}`
|
|
54
|
+
);
|
|
55
|
+
console.log(
|
|
56
|
+
`${DIM}Try: "Set up Archal in this project" or "/onboard"${RESET}\n`
|
|
57
|
+
);
|
package/package.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@archal/skills",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Install Archal coding agent skills into your project",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"archal-skills": "bin/install.js"
|
|
8
|
+
},
|
|
9
|
+
"files": [
|
|
10
|
+
"bin",
|
|
11
|
+
"skills",
|
|
12
|
+
"LICENSE"
|
|
13
|
+
],
|
|
14
|
+
"license": "SEE LICENSE IN LICENSE",
|
|
15
|
+
"repository": {
|
|
16
|
+
"type": "git",
|
|
17
|
+
"url": "git+https://github.com/Archal-Labs/archal.git",
|
|
18
|
+
"directory": "packages/archal-skills"
|
|
19
|
+
},
|
|
20
|
+
"homepage": "https://archal.ai",
|
|
21
|
+
"keywords": [
|
|
22
|
+
"archal",
|
|
23
|
+
"ai",
|
|
24
|
+
"agent",
|
|
25
|
+
"testing",
|
|
26
|
+
"skills",
|
|
27
|
+
"claude-code",
|
|
28
|
+
"cursor"
|
|
29
|
+
],
|
|
30
|
+
"engines": {
|
|
31
|
+
"node": ">=18"
|
|
32
|
+
},
|
|
33
|
+
"bugs": "https://github.com/Archal-Labs/archal/issues",
|
|
34
|
+
"publishConfig": {
|
|
35
|
+
"access": "public"
|
|
36
|
+
}
|
|
37
|
+
}
|
package/skills/onboard/SKILL.md
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: onboard
|
|
3
|
+
description: Set up Archal in this project. Detects dependencies, installs the CLI, scaffolds config, and runs a first test.
|
|
4
|
+
user-invocable: true
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Archal Onboard
|
|
8
|
+
|
|
9
|
+
You are setting up Archal in this project. Archal tests AI agents against digital twins of real services (GitHub, Slack, Stripe, etc.) before they touch production. Walk the developer through each step interactively.
|
|
10
|
+
|
|
11
|
+
## Step 1: Check if archal is installed
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
npx archal --version
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
If not installed, install it:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
npm install -g archal
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Or as a dev dependency:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
npm install -D archal
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
The published npm package name is `archal`.
|
|
30
|
+
|
|
31
|
+
## Step 2: Check authentication
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
archal usage
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
If not authenticated:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
archal login
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
This opens a browser for OAuth. Alternatively: `archal login --token <token>`.
|
|
44
|
+
|
|
45
|
+
## Step 3: Detect which twins this project needs
|
|
46
|
+
|
|
47
|
+
If `package.json` exists, read it and check dependencies:
|
|
48
|
+
|
|
49
|
+
| Dependency | Suggested twin |
|
|
50
|
+
|-----------|----------------|
|
|
51
|
+
| `@octokit/rest`, `octokit` | `github` |
|
|
52
|
+
| `stripe` | `stripe` |
|
|
53
|
+
| `@slack/web-api`, `@slack/bolt` | `slack` |
|
|
54
|
+
| `@linear/sdk` | `linear` |
|
|
55
|
+
| `@supabase/supabase-js` | `supabase` |
|
|
56
|
+
| `googleapis`, `@google-cloud/*` | `google-workspace` |
|
|
57
|
+
| `jira-client`, `jira.js` | `jira` |
|
|
58
|
+
|
|
59
|
+
If there is no `package.json` or no matching dependencies are found, ask the developer directly: "Which services does your agent interact with?" and present the full list.
|
|
60
|
+
|
|
61
|
+
All available twins: `github`, `slack`, `stripe`, `linear`, `jira`, `supabase`, `google-workspace`, `ramp`.
|
|
62
|
+
|
|
63
|
+
## Step 4: Ask which workflow the developer wants
|
|
64
|
+
|
|
65
|
+
Present these options:
|
|
66
|
+
|
|
67
|
+
### Option A: Test my agent with scenarios
|
|
68
|
+
|
|
69
|
+
1. Create `.archal.json`:
|
|
70
|
+
|
|
71
|
+
```json
|
|
72
|
+
{
|
|
73
|
+
"agent": "<their agent command>",
|
|
74
|
+
"twins": ["<detected twins>"]
|
|
75
|
+
}
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
2. Create a starter scenario at `scenarios/hello.md`:
|
|
79
|
+
|
|
80
|
+
```markdown
|
|
81
|
+
# Hello World Test
|
|
82
|
+
|
|
83
|
+
## Setup
|
|
84
|
+
A GitHub repository with 3 open issues.
|
|
85
|
+
|
|
86
|
+
## Prompt
|
|
87
|
+
List all open issues and comment "acknowledged" on each one.
|
|
88
|
+
|
|
89
|
+
## Success Criteria
|
|
90
|
+
- [D] All 3 issues have a new comment
|
|
91
|
+
- [P] Each comment says "acknowledged"
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
3. Run it: `archal run scenarios/hello.md`
|
|
95
|
+
|
|
96
|
+
### Option B: Run quick inline tasks
|
|
97
|
+
|
|
98
|
+
1. Create `.archal.json` with just twins:
|
|
99
|
+
|
|
100
|
+
```json
|
|
101
|
+
{
|
|
102
|
+
"twins": ["<detected twins>"]
|
|
103
|
+
}
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
2. Run a demo: `archal run --task "Create an issue titled hello" --twin github`
|
|
107
|
+
|
|
108
|
+
### Option C: Twins in my Vitest suite
|
|
109
|
+
|
|
110
|
+
1. Install: `npm install -D @archal/vitest`
|
|
111
|
+
2. Show them a sample `vitest.workspace.ts` config
|
|
112
|
+
3. Create a sample test file
|
|
113
|
+
|
|
114
|
+
### Option D: Persistent twins to develop against
|
|
115
|
+
|
|
116
|
+
Run: `archal twin start <detected twins>`
|
|
117
|
+
|
|
118
|
+
This gives them live twin URLs they can point their SDK clients at.
|
|
119
|
+
|
|
120
|
+
## Step 5: Verify everything works
|
|
121
|
+
|
|
122
|
+
Run the first test or task and show the result.
|
|
123
|
+
|
|
124
|
+
## Step 6: Suggest next steps
|
|
125
|
+
|
|
126
|
+
- Write more scenarios (use the `scenario` skill)
|
|
127
|
+
- Run with `--runs 5` for a satisfaction score
|
|
128
|
+
- Set up CI with `ARCHAL_TOKEN` secret
|
|
129
|
+
|
|
130
|
+
## .archal.json full schema
|
|
131
|
+
|
|
132
|
+
| Field | Type | Required | Default | Description |
|
|
133
|
+
|-------|------|----------|---------|-------------|
|
|
134
|
+
| `agent` | string or `{ command, args }` | yes (for scenarios) | | Shell command to run the agent |
|
|
135
|
+
| `title` | string | no | | Display name for reports |
|
|
136
|
+
| `twins` | string[] | no | inferred | Which twins to provision |
|
|
137
|
+
| `scenarios` | string[] | no | | Scenario file paths relative to config |
|
|
138
|
+
| `seeds` | `Record<string, string>` | no | | Per-twin seed overrides |
|
|
139
|
+
| `agentModel` | string | no | | LLM model the agent uses |
|
|
140
|
+
| `model` | string | no | `gemini-2.5-pro` | Evaluator model |
|
|
141
|
+
| `runs` | number | no | `1` | Runs per scenario |
|
|
142
|
+
| `timeout` | number | no | `180` | Timeout per run in seconds |
|
|
143
|
+
|
|
144
|
+
## Documentation
|
|
145
|
+
|
|
146
|
+
- Quickstart: https://archal.ai/docs/quickstart
|
|
147
|
+
- Full docs: https://archal.ai/docs
|
package/skills/scenario/SKILL.md
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scenario
|
|
3
|
+
description: Write, edit, and validate Archal scenario files. Knows the markdown format, success criteria syntax, and config options.
|
|
4
|
+
user-invocable: true
|
|
5
|
+
argument-hint: "[scenario description or file path]"
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Archal Scenario Writer
|
|
9
|
+
|
|
10
|
+
You write and edit Archal scenario files. Scenarios are markdown files that define a test for an AI agent running against digital twins.
|
|
11
|
+
|
|
12
|
+
## Scenario format
|
|
13
|
+
|
|
14
|
+
```markdown
|
|
15
|
+
# Scenario Title
|
|
16
|
+
|
|
17
|
+
## Setup
|
|
18
|
+
Starting state described in plain English. Drives seed generation.
|
|
19
|
+
|
|
20
|
+
## Prompt
|
|
21
|
+
The task instruction given to the agent.
|
|
22
|
+
|
|
23
|
+
## Expected Behavior
|
|
24
|
+
Answer key for the evaluator. Never shown to the agent.
|
|
25
|
+
|
|
26
|
+
## Success Criteria
|
|
27
|
+
- [D] Deterministic criterion checked against twin state
|
|
28
|
+
- [P] Probabilistic criterion judged by LLM
|
|
29
|
+
|
|
30
|
+
## Config
|
|
31
|
+
twins: github
|
|
32
|
+
timeout: 90
|
|
33
|
+
runs: 3
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Sections
|
|
37
|
+
|
|
38
|
+
| Section | Required | Aliases | Purpose |
|
|
39
|
+
|---------|----------|---------|---------|
|
|
40
|
+
| `# Title` | yes | | Scenario name (H1 heading) |
|
|
41
|
+
| `## Setup` | no | `Context`, `Initial State` | Starting state in plain English |
|
|
42
|
+
| `## Prompt` | yes | `Task`, `Instruction`, `Instructions`, `Request` | Task given to the agent |
|
|
43
|
+
| `## Expected Behavior` | no | `Expected Behaviour`, `Behavior`, `Behaviour`, `Judge Notes`, `Evaluation Notes` | Answer key for evaluator (never shown to agent) |
|
|
44
|
+
| `## Success Criteria` | yes | `Success`, `Criteria`, `Checks`, `Assertions` | Evaluable checks |
|
|
45
|
+
| `## Config` | no | | Runtime settings |
|
|
46
|
+
| `## Seed State` | no | | Explicit seed data |
|
|
47
|
+
|
|
48
|
+
## Success criteria syntax
|
|
49
|
+
|
|
50
|
+
Each criterion is a bullet point. Tag with `[D]` or `[P]`:
|
|
51
|
+
|
|
52
|
+
- `[D]` = **Deterministic**. Checked against twin state programmatically. Use for counts, existence checks, state assertions. No LLM cost.
|
|
53
|
+
- `[P]` = **Probabilistic**. Judged by LLM evaluator from the trace and final state. Use for tone, quality, correctness, reasoning.
|
|
54
|
+
|
|
55
|
+
If no tag is provided, Archal infers the type:
|
|
56
|
+
- Numeric/state patterns (`exactly N`, `at least N`, `is created/closed/merged`, `no errors`, `count is/equals`) are auto-tagged `[D]`
|
|
57
|
+
- Everything else defaults to `[P]`
|
|
58
|
+
|
|
59
|
+
### Writing good criteria
|
|
60
|
+
|
|
61
|
+
**Good `[D]` criteria:**
|
|
62
|
+
- `[D] Exactly 4 issues are closed`
|
|
63
|
+
- `[D] A pull request exists with title containing "fix"`
|
|
64
|
+
- `[D] No issues have the label "wontfix"`
|
|
65
|
+
- `[D] The Slack channel #incidents has at least 1 new message`
|
|
66
|
+
|
|
67
|
+
**Good `[P]` criteria:**
|
|
68
|
+
- `[P] Each closing comment explains the inactivity period`
|
|
69
|
+
- `[P] The PR description summarizes all changes accurately`
|
|
70
|
+
- `[P] The agent does not modify any unrelated issues`
|
|
71
|
+
|
|
72
|
+
**Bad criteria (avoid):**
|
|
73
|
+
- `The agent works correctly` (too vague)
|
|
74
|
+
- `[D] The response is good` (not deterministic)
|
|
75
|
+
- `[P] Exactly 3 items exist` (should be `[D]`)
|
|
76
|
+
|
|
77
|
+
## Config keys
|
|
78
|
+
|
|
79
|
+
| Key | Type | Default | Description |
|
|
80
|
+
|-----|------|---------|-------------|
|
|
81
|
+
| `twins` | comma-separated | inferred from content | Which twins to use |
|
|
82
|
+
| `seed` | string | | Named seed to load |
|
|
83
|
+
| `timeout` | integer | `180` | Seconds per run |
|
|
84
|
+
| `runs` | integer | `1` | Number of runs |
|
|
85
|
+
| `evaluator-model` | string | `gemini-2.5-pro` | LLM for `[P]` criteria |
|
|
86
|
+
| `difficulty` | `easy`/`medium`/`hard` | | Difficulty tag |
|
|
87
|
+
| `tags` | comma-separated | | Scenario tags |
|
|
88
|
+
|
|
89
|
+
Aliases for `evaluator-model`: `evaluator`, `evaluatormodel`, `model`.
|
|
90
|
+
|
|
91
|
+
## Available twins and general-purpose seeds
|
|
92
|
+
|
|
93
|
+
| Twin | Seeds |
|
|
94
|
+
|------|-------|
|
|
95
|
+
| `github` | `empty`, `small-project`, `enterprise-repo`, `ci-cd-pipeline`, `stale-issues`, `large-backlog` |
|
|
96
|
+
| `slack` | `empty`, `engineering-team`, `busy-workspace`, `incident-active` |
|
|
97
|
+
| `stripe` | `empty`, `small-business`, `checkout-flow`, `subscription-lifecycle`, `subscription-heavy` |
|
|
98
|
+
| `jira` | `empty`, `small-project`, `enterprise`, `sprint-active`, `large-backlog` |
|
|
99
|
+
| `linear` | `empty`, `small-team`, `engineering-org`, `multi-team`, `busy-backlog` |
|
|
100
|
+
| `supabase` | `empty`, `small-project`, `saas-starter`, `ecommerce` |
|
|
101
|
+
| `google-workspace` | `empty`, `assistant-baseline`, `gmail-busy-inbox`, `calendar-packed-week` |
|
|
102
|
+
| `ramp` | `empty`, `default` |
|
|
103
|
+
|
|
104
|
+
## Twin auto-detection from content
|
|
105
|
+
|
|
106
|
+
If no `twins:` config is set, Archal infers twins from keywords in Setup, Expected Behavior, and Prompt:
|
|
107
|
+
|
|
108
|
+
- `github`, `repository`, `pull request`, `create_issue` -> `github`
|
|
109
|
+
- `slack`, `slack channel`, `send_message` -> `slack`
|
|
110
|
+
- `linear`, `linear ticket` -> `linear`
|
|
111
|
+
- `jira`, `jira sprint` -> `jira`
|
|
112
|
+
- `stripe`, `payment`, `refund`, `subscription`, `invoice` -> `stripe`
|
|
113
|
+
- `supabase`, `database`, `sql query` -> `supabase`
|
|
114
|
+
- `google workspace`, `gmail`, `calendar event`, `inbox` -> `google-workspace`
|
|
115
|
+
|
|
116
|
+
## Multi-service scenarios
|
|
117
|
+
|
|
118
|
+
Use multiple twins by listing them in config:
|
|
119
|
+
|
|
120
|
+
```markdown
|
|
121
|
+
## Config
|
|
122
|
+
twins: github, slack
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
The Setup section can describe state across both services. Each twin gets its own seed.
|
|
126
|
+
|
|
127
|
+
## Validation
|
|
128
|
+
|
|
129
|
+
Run `archal scenario list` to verify scenarios parse correctly. A valid scenario must have:
|
|
130
|
+
- A title (H1 heading)
|
|
131
|
+
- A Prompt section
|
|
132
|
+
- At least one success criterion
|
|
133
|
+
- At least one referenced twin (explicit or inferred)
|
|
134
|
+
- Positive timeout and runs values
|
|
135
|
+
|
|
136
|
+
## Common mistakes to avoid
|
|
137
|
+
|
|
138
|
+
1. Writing `[D]` criteria that require subjective judgment
|
|
139
|
+
2. Writing `[P]` criteria that could be checked deterministically
|
|
140
|
+
3. Forgetting to specify which twin the scenario uses
|
|
141
|
+
4. Writing Setup descriptions that are too vague for seed generation
|
|
142
|
+
5. Using seed names that don't exist (check the seed table above)
|
|
143
|
+
|
|
144
|
+
## Documentation
|
|
145
|
+
|
|
146
|
+
- Writing scenarios: https://archal.ai/docs/guides/writing-scenarios
|
|
147
|
+
- Twins and seeds: https://archal.ai/docs/twins/overview
|
package/skills/test/SKILL.md
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: test
|
|
3
|
+
description: Run Archal scenarios or inline tasks against digital twins. Interprets results and helps debug failures.
|
|
4
|
+
user-invocable: true
|
|
5
|
+
argument-hint: "[scenario.md or task description]"
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Archal Test Runner
|
|
9
|
+
|
|
10
|
+
You run Archal scenarios and inline tasks against digital twins, then interpret the results.
|
|
11
|
+
|
|
12
|
+
## Running a scenario
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
archal run scenario.md
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
With options:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
archal run scenario.md --runs 5 --timeout 120 --seed enterprise-repo
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Running from .archal.json
|
|
25
|
+
|
|
26
|
+
If `.archal.json` exists in the current directory:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
archal run
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
With explicit config path:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
archal run --config path/to/.archal.json
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Inline tasks (no scenario file)
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
archal run --task "Create an issue titled hello" --twin github
|
|
42
|
+
archal run --task "Send a message to #general" --twin slack
|
|
43
|
+
archal run --task 'Create a customer and charge $50' --twin stripe
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
`--twin` is required with `--task`. Multiple twins: `--twin github --twin slack` or `--twin github,slack`.
|
|
47
|
+
|
|
48
|
+
`--task` and a positional scenario argument are mutually exclusive.
|
|
49
|
+
|
|
50
|
+
## CLI flags reference
|
|
51
|
+
|
|
52
|
+
| Flag | Description | Default |
|
|
53
|
+
|------|-------------|---------|
|
|
54
|
+
| `[scenario]` | Positional: scenario file path | |
|
|
55
|
+
| `-c, --config <path>` | Path to `.archal.json` | auto-discovered |
|
|
56
|
+
| `--task <description>` | Inline task (no scenario file) | |
|
|
57
|
+
| `--twin <name>` | Twin(s) for `--task` (repeatable or comma-separated) | |
|
|
58
|
+
| `-n, --runs <count>` | Number of runs | `1` |
|
|
59
|
+
| `-t, --timeout <seconds>` | Timeout per run | `180` |
|
|
60
|
+
| `-m, --model <model>` | Evaluator model | |
|
|
61
|
+
| `-o, --output <format>` | Output format: `terminal` or `json` | `terminal` |
|
|
62
|
+
| `--seed <name-or-path>` | Seed name or file path | |
|
|
63
|
+
| `--tag <tag>` | Only run scenarios with this tag | |
|
|
64
|
+
| `-q, --quiet` | Suppress non-error output | |
|
|
65
|
+
| `-v, --verbose` | Enable debug logging | |
|
|
66
|
+
| `--pass-threshold <score>` | Fail if satisfaction score below this (0-100) | `0` |
|
|
67
|
+
| `--proxy` | Route agent HTTP traffic through TLS proxy to twins | |
|
|
68
|
+
|
|
69
|
+
## Interpreting results
|
|
70
|
+
|
|
71
|
+
### Satisfaction score
|
|
72
|
+
|
|
73
|
+
Archal runs the scenario N times and evaluates each run. The satisfaction score is the percentage of runs that passed all criteria.
|
|
74
|
+
|
|
75
|
+
- `100%` = every run passed every criterion
|
|
76
|
+
- `80%` = 4 out of 5 runs passed
|
|
77
|
+
- `0%` = no runs passed
|
|
78
|
+
|
|
79
|
+
### Criterion results
|
|
80
|
+
|
|
81
|
+
Each criterion reports `pass` or `fail`:
|
|
82
|
+
- `[D]` criteria: checked against twin state (deterministic, no LLM)
|
|
83
|
+
- `[P]` criteria: judged by LLM evaluator from trace and final state
|
|
84
|
+
|
|
85
|
+
### When a run fails
|
|
86
|
+
|
|
87
|
+
Re-run with `-v` to see the full trace, then classify the root cause:
|
|
88
|
+
|
|
89
|
+
- **Agent bug**: agent called the wrong tool, passed wrong arguments, or stopped before completing all actions. Fix the agent code or prompt.
|
|
90
|
+
- **Scenario bug**: criteria are too strict, ambiguous, or contradict each other — for example, Setup says "some issues" but the criteria expect exact counts. Fix the scenario: make Setup more specific or adjust the criteria.
|
|
91
|
+
- **Seed mismatch**: twin state doesn't match the Setup description — for example, Setup says "4 stale issues" but the seed has 3. Use a different seed or adjust the Setup/criteria to match.
|
|
92
|
+
|
|
93
|
+
## CI mode
|
|
94
|
+
|
|
95
|
+
For CI pipelines, use `--pass-threshold`, `-o json`, and `-q`:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
archal run scenario.md --runs 3 --pass-threshold 80 -o json -q
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Exit codes:
|
|
102
|
+
- `0` = passed (score >= threshold)
|
|
103
|
+
- `1` = failed (score < threshold)
|
|
104
|
+
- `2` = validation error
|
|
105
|
+
|
|
106
|
+
GitHub Actions example:
|
|
107
|
+
|
|
108
|
+
```yaml
|
|
109
|
+
- name: Run Archal
|
|
110
|
+
env:
|
|
111
|
+
ARCHAL_TOKEN: ${{ secrets.ARCHAL_TOKEN }}
|
|
112
|
+
run: archal run scenario.md --runs 3 --pass-threshold 80 -o json -q
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Persistent twin sessions
|
|
116
|
+
|
|
117
|
+
For interactive debugging, start persistent twins:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
archal twin start github slack
|
|
121
|
+
archal twin status
|
|
122
|
+
archal twin seed github enterprise-repo
|
|
123
|
+
archal twin reset
|
|
124
|
+
archal twin stop
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Twin commands:
|
|
128
|
+
|
|
129
|
+
| Command | Description |
|
|
130
|
+
|---------|-------------|
|
|
131
|
+
| `archal twin start <twins...>` | Start a persistent twin session |
|
|
132
|
+
| `archal twin start --all` | Start all available twins |
|
|
133
|
+
| `archal twin status` | Show active session endpoints |
|
|
134
|
+
| `archal twin list` | List active sessions |
|
|
135
|
+
| `archal twin seed <twin> <seed-name>` | Load a named seed |
|
|
136
|
+
| `archal twin seed <twin> --file <path>` | Load a JSON seed file |
|
|
137
|
+
| `archal twin reset [twin]` | Reset twin state |
|
|
138
|
+
| `archal twin stop` | Tear down the session |
|
|
139
|
+
| `archal twin renew [ttl-seconds]` | Extend session lifetime |
|
|
140
|
+
| `archal twin attach <session-id>` | Attach to existing session |
|
|
141
|
+
|
|
142
|
+
Seed format for `twin start`: `--seed github:enterprise-repo --seed stripe:small-business`.
|
|
143
|
+
|
|
144
|
+
## Environment variables set for the agent
|
|
145
|
+
|
|
146
|
+
When `archal run` spawns the agent process, these env vars are injected:
|
|
147
|
+
|
|
148
|
+
| Variable | Description |
|
|
149
|
+
|----------|-------------|
|
|
150
|
+
| `ARCHAL_ENGINE_TASK` | The task/prompt text from the scenario |
|
|
151
|
+
| `ARCHAL_TWIN_NAMES` | Comma-separated list of active twin names |
|
|
152
|
+
| `ARCHAL_<TWIN>_URL` | MCP endpoint for a twin (e.g. `ARCHAL_GITHUB_URL`) |
|
|
153
|
+
| `ARCHAL_<TWIN>_BASE_URL` | REST API base URL (e.g. `ARCHAL_GITHUB_BASE_URL`) |
|
|
154
|
+
| `ARCHAL_MCP_CONFIG` | Path to MCP server config JSON file |
|
|
155
|
+
| `ARCHAL_MCP_SERVERS` | MCP servers JSON string |
|
|
156
|
+
| `ARCHAL_TOKEN` | Auth token for twin API calls |
|
|
157
|
+
| `ARCHAL_PREFLIGHT` | Set to `1` during boot check; agent should exit early |
|
|
158
|
+
| `ARCHAL_METRICS_FILE` | Path to write agent metrics JSON |
|
|
159
|
+
| `ARCHAL_AGENT_TRACE_FILE` | Path to write agent trace JSON |
|
|
160
|
+
| `ARCHAL_REST_CONFIG` | Path to REST routing config (when applicable) |
|
|
161
|
+
| `HTTPS_PROXY` | TLS proxy URL (when `--proxy` is active) |
|
|
162
|
+
| `NODE_EXTRA_CA_CERTS` | Path to proxy CA cert (when `--proxy` is active) |
|
|
163
|
+
|
|
164
|
+
Twin names are uppercased with hyphens replaced by underscores:
|
|
165
|
+
- `github` -> `ARCHAL_GITHUB_URL`, `ARCHAL_GITHUB_BASE_URL`
|
|
166
|
+
- `google-workspace` -> `ARCHAL_GOOGLE_WORKSPACE_URL`, `ARCHAL_GOOGLE_WORKSPACE_BASE_URL`
|
|
167
|
+
|
|
168
|
+
## Two ways agents connect to twins
|
|
169
|
+
|
|
170
|
+
**1. Environment variables (default):** Read `ARCHAL_<TWIN>_BASE_URL` and pass to SDK:
|
|
171
|
+
|
|
172
|
+
```typescript
|
|
173
|
+
const octokit = new Octokit({ baseUrl: process.env.ARCHAL_GITHUB_BASE_URL });
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**2. TLS proxy (`--proxy`):** Intercepts HTTPS to real domains and redirects to twins. Agent code runs unmodified:
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
archal run --task "..." --twin github --proxy
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## Browse scenarios
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
archal scenario list
|
|
186
|
+
archal scenario list --tag security
|
|
187
|
+
archal scenario list --difficulty hard
|
|
188
|
+
archal scenario list --json
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Documentation
|
|
192
|
+
|
|
193
|
+
- Running with an agent: https://archal.ai/docs/guides/run-with-agent
|
|
194
|
+
- Twins overview and seeds: https://archal.ai/docs/twins/overview
|
|
195
|
+
- CI setup: https://archal.ai/docs/guides/ci
|