archal 0.9.18 → 0.9.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -1
- package/agents/github-octokit/.archal.json +8 -0
- package/agents/github-octokit/Dockerfile +8 -0
- package/agents/github-octokit/README.md +113 -0
- package/agents/github-octokit/agent.mjs +54 -0
- package/agents/github-octokit/package.json +9 -0
- package/agents/github-octokit/scenarios/test-repo-access.md +27 -0
- package/agents/google-workspace-local-tools/Dockerfile +6 -0
- package/agents/google-workspace-local-tools/README.md +58 -0
- package/agents/google-workspace-local-tools/agent.mjs +196 -0
- package/agents/google-workspace-local-tools/archal-harness.json +7 -0
- package/agents/google-workspace-local-tools/run-input.yaml +16 -0
- package/agents/google-workspace-local-tools/scenario.md +29 -0
- package/agents/hermes/.archal.json +8 -0
- package/agents/hermes/Dockerfile +46 -0
- package/agents/hermes/README.md +87 -0
- package/agents/hermes/SOUL.md +27 -0
- package/agents/hermes/config.yaml +34 -0
- package/agents/hermes/drive.mjs +113 -0
- package/agents/hermes/scenarios/stripe-customers-read-only.md +32 -0
- package/agents/openclaw/.archal.json +8 -0
- package/agents/openclaw/Dockerfile +96 -0
- package/agents/openclaw/README.md +120 -0
- package/agents/openclaw/drive.mjs +311 -0
- package/agents/openclaw/package.json +9 -0
- package/agents/openclaw/scenarios/github-issue-triage-read-only.md +44 -0
- package/agents/openclaw/workspace/AGENTS.md +23 -0
- package/agents/openclaw/workspace/IDENTITY.md +8 -0
- package/agents/openclaw/workspace/SOUL.md +14 -0
- package/agents/openclaw/workspace/TOOLS.md +35 -0
- package/agents/pagination-test/README.md +24 -0
- package/agents/pagination-test/scenario.md +24 -0
- package/agents/replay-capsule-harness/README.md +29 -0
- package/agents/replay-capsule-harness/observability-install-offline-e2e.mts +1517 -0
- package/agents/replay-capsule-harness/replay-capsule-e2e.mjs +104 -0
- package/clone-assets/apify/tools.json +213 -13
- package/clone-assets/calcom/tools.json +510 -0
- package/clone-assets/clickup/tools.json +1258 -0
- package/clone-assets/customerio/tools.json +386 -0
- package/clone-assets/datadog/tools.json +734 -0
- package/clone-assets/github/tools.json +312 -25
- package/clone-assets/gitlab/tools.json +999 -0
- package/clone-assets/google-workspace/tools.json +18 -6
- package/clone-assets/hubspot/tools.json +1406 -0
- package/clone-assets/jira/fidelity.json +1 -1
- package/clone-assets/jira/tools.json +266 -543
- package/clone-assets/linear/tools.json +238 -40
- package/clone-assets/ownerrez/tools.json +548 -0
- package/clone-assets/pricelabs/tools.json +343 -0
- package/clone-assets/sentry/tools.json +745 -0
- package/clone-assets/slack/tools.json +1 -2
- package/clone-assets/stripe/tools.json +185 -46
- package/clone-assets/supabase/tools.json +511 -14
- package/clone-assets/unipile/tools.json +408 -0
- package/clone-assets/webflow/tools.json +415 -0
- package/dist/autoloop-worker-types-BEb_E44z.d.cts +196 -0
- package/dist/cli.cjs +151033 -75282
- package/dist/commands/autoloop-hosted-worker.cjs +43942 -0
- package/dist/commands/autoloop-hosted-worker.d.cts +143 -0
- package/dist/commands/autoloop-pr-verification.cjs +4227 -0
- package/dist/commands/autoloop-pr-verification.d.cts +17 -0
- package/dist/{vitest/chunk-IVXSSEYS.js → commands/autoloop-result-parser.cjs} +16515 -18857
- package/dist/commands/autoloop-result-parser.d.cts +39 -0
- package/dist/commands/autoloop-worker.cjs +36163 -0
- package/dist/commands/autoloop-worker.d.cts +97 -0
- package/dist/harness.cjs +1 -0
- package/dist/index.cjs +1 -1
- package/dist/replay.cjs +49624 -0
- package/dist/replay.d.cts +4625 -0
- package/dist/scenarios.cjs +80343 -0
- package/dist/scenarios.d.cts +562 -0
- package/dist/vitest/chunk-6CBYFCFK.js +4667 -0
- package/dist/vitest/chunk-ARVS45PP.js +2764 -0
- package/dist/vitest/index.cjs +6079 -75089
- package/dist/vitest/index.d.ts +7 -6
- package/dist/vitest/index.js +8 -8
- package/dist/vitest/runtime/hosted-session-reaper.cjs +801 -34187
- package/dist/vitest/runtime/hosted-session-reaper.js +1 -1
- package/dist/vitest/runtime/setup-files.js +2 -2
- package/package.json +14 -9
- package/skills/archal-agent/SKILL.md +87 -0
- package/skills/autoloop/SKILL.md +376 -0
- package/skills/autoloop/references/hosted-sources.md +62 -0
- package/skills/autoloop/references/trace-schema-mapping.md +73 -0
- package/skills/eval/SKILL.md +35 -1
- package/skills/install-agent/SKILL.md +221 -0
- package/skills/onboard/SKILL.md +80 -0
- package/skills/scenario/SKILL.md +19 -4
- package/skills/seed/SKILL.md +237 -0
- package/dist/seed/dynamic-generator.cjs +0 -45564
- package/dist/seed/dynamic-generator.d.cts +0 -106
- package/dist/vitest/chunk-CTSN67QR.js +0 -47188
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "archal",
|
|
3
|
-
"version": "0.9.18",
|
|
3
|
+
"version": "0.9.20",
|
|
4
4
|
"description": "Test your agents & integrations against service clones",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.cjs",
|
|
@@ -17,6 +17,10 @@
|
|
|
17
17
|
"types": "./dist/harness.d.cts",
|
|
18
18
|
"default": "./dist/harness.cjs"
|
|
19
19
|
},
|
|
20
|
+
"./scenarios": {
|
|
21
|
+
"types": "./dist/scenarios.d.cts",
|
|
22
|
+
"default": "./dist/scenarios.cjs"
|
|
23
|
+
},
|
|
20
24
|
"./vitest": {
|
|
21
25
|
"types": "./dist/vitest/index.d.ts",
|
|
22
26
|
"import": "./dist/vitest/index.js",
|
|
@@ -47,15 +51,9 @@
|
|
|
47
51
|
"dist",
|
|
48
52
|
"skills",
|
|
49
53
|
"clone-assets",
|
|
54
|
+
"agents",
|
|
50
55
|
"LICENSE"
|
|
51
56
|
],
|
|
52
|
-
"scripts": {
|
|
53
|
-
"verify:artifacts": "node scripts/assert-artifacts.mjs",
|
|
54
|
-
"prepack": "pnpm run verify:artifacts",
|
|
55
|
-
"prepare": "node scripts/prepare.cjs",
|
|
56
|
-
"typecheck:raw": "node --check bin/archal.cjs && node --check scripts/assert-artifacts.mjs && node --check scripts/prepare.cjs",
|
|
57
|
-
"typecheck": "pnpm run typecheck:raw"
|
|
58
|
-
},
|
|
59
57
|
"peerDependencies": {
|
|
60
58
|
"vitest": ">=2.1.0"
|
|
61
59
|
},
|
|
@@ -65,6 +63,13 @@
|
|
|
65
63
|
}
|
|
66
64
|
},
|
|
67
65
|
"dependencies": {
|
|
66
|
+
"@aws-sdk/client-secrets-manager": "^3.1065.0",
|
|
67
|
+
"e2b": "^2.28.2",
|
|
68
68
|
"picomatch": "^4.0.4"
|
|
69
|
+
},
|
|
70
|
+
"scripts": {
|
|
71
|
+
"verify:artifacts": "node scripts/assert-artifacts.mjs",
|
|
72
|
+
"typecheck:raw": "node --check bin/archal.cjs && node --check scripts/assert-artifacts.mjs && node --check scripts/prepare.cjs",
|
|
73
|
+
"typecheck": "pnpm run typecheck:raw"
|
|
69
74
|
}
|
|
70
|
-
}
|
|
75
|
+
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: archal-agent
|
|
3
|
+
description: The front door for using Archal to test, debug, and fix an AI agent. START HERE, then route to the right sub-skill instead of guessing. Use when the user says "use Archal", "test my agent", "set up Archal", "my agent is failing", "reproduce this production failure", "grade my traces", or otherwise wants to evaluate, debug, or auto-fix an agent against service clones. Pick this whenever the request is Archal-shaped but the specific workflow is unclear.
|
|
4
|
+
user-invocable: true
|
|
5
|
+
argument-hint: "[what you want to do with your agent]"
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Archal
|
|
9
|
+
|
|
10
|
+
You are the entry point for Archal. Archal is the QA layer for AI agents: it
|
|
11
|
+
runs an agent against stateful behavioral clones of real services (GitHub,
|
|
12
|
+
Slack, Stripe, Linear, Jira, Supabase, and more), scores how well it satisfies
|
|
13
|
+
each scenario, and turns failures into reproductions and PR fixes. Your job here
|
|
14
|
+
is to orient the operator and route to the sub-skill that owns their workflow.
|
|
15
|
+
Do not inline those flows; hand off by exact name and let the sub-skill drive.
|
|
16
|
+
|
|
17
|
+
## Product mental model
|
|
18
|
+
|
|
19
|
+
Archal tests AI agents against service clones instead of real services, so every
|
|
20
|
+
run is deterministic and replayable. You describe a task and success criteria,
|
|
21
|
+
the agent runs against clones, and an evaluator scores satisfaction (a
|
|
22
|
+
probability, not pass/fail). When a real production trace shows a failure, Archal
|
|
23
|
+
reproduces it on clones from trace evidence and ships the fix as a GitHub PR.
|
|
24
|
+
Everything aims at the same thing: deterministic, replayable evals you can trust.
|
|
25
|
+
|
|
26
|
+
## Decision guide
|
|
27
|
+
|
|
28
|
+
| I want to... | Route to |
|
|
29
|
+
|--------------|----------|
|
|
30
|
+
| Set up Archal in a repo from scratch (install, auth, detect clones) | `onboard` |
|
|
31
|
+
| Write or edit a scenario test file | `scenario` |
|
|
32
|
+
| Load explicit JSON/SQL/catalog state into a clone (deterministic, no LLM) | `seed` |
|
|
33
|
+
| Run scenarios or tasks and interpret satisfaction scores and failures | `eval` |
|
|
34
|
+
| Wire clones into an existing Vitest suite | `vitest` |
|
|
35
|
+
| Connect a repo's production observability so traces get captured | `install-agent` |
|
|
36
|
+
| Run the autoloop (ingest -> grade -> find-failed -> reproduce-on-clones) and ship the fix as a PR (autofix) over real trace sources | `autoloop` |
|
|
37
|
+
| Turn autofix or autoloop on/off for an agent | `autoloop`; the copilot can toggle either |
|
|
38
|
+
|
|
39
|
+
If the user is brand new and has none of this set up, start with `onboard`; it
|
|
40
|
+
detects clones and routes onward from there.
|
|
41
|
+
|
|
42
|
+
## The sub-skills
|
|
43
|
+
|
|
44
|
+
Each lives in `packages/archal/skills/` and owns its own commands, contracts,
|
|
45
|
+
and mental model. Route by exact name:
|
|
46
|
+
|
|
47
|
+
- `onboard` — set up Archal in a repo from scratch: install the CLI, handle auth,
|
|
48
|
+
detect which clones the agent needs, and hand off to the right workflow.
|
|
49
|
+
- `scenario` — author and edit scenario test files (Setup / Prompt / Expected
|
|
50
|
+
Behavior / Success Criteria) that `archal run` executes against clones.
|
|
51
|
+
- `seed` — load explicit JSON, SQL, or catalog state into a clone deterministically,
|
|
52
|
+
with no LLM in the loop, so runs start from a known fixture state.
|
|
53
|
+
- `eval` — run scenarios or inline tasks against clones and interpret the results:
|
|
54
|
+
satisfaction scores, `[D]` vs `[P]` criteria, trace inspection, failure diagnosis.
|
|
55
|
+
- `vitest` — wire clones into an existing Vitest suite using the right composition
|
|
56
|
+
pattern, so the agent's own tests route through clones.
|
|
57
|
+
- `install-agent` — connect a repo's production observability (OTLP, Langfuse,
|
|
58
|
+
Braintrust, database trace tables) so real agent traces are captured for Archal.
|
|
59
|
+
- `autoloop` — the loop over real trace sources: ingest a trace, grade it for a
|
|
60
|
+
real failure, find the failed trace, and reproduce it on clones. Autofix (the
|
|
61
|
+
fix/PR step) is a separate toggle on top of this: when turned on, autoloop
|
|
62
|
+
reproduces a failure and ships the fix as a PR.
|
|
63
|
+
|
|
64
|
+
## Autoloop and autofix toggles
|
|
65
|
+
|
|
66
|
+
Autoloop (ingest -> grade -> find-failed -> reproduce-on-clones) and autofix (the
|
|
67
|
+
fix/PR step) are **separate per-agent toggles**. Autofix is opt-in: it is not part
|
|
68
|
+
of autoloop until it is turned on. Either can be switched on or off per agent from
|
|
69
|
+
the agents tab, the CLI (`--execution-policy reproduce` is autoloop with autofix
|
|
70
|
+
off; `fix` turns autofix on), or by asking the Archal copilot in chat — the
|
|
71
|
+
copilot can toggle either for an agent. When the user asks to turn autofix or
|
|
72
|
+
autoloop on/off for an agent, handle the toggle, then route to `autoloop` for the
|
|
73
|
+
deeper flow.
|
|
74
|
+
|
|
75
|
+
## Provider-switchable remediation
|
|
76
|
+
|
|
77
|
+
The Archal copilot is not locked to one model. When autofix reproduces a failure
|
|
78
|
+
and writes a fix, the user can drive that remediation with their own agent —
|
|
79
|
+
`archal preprod` exposes `--remediation-agent auto|codex|claude|cursor` so the
|
|
80
|
+
fix is written by their Claude Code, Cursor, or Codex — or let Archal's managed
|
|
81
|
+
agent do it. Mention this when the user asks who writes the fix or wants to use
|
|
82
|
+
their own coding agent, then route to `autoloop`.
|
|
83
|
+
|
|
84
|
+
## Docs
|
|
85
|
+
|
|
86
|
+
- Quickstart: https://docs.archal.ai/quickstart
|
|
87
|
+
- Full docs: https://docs.archal.ai
|
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: autoloop
|
|
3
|
+
description: Wire Archal Autoloop to a repo plus a real agent-trace source, then drive the import -> grade -> reproduce -> PR-fix loop. USE THIS whenever the user wants to turn production agent traces into reproducible failures and fixes: "I have prod agent traces and want to reproduce a failure", "import my Langfuse / Braintrust / OTel / Supabase traces", "connect a trace source", "grade my prod traces", "turn a failed trace into a PR", "set up / configure the autoloop", or any mention of replaying, grading, or auto-fixing real traces. Also fires when diagnosing a stuck import, grade, reproduction, or PR-fix run, or configuring trace schema mapping.
|
|
4
|
+
user-invocable: true
|
|
5
|
+
argument-hint: "[repo, trace source, or failure description]"
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Archal Autoloop
|
|
9
|
+
|
|
10
|
+
You help users connect real agent traces to Archal. Your job is to wire the repo,
|
|
11
|
+
trace source, harness contract, scenario contract, and GitHub PR path without
|
|
12
|
+
guessing or leaking secrets.
|
|
13
|
+
|
|
14
|
+
Autoloop is not a replacement for `archal run`. It uses the same harness and clone
|
|
15
|
+
routing ideas, but the trigger is a trace that already happened.
|
|
16
|
+
|
|
17
|
+
Autoloop is also not arbitrary production trace replay. It can reproduce a
|
|
18
|
+
failure only when the trace, scenario contract, and seed templates contain
|
|
19
|
+
enough evidence to reconstruct the service state that matters. If the evidence
|
|
20
|
+
is thin, block and name the missing data instead of claiming reproduction.
|
|
21
|
+
|
|
22
|
+
## Product mental model
|
|
23
|
+
|
|
24
|
+
Autoloop does this loop:
|
|
25
|
+
|
|
26
|
+
1. Import a trace and its child spans from a read-only source.
|
|
27
|
+
2. Grade whether the trace contains a real failure.
|
|
28
|
+
3. Build a reproduction scenario and clone seed from trace evidence plus
|
|
29
|
+
repo-owned seed templates.
|
|
30
|
+
4. Run the reproduction against service clones through the customer harness.
|
|
31
|
+
5. If reproduced, patch the repo and open a GitHub issue or PR.
|
|
32
|
+
|
|
33
|
+
Steps 1-4 are **autoloop**: ingest -> grade -> find the failed trace ->
|
|
34
|
+
reproduce on clones. Step 5 (writing the fix and opening the PR) is **autofix**,
|
|
35
|
+
a separate opt-in step that is *not* part of autoloop until it is turned on.
|
|
36
|
+
Both are per-agent toggles, switchable from the agents tab, the CLI, or by asking
|
|
37
|
+
the Archal copilot in chat.
|
|
38
|
+
|
|
39
|
+
The CLI maps these toggles to `--execution-policy`: `reproduce` runs autoloop
|
|
40
|
+
only, with autofix off, while `fix` turns autofix on (autoloop plus the fix/PR
|
|
41
|
+
step). Narrower policies stop earlier:
|
|
42
|
+
|
|
43
|
+
| Policy | Stops after | Autofix |
|
|
44
|
+
|--------|-------------|---------|
|
|
45
|
+
| `observe` | import | off |
|
|
46
|
+
| `grade` | grading | off |
|
|
47
|
+
| `reproduce` | reproduction | off |
|
|
48
|
+
| `fix` | PR or blocked fix status | on |
|
|
49
|
+
|
|
50
|
+
Do not invent or promote separate top-level judge, reproduce, fix, or
|
|
51
|
+
trace-replay commands. The public command is `archal autoloop`. The local stop
|
|
52
|
+
command is `archal detach` for file-backed trace directories.
|
|
53
|
+
|
|
54
|
+
## Discover first
|
|
55
|
+
|
|
56
|
+
Before changing anything, inspect the repo:
|
|
57
|
+
|
|
58
|
+
1. `package.json` and scripts: how is the agent run? What tests should a fix PR
|
|
59
|
+
pass?
|
|
60
|
+
2. Existing Archal files:
|
|
61
|
+
- `.archal.json`
|
|
62
|
+
- `.archal/harness.*`
|
|
63
|
+
- `archal/harness.json`
|
|
64
|
+
- `archal/scenario.md`
|
|
65
|
+
- `archal/run-input.yaml`
|
|
66
|
+
- `archal/seeds/*.json`
|
|
67
|
+
- `scenarios/*.md`
|
|
68
|
+
3. Service SDKs and likely clones:
|
|
69
|
+
- `stripe` -> `stripe`
|
|
70
|
+
- `@octokit/rest`, `octokit` -> `github`
|
|
71
|
+
- `@slack/web-api`, `@slack/bolt` -> `slack`
|
|
72
|
+
- `jira`, `jira-client`, `jira.js` -> `jira`
|
|
73
|
+
- `@linear/sdk` -> `linear`
|
|
74
|
+
- `@supabase/supabase-js`, `pg` -> `supabase`
|
|
75
|
+
- `googleapis` -> `google-workspace`
|
|
76
|
+
4. GitHub remote:
|
|
77
|
+
```bash
|
|
78
|
+
git remote get-url origin
|
|
79
|
+
```
|
|
80
|
+
Hosted sources and `--execution-policy fix` need a GitHub remote.
|
|
81
|
+
5. Trace source shape, if available:
|
|
82
|
+
- provider: local files, Postgres, Supabase
|
|
83
|
+
- or local/client-ingested sources normalized through `archal trace-source`
|
|
84
|
+
such as file, HTTP/OTel, Langfuse, Braintrust, S3/GCS, or custom JSON
|
|
85
|
+
- trace table and span table names
|
|
86
|
+
- id columns
|
|
87
|
+
- parent span column
|
|
88
|
+
- timestamp/cursor columns
|
|
89
|
+
- status, workspace, trace group, and agent filters
|
|
90
|
+
|
|
91
|
+
Never print secrets while inspecting. If you need to show a database URL, show
|
|
92
|
+
only the env var name or secret reference.
|
|
93
|
+
|
|
94
|
+
## Preconditions
|
|
95
|
+
|
|
96
|
+
You need these before a full hosted Autoloop setup:
|
|
97
|
+
|
|
98
|
+
- Archal CLI installed in the repo or reachable with `npx archal`
|
|
99
|
+
- authenticated user (`archal login`) or `ARCHAL_TOKEN=archal_ws_...`
|
|
100
|
+
- GitHub App installed on the target repo
|
|
101
|
+
- repo origin that resolves to `github.com/<owner>/<repo>`
|
|
102
|
+
- read-only trace source credential
|
|
103
|
+
- headless harness command
|
|
104
|
+
- `archal/scenario.md` for the trace family
|
|
105
|
+
- optional but strongly recommended `archal/seeds/*.json`
|
|
106
|
+
- model/provider keys required by the user's agent and tests
|
|
107
|
+
|
|
108
|
+
If any prerequisite is missing, make the smallest safe change and explain what
|
|
109
|
+
is still required. Do not continue into a fake reproduction.
|
|
110
|
+
|
|
111
|
+
## Safe setup flow
|
|
112
|
+
|
|
113
|
+
### 1. Install and authenticate
|
|
114
|
+
|
|
115
|
+
Prefer project-local install:
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
npm install -D archal
|
|
119
|
+
npx archal login
|
|
120
|
+
npx archal usage
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
In CI or a customer repo, use:
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
export ARCHAL_TOKEN=archal_ws_...
|
|
127
|
+
npx archal usage
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Use a workspace API key for automated runs. Do not commit it.
|
|
131
|
+
|
|
132
|
+
### 2. Add or verify `archal/harness.json`
|
|
133
|
+
|
|
134
|
+
Minimal shape:
|
|
135
|
+
|
|
136
|
+
```json
|
|
137
|
+
{
|
|
138
|
+
"version": 1,
|
|
139
|
+
"local": {
|
|
140
|
+
"command": "node",
|
|
141
|
+
"args": ["agent.mjs"]
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
The command must be headless and repeatable. It should run the real agent path,
|
|
147
|
+
not a hand-authored mock. During reproduction, Archal invokes this command
|
|
148
|
+
through `archal run`, so the agent should read the task from `AGENT_TASK` and
|
|
149
|
+
print its final answer to stdout. If the project already has `.archal.json` for
|
|
150
|
+
`archal run`, align the Autoloop harness with that command.
|
|
151
|
+
|
|
152
|
+
### 3. Add or verify `archal/scenario.md`
|
|
153
|
+
|
|
154
|
+
The scenario describes the standing task and checks for this trace family.
|
|
155
|
+
|
|
156
|
+
Required sections:
|
|
157
|
+
|
|
158
|
+
```md
|
|
159
|
+
# Scenario title
|
|
160
|
+
|
|
161
|
+
## Setup
|
|
162
|
+
Trace-family context and the starting state Archal should reconstruct.
|
|
163
|
+
|
|
164
|
+
## Prompt
|
|
165
|
+
The task the agent should complete.
|
|
166
|
+
|
|
167
|
+
## Expected Behavior
|
|
168
|
+
The answer key for grading and reproduction.
|
|
169
|
+
|
|
170
|
+
## Success Criteria
|
|
171
|
+
- [D] Deterministic clone-state check
|
|
172
|
+
- [P] Probabilistic trace/output check
|
|
173
|
+
|
|
174
|
+
## Config
|
|
175
|
+
clones: stripe, slack
|
|
176
|
+
timeout: 120
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Keep model-visible instructions realistic. Do not tell the tested agent that it
|
|
180
|
+
is in Archal, a clone-backed environment, or a special replay.
|
|
181
|
+
|
|
182
|
+
### 4. Add seed templates when trace evidence is thin
|
|
183
|
+
|
|
184
|
+
Recommended:
|
|
185
|
+
|
|
186
|
+
```text
|
|
187
|
+
archal/seeds/
|
|
188
|
+
stripe-billing-support.json
|
|
189
|
+
jira-escalations.json
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
Seed templates should contain stable service state for the task family. Autoloop
|
|
193
|
+
can then fill in trace-specific identifiers. This is much safer than expecting
|
|
194
|
+
weak traces to reconstruct full service state.
|
|
195
|
+
|
|
196
|
+
## Hosted database source
|
|
197
|
+
|
|
198
|
+
Use this when traces live in Postgres or Supabase. Create a read-only DB user,
|
|
199
|
+
keep the URL in `TRACE_DATABASE_URL` (or a secret ref in hosted production), then
|
|
200
|
+
`--check` the source and re-run without `--check` to register it. Registration
|
|
201
|
+
posts the source config to Archal and returns; hosted workers own polling after
|
|
202
|
+
that, so local `archal detach` does not disable it.
|
|
203
|
+
|
|
204
|
+
See `references/hosted-sources.md` for the full check, register, and
|
|
205
|
+
`--database-url-secret-ref` flag blocks.
|
|
206
|
+
|
|
207
|
+
## Client-side trace ingestion
|
|
208
|
+
|
|
209
|
+
Use `archal trace-source` when traces are not already in a hosted Postgres or
|
|
210
|
+
Supabase table. This command normalizes source-specific payloads into Archal
|
|
211
|
+
trace upload envelopes, writes them to a trace directory, and can upload them to
|
|
212
|
+
hosted Autoloop when workspace auth is available.
|
|
213
|
+
|
|
214
|
+
Common paths:
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
npx archal trace-source import ./exports --preview --json
|
|
218
|
+
npx archal trace-source import ./exports --upload --repository owner/repo
|
|
219
|
+
|
|
220
|
+
npx archal trace-source connect langfuse \
|
|
221
|
+
--base-url https://cloud.langfuse.com \
|
|
222
|
+
--api-key-env LANGFUSE_READ_KEY \
|
|
223
|
+
--out .archal/traces/inbox
|
|
224
|
+
npx archal trace-source test langfuse
|
|
225
|
+
npx archal trace-source sync langfuse --upload --repository owner/repo
|
|
226
|
+
npx archal trace-source watch langfuse --upload --repository owner/repo
|
|
227
|
+
|
|
228
|
+
npx archal trace-source connect custom --name "prod exporter" --out .archal/traces/inbox
|
|
229
|
+
npx archal trace-source serve "prod exporter" --port 4319
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
Use `archal trace-source status [source]` to inspect registry validation,
|
|
233
|
+
cursor, and last-sync state. `watch` is for pull-style sources; push sources
|
|
234
|
+
stay continuous through `serve`.
|
|
235
|
+
|
|
236
|
+
## Trace schema mapping
|
|
237
|
+
|
|
238
|
+
Hosted sources default to `ai_traces` / `ai_spans` with `id` / `trace_id`
|
|
239
|
+
columns and `updated_at_id` cursor mode. When the customer's tables differ, pass
|
|
240
|
+
mapping flags to override table names, id columns, parent-span column, and
|
|
241
|
+
cursor columns; switch to `created_at_id` cursor mode for append-only sources;
|
|
242
|
+
and use `--source-*` filters to scope noisy sources by workspace, agent, status,
|
|
243
|
+
trace group, or limit.
|
|
244
|
+
|
|
245
|
+
See `references/trace-schema-mapping.md` for the full defaults table plus the
|
|
246
|
+
custom-schema, append-only, and filter flag blocks.
|
|
247
|
+
|
|
248
|
+
## Local trace directory
|
|
249
|
+
|
|
250
|
+
Use this for a local pilot or exported trace files:
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
npx archal autoloop ./prod-traces --repo . --execution-policy reproduce
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
Artifacts are written under:
|
|
257
|
+
|
|
258
|
+
```text
|
|
259
|
+
.archal/autoloop/
|
|
260
|
+
autoloops.json
|
|
261
|
+
runs.jsonl
|
|
262
|
+
raw/
|
|
263
|
+
grades/
|
|
264
|
+
seeds/
|
|
265
|
+
runs/
|
|
266
|
+
fixes/
|
|
267
|
+
failed/
|
|
268
|
+
logs/
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
Stop the local file-backed loop:
|
|
272
|
+
|
|
273
|
+
```bash
|
|
274
|
+
npx archal detach ./prod-traces --repo .
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
Do not describe `archal detach` as a hosted source disable command.
|
|
278
|
+
|
|
279
|
+
## CLI-first operation
|
|
280
|
+
|
|
281
|
+
Prefer CLI and artifact evidence for handoffs unless the user explicitly asks
|
|
282
|
+
for a workspace page.
|
|
283
|
+
|
|
284
|
+
- Local file-backed loops: `archal autoloop <trace-dir> --repo ...` starts the
|
|
285
|
+
watcher, `archal detach <trace-dir> --repo ...` stops it, `archal
|
|
286
|
+
autoloop-status --repo ...` summarizes trace jobs, and `archal
|
|
287
|
+
autoloop-reprocess --repo ... <trace-id>` retries terminal jobs after a
|
|
288
|
+
blocker is fixed.
|
|
289
|
+
- Hosted database sources: `archal autoloop --source postgres|supabase ...`
|
|
290
|
+
registers the source and returns. Local `archal detach` does not disable a
|
|
291
|
+
hosted source because hosted workers own polling after registration.
|
|
292
|
+
- Safe resume means re-running the same `archal autoloop` registration or
|
|
293
|
+
reprocessing a terminal local trace only after the missing evidence,
|
|
294
|
+
credential, mapping, harness, or GitHub blocker is corrected.
|
|
295
|
+
|
|
296
|
+
Report exact artifact paths and statuses. Do not make dashboard pages the only
|
|
297
|
+
place a user can understand what happened.
|
|
298
|
+
|
|
299
|
+
## How to diagnose failures
|
|
300
|
+
|
|
301
|
+
Classify failures precisely:
|
|
302
|
+
|
|
303
|
+
- Trace import failure: database/source auth, mapping, cursor, filters, bad
|
|
304
|
+
trace shape.
|
|
305
|
+
- Trace ingestion failure: `trace-source` adapter mismatch, rejected hosted
|
|
306
|
+
upload, missing workspace auth, bad idempotency key, or receiver auth failure.
|
|
307
|
+
- Grade failure: judge could not determine expected outcome, missing evaluator
|
|
308
|
+
contract, trace lacks task context.
|
|
309
|
+
- Missing evidence: trace does not contain enough state to seed. Add spans,
|
|
310
|
+
state snapshots, or repo-owned seed templates.
|
|
311
|
+
- Reproduction failure: scenario or seed could not replay the failure against
|
|
312
|
+
clones. Inspect generated `scenario.md`, `seed.json`, and run manifest.
|
|
313
|
+
- Agent behavior: the reproduced run shows the agent making the same wrong
|
|
314
|
+
service action it made in the original trace.
|
|
315
|
+
- Harness issue: the agent command crashes, hangs, needs UI auth, or does not
|
|
316
|
+
reach clone-routed services.
|
|
317
|
+
- Fix generation issue: patch does not apply, tests fail, no changes produced,
|
|
318
|
+
or generated PR metadata is incomplete.
|
|
319
|
+
- GitHub issue: GitHub App missing, branch protection, permission denied, PR
|
|
320
|
+
checks unavailable.
|
|
321
|
+
|
|
322
|
+
When evidence is insufficient, say so directly. Do not manufacture a seed or
|
|
323
|
+
claim reproduction succeeded.
|
|
324
|
+
|
|
325
|
+
## Artifact reading guide
|
|
326
|
+
|
|
327
|
+
Local file-backed Autoloop uses repo-local artifacts. Hosted Autoloop exposes the
|
|
328
|
+
same phase information in the dashboard.
|
|
329
|
+
|
|
330
|
+
| Artifact | What to inspect |
|
|
331
|
+
|----------|-----------------|
|
|
332
|
+
| `grades/<trace>/routing.json` | trace import route and selected phase |
|
|
333
|
+
| `grades/<trace>/grade.json` | verdict, summary, and reproduction decision |
|
|
334
|
+
| `seeds/<trace>/scenario.md` | generated reproduction scenario |
|
|
335
|
+
| `seeds/<trace>/seed.json` | generated seed request or materialized seed metadata |
|
|
336
|
+
| `runs/<trace>/manifest.json` | reproduction status, command, attempts, evidence |
|
|
337
|
+
| `runs/<trace>/stdout.json` | machine-readable run output |
|
|
338
|
+
| `runs/<trace>/stderr.log` | reproduction stderr |
|
|
339
|
+
| `fixes/<trace>/status.json` | blocked fix status |
|
|
340
|
+
| `fixes/<trace>/pr-details.md` | PR reviewer summary |
|
|
341
|
+
| `fixes/<trace>/repo.patch` | patch captured when PR creation cannot complete |
|
|
342
|
+
|
|
343
|
+
## Security rules
|
|
344
|
+
|
|
345
|
+
- Use read-only trace database credentials.
|
|
346
|
+
- Never commit database URLs, API keys, model keys, or GitHub tokens.
|
|
347
|
+
- Prefer `--database-url-env` locally and `--database-url-secret-ref` in hosted
|
|
348
|
+
production setup.
|
|
349
|
+
- Do not pass production write credentials to a clone-routed reproduction.
|
|
350
|
+
- Do not add model-visible copy that reveals Archal or clone routing to the
|
|
351
|
+
tested agent.
|
|
352
|
+
- Do not bypass the GitHub App PR workflow with direct pushes.
|
|
353
|
+
- Redact raw trace payloads before sharing artifacts outside the workspace.
|
|
354
|
+
|
|
355
|
+
## What to report back
|
|
356
|
+
|
|
357
|
+
After setup or debugging, give the user:
|
|
358
|
+
|
|
359
|
+
- command run
|
|
360
|
+
- source provider and source id
|
|
361
|
+
- repo full name
|
|
362
|
+
- execution policy
|
|
363
|
+
- CLI status command to run next
|
|
364
|
+
- artifacts produced, if local
|
|
365
|
+
- whether import, grade, seed, reproduce, and fix phases are ready
|
|
366
|
+
- exact blocker if any
|
|
367
|
+
- next command or next owner
|
|
368
|
+
|
|
369
|
+
## Docs
|
|
370
|
+
|
|
371
|
+
- Autoloop production traces: https://docs.archal.ai/guides/autoloop-production-traces
|
|
372
|
+
- Autonomous loops: https://docs.archal.ai/guides/autoloop-production-traces
|
|
373
|
+
- CLI reference: https://docs.archal.ai/cli/autoloop
|
|
374
|
+
- Running with an agent: https://docs.archal.ai/guides/run-with-agent
|
|
375
|
+
- Writing scenarios: https://docs.archal.ai/guides/writing-scenarios
|
|
376
|
+
- Seeds: https://docs.archal.ai/guides/seeds
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Hosted database source
|
|
2
|
+
|
|
3
|
+
Full flag reference for registering a hosted Postgres or Supabase trace source.
|
|
4
|
+
Use this when traces live in Postgres or Supabase. Registration posts the source
|
|
5
|
+
config to Archal and returns; hosted workers own polling after that.
|
|
6
|
+
|
|
7
|
+
## Contents
|
|
8
|
+
|
|
9
|
+
- Read-only credential
|
|
10
|
+
- Check the source
|
|
11
|
+
- Register the source
|
|
12
|
+
- Secret reference (hosted production)
|
|
13
|
+
|
|
14
|
+
## Read-only credential
|
|
15
|
+
|
|
16
|
+
First, create or request a read-only database user. Then keep the URL in an env
|
|
17
|
+
var:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
export TRACE_DATABASE_URL='postgres://readonly:...'
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Check the source
|
|
24
|
+
|
|
25
|
+
Run a check:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
npx archal autoloop \
|
|
29
|
+
--repo . \
|
|
30
|
+
--source supabase \
|
|
31
|
+
--database-url-env TRACE_DATABASE_URL \
|
|
32
|
+
--source-id prod-agent-traces \
|
|
33
|
+
--check
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Register the source
|
|
37
|
+
|
|
38
|
+
Register:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
npx archal autoloop \
|
|
42
|
+
--repo . \
|
|
43
|
+
--source supabase \
|
|
44
|
+
--database-url-env TRACE_DATABASE_URL \
|
|
45
|
+
--source-id prod-agent-traces
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
This posts the source config to Archal and returns. It does not start a local
|
|
49
|
+
watcher and does not write local source state.
|
|
50
|
+
|
|
51
|
+
## Secret reference (hosted production)
|
|
52
|
+
|
|
53
|
+
Use a secret reference when the customer already has one:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
npx archal autoloop \
|
|
57
|
+
--repo . \
|
|
58
|
+
--source postgres \
|
|
59
|
+
--database-url-secret-ref aws-secretsmanager://customer/prod-agent-traces
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
`--database-url-secret-ref` must not contain a plaintext credential.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Trace schema mapping
|
|
2
|
+
|
|
3
|
+
Full mapping-flag reference for hosted Postgres/Supabase trace sources. Use this
|
|
4
|
+
when the customer's trace and span tables do not match the defaults below.
|
|
5
|
+
|
|
6
|
+
## Contents
|
|
7
|
+
|
|
8
|
+
- Defaults
|
|
9
|
+
- Custom schema flags
|
|
10
|
+
- Append-only sources
|
|
11
|
+
- Filters for noisy sources
|
|
12
|
+
|
|
13
|
+
## Defaults
|
|
14
|
+
|
|
15
|
+
| Concept | Default |
|
|
16
|
+
|---------|---------|
|
|
17
|
+
| trace table | `ai_traces` |
|
|
18
|
+
| span table | `ai_spans` |
|
|
19
|
+
| trace id | `id` |
|
|
20
|
+
| span id | `id` |
|
|
21
|
+
| span trace id | `trace_id` |
|
|
22
|
+
| trace updated cursor | `updated_at` |
|
|
23
|
+
| span updated cursor | `updated_at` |
|
|
24
|
+
| cursor mode | `updated_at_id` |
|
|
25
|
+
|
|
26
|
+
## Custom schema flags
|
|
27
|
+
|
|
28
|
+
For a custom schema, pass mapping flags:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
npx archal autoloop \
|
|
32
|
+
--repo . \
|
|
33
|
+
--source postgres \
|
|
34
|
+
--database-url-env TRACE_DATABASE_URL \
|
|
35
|
+
--trace-table public.agent_traces \
|
|
36
|
+
--span-table public.agent_spans \
|
|
37
|
+
--trace-id-column trace_id \
|
|
38
|
+
--span-id-column span_id \
|
|
39
|
+
--span-trace-id-column trace_id \
|
|
40
|
+
--parent-span-id-column parent_span_id \
|
|
41
|
+
--trace-updated-at-column updated_at \
|
|
42
|
+
--span-updated-at-column updated_at
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Append-only sources
|
|
46
|
+
|
|
47
|
+
For append-only sources:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
npx archal autoloop \
|
|
51
|
+
--repo . \
|
|
52
|
+
--source supabase \
|
|
53
|
+
--database-url-env TRACE_DATABASE_URL \
|
|
54
|
+
--cursor-mode created_at_id \
|
|
55
|
+
--trace-created-at-column created_at \
|
|
56
|
+
--span-created-at-column created_at
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Filters for noisy sources
|
|
60
|
+
|
|
61
|
+
Use filters for noisy sources:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
npx archal autoloop \
|
|
65
|
+
--repo . \
|
|
66
|
+
--source supabase \
|
|
67
|
+
--database-url-env TRACE_DATABASE_URL \
|
|
68
|
+
--source-workspace-id workspace_123 \
|
|
69
|
+
--source-agent-id support-agent \
|
|
70
|
+
--source-status failed error \
|
|
71
|
+
--source-trace-group billing-support \
|
|
72
|
+
--source-limit 250
|
|
73
|
+
```
|