archal 0.9.19 → 0.9.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -1
- package/agents/github-octokit/.archal.json +8 -0
- package/agents/github-octokit/Dockerfile +8 -0
- package/agents/github-octokit/README.md +113 -0
- package/agents/github-octokit/agent.mjs +54 -0
- package/agents/github-octokit/package.json +9 -0
- package/agents/github-octokit/scenarios/test-repo-access.md +27 -0
- package/agents/google-workspace-local-tools/Dockerfile +6 -0
- package/agents/google-workspace-local-tools/README.md +58 -0
- package/agents/google-workspace-local-tools/agent.mjs +196 -0
- package/agents/google-workspace-local-tools/archal-harness.json +7 -0
- package/agents/google-workspace-local-tools/run-input.yaml +16 -0
- package/agents/google-workspace-local-tools/scenario.md +29 -0
- package/agents/hermes/.archal.json +8 -0
- package/agents/hermes/Dockerfile +46 -0
- package/agents/hermes/README.md +87 -0
- package/agents/hermes/SOUL.md +27 -0
- package/agents/hermes/config.yaml +34 -0
- package/agents/hermes/drive.mjs +113 -0
- package/agents/hermes/scenarios/stripe-customers-read-only.md +32 -0
- package/agents/openclaw/.archal.json +8 -0
- package/agents/openclaw/Dockerfile +96 -0
- package/agents/openclaw/README.md +120 -0
- package/agents/openclaw/drive.mjs +311 -0
- package/agents/openclaw/package.json +9 -0
- package/agents/openclaw/scenarios/github-issue-triage-read-only.md +44 -0
- package/agents/openclaw/workspace/AGENTS.md +23 -0
- package/agents/openclaw/workspace/IDENTITY.md +8 -0
- package/agents/openclaw/workspace/SOUL.md +14 -0
- package/agents/openclaw/workspace/TOOLS.md +35 -0
- package/agents/pagination-test/README.md +24 -0
- package/agents/pagination-test/scenario.md +24 -0
- package/agents/replay-capsule-harness/README.md +29 -0
- package/agents/replay-capsule-harness/observability-install-offline-e2e.mts +1517 -0
- package/agents/replay-capsule-harness/replay-capsule-e2e.mjs +104 -0
- package/clone-assets/apify/tools.json +256 -22
- package/clone-assets/calcom/tools.json +510 -0
- package/clone-assets/clickup/tools.json +1258 -0
- package/clone-assets/customerio/tools.json +386 -0
- package/clone-assets/datadog/tools.json +734 -0
- package/clone-assets/github/tools.json +306 -25
- package/clone-assets/gitlab/tools.json +999 -0
- package/clone-assets/google-workspace/tools.json +18 -6
- package/clone-assets/hubspot/tools.json +1406 -0
- package/clone-assets/jira/fidelity.json +1 -1
- package/clone-assets/jira/tools.json +266 -543
- package/clone-assets/linear/tools.json +238 -40
- package/clone-assets/ownerrez/tools.json +548 -0
- package/clone-assets/pricelabs/tools.json +343 -0
- package/clone-assets/sentry/tools.json +745 -0
- package/clone-assets/slack/tools.json +1 -2
- package/clone-assets/stripe/tools.json +185 -46
- package/clone-assets/supabase/tools.json +437 -0
- package/clone-assets/unipile/tools.json +408 -0
- package/clone-assets/webflow/tools.json +415 -0
- package/dist/autoloop-worker-types-BEb_E44z.d.cts +196 -0
- package/dist/cli.cjs +150299 -87430
- package/dist/commands/autoloop-hosted-worker.cjs +43942 -0
- package/dist/commands/autoloop-hosted-worker.d.cts +143 -0
- package/dist/commands/autoloop-pr-verification.cjs +4227 -0
- package/dist/commands/autoloop-pr-verification.d.cts +17 -0
- package/dist/{vitest/chunk-L36NXAU6.js → commands/autoloop-result-parser.cjs} +16445 -18852
- package/dist/commands/autoloop-result-parser.d.cts +39 -0
- package/dist/commands/autoloop-worker.cjs +36163 -0
- package/dist/commands/autoloop-worker.d.cts +97 -0
- package/dist/harness.cjs +1 -0
- package/dist/index.cjs +1 -1
- package/dist/replay.cjs +49624 -0
- package/dist/replay.d.cts +4625 -0
- package/dist/scenarios.cjs +80343 -0
- package/dist/scenarios.d.cts +562 -0
- package/dist/vitest/chunk-6CBYFCFK.js +4667 -0
- package/dist/vitest/chunk-ARVS45PP.js +2764 -0
- package/dist/vitest/index.cjs +6011 -75261
- package/dist/vitest/index.d.ts +7 -6
- package/dist/vitest/index.js +8 -8
- package/dist/vitest/runtime/hosted-session-reaper.cjs +792 -34359
- package/dist/vitest/runtime/hosted-session-reaper.js +1 -1
- package/dist/vitest/runtime/setup-files.js +2 -2
- package/package.json +8 -3
- package/skills/archal-agent/SKILL.md +87 -0
- package/skills/{attach → autoloop}/SKILL.md +94 -120
- package/skills/autoloop/references/hosted-sources.md +62 -0
- package/skills/autoloop/references/trace-schema-mapping.md +73 -0
- package/skills/eval/SKILL.md +35 -1
- package/skills/install-agent/SKILL.md +221 -0
- package/skills/onboard/SKILL.md +73 -5
- package/skills/scenario/SKILL.md +19 -4
- package/skills/seed/SKILL.md +237 -0
- package/dist/seed/dynamic-generator.cjs +0 -45687
- package/dist/seed/dynamic-generator.d.cts +0 -106
- package/dist/vitest/chunk-WZ7SA4CK.js +0 -47369
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "archal",
|
|
3
|
-
"version": "0.9.
|
|
3
|
+
"version": "0.9.20",
|
|
4
4
|
"description": "Test your agents & integrations against service clones",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.cjs",
|
|
@@ -17,6 +17,10 @@
|
|
|
17
17
|
"types": "./dist/harness.d.cts",
|
|
18
18
|
"default": "./dist/harness.cjs"
|
|
19
19
|
},
|
|
20
|
+
"./scenarios": {
|
|
21
|
+
"types": "./dist/scenarios.d.cts",
|
|
22
|
+
"default": "./dist/scenarios.cjs"
|
|
23
|
+
},
|
|
20
24
|
"./vitest": {
|
|
21
25
|
"types": "./dist/vitest/index.d.ts",
|
|
22
26
|
"import": "./dist/vitest/index.js",
|
|
@@ -47,6 +51,7 @@
|
|
|
47
51
|
"dist",
|
|
48
52
|
"skills",
|
|
49
53
|
"clone-assets",
|
|
54
|
+
"agents",
|
|
50
55
|
"LICENSE"
|
|
51
56
|
],
|
|
52
57
|
"peerDependencies": {
|
|
@@ -58,8 +63,8 @@
|
|
|
58
63
|
}
|
|
59
64
|
},
|
|
60
65
|
"dependencies": {
|
|
61
|
-
"@aws-sdk/client-secrets-manager": "^3.
|
|
62
|
-
"e2b": "^2.
|
|
66
|
+
"@aws-sdk/client-secrets-manager": "^3.1065.0",
|
|
67
|
+
"e2b": "^2.28.2",
|
|
63
68
|
"picomatch": "^4.0.4"
|
|
64
69
|
},
|
|
65
70
|
"scripts": {
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: archal-agent
|
|
3
|
+
description: The front door for using Archal to test, debug, and fix an AI agent. START HERE, then route to the right sub-skill instead of guessing. Use when the user says "use Archal", "test my agent", "set up Archal", "my agent is failing", "reproduce this production failure", "grade my traces", or otherwise wants to evaluate, debug, or auto-fix an agent against service clones. Pick this whenever the request is Archal-shaped but the specific workflow is unclear.
|
|
4
|
+
user-invocable: true
|
|
5
|
+
argument-hint: "[what you want to do with your agent]"
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Archal
|
|
9
|
+
|
|
10
|
+
You are the entry point for Archal. Archal is the QA layer for AI agents: it
|
|
11
|
+
runs an agent against stateful behavioral clones of real services (GitHub,
|
|
12
|
+
Slack, Stripe, Linear, Jira, Supabase, and more), scores how well it satisfies
|
|
13
|
+
each scenario, and turns failures into reproductions and PR fixes. Your job here
|
|
14
|
+
is to orient the operator and route to the sub-skill that owns their workflow.
|
|
15
|
+
Do not inline those flows; hand off by exact name and let the sub-skill drive.
|
|
16
|
+
|
|
17
|
+
## Product mental model
|
|
18
|
+
|
|
19
|
+
Archal tests AI agents against service clones instead of real services, so every
|
|
20
|
+
run is deterministic and replayable. You describe a task and success criteria,
|
|
21
|
+
the agent runs against clones, and an evaluator scores satisfaction (a
|
|
22
|
+
probability, not pass/fail). When a real production trace shows a failure, Archal
|
|
23
|
+
reproduces it on clones from trace evidence and ships the fix as a GitHub PR.
|
|
24
|
+
Everything aims at the same thing: deterministic, replayable evals you can trust.
|
|
25
|
+
|
|
26
|
+
## Decision guide
|
|
27
|
+
|
|
28
|
+
| I want to... | Route to |
|
|
29
|
+
|--------------|----------|
|
|
30
|
+
| Set up Archal in a repo from scratch (install, auth, detect clones) | `onboard` |
|
|
31
|
+
| Write or edit a scenario test file | `scenario` |
|
|
32
|
+
| Load explicit JSON/SQL/catalog state into a clone (deterministic, no LLM) | `seed` |
|
|
33
|
+
| Run scenarios or tasks and interpret satisfaction scores and failures | `eval` |
|
|
34
|
+
| Wire clones into an existing Vitest suite | `vitest` |
|
|
35
|
+
| Connect a repo's production observability so traces get captured | `install-agent` |
|
|
36
|
+
| Run the autoloop (ingest -> grade -> find-failed -> reproduce-on-clones) and ship the fix as a PR (autofix) over real trace sources | `autoloop` |
|
|
37
|
+
| Turn autofix or autoloop on/off for an agent | `autoloop`; the copilot can toggle either |
|
|
38
|
+
|
|
39
|
+
If the user is brand new and has none of this set up, start with `onboard`; it
|
|
40
|
+
detects clones and routes onward from there.
|
|
41
|
+
|
|
42
|
+
## The sub-skills
|
|
43
|
+
|
|
44
|
+
Each lives in the package's `skills/` directory (`packages/archal/skills/` in the source repo) and owns its own commands, contracts,
|
|
45
|
+
and mental model. Route by exact name:
|
|
46
|
+
|
|
47
|
+
- `onboard` — set up Archal in a repo from scratch: install the CLI, handle auth,
|
|
48
|
+
detect which clones the agent needs, and hand off to the right workflow.
|
|
49
|
+
- `scenario` — author and edit scenario test files (Setup / Prompt / Expected
|
|
50
|
+
Behavior / Success Criteria) that `archal run` executes against clones.
|
|
51
|
+
- `seed` — load explicit JSON, SQL, or catalog state into a clone deterministically,
|
|
52
|
+
with no LLM in the loop, so runs start from a known fixture state.
|
|
53
|
+
- `eval` — run scenarios or inline tasks against clones and interpret the results:
|
|
54
|
+
satisfaction scores, `[D]` vs `[P]` criteria, trace inspection, failure diagnosis.
|
|
55
|
+
- `vitest` — wire clones into an existing Vitest suite using the right composition
|
|
56
|
+
pattern, so the agent's own tests route through clones.
|
|
57
|
+
- `install-agent` — connect a repo's production observability (OTLP, Langfuse,
|
|
58
|
+
Braintrust, database trace tables) so real agent traces are captured for Archal.
|
|
59
|
+
- `autoloop` — the loop over real trace sources: ingest a trace, grade it for a
|
|
60
|
+
real failure, find the failed trace, and reproduce it on clones. Autofix (the
|
|
61
|
+
fix/PR step) is a separate toggle on top of this: when turned on, autoloop
|
|
62
|
+
reproduces a failure and ships the fix as a PR.
|
|
63
|
+
|
|
64
|
+
## Autoloop and autofix toggles
|
|
65
|
+
|
|
66
|
+
Autoloop (ingest -> grade -> find-failed -> reproduce-on-clones) and autofix (the
|
|
67
|
+
fix/PR step) are **separate per-agent toggles**. Autofix is opt-in: it is not part
|
|
68
|
+
of autoloop until it is turned on. Either can be switched on or off per agent from
|
|
69
|
+
the agents tab, the CLI (`--execution-policy reproduce` is autoloop with autofix
|
|
70
|
+
off; `fix` turns autofix on), or by asking the Archal copilot in chat — the
|
|
71
|
+
copilot can toggle either for an agent. When the user asks to turn autofix or
|
|
72
|
+
autoloop on/off for an agent, handle the toggle, then route to `autoloop` for the
|
|
73
|
+
deeper flow.
|
|
74
|
+
|
|
75
|
+
## Provider-switchable remediation
|
|
76
|
+
|
|
77
|
+
The Archal copilot is not locked to one model. When autoloop reproduces a failure
|
|
78
|
+
and autofix writes a fix, the user can drive that remediation with their own agent —
|
|
79
|
+
`archal preprod` exposes `--remediation-agent auto|codex|claude|cursor` so the
|
|
80
|
+
fix is written by their Claude Code, Cursor, or Codex — or let Archal's managed
|
|
81
|
+
agent do it. Mention this when the user asks who writes the fix or wants to use
|
|
82
|
+
their own coding agent, then route to `autoloop`.
|
|
83
|
+
|
|
84
|
+
## Docs
|
|
85
|
+
|
|
86
|
+
- Quickstart: https://docs.archal.ai/quickstart
|
|
87
|
+
- Full docs: https://docs.archal.ai
|
|
@@ -1,22 +1,27 @@
|
|
|
1
1
|
---
|
|
2
|
-
name:
|
|
3
|
-
description:
|
|
2
|
+
name: autoloop
|
|
3
|
+
description: Wire Archal Autoloop to a repo plus a real agent-trace source, then drive the import -> grade -> reproduce -> PR-fix loop. USE THIS whenever the user wants to turn production agent traces into reproducible failures and fixes: "I have prod agent traces and want to reproduce a failure", "import my Langfuse / Braintrust / OTel / Supabase traces", "connect a trace source", "grade my prod traces", "turn a failed trace into a PR", "set up / configure the autoloop", or any mention of replaying, grading, or auto-fixing real traces. Also fires when diagnosing a stuck import, grade, reproduction, or PR-fix run, or configuring trace schema mapping.
|
|
4
4
|
user-invocable: true
|
|
5
5
|
argument-hint: "[repo, trace source, or failure description]"
|
|
6
6
|
---
|
|
7
7
|
|
|
8
|
-
# Archal
|
|
8
|
+
# Archal Autoloop
|
|
9
9
|
|
|
10
10
|
You help users connect real agent traces to Archal. Your job is to wire the repo,
|
|
11
11
|
trace source, harness contract, scenario contract, and GitHub PR path without
|
|
12
12
|
guessing or leaking secrets.
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
Autoloop is not a replacement for `archal run`. It uses the same harness and clone
|
|
15
15
|
routing ideas, but the trigger is a trace that already happened.
|
|
16
16
|
|
|
17
|
+
Autoloop is also not arbitrary production trace replay. It can reproduce a
|
|
18
|
+
failure only when the trace, scenario contract, and seed templates contain
|
|
19
|
+
enough evidence to reconstruct the service state that matters. If the evidence
|
|
20
|
+
is thin, block and name the missing data instead of claiming reproduction.
|
|
21
|
+
|
|
17
22
|
## Product mental model
|
|
18
23
|
|
|
19
|
-
|
|
24
|
+
Autoloop does this loop:
|
|
20
25
|
|
|
21
26
|
1. Import a trace and its child spans from a read-only source.
|
|
22
27
|
2. Grade whether the trace contains a real failure.
|
|
@@ -25,17 +30,25 @@ Attach does this loop:
|
|
|
25
30
|
4. Run the reproduction against service clones through the customer harness.
|
|
26
31
|
5. If reproduced, patch the repo and open a GitHub issue or PR.
|
|
27
32
|
|
|
28
|
-
|
|
33
|
+
Steps 1-4 are **autoloop**: ingest -> grade -> find the failed trace ->
|
|
34
|
+
reproduce on clones. Step 5 (writing the fix and opening the PR) is **autofix**,
|
|
35
|
+
a separate opt-in step that is *not* part of autoloop until it is turned on.
|
|
36
|
+
Both are per-agent toggles, switchable from the agents tab, the CLI, or by asking
|
|
37
|
+
the Archal copilot in chat.
|
|
38
|
+
|
|
39
|
+
The CLI maps these toggles to `--execution-policy`: `reproduce` runs autoloop
|
|
40
|
+
only, with autofix off, while `fix` turns autofix on (autoloop plus the fix/PR
|
|
41
|
+
step). Narrower policies stop earlier:
|
|
29
42
|
|
|
30
|
-
| Policy | Stops after |
|
|
31
|
-
|
|
32
|
-
| `observe` | import |
|
|
33
|
-
| `grade` | grading |
|
|
34
|
-
| `reproduce` | reproduction |
|
|
35
|
-
| `fix` | PR or blocked fix status |
|
|
43
|
+
| Policy | Stops after | Autofix |
|
|
44
|
+
|--------|-------------|---------|
|
|
45
|
+
| `observe` | import | off |
|
|
46
|
+
| `grade` | grading | off |
|
|
47
|
+
| `reproduce` | reproduction | off |
|
|
48
|
+
| `fix` | PR or blocked fix status | on |
|
|
36
49
|
|
|
37
50
|
Do not invent or promote separate top-level judge, reproduce, fix, or
|
|
38
|
-
trace-replay commands. The public command is `archal attach`. Local stop
|
|
51
|
+
trace-replay commands. The public command is `archal autoloop`. The local stop
|
|
39
52
|
command is `archal detach` for file-backed trace directories.
|
|
40
53
|
|
|
41
54
|
## Discover first
|
|
@@ -67,6 +80,8 @@ Before changing anything, inspect the repo:
|
|
|
67
80
|
Hosted sources and `--execution-policy fix` need a GitHub remote.
|
|
68
81
|
5. Trace source shape, if available:
|
|
69
82
|
- provider: local files, Postgres, Supabase
|
|
83
|
+
- or local/client-ingested sources normalized through `archal trace-source`
|
|
84
|
+
such as file, HTTP/OTel, Langfuse, Braintrust, S3/GCS, or custom JSON
|
|
70
85
|
- trace table and span table names
|
|
71
86
|
- id columns
|
|
72
87
|
- parent span column
|
|
@@ -78,7 +93,7 @@ only the env var name or secret reference.
|
|
|
78
93
|
|
|
79
94
|
## Preconditions
|
|
80
95
|
|
|
81
|
-
You need these before a full hosted
|
|
96
|
+
You need these before a full hosted Autoloop setup:
|
|
82
97
|
|
|
83
98
|
- Archal CLI installed in the repo or reachable with `npx archal`
|
|
84
99
|
- authenticated user (`archal login`) or `ARCHAL_TOKEN=archal_ws_...`
|
|
@@ -132,7 +147,7 @@ The command must be headless and repeatable. It should run the real agent path,
|
|
|
132
147
|
not a hand-authored mock. During reproduction, Archal invokes this command
|
|
133
148
|
through `archal run`, so the agent should read the task from `AGENT_TASK` and
|
|
134
149
|
print its final answer to stdout. If the project already has `.archal.json` for
|
|
135
|
-
`archal run`, align the
|
|
150
|
+
`archal run`, align the Autoloop harness with that command.
|
|
136
151
|
|
|
137
152
|
### 3. Add or verify `archal/scenario.md`
|
|
138
153
|
|
|
@@ -174,127 +189,75 @@ archal/seeds/
|
|
|
174
189
|
jira-escalations.json
|
|
175
190
|
```
|
|
176
191
|
|
|
177
|
-
Seed templates should contain stable service state for the task family.
|
|
192
|
+
Seed templates should contain stable service state for the task family. Autoloop
|
|
178
193
|
can then fill in trace-specific identifiers. This is much safer than expecting
|
|
179
194
|
weak traces to reconstruct full service state.
|
|
180
195
|
|
|
181
196
|
## Hosted database source
|
|
182
197
|
|
|
183
|
-
Use this when traces live in Postgres or Supabase.
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
```bash
|
|
189
|
-
export TRACE_DATABASE_URL='postgres://readonly:...'
|
|
190
|
-
```
|
|
191
|
-
|
|
192
|
-
Run a check:
|
|
193
|
-
|
|
194
|
-
```bash
|
|
195
|
-
npx archal attach \
|
|
196
|
-
--repo . \
|
|
197
|
-
--source supabase \
|
|
198
|
-
--database-url-env TRACE_DATABASE_URL \
|
|
199
|
-
--source-id prod-agent-traces \
|
|
200
|
-
--check
|
|
201
|
-
```
|
|
198
|
+
Use this when traces live in Postgres or Supabase. Create a read-only DB user,
|
|
199
|
+
keep the URL in `TRACE_DATABASE_URL` (or a secret ref in hosted production), then
|
|
200
|
+
`--check` the source and re-run without `--check` to register it. Registration
|
|
201
|
+
posts the source config to Archal and returns; hosted workers own polling after
|
|
202
|
+
that, so local `archal detach` does not disable it.
|
|
202
203
|
|
|
203
|
-
|
|
204
|
+
See `references/hosted-sources.md` for the full check, register, and
|
|
205
|
+
`--database-url-secret-ref` flag blocks.
|
|
204
206
|
|
|
205
|
-
|
|
206
|
-
npx archal attach \
|
|
207
|
-
--repo . \
|
|
208
|
-
--source supabase \
|
|
209
|
-
--database-url-env TRACE_DATABASE_URL \
|
|
210
|
-
--source-id prod-agent-traces
|
|
211
|
-
```
|
|
207
|
+
## Client-side trace ingestion
|
|
212
208
|
|
|
213
|
-
|
|
214
|
-
|
|
209
|
+
Use `archal trace-source` when traces are not already in a hosted Postgres or
|
|
210
|
+
Supabase table. This command normalizes source-specific payloads into Archal
|
|
211
|
+
trace upload envelopes, writes them to a trace directory, and can upload them to
|
|
212
|
+
hosted Autoloop when workspace auth is available.
|
|
215
213
|
|
|
216
|
-
|
|
214
|
+
Common paths:
|
|
217
215
|
|
|
218
216
|
```bash
|
|
219
|
-
npx archal
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
217
|
+
npx archal trace-source import ./exports --preview --json
|
|
218
|
+
npx archal trace-source import ./exports --upload --repository owner/repo
|
|
219
|
+
|
|
220
|
+
npx archal trace-source connect langfuse \
|
|
221
|
+
--base-url https://cloud.langfuse.com \
|
|
222
|
+
--api-key-env LANGFUSE_READ_KEY \
|
|
223
|
+
--out .archal/traces/inbox
|
|
224
|
+
npx archal trace-source test langfuse
|
|
225
|
+
npx archal trace-source sync langfuse --upload --repository owner/repo
|
|
226
|
+
npx archal trace-source watch langfuse --upload --repository owner/repo
|
|
227
|
+
|
|
228
|
+
npx archal trace-source connect custom --name "prod exporter" --out .archal/traces/inbox
|
|
229
|
+
npx archal trace-source serve "prod exporter" --port 4319
|
|
223
230
|
```
|
|
224
231
|
|
|
225
|
-
|
|
232
|
+
Use `archal trace-source status [source]` to inspect registry validation,
|
|
233
|
+
cursor, and last-sync state. `watch` is for pull-style sources; push sources
|
|
234
|
+
stay continuous through `serve`.
|
|
226
235
|
|
|
227
236
|
## Trace schema mapping
|
|
228
237
|
|
|
229
|
-
|
|
238
|
+
Hosted sources default to `ai_traces` / `ai_spans` with `id` / `trace_id`
|
|
239
|
+
columns and `updated_at_id` cursor mode. When the customer's tables differ, pass
|
|
240
|
+
mapping flags to override table names, id columns, parent-span column, and
|
|
241
|
+
cursor columns; switch to `created_at_id` cursor mode for append-only sources;
|
|
242
|
+
and use `--source-*` filters to scope noisy sources by workspace, agent, status,
|
|
243
|
+
trace group, or limit.
|
|
230
244
|
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
| trace table | `ai_traces` |
|
|
234
|
-
| span table | `ai_spans` |
|
|
235
|
-
| trace id | `id` |
|
|
236
|
-
| span id | `id` |
|
|
237
|
-
| span trace id | `trace_id` |
|
|
238
|
-
| trace updated cursor | `updated_at` |
|
|
239
|
-
| span updated cursor | `updated_at` |
|
|
240
|
-
| cursor mode | `updated_at_id` |
|
|
241
|
-
|
|
242
|
-
For a custom schema, pass mapping flags:
|
|
243
|
-
|
|
244
|
-
```bash
|
|
245
|
-
npx archal attach \
|
|
246
|
-
--repo . \
|
|
247
|
-
--source postgres \
|
|
248
|
-
--database-url-env TRACE_DATABASE_URL \
|
|
249
|
-
--trace-table public.agent_traces \
|
|
250
|
-
--span-table public.agent_spans \
|
|
251
|
-
--trace-id-column trace_id \
|
|
252
|
-
--span-id-column span_id \
|
|
253
|
-
--span-trace-id-column trace_id \
|
|
254
|
-
--parent-span-id-column parent_span_id \
|
|
255
|
-
--trace-updated-at-column updated_at \
|
|
256
|
-
--span-updated-at-column updated_at
|
|
257
|
-
```
|
|
258
|
-
|
|
259
|
-
For append-only sources:
|
|
260
|
-
|
|
261
|
-
```bash
|
|
262
|
-
npx archal attach \
|
|
263
|
-
--repo . \
|
|
264
|
-
--source supabase \
|
|
265
|
-
--database-url-env TRACE_DATABASE_URL \
|
|
266
|
-
--cursor-mode created_at_id \
|
|
267
|
-
--trace-created-at-column created_at \
|
|
268
|
-
--span-created-at-column created_at
|
|
269
|
-
```
|
|
270
|
-
|
|
271
|
-
Use filters for noisy sources:
|
|
272
|
-
|
|
273
|
-
```bash
|
|
274
|
-
npx archal attach \
|
|
275
|
-
--repo . \
|
|
276
|
-
--source supabase \
|
|
277
|
-
--database-url-env TRACE_DATABASE_URL \
|
|
278
|
-
--source-workspace-id workspace_123 \
|
|
279
|
-
--source-agent-id support-agent \
|
|
280
|
-
--source-status failed error \
|
|
281
|
-
--source-trace-group billing-support \
|
|
282
|
-
--source-limit 250
|
|
283
|
-
```
|
|
245
|
+
See `references/trace-schema-mapping.md` for the full defaults table plus the
|
|
246
|
+
custom-schema, append-only, and filter flag blocks.
|
|
284
247
|
|
|
285
248
|
## Local trace directory
|
|
286
249
|
|
|
287
250
|
Use this for a local pilot or exported trace files:
|
|
288
251
|
|
|
289
252
|
```bash
|
|
290
|
-
npx archal
|
|
253
|
+
npx archal autoloop ./prod-traces --repo . --execution-policy reproduce
|
|
291
254
|
```
|
|
292
255
|
|
|
293
256
|
Artifacts are written under:
|
|
294
257
|
|
|
295
258
|
```text
|
|
296
|
-
.archal/
|
|
297
|
-
|
|
259
|
+
.archal/autoloop/
|
|
260
|
+
autoloops.json
|
|
298
261
|
runs.jsonl
|
|
299
262
|
raw/
|
|
300
263
|
grades/
|
|
@@ -313,17 +276,25 @@ npx archal detach ./prod-traces --repo .
|
|
|
313
276
|
|
|
314
277
|
Do not describe `archal detach` as a hosted source disable command.
|
|
315
278
|
|
|
316
|
-
##
|
|
279
|
+
## CLI-first operation
|
|
317
280
|
|
|
318
|
-
|
|
281
|
+
Prefer CLI and artifact evidence for handoffs unless the user explicitly asks
|
|
282
|
+
for a workspace page.
|
|
319
283
|
|
|
320
|
-
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
-
|
|
284
|
+
- Local file-backed loops: `archal autoloop <trace-dir> --repo ...` starts the
|
|
285
|
+
watcher, `archal detach <trace-dir> --repo ...` stops it, `archal
|
|
286
|
+
autoloop-status --repo ...` summarizes trace jobs, and `archal
|
|
287
|
+
autoloop-reprocess --repo ... <trace-id>` retries terminal jobs after a
|
|
288
|
+
blocker is fixed.
|
|
289
|
+
- Hosted database sources: `archal autoloop --source postgres|supabase ...`
|
|
290
|
+
registers the source and returns. Local `archal detach` does not disable a
|
|
291
|
+
hosted source because hosted workers own polling after registration.
|
|
292
|
+
- Safe resume means re-running the same `archal autoloop` registration or
|
|
293
|
+
reprocessing a terminal local trace only after the missing evidence,
|
|
294
|
+
credential, mapping, harness, or GitHub blocker is corrected.
|
|
324
295
|
|
|
325
|
-
|
|
326
|
-
|
|
296
|
+
Report exact artifact paths and statuses. Do not make dashboard pages the only
|
|
297
|
+
place a user can understand what happened.
|
|
327
298
|
|
|
328
299
|
## How to diagnose failures
|
|
329
300
|
|
|
@@ -331,6 +302,8 @@ Classify failures precisely:
|
|
|
331
302
|
|
|
332
303
|
- Trace import failure: database/source auth, mapping, cursor, filters, bad
|
|
333
304
|
trace shape.
|
|
305
|
+
- Trace ingestion failure: `trace-source` adapter mismatch, rejected hosted
|
|
306
|
+
upload, missing workspace auth, bad idempotency key, or receiver auth failure.
|
|
334
307
|
- Grade failure: judge could not determine expected outcome, missing evaluator
|
|
335
308
|
contract, trace lacks task context.
|
|
336
309
|
- Missing evidence: trace does not contain enough state to seed. Add spans,
|
|
@@ -351,7 +324,7 @@ claim reproduction succeeded.
|
|
|
351
324
|
|
|
352
325
|
## Artifact reading guide
|
|
353
326
|
|
|
354
|
-
Local file-backed
|
|
327
|
+
Local file-backed Autoloop uses repo-local artifacts. Hosted Autoloop exposes the
|
|
355
328
|
same phase information in the dashboard.
|
|
356
329
|
|
|
357
330
|
| Artifact | What to inspect |
|
|
@@ -387,7 +360,7 @@ After setup or debugging, give the user:
|
|
|
387
360
|
- source provider and source id
|
|
388
361
|
- repo full name
|
|
389
362
|
- execution policy
|
|
390
|
-
-
|
|
363
|
+
- CLI status command to run next
|
|
391
364
|
- artifacts produced, if local
|
|
392
365
|
- whether import, grade, seed, reproduce, and fix phases are ready
|
|
393
366
|
- exact blocker if any
|
|
@@ -395,8 +368,9 @@ After setup or debugging, give the user:
|
|
|
395
368
|
|
|
396
369
|
## Docs
|
|
397
370
|
|
|
398
|
-
-
|
|
399
|
-
-
|
|
371
|
+
- Autoloop production traces: https://docs.archal.ai/guides/autoloop-production-traces
|
|
372
|
+
- Autonomous loops: https://docs.archal.ai/guides/autoloop-production-traces
|
|
373
|
+
- CLI reference: https://docs.archal.ai/cli/autoloop
|
|
400
374
|
- Running with an agent: https://docs.archal.ai/guides/run-with-agent
|
|
401
375
|
- Writing scenarios: https://docs.archal.ai/guides/writing-scenarios
|
|
402
376
|
- Seeds: https://docs.archal.ai/guides/seeds
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Hosted database source
|
|
2
|
+
|
|
3
|
+
Full flag reference for registering a hosted Postgres or Supabase trace source.
|
|
4
|
+
Use this when traces live in Postgres or Supabase. Registration posts the source
|
|
5
|
+
config to Archal and returns; hosted workers own polling after that.
|
|
6
|
+
|
|
7
|
+
## Contents
|
|
8
|
+
|
|
9
|
+
- Read-only credential
|
|
10
|
+
- Check the source
|
|
11
|
+
- Register the source
|
|
12
|
+
- Secret reference (hosted production)
|
|
13
|
+
|
|
14
|
+
## Read-only credential
|
|
15
|
+
|
|
16
|
+
First, create or request a read-only database user. Then keep the URL in an env
|
|
17
|
+
var:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
export TRACE_DATABASE_URL='postgres://readonly:...'
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Check the source
|
|
24
|
+
|
|
25
|
+
Run a check:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
npx archal autoloop \
|
|
29
|
+
--repo . \
|
|
30
|
+
--source supabase \
|
|
31
|
+
--database-url-env TRACE_DATABASE_URL \
|
|
32
|
+
--source-id prod-agent-traces \
|
|
33
|
+
--check
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Register the source
|
|
37
|
+
|
|
38
|
+
Register:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
npx archal autoloop \
|
|
42
|
+
--repo . \
|
|
43
|
+
--source supabase \
|
|
44
|
+
--database-url-env TRACE_DATABASE_URL \
|
|
45
|
+
--source-id prod-agent-traces
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
This posts the source config to Archal and returns. It does not start a local
|
|
49
|
+
watcher and does not write local source state.
|
|
50
|
+
|
|
51
|
+
## Secret reference (hosted production)
|
|
52
|
+
|
|
53
|
+
Use a secret reference when the customer already has one:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
npx archal autoloop \
|
|
57
|
+
--repo . \
|
|
58
|
+
--source postgres \
|
|
59
|
+
--database-url-secret-ref aws-secretsmanager://customer/prod-agent-traces
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
`--database-url-secret-ref` must not contain a plaintext credential.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Trace schema mapping
|
|
2
|
+
|
|
3
|
+
Full mapping-flag reference for hosted Postgres/Supabase trace sources. Use this
|
|
4
|
+
when the customer's trace and span tables do not match the defaults below.
|
|
5
|
+
|
|
6
|
+
## Contents
|
|
7
|
+
|
|
8
|
+
- Defaults
|
|
9
|
+
- Custom schema flags
|
|
10
|
+
- Append-only sources
|
|
11
|
+
- Filters for noisy sources
|
|
12
|
+
|
|
13
|
+
## Defaults
|
|
14
|
+
|
|
15
|
+
| Concept | Default |
|
|
16
|
+
|---------|---------|
|
|
17
|
+
| trace table | `ai_traces` |
|
|
18
|
+
| span table | `ai_spans` |
|
|
19
|
+
| trace id | `id` |
|
|
20
|
+
| span id | `id` |
|
|
21
|
+
| span trace id | `trace_id` |
|
|
22
|
+
| trace updated cursor | `updated_at` |
|
|
23
|
+
| span updated cursor | `updated_at` |
|
|
24
|
+
| cursor mode | `updated_at_id` |
|
|
25
|
+
|
|
26
|
+
## Custom schema flags
|
|
27
|
+
|
|
28
|
+
For a custom schema, pass mapping flags:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
npx archal autoloop \
|
|
32
|
+
--repo . \
|
|
33
|
+
--source postgres \
|
|
34
|
+
--database-url-env TRACE_DATABASE_URL \
|
|
35
|
+
--trace-table public.agent_traces \
|
|
36
|
+
--span-table public.agent_spans \
|
|
37
|
+
--trace-id-column trace_id \
|
|
38
|
+
--span-id-column span_id \
|
|
39
|
+
--span-trace-id-column trace_id \
|
|
40
|
+
--parent-span-id-column parent_span_id \
|
|
41
|
+
--trace-updated-at-column updated_at \
|
|
42
|
+
--span-updated-at-column updated_at
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Append-only sources
|
|
46
|
+
|
|
47
|
+
For append-only sources:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
npx archal autoloop \
|
|
51
|
+
--repo . \
|
|
52
|
+
--source supabase \
|
|
53
|
+
--database-url-env TRACE_DATABASE_URL \
|
|
54
|
+
--cursor-mode created_at_id \
|
|
55
|
+
--trace-created-at-column created_at \
|
|
56
|
+
--span-created-at-column created_at
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Filters for noisy sources
|
|
60
|
+
|
|
61
|
+
Use filters for noisy sources:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
npx archal autoloop \
|
|
65
|
+
--repo . \
|
|
66
|
+
--source supabase \
|
|
67
|
+
--database-url-env TRACE_DATABASE_URL \
|
|
68
|
+
--source-workspace-id workspace_123 \
|
|
69
|
+
--source-agent-id support-agent \
|
|
70
|
+
--source-status failed error \
|
|
71
|
+
--source-trace-group billing-support \
|
|
72
|
+
--source-limit 250
|
|
73
|
+
```
|
package/skills/eval/SKILL.md
CHANGED
|
@@ -90,6 +90,40 @@ Exit codes: `0` pass, `1` fail or score < threshold, `2` validation error. For G
|
|
|
90
90
|
|
|
91
91
|
Workspace API keys are runtime and CI credentials bound to one workspace. They can run clones, upload and read traces, and read usage for that workspace. They cannot manage audit events or workspace API keys. Use an owner/admin user credential, either `archal login` or a dashboard-issued user API key, for workspace administration.
|
|
92
92
|
|
|
93
|
+
## Pre-production autonomous loop
|
|
94
|
+
|
|
95
|
+
Use `archal preprod start` when the user wants a coding agent to run a bounded
|
|
96
|
+
pack of scenarios before shipping, remediate failures, rerun, validate, and
|
|
97
|
+
open a draft PR. This is different from post-production `archal autoloop`: it
|
|
98
|
+
starts from repo scenarios and clone runs, not imported production traces.
|
|
99
|
+
|
|
100
|
+
First do a safe dry run:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
archal preprod start --scenario-count 20 --dry-run --artifacts .archal/preprod
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Then, only after the dry-run artifacts look like real agent/scenario failures,
|
|
107
|
+
allow the managed remediation path:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
archal preprod start \
|
|
111
|
+
--scenario-count 20 \
|
|
112
|
+
--allow-external-execution \
|
|
113
|
+
--remediation-agent codex \
|
|
114
|
+
--validation-command 'pnpm test' \
|
|
115
|
+
--open-pr \
|
|
116
|
+
--pr-command 'gh pr create --draft --fill' \
|
|
117
|
+
--artifacts .archal/preprod
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Read `.archal/preprod/preprod-result.json`,
|
|
121
|
+
`.archal/preprod/preprod-failures.json`, and the remediation context before
|
|
122
|
+
summarizing results. Treat runs without validation evidence as local
|
|
123
|
+
remediation passes, not release proof. If a run stops after `initial-runs`,
|
|
124
|
+
`fix`, or `validation`, resume with `archal preprod start --resume
|
|
125
|
+
.archal/preprod --artifacts .archal/preprod`.
|
|
126
|
+
|
|
93
127
|
## Artifacts + dashboard
|
|
94
128
|
|
|
95
129
|
- **Local (always written):** `.archal/cache/last-run.json` (summary), `.archal/cache/runs/*.json` (full redacted trace).
|
|
@@ -108,6 +142,6 @@ Don't tell users they need `-o json` to save artifacts locally - that's only for
|
|
|
108
142
|
## Docs
|
|
109
143
|
|
|
110
144
|
- Running with an agent: https://docs.archal.ai/guides/run-with-agent
|
|
111
|
-
- Existing repo playbook: https://docs.archal.ai/guides/
|
|
145
|
+
- Existing repo playbook: https://docs.archal.ai/guides/run-with-agent
|
|
112
146
|
- Scenario authoring: hand off to the `scenario` skill
|
|
113
147
|
- Clone sessions: https://docs.archal.ai/guides/clone-sessions
|