archal 0.9.13 → 0.9.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/README.md +81 -73
  2. package/bin/archal.cjs +1 -1
  3. package/clone-assets/apify/tools.json +668 -0
  4. package/{twin-assets → clone-assets}/discord/fidelity.json +1 -1
  5. package/{twin-assets → clone-assets}/discord/tools.json +510 -510
  6. package/clone-assets/github/fidelity.json +31 -0
  7. package/{twin-assets → clone-assets}/github/tools.json +113 -3
  8. package/{twin-assets → clone-assets}/google-workspace/fidelity.json +2 -2
  9. package/{twin-assets → clone-assets}/google-workspace/tools.json +10 -10
  10. package/{twin-assets → clone-assets}/jira/fidelity.json +44 -4
  11. package/{twin-assets → clone-assets}/jira/tools.json +1 -1
  12. package/clone-assets/linear/fidelity.json +36 -0
  13. package/{twin-assets → clone-assets}/linear/tools.json +1 -1
  14. package/{twin-assets → clone-assets}/ramp/fidelity.json +1 -1
  15. package/{twin-assets → clone-assets}/ramp/tools.json +1 -1
  16. package/clone-assets/slack/fidelity.json +38 -0
  17. package/{twin-assets → clone-assets}/slack/tools.json +1 -1
  18. package/clone-assets/stripe/fidelity.json +67 -0
  19. package/{twin-assets → clone-assets}/stripe/tools.json +42 -11
  20. package/clone-assets/supabase/fidelity.json +31 -0
  21. package/{twin-assets → clone-assets}/supabase/tools.json +1 -1
  22. package/clone-assets/tavily/tools.json +115 -0
  23. package/dist/cli.cjs +97917 -0
  24. package/dist/cli.d.cts +1 -0
  25. package/dist/harness.cjs +62 -0
  26. package/dist/harness.d.cts +20 -0
  27. package/dist/index.cjs +5 -87878
  28. package/dist/index.d.cts +3 -1
  29. package/dist/seed/dynamic-generator.cjs +8796 -9201
  30. package/dist/seed/dynamic-generator.d.cts +39 -0
  31. package/dist/vitest/chunk-2GY4SFKE.js +29279 -0
  32. package/dist/vitest/{chunk-KTMNDJFB.js → chunk-WVRVNHAX.js} +45255 -44440
  33. package/dist/vitest/index.cjs +56408 -31519
  34. package/dist/vitest/index.d.ts +61 -27
  35. package/dist/vitest/index.js +145 -1807
  36. package/dist/vitest/runtime/hosted-session-reaper.cjs +34766 -28922
  37. package/dist/vitest/runtime/hosted-session-reaper.js +1 -2
  38. package/dist/vitest/runtime/setup-files.js +2 -3
  39. package/package.json +19 -10
  40. package/skills/eval/SKILL.md +113 -0
  41. package/skills/onboard/SKILL.md +67 -36
  42. package/skills/scenario/SKILL.md +22 -20
  43. package/skills/vitest/SKILL.md +25 -24
  44. package/dist/vitest/chunk-L6HSMJ3F.js +0 -2216
  45. package/dist/vitest/chunk-YJICENME.js +0 -1230
  46. package/dist/vitest/src-JGHX6UKK.js +0 -94
  47. package/skills/audit/SKILL.md +0 -55
  48. package/skills/test/SKILL.md +0 -109
  49. package/twin-assets/github/fidelity.json +0 -13
  50. package/twin-assets/linear/fidelity.json +0 -18
  51. package/twin-assets/slack/fidelity.json +0 -20
  52. package/twin-assets/stripe/fidelity.json +0 -22
  53. package/twin-assets/supabase/fidelity.json +0 -13
@@ -1,94 +0,0 @@
1
- import {
2
- AUTH_RETRY_OPTIONS,
3
- AUTH_TOKEN_ENV_VAR,
4
- CLI_LOOPBACK_CALLBACK_HOST,
5
- CREDENTIALS_FILE,
6
- CREDENTIALS_KEY_FILE,
7
- CREDENTIALS_MASTER_KEY_ENV_VAR,
8
- ENV_TOKEN_FALLBACK_TTL_SECONDS,
9
- ExpiredEnvTokenError,
10
- HOSTED_DEFAULT_API_BASE_URL,
11
- HOSTED_DEFAULT_AUTH_BASE_URL,
12
- HOSTED_DEFAULT_RUNTIME_BASE_URL,
13
- KEYCHAIN_ACCOUNT,
14
- KEYCHAIN_SERVICE,
15
- REQUEST_TIMEOUT_MS,
16
- STRICT_ENDPOINTS_ENV_VAR,
17
- TOKEN_ENCRYPTION_PREFIX,
18
- buildAuthRequestHeaders,
19
- buildLoopbackCallbackUrl,
20
- decodeJwtPayload,
21
- deleteCredentials,
22
- errorMessage,
23
- exchangeCliAuthCode,
24
- getArchalDir,
25
- getConfiguredApiBaseUrl,
26
- getConfiguredAuthBaseUrl,
27
- getConfiguredRuntimeBaseUrl,
28
- getCredentials,
29
- getJwtExpiry,
30
- getRealHomeStoredCredentials,
31
- getStoredCredentials,
32
- isEntitled,
33
- isLoopbackHostname,
34
- isLoopbackHttpUrl,
35
- isLoopbackRedirectUri,
36
- isPlan,
37
- pollCliDeviceAuth,
38
- refreshAuthFromServer,
39
- refreshAuthFromServerWithValidation,
40
- refreshCliSession,
41
- resolveWhoamiAuthState,
42
- revokeCliSession,
43
- saveCredentials,
44
- saveRealHomeCredentials,
45
- startCliDeviceAuth,
46
- validateTokenWithServer
47
- } from "./chunk-YJICENME.js";
48
- export {
49
- AUTH_RETRY_OPTIONS,
50
- AUTH_TOKEN_ENV_VAR,
51
- CLI_LOOPBACK_CALLBACK_HOST,
52
- CREDENTIALS_FILE,
53
- CREDENTIALS_KEY_FILE,
54
- CREDENTIALS_MASTER_KEY_ENV_VAR,
55
- ENV_TOKEN_FALLBACK_TTL_SECONDS,
56
- ExpiredEnvTokenError,
57
- HOSTED_DEFAULT_API_BASE_URL,
58
- HOSTED_DEFAULT_AUTH_BASE_URL,
59
- HOSTED_DEFAULT_RUNTIME_BASE_URL,
60
- KEYCHAIN_ACCOUNT,
61
- KEYCHAIN_SERVICE,
62
- REQUEST_TIMEOUT_MS,
63
- STRICT_ENDPOINTS_ENV_VAR,
64
- TOKEN_ENCRYPTION_PREFIX,
65
- buildAuthRequestHeaders,
66
- buildLoopbackCallbackUrl,
67
- decodeJwtPayload,
68
- deleteCredentials,
69
- errorMessage,
70
- exchangeCliAuthCode,
71
- getArchalDir,
72
- getConfiguredApiBaseUrl,
73
- getConfiguredAuthBaseUrl,
74
- getConfiguredRuntimeBaseUrl,
75
- getCredentials,
76
- getJwtExpiry,
77
- getRealHomeStoredCredentials,
78
- getStoredCredentials,
79
- isEntitled,
80
- isLoopbackHostname,
81
- isLoopbackHttpUrl,
82
- isLoopbackRedirectUri,
83
- isPlan,
84
- pollCliDeviceAuth,
85
- refreshAuthFromServer,
86
- refreshAuthFromServerWithValidation,
87
- refreshCliSession,
88
- resolveWhoamiAuthState,
89
- revokeCliSession,
90
- saveCredentials,
91
- saveRealHomeCredentials,
92
- startCliDeviceAuth,
93
- validateTokenWithServer
94
- };
@@ -1,55 +0,0 @@
1
- ---
2
- name: audit
3
- description: Audit an Archal repository thoroughly. Trace real execution paths, identify concrete bugs and design flaws, distinguish root-cause fixes from architecture problems, and add regression tests for every confirmed issue.
4
- user-invocable: true
5
- argument-hint: "[repo path or scope]"
6
- ---
7
-
8
- # Archal Repository Audit
9
-
10
- Use this skill when the goal is to inspect an Archal repository deeply, find problems worth fixing, and avoid shallow or local-only patches.
11
-
12
- ## Audit standard
13
-
14
- - Trace real execution paths from entrypoints before proposing fixes.
15
- - Prefer root-cause fixes over guards, silencing, or narrow special cases.
16
- - If the real problem is architectural, report it instead of applying a monkey patch.
17
- - For every confirmed bug you fix, add the narrowest regression test that would have caught it earlier.
18
- - Always include at least one regression test that covers a stale-data row or pre-migration row when the touched path has compatibility logic.
19
-
20
- ## Working pattern
21
-
22
- 1. Map the hot paths first.
23
- - Identify the actual entrypoints: CLI commands, web routes, background jobs, and core runtime/session flows.
24
- - Ignore dead-looking surfaces until the primary paths are understood.
25
- 2. Read the execution path end to end.
26
- - Follow inputs through parsing, validation, persistence, normalization, and response shaping.
27
- - Inspect nearby invariants and adjacent edge cases before deciding on a fix.
28
- 3. Separate findings into two buckets.
29
- - **Fix now**: clear bug, contained scope, root cause understood, regression test is obvious.
30
- - **Escalate**: the defect comes from a bad abstraction or architectural boundary and a local patch would hide the real problem.
31
- 4. Validate narrowly, then broadly.
32
- - Run the smallest meaningful tests for the changed path first.
33
- - If code changed, also run the relevant package build/typecheck before concluding.
34
-
35
- ## What to look for
36
-
37
- - Compatibility shims that silently drop data from old rows or partially migrated schemas
38
- - Session lifecycle bugs around start, ready, teardown, stale state, and idempotency
39
- - Projection code that derives canonical state from stale denormalized fields
40
- - Fallback behavior that changes semantics instead of preserving them
41
- - Query builders that filter on derived fields inconsistently across list/count paths
42
- - Evidence, trace, or normalization code that double-counts, hides, or misattributes records
43
-
44
- ## Output format
45
-
46
- For each finding, report:
47
-
48
- - Problem
49
- - Technical cause
50
- - Simple explanation
51
- - Optimal fix
52
- - Why that fix is better than narrower alternatives
53
- - Regression test to add
54
-
55
- If no actionable problems are found in a slice, say that explicitly and note any remaining coverage gaps.
@@ -1,109 +0,0 @@
1
- ---
2
- name: test
3
- description: Run Archal scenarios or inline tasks against hosted twins, diagnose failed runs, and interpret satisfaction scores. Triggers on "run my scenario", "test my agent", "archal run X", "debug this failing run", "what does this satisfaction score mean".
4
- user-invocable: true
5
- argument-hint: "[scenario.md or task description]"
6
- ---
7
-
8
- # Archal Test Runner
9
-
10
- You run Archal scenarios and inline tasks, then help the user interpret the results. For setting up the agent path or `.archal.json` in a fresh repo, hand off to the `onboard` skill.
11
-
12
- ## What only you know (product mental model)
13
-
14
- - `archal run` spawns the user's agent as a child process. The agent needs:
15
- - A **runnable agent path**. Two ways to supply it: explicit `--harness <path>` (e.g. `./.archal/harness.ts`), or `.archal.json` with an `agent` command. Repo-local auto-discovery also walks up from cwd for a top-level `harness.{ts,js,mjs,cjs}`.
16
- - A **headless boundary** — no UI, no browser OAuth. The process is spawned without a shell, so interactive auth hangs forever.
17
- - Env vars — auto-injected. `ARCHAL_ENGINE_TASK` is the prompt; `ARCHAL_<TWIN>_BASE_URL` / `ARCHAL_<TWIN>_URL` point at twins; `ARCHAL_PREFLIGHT=1` is set during boot check (harness should exit early).
18
- - Every `archal run` writes local artifacts under `.archal/cache/last-run.json` and `.archal/cache/runs/*.json` **regardless** of `--output`. `--output json` is only for machine-readable stdout; it's not needed for local persistence.
19
- - **Satisfaction score** = (runs passing all criteria) / (total runs). `[D]` criteria are deterministic state checks; `[P]` criteria are LLM-judged from trace + final state.
20
-
21
- ## Preflight the harness before a run
22
-
23
- When the agent path is uncertain, or after any change to the harness file, smoke-test the harness directly before `archal run`:
24
-
25
- ```bash
26
- ARCHAL_PREFLIGHT=1 ARCHAL_ENGINE_TASK="Reply with OK and do not use tools." npx tsx ./.archal/harness.ts
27
- ```
28
-
29
- A harness that exits cleanly with no tool calls is ready. Catches: no runnable entrypoint, UI-boot assumptions, missing provider keys, service bridge misconfig. A failure here is much easier to diagnose than a silent timeout inside `archal run`.
30
-
31
- ## Running
32
-
33
- Scenario from a file:
34
-
35
- ```bash
36
- archal run scenario.md
37
- archal run scenario.md --runs 5 --seed enterprise-repo # N runs → satisfaction score
38
- ```
39
-
40
- Inline task (no scenario file):
41
-
42
- ```bash
43
- archal run --task "Create an issue titled hello" --harness ./.archal/harness.ts --twin github
44
- ```
45
-
46
- `--task` only replaces the scenario file — it still needs a runnable agent path. `--twin` is required with `--task`; repeat or comma-separate for multiple twins.
47
-
48
- When `.archal.json` exists in cwd, bare `archal run` uses it. If the user doesn't have one yet, that's setup — hand off to the `onboard` skill, which owns harness creation and `.archal.json` scaffolding.
49
-
50
- ## Interpret results
51
-
52
- Score breakdown:
53
- - `100%` = every run passed every criterion
54
- - `80%` = 4/5 runs passed
55
- - `0%` = none passed
56
-
57
- Criterion types:
58
- - `[D]` — deterministic state check. A failure is real; never a model variance artifact.
59
- - `[P]` — LLM judge reads trace + final state. A single failure can be variance; re-run with `--runs 3+` to confirm before acting on it.
60
-
61
- ## Diagnose failures
62
-
63
- Re-run with `-v` for the full trace, then classify with these signals:
64
-
65
- - **Agent bug** — wrong tool called, wrong arguments, stopped early.
66
- *Signals:* trace shows the correct tool was available but the agent chose another; or arguments are malformed.
67
- *Fix:* agent prompt, tool wiring, or underlying model.
68
-
69
- - **Scenario bug** — criteria are too strict, ambiguous, or contradict the Setup.
70
- *Signals:* agent clearly did the right thing but a `[D]` criterion expects an exact count the Setup didn't guarantee; or two criteria contradict each other.
71
- *Fix:* make Setup more specific, or relax the criterion. Use the `scenario` skill.
72
-
73
- - **Seed mismatch** — twin state doesn't match what Setup describes.
74
- *Signals:* agent's first introspection tool call returns unexpected state (e.g. Setup says "4 stale issues" but the seed has 3).
75
- *Fix:* different seed, or adjust Setup to match. `archal seed list <twin>` to browse.
76
-
77
- - **Harness bug** — agent process never started, crashed immediately, or hung.
78
- *Signals:* no tool calls in the trace, stderr shows a boot error, or the run times out at the configured `--timeout`.
79
- *Fix:* smoke-test the harness directly with `ARCHAL_PREFLIGHT=1 ARCHAL_ENGINE_TASK="Reply with OK." npx tsx ./.archal/harness.ts`, then look for UI-only imports, missing provider keys, or interactive auth.
80
-
81
- ## CI mode
82
-
83
- ```bash
84
- archal run scenario.md --runs 3 --pass-threshold 80 -o json -q
85
- ```
86
-
87
- Exit codes: `0` pass, `1` fail or score < threshold, `2` validation error. For GitHub Actions, inject `ARCHAL_TOKEN` as a secret.
88
-
89
- ## Artifacts + dashboard
90
-
91
- - **Local (always written):** `.archal/cache/last-run.json` (summary), `.archal/cache/runs/*.json` (full redacted trace).
92
- - **Hosted:** every run also uploads to https://www.archal.ai/dashboard — useful for sharing a failing trace with a colleague or comparing across agent model versions.
93
-
94
- Don't tell users they need `-o json` to save artifacts locally — that's only for stdout.
95
-
96
- ## Anti-patterns
97
-
98
- - Don't re-document the `archal run` flag list here. `archal run --help` and https://docs.archal.ai/cli/run own that — they'll drift if duplicated.
99
- - Don't guess the agent path. If the user doesn't have `--harness`, a repo-local harness, or `.archal.json`, hand off to `onboard` — it owns setup.
100
- - Don't promote `--proxy` as default. It's for agents that still call real service domains through raw HTTPS clients. Env-var wiring is the primary path; proxy is a fallback.
101
- - Don't classify a single `[P]` failure as an agent bug without re-running. Probabilistic criteria need sample size.
102
- - Don't treat a `[D]` failure as model variance. Deterministic failures are real bugs.
103
-
104
- ## Docs
105
-
106
- - Running with an agent: https://docs.archal.ai/guides/run-with-agent
107
- - Existing repo playbook: https://docs.archal.ai/guides/existing-agent-repo
108
- - Scenario authoring: hand off to the `scenario` skill
109
- - Twin sessions: https://docs.archal.ai/guides/twin-sessions
@@ -1,13 +0,0 @@
1
- {
2
- "twin": "github",
3
- "api": "REST v3",
4
- "version": "0.1.0",
5
- "capabilities": [
6
- { "name": "Stateful CRUD (issues, PRs, branches, files)", "supported": true },
7
- { "name": "Error responses (404, 422, 403)", "supported": true },
8
- { "name": "Pagination", "supported": true },
9
- { "name": "Rate limiting", "supported": true },
10
- { "name": "Webhooks", "supported": false },
11
- { "name": "Branch protection rules", "supported": false }
12
- ]
13
- }
@@ -1,18 +0,0 @@
1
- {
2
- "twin": "linear",
3
- "api": "GraphQL",
4
- "version": "0.1.0",
5
- "capabilities": [
6
- { "name": "Stateful CRUD (issues, projects, cycles, initiatives)", "supported": true },
7
- { "name": "Workflow state transitions", "supported": true },
8
- { "name": "Issue relations", "supported": true },
9
- { "name": "Comments & attachments", "supported": true },
10
- { "name": "Webhooks", "supported": false },
11
- { "name": "OAuth flows", "supported": false }
12
- ],
13
- "thresholds": {
14
- "minReplaySteps": 41,
15
- "minScenarioWindows": 18,
16
- "maxValueDifferences": 0
17
- }
18
- }
@@ -1,20 +0,0 @@
1
- {
2
- "twin": "slack",
3
- "api": "Web API",
4
- "version": "0.1.0",
5
- "capabilities": [
6
- { "name": "Visible MCP tools exactness (8-tool exposed surface)", "supported": true },
7
- { "name": "Supported Web API flows: auth.test, conversations.create/info/list/history/replies, chat.postMessage/delete, reactions.add/remove, users.list/profile.get", "supported": true },
8
- { "name": "State replay and reset exactness for supported flows", "supported": true },
9
- { "name": "Hidden/internal-only Slack tools", "supported": false },
10
- { "name": "files.* and other non-exposed Slack API areas", "supported": false },
11
- { "name": "Real-time events (WebSocket)", "supported": false }
12
- ],
13
- "thresholds": {
14
- "minReplaySteps": 29,
15
- "minWorkflowScenarios": 2,
16
- "minWorkflowSteps": 7,
17
- "minScenarioWindows": 6,
18
- "maxValueDifferences": 0
19
- }
20
- }
@@ -1,22 +0,0 @@
1
- {
2
- "twin": "stripe",
3
- "api": "Stripe API v1 (via MCP agent-toolkit)",
4
- "version": "0.1.0",
5
- "capabilities": [
6
- { "name": "Stateful CRUD (customers, products, prices)", "supported": true },
7
- { "name": "Payment lifecycle (intents, confirm, capture, cancel)", "supported": true },
8
- { "name": "Refunds (full, partial, by charge or payment_intent)", "supported": true },
9
- { "name": "Invoice lifecycle (draft, finalize, pay, void)", "supported": true },
10
- { "name": "Subscriptions (create, update, cancel, trial periods)", "supported": true },
11
- { "name": "Coupons (percent-off, amount-off, duration types)", "supported": true },
12
- { "name": "Payment Links (create, list)", "supported": true },
13
- { "name": "Balance and balance transactions", "supported": true },
14
- { "name": "Disputes (list, update evidence, submit)", "supported": true },
15
- { "name": "Error responses (400, 404, 402, 429)", "supported": true },
16
- { "name": "Rate limiting simulation", "supported": true },
17
- { "name": "Stripe Connect (multi-account)", "supported": false },
18
- { "name": "Webhooks", "supported": false },
19
- { "name": "3D Secure / SCA flows", "supported": false },
20
- { "name": "Tax calculation", "supported": false }
21
- ]
22
- }
@@ -1,13 +0,0 @@
1
- {
2
- "twin": "supabase",
3
- "api": "Management API + SQL",
4
- "version": "0.1.0",
5
- "capabilities": [
6
- { "name": "SQL execution", "supported": true },
7
- { "name": "Schema introspection", "supported": true },
8
- { "name": "Migrations", "supported": true },
9
- { "name": "Edge functions (metadata)", "supported": true },
10
- { "name": "Realtime subscriptions", "supported": false },
11
- { "name": "Storage (file uploads)", "supported": false }
12
- ]
13
- }