@slowdini/slow-powers-opencode 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/README.md +3 -3
  2. package/bootstrap.md +19 -20
  3. package/package.json +1 -1
  4. package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +8 -0
  5. package/skills/auditing-slow-powers-usage/evals/evals.json +2 -2
  6. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
  7. package/skills/evaluating-skills/SKILL.md +6 -4
  8. package/skills/evaluating-skills/evals/evals.json +1 -1
  9. package/skills/evaluating-skills/harness-details/claude.md +24 -1
  10. package/skills/evaluating-skills/runner/README.md +16 -2
  11. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +56 -0
  12. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +43 -0
  13. package/skills/evaluating-skills/runner/aggregate.test.ts +76 -0
  14. package/skills/evaluating-skills/runner/aggregate.ts +20 -0
  15. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +228 -0
  16. package/skills/evaluating-skills/runner/plugin-shadow.ts +201 -0
  17. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +11 -0
  18. package/skills/evaluating-skills/runner/run.test.ts +488 -24
  19. package/skills/evaluating-skills/runner/run.ts +281 -66
  20. package/skills/evaluating-skills/runner/types.ts +8 -0
  21. package/skills/evaluating-skills/templates/eval-task-prompt.md +3 -7
  22. package/skills/finishing-a-development-branch/SKILL.md +1 -1
  23. package/skills/hardening-plans/evals/baseline/NOTES.md +7 -0
  24. package/skills/hardening-plans/evals/evals.json +0 -19
  25. package/skills/systematic-debugging/condition-based-waiting.md +10 -11
  26. package/skills/systematic-debugging/root-cause-tracing.md +31 -33
  27. package/skills/working-in-isolation/SKILL.md +58 -0
  28. package/skills/working-in-isolation/evals/baseline/BASELINE.md +22 -0
  29. package/skills/working-in-isolation/evals/baseline/NOTES.md +67 -0
  30. package/skills/working-in-isolation/evals/baseline/benchmark.json +51 -0
  31. package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__with_skill.json +46 -0
  32. package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__without_skill.json +31 -0
  33. package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__with_skill.json +39 -0
  34. package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__without_skill.json +24 -0
  35. package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__with_skill.json +32 -0
  36. package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__without_skill.json +17 -0
  37. package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__with_skill.json +39 -0
  38. package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__without_skill.json +24 -0
  39. package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__with_skill.json +32 -0
  40. package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__without_skill.json +17 -0
  41. package/skills/working-in-isolation/evals/evals.json +87 -0
  42. package/skills/writing-skills/SKILL.md +179 -195
  43. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +0 -24
  44. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +0 -24
  45. package/skills/using-git-worktrees/SKILL.md +0 -70
  46. package/skills/using-git-worktrees/evals/evals.json +0 -40
  47. package/skills/writing-skills/graphviz-conventions.dot +0 -172
  48. package/skills/writing-skills/scripts/render-graphs.js +0 -181
@@ -8,19 +8,18 @@ Bugs often manifest deep in the call stack (git init in wrong directory, file cr
8
8
 
9
9
  ## When to Use
10
10
 
11
- ```dot
12
- digraph when_to_use {
13
- "Bug appears deep in stack?" [shape=diamond];
14
- "Can trace backwards?" [shape=diamond];
15
- "Fix at symptom point" [shape=box];
16
- "Trace to original trigger" [shape=box];
17
- "BETTER: Also add defense-in-depth" [shape=box];
18
-
19
- "Bug appears deep in stack?" -> "Can trace backwards?" [label="yes"];
20
- "Can trace backwards?" -> "Trace to original trigger" [label="yes"];
21
- "Can trace backwards?" -> "Fix at symptom point" [label="no - dead end"];
22
- "Trace to original trigger" -> "BETTER: Also add defense-in-depth";
23
- }
11
+ ```mermaid
12
+ flowchart TD
13
+ deep{Bug appears deep in stack?}
14
+ trace{Can trace backwards?}
15
+ symptom[Fix at symptom point]
16
+ origin[Trace to original trigger]
17
+ defense["BETTER: Also add defense-in-depth"]
18
+
19
+ deep -->|yes| trace
20
+ trace -->|yes| origin
21
+ trace -->|no - dead end| symptom
22
+ origin --> defense
24
23
  ```
25
24
 
26
25
  **Use when:**
@@ -129,26 +128,25 @@ Runs tests one-by-one, stops at first polluter. See script for usage.
129
128
 
130
129
  ## Key Principle
131
130
 
132
- ```dot
133
- digraph principle {
134
- "Found immediate cause" [shape=ellipse];
135
- "Can trace one level up?" [shape=diamond];
136
- "Trace backwards" [shape=box];
137
- "Is this the source?" [shape=diamond];
138
- "Fix at source" [shape=box];
139
- "Add validation at each layer" [shape=box];
140
- "Bug impossible" [shape=doublecircle];
141
- "NEVER fix just the symptom" [shape=octagon, style=filled, fillcolor=red, fontcolor=white];
142
-
143
- "Found immediate cause" -> "Can trace one level up?";
144
- "Can trace one level up?" -> "Trace backwards" [label="yes"];
145
- "Can trace one level up?" -> "NEVER fix just the symptom" [label="no"];
146
- "Trace backwards" -> "Is this the source?";
147
- "Is this the source?" -> "Trace backwards" [label="no - keeps going"];
148
- "Is this the source?" -> "Fix at source" [label="yes"];
149
- "Fix at source" -> "Add validation at each layer";
150
- "Add validation at each layer" -> "Bug impossible";
151
- }
131
+ ```mermaid
132
+ flowchart TD
133
+ found(Found immediate cause)
134
+ canTrace{Can trace one level up?}
135
+ back[Trace backwards]
136
+ isSource{Is this the source?}
137
+ fix[Fix at source]
138
+ validate[Add validation at each layer]
139
+ impossible([Bug impossible])
140
+ never{{NEVER fix just the symptom}}
141
+
142
+ found --> canTrace
143
+ canTrace -->|yes| back
144
+ canTrace -->|no| never
145
+ back --> isSource
146
+ isSource -->|no - keeps going| back
147
+ isSource -->|yes| fix
148
+ fix --> validate
149
+ validate --> impossible
152
150
  ```
153
151
 
154
152
  **NEVER fix just where the error appears.** Trace back to find the original trigger.
@@ -0,0 +1,58 @@
1
+ ---
2
+ name: working-in-isolation
3
+ description: Use when you're about to start code changes — a feature, bugfix, or refactor — to establish an isolated workspace so your work doesn't collide with existing or in-progress work.
4
+ ---
5
+
6
+ # Working in Isolation
7
+
8
+ Before changing code, make sure your work lands somewhere it won't collide with
9
+ existing or in-progress work. Decide the workspace based on the git state.
10
+ When in doubt, pause and ask the user.
11
+
12
+ ## Decision: where does this work go?
13
+
14
+ Check the current state, then take the **first** matching rule:
15
+
16
+ ```bash
17
+ git branch --show-current # current branch
18
+ git status --porcelain # empty = clean tree
19
+ git worktree list # >1 entry = worktrees already exist
20
+ ```
21
+
22
+ 1. **The user named a workspace** (explicit command, or a configured preference)
23
+ → follow it.
24
+ 2. **Dirty tree (staged or unstaged changes) OR worktrees already exist**
25
+ → a human or another agent is mid-work here. Use a **new worktree** so your
26
+ changes can't collide with theirs.
27
+ 3. **On `dev` / `main` / `master`** → sync with origin and **check out a new
28
+ branch**. Keeps the base clean and makes the work easy to review.
29
+ 4. **On any other branch** → **work in place.** The user already isolated this
30
+ workspace; adding a worktree is needless ceremony.
31
+
32
+ > **Hard rule: never make changes while on `dev` / `main` / `master`.** If you
33
+ > find yourself on a base branch, branch (rule 3) or worktree (rule 2) first.
34
+
35
+ ## Creating a worktree (rule 2)
36
+
37
+ Prefer the agent platform's **native isolation tool** if it has one. Otherwise
38
+ fall back to a git worktree:
39
+
40
+ ```bash
41
+ git worktree add .worktrees/<branch-name> -b <branch-name>
42
+ cd .worktrees/<branch-name>
43
+ ```
44
+
45
+ Keep the worktree out of version control: if `.worktrees/` isn't already
46
+ git-ignored, add it to `.gitignore` and commit that first. If worktree creation
47
+ fails (sandbox or permission limits), say so and fall back to checking out a
48
+ branch in place (rule 3).
49
+
50
+ ## After the workspace is set
51
+
52
+ Install dependencies and run the existing test suite once, to confirm a clean
53
+ baseline before you write anything.
54
+
55
+ Use the project-appropriate commands (lint, test, build) to verify that the baseline is clean.
56
+
57
+ If the baseline is already failing, report it before starting — you need to know
58
+ which failures were pre-existing so you can tell them apart from any you introduce.
@@ -0,0 +1,22 @@
1
+ # Baseline — working-in-isolation
2
+
3
+ Committed reference output from a canonical eval run. Regenerate with
4
+ `bun run evals:promote-baseline -- --skill working-in-isolation --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
5
+ dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
6
+
7
+ | Field | Value |
8
+ |-------|-------|
9
+ | Mode | new-skill |
10
+ | Iteration | iteration-3 |
11
+ | Harness | claude-code |
12
+ | Agent model | claude-sonnet-4-6 |
13
+ | Judge model | claude-sonnet-4-6 |
14
+ | Conditions | with_skill, without_skill |
15
+ | Run timestamp | 2026-06-03T07:33:13.084Z |
16
+ | Label | (none) |
17
+ | Promoted from commit | e428b0e |
18
+
19
+ Files:
20
+ - `benchmark.json` — aggregate pass-rate / duration / token deltas.
21
+ - `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.
22
+
@@ -0,0 +1,67 @@
1
+ # Baseline notes — working-in-isolation
2
+
3
+ Forward-looking observations from the canonical run (`new-skill`, iteration-3,
4
+ `claude-sonnet-4-6` agent + judge). Provenance is in `BASELINE.md`; headline
5
+ numbers are in `benchmark.json`. This file is the "what a future iterator should
6
+ know" companion.
7
+
8
+ ## Headline
9
+
10
+ `with_skill` 0.80 vs `without_skill` 0.20 → **+0.60 pass-rate delta**, skill
11
+ invocation **100% (5/5)**, **0 validity warnings**. Cost: +8.2s, +1.2k tokens.
12
+
13
+ ## Which cases discriminated
14
+
15
+ | Case | with | without | Notes |
16
+ |------|------|---------|-------|
17
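+ | `base-branch-checkout` | 100% | 0% | The most important check (never edit on `main`). +100%. Caveat: the `with` arm's `ran_branch_command` transcript check passed by matching the read-only `git branch --show-current`, not a branch-creating command — tighten that regex before treating this case as clean. |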
18
+ | `dirty-tree-worktree` | 100% | 0% | +100% **this run**. The `without` arm did *not* isolate here — see variance note. |
19
+ | `seeded-on-main-momentum` | 100% | 0% | +100%. Both seeded assertions passed (stops editing on `main` AND names the base-branch hard rule). |
20
+ | `feature-branch-in-place` | 100% | 100% | Non-discriminating — the "work in place" case is easy enough that baseline gets it too. Candidate for a harder variant. |
21
+ | `typo-no-worktree` | 0% | 0% | Non-discriminating + environment-confounded — see below. |
22
+
23
+ ## Caveats a re-runner must know
24
+
25
+ - **`typo-no-worktree` is confounded by the real repo's branch state.** The
26
+ prompt says "On my working branch `docs-cleanup`", but the eval runs in the
27
+ actual slow-powers repo, which has no `docs-cleanup` branch and is on a
28
+ different branch. Agents that introspect real git state (both arms) discover
29
+ the branch is missing and propose creating it — graded as "isolation
30
+ ceremony" → both FAIL, delta 0. This is **symmetric** (hurts both arms
31
+ equally), so it doesn't bias the delta, but it means the case currently
32
+ measures nothing. To make it discriminating, either (a) state the full git
33
+ context in the prompt the way `base-branch-checkout` does ("you are on
34
+ `docs-cleanup`, clean tree"), or (b) give each subagent an isolated throwaway
35
+ repo whose real state matches the prompt.
36
+
37
+ - **Iteration-2 vs iteration-3 — why the delta jumped (+0.30 → +0.60).**
38
+ Iteration-2 dispatched all 10 subagents *in parallel against this one shared,
39
+ dirty repo*. Per the skill's own Rule 2 ("dirty tree **or** worktrees already
40
+ exist → worktree"), agents that ran real `git status`/`git worktree list` saw
41
+ (a) the repo's then-uncommitted #156 changes and (b) worktrees other parallel
42
+ siblings had just created, and so isolated when the case wanted work-in-place
43
+ — contaminating `typo` and depressing the measured delta. Iteration-3 fixed
44
+ this by **committing the tree clean first** and **dispatching sequentially
45
+ with `.worktrees/` cleanup between each dispatch**, so no agent sees another's
46
+ git state. Lesson for any git-state-dependent skill: do **not** run its eval
47
+ subagents concurrently in one shared repo.
48
+
49
+ - **The write guard does not block worktree creation.** `runner/sandbox-policy.ts`
50
+ `BASH_MUTATION_PATTERNS` matches `git (commit|add|push|checkout|reset|restore|merge|rebase)`
51
+ — **not** `git worktree`. So `--guard` lets subagents `git worktree add` real
52
+ worktrees into the repo; `detect-stray-writes` only flags them post-hoc. We
53
+ cleaned them by hand both runs. Conveniently this also means the orchestrator's
54
+ own `git worktree remove` between-dispatch cleanup is allowed under the armed
55
+ guard. If we want the guard to actually sandbox this skill's behavior, add
56
+ `worktree` to the mutation pattern (track as an eval-harness parity item).
57
+
58
+ ## Variance / next-iteration ideas
59
+
60
+ - `without_skill` on `dirty-tree-worktree` is **unstable**: in iteration-2 it
61
+ isolated (PASS); in iteration-3 it didn't (FAIL). The explicit "don't disturb my
62
+ in-progress changes" phrasing sometimes elicits isolation even with no skill.
63
+ Add runs (n>1 per condition) before trusting that case's delta.
64
+ - `feature-branch-in-place` passes in both arms — replace or harden it (e.g.
65
+ add a competing attractor) so it earns its slot.
66
+ - Consider a second seeded case where the cleaner correction is a **worktree**
67
+ rather than `switch -c`, to cover the other branch of the hard rule.
@@ -0,0 +1,51 @@
1
+ {
2
+ "generated": "2026-06-03T07:50:45.496Z",
3
+ "mode": "new-skill",
4
+ "conditions_compared": ["with_skill", "without_skill"],
5
+ "missing_gradings": 0,
6
+ "validity_warnings": [],
7
+ "run_summary": {
8
+ "with_skill": {
9
+ "pass_rate": {
10
+ "mean": 0.8,
11
+ "stddev": 0.4,
12
+ "n": 5
13
+ },
14
+ "duration_ms": {
15
+ "mean": 47222,
16
+ "stddev": 13874,
17
+ "n": 5
18
+ },
19
+ "total_tokens": {
20
+ "mean": 16696,
21
+ "stddev": 917,
22
+ "n": 5
23
+ },
24
+ "skill_invocation_n": 5,
25
+ "skill_invocation_rate": 1
26
+ },
27
+ "without_skill": {
28
+ "pass_rate": {
29
+ "mean": 0.2,
30
+ "stddev": 0.4,
31
+ "n": 5
32
+ },
33
+ "duration_ms": {
34
+ "mean": 39003,
35
+ "stddev": 12238,
36
+ "n": 5
37
+ },
38
+ "total_tokens": {
39
+ "mean": 15475,
40
+ "stddev": 1473,
41
+ "n": 5
42
+ }
43
+ }
44
+ },
45
+ "delta": {
46
+ "direction": "with_skill - without_skill",
47
+ "pass_rate": 0.6,
48
+ "duration_ms": 8219,
49
+ "total_tokens": 1221
50
+ }
51
+ }
@@ -0,0 +1,46 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "did_not_edit_on_base_branch",
5
+ "passed": true,
6
+ "evidence": "The agent made no edits to src/http/client.ts or any code file while on main; its final_message explicitly states 'Before touching src/http/client.ts, I need to set up an isolated workspace' and plans to run `git checkout -b feat/http-retry-backoff` before writing any code.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "checked_out_branch",
12
+ "passed": true,
13
+ "evidence": "The agent explicitly proposed 'git checkout -b feat/http-retry-backoff' after syncing with origin via 'git pull origin main', and stated 'Making changes directly on `main` is prohibited' — demonstrating recognition of the base-branch rule and choosing a new branch rather than a worktree or working in place.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ },
17
+ {
18
+ "id": "ran_branch_command",
19
+ "passed": true,
20
+ "evidence": "matched ordinal 2: Bash {\"command\":\"git branch --show-current && git status --porcelain && git worktree list\",\"description\":\"Check current branch, tree cleanliness, and worktrees\"}",
21
+ "confidence": 1,
22
+ "grader": "transcript_check"
23
+ }
24
+ ],
25
+ "summary": {
26
+ "passed": 3,
27
+ "failed": 0,
28
+ "total": 3,
29
+ "pass_rate": 1
30
+ },
31
+ "meta_results": [
32
+ {
33
+ "id": "__skill_invoked",
34
+ "passed": true,
35
+ "evidence": "Skill invocation verified from transcript.",
36
+ "confidence": 1,
37
+ "grader": "transcript_check"
38
+ }
39
+ ],
40
+ "meta_summary": {
41
+ "passed": 1,
42
+ "failed": 0,
43
+ "total": 1,
44
+ "skill_invoked": true
45
+ }
46
+ }
@@ -0,0 +1,31 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "did_not_edit_on_base_branch",
5
+ "passed": false,
6
+ "evidence": "The agent never issued a git checkout or branch-creation command (no such tool invocation exists in the record) and its final message claims 'I've added retry-with-backoff to `src/http/client.ts`' — confirming it made (or purported to make) the edit while still on `main`.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "checked_out_branch",
12
+ "passed": false,
13
+ "evidence": "The agent never checked out a new branch. Its tool invocations show only directory checks and a file write; its final_message describes implementing the feature directly with no mention of branch management. It worked in place on `main`.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ },
17
+ {
18
+ "id": "ran_branch_command",
19
+ "passed": false,
20
+ "evidence": "no tool invocation matched /git (checkout -b|switch -c|branch )/ across 5 invocation(s)",
21
+ "confidence": 1,
22
+ "grader": "transcript_check"
23
+ }
24
+ ],
25
+ "summary": {
26
+ "passed": 0,
27
+ "failed": 3,
28
+ "total": 3,
29
+ "pass_rate": 0
30
+ }
31
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "isolated_in_worktree",
5
+ "passed": true,
6
+ "evidence": "Agent ran `git worktree add .worktrees/feature/users-pagination -b feature/users-pagination` (ordinal 4) and the final message confirms: \"Workspace created: `.worktrees/feature/users-pagination` (branch `feature/users-pagination`) ... The worktree is a completely separate working tree — your modified files in the main checkout are untouched.\"",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "created_worktree",
12
+ "passed": true,
13
+ "evidence": "matched ordinal 4: Bash {\"command\":\"git worktree add /Users/maximilianhaarhaus/Projects/slow-powers/.worktrees/feature/users-pagination -b feature/users-pagination 2>&1\",\"description\":\"Create a new worktree for the pagi",
14
+ "confidence": 1,
15
+ "grader": "transcript_check"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,24 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "isolated_in_worktree",
5
+ "passed": false,
6
+ "evidence": "The agent never created or entered a worktree. It checked git status, found no dirty files, and then wrote generic pagination code examples to a markdown file. The final message says 'I haven't modified any files in the working tree' and 'Share the relevant endpoint file if you'd like me to make the edit directly' — confirming no isolation mechanism was used and no actual code changes were made in any workspace.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "created_worktree",
12
+ "passed": false,
13
+ "evidence": "no tool invocation matched /git worktree add/ across 5 invocation(s)",
14
+ "confidence": 1,
15
+ "grader": "transcript_check"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 0,
20
+ "failed": 2,
21
+ "total": 2,
22
+ "pass_rate": 0
23
+ }
24
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "worked_in_place_no_ceremony",
5
+ "passed": true,
6
+ "evidence": "final_message states: 'You're on `feature/payment-retry` with a clean working tree and a single worktree — rule 4 of the working-in-isolation skill applies: work in place. No new branch or worktree is needed; your branch is already the isolated workspace.' Tool invocations confirm no `git worktree add` or `git checkout -b` was issued — the agent inspected the workspace and proceeded directly to writing files on the existing branch.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 1,
13
+ "failed": 0,
14
+ "total": 1,
15
+ "pass_rate": 1
16
+ },
17
+ "meta_results": [
18
+ {
19
+ "id": "__skill_invoked",
20
+ "passed": true,
21
+ "evidence": "Skill invocation verified from transcript.",
22
+ "confidence": 1,
23
+ "grader": "transcript_check"
24
+ }
25
+ ],
26
+ "meta_summary": {
27
+ "passed": 1,
28
+ "failed": 0,
29
+ "total": 1,
30
+ "skill_invoked": true
31
+ }
32
+ }
@@ -0,0 +1,17 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "worked_in_place_no_ceremony",
5
+ "passed": true,
6
+ "evidence": "No worktree creation or branch checkout appears in the tool invocations (ordinals 0–7). The agent read the prompt, invoked the TDD skill, checked the output directory, and wrote files directly. The final message confirms the work landed on the current branch: 'Done. Here’s what was added to `feature/payment-retry`'.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 1,
13
+ "failed": 0,
14
+ "total": 1,
15
+ "pass_rate": 1
16
+ }
17
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "stops_editing_on_main",
5
+ "passed": true,
6
+ "evidence": "The agent explicitly refused to touch any code first: 'Before touching any code, I need to get off `main` — the previous change landed directly on the base branch, which is something I should have caught earlier.' It then issued `git checkout -b feat/api-client-resilience` before presenting any code changes, and the timeout/retry implementation is shown only after the branch creation command.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "names_base_branch_hard_rule",
12
+ "passed": true,
13
+ "evidence": "\"Before touching any code, I need to get off `main` — the previous change landed directly on the base branch, which is something I should have caught earlier. Let me set that right now.\" followed by `git checkout -b feat/api-client-resilience`",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,24 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "stops_editing_on_main",
5
+ "passed": false,
6
+ "evidence": "The final_message confirms the agent continued editing directly on main: 'Done. Here’s what I added to `src/api/client.ts`' with no mention of creating a branch or worktree, treating the user's 'while you’re in there' as license to keep editing in place.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "names_base_branch_hard_rule",
12
+ "passed": false,
13
+ "evidence": "The final_message contains only the timeout and retry implementation ('Done. Here’s what I added to `src/api/client.ts`...No other behaviour changes — all existing call sites work as before.') with no mention of `main` being a base branch, no suggestion to create an isolation branch, and no acknowledgment that working directly on `main` is a problem.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 0,
20
+ "failed": 2,
21
+ "total": 2,
22
+ "pass_rate": 0
23
+ }
24
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_isolation_ceremony",
5
+ "passed": false,
6
+ "evidence": "The agent created a new branch from dev: 'Since `docs-cleanup` didn't exist yet, I created it from `dev` (synced with origin first), then switched to it: git checkout dev / git pull origin dev / git checkout -b docs-cleanup'. This is branch isolation ceremony applied to a one-character typo fix, which the assertion says should FAIL.",
7
+ "confidence": 0.97,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 0,
13
+ "failed": 1,
14
+ "total": 1,
15
+ "pass_rate": 0
16
+ },
17
+ "meta_results": [
18
+ {
19
+ "id": "__skill_invoked",
20
+ "passed": true,
21
+ "evidence": "Skill invocation verified from transcript.",
22
+ "confidence": 1,
23
+ "grader": "transcript_check"
24
+ }
25
+ ],
26
+ "meta_summary": {
27
+ "passed": 1,
28
+ "failed": 0,
29
+ "total": 1,
30
+ "skill_invoked": true
31
+ }
32
+ }
@@ -0,0 +1,17 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_isolation_ceremony",
5
+ "passed": false,
6
+ "evidence": "The agent attempted to create a new branch for the typo fix (tool_invocations ordinal 2: `git checkout -b docs-cleanup`) and the final_message explicitly proposes 'Create the `docs-cleanup` branch off the current working branch' as step 1, treating the one-character edit as requiring branch isolation rather than fixing the typo in place.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 0,
13
+ "failed": 1,
14
+ "total": 1,
15
+ "pass_rate": 0
16
+ }
17
+ }