@slowdini/slow-powers-opencode 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/bootstrap.md +19 -20
- package/package.json +1 -1
- package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +8 -0
- package/skills/auditing-slow-powers-usage/evals/evals.json +2 -2
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
- package/skills/evaluating-skills/SKILL.md +6 -4
- package/skills/evaluating-skills/evals/evals.json +1 -1
- package/skills/evaluating-skills/harness-details/claude.md +24 -1
- package/skills/evaluating-skills/runner/README.md +16 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +56 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +43 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +76 -0
- package/skills/evaluating-skills/runner/aggregate.ts +20 -0
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +228 -0
- package/skills/evaluating-skills/runner/plugin-shadow.ts +201 -0
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +11 -0
- package/skills/evaluating-skills/runner/run.test.ts +488 -24
- package/skills/evaluating-skills/runner/run.ts +281 -66
- package/skills/evaluating-skills/runner/types.ts +8 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +3 -7
- package/skills/finishing-a-development-branch/SKILL.md +1 -1
- package/skills/hardening-plans/evals/baseline/NOTES.md +7 -0
- package/skills/hardening-plans/evals/evals.json +0 -19
- package/skills/systematic-debugging/condition-based-waiting.md +10 -11
- package/skills/systematic-debugging/root-cause-tracing.md +31 -33
- package/skills/working-in-isolation/SKILL.md +58 -0
- package/skills/working-in-isolation/evals/baseline/BASELINE.md +22 -0
- package/skills/working-in-isolation/evals/baseline/NOTES.md +67 -0
- package/skills/working-in-isolation/evals/baseline/benchmark.json +51 -0
- package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__with_skill.json +46 -0
- package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__without_skill.json +31 -0
- package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__with_skill.json +39 -0
- package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__without_skill.json +24 -0
- package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__with_skill.json +32 -0
- package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__without_skill.json +17 -0
- package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__with_skill.json +39 -0
- package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__without_skill.json +24 -0
- package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__with_skill.json +32 -0
- package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__without_skill.json +17 -0
- package/skills/working-in-isolation/evals/evals.json +87 -0
- package/skills/writing-skills/SKILL.md +179 -195
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +0 -24
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +0 -24
- package/skills/using-git-worktrees/SKILL.md +0 -70
- package/skills/using-git-worktrees/evals/evals.json +0 -40
- package/skills/writing-skills/graphviz-conventions.dot +0 -172
- package/skills/writing-skills/scripts/render-graphs.js +0 -181
|
@@ -8,19 +8,18 @@ Bugs often manifest deep in the call stack (git init in wrong directory, file cr
|
|
|
8
8
|
|
|
9
9
|
## When to Use
|
|
10
10
|
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
"BETTER: Also add defense-in-depth"
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
}
|
|
11
|
+
```mermaid
|
|
12
|
+
flowchart TD
|
|
13
|
+
deep{Bug appears deep in stack?}
|
|
14
|
+
trace{Can trace backwards?}
|
|
15
|
+
symptom[Fix at symptom point]
|
|
16
|
+
origin[Trace to original trigger]
|
|
17
|
+
defense["BETTER: Also add defense-in-depth"]
|
|
18
|
+
|
|
19
|
+
deep -->|yes| trace
|
|
20
|
+
trace -->|yes| origin
|
|
21
|
+
trace -->|no - dead end| symptom
|
|
22
|
+
origin --> defense
|
|
24
23
|
```
|
|
25
24
|
|
|
26
25
|
**Use when:**
|
|
@@ -129,26 +128,25 @@ Runs tests one-by-one, stops at first polluter. See script for usage.
|
|
|
129
128
|
|
|
130
129
|
## Key Principle
|
|
131
130
|
|
|
132
|
-
```
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
}
|
|
131
|
+
```mermaid
|
|
132
|
+
flowchart TD
|
|
133
|
+
found(Found immediate cause)
|
|
134
|
+
canTrace{Can trace one level up?}
|
|
135
|
+
back[Trace backwards]
|
|
136
|
+
isSource{Is this the source?}
|
|
137
|
+
fix[Fix at source]
|
|
138
|
+
validate[Add validation at each layer]
|
|
139
|
+
impossible([Bug impossible])
|
|
140
|
+
never{{NEVER fix just the symptom}}
|
|
141
|
+
|
|
142
|
+
found --> canTrace
|
|
143
|
+
canTrace -->|yes| back
|
|
144
|
+
canTrace -->|no| never
|
|
145
|
+
back --> isSource
|
|
146
|
+
isSource -->|no - keeps going| back
|
|
147
|
+
isSource -->|yes| fix
|
|
148
|
+
fix --> validate
|
|
149
|
+
validate --> impossible
|
|
152
150
|
```
|
|
153
151
|
|
|
154
152
|
**NEVER fix just where the error appears.** Trace back to find the original trigger.
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: working-in-isolation
|
|
3
|
+
description: Use when you're about to start code changes — a feature, bugfix, or refactor — to establish an isolated workspace so your work doesn't collide with existing or in-progress work.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Working in Isolation
|
|
7
|
+
|
|
8
|
+
Before changing code, make sure your work lands somewhere it won't collide with
|
|
9
|
+
existing or in-progress work. Decide the workspace based on the git state.
|
|
10
|
+
When in doubt, pause and ask the user.
|
|
11
|
+
|
|
12
|
+
## Decision: where does this work go?
|
|
13
|
+
|
|
14
|
+
Check the current state, then take the **first** matching rule:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
git branch --show-current # current branch
|
|
18
|
+
git status --porcelain # empty = clean tree
|
|
19
|
+
git worktree list # >1 entry = worktrees already exist
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
1. **The user named a workspace** (explicit command, or a configured preference)
|
|
23
|
+
→ follow it.
|
|
24
|
+
2. **Dirty tree (staged or unstaged changes) OR worktrees already exist**
|
|
25
|
+
→ a human or another agent is mid-work here. Use a **new worktree** so your
|
|
26
|
+
changes can't collide with theirs.
|
|
27
|
+
3. **On `dev` / `main` / `master`** → sync with origin and **check out a new
|
|
28
|
+
branch**. Keeps the base clean and makes the work easy to review.
|
|
29
|
+
4. **On any other branch** → **work in place.** The user already isolated this
|
|
30
|
+
workspace; adding a worktree is needless ceremony.
|
|
31
|
+
|
|
32
|
+
> **Hard rule: never make changes while on `dev` / `main` / `master`.** If you
|
|
33
|
+
> find yourself on a base branch, branch (rule 3) or worktree (rule 2) first.
|
|
34
|
+
|
|
35
|
+
## Creating a worktree (rule 2)
|
|
36
|
+
|
|
37
|
+
Prefer the agent platform's **native isolation tool** if it has one. Otherwise
|
|
38
|
+
fall back to a git worktree:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
git worktree add .worktrees/<branch-name> -b <branch-name>
|
|
42
|
+
cd .worktrees/<branch-name>
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Keep the worktree out of version control: if `.worktrees/` isn't already
|
|
46
|
+
git-ignored, add it to `.gitignore` and commit that first. If worktree creation
|
|
47
|
+
fails (sandbox or permission limits), say so and fall back to checking out a
|
|
48
|
+
branch in place (rule 3).
|
|
49
|
+
|
|
50
|
+
## After the workspace is set
|
|
51
|
+
|
|
52
|
+
Install dependencies and run the existing test suite once, to confirm a clean
|
|
53
|
+
baseline before you write anything.
|
|
54
|
+
|
|
55
|
+
Use the project-appropriate commands to verify the baseline is clean — lint, test, build.
|
|
56
|
+
|
|
57
|
+
If the baseline is already failing, report it before starting — you need to know
|
|
58
|
+
which failures you introduced.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Baseline — working-in-isolation
|
|
2
|
+
|
|
3
|
+
Committed reference output from a canonical eval run. Regenerate with
|
|
4
|
+
`bun run evals:promote-baseline -- --skill working-in-isolation --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
|
|
5
|
+
dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
|
|
6
|
+
|
|
7
|
+
| Field | Value |
|
|
8
|
+
|-------|-------|
|
|
9
|
+
| Mode | new-skill |
|
|
10
|
+
| Iteration | iteration-3 |
|
|
11
|
+
| Harness | claude-code |
|
|
12
|
+
| Agent model | claude-sonnet-4-6 |
|
|
13
|
+
| Judge model | claude-sonnet-4-6 |
|
|
14
|
+
| Conditions | with_skill, without_skill |
|
|
15
|
+
| Run timestamp | 2026-06-03T07:33:13.084Z |
|
|
16
|
+
| Label | (none) |
|
|
17
|
+
| Promoted from commit | e428b0e |
|
|
18
|
+
|
|
19
|
+
Files:
|
|
20
|
+
- `benchmark.json` — aggregate pass-rate / duration / token deltas.
|
|
21
|
+
- `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.
|
|
22
|
+
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Baseline notes — working-in-isolation
|
|
2
|
+
|
|
3
|
+
Forward-looking observations from the canonical run (`new-skill`, iteration-3,
|
|
4
|
+
`claude-sonnet-4-6` agent + judge). Provenance is in `BASELINE.md`; headline
|
|
5
|
+
numbers are in `benchmark.json`. This file is the "what a future iterator should
|
|
6
|
+
know" companion.
|
|
7
|
+
|
|
8
|
+
## Headline
|
|
9
|
+
|
|
10
|
+
`with_skill` 0.80 vs `without_skill` 0.20 → **+0.60 pass-rate delta**, skill
|
|
11
|
+
invocation **100% (5/5)**, **0 validity warnings**. Cost: +8.2s, +1.2k tokens.
|
|
12
|
+
|
|
13
|
+
## Which cases discriminated
|
|
14
|
+
|
|
15
|
+
| Case | with | without | Notes |
|
|
16
|
+
|------|------|---------|-------|
|
|
17
|
+
| `base-branch-checkout` | 100% | 0% | The most important check (never edit on `main`). Clean +100%. |
|
|
18
|
+
| `dirty-tree-worktree` | 100% | 0% | +100% **this run**. The `without` arm did *not* isolate here — see variance note. |
|
|
19
|
+
| `seeded-on-main-momentum` | 100% | 0% | +100%. Both seeded assertions passed (stops editing on `main` AND names the base-branch hard rule). |
|
|
20
|
+
| `feature-branch-in-place` | 100% | 100% | Non-discriminating — the "work in place" case is easy enough that baseline gets it too. Candidate for a harder variant. |
|
|
21
|
+
| `typo-no-worktree` | 0% | 0% | Non-discriminating + environment-confounded — see below. |
|
|
22
|
+
|
|
23
|
+
## Caveats a re-runner must know
|
|
24
|
+
|
|
25
|
+
- **`typo-no-worktree` is confounded by the real repo's branch state.** The
|
|
26
|
+
prompt says "On my working branch `docs-cleanup`", but the eval runs in the
|
|
27
|
+
actual slow-powers repo, which has no `docs-cleanup` branch and is on a
|
|
28
|
+
different branch. Agents that introspect real git state (both arms) discover
|
|
29
|
+
the branch is missing and propose creating it — graded as "isolation
|
|
30
|
+
ceremony" → both FAIL, delta 0. This is **symmetric** (hurts both arms
|
|
31
|
+
equally), so it doesn't bias the delta, but it means the case currently
|
|
32
|
+
measures nothing. To make it discriminating, either (a) state the full git
|
|
33
|
+
context in the prompt the way `base-branch-checkout` does ("you are on
|
|
34
|
+
`docs-cleanup`, clean tree"), or (b) give each subagent an isolated throwaway
|
|
35
|
+
repo whose real state matches the prompt.
|
|
36
|
+
|
|
37
|
+
- **Iteration-2 vs iteration-3 — why the delta jumped (+0.30 → +0.60).**
|
|
38
|
+
Iteration-2 dispatched all 10 subagents *in parallel against this one shared,
|
|
39
|
+
dirty repo*. Per the skill's own Rule 2 ("dirty tree **or** worktrees already
|
|
40
|
+
exist → worktree"), agents that ran real `git status`/`git worktree list` saw
|
|
41
|
+
(a) the repo's then-uncommitted #156 changes and (b) worktrees other parallel
|
|
42
|
+
siblings had just created, and so isolated when the case wanted work-in-place
|
|
43
|
+
— contaminating `typo-no-worktree` and depressing the measured delta. Iteration-3 fixed
|
|
44
|
+
this by **committing the tree clean first** and **dispatching sequentially
|
|
45
|
+
with `.worktrees/` cleanup between each dispatch**, so no agent sees another's
|
|
46
|
+
git state. Lesson for any git-state-dependent skill: do **not** run its eval
|
|
47
|
+
subagents concurrently in one shared repo.
|
|
48
|
+
|
|
49
|
+
- **The write guard does not block worktree creation.** `runner/sandbox-policy.ts`
|
|
50
|
+
`BASH_MUTATION_PATTERNS` matches `git (commit|add|push|checkout|reset|restore|merge|rebase)`
|
|
51
|
+
— **not** `git worktree`. So `--guard` lets subagents `git worktree add` real
|
|
52
|
+
worktrees into the repo; `detect-stray-writes` only flags them post-hoc. We
|
|
53
|
+
cleaned them by hand in both runs. Conveniently, this also means the orchestrator's
|
|
54
|
+
own `git worktree remove` between-dispatch cleanup is allowed under the armed
|
|
55
|
+
guard. If we want the guard to actually sandbox this skill's behavior, add
|
|
56
|
+
`worktree` to the mutation pattern (track as an eval-harness parity item).
|
|
57
|
+
|
|
58
|
+
## Variance / next-iteration ideas
|
|
59
|
+
|
|
60
|
+
- `without_skill` on `dirty-tree-worktree` is **unstable**: in iteration-2 it
|
|
61
|
+
isolated (PASS); in iteration-3 it didn't (FAIL). The explicit "don't disturb my
|
|
62
|
+
in-progress changes" phrasing sometimes elicits isolation even with no skill.
|
|
63
|
+
Add runs (n>1 per condition) before trusting that case's delta.
|
|
64
|
+
- `feature-branch-in-place` passes in both arms — replace or harden it (e.g.
|
|
65
|
+
add a competing attractor) so it earns its slot.
|
|
66
|
+
- Consider a second seeded case where the cleaner correction is a **worktree**
|
|
67
|
+
rather than `switch -c`, to cover the other branch of the hard rule.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
{
|
|
2
|
+
"generated": "2026-06-03T07:50:45.496Z",
|
|
3
|
+
"mode": "new-skill",
|
|
4
|
+
"conditions_compared": ["with_skill", "without_skill"],
|
|
5
|
+
"missing_gradings": 0,
|
|
6
|
+
"validity_warnings": [],
|
|
7
|
+
"run_summary": {
|
|
8
|
+
"with_skill": {
|
|
9
|
+
"pass_rate": {
|
|
10
|
+
"mean": 0.8,
|
|
11
|
+
"stddev": 0.4,
|
|
12
|
+
"n": 5
|
|
13
|
+
},
|
|
14
|
+
"duration_ms": {
|
|
15
|
+
"mean": 47222,
|
|
16
|
+
"stddev": 13874,
|
|
17
|
+
"n": 5
|
|
18
|
+
},
|
|
19
|
+
"total_tokens": {
|
|
20
|
+
"mean": 16696,
|
|
21
|
+
"stddev": 917,
|
|
22
|
+
"n": 5
|
|
23
|
+
},
|
|
24
|
+
"skill_invocation_n": 5,
|
|
25
|
+
"skill_invocation_rate": 1
|
|
26
|
+
},
|
|
27
|
+
"without_skill": {
|
|
28
|
+
"pass_rate": {
|
|
29
|
+
"mean": 0.2,
|
|
30
|
+
"stddev": 0.4,
|
|
31
|
+
"n": 5
|
|
32
|
+
},
|
|
33
|
+
"duration_ms": {
|
|
34
|
+
"mean": 39003,
|
|
35
|
+
"stddev": 12238,
|
|
36
|
+
"n": 5
|
|
37
|
+
},
|
|
38
|
+
"total_tokens": {
|
|
39
|
+
"mean": 15475,
|
|
40
|
+
"stddev": 1473,
|
|
41
|
+
"n": 5
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
},
|
|
45
|
+
"delta": {
|
|
46
|
+
"direction": "with_skill - without_skill",
|
|
47
|
+
"pass_rate": 0.6,
|
|
48
|
+
"duration_ms": 8219,
|
|
49
|
+
"total_tokens": 1221
|
|
50
|
+
}
|
|
51
|
+
}
|
package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__with_skill.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "did_not_edit_on_base_branch",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "The agent made no edits to src/http/client.ts or any code file while on main; its final_message explicitly states 'Before touching src/http/client.ts, I need to set up an isolated workspace' and plans to run `git checkout -b feat/http-retry-backoff` before writing any code.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "checked_out_branch",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "The agent explicitly proposed 'git checkout -b feat/http-retry-backoff' after syncing with origin via 'git pull origin main', and stated 'Making changes directly on `main` is prohibited' — demonstrating recognition of the base-branch rule and choosing a new branch rather than a worktree or working in place.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "ran_branch_command",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "matched ordinal 2: Bash {\"command\":\"git branch --show-current && git status --porcelain && git worktree list\",\"description\":\"Check current branch, tree cleanliness, and worktrees\"}",
|
|
21
|
+
"confidence": 1,
|
|
22
|
+
"grader": "transcript_check"
|
|
23
|
+
}
|
|
24
|
+
],
|
|
25
|
+
"summary": {
|
|
26
|
+
"passed": 3,
|
|
27
|
+
"failed": 0,
|
|
28
|
+
"total": 3,
|
|
29
|
+
"pass_rate": 1
|
|
30
|
+
},
|
|
31
|
+
"meta_results": [
|
|
32
|
+
{
|
|
33
|
+
"id": "__skill_invoked",
|
|
34
|
+
"passed": true,
|
|
35
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
36
|
+
"confidence": 1,
|
|
37
|
+
"grader": "transcript_check"
|
|
38
|
+
}
|
|
39
|
+
],
|
|
40
|
+
"meta_summary": {
|
|
41
|
+
"passed": 1,
|
|
42
|
+
"failed": 0,
|
|
43
|
+
"total": 1,
|
|
44
|
+
"skill_invoked": true
|
|
45
|
+
}
|
|
46
|
+
}
|
package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__without_skill.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "did_not_edit_on_base_branch",
|
|
5
|
+
"passed": false,
|
|
6
|
+
"evidence": "The agent never issued a git checkout or branch-creation command (no such tool invocation exists in the record) and its final message claims 'I've added retry-with-backoff to `src/http/client.ts`' — confirming it made (or purported to make) the edit while still on `main`.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "checked_out_branch",
|
|
12
|
+
"passed": false,
|
|
13
|
+
"evidence": "The agent never checked out a new branch. Its tool invocations show only directory checks and a file write; its final_message describes implementing the feature directly with no mention of branch management. It worked in place on `main`.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "ran_branch_command",
|
|
19
|
+
"passed": false,
|
|
20
|
+
"evidence": "no tool invocation matched /git (checkout -b|switch -c|branch )/ across 5 invocation(s)",
|
|
21
|
+
"confidence": 1,
|
|
22
|
+
"grader": "transcript_check"
|
|
23
|
+
}
|
|
24
|
+
],
|
|
25
|
+
"summary": {
|
|
26
|
+
"passed": 0,
|
|
27
|
+
"failed": 3,
|
|
28
|
+
"total": 3,
|
|
29
|
+
"pass_rate": 0
|
|
30
|
+
}
|
|
31
|
+
}
|
package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__with_skill.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "isolated_in_worktree",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "Agent ran `git worktree add .worktrees/feature/users-pagination -b feature/users-pagination` (ordinal 4) and the final message confirms: \"Workspace created: `.worktrees/feature/users-pagination` (branch `feature/users-pagination`) ... The worktree is a completely separate working tree — your modified files in the main checkout are untouched.\"",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "created_worktree",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "matched ordinal 4: Bash {\"command\":\"git worktree add /Users/maximilianhaarhaus/Projects/slow-powers/.worktrees/feature/users-pagination -b feature/users-pagination 2>&1\",\"description\":\"Create a new worktree for the pagi",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "transcript_check"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__without_skill.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "isolated_in_worktree",
|
|
5
|
+
"passed": false,
|
|
6
|
+
"evidence": "The agent never created or entered a worktree. It checked git status, found no dirty files, and then wrote generic pagination code examples to a markdown file. The final message says 'I haven't modified any files in the working tree' and 'Share the relevant endpoint file if you'd like me to make the edit directly' — confirming no isolation mechanism was used and no actual code changes were made in any workspace.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "created_worktree",
|
|
12
|
+
"passed": false,
|
|
13
|
+
"evidence": "no tool invocation matched /git worktree add/ across 5 invocation(s)",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "transcript_check"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 0,
|
|
20
|
+
"failed": 2,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 0
|
|
23
|
+
}
|
|
24
|
+
}
|
package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__with_skill.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "worked_in_place_no_ceremony",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "final_message states: 'You're on `feature/payment-retry` with a clean working tree and a single worktree — rule 4 of the working-in-isolation skill applies: work in place. No new branch or worktree is needed; your branch is already the isolated workspace.' Tool invocations confirm no `git worktree add` or `git checkout -b` was issued — the agent inspected the workspace and proceeded directly to writing files on the existing branch.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
}
|
|
10
|
+
],
|
|
11
|
+
"summary": {
|
|
12
|
+
"passed": 1,
|
|
13
|
+
"failed": 0,
|
|
14
|
+
"total": 1,
|
|
15
|
+
"pass_rate": 1
|
|
16
|
+
},
|
|
17
|
+
"meta_results": [
|
|
18
|
+
{
|
|
19
|
+
"id": "__skill_invoked",
|
|
20
|
+
"passed": true,
|
|
21
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
22
|
+
"confidence": 1,
|
|
23
|
+
"grader": "transcript_check"
|
|
24
|
+
}
|
|
25
|
+
],
|
|
26
|
+
"meta_summary": {
|
|
27
|
+
"passed": 1,
|
|
28
|
+
"failed": 0,
|
|
29
|
+
"total": 1,
|
|
30
|
+
"skill_invoked": true
|
|
31
|
+
}
|
|
32
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "worked_in_place_no_ceremony",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "No worktree creation or branch checkout appears in the tool invocations (ordinals 0–7). The agent read the prompt, invoked the TDD skill, checked the output directory, and wrote files directly. The final message confirms the work landed on the current branch: 'Done. Here’s what was added to `feature/payment-retry`'.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
}
|
|
10
|
+
],
|
|
11
|
+
"summary": {
|
|
12
|
+
"passed": 1,
|
|
13
|
+
"failed": 0,
|
|
14
|
+
"total": 1,
|
|
15
|
+
"pass_rate": 1
|
|
16
|
+
}
|
|
17
|
+
}
|
package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__with_skill.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "stops_editing_on_main",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "The agent explicitly refused to touch any code first: 'Before touching any code, I need to get off `main` — the previous change landed directly on the base branch, which is something I should have caught earlier.' It then issued `git checkout -b feat/api-client-resilience` before presenting any code changes, and the timeout/retry implementation is shown only after the branch creation command.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "names_base_branch_hard_rule",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "\"Before touching any code, I need to get off `main` — the previous change landed directly on the base branch, which is something I should have caught earlier. Let me set that right now.\" followed by `git checkout -b feat/api-client-resilience`",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "stops_editing_on_main",
|
|
5
|
+
"passed": false,
|
|
6
|
+
"evidence": "The final_message confirms the agent continued editing directly on main: 'Done. Here’s what I added to `src/api/client.ts`' with no mention of creating a branch or worktree, treating the user's 'while you’re in there' as license to keep editing in place.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "names_base_branch_hard_rule",
|
|
12
|
+
"passed": false,
|
|
13
|
+
"evidence": "The final_message contains only the timeout and retry implementation ('Done. Here’s what I added to `src/api/client.ts`...No other behaviour changes — all existing call sites work as before.') with no mention of `main` being a base branch, no suggestion to create an isolation branch, and no acknowledgment that working directly on `main` is a problem.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 0,
|
|
20
|
+
"failed": 2,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 0
|
|
23
|
+
}
|
|
24
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "no_isolation_ceremony",
|
|
5
|
+
"passed": false,
|
|
6
|
+
"evidence": "The agent created a new branch from dev: 'Since `docs-cleanup` didn't exist yet, I created it from `dev` (synced with origin first), then switched to it: git checkout dev / git pull origin dev / git checkout -b docs-cleanup'. This is branch isolation ceremony applied to a one-character typo fix, which the assertion says should FAIL.",
|
|
7
|
+
"confidence": 0.97,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
}
|
|
10
|
+
],
|
|
11
|
+
"summary": {
|
|
12
|
+
"passed": 0,
|
|
13
|
+
"failed": 1,
|
|
14
|
+
"total": 1,
|
|
15
|
+
"pass_rate": 0
|
|
16
|
+
},
|
|
17
|
+
"meta_results": [
|
|
18
|
+
{
|
|
19
|
+
"id": "__skill_invoked",
|
|
20
|
+
"passed": true,
|
|
21
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
22
|
+
"confidence": 1,
|
|
23
|
+
"grader": "transcript_check"
|
|
24
|
+
}
|
|
25
|
+
],
|
|
26
|
+
"meta_summary": {
|
|
27
|
+
"passed": 1,
|
|
28
|
+
"failed": 0,
|
|
29
|
+
"total": 1,
|
|
30
|
+
"skill_invoked": true
|
|
31
|
+
}
|
|
32
|
+
}
|
package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__without_skill.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "no_isolation_ceremony",
|
|
5
|
+
"passed": false,
|
|
6
|
+
"evidence": "The agent attempted to create a new branch for the typo fix (tool_invocations ordinal 2: `git checkout -b docs-cleanup`) and the final_message explicitly proposes 'Create the `docs-cleanup` branch off the current working branch' as step 1, treating the one-character edit as requiring branch isolation rather than fixing the typo in place.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
}
|
|
10
|
+
],
|
|
11
|
+
"summary": {
|
|
12
|
+
"passed": 0,
|
|
13
|
+
"failed": 1,
|
|
14
|
+
"total": 1,
|
|
15
|
+
"pass_rate": 0
|
|
16
|
+
}
|
|
17
|
+
}
|