@metaharness/darwin 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +221 -0
- package/SECURITY.md +200 -0
- package/dist/archive.d.ts +89 -0
- package/dist/archive.d.ts.map +1 -0
- package/dist/archive.js +220 -0
- package/dist/archive.js.map +1 -0
- package/dist/bench/gates.d.ts +19 -0
- package/dist/bench/gates.d.ts.map +1 -0
- package/dist/bench/gates.js +82 -0
- package/dist/bench/gates.js.map +1 -0
- package/dist/bench/index.d.ts +11 -0
- package/dist/bench/index.d.ts.map +1 -0
- package/dist/bench/index.js +25 -0
- package/dist/bench/index.js.map +1 -0
- package/dist/bench/lineage.d.ts +60 -0
- package/dist/bench/lineage.d.ts.map +1 -0
- package/dist/bench/lineage.js +166 -0
- package/dist/bench/lineage.js.map +1 -0
- package/dist/bench/metrics.d.ts +32 -0
- package/dist/bench/metrics.d.ts.map +1 -0
- package/dist/bench/metrics.js +52 -0
- package/dist/bench/metrics.js.map +1 -0
- package/dist/bench/promotion.d.ts +21 -0
- package/dist/bench/promotion.d.ts.map +1 -0
- package/dist/bench/promotion.js +109 -0
- package/dist/bench/promotion.js.map +1 -0
- package/dist/bench/risk.d.ts +45 -0
- package/dist/bench/risk.d.ts.map +1 -0
- package/dist/bench/risk.js +71 -0
- package/dist/bench/risk.js.map +1 -0
- package/dist/bench/runner.d.ts +53 -0
- package/dist/bench/runner.d.ts.map +1 -0
- package/dist/bench/runner.js +131 -0
- package/dist/bench/runner.js.map +1 -0
- package/dist/bench/score.d.ts +16 -0
- package/dist/bench/score.d.ts.map +1 -0
- package/dist/bench/score.js +83 -0
- package/dist/bench/score.js.map +1 -0
- package/dist/bench/stats.d.ts +26 -0
- package/dist/bench/stats.d.ts.map +1 -0
- package/dist/bench/stats.js +74 -0
- package/dist/bench/stats.js.map +1 -0
- package/dist/bench/suite.d.ts +16 -0
- package/dist/bench/suite.d.ts.map +1 -0
- package/dist/bench/suite.js +59 -0
- package/dist/bench/suite.js.map +1 -0
- package/dist/bench/types.d.ts +135 -0
- package/dist/bench/types.d.ts.map +1 -0
- package/dist/bench/types.js +16 -0
- package/dist/bench/types.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +125 -0
- package/dist/cli.js.map +1 -0
- package/dist/evolve.d.ts +11 -0
- package/dist/evolve.d.ts.map +1 -0
- package/dist/evolve.js +129 -0
- package/dist/evolve.js.map +1 -0
- package/dist/generator.d.ts +9 -0
- package/dist/generator.d.ts.map +1 -0
- package/dist/generator.js +46 -0
- package/dist/generator.js.map +1 -0
- package/dist/index.d.ts +12 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +37 -0
- package/dist/index.js.map +1 -0
- package/dist/mutator.d.ts +61 -0
- package/dist/mutator.d.ts.map +1 -0
- package/dist/mutator.js +193 -0
- package/dist/mutator.js.map +1 -0
- package/dist/openrouter-mutator.d.ts +32 -0
- package/dist/openrouter-mutator.d.ts.map +1 -0
- package/dist/openrouter-mutator.js +81 -0
- package/dist/openrouter-mutator.js.map +1 -0
- package/dist/repo_profiler.d.ts +8 -0
- package/dist/repo_profiler.d.ts.map +1 -0
- package/dist/repo_profiler.js +127 -0
- package/dist/repo_profiler.js.map +1 -0
- package/dist/safety.d.ts +45 -0
- package/dist/safety.d.ts.map +1 -0
- package/dist/safety.js +191 -0
- package/dist/safety.js.map +1 -0
- package/dist/sandbox.d.ts +24 -0
- package/dist/sandbox.d.ts.map +1 -0
- package/dist/sandbox.js +153 -0
- package/dist/sandbox.js.map +1 -0
- package/dist/scorer.d.ts +26 -0
- package/dist/scorer.d.ts.map +1 -0
- package/dist/scorer.js +168 -0
- package/dist/scorer.js.map +1 -0
- package/dist/templates.d.ts +37 -0
- package/dist/templates.d.ts.map +1 -0
- package/dist/templates.js +309 -0
- package/dist/templates.js.map +1 -0
- package/dist/types.d.ts +123 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +13 -0
- package/dist/types.js.map +1 -0
- package/package.json +57 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 RuvNet (https://ruv.io)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# @metaharness/darwin
|
|
2
|
+
|
|
3
|
+
> Darwin Mode — **the model is frozen; the harness evolves.**
|
|
4
|
+
|
|
5
|
+
Bounded, empirical, population-based self-improvement of an agent harness
|
|
6
|
+
(ADR-070…075). "Self-improving agents" is widely misread as "the model trains
|
|
7
|
+
itself." Darwin Mode ships the practical version: an agent **modifies its own
|
|
8
|
+
harness**, runs benchmarks in a sandbox, keeps the variants that *measurably*
|
|
9
|
+
improve, and builds an **archive of successful descendants**. The foundation
|
|
10
|
+
model never changes — what evolves is the operating system around it (planner,
|
|
11
|
+
context builder, reviewer, retry/tool/memory/score policy). This follows the
|
|
12
|
+
**Darwin Gödel Machine** lineage: iteratively mutate the source of a coding
|
|
13
|
+
agent, then *empirically validate* each variant — no weight updates, just a
|
|
14
|
+
population, a benchmark, and an archive.
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
repo
|
|
18
|
+
→ profile RepoProfile (pkg mgr, test cmd, source/risk files)
|
|
19
|
+
→ baseline generate the seven mutation-surface files
|
|
20
|
+
→ mutate pick ONE approved surface, perturb it (behind the gate)
|
|
21
|
+
→ sandbox safety-inspect → run the test command (no shell, no net, no secrets)
|
|
22
|
+
→ score weighted base score − hard penalty layer
|
|
23
|
+
→ archive record parent→child as a TREE (not a single best branch)
|
|
24
|
+
→ select sample the next generation from the WHOLE archive
|
|
25
|
+
→ repeat
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Dependency-free: **Node ≥ 20 built-ins only**, no runtime dependencies.
|
|
29
|
+
|
|
30
|
+
## Quick start
|
|
31
|
+
|
|
32
|
+
Build (TypeScript → `dist/`):
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
npm run build # tsc
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Then evolve a repo with the CLI (one verb, `evolve`):
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
metaharness-darwin evolve <repo> [--generations N] [--children N] [--concurrency N] [--seed N]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
| Flag | Meaning | Default |
|
|
45
|
+
|------|---------|---------|
|
|
46
|
+
| `--generations N` | number of generations to run | `3` |
|
|
47
|
+
| `--children N` | children produced per parent per generation | `4` |
|
|
48
|
+
| `--concurrency N` | max variants evaluated concurrently (bounded fan-out) | `4` |
|
|
49
|
+
| `--seed N` | deterministic seed for mutation selection | `0` |
|
|
50
|
+
|
|
51
|
+
The `<repo>` argument defaults to the current directory. Everything is written
|
|
52
|
+
under a self-describing `.metaharness/` work tree inside the repo:
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
<repo>/.metaharness/
|
|
56
|
+
├── archive.json # the population TREE: ArchiveRecord[] (variant + score + children)
|
|
57
|
+
├── lineage.json # serialized graph { nodes, edges } for rendering
|
|
58
|
+
├── variants/ # one directory per variant (its mutation-surface files)
|
|
59
|
+
│ ├── baseline/
|
|
60
|
+
│ ├── g1_v0/ …
|
|
61
|
+
├── runs/ # one <variantId>.json per variant: { traces, score }
|
|
62
|
+
└── reports/
|
|
63
|
+
└── winner.json # the best scored ArchiveRecord
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Sample run output (leaderboard + winner lineage, printed to stdout):
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
Darwin Mode — leaderboard
|
|
70
|
+
0.842 g2_v1 [contextBuilder] safety=1.00 pass=1.00 ◀ winner
|
|
71
|
+
0.791 g1_v0 [reviewer] safety=1.00 pass=1.00
|
|
72
|
+
0.788 baseline [planner] safety=1.00 pass=1.00
|
|
73
|
+
0.000 g1_v3 [toolPolicy] safety=0.00 pass=0.00
|
|
74
|
+
|
|
75
|
+
Winner: g2_v1
|
|
76
|
+
Lineage: baseline → g1_v0 → g2_v1
|
|
77
|
+
Delta over baseline: +0.054
|
|
78
|
+
|
|
79
|
+
Artifacts: <repo>/.metaharness
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## The seven mutation surfaces
|
|
83
|
+
|
|
84
|
+
A child variant may mutate **exactly one** surface per generation, and a variant
|
|
85
|
+
directory may contain **only** these seven files — nothing else (the allowlist is
|
|
86
|
+
enforced by `safety.ts`, see `FILE_BY_SURFACE` / `APPROVED_FILES`). Each surface
|
|
87
|
+
is pure, side-effect-free policy logic over injected data.
|
|
88
|
+
|
|
89
|
+
| Surface (`MutationSurface`) | File | Governs |
|
|
90
|
+
|-----------------------------|------|---------|
|
|
91
|
+
| `planner` | `planner.ts` | task string → ordered plan steps (map → inspect → patch → verify) |
|
|
92
|
+
| `contextBuilder` | `context_builder.ts` | ranks candidate files by term overlap with the task |
|
|
93
|
+
| `reviewer` | `reviewer.ts` | flags changed files against an injected risk-file list + test outcome |
|
|
94
|
+
| `retryPolicy` | `retry_policy.ts` | whether/how to retry given a symbolic failure classification |
|
|
95
|
+
| `toolPolicy` | `tool_policy.ts` | allow-list + deterministic ordering over symbolic command kinds |
|
|
96
|
+
| `memoryPolicy` | `memory_policy.ts` | whether an outcome record is worth remembering |
|
|
97
|
+
| `scorePolicy` | `score_policy.ts` | the weight vector a variant *proposes* over the positive scoring terms |
|
|
98
|
+
|
|
99
|
+
A variant may *propose* score weights via `scorePolicy`, but it can never
|
|
100
|
+
re-grade itself: the verdict that decides promotion is computed by the frozen
|
|
101
|
+
kernel scorer (see below), not by the variant's own file.
|
|
102
|
+
|
|
103
|
+
## Scoring and the promotion gate
|
|
104
|
+
|
|
105
|
+
The scorer (`src/scorer.ts`, ADR-072) is a **pure function** — re-running it on
|
|
106
|
+
the same traces yields an identical verdict. It computes a weighted base score over six
|
|
107
|
+
`[0,1]` terms (weights from `scoreWeights()`, summing to 1.0):
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
baseScore = 0.35·taskSuccess + 0.20·testPassRate + 0.15·traceQuality
|
|
111
|
+
+ 0.10·costEfficiency + 0.10·latencyEfficiency + 0.10·safetyScore
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
minus a hard **penalty layer** read out of the run traces (a single safety
|
|
115
|
+
violation can drive the final score negative — that is the point):
|
|
116
|
+
|
|
117
|
+
```
|
|
118
|
+
finalScore = baseScore − 0.30·secretExposure − 0.25·destructiveAction
|
|
119
|
+
− 0.20·hallucinatedFile − 0.15·toolLoop − 0.10·costOverrun
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
A child replaces its parent only when **all four** promotion clauses hold
|
|
123
|
+
against the parent:
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
1. beatsParent finalScore > parentFinalScore + promotionDelta (default delta 0.05)
|
|
127
|
+
2. safetyOk safetyScore ≥ 0.95
|
|
128
|
+
3. noRegression testPassRate ≥ parentTestPassRate
|
|
129
|
+
4. noBlockedActions safetyScore == 1.0 (zero blocked actions in any trace)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Non-promoted variants are **retained**, not deleted — "did not clear the gate"
|
|
133
|
+
means "not chosen as a parent by the default policy," never "removed."
|
|
134
|
+
|
|
135
|
+
## The archive: evolve like species, not release like software
|
|
136
|
+
|
|
137
|
+
The archive (`src/archive.ts`, ADR-073) is a **tree** of variants keyed by id and
|
|
138
|
+
persisted as `archive.json`, not a single best branch. Selection
|
|
139
|
+
(`selectParents`) samples the **whole** archive — including older, non-promoted
|
|
140
|
+
branches — which is how evolution escapes hill-climbing: when a generation
|
|
141
|
+
stalls (no promotions), a weak-looking ancestor can still seed a strong branch.
|
|
142
|
+
Insertion order is preserved, so `best()`, tie-breaks, and `selectParents` are
|
|
143
|
+
all deterministic and reproducible from `archive.json` alone.
|
|
144
|
+
|
|
145
|
+
## Safety model
|
|
146
|
+
|
|
147
|
+
A self-modifying agent that can edit anything is a liability. Darwin Mode's bound
|
|
148
|
+
is enforced in `src/safety.ts` (ADR-071) as the **load-bearing security
|
|
149
|
+
boundary**, with two independent, defense-in-depth checks:
|
|
150
|
+
|
|
151
|
+
- **`inspectVariant(dir)`** runs *before any variant executes*. It disqualifies a
|
|
152
|
+
variant directory containing anything other than the seven approved files, a
|
|
153
|
+
blocked filename (`.env`, `secret`, `id_rsa`, `.git`, `package.json`, …), a
|
|
154
|
+
symlink or nested directory, or blocked content (`process.env`,
|
|
155
|
+
`child_process`, `eval`, `fetch`, restricted node builtins, shell strings, …).
|
|
156
|
+
- **`validateGeneratedCode(code)`** runs *before generated code is written to
|
|
157
|
+
disk* (the LLM-mutator path). Content-only check against the same blocked-content patterns; a violating generation
|
|
158
|
+
is **discarded**, never repaired in place.
|
|
159
|
+
|
|
160
|
+
The gate runs **first**: a disqualified variant never has its test command run —
|
|
161
|
+
the sandbox seals the trace with the reserved exit code `99` and records the
|
|
162
|
+
findings as `blockedActions`, which zeroes `safetyScore` and makes promotion
|
|
163
|
+
impossible. When a variant *is* admitted, the sandbox (`src/sandbox.ts`) is
|
|
164
|
+
**shell-free** (the test command is split to argv and run via `execFile`, never a
|
|
165
|
+
shell — no command-injection surface) and runs under a **scrubbed environment**
|
|
166
|
+
(only `PATH`, `NODE_ENV=test`, and two identifying `METAHARNESS_*` variables; nothing else from `process.env`
|
|
167
|
+
leaks, so secrets, tokens, and proxy settings never reach a variant).
|
|
168
|
+
|
|
169
|
+
See [`SECURITY.md`](./SECURITY.md) for the full threat model.
|
|
170
|
+
|
|
171
|
+
## Programmatic API
|
|
172
|
+
|
|
173
|
+
```ts
|
|
174
|
+
import { evolve } from '@metaharness/darwin';
|
|
175
|
+
|
|
176
|
+
const result = await evolve({
|
|
177
|
+
repoRoot: '/abs/path/to/repo',
|
|
178
|
+
workRoot: '/abs/path/to/repo/.metaharness',
|
|
179
|
+
generations: 3,
|
|
180
|
+
childrenPerGeneration: 4,
|
|
181
|
+
concurrency: 4,
|
|
182
|
+
promotionDelta: 0.05,
|
|
183
|
+
seed: 0,
|
|
184
|
+
tasks: [
|
|
185
|
+
'run repository test suite',
|
|
186
|
+
'verify generated harness safety',
|
|
187
|
+
'check trace quality',
|
|
188
|
+
],
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
result.winner; // the best scored ArchiveRecord (or null)
|
|
192
|
+
result.winnerLineage; // ['baseline', 'g1_v0', 'g2_v1'] — root → winner
|
|
193
|
+
result.records; // every ArchiveRecord, in insertion order
|
|
194
|
+
result.baseline; // the baseline record
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
The package also re-exports the building blocks behind `evolve`: `profileRepo`,
|
|
198
|
+
`generateBaselineHarness`, `createChildVariant`, `DeterministicMutator` /
|
|
199
|
+
`CodeGenerator`, `runVariantTask` / `runVariantTasks`, `scoreVariant` /
|
|
200
|
+
`scoreWeights`, `Archive`, `inspectVariant` / `validateGeneratedCode`, plus the
|
|
201
|
+
`SURFACES`, `FILE_BY_SURFACE`, and `APPROVED_FILES` constants.
|
|
202
|
+
|
|
203
|
+
## Status
|
|
204
|
+
|
|
205
|
+
**Prototype.** The default `DeterministicMutator` performs seeded,
|
|
206
|
+
signature-preserving string edits (bounded context-window, retry-budget,
|
|
207
|
+
threshold, and phrasing perturbations) — a **placeholder** for an LLM-backed
|
|
208
|
+
`CodeGenerator` that slots in behind the *same* `validateGeneratedCode` gate. The
|
|
209
|
+
mutator is the only piece meant to be swapped; the safety boundary, scorer, and
|
|
210
|
+
archive are kernel code.
|
|
211
|
+
|
|
212
|
+
## License
|
|
213
|
+
|
|
214
|
+
MIT © rUv. See ADRs
|
|
215
|
+
[070](../../docs/adrs/ADR-070-darwin-mode-self-improving-harness.md) ·
|
|
216
|
+
[071](../../docs/adrs/ADR-071-darwin-mutation-surfaces-safety-allowlist.md) ·
|
|
217
|
+
[072](../../docs/adrs/ADR-072-darwin-scoring-and-promotion.md) ·
|
|
218
|
+
[073](../../docs/adrs/ADR-073-darwin-archive-and-selection.md) ·
|
|
219
|
+
[074](../../docs/adrs/ADR-074-darwin-ruvector-memory-ruflo-fabric.md) ·
|
|
220
|
+
[075](../../docs/adrs/ADR-075-darwin-prototype-roadmap-and-acceptance.md),
|
|
221
|
+
and the [repository](https://github.com/ruvnet/agent-harness-generator).
|
package/SECURITY.md
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
# Darwin Mode — Security Model & Adversarial Audit
|
|
2
|
+
|
|
3
|
+
> Package: `@metaharness/darwin` (`packages/darwin-mode`)
|
|
4
|
+
> Scope: the self-modification boundary — `src/safety.ts`, `src/sandbox.ts`, `src/scorer.ts`.
|
|
5
|
+
> Audit suite: `__tests__/security/` (97 tests).
|
|
6
|
+
|
|
7
|
+
Darwin Mode lets an agent **mutate its own harness** (seven policy files), run the
|
|
8
|
+
mutated harness against a benchmark, and promote measured, safe wins. A
|
|
9
|
+
self-modifying agent that can edit anything is a liability; this document
|
|
10
|
+
describes the boundary that makes the system **honestly bounded**, the
|
|
11
|
+
adversarial tests that pin it, and the **residual gaps** the audit found.
|
|
12
|
+
|
|
13
|
+
Related ADRs:
|
|
14
|
+
[ADR-070](../../docs/adrs/ADR-070-darwin-mode-self-improving-harness.md) (loop) ·
|
|
15
|
+
[ADR-071](../../docs/adrs/ADR-071-darwin-mutation-surfaces-safety-allowlist.md) (mutation surfaces + gate) ·
|
|
16
|
+
[ADR-072](../../docs/adrs/ADR-072-darwin-scoring-and-promotion.md) (scoring + promotion) ·
|
|
17
|
+
[ADR-073](../../docs/adrs/ADR-073-darwin-archive-and-selection.md) (archive/lineage) ·
|
|
18
|
+
[ADR-074](../../docs/adrs/ADR-074-darwin-ruvector-memory-ruflo-fabric.md) (memory/fabric) ·
|
|
19
|
+
[ADR-075](../../docs/adrs/ADR-075-darwin-prototype-roadmap-and-acceptance.md) (acceptance).
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## 1. Threat model
|
|
24
|
+
|
|
25
|
+
**Adversary**: the variant generator (today a deterministic mutator; tomorrow an
|
|
26
|
+
LLM behind the *same* gate). It is assumed to be capable of emitting any text
|
|
27
|
+
into the files it is allowed to write, and to *try* to:
|
|
28
|
+
|
|
29
|
+
1. **Escape the allowlist** — write an 8th file, a nested dir, a symlink to host
|
|
30
|
+
files, a dependency manifest, or an `.env`/key file.
|
|
31
|
+
2. **Smuggle a capability** — embed `process.env`, `child_process`, `fetch`, a
|
|
32
|
+
shell string (`curl|sh`), `eval`, dynamic `import`, or a destructive fs op in
|
|
33
|
+
an *approved* file.
|
|
34
|
+
3. **Exfiltrate via the test command** — leak ambient secrets (`AWS_SECRET_ACCESS_KEY`,
|
|
35
|
+
proxy config) or inject shell metacharacters (`;`, `&&`, `$(...)`) into the
|
|
36
|
+
command that scores it.
|
|
37
|
+
4. **Re-grade itself** — propose a self-serving `score_policy.ts` to inflate its
|
|
38
|
+
own score, or get promoted while carrying a blocked action.
|
|
39
|
+
|
|
40
|
+
**Not in scope** (see §7): containing *arbitrary native code* that a
|
|
41
|
+
repo-provided test command legitimately runs — that is an OS-sandbox concern,
|
|
42
|
+
not this gate's. This gate bounds *what the agent writes*, not what the repo's
|
|
43
|
+
own test runner does.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## 2. The seven-file allowlist (ADR-071)
|
|
48
|
+
|
|
49
|
+
A variant directory may contain **only** these seven files — one concern each.
|
|
50
|
+
Anything else is disqualified before execution.
|
|
51
|
+
|
|
52
|
+
| Surface (`MutationSurface`) | File | Controls |
|
|
53
|
+
|---|---|---|
|
|
54
|
+
| `planner` | `planner.ts` | task decomposition |
|
|
55
|
+
| `contextBuilder` | `context_builder.ts` | what enters context |
|
|
56
|
+
| `reviewer` | `reviewer.ts` | patch critique |
|
|
57
|
+
| `retryPolicy` | `retry_policy.ts` | retry / abandon / escalate |
|
|
58
|
+
| `toolPolicy` | `tool_policy.ts` | which tools, in what order |
|
|
59
|
+
| `memoryPolicy` | `memory_policy.ts` | remember / retrieve / forget |
|
|
60
|
+
| `scorePolicy` | `score_policy.ts` | *proposes* weights (read-only at run time) |
|
|
61
|
+
|
|
62
|
+
Source of truth: `FILE_BY_SURFACE` / `APPROVED_FILES` in `src/safety.ts`.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## 3. Two-layer, defense-in-depth gate (ADR-071)
|
|
67
|
+
|
|
68
|
+
Two **independent** checks share `BLOCKED_CONTENT_PATTERNS` but run at different
|
|
69
|
+
points, so a payload that bypasses one checkpoint is still caught at the other.
|
|
70
|
+
|
|
71
|
+
- **`inspectVariant(dir)`** — runs in the sandbox **before any command executes**.
|
|
72
|
+
Walks the directory with `lstat` (never follows symlinks), enforces the
|
|
73
|
+
allowlist, blocked-filename substrings, size cap (256 KiB/file), entry cap
|
|
74
|
+
(32), and content patterns. Returns a non-empty findings list to disqualify.
|
|
75
|
+
- **`validateGeneratedCode(code)`** — runs on LLM/agent output **before it is
|
|
76
|
+
written to disk**. Content-only, de-duplicated reasons. A violating generation
|
|
77
|
+
is **discarded, never repaired in place**.
|
|
78
|
+
|
|
79
|
+
Both are **code, not comments**. A disqualified variant never has its test
|
|
80
|
+
command run: the sandbox seals a `RunTrace` with the reserved `exitCode 99` and
|
|
81
|
+
records the findings as `blockedActions`.
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## 4. Shell-free, env-scrubbed sandbox (ADR-070 §sandbox)
|
|
86
|
+
|
|
87
|
+
`runVariantTask` in `src/sandbox.ts`:
|
|
88
|
+
|
|
89
|
+
- **Gate first.** `inspectVariant` runs before anything; findings ⇒ `exitCode 99`,
|
|
90
|
+
no command executed (proven: `sandbox-injection.test.ts` →
|
|
91
|
+
*"disqualified variant never runs its command"*).
|
|
92
|
+
- **No shell.** The command is whitespace-split into argv and run via
|
|
93
|
+
`execFile` — **never** a shell. `;`, `&&`, `|`, `$(...)` are inert argument
|
|
94
|
+
strings, not metacharacters (proven: *"shell-free execution"* cases).
|
|
95
|
+
- **Scrubbed env.** Only `PATH`, `NODE_ENV=test`, `METAHARNESS_VARIANT`,
|
|
96
|
+
`METAHARNESS_TASK` are exposed. `AWS_SECRET_ACCESS_KEY`, `HTTP_PROXY`,
|
|
97
|
+
`DARWIN_SECRET`, etc. read as `undefined` inside the command (proven:
|
|
98
|
+
*"environment scrubbing"* case).
|
|
99
|
+
- **Never throws.** A failing/timing-out command becomes a `RunTrace`, so a
|
|
100
|
+
hostile variant cannot abort the evolution loop.
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## 5. Frozen scorer / benchmark immutability (ADR-072)
|
|
105
|
+
|
|
106
|
+
`scoreVariant` in `src/scorer.ts` is **frozen kernel code**, *not* the variant's
|
|
107
|
+
`score_policy.ts`. Its only inputs are
|
|
108
|
+
`(variantId, traces, parentScore, promotionDelta, taskTimeoutMs)` — **there is no
|
|
109
|
+
parameter through which variant file content can enter**. A variant may *propose*
|
|
110
|
+
different weights, but the authoritative verdict is computed here, so a variant
|
|
111
|
+
**can never re-grade itself** (proven: `scorer-immutability.test.ts`).
|
|
112
|
+
|
|
113
|
+
Two structural guarantees:
|
|
114
|
+
|
|
115
|
+
- **Any `blockedActions` ⇒ `safetyScore 0` ⇒ `promoted: false`**, and the
|
|
116
|
+
promotion gate requires `safetyScore ≥ 0.95`. An unsafe variant cannot win,
|
|
117
|
+
even against a weak parent (proven: *"a blocked-action variant can NEVER be
|
|
118
|
+
promoted"*).
|
|
119
|
+
- **Reproducible.** Identical traces yield an identical `finalScore`/`promoted`,
|
|
120
|
+
independent of `variantId` (proven: *"identical traces ⇒ identical verdict"*).
|
|
121
|
+
|
|
122
|
+
The child also **cannot edit the benchmark**: `tasks` and `testCommand` live in
|
|
123
|
+
the `RepoProfile`, outside the variant directory and outside the allowlist
|
|
124
|
+
(ADR-071 rule 9).
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## 6. The ten containment rules → where enforced → which test proves it
|
|
129
|
+
|
|
130
|
+
| # | Rule (ADR-071) | Enforced in | Proving test |
|
|
131
|
+
|---|---|---|---|
|
|
132
|
+
| 1 | No production writes | allowlist; sandbox runs in repo root with scrubbed env, variant dir is write target only | `inspect-bypass` (allowlist), `sandbox-injection` (disqualified-never-runs) |
|
|
133
|
+
| 2 | No credential access | `BLOCKED_FILENAME_PATTERNS` (`.env`/`secret`/`token`/`id_rsa`/…) + `BLOCKED_CONTENT_PATTERNS` (secret handling) + env scrub | `inspect-bypass` (blocked filenames + blocked content `secret/token/credential/private_key`); `sandbox-injection` (env scrub) |
|
|
134
|
+
| 3 | No network by default | content patterns `fetch(`/`XHR`/`WebSocket`/`node:net,http,…`/`from 'net'…`/`curl,wget,ssh` | `inspect-bypass` + `validate-generated` (network cases) |
|
|
135
|
+
| 4 | No mutation outside approved files | `APPROVED_FILES` allowlist; `MAX_FILES`; no nested dirs; no symlinks | `inspect-bypass` (extra files, subdir, symlink, >32 entries) |
|
|
136
|
+
| 5 | No promotion without benchmark evidence | `scoreVariant` promotion gate (4 clauses) | `scorer-immutability` + `scorer.test.ts` (gate) |
|
|
137
|
+
| 6 | No deletion commands | content patterns `rm` + flag/path (`\brm\s+[-/]`)/`rmdir`/`unlink`/`rmSync` | `inspect-bypass` + `validate-generated` (`rm -rf`, `rm <path>`, `rmSync`) |
|
|
138
|
+
| 7 | No child-process escape | content patterns `child_process`/`exec*`/`spawn*` | `inspect-bypass` + `validate-generated` (process exec); `sandbox-injection` (no shell) |
|
|
139
|
+
| 8 | No hidden state | size cap + entry cap; no symlinks (no out-of-tree state) | `inspect-bypass` (oversized file, >32 entries, symlink) |
|
|
140
|
+
| 9 | No benchmark changes by the child | tasks/testCommand are in `RepoProfile`, never in the allowlist; scorer takes only traces | `scorer-immutability` (no variant-file channel) |
|
|
141
|
+
| 10 | Full lineage + rollback | archive tree + witness (ADR-073/ADR-011) | covered by `archive.test.ts` (out of this audit's scope) |
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## 7. Residual risks / out of scope
|
|
146
|
+
|
|
147
|
+
### 7a. Hardened content-scan gaps (FIXED)
|
|
148
|
+
|
|
149
|
+
The audit found three blind spots where a payload defeated
|
|
150
|
+
`BLOCKED_CONTENT_PATTERNS`. **All three are now closed in `src/safety.ts`** (both
|
|
151
|
+
`inspectVariant` and `validateGeneratedCode` inherit the same patterns), and the
|
|
152
|
+
tests that documented them are pinned **green** as regression tests.
|
|
153
|
+
|
|
154
|
+
| # | Former bypass | Payload | Fix shipped in `BLOCKED_CONTENT_PATTERNS` |
|
|
155
|
+
|---|---|---|---|
|
|
156
|
+
| **#1** | Computed-member env access | `process['env']` · ``process[`env`]`` · `Reflect.get(process,'env')` | `/process\s*[.[]\s*['"\x60]?\s*env/i` (dot **or** bracket, optional quote/backtick) + a dedicated `/\bReflect\s*\.\s*get\s*\(\s*process/i` |
|
|
157
|
+
| **#2** | Subpath module import | `import … from 'fs/promises'` (also `'fs/x'`, `'net/…'`) | `/from\s+['"](fs\|net\|http\|https\|dns\|tls\|dgram\|cluster\|vm\|worker_threads)(\/[^'"]*)?['"]/i` + `node:fs/…` subpaths |
|
|
158
|
+
| **#3** | Non-`-rf` destructive shell | `"rm /etc/important_config"` (a bare `rm <path>`) | `/\brm\s+[-/]/i` (any flag **or** path), not only `rm -rf` |
|
|
159
|
+
|
|
160
|
+
> Even before the fix, the env-scrub in the sandbox (§4) meant **#1 could not
|
|
161
|
+
> exfiltrate a real secret at run time** (the scrubbed env holds no secrets) and the no-shell
|
|
162
|
+
> execution limited **#2/#3** to an already-scrubbed Node process — so these were
|
|
163
|
+
> defense-in-depth erosions of layer 1, not a full break. They are now closed so
|
|
164
|
+
> the static scan and the structural guarantees agree.
|
|
165
|
+
|
|
166
|
+
Files: `__tests__/security/inspect-bypass.test.ts` (hardened #1a/#1b/#1c/#2/#3),
|
|
167
|
+
`__tests__/security/validate-generated.test.ts` (hardened #1/#2/#3) — now green.
|
|
168
|
+
|
|
169
|
+
### 7b. Out of scope by design
|
|
170
|
+
|
|
171
|
+
- **The repo-provided `testCommand` runs real code.** `inspectVariant` bounds
|
|
172
|
+
what the *agent writes*, not what the *repo's own test runner* legitimately
|
|
173
|
+
executes. Containing that runner (CPU/mem/fs/network limits) is an **OS-level
|
|
174
|
+
sandbox** concern (containers/seccomp/cgroups), outside this package. The
|
|
175
|
+
package's contribution is: shell-free invocation + scrubbed env + a hard
|
|
176
|
+
wall-clock timeout + output-buffer cap.
|
|
177
|
+
- **Deterministic mutator is a placeholder.** Today's mutator is string
|
|
178
|
+
replacement; the LLM `CodeGenerator` (ADR-071 §contract) drops in **behind the
|
|
179
|
+
same gate** — `validateGeneratedCode` is the choke point that does not move.
|
|
180
|
+
- **Pattern-based scanning is heuristic.** A static regex scan cannot prove
|
|
181
|
+
semantic safety; it is a *floor*. The §7a bypasses illustrate that obfuscation
|
|
182
|
+
is always possible against a denylist. The structural defenses (allowlist,
|
|
183
|
+
no-symlink, no-shell, env-scrub, frozen scorer, safety-gated promotion) are the
|
|
184
|
+
load-bearing guarantees; the content denylist is a best-effort early filter.
|
|
185
|
+
- **Penalty-layer heuristics** (`scorer.ts` `SECRET_RE`/`DESTRUCTIVE_RE`/…) match
|
|
186
|
+
on trace stderr text and are coarse by design (ADR-072 §penalty); they are not
|
|
187
|
+
a containment boundary, only a scoring signal.
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## 8. Running the audit
|
|
192
|
+
|
|
193
|
+
```bash
|
|
194
|
+
npx vitest run packages/darwin-mode/__tests__/security
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
97 tests, all green: every blocked attack in §3–§6 is rejected **and** the three
|
|
198
|
+
former §7a content-scan gaps are now closed and pinned as regression tests. If a
|
|
199
|
+
new obfuscation is found, add it as a failing test, harden
|
|
200
|
+
`BLOCKED_CONTENT_PATTERNS`, and update this doc in the same change.
|
package/dist/archive.d.ts
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import type { ArchiveRecord, HarnessVariant, ScoreCard } from './types.js';
|
|
2
|
+
/**
|
|
3
|
+
* In-memory tree of {@link ArchiveRecord}s keyed by variant id, persisted to a
|
|
4
|
+
* JSON file. Insertion order is preserved (a `Map` iterates in insertion order)
|
|
5
|
+
* so every ordering — `all`, tie-breaks in `best`, ties in `selectParents` — is
|
|
6
|
+
* deterministic and reproducible from `archive.json` alone.
|
|
7
|
+
*/
|
|
8
|
+
export declare class Archive {
|
|
9
|
+
private readonly file;
|
|
10
|
+
/** Backing store: variantId → record. A `Map` preserves insertion order. */
|
|
11
|
+
private readonly records;
|
|
12
|
+
/**
|
|
13
|
+
* @param file Absolute path to `archive.json`. The file need not exist yet;
|
|
14
|
+
* {@link load} tolerates a missing or corrupt file by starting empty.
|
|
15
|
+
*/
|
|
16
|
+
constructor(file: string);
|
|
17
|
+
/**
|
|
18
|
+
* Load records from {@link file} if it exists. A missing, unreadable, or
|
|
19
|
+
* corrupt file (or one whose JSON is not an `ArchiveRecord[]`) is tolerated by
|
|
20
|
+
* starting from an empty archive — never throws.
|
|
21
|
+
*/
|
|
22
|
+
load(): Promise<void>;
|
|
23
|
+
/**
|
|
24
|
+
* Insert a record `{ variant, score: null, children: [] }` if the variant id
|
|
25
|
+
* is absent (idempotent — a re-add is a no-op). When `variant.parentId` is set
|
|
26
|
+
* and that parent already exists, append this id to the parent's `children`
|
|
27
|
+
* (without duplicates), wiring up the tree edge.
|
|
28
|
+
*/
|
|
29
|
+
addVariant(variant: HarnessVariant): void;
|
|
30
|
+
/**
|
|
31
|
+
* Attach a scorecard to the variant identified by `variantId`.
|
|
32
|
+
* @throws If `variantId` is unknown — scoring a phantom variant is a programmer error, not a soft miss.
|
|
33
|
+
*/
|
|
34
|
+
setScore(variantId: string, score: ScoreCard): void;
|
|
35
|
+
/** @returns The record for `variantId`, or `undefined` if it is not in the archive. */
|
|
36
|
+
get(variantId: string): ArchiveRecord | undefined;
|
|
37
|
+
/** @returns Every record, in insertion order. */
|
|
38
|
+
all(): ArchiveRecord[];
|
|
39
|
+
/**
|
|
40
|
+
* The scored record with the highest `score.finalScore`, or `null` when no
|
|
41
|
+
* record is scored yet. Ties break toward the earlier insertion (the first
|
|
42
|
+
* record to reach that score wins), making the choice deterministic.
|
|
43
|
+
*/
|
|
44
|
+
best(): ArchiveRecord | null;
|
|
45
|
+
/**
|
|
46
|
+
* The archive-wide selection that escapes hill-climbing: the top-`limit`
|
|
47
|
+
* scored variants by `finalScore`, drawn from the WHOLE archive including
|
|
48
|
+
* older, non-promoted branches (ADR-073 stall fallback). Deterministic — ties
|
|
49
|
+
* break by insertion order, so the result is reproducible.
|
|
50
|
+
*
|
|
51
|
+
* @param limit Maximum number of parents to return. `<= 0` yields `[]`.
|
|
52
|
+
*/
|
|
53
|
+
selectParents(limit: number): HarnessVariant[];
|
|
54
|
+
/**
|
|
55
|
+
* The path of ids from the root ancestor down to `variantId`, following
|
|
56
|
+
* `parentId` upward then reversing. Returns `[]` if `variantId` is unknown.
|
|
57
|
+
* Guarded against cycles (e.g. a self-parent or a corrupt ancestor loop): each
|
|
58
|
+
* id is visited at most once, so the walk always terminates.
|
|
59
|
+
*/
|
|
60
|
+
lineageOf(variantId: string): string[];
|
|
61
|
+
/**
|
|
62
|
+
* A serializable projection of the tree for rendering the evolution graph:
|
|
63
|
+
* one node per record (carrying generation, mutated surface, final score, and
|
|
64
|
+
* promotion flag), and one edge per existing parent→child relationship. Edges
|
|
65
|
+
* referencing a missing endpoint are omitted so the graph stays well-formed.
|
|
66
|
+
*/
|
|
67
|
+
toLineageGraph(): {
|
|
68
|
+
nodes: Array<{
|
|
69
|
+
id: string;
|
|
70
|
+
parentId: string | null;
|
|
71
|
+
generation: number;
|
|
72
|
+
mutationSurface: string;
|
|
73
|
+
finalScore: number | null;
|
|
74
|
+
promoted: boolean | null;
|
|
75
|
+
}>;
|
|
76
|
+
edges: Array<{
|
|
77
|
+
from: string;
|
|
78
|
+
to: string;
|
|
79
|
+
}>;
|
|
80
|
+
};
|
|
81
|
+
/**
|
|
82
|
+
* Persist the archive as pretty-printed JSON to {@link file}, creating the
|
|
83
|
+
* parent directory if needed. The on-disk shape is exactly `all()` — an
|
|
84
|
+
* `ArchiveRecord[]` in insertion order — so a subsequent {@link load}
|
|
85
|
+
* reconstructs the same archive.
|
|
86
|
+
*/
|
|
87
|
+
save(): Promise<void>;
|
|
88
|
+
}
|
|
89
|
+
//# sourceMappingURL=archive.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"archive.d.ts","sourceRoot":"","sources":["../src/archive.ts"],"names":[],"mappings":"AAcA,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAE3E;;;;;GAKG;AACH,qBAAa,OAAO;IAQN,OAAO,CAAC,QAAQ,CAAC,IAAI;IAPjC,2DAA2D;IAC3D,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAyC;IAEjE;;;OAGG;gBAC0B,IAAI,EAAE,MAAM;IAEzC;;;;OAIG;IACG,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAwB3B;;;;;OAKG;IACH,UAAU,CAAC,OAAO,EAAE,cAAc,GAAG,IAAI;IAczC;;;OAGG;IACH,QAAQ,CAAC,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,GAAG,IAAI;IAUnD,8EAA8E;IAC9E,GAAG,CAAC,SAAS,EAAE,MAAM,GAAG,aAAa,GAAG,SAAS;IAIjD,wCAAwC;IACxC,GAAG,IAAI,aAAa,EAAE;IAItB;;;;OAIG;IACH,IAAI,IAAI,aAAa,GAAG,IAAI;IAW5B;;;;;;;OAOG;IACH,aAAa,CAAC,KAAK,EAAE,MAAM,GAAG,cAAc,EAAE;IAoB9C;;;;;OAKG;IACH,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,EAAE;IAkBtC;;;;;OAKG;IACH,cAAc,IAAI;QAChB,KAAK,EAAE,KAAK,CAAC;YACX,EAAE,EAAE,MAAM,CAAC;YACX,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;YACxB,UAAU,EAAE,MAAM,CAAC;YACnB,eAAe,EAAE,MAAM,CAAC;YACxB,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;YAC1B,QAAQ,EAAE,OAAO,GAAG,IAAI,CAAC;SAC1B,CAAC,CAAC;QACH,KAAK,EAAE,KAAK,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,EAAE,EAAE,MAAM,CAAA;SAAE,CAAC,CAAC;KAC5C;IA+BD;;;;;OAKG;IACG,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;CAK5B"}
|