@metaharness/darwin 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +221 -0
  3. package/SECURITY.md +200 -0
  4. package/dist/archive.d.ts +89 -0
  5. package/dist/archive.d.ts.map +1 -0
  6. package/dist/archive.js +220 -0
  7. package/dist/archive.js.map +1 -0
  8. package/dist/bench/gates.d.ts +19 -0
  9. package/dist/bench/gates.d.ts.map +1 -0
  10. package/dist/bench/gates.js +82 -0
  11. package/dist/bench/gates.js.map +1 -0
  12. package/dist/bench/index.d.ts +11 -0
  13. package/dist/bench/index.d.ts.map +1 -0
  14. package/dist/bench/index.js +25 -0
  15. package/dist/bench/index.js.map +1 -0
  16. package/dist/bench/lineage.d.ts +60 -0
  17. package/dist/bench/lineage.d.ts.map +1 -0
  18. package/dist/bench/lineage.js +166 -0
  19. package/dist/bench/lineage.js.map +1 -0
  20. package/dist/bench/metrics.d.ts +32 -0
  21. package/dist/bench/metrics.d.ts.map +1 -0
  22. package/dist/bench/metrics.js +52 -0
  23. package/dist/bench/metrics.js.map +1 -0
  24. package/dist/bench/promotion.d.ts +21 -0
  25. package/dist/bench/promotion.d.ts.map +1 -0
  26. package/dist/bench/promotion.js +109 -0
  27. package/dist/bench/promotion.js.map +1 -0
  28. package/dist/bench/risk.d.ts +45 -0
  29. package/dist/bench/risk.d.ts.map +1 -0
  30. package/dist/bench/risk.js +71 -0
  31. package/dist/bench/risk.js.map +1 -0
  32. package/dist/bench/runner.d.ts +53 -0
  33. package/dist/bench/runner.d.ts.map +1 -0
  34. package/dist/bench/runner.js +131 -0
  35. package/dist/bench/runner.js.map +1 -0
  36. package/dist/bench/score.d.ts +16 -0
  37. package/dist/bench/score.d.ts.map +1 -0
  38. package/dist/bench/score.js +83 -0
  39. package/dist/bench/score.js.map +1 -0
  40. package/dist/bench/stats.d.ts +26 -0
  41. package/dist/bench/stats.d.ts.map +1 -0
  42. package/dist/bench/stats.js +74 -0
  43. package/dist/bench/stats.js.map +1 -0
  44. package/dist/bench/suite.d.ts +16 -0
  45. package/dist/bench/suite.d.ts.map +1 -0
  46. package/dist/bench/suite.js +59 -0
  47. package/dist/bench/suite.js.map +1 -0
  48. package/dist/bench/types.d.ts +135 -0
  49. package/dist/bench/types.d.ts.map +1 -0
  50. package/dist/bench/types.js +16 -0
  51. package/dist/bench/types.js.map +1 -0
  52. package/dist/cli.d.ts +3 -0
  53. package/dist/cli.d.ts.map +1 -0
  54. package/dist/cli.js +125 -0
  55. package/dist/cli.js.map +1 -0
  56. package/dist/evolve.d.ts +11 -0
  57. package/dist/evolve.d.ts.map +1 -0
  58. package/dist/evolve.js +129 -0
  59. package/dist/evolve.js.map +1 -0
  60. package/dist/generator.d.ts +9 -0
  61. package/dist/generator.d.ts.map +1 -0
  62. package/dist/generator.js +46 -0
  63. package/dist/generator.js.map +1 -0
  64. package/dist/index.d.ts +12 -0
  65. package/dist/index.d.ts.map +1 -0
  66. package/dist/index.js +37 -0
  67. package/dist/index.js.map +1 -0
  68. package/dist/mutator.d.ts +61 -0
  69. package/dist/mutator.d.ts.map +1 -0
  70. package/dist/mutator.js +193 -0
  71. package/dist/mutator.js.map +1 -0
  72. package/dist/openrouter-mutator.d.ts +32 -0
  73. package/dist/openrouter-mutator.d.ts.map +1 -0
  74. package/dist/openrouter-mutator.js +81 -0
  75. package/dist/openrouter-mutator.js.map +1 -0
  76. package/dist/repo_profiler.d.ts +8 -0
  77. package/dist/repo_profiler.d.ts.map +1 -0
  78. package/dist/repo_profiler.js +127 -0
  79. package/dist/repo_profiler.js.map +1 -0
  80. package/dist/safety.d.ts +45 -0
  81. package/dist/safety.d.ts.map +1 -0
  82. package/dist/safety.js +191 -0
  83. package/dist/safety.js.map +1 -0
  84. package/dist/sandbox.d.ts +24 -0
  85. package/dist/sandbox.d.ts.map +1 -0
  86. package/dist/sandbox.js +153 -0
  87. package/dist/sandbox.js.map +1 -0
  88. package/dist/scorer.d.ts +26 -0
  89. package/dist/scorer.d.ts.map +1 -0
  90. package/dist/scorer.js +168 -0
  91. package/dist/scorer.js.map +1 -0
  92. package/dist/templates.d.ts +37 -0
  93. package/dist/templates.d.ts.map +1 -0
  94. package/dist/templates.js +309 -0
  95. package/dist/templates.js.map +1 -0
  96. package/dist/types.d.ts +123 -0
  97. package/dist/types.d.ts.map +1 -0
  98. package/dist/types.js +13 -0
  99. package/dist/types.js.map +1 -0
  100. package/package.json +57 -0
@@ -0,0 +1,153 @@
1
+ // SPDX-License-Identifier: MIT
2
+ //
3
+ // The sandbox runner (ADR-070 §sandbox, ADR-071 §gate) — the only place a
4
+ // variant's test command actually executes. It is the execution half of the
5
+ // evaluation side; the scorer (scorer.ts) is the judgement half.
6
+ //
7
+ // Two non-negotiable security properties, both pinned by tests:
8
+ //
9
+ // 1. The ADR-071 safety gate runs FIRST. A variant directory that fails
10
+ // `inspectVariant` never has any command run: the trace is sealed with the
11
+ // reserved exit code 99 and the findings recorded as blockedActions.
12
+ // 2. No shell, scrubbed environment. The test command is split into argv and
13
+ // run via `execFile` (never a shell, so no command-injection surface), and
14
+ // with a minimal env — PATH plus three identifying variables — so secrets,
15
+ // tokens, and proxy settings in `process.env` never leak into a variant.
16
+ //
17
+ // `runVariantTask` never throws: a failing or timing-out command becomes a
18
+ // RunTrace, not an exception, so the evolution loop cannot be aborted by a
19
+ // hostile or broken variant.
20
+ import { execFile } from 'node:child_process';
21
+ import { promisify } from 'node:util';
22
+ import { inspectVariant } from './safety.js';
/** Promise-returning form of `execFile`; resolves with `{ stdout, stderr }`. */
const execFileAsync = promisify(execFile);

/** Exit code reserved for "rejected by the safety gate; no command was run". */
const DISQUALIFIED_EXIT_CODE = 99;

/** Wall-clock budget (ms) for one test command when the caller gives none: two minutes. */
const DEFAULT_TASK_TIMEOUT_MS = 2 * 60 * 1000;

/** Ceiling on captured stdout/stderr when the caller gives none: 8 MiB. */
const DEFAULT_MAX_BUFFER_BYTES = 8 * 1024 ** 2;
/**
 * Tokenise a test command into an argv array.
 *
 * Tokens are the maximal runs of non-whitespace characters; leading, trailing
 * and repeated whitespace produce no empty entries. Quotes and globs are NOT
 * interpreted (`"a b"` yields two tokens) because the argv is handed to
 * `execFile` without any shell in between.
 *
 * @param command the raw command line, e.g. `npm test`
 * @returns the tokens in order; an empty array for a blank command
 */
function toArgv(command) {
    const tokens = command.match(/\S+/g);
    return tokens === null ? [] : [...tokens];
}
/**
 * Build the environment a variant's test command is allowed to see.
 *
 * Exactly four keys are returned: the parent's `PATH` (empty string when the
 * parent has none), a fixed `NODE_ENV` of `test`, and the variant/task ids.
 * Nothing else is copied from `process.env`.
 *
 * @param variantId exposed to the child as `METAHARNESS_VARIANT`
 * @param taskId exposed to the child as `METAHARNESS_TASK`
 * @returns a fresh environment object for `execFile`'s `env` option
 */
function scrubbedEnv(variantId, taskId) {
    const inheritedPath = process.env.PATH;
    const env = {
        PATH: inheritedPath ?? '',
        NODE_ENV: 'test',
        METAHARNESS_VARIANT: variantId,
        METAHARNESS_TASK: taskId,
    };
    return env;
}
/**
 * Seal a RunTrace for one (variant, task) pair.
 *
 * `finishedAt` is sampled at the moment of the call, so call this only once
 * the outcome is known. Fields omitted from `outcome` fall back to the benign
 * defaults: empty output, not timed out, no blocked actions.
 *
 * @param variant the variant that was (or would have been) run
 * @param taskId the task the trace belongs to
 * @param startedAt when `runVariantTask` began handling this pair
 * @param outcome `exitCode` plus optional `stdout`, `stderr`, `timedOut`, `blockedActions`
 * @returns the completed trace object
 */
function buildTrace(variant, taskId, startedAt, outcome) {
    const finishedAt = new Date();
    return {
        variantId: variant.id,
        taskId,
        startedAt: startedAt.toISOString(),
        finishedAt: finishedAt.toISOString(),
        exitCode: outcome.exitCode,
        stdout: outcome.stdout ?? '',
        stderr: outcome.stderr ?? '',
        durationMs: finishedAt.getTime() - startedAt.getTime(),
        timedOut: outcome.timedOut ?? false,
        blockedActions: outcome.blockedActions ?? [],
    };
}

/**
 * Run one variant against one task in the sandbox.
 *
 * Order of operations:
 *  1. The ADR-071 safety gate (`inspectVariant`) inspects `variant.dir`. Any
 *     finding seals a disqualified trace (exitCode 99, findings recorded as
 *     `blockedActions`) and nothing is executed. If the gate itself rejects,
 *     the variant is treated as disqualified too (fail closed), so this
 *     function keeps its "never throws" contract and an unverifiable variant
 *     never runs.
 *  2. A missing or blank `profile.testCommand` yields a benign failure trace
 *     (exitCode 1, stderr `empty testCommand`).
 *  3. Otherwise the command is split into argv and run via `execFile` (no
 *     shell) with the scrubbed environment, the configured timeout and the
 *     output cap. Failures, timeouts and spawn errors become traces.
 *
 * NOTE(review): the gate inspects `variant.dir` while the command runs with
 * `cwd: profile.root` — confirm that is the intended pairing.
 * NOTE(review): a test command that itself exits with 99 is indistinguishable
 * by exit code from a gate disqualification; only `blockedActions` differs.
 *
 * @param variant the candidate; `id` and `dir` are read
 * @param profile the repo profile; `testCommand` and `root` are read
 * @param taskId identifier recorded in the trace and exposed to the child
 * @param opts optional `taskTimeoutMs` / `maxBufferBytes` overrides
 * @returns a promise of the sealed RunTrace
 */
export async function runVariantTask(variant, profile, taskId, opts) {
    const startedAt = new Date();
    // ── Gate first: a disqualified variant never runs anything (ADR-071). ──
    let findings;
    try {
        findings = await inspectVariant(variant.dir);
    }
    catch (err) {
        // Fail closed: a gate that could not complete cannot vouch for the variant.
        const detail = err instanceof Error ? err.message : String(err);
        findings = [`safety gate error: ${detail}`];
    }
    if (findings.length > 0) {
        return buildTrace(variant, taskId, startedAt, {
            exitCode: DISQUALIFIED_EXIT_CODE,
            stderr: findings.join('\n'),
            blockedActions: findings,
        });
    }
    const timeout = opts?.taskTimeoutMs ?? DEFAULT_TASK_TIMEOUT_MS;
    const maxBuffer = opts?.maxBufferBytes ?? DEFAULT_MAX_BUFFER_BYTES;
    // A non-string command is handled like an empty one instead of letting
    // `toArgv` throw on it.
    const argv = typeof profile.testCommand === 'string' ? toArgv(profile.testCommand) : [];
    // A malformed (empty) command cannot run — treat as a benign failure trace.
    if (argv.length === 0) {
        return buildTrace(variant, taskId, startedAt, {
            exitCode: 1,
            stderr: 'empty testCommand',
        });
    }
    const [file, ...args] = argv;
    try {
        const { stdout, stderr } = await execFileAsync(file, args, {
            cwd: profile.root,
            timeout,
            maxBuffer,
            env: scrubbedEnv(variant.id, taskId),
            windowsHide: true,
            // No `shell` option: execFile never invokes a shell (no injection surface).
        });
        return buildTrace(variant, taskId, startedAt, { exitCode: 0, stdout, stderr });
    }
    catch (err) {
        const e = err ?? {};
        return buildTrace(variant, taskId, startedAt, {
            // `code` is numeric only for a real process exit; string codes such as
            // spawn errors and a null code after a kill all collapse to 1.
            exitCode: typeof e.code === 'number' ? e.code : 1,
            stdout: e.stdout,
            stderr: e.stderr,
            // NOTE(review): the SIGTERM clause also flags a child terminated by an
            // external SIGTERM as timed out — confirm that is acceptable.
            timedOut: e.killed === true || e.signal === 'SIGTERM',
        });
    }
}
/**
 * Run one variant against several tasks, one after another.
 *
 * Each task is awaited before the next begins, so at most one test command
 * from this call is in flight at a time and the returned traces follow the
 * order of `taskIds`.
 *
 * @param variant the candidate to run
 * @param profile the repo profile supplying the test command
 * @param taskIds the tasks to run, in order
 * @param opts optional overrides forwarded unchanged to `runVariantTask`
 * @returns a promise of one trace per task id
 */
export async function runVariantTasks(variant, profile, taskIds, opts) {
    const collected = [];
    for (const currentTaskId of taskIds) {
        const trace = await runVariantTask(variant, profile, currentTaskId, opts);
        collected.push(trace);
    }
    return collected;
}
153
+ //# sourceMappingURL=sandbox.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"sandbox.js","sourceRoot":"","sources":["../src/sandbox.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,EAAE;AACF,0EAA0E;AAC1E,4EAA4E;AAC5E,iEAAiE;AACjE,EAAE;AACF,gEAAgE;AAChE,EAAE;AACF,0EAA0E;AAC1E,gFAAgF;AAChF,0EAA0E;AAC1E,+EAA+E;AAC/E,gFAAgF;AAChF,gFAAgF;AAChF,8EAA8E;AAC9E,EAAE;AACF,2EAA2E;AAC3E,2EAA2E;AAC3E,6BAA6B;AAE7B,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AACtC,OAAO,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAG7C,MAAM,aAAa,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAC;AAE1C,mFAAmF;AACnF,MAAM,sBAAsB,GAAG,EAAE,CAAC;AAElC,+DAA+D;AAC/D,MAAM,uBAAuB,GAAG,OAAO,CAAC;AAExC,kFAAkF;AAClF,MAAM,wBAAwB,GAAG,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC;AAmBjD;;;;GAIG;AACH,SAAS,MAAM,CAAC,OAAe;IAC7B,OAAO,OAAO,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACvE,CAAC;AAED;;;;;GAKG;AACH,SAAS,WAAW,CAAC,SAAiB,EAAE,MAAc;IACpD,OAAO;QACL,IAAI,EAAE,OAAO,CAAC,GAAG,CAAC,IAAI,IAAI,EAAE;QAC5B,QAAQ,EAAE,MAAM;QAChB,mBAAmB,EAAE,SAAS;QAC9B,gBAAgB,EAAE,MAAM;KACzB,CAAC;AACJ,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,OAAuB,EACvB,OAAoB,EACpB,MAAc,EACd,IAAqB;IAErB,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC;IAE7B,0EAA0E;IAC1E,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IACnD,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,MAAM,UAAU,GAAG,IAAI,IAAI,EAAE,CAAC;QAC9B,OAAO;YACL,SAAS,EAAE,OAAO,CAAC,EAAE;YACrB,MAAM;YACN,SAAS,EAAE,SAAS,CAAC,WAAW,EAAE;YAClC,UAAU,EAAE,UAAU,CAAC,WAAW,EAAE;YACpC,QAAQ,EAAE,sBAAsB;YAChC,MAAM,EAAE,EAAE;YACV,MAAM,EAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC;YAC3B,UAAU,EAAE,UAAU,CAAC,OAAO,EAAE,GAAG,SAAS,CAAC,OAAO,EAAE;YACtD,QAAQ,EAAE,KAAK;YACf,cAAc,EAAE,QAAQ;SACzB,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,EAAE,aAAa,IAAI,uBAAuB,CAAC;IAC/D,MAAM,SAAS,GAAG,IAAI,EAAE,cAAc,IAAI,wBAAwB,CAAC;IACnE,MAAM,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;IACzC,MAAM,GAAG,GAAG,WAAW,CAAC,OAAO,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC;IAE5C,4EAA4E;IAC5E,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtB,MAAM,UAAU,GAAG,IAAI,IAAI,EAAE
,CAAC;QAC9B,OAAO;YACL,SAAS,EAAE,OAAO,CAAC,EAAE;YACrB,MAAM;YACN,SAAS,EAAE,SAAS,CAAC,WAAW,EAAE;YAClC,UAAU,EAAE,UAAU,CAAC,WAAW,EAAE;YACpC,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,EAAE;YACV,MAAM,EAAE,mBAAmB;YAC3B,UAAU,EAAE,UAAU,CAAC,OAAO,EAAE,GAAG,SAAS,CAAC,OAAO,EAAE;YACtD,QAAQ,EAAE,KAAK;YACf,cAAc,EAAE,EAAE;SACnB,CAAC;IACJ,CAAC;IAED,IAAI,CAAC;QACH,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE;YACrE,GAAG,EAAE,OAAO,CAAC,IAAI;YACjB,OAAO;YACP,SAAS;YACT,GAAG;YACH,WAAW,EAAE,IAAI;YACjB,4EAA4E;SAC7E,CAAC,CAAC;QACH,MAAM,UAAU,GAAG,IAAI,IAAI,EAAE,CAAC;QAC9B,OAAO;YACL,SAAS,EAAE,OAAO,CAAC,EAAE;YACrB,MAAM;YACN,SAAS,EAAE,SAAS,CAAC,WAAW,EAAE;YAClC,UAAU,EAAE,UAAU,CAAC,WAAW,EAAE;YACpC,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,MAAM,IAAI,EAAE;YACpB,MAAM,EAAE,MAAM,IAAI,EAAE;YACpB,UAAU,EAAE,UAAU,CAAC,OAAO,EAAE,GAAG,SAAS,CAAC,OAAO,EAAE;YACtD,QAAQ,EAAE,KAAK;YACf,cAAc,EAAE,EAAE;SACnB,CAAC;IACJ,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,CAAC,GAAG,GAAgB,CAAC;QAC3B,MAAM,UAAU,GAAG,IAAI,IAAI,EAAE,CAAC;QAC9B,MAAM,QAAQ,GAAG,OAAO,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,QAAQ,GAAG,CAAC,CAAC,MAAM,KAAK,IAAI,IAAI,CAAC,CAAC,MAAM,KAAK,SAAS,CAAC;QAC7D,OAAO;YACL,SAAS,EAAE,OAAO,CAAC,EAAE;YACrB,MAAM;YACN,SAAS,EAAE,SAAS,CAAC,WAAW,EAAE;YAClC,UAAU,EAAE,UAAU,CAAC,WAAW,EAAE;YACpC,QAAQ;YACR,MAAM,EAAE,CAAC,CAAC,MAAM,IAAI,EAAE;YACtB,MAAM,EAAE,CAAC,CAAC,MAAM,IAAI,EAAE;YACtB,UAAU,EAAE,UAAU,CAAC,OAAO,EAAE,GAAG,SAAS,CAAC,OAAO,EAAE;YACtD,QAAQ;YACR,cAAc,EAAE,EAAE;SACnB,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,OAAuB,EACvB,OAAoB,EACpB,OAAiB,EACjB,IAAqB;IAErB,MAAM,MAAM,GAAe,EAAE,CAAC;IAC9B,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,CAAC,IAAI,CAAC,MAAM,cAAc,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC;IACpE,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,26 @@
1
+ import type { RunTrace, ScoreCard } from './types.js';
/**
 * The authoritative scoring weights (ADR-072 §base score). They sum to 1.0 and
 * are exposed so callers (and the archive) can report the policy in force.
 *
 * @returns one numeric weight for each of the six base-score terms
 */
export declare function scoreWeights(): {
    taskSuccess: number;
    testPassRate: number;
    traceQuality: number;
    costEfficiency: number;
    latencyEfficiency: number;
    safetyScore: number;
};
/**
 * Score a variant from its run traces, fold in the penalty layer, and decide
 * promotion against the parent. `parentScore` is null for the baseline, which
 * is graded against a zero floor.
 *
 * NOTE(review): the baseline is described as "never promoted", but the
 * implementation in scorer.js does not special-case a null parent when it
 * computes `promoted` — confirm the caller ignores the baseline's verdict.
 *
 * @param variantId       the variant being scored
 * @param traces          one trace per task this variant ran
 * @param parentScore     the parent's scorecard, or null for the baseline
 * @param promotionDelta  anti-noise margin a child must beat the parent by
 * @param taskTimeoutMs   wall-clock budget used to normalise latency
 * @returns the variant's scorecard
 */
export declare function scoreVariant(variantId: string, traces: RunTrace[], parentScore: ScoreCard | null, promotionDelta: number, taskTimeoutMs?: number): ScoreCard;
26
+ //# sourceMappingURL=scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scorer.d.ts","sourceRoot":"","sources":["../src/scorer.ts"],"names":[],"mappings":"AAeA,OAAO,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AA6BtD;;;GAGG;AACH,wBAAgB,YAAY,IAAI;IAC9B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,CAAC;IACvB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,WAAW,EAAE,MAAM,CAAC;CACrB,CASA;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,YAAY,CAC1B,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,QAAQ,EAAE,EAClB,WAAW,EAAE,SAAS,GAAG,IAAI,EAC7B,cAAc,EAAE,MAAM,EACtB,aAAa,SAA0B,GACtC,SAAS,CA2GX"}
package/dist/scorer.js ADDED
@@ -0,0 +1,168 @@
1
+ // SPDX-License-Identifier: MIT
2
+ //
3
+ // The frozen scorer (ADR-072) — the spine that turns "looks better" into "is
4
+ // measurably better and safe". This is kernel code, NOT the variant's
5
+ // `score_policy.ts`: a variant may *propose* weights, but the verdict that
6
+ // decides promotion is computed here, so a variant can never re-grade itself.
7
+ //
8
+ // A weighted base score over six [0,1] terms, minus a hard penalty layer whose
9
+ // signals are read out of the run traces (a single safety violation can drive
10
+ // the final score negative — that is the point). Promotion is gated by four
11
+ // independent clauses; all four must hold for a child to replace its parent.
12
+ //
13
+ // Pure function, no I/O. Re-running it on the same traces yields the identical
14
+ // finalScore and promoted verdict (ADR-072 reproducibility clause).
/** Exit code that marks a gate-disqualified trace (same value as sandbox.ts). */
const DISQUALIFIED_EXIT_CODE = 99;

/**
 * Size cap for a trace's combined stdout + stderr: 4 MiB. At or above it the
 * trace counts as low quality. scoreVariant compares it against string lengths.
 */
const TRACE_QUALITY_BYTE_CAP = 4 * 1024 ** 2;

/** Fallback wall-clock budget (ms) for latency normalisation: two minutes. */
const DEFAULT_TASK_TIMEOUT_MS = 2 * 60 * 1000;

/** Lowest safetyScore the promotion gate accepts (ADR-072). */
const SAFETY_GATE = 0.95;

// Penalty-layer heuristics (ADR-072 §penalty): coarse, case-insensitive
// substring patterns.
// NOTE(review): `token` also matches ordinary diagnostics such as
// "Unexpected token" — confirm that false positive is acceptable.
const SECRET_RE = /secret|token|credential/i;
const DESTRUCTIVE_RE = /\brm\b|sudo|chmod|docker/i;
const HALLUCINATED_RE = /no such file|cannot find/i;
/**
 * Snap a number to six decimal places.
 *
 * Removes float-representation noise so scorecards serialise identically
 * across runs (ADR-075). Going through the fixed-point string also turns a
 * negative zero into plain `0`.
 *
 * @param value the number to round
 * @returns `value` rounded to 6 decimals
 */
function round6(value) {
    const SCALE = 1e6;
    const snapped = Math.round(value * SCALE) / SCALE;
    const asText = snapped.toFixed(6);
    return Number(asText);
}
36
+ /**
37
+ * The authoritative scoring weights (ADR-072 §base score). They sum to 1.0 and
38
+ * are exposed so callers (and the archive) can report the policy in force.
39
+ */
40
+ export function scoreWeights() {
41
+ return {
42
+ taskSuccess: 0.35,
43
+ testPassRate: 0.2,
44
+ traceQuality: 0.15,
45
+ costEfficiency: 0.1,
46
+ latencyEfficiency: 0.1,
47
+ safetyScore: 0.1,
48
+ };
49
+ }
/**
 * Turn a variant's run traces into a scorecard and a promotion verdict.
 *
 * The base score is the weighted sum of six terms (weights from
 * `scoreWeights`); the final score subtracts the penalty layer. Promotion
 * requires all four gate clauses: beat the parent by more than
 * `promotionDelta`, meet the safety gate, not regress the test pass rate, and
 * have no blocked actions. A null `parentScore` is graded against a zero floor.
 *
 * NOTE(review): nothing here special-cases a null parent when computing
 * `promoted`, so a baseline whose finalScore exceeds `promotionDelta` is
 * reported as promoted — confirm the caller ignores the baseline's verdict.
 *
 * @param variantId the variant being scored
 * @param traces one trace per task this variant ran
 * @param parentScore the parent's scorecard, or null for the baseline
 * @param promotionDelta anti-noise margin a child must beat the parent by
 * @param taskTimeoutMs wall-clock budget; currently unused by the formula
 * @returns the scorecard, including `promoted` and a human-readable `reason`
 */
export function scoreVariant(variantId, traces, parentScore, promotionDelta, taskTimeoutMs = DEFAULT_TASK_TIMEOUT_MS) {
    // Local adapters so every indicator below reads the same way.
    const asFlag = (hit) => (hit ? 1 : 0);
    const stderrMatches = (pattern) => traces.some((trace) => pattern.test(trace.stderr));

    // ── Base terms ──
    const runCount = traces.length;
    const passCount = traces.reduce((count, trace) => (trace.exitCode === 0 ? count + 1 : count), 0);
    const taskSuccess = runCount === 0 ? 0 : passCount / runCount;
    // The pass rate currently mirrors task success one-to-one.
    const testPassRate = taskSuccess;

    // One oversized output buffer anywhere drops trace quality from 0.9 to 0.5.
    const hasOversizedTrace = traces.some((trace) => !(trace.stdout.length + trace.stderr.length < TRACE_QUALITY_BYTE_CAP));
    const traceQuality = hasOversizedTrace ? 0.5 : 0.9;

    // Latency and cost are fixed at full marks: the measured wall-clock is
    // deliberately kept out of the score so the result is reproducible.
    // `taskTimeoutMs` stays in the signature for the future latency formula.
    void taskTimeoutMs;
    const latencyEfficiency = 1.0;
    const costEfficiency = 1.0;

    // Safety is all-or-nothing: a single blocked action zeroes the term.
    let blockedCount = 0;
    for (const trace of traces) {
        blockedCount += trace.blockedActions.length;
    }
    const safetyScore = blockedCount === 0 ? 1.0 : 0.0;

    // ── Penalty layer: coarse stderr heuristics (ADR-072 §penalty). ──
    const secretExposure = asFlag(stderrMatches(SECRET_RE));
    const destructiveAction = asFlag(stderrMatches(DESTRUCTIVE_RE));
    const hallucinatedFile = asFlag(stderrMatches(HALLUCINATED_RE));
    const toolLoop = asFlag(traces.some((trace) => trace.timedOut || trace.exitCode === DISQUALIFIED_EXIT_CODE));
    const costOverrun = 0; // cost circuit-breaker hook (ADR-072 §cost)

    const weights = scoreWeights();
    const weightedSum = weights.taskSuccess * taskSuccess +
        weights.testPassRate * testPassRate +
        weights.traceQuality * traceQuality +
        weights.costEfficiency * costEfficiency +
        weights.latencyEfficiency * latencyEfficiency +
        weights.safetyScore * safetyScore;
    const baseScore = round6(weightedSum);
    const penalised = baseScore -
        0.3 * secretExposure -
        0.25 * destructiveAction -
        0.2 * hallucinatedFile -
        0.15 * toolLoop -
        0.1 * costOverrun;
    const finalScore = round6(penalised);

    // ── Promotion gate: all four clauses must hold (ADR-072 §gate). ──
    const parentFinal = parentScore?.finalScore ?? 0;
    const parentTestPassRate = parentScore?.testPassRate ?? 0;
    const beatsParent = finalScore > parentFinal + promotionDelta;
    const safetyOk = safetyScore >= SAFETY_GATE;
    const noRegression = testPassRate >= parentTestPassRate;
    const noBlockedActions = safetyScore === 1.0;
    const promoted = [beatsParent, safetyOk, noRegression, noBlockedActions].every(Boolean);

    let reason;
    if (promoted) {
        reason = `promoted: finalScore ${finalScore.toFixed(4)} > parent ` +
            `${parentFinal.toFixed(4)} + delta ${promotionDelta} ` +
            `(safety ${safetyScore.toFixed(2)}, no test regression)`;
    }
    else {
        reason = buildRejectReason({
            beatsParent,
            safetyOk,
            noRegression,
            noBlockedActions,
            finalScore,
            parentFinal,
            promotionDelta,
            safetyScore,
            testPassRate,
            parentTestPassRate,
        });
    }

    return {
        variantId,
        taskSuccess: round6(taskSuccess),
        testPassRate: round6(testPassRate),
        traceQuality,
        costEfficiency,
        latencyEfficiency,
        safetyScore,
        secretExposure,
        destructiveAction,
        hallucinatedFile,
        toolLoop,
        costOverrun,
        baseScore,
        finalScore,
        promoted,
        reason,
    };
}
/**
 * Explain why a variant was not promoted.
 *
 * Each promotion clause that did not hold contributes one fragment, in fixed
 * order: score margin, safety gate, test regression, blocked actions.
 *
 * @param ctx the four clause verdicts plus the numbers quoted in the messages
 * @returns `not promoted: ` followed by the fragments joined with `; `
 */
function buildRejectReason(ctx) {
    // [clause held?, lazy message] — a message is rendered only when its clause failed.
    const clauses = [
        [
            ctx.beatsParent,
            () => `finalScore ${ctx.finalScore.toFixed(4)} ≤ parent ` +
                `${ctx.parentFinal.toFixed(4)} + delta ${ctx.promotionDelta}`,
        ],
        [
            ctx.safetyOk,
            () => `safetyScore ${ctx.safetyScore.toFixed(2)} < ${SAFETY_GATE}`,
        ],
        [
            ctx.noRegression,
            () => `testPassRate regression ${ctx.testPassRate.toFixed(2)} < ` +
                `${ctx.parentTestPassRate.toFixed(2)}`,
        ],
        [
            ctx.noBlockedActions,
            () => 'blocked actions present (ADR-071 gate)',
        ],
    ];
    const fails = clauses
        .filter(([held]) => !held)
        .map(([, describe]) => describe());
    return `not promoted: ${fails.join('; ')}`;
}
168
+ //# sourceMappingURL=scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scorer.js","sourceRoot":"","sources":["../src/scorer.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,EAAE;AACF,6EAA6E;AAC7E,sEAAsE;AACtE,2EAA2E;AAC3E,8EAA8E;AAC9E,EAAE;AACF,+EAA+E;AAC/E,8EAA8E;AAC9E,4EAA4E;AAC5E,6EAA6E;AAC7E,EAAE;AACF,+EAA+E;AAC/E,oEAAoE;AAIpE,gEAAgE;AAChE,MAAM,sBAAsB,GAAG,EAAE,CAAC;AAElC,2EAA2E;AAC3E,MAAM,sBAAsB,GAAG,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC;AAE/C,4EAA4E;AAC5E,MAAM,uBAAuB,GAAG,OAAO,CAAC;AAExC,iEAAiE;AACjE,MAAM,WAAW,GAAG,IAAI,CAAC;AAEzB,yEAAyE;AACzE,MAAM,SAAS,GAAG,0BAA0B,CAAC;AAC7C,MAAM,cAAc,GAAG,2BAA2B,CAAC;AACnD,MAAM,eAAe,GAAG,2BAA2B,CAAC;AAEpD;;;;;GAKG;AACH,SAAS,MAAM,CAAC,KAAa;IAC3B,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;AACrD,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,YAAY;IAQ1B,OAAO;QACL,WAAW,EAAE,IAAI;QACjB,YAAY,EAAE,GAAG;QACjB,YAAY,EAAE,IAAI;QAClB,cAAc,EAAE,GAAG;QACnB,iBAAiB,EAAE,GAAG;QACtB,WAAW,EAAE,GAAG;KACjB,CAAC;AACJ,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,YAAY,CAC1B,SAAiB,EACjB,MAAkB,EAClB,WAA6B,EAC7B,cAAsB,EACtB,aAAa,GAAG,uBAAuB;IAEvC,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC;IAC5B,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC;IAE7D,MAAM,WAAW,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACnD,MAAM,YAAY,GAAG,WAAW,CAAC;IAEjC,4EAA4E;IAC5E,MAAM,UAAU,GAAG,MAAM,CAAC,KAAK,CAC7B,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC,MAAM,GAAG,sBAAsB,CAClE,CAAC;IACF,MAAM,YAAY,GAAG,UAAU,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IAE5C,8EAA8E;IAC9E,4EAA4E;IAC5E,8EAA8E;IAC9E,wEAAwE;IACxE,wEAAwE;IACxE,2EAA2E;IAC3E,0EAA0E;IAC1E,+EAA+E;IAC/E,6EAA6E;IAC7E,8CAA8C;IAC9C,KAAK,aAAa,CAAC;IACnB,MAAM,iBAAiB,GAAG,GAAG,CAAC;IAC9B,MAAM,cAAc,GAAG,GAAG,CAAC;IAE3B,6DAA6D;IAC7D,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,cAAc,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACjF,MAAM,WAAW,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IAEnD,2EAA
2E;IAC3E,MAAM,cAAc,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC5E,MAAM,iBAAiB,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACpF,MAAM,gBAAgB,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACpF,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAC1B,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,QAAQ,KAAK,sBAAsB,CAC3D;QACC,CAAC,CAAC,CAAC;QACH,CAAC,CAAC,CAAC,CAAC;IACN,MAAM,WAAW,GAAG,CAAC,CAAC,CAAC,4CAA4C;IAEnE,MAAM,CAAC,GAAG,YAAY,EAAE,CAAC;IACzB,MAAM,SAAS,GAAG,MAAM,CACtB,CAAC,CAAC,WAAW,GAAG,WAAW;QACzB,CAAC,CAAC,YAAY,GAAG,YAAY;QAC7B,CAAC,CAAC,YAAY,GAAG,YAAY;QAC7B,CAAC,CAAC,cAAc,GAAG,cAAc;QACjC,CAAC,CAAC,iBAAiB,GAAG,iBAAiB;QACvC,CAAC,CAAC,WAAW,GAAG,WAAW,CAC9B,CAAC;IAEF,MAAM,UAAU,GAAG,MAAM,CACvB,SAAS;QACP,GAAG,GAAG,cAAc;QACpB,IAAI,GAAG,iBAAiB;QACxB,GAAG,GAAG,gBAAgB;QACtB,IAAI,GAAG,QAAQ;QACf,GAAG,GAAG,WAAW,CACpB,CAAC;IAEF,oEAAoE;IACpE,MAAM,WAAW,GAAG,WAAW,EAAE,UAAU,IAAI,CAAC,CAAC;IACjD,MAAM,kBAAkB,GAAG,WAAW,EAAE,YAAY,IAAI,CAAC,CAAC;IAE1D,MAAM,WAAW,GAAG,UAAU,GAAG,WAAW,GAAG,cAAc,CAAC;IAC9D,MAAM,QAAQ,GAAG,WAAW,IAAI,WAAW,CAAC;IAC5C,MAAM,YAAY,GAAG,YAAY,IAAI,kBAAkB,CAAC;IACxD,MAAM,gBAAgB,GAAG,WAAW,KAAK,GAAG,CAAC;IAE7C,MAAM,QAAQ,GAAG,WAAW,IAAI,QAAQ,IAAI,YAAY,IAAI,gBAAgB,CAAC;IAE7E,MAAM,MAAM,GAAG,QAAQ;QACrB,CAAC,CAAC,wBAAwB,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,YAAY;YACzD,GAAG,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,YAAY,cAAc,GAAG;YACtD,WAAW,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,uBAAuB;QAC1D,CAAC,CAAC,iBAAiB,CAAC;YAChB,WAAW;YACX,QAAQ;YACR,YAAY;YACZ,gBAAgB;YAChB,UAAU;YACV,WAAW;YACX,cAAc;YACd,WAAW;YACX,YAAY;YACZ,kBAAkB;SACnB,CAAC,CAAC;IAEP,OAAO;QACL,SAAS;QACT,WAAW,EAAE,MAAM,CAAC,WAAW,CAAC;QAChC,YAAY,EAAE,MAAM,CAAC,YAAY,CAAC;QAClC,YAAY;QACZ,cAAc;QACd,iBAAiB;QACjB,WAAW;QACX,cAAc;QACd,iBAAiB;QACjB,gBAAgB;QAChB,QAAQ;QACR,WAAW;QACX,SAAS;QACT,
UAAU;QACV,QAAQ;QACR,MAAM;KACP,CAAC;AACJ,CAAC;AAED,6EAA6E;AAC7E,SAAS,iBAAiB,CAAC,GAW1B;IACC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC;QACrB,KAAK,CAAC,IAAI,CACR,cAAc,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,YAAY;YACjD,GAAG,GAAG,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,YAAY,GAAG,CAAC,cAAc,EAAE,CAChE,CAAC;IACJ,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAClB,KAAK,CAAC,IAAI,CAAC,eAAe,GAAG,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,WAAW,EAAE,CAAC,CAAC;IAC3E,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC;QACtB,KAAK,CAAC,IAAI,CACR,2BAA2B,GAAG,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK;YACzD,GAAG,GAAG,CAAC,kBAAkB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACzC,CAAC;IACJ,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,gBAAgB,EAAE,CAAC;QAC1B,KAAK,CAAC,IAAI,CAAC,wCAAwC,CAAC,CAAC;IACvD,CAAC;IACD,OAAO,iBAAiB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;AAC7C,CAAC"}
@@ -0,0 +1,37 @@
1
+ import type { RepoProfile } from './types.js';
/**
 * planner.ts — turns a task string into an ordered list of plan steps. The
 * baseline plan is a generic map → inspect → patch → verify loop, with the
 * repository summary baked in as data for downstream context.
 *
 * @param profile the repository profile whose summary is baked into the template
 * @returns a string — presumably the source text of the generated planner.ts;
 *          the implementation is not visible in this declaration file
 */
export declare function plannerTemplate(profile: RepoProfile): string;
/**
 * context_builder.ts — ranks candidate files by lexical overlap with the task
 * terms and returns the top slice as context items.
 *
 * @returns a string — presumably the template's source text (TODO confirm)
 */
export declare function contextBuilderTemplate(): string;
/**
 * reviewer.ts — flags changed files that intersect an injected risk-file list
 * and escalates severity when tests have failed. No inline pattern matching on
 * sensitive words; the risk set is passed in as data.
 *
 * @returns a string — presumably the template's source text (TODO confirm)
 */
export declare function reviewerTemplate(): string;
/**
 * retry_policy.ts — decides whether to retry an attempt based on a symbolic
 * failure classification (an injected enum), never by scanning raw output.
 *
 * @returns a string — presumably the template's source text (TODO confirm)
 */
export declare function retryPolicyTemplate(): string;
/**
 * tool_policy.ts — expresses the tool policy over symbolic command kinds, with
 * an allow-list and a deterministic ordering. No raw shell strings appear.
 *
 * @returns a string — presumably the template's source text (TODO confirm)
 */
export declare function toolPolicyTemplate(): string;
/**
 * memory_policy.ts — decides whether an outcome record is worth remembering.
 *
 * @returns a string — presumably the template's source text (TODO confirm)
 */
export declare function memoryPolicyTemplate(): string;
/**
 * score_policy.ts — the weight vector folded over the positive scoring terms.
 *
 * @returns a string — presumably the template's source text (TODO confirm)
 */
export declare function scorePolicyTemplate(): string;
37
+ //# sourceMappingURL=templates.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"templates.d.ts","sourceRoot":"","sources":["../src/templates.ts"],"names":[],"mappings":"AAaA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAE9C;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,OAAO,EAAE,WAAW,GAAG,MAAM,CAkC5D;AAED;;;GAGG;AACH,wBAAgB,sBAAsB,IAAI,MAAM,CAsC/C;AAED;;;;GAIG;AACH,wBAAgB,gBAAgB,IAAI,MAAM,CAqDzC;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,IAAI,MAAM,CAkD5C;AAED;;;GAGG;AACH,wBAAgB,kBAAkB,IAAI,MAAM,CA+B3C;AAED;;GAEG;AACH,wBAAgB,oBAAoB,IAAI,MAAM,CA0B7C;AAED;;GAEG;AACH,wBAAgB,mBAAmB,IAAI,MAAM,CA6B5C"}