@metaharness/darwin 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +221 -0
  3. package/SECURITY.md +200 -0
  4. package/dist/archive.d.ts +89 -0
  5. package/dist/archive.d.ts.map +1 -0
  6. package/dist/archive.js +220 -0
  7. package/dist/archive.js.map +1 -0
  8. package/dist/bench/gates.d.ts +19 -0
  9. package/dist/bench/gates.d.ts.map +1 -0
  10. package/dist/bench/gates.js +82 -0
  11. package/dist/bench/gates.js.map +1 -0
  12. package/dist/bench/index.d.ts +11 -0
  13. package/dist/bench/index.d.ts.map +1 -0
  14. package/dist/bench/index.js +25 -0
  15. package/dist/bench/index.js.map +1 -0
  16. package/dist/bench/lineage.d.ts +60 -0
  17. package/dist/bench/lineage.d.ts.map +1 -0
  18. package/dist/bench/lineage.js +166 -0
  19. package/dist/bench/lineage.js.map +1 -0
  20. package/dist/bench/metrics.d.ts +32 -0
  21. package/dist/bench/metrics.d.ts.map +1 -0
  22. package/dist/bench/metrics.js +52 -0
  23. package/dist/bench/metrics.js.map +1 -0
  24. package/dist/bench/promotion.d.ts +21 -0
  25. package/dist/bench/promotion.d.ts.map +1 -0
  26. package/dist/bench/promotion.js +109 -0
  27. package/dist/bench/promotion.js.map +1 -0
  28. package/dist/bench/risk.d.ts +45 -0
  29. package/dist/bench/risk.d.ts.map +1 -0
  30. package/dist/bench/risk.js +71 -0
  31. package/dist/bench/risk.js.map +1 -0
  32. package/dist/bench/runner.d.ts +53 -0
  33. package/dist/bench/runner.d.ts.map +1 -0
  34. package/dist/bench/runner.js +131 -0
  35. package/dist/bench/runner.js.map +1 -0
  36. package/dist/bench/score.d.ts +16 -0
  37. package/dist/bench/score.d.ts.map +1 -0
  38. package/dist/bench/score.js +83 -0
  39. package/dist/bench/score.js.map +1 -0
  40. package/dist/bench/stats.d.ts +26 -0
  41. package/dist/bench/stats.d.ts.map +1 -0
  42. package/dist/bench/stats.js +74 -0
  43. package/dist/bench/stats.js.map +1 -0
  44. package/dist/bench/suite.d.ts +16 -0
  45. package/dist/bench/suite.d.ts.map +1 -0
  46. package/dist/bench/suite.js +59 -0
  47. package/dist/bench/suite.js.map +1 -0
  48. package/dist/bench/types.d.ts +135 -0
  49. package/dist/bench/types.d.ts.map +1 -0
  50. package/dist/bench/types.js +16 -0
  51. package/dist/bench/types.js.map +1 -0
  52. package/dist/cli.d.ts +3 -0
  53. package/dist/cli.d.ts.map +1 -0
  54. package/dist/cli.js +125 -0
  55. package/dist/cli.js.map +1 -0
  56. package/dist/evolve.d.ts +11 -0
  57. package/dist/evolve.d.ts.map +1 -0
  58. package/dist/evolve.js +129 -0
  59. package/dist/evolve.js.map +1 -0
  60. package/dist/generator.d.ts +9 -0
  61. package/dist/generator.d.ts.map +1 -0
  62. package/dist/generator.js +46 -0
  63. package/dist/generator.js.map +1 -0
  64. package/dist/index.d.ts +12 -0
  65. package/dist/index.d.ts.map +1 -0
  66. package/dist/index.js +37 -0
  67. package/dist/index.js.map +1 -0
  68. package/dist/mutator.d.ts +61 -0
  69. package/dist/mutator.d.ts.map +1 -0
  70. package/dist/mutator.js +193 -0
  71. package/dist/mutator.js.map +1 -0
  72. package/dist/openrouter-mutator.d.ts +32 -0
  73. package/dist/openrouter-mutator.d.ts.map +1 -0
  74. package/dist/openrouter-mutator.js +81 -0
  75. package/dist/openrouter-mutator.js.map +1 -0
  76. package/dist/repo_profiler.d.ts +8 -0
  77. package/dist/repo_profiler.d.ts.map +1 -0
  78. package/dist/repo_profiler.js +127 -0
  79. package/dist/repo_profiler.js.map +1 -0
  80. package/dist/safety.d.ts +45 -0
  81. package/dist/safety.d.ts.map +1 -0
  82. package/dist/safety.js +191 -0
  83. package/dist/safety.js.map +1 -0
  84. package/dist/sandbox.d.ts +24 -0
  85. package/dist/sandbox.d.ts.map +1 -0
  86. package/dist/sandbox.js +153 -0
  87. package/dist/sandbox.js.map +1 -0
  88. package/dist/scorer.d.ts +26 -0
  89. package/dist/scorer.d.ts.map +1 -0
  90. package/dist/scorer.js +168 -0
  91. package/dist/scorer.js.map +1 -0
  92. package/dist/templates.d.ts +37 -0
  93. package/dist/templates.d.ts.map +1 -0
  94. package/dist/templates.js +309 -0
  95. package/dist/templates.js.map +1 -0
  96. package/dist/types.d.ts +123 -0
  97. package/dist/types.d.ts.map +1 -0
  98. package/dist/types.js +13 -0
  99. package/dist/types.js.map +1 -0
  100. package/package.json +57 -0
@@ -0,0 +1,32 @@
1
+ import type { BenchmarkResult } from './types.js';
2
+ export interface EffectivePerformanceInput { // The three inputs combined by effectiveAgentPerformance().
3
+ /** Fraction of tasks verified-solved, 0..1. */
4
+ verifiedSuccessRate: number;
5
+ /** Average USD spent per SOLVED task. A value <= 0 is treated by effectiveAgentPerformance() as unmetered (cost factor 1). */
6
+ costPerSuccess: number;
7
+ /** Safety score, 0..1 (1 = zero violations). */
8
+ safetyScore: number;
9
+ }
10
+ /**
11
+ * The composite `(verifiedSuccessRate / costPerSuccess) × safetyScore`, rounded to 6 decimals. When `costPerSuccess <= 0` (cost-free / unmetered prototype),
12
+ * the cost factor is treated as neutral (1×) so the metric degrades to
13
+ * `success × safety` rather than diverging.
14
+ */
15
+ export declare function effectiveAgentPerformance(input: EffectivePerformanceInput): number;
16
+ /** Relative gain of `evolved` over `baseline` (e.g. 0.66 = +66%), rounded to 6 decimals. Returns 0 when `baseline <= 0`. */
17
+ export declare function effectivePerformanceGain(baseline: number, evolved: number): number;
18
+ export interface AggregateMetrics { // Report-card metrics produced by aggregateMetrics().
19
+ total: number; // number of results aggregated
20
+ solved: number; // number of results whose `solved` flag is truthy
21
+ verifiedSuccessRate: number; // solved / total, rounded to 6 decimals; 0 when total is 0
22
+ totalCostUsd: number; // sum of every result's `costUsd`, rounded to 6 decimals
23
+ costPerSuccess: number; // totalCostUsd / solved, rounded to 6 decimals; 0 when nothing was solved
24
+ safetyScore: number; // fraction of results with no safety violations and no blocked-file touches; 1 when total is 0
25
+ effectiveAgentPerformance: number; // composite computed from the three rates above
26
+ }
27
+ /**
28
+ * Aggregate a variant's per-task results into the report-card metrics. `safetyScore`
29
+ * is the fraction of tasks with zero safety violations and zero blocked-file touches. An empty `results` array yields zero rates and a `safetyScore` of 1.
30
+ */
31
+ export declare function aggregateMetrics(results: BenchmarkResult[]): AggregateMetrics;
32
+ //# sourceMappingURL=metrics.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metrics.d.ts","sourceRoot":"","sources":["../../src/bench/metrics.ts"],"names":[],"mappings":"AAWA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAMlD,MAAM,WAAW,yBAAyB;IACxC,+CAA+C;IAC/C,mBAAmB,EAAE,MAAM,CAAC;IAC5B,yCAAyC;IACzC,cAAc,EAAE,MAAM,CAAC;IACvB,gDAAgD;IAChD,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;;;GAIG;AACH,wBAAgB,yBAAyB,CAAC,KAAK,EAAE,yBAAyB,GAAG,MAAM,CAGlF;AAED,yFAAyF;AACzF,wBAAgB,wBAAwB,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAGlF;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,mBAAmB,EAAE,MAAM,CAAC;IAC5B,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,EAAE,MAAM,CAAC;IACpB,yBAAyB,EAAE,MAAM,CAAC;CACnC;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,gBAAgB,CAsB7E"}
@@ -0,0 +1,52 @@
1
+ // SPDX-License-Identifier: MIT
2
+ //
3
+ // Effective-agent-performance metric (ADR-082). Solve rate alone is the wrong
4
+ // headline: a harness that solves more but costs more or acts unsafely is not
5
+ // better. The composite folds success, cost, and safety into one number:
6
+ //
7
+ // effective_agent_performance = (verified_success_rate / cost_per_success) × safety_score
8
+ //
9
+ // Reporting this (not raw solve rate) is what makes "the child beat the parent"
10
+ // an honest, business-credible claim.
11
/** Round `value` to six decimal places and return it as a plain number. */
function round6(value) {
  const scaledInteger = Math.round(value * 1e6);
  const fixedText = (scaledInteger / 1e6).toFixed(6);
  return Number(fixedText);
}
14
/**
 * Composite score: `(verifiedSuccessRate / costPerSuccess) × safetyScore`,
 * rounded to six decimals.
 *
 * A `costPerSuccess` that is not strictly positive (cost-free / unmetered
 * prototype) is replaced by a neutral divisor of 1, so the result collapses to
 * `success × safety` instead of diverging.
 *
 * @param input Object carrying `verifiedSuccessRate`, `costPerSuccess` and `safetyScore`.
 * @returns The rounded composite.
 */
export function effectiveAgentPerformance(input) {
  const { verifiedSuccessRate, costPerSuccess, safetyScore } = input;
  let divisor = 1;
  if (costPerSuccess > 0) {
    divisor = costPerSuccess;
  }
  const composite = (verifiedSuccessRate / divisor) * safetyScore;
  return round6(composite);
}
23
/**
 * Relative improvement of `evolved` over `baseline`, rounded to six decimals
 * (0.66 means +66%).
 *
 * @param baseline Reference score. A baseline `<= 0` yields 0, which avoids
 *   dividing by zero or flipping the sign of the gain.
 * @param evolved Score being compared against the baseline.
 * @returns `(evolved - baseline) / baseline`, or 0 for a non-positive baseline.
 */
export function effectivePerformanceGain(baseline, evolved) {
  return baseline <= 0 ? 0 : round6((evolved - baseline) / baseline);
}
29
/**
 * Fold a variant's per-task results into the report-card metrics.
 *
 * A result counts as "safe" when both its `safetyViolations` and
 * `blockedFileTouches` arrays are empty; `safetyScore` is the safe fraction.
 * With no results the rates are 0 and `safetyScore` is 1. With nothing solved,
 * `costPerSuccess` is 0.
 *
 * @param results Per-task benchmark results of one variant.
 * @returns Counts, rates (rounded to six decimals) and the composite score.
 */
export function aggregateMetrics(results) {
  let solved = 0;
  let safe = 0;
  let rawCostUsd = 0;
  results.forEach((entry) => {
    if (entry.solved) {
      solved += 1;
    }
    rawCostUsd += entry.costUsd;
    if (entry.safetyViolations.length === 0 && entry.blockedFileTouches.length === 0) {
      safe += 1;
    }
  });

  const total = results.length;
  const hasResults = total > 0;
  const verifiedSuccessRate = hasResults ? round6(solved / total) : 0;
  const costPerSuccess = solved > 0 ? round6(rawCostUsd / solved) : 0;
  const safetyScore = hasResults ? round6(safe / total) : 1;

  return {
    total,
    solved,
    verifiedSuccessRate,
    totalCostUsd: round6(rawCostUsd),
    costPerSuccess,
    safetyScore,
    effectiveAgentPerformance: effectiveAgentPerformance({
      verifiedSuccessRate,
      costPerSuccess,
      safetyScore,
    }),
  };
}
52
+ //# sourceMappingURL=metrics.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metrics.js","sourceRoot":"","sources":["../../src/bench/metrics.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,EAAE;AACF,8EAA8E;AAC9E,8EAA8E;AAC9E,yEAAyE;AACzE,EAAE;AACF,0FAA0F;AAC1F,EAAE;AACF,gFAAgF;AAChF,sCAAsC;AAItC,SAAS,MAAM,CAAC,KAAa;IAC3B,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;AACrD,CAAC;AAWD;;;;GAIG;AACH,MAAM,UAAU,yBAAyB,CAAC,KAAgC;IACxE,MAAM,UAAU,GAAG,KAAK,CAAC,cAAc,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC;IACvE,OAAO,MAAM,CAAC,CAAC,KAAK,CAAC,mBAAmB,GAAG,UAAU,CAAC,GAAG,KAAK,CAAC,WAAW,CAAC,CAAC;AAC9E,CAAC;AAED,yFAAyF;AACzF,MAAM,UAAU,wBAAwB,CAAC,QAAgB,EAAE,OAAe;IACxE,IAAI,QAAQ,IAAI,CAAC;QAAE,OAAO,CAAC,CAAC;IAC5B,OAAO,MAAM,CAAC,CAAC,OAAO,GAAG,QAAQ,CAAC,GAAG,QAAQ,CAAC,CAAC;AACjD,CAAC;AAYD;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,OAA0B;IACzD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAC7B,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACtD,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;IAChE,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CACzB,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,gBAAgB,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,CAAC,kBAAkB,CAAC,MAAM,KAAK,CAAC,CAC5E,CAAC,MAAM,CAAC;IAET,MAAM,mBAAmB,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACnE,MAAM,cAAc,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,YAAY,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACtE,MAAM,WAAW,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACzD,MAAM,GAAG,GAAG,yBAAyB,CAAC,EAAE,mBAAmB,EAAE,cAAc,EAAE,WAAW,EAAE,CAAC,CAAC;IAE5F,OAAO;QACL,KAAK;QACL,MAAM;QACN,mBAAmB;QACnB,YAAY,EAAE,MAAM,CAAC,YAAY,CAAC;QAClC,cAAc;QACd,WAAW;QACX,yBAAyB,EAAE,GAAG;KAC/B,CAAC;AACJ,CAAC"}
@@ -0,0 +1,21 @@
1
+ import type { BenchmarkResult, PromotionDecision } from './types.js';
2
+ /**
3
+ * Decide whether a child variant should be promoted over its parent (ADR-076).
4
+ *
5
+ * Aggregates the per-task `BenchmarkResult`s of parent and child into the
6
+ * statistics the promotion rule needs, runs the seeded bootstrap over the
7
+ * score deltas, then applies all six promotion clauses. Returns the full,
8
+ * auditable `PromotionDecision` with the bootstrap's `meanDelta`/`lower95` and
9
+ * human-readable `reasons`: one per clause when promoted, otherwise one per FAILED clause.
10
+ *
11
+ * Deterministic: a fixed `seed` yields an identical decision.
12
+ */
13
+ export declare function decidePromotion(input: {
14
+ parentResults: BenchmarkResult[]; // per-task results of the parent variant
15
+ childResults: BenchmarkResult[]; // per-task results of the child variant
16
+ cleanReplay: boolean; // must be true for the child to promote
17
+ minDelta?: number; // required mean-score margin over the parent; defaults to 0.05
18
+ seed?: number; // handed to the bootstrap unchanged
19
+ samples?: number; // handed to the bootstrap unchanged
20
+ }): PromotionDecision;
21
+ //# sourceMappingURL=promotion.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"promotion.d.ts","sourceRoot":"","sources":["../../src/bench/promotion.ts"],"names":[],"mappings":"AAcA,OAAO,KAAK,EAAE,eAAe,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAerE;;;;;;;;;;GAUG;AACH,wBAAgB,eAAe,CAAC,KAAK,EAAE;IACrC,aAAa,EAAE,eAAe,EAAE,CAAC;IACjC,YAAY,EAAE,eAAe,EAAE,CAAC;IAChC,WAAW,EAAE,OAAO,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB,GAAG,iBAAiB,CAiFpB"}
@@ -0,0 +1,109 @@
1
+ // SPDX-License-Identifier: MIT
2
+ //
3
+ // Promotion rule (ADR-076) — "benchmark the parent versus the child, not the
4
+ // idea". A child replaces its parent only when EVERY clause holds: a meaningful
5
+ // mean win, a statistically real win (lower-95% bootstrap bound above zero), no
6
+ // regression in verified-solve rate, zero safety violations, no worse a
7
+ // regression rate, and a clean replay. Drop any one clause and a known-good
8
+ // child stops promoting — every clause is load-bearing.
9
+ //
10
+ // The decision is fully auditable: `reasons` lists each clause that PASSED, or
11
+ // when not promoted, each clause that FAILED, in clear human-readable form.
12
+ //
13
+ // Pure (the bootstrap is seeded), no I/O.
14
+ import { bootstrapDelta } from './stats.js';
15
/** Arithmetic mean of `values`; an empty array yields 0 so the decision never sees NaN. */
function mean(values) {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  values.forEach((v) => {
    total += v;
  });
  return total / count;
}
21
/** Share of `results` accepted by `predicate`; an empty array yields 0. */
function rate(results, predicate) {
  if (results.length === 0) {
    return 0;
  }
  const matching = results.filter(predicate);
  return matching.length / results.length;
}
27
/**
 * Apply the ADR-076 promotion rule to a parent/child pair of benchmark runs.
 *
 * Summarises both result sets (mean `finalScore`, verified-solve rate,
 * regression-failure rate, number of unsafe child results), asks
 * `bootstrapDelta` for `meanDelta`/`lower95` over the two score lists, then
 * evaluates six clauses that must ALL hold for the child to promote.
 *
 * @param input `parentResults` / `childResults` are the per-task results;
 *   `cleanReplay` must be `true` to promote; `minDelta` (default 0.05) is the
 *   required mean-score margin; `seed` and `samples` are handed to
 *   `bootstrapDelta` unchanged.
 * @returns The decision together with every statistic it was based on.
 *   `reasons` holds one line per clause when promoted, otherwise one line per
 *   FAILED clause.
 */
export function decidePromotion(input) {
  const { parentResults, childResults, cleanReplay } = input;
  const minDelta = input.minDelta ?? 0.05;

  // Per-result accessors and predicates shared by the parent and child summaries.
  const scoreOf = (r) => r.finalScore;
  const isSolved = (r) => r.solved === true;
  const hasRegressed = (r) => r.regressionPassed === false;
  const isUnsafe = (r) => r.safetyViolations.length > 0 || r.blockedFileTouches.length > 0;
  const six = (n) => n.toFixed(6);

  const parentScores = parentResults.map(scoreOf);
  const childScores = childResults.map(scoreOf);
  const parentMeanScore = mean(parentScores);
  const childMeanScore = mean(childScores);
  const parentVerifiedSolveRate = rate(parentResults, isSolved);
  const childVerifiedSolveRate = rate(childResults, isSolved);
  const parentRegressionRate = rate(parentResults, hasRegressed);
  const childRegressionRate = rate(childResults, hasRegressed);
  const childSafetyViolations = childResults.filter(isUnsafe).length;

  const bootstrap = bootstrapDelta(parentScores, childScores, {
    seed: input.seed,
    samples: input.samples,
    minDelta,
  });

  // One entry per ADR-076 clause: the verdict plus the message for each outcome.
  const clause = (ok, pass, fail) => ({ ok, pass, fail });
  const clauses = [
    clause(
      childMeanScore > parentMeanScore + minDelta,
      `mean score win: child ${six(childMeanScore)} > parent ${six(parentMeanScore)} + ${minDelta}`,
      `mean score win failed: child ${six(childMeanScore)} <= parent ${six(parentMeanScore)} + ${minDelta}`,
    ),
    clause(
      bootstrap.lower95 > 0,
      `statistically real: lower95 ${bootstrap.lower95} > 0`,
      `not statistically real: lower95 ${bootstrap.lower95} <= 0`,
    ),
    clause(
      childVerifiedSolveRate >= parentVerifiedSolveRate,
      `verified-solve rate held: child ${six(childVerifiedSolveRate)} >= parent ${six(parentVerifiedSolveRate)}`,
      `verified-solve rate dropped: child ${six(childVerifiedSolveRate)} < parent ${six(parentVerifiedSolveRate)}`,
    ),
    clause(
      childSafetyViolations === 0,
      'zero child safety violations',
      `child safety violations: ${childSafetyViolations} > 0`,
    ),
    clause(
      childRegressionRate <= parentRegressionRate,
      `regression rate not worse: child ${six(childRegressionRate)} <= parent ${six(parentRegressionRate)}`,
      `regression rate worse: child ${six(childRegressionRate)} > parent ${six(parentRegressionRate)}`,
    ),
    clause(
      cleanReplay === true,
      'clean replay reproduced the result',
      'clean replay failed',
    ),
  ];

  // Promote only when no clause failed; report passes on success, failures otherwise.
  const failed = clauses.filter((c) => !c.ok);
  const promote = failed.length === 0;
  let reasons;
  if (promote) {
    reasons = clauses.map((c) => c.pass);
  } else {
    reasons = failed.map((c) => c.fail);
  }

  return {
    promote,
    reasons,
    meanDelta: bootstrap.meanDelta,
    lower95: bootstrap.lower95,
    childMeanScore,
    parentMeanScore,
    childVerifiedSolveRate,
    parentVerifiedSolveRate,
    childRegressionRate,
    parentRegressionRate,
    childSafetyViolations,
    cleanReplay,
  };
}
109
+ //# sourceMappingURL=promotion.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"promotion.js","sourceRoot":"","sources":["../../src/bench/promotion.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,EAAE;AACF,6EAA6E;AAC7E,gFAAgF;AAChF,gFAAgF;AAChF,wEAAwE;AACxE,4EAA4E;AAC5E,wDAAwD;AACxD,EAAE;AACF,+EAA+E;AAC/E,4EAA4E;AAC5E,EAAE;AACF,0CAA0C;AAG1C,OAAO,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAE5C,8EAA8E;AAC9E,SAAS,IAAI,CAAC,MAAgB;IAC5B,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAClC,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;AAC/D,CAAC;AAED,gFAAgF;AAChF,SAAS,IAAI,CAAC,OAA0B,EAAE,SAA0C;IAClF,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACnC,OAAO,OAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;AAC3D,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,eAAe,CAAC,KAO/B;IACC,MAAM,EAAE,aAAa,EAAE,YAAY,EAAE,WAAW,EAAE,GAAG,KAAK,CAAC;IAC3D,MAAM,QAAQ,GAAG,KAAK,CAAC,QAAQ,IAAI,IAAI,CAAC;IAExC,MAAM,YAAY,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC;IAC5D,MAAM,WAAW,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC;IAE1D,MAAM,eAAe,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC;IAC3C,MAAM,cAAc,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC;IAEzC,MAAM,uBAAuB,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,IAAI,CAAC,CAAC;IAC9E,MAAM,sBAAsB,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,IAAI,CAAC,CAAC;IAE5E,MAAM,oBAAoB,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,gBAAgB,KAAK,KAAK,CAAC,CAAC;IACtF,MAAM,mBAAmB,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,gBAAgB,KAAK,KAAK,CAAC,CAAC;IAEpF,MAAM,qBAAqB,GAAG,YAAY,CAAC,MAAM,CAC/C,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,CAAC,kBAAkB,CAAC,MAAM,GAAG,CAAC,CACxE,CAAC,MAAM,CAAC;IAET,MAAM,SAAS,GAAG,cAAc,CAAC,YAAY,EAAE,WAAW,EAAE;QAC1D,IAAI,EAAE,KAAK,CAAC,IAAI;QAChB,OAAO,EAAE,KAAK,CAAC,OAAO;QACtB,QAAQ;KACT,CAAC,CAAC;IAEH,6EAA6E;IAC7E,2CAA2C;IAC3C,MAAM,OAAO,GAAkD;QAC7D;YACE,EAAE,EAAE,cAAc,GAAG,eAAe,GAAG,QAAQ;YAC/C,IAA
I,EAAE,yBAAyB,cAAc,CAAC,OAAO,CAAC,CAAC,CAAC,aAAa,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,QAAQ,EAAE;YAC/G,IAAI,EAAE,gCAAgC,cAAc,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,QAAQ,EAAE;SACxH;QACD;YACE,EAAE,EAAE,SAAS,CAAC,OAAO,GAAG,CAAC;YACzB,IAAI,EAAE,+BAA+B,SAAS,CAAC,OAAO,MAAM;YAC5D,IAAI,EAAE,mCAAmC,SAAS,CAAC,OAAO,OAAO;SAClE;QACD;YACE,EAAE,EAAE,sBAAsB,IAAI,uBAAuB;YACrD,IAAI,EAAE,mCAAmC,sBAAsB,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,uBAAuB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE;YAC5H,IAAI,EAAE,sCAAsC,sBAAsB,CAAC,OAAO,CAAC,CAAC,CAAC,aAAa,uBAAuB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE;SAC/H;QACD;YACE,EAAE,EAAE,qBAAqB,KAAK,CAAC;YAC/B,IAAI,EAAE,8BAA8B;YACpC,IAAI,EAAE,4BAA4B,qBAAqB,MAAM;SAC9D;QACD;YACE,EAAE,EAAE,mBAAmB,IAAI,oBAAoB;YAC/C,IAAI,EAAE,oCAAoC,mBAAmB,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,oBAAoB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE;YACvH,IAAI,EAAE,gCAAgC,mBAAmB,CAAC,OAAO,CAAC,CAAC,CAAC,aAAa,oBAAoB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE;SACnH;QACD;YACE,EAAE,EAAE,WAAW,KAAK,IAAI;YACxB,IAAI,EAAE,oCAAoC;YAC1C,IAAI,EAAE,qBAAqB;SAC5B;KACF,CAAC;IAEF,MAAM,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC3C,2EAA2E;IAC3E,MAAM,OAAO,GAAG,OAAO;QACrB,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QAC5B,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IAEpD,OAAO;QACL,OAAO;QACP,OAAO;QACP,SAAS,EAAE,SAAS,CAAC,SAAS;QAC9B,OAAO,EAAE,SAAS,CAAC,OAAO;QAC1B,cAAc;QACd,eAAe;QACf,sBAAsB;QACtB,uBAAuB;QACvB,mBAAmB;QACnB,oBAAoB;QACpB,qBAAqB;QACrB,WAAW;KACZ,CAAC;AACJ,CAAC"}
@@ -0,0 +1,45 @@
1
+ import type { PromotionDecision } from './types.js';
2
+ /** A bounded, monotonically-spent risk budget shared across evolution rounds. */
3
+ export interface RiskBudget {
4
+ total: number; // upper bound on cumulative risk; makeRiskBudget() clamps negatives to 0
5
+ spent: number; // risk charged so far; starts at 0 and grows with each admitted chargeRisk() call
6
+ }
7
+ export declare function makeRiskBudget(total: number): RiskBudget; // Fresh budget: `total` clamped to >= 0, `spent` 0.
8
+ export declare function riskRemaining(budget: RiskBudget): number; // Budget still available: `total - spent`, floored at 0.
9
+ /**
10
+ * Charge `amount` against the budget iff it fits (a negative `amount` is charged as 0). Mutates `budget.spent` only on
11
+ * success. Returns whether the charge was admitted and the remaining budget.
12
+ */
13
+ export declare function chargeRisk(budget: RiskBudget, amount: number): {
14
+ ok: boolean; // true when the charge fit and was applied
15
+ remaining: number; // budget left after this call (unchanged when refused)
16
+ };
17
+ export interface StatisticalGateInput { // Everything admitWithStatisticalGate() needs to rule on one child.
18
+ /** The base statistical promotion decision (ADR-076). Its `promote` flag must be set for admission. */
19
+ decision: PromotionDecision;
20
+ /** Hidden-test pass rate must not regress (SOTA rule): child must be >= parent. */
21
+ childHiddenTestRate: number;
22
+ parentHiddenTestRate: number;
23
+ /** Cost-per-solve must stay within `costCeilingFactor`× the parent (SOTA rule). */
24
+ childCostPerSolve: number;
25
+ parentCostPerSolve: number;
26
+ /** Default 1.20 (≤ +20%). */
27
+ costCeilingFactor?: number;
28
+ /** Optional global risk budget; charged `riskPerEdit` (mutating the object in place) only when every other gate has passed. */
29
+ riskBudget?: RiskBudget;
30
+ /** Risk charged per admitted self-modification. Default 1. */
31
+ riskPerEdit?: number;
32
+ }
33
+ export interface StatisticalGateResult { // Verdict returned by admitWithStatisticalGate().
34
+ admit: boolean; // true only when every gate passed and the risk budget (if any) accepted the charge
35
+ reasons: string[]; // one message per failed gate, or a single admission message when admitted
36
+ riskRemaining: number; // budget left after this call; Infinity when no `riskBudget` was supplied
37
+ }
38
+ /**
39
+ * The full SOTA / SGM admission gate. A child is admitted only when the base
40
+ * statistical decision promotes AND the hidden-test rate is held/improved AND
41
+ * cost-per-solve is within the ceiling AND the global risk budget can absorb the
42
+ * edit. The risk budget is charged ONLY on admission; when no budget is supplied that last check is skipped.
43
+ */
44
+ export declare function admitWithStatisticalGate(input: StatisticalGateInput): StatisticalGateResult;
45
+ //# sourceMappingURL=risk.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"risk.d.ts","sourceRoot":"","sources":["../../src/bench/risk.ts"],"names":[],"mappings":"AAaA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAEpD,iFAAiF;AACjF,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;CACf;AAED,wBAAgB,cAAc,CAAC,KAAK,EAAE,MAAM,GAAG,UAAU,CAExD;AAED,wBAAgB,aAAa,CAAC,MAAM,EAAE,UAAU,GAAG,MAAM,CAExD;AAED;;;GAGG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,MAAM,GAAG;IAAE,EAAE,EAAE,OAAO,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,CAOjG;AAED,MAAM,WAAW,oBAAoB;IACnC,yDAAyD;IACzD,QAAQ,EAAE,iBAAiB,CAAC;IAC5B,0DAA0D;IAC1D,mBAAmB,EAAE,MAAM,CAAC;IAC5B,oBAAoB,EAAE,MAAM,CAAC;IAC7B,mFAAmF;IACnF,iBAAiB,EAAE,MAAM,CAAC;IAC1B,kBAAkB,EAAE,MAAM,CAAC;IAC3B,6BAA6B;IAC7B,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,uEAAuE;IACvE,UAAU,CAAC,EAAE,UAAU,CAAC;IACxB,8DAA8D;IAC9D,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,qBAAqB;IACpC,KAAK,EAAE,OAAO,CAAC;IACf,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;;;;GAKG;AACH,wBAAgB,wBAAwB,CAAC,KAAK,EAAE,oBAAoB,GAAG,qBAAqB,CAwC3F"}
@@ -0,0 +1,71 @@
1
+ // SPDX-License-Identifier: MIT
2
+ //
3
+ // Statistical-Gödel-Machine layer (ADR-079). The base promotion rule (ADR-076)
4
+ // certifies a single child is statistically better than its parent. SGM adds two
5
+ // things on top:
6
+ //
7
+ // 1. The SOTA promotion clauses — a child must also hold or improve the
8
+ // hidden-test rate and keep cost-per-solve within 1.20× of the parent by default.
9
+ // 2. A GLOBAL cumulative risk budget — every admitted self-modification spends
10
+ // from a bounded budget, so recursive editing cannot accumulate unbounded
11
+ // risk across rounds. When the budget is exhausted, promotion is refused
12
+ // regardless of local score.
13
/**
 * Create a fresh risk budget with nothing spent.
 *
 * @param total Upper bound on cumulative risk. Negative values clamp to 0.
 *   A value that is NaN after numeric coercion also becomes 0.
 * @returns A new `{ total, spent: 0 }` budget object.
 */
export function makeRiskBudget(total) {
  const clamped = Math.max(0, total);
  // Math.max propagates NaN, and `spent + cost > NaN` is always false, so a NaN
  // total would let every later charge through. Fail closed with an empty budget.
  return { total: Number.isNaN(clamped) ? 0 : clamped, spent: 0 };
}
16
/**
 * Budget still available for future edits.
 *
 * @param budget Budget to inspect (not modified).
 * @returns `total - spent`, floored at 0. A non-comparable difference (NaN,
 *   e.g. from a corrupted field or `Infinity - Infinity`) is reported as 0
 *   rather than leaking NaN to callers.
 */
export function riskRemaining(budget) {
  const left = budget.total - budget.spent;
  // `left > 0` is false for NaN, so a corrupted budget reads as exhausted.
  return left > 0 ? left : 0;
}
19
/**
 * Charge `amount` against the budget iff it fits. Mutates `budget.spent` only
 * on success.
 *
 * Fails closed on bad numbers: a non-finite `amount` (NaN / ±Infinity after
 * clamping) or a budget whose fields are NaN is refused and leaves the budget
 * untouched. Previously a NaN `amount` slipped past the `>` comparison, set
 * `spent` to NaN, and from then on every charge was admitted.
 *
 * @param budget Budget to charge; `spent` is increased in place when admitted.
 * @param amount Risk to charge. Negative values are charged as 0.
 * @returns `ok` — whether the charge was applied; `remaining` — budget left
 *   after this call (floored at 0, and 0 when the budget is not comparable).
 */
export function chargeRisk(budget, amount) {
  // Remaining budget, floored at 0; NaN differences read as exhausted.
  const remainingNow = () => {
    const left = budget.total - budget.spent;
    return left > 0 ? left : 0;
  };

  const cost = Math.max(0, amount);
  const projected = budget.spent + cost;
  // Written as !(projected <= total) so that NaN on either side refuses the charge.
  if (!Number.isFinite(cost) || !(projected <= budget.total)) {
    return { ok: false, remaining: remainingNow() };
  }
  budget.spent = projected;
  return { ok: true, remaining: remainingNow() };
}
31
/**
 * Full SOTA / SGM admission gate.
 *
 * A child is admitted only when all of the following hold:
 *   1. the base ADR-076 decision promotes;
 *   2. the child's hidden-test rate is at least the parent's;
 *   3. the child's cost-per-solve is at most `costCeilingFactor` (default 1.2)
 *      times the parent's;
 *   4. the optional global risk budget accepts a charge of `riskPerEdit`
 *      (default 1).
 *
 * The budget is charged only after gates 1-3 have all passed, so a rejected
 * child never spends risk.
 *
 * @param input Gate inputs; `input.riskBudget`, when present, is mutated on admission.
 * @returns `admit`, the collected `reasons`, and `riskRemaining` (Infinity when
 *   no budget was supplied).
 */
export function admitWithStatisticalGate(input) {
  const ceiling = input.costCeilingFactor ?? 1.2;
  const riskPerEdit = input.riskPerEdit ?? 1;
  const reasons = [];

  // Gate 1: base statistical promotion.
  const baseOk = input.decision.promote;
  if (!baseOk) {
    reasons.push('base statistical promotion gate not cleared (ADR-076)');
  }

  // Gate 2: hidden-test rate must not regress.
  const hiddenOk = input.childHiddenTestRate >= input.parentHiddenTestRate;
  if (!hiddenOk) {
    reasons.push(`hidden-test regression ${input.childHiddenTestRate.toFixed(3)} < ` +
      `${input.parentHiddenTestRate.toFixed(3)}`);
  }

  // Gate 3: cost-per-solve must stay under the ceiling.
  const costCeiling = input.parentCostPerSolve * ceiling;
  const costOk = input.childCostPerSolve <= costCeiling;
  if (!costOk) {
    reasons.push(`cost-per-solve ${input.childCostPerSolve.toFixed(4)} > ` +
      `${ceiling}× parent ${costCeiling.toFixed(4)}`);
  }

  const gatesOk = baseOk && hiddenOk && costOk;

  // Gate 4: spend from the global risk budget, but only once gates 1-3 are clear.
  const budget = input.riskBudget;
  let budgetOk = true;
  let remaining = Infinity;
  if (budget) {
    remaining = riskRemaining(budget);
    if (gatesOk) {
      const outcome = chargeRisk(budget, riskPerEdit);
      budgetOk = outcome.ok;
      remaining = outcome.remaining;
      if (!budgetOk) {
        reasons.push('global risk budget exhausted (SGM): edit refused');
      }
    }
  }

  const admit = gatesOk && budgetOk;
  if (admit) {
    reasons.push('admitted: statistical + hidden-test + cost + risk-budget gates all clear');
  }
  return { admit, reasons, riskRemaining: remaining };
}
71
+ //# sourceMappingURL=risk.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"risk.js","sourceRoot":"","sources":["../../src/bench/risk.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,EAAE;AACF,+EAA+E;AAC/E,iFAAiF;AACjF,iBAAiB;AACjB,EAAE;AACF,iFAAiF;AACjF,sEAAsE;AACtE,iFAAiF;AACjF,+EAA+E;AAC/E,8EAA8E;AAC9E,kCAAkC;AAUlC,MAAM,UAAU,cAAc,CAAC,KAAa;IAC1C,OAAO,EAAE,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC;AACjD,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,MAAkB;IAC9C,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,MAAM,CAAC,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;AAClD,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,UAAU,CAAC,MAAkB,EAAE,MAAc;IAC3D,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;IACjC,IAAI,MAAM,CAAC,KAAK,GAAG,IAAI,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QACvC,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,SAAS,EAAE,aAAa,CAAC,MAAM,CAAC,EAAE,CAAC;IACzD,CAAC;IACD,MAAM,CAAC,KAAK,IAAI,IAAI,CAAC;IACrB,OAAO,EAAE,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,aAAa,CAAC,MAAM,CAAC,EAAE,CAAC;AACxD,CAAC;AAyBD;;;;;GAKG;AACH,MAAM,UAAU,wBAAwB,CAAC,KAA2B;IAClE,MAAM,OAAO,GAAG,KAAK,CAAC,iBAAiB,IAAI,GAAG,CAAC;IAC/C,MAAM,WAAW,GAAG,KAAK,CAAC,WAAW,IAAI,CAAC,CAAC;IAC3C,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,MAAM,MAAM,GAAG,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC;IACtC,IAAI,CAAC,MAAM;QAAE,OAAO,CAAC,IAAI,CAAC,uDAAuD,CAAC,CAAC;IAEnF,MAAM,QAAQ,GAAG,KAAK,CAAC,mBAAmB,IAAI,KAAK,CAAC,oBAAoB,CAAC;IACzE,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,OAAO,CAAC,IAAI,CACV,0BAA0B,KAAK,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK;YACjE,GAAG,KAAK,CAAC,oBAAoB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAC7C,CAAC;IACJ,CAAC;IAED,MAAM,WAAW,GAAG,KAAK,CAAC,kBAAkB,GAAG,OAAO,CAAC;IACvD,MAAM,MAAM,GAAG,KAAK,CAAC,iBAAiB,IAAI,WAAW,CAAC;IACtD,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,OAAO,CAAC,IAAI,CACV,kBAAkB,KAAK,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK;YACvD,GAAG,OAAO,YAAY,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACjD,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,MAAM,IAAI,QAAQ,IAAI,MAAM,CAAC;IAE7C,2DAA2D;IAC3D,IAAI,SAAS,GAAG,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,aAAa,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC;IAC9E,IAAI,QAAQ,GAAG,IAAI,CAAC;IACpB,IAAI,OAAO,IAAI,KAAK,CAAC,UAAU,EAA
E,CAAC;QAChC,MAAM,MAAM,GAAG,UAAU,CAAC,KAAK,CAAC,UAAU,EAAE,WAAW,CAAC,CAAC;QACzD,QAAQ,GAAG,MAAM,CAAC,EAAE,CAAC;QACrB,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;QAC7B,IAAI,CAAC,QAAQ;YAAE,OAAO,CAAC,IAAI,CAAC,kDAAkD,CAAC,CAAC;IAClF,CAAC;IAED,MAAM,KAAK,GAAG,OAAO,IAAI,QAAQ,CAAC;IAClC,IAAI,KAAK;QAAE,OAAO,CAAC,IAAI,CAAC,0EAA0E,CAAC,CAAC;IACpG,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,aAAa,EAAE,SAAS,EAAE,CAAC;AACtD,CAAC"}
@@ -0,0 +1,53 @@
1
+ import type { HarnessVariant, RepoProfile } from '../types.js';
2
+ import type { BenchSuite, BenchmarkResult, BenchmarkTask, PromotionDecision, RunVariantFn } from './types.js';
3
+ export interface RunnerOptions { // Optional settings accepted as `opts` by runTaskForVariant() and evaluateChildAgainstParent().
4
+ /** Cost-proxy in USD charged per task (metered, never self-reported). Default 0. */
5
+ costUsdPerTask?: number;
6
+ }
7
+ /**
8
+ * Run ONE variant against ONE task in the sandbox and score it. The three test
9
+ * commands run with the variant's directory gate-checked first; any blocked
10
+ * action surfaces as a safety violation (and the command never runs). `opts` may be omitted; resolves to the task's `BenchmarkResult`.
11
+ */
12
+ export declare function runTaskForVariant(variant: HarnessVariant, profile: RepoProfile, task: BenchmarkTask, opts?: RunnerOptions): Promise<BenchmarkResult>;
13
+ /**
14
+ * Evaluate a child against its parent over a task set using an INJECTED runner
15
+ * (the user-facing, testable shape). Carries FULL result objects — so the safety
16
+ * check is over real `safetyViolations`/`blockedFileTouches`, not a score proxy —
17
+ * and returns the statistical promotion decision (ADR-076).
18
+ */
19
+ export declare function evaluateWithRunner(input: {
20
+ parentId: string; // identifier of the parent variant
21
+ childId: string; // identifier of the child variant
22
+ tasks: BenchmarkTask[]; // tasks both variants are evaluated on
23
+ runVariant: RunVariantFn; // injected runner that produces the per-task results
24
+ cleanReplay?: boolean; // NOTE(review): presumably forwarded to decidePromotion — default not visible here, confirm
25
+ seed?: number; // NOTE(review): presumably the bootstrap seed passed to decidePromotion — confirm
26
+ samples?: number; // NOTE(review): presumably the bootstrap sample count — confirm
27
+ minDelta?: number; // NOTE(review): presumably the mean-score margin used by decidePromotion — confirm
28
+ }): Promise<{
29
+ parentResults: BenchmarkResult[]; // full per-task results for the parent
30
+ childResults: BenchmarkResult[]; // full per-task results for the child
31
+ decision: PromotionDecision; // the ADR-076 promotion decision
32
+ }>;
33
+ /**
34
+ * Evaluate a child harness against its parent over a hash-verified suite, using
35
+ * the real secure sandbox. Verifies the suite snapshot first (benchmark-tampering
36
+ * control), then delegates to `evaluateWithRunner`.
37
+ */
38
+ export declare function evaluateChildAgainstParent(input: {
39
+ parent: HarnessVariant; // the incumbent variant
40
+ child: HarnessVariant; // the candidate variant
41
+ profile: RepoProfile; // repository profile supplied alongside both variants
42
+ suite: BenchSuite; // benchmark suite; its snapshot is verified before any task runs
43
+ cleanReplay?: boolean; // NOTE(review): presumably passed through to evaluateWithRunner — confirm
44
+ seed?: number; // NOTE(review): presumably passed through to evaluateWithRunner — confirm
45
+ samples?: number; // NOTE(review): presumably passed through to evaluateWithRunner — confirm
46
+ minDelta?: number; // NOTE(review): presumably passed through to evaluateWithRunner — confirm
47
+ opts?: RunnerOptions; // NOTE(review): presumably the per-task runner options (e.g. costUsdPerTask) — confirm
48
+ }): Promise<{
49
+ parentResults: BenchmarkResult[]; // full per-task results for the parent
50
+ childResults: BenchmarkResult[]; // full per-task results for the child
51
+ decision: PromotionDecision; // the ADR-076 promotion decision
52
+ }>;
53
+ //# sourceMappingURL=runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/bench/runner.ts"],"names":[],"mappings":"AASA,OAAO,KAAK,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAI/D,OAAO,KAAK,EACV,UAAU,EACV,eAAe,EACf,aAAa,EACb,iBAAiB,EACjB,YAAY,EACb,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,aAAa;IAC5B,oFAAoF;IACpF,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED;;;;GAIG;AACH,wBAAsB,iBAAiB,CACrC,OAAO,EAAE,cAAc,EACvB,OAAO,EAAE,WAAW,EACpB,IAAI,EAAE,aAAa,EACnB,IAAI,GAAE,aAAkB,GACvB,OAAO,CAAC,eAAe,CAAC,CAiE1B;AAED;;;;;GAKG;AACH,wBAAsB,kBAAkB,CAAC,KAAK,EAAE;IAC9C,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,aAAa,EAAE,CAAC;IACvB,UAAU,EAAE,YAAY,CAAC;IACzB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB,GAAG,OAAO,CAAC;IACV,aAAa,EAAE,eAAe,EAAE,CAAC;IACjC,YAAY,EAAE,eAAe,EAAE,CAAC;IAChC,QAAQ,EAAE,iBAAiB,CAAC;CAC7B,CAAC,CAgBD;AAED;;;;GAIG;AACH,wBAAsB,0BAA0B,CAAC,KAAK,EAAE;IACtD,MAAM,EAAE,cAAc,CAAC;IACvB,KAAK,EAAE,cAAc,CAAC;IACtB,OAAO,EAAE,WAAW,CAAC;IACrB,KAAK,EAAE,UAAU,CAAC;IAClB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,IAAI,CAAC,EAAE,aAAa,CAAC;CACtB,GAAG,OAAO,CAAC;IACV,aAAa,EAAE,eAAe,EAAE,CAAC;IACjC,YAAY,EAAE,eAAe,EAAE,CAAC;IAChC,QAAQ,EAAE,iBAAiB,CAAC;CAC7B,CAAC,CA4BD"}
@@ -0,0 +1,131 @@
1
+ // SPDX-License-Identifier: MIT
2
+ //
3
+ // The parent-vs-child benchmark runner (ADR-076). Reuses the SAME secure sandbox
4
+ // (gate-first, shell-free, env-scrubbed — ADR-071) to run each task's public,
5
+ // hidden, and regression commands, scores the result, and decides promotion with
6
+ // the statistical gate. This is the headline: "benchmark the parent vs the
7
+ // child, not the idea."
8
+ import { runVariantTask } from '../sandbox.js';
9
+ import { scoreBenchmark } from './score.js';
10
+ import { decidePromotion } from './promotion.js';
11
+ import { verifySuite } from './suite.js';
12
/**
 * Execute a single task for a single variant inside the sandbox, then score it.
 *
 * The task's public, hidden, and regression test commands run one after the
 * other, each with the variant's directory gate-checked first. Every blocked
 * action reported by any of the three runs becomes a safety violation (and the
 * blocked command never runs).
 *
 * @param variant harness variant under evaluation (shallow-copied per phase).
 * @param profile repository profile; `testCommand` is replaced per phase.
 * @param task benchmark task: commands, `timeoutMs`, `maxCostUsd`, `commit`, `id`.
 * @param opts optional runner settings; `costUsdPerTask` defaults to 0.
 * @returns the benchmark result record, including base and final scores.
 */
export async function runTaskForVariant(variant, profile, task, opts = {}) {
    // One sandbox invocation per phase: a fresh shallow copy of the variant, the
    // profile with that phase's command, and a "<taskId>:<phase>" label.
    const runPhase = (label, testCommand) => {
        const phaseProfile = { ...profile, testCommand };
        const phaseOptions = { taskTimeoutMs: task.timeoutMs };
        return runVariantTask({ ...variant }, phaseProfile, `${task.id}:${label}`, phaseOptions);
    };

    // Strictly sequential: public, then hidden, then regression.
    const publicRun = await runPhase('public', task.publicTestCommand);
    const hiddenRun = await runPhase('hidden', task.hiddenTestCommand);
    const regressionRun = await runPhase('regression', task.regressionTestCommand);

    // Merge blocked actions in phase order.
    const safetyViolations = [];
    for (const run of [publicRun, hiddenRun, regressionRun]) {
        safetyViolations.push(...run.blockedActions);
    }

    // A phase passes only when its command exits with code 0.
    const publicTestPassed = publicRun.exitCode === 0;
    const hiddenTestPassed = hiddenRun.exitCode === 0;
    const regressionPassed = regressionRun.exitCode === 0;

    // Real wall-clock total: reported in the result, deliberately kept out of
    // the score (see the latency note below).
    const durationMs = publicRun.durationMs + hiddenRun.durationMs + regressionRun.durationMs;
    const costUsd = opts.costUsdPerTask ?? 0;

    const scoreInput = {
        publicTestPassed,
        hiddenTestPassed,
        regressionPassed,
        safetyViolations,
        blockedFileTouches: [], // prototype: variant never patches repo files
        hallucinatedFileRefs: false,
        costUsd,
        maxCostUsd: task.maxCostUsd,
        // Latency is scored DETERMINISTICALLY (duration 0 => latencyEfficiency
        // 1.0). At prototype level every variant executes the identical task
        // command, so raw wall-clock is pure noise; including it made the
        // promotion decision (and the winner) non-reproducible, which defeats
        // this path's own Repro gate (ADR-076/079). Faithful latency scoring
        // returns with the LLM evaluator, where per-variant latency is a real,
        // reproducible, metered signal. `scoreBenchmark` itself stays a faithful
        // pure function.
        durationMs: 0,
        // Three phases, each bounded by task.timeoutMs.
        timeoutMs: task.timeoutMs * 3,
    };
    const { verifiedSolve, baseScore, finalScore } = scoreBenchmark(scoreInput);

    return {
        taskId: task.id,
        variantId: variant.id,
        parentId: variant.parentId,
        repoCommit: task.commit,
        solved: verifiedSolve,
        publicTestPassed,
        hiddenTestPassed,
        regressionPassed,
        durationMs,
        costUsd,
        changedFiles: [],
        blockedFileTouches: [],
        safetyViolations,
        hallucinatedFileRefs: false,
        traceQuality: verifiedSolve ? 1 : 0.5,
        patchPath: '',
        tracePath: '',
        baseScore,
        finalScore,
    };
}
77
/**
 * Compare a child variant with its parent across a set of tasks, using a runner
 * supplied by the caller (the user-facing, testable entry point).
 *
 * Complete result objects are collected for both sides, so the safety check
 * operates on the real `safetyViolations` / `blockedFileTouches` rather than on
 * a score proxy. The statistical promotion decision (ADR-076) is returned
 * together with those results.
 *
 * @param input `parentId`, `childId`, `tasks`, the injected `runVariant`, and
 *   the optional `cleanReplay` (defaults to false), `seed`, `samples`, `minDelta`.
 * @returns `{ parentResults, childResults, decision }`.
 */
export async function evaluateWithRunner(input) {
    const parentResults = [];
    const childResults = [];

    // Tasks run one at a time; within a task the parent always runs before the
    // child, so both result lists stay index-aligned with `input.tasks`.
    for (const task of input.tasks) {
        const parentOutcome = await input.runVariant(input.parentId, task);
        parentResults.push(parentOutcome);

        const childOutcome = await input.runVariant(input.childId, task);
        childResults.push(childOutcome);
    }

    // An omitted (null/undefined) cleanReplay is treated as false.
    const cleanReplay = input.cleanReplay ?? false;
    const decision = decidePromotion({
        parentResults,
        childResults,
        cleanReplay,
        seed: input.seed,
        samples: input.samples,
        minDelta: input.minDelta,
    });

    return { parentResults, childResults, decision };
}
100
/**
 * Benchmark a child harness against its parent on a hash-verified suite, using
 * the real secure sandbox.
 *
 * The suite snapshot is verified before anything runs (the benchmark-tampering
 * control); evaluation is then delegated to `evaluateWithRunner` with a runner
 * that resolves a variant id to its variant and calls `runTaskForVariant`.
 *
 * @param input `parent`, `child`, `profile`, `suite`, the optional promotion
 *   settings (`cleanReplay`, `seed`, `samples`, `minDelta`), and optional
 *   runner `opts`.
 * @returns `{ parentResults, childResults, decision }`.
 * @throws Error when the suite hash check fails, when two distinct variant
 *   objects share one id, or when the runner is asked for an unknown variant id.
 */
export async function evaluateChildAgainstParent(input) {
    const check = verifySuite(input.suite);
    if (!check.ok) {
        throw new Error(`benchmark suite tampered: taskHash ${check.expected} != recomputed ${check.actual}`);
    }

    const { parent, child } = input;

    // The runner below addresses variants by id only. If two DIFFERENT variant
    // objects carried the same id, the child entry would overwrite the parent
    // entry in the map and the "parent" runs would silently execute the child,
    // making the comparison meaningless. Reject that input explicitly. Passing
    // the very same object as both parent and child remains allowed.
    if (parent.id === child.id && parent !== child) {
        throw new Error(`parent and child variants share id ${parent.id}; variant ids must be distinct`);
    }

    const byId = new Map([
        [parent.id, parent],
        [child.id, child],
    ]);

    const runVariant = (variantId, task) => {
        const variant = byId.get(variantId);
        if (!variant) {
            throw new Error(`unknown variant ${variantId}`);
        }
        return runTaskForVariant(variant, input.profile, task, input.opts);
    };

    return evaluateWithRunner({
        parentId: parent.id,
        childId: child.id,
        tasks: input.suite.tasks,
        runVariant,
        cleanReplay: input.cleanReplay,
        seed: input.seed,
        samples: input.samples,
        minDelta: input.minDelta,
    });
}
131
+ //# sourceMappingURL=runner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/bench/runner.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,EAAE;AACF,iFAAiF;AACjF,8EAA8E;AAC9E,iFAAiF;AACjF,2EAA2E;AAC3E,wBAAwB;AAExB,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAE/C,OAAO,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAC5C,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAczC;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,OAAuB,EACvB,OAAoB,EACpB,IAAmB,EACnB,OAAsB,EAAE;IAExB,MAAM,EAAE,GAAG,CAAC,WAAmB,EAAE,KAAa,EAAE,EAAE,CAChD,cAAc,CAAC,EAAE,GAAG,OAAO,EAAE,EAAE,EAAE,GAAG,OAAO,EAAE,WAAW,EAAE,EAAE,GAAG,IAAI,CAAC,EAAE,IAAI,KAAK,EAAE,EAAE;QACjF,aAAa,EAAE,IAAI,CAAC,SAAS;KAC9B,CAAC,CAAC;IAEL,MAAM,GAAG,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,iBAAiB,EAAE,QAAQ,CAAC,CAAC;IACvD,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,iBAAiB,EAAE,QAAQ,CAAC,CAAC;IAC1D,MAAM,UAAU,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,qBAAqB,EAAE,YAAY,CAAC,CAAC;IAEtE,MAAM,gBAAgB,GAAG;QACvB,GAAG,GAAG,CAAC,cAAc;QACrB,GAAG,MAAM,CAAC,cAAc;QACxB,GAAG,UAAU,CAAC,cAAc;KAC7B,CAAC;IAEF,MAAM,gBAAgB,GAAG,GAAG,CAAC,QAAQ,KAAK,CAAC,CAAC;IAC5C,MAAM,gBAAgB,GAAG,MAAM,CAAC,QAAQ,KAAK,CAAC,CAAC;IAC/C,MAAM,gBAAgB,GAAG,UAAU,CAAC,QAAQ,KAAK,CAAC,CAAC;IACnD,0EAA0E;IAC1E,MAAM,UAAU,GAAG,GAAG,CAAC,UAAU,GAAG,MAAM,CAAC,UAAU,GAAG,UAAU,CAAC,UAAU,CAAC;IAC9E,MAAM,OAAO,GAAG,IAAI,CAAC,cAAc,IAAI,CAAC,CAAC;IAEzC,MAAM,KAAK,GAAG,cAAc,CAAC;QAC3B,gBAAgB;QAChB,gBAAgB;QAChB,gBAAgB;QAChB,gBAAgB;QAChB,kBAAkB,EAAE,EAAE,EAAE,8CAA8C;QACtE,oBAAoB,EAAE,KAAK;QAC3B,OAAO;QACP,UAAU,EAAE,IAAI,CAAC,UAAU;QAC3B,4EAA4E;QAC5E,2EAA2E;QAC3E,2EAA2E;QAC3E,qEAAqE;QACrE,iEAAiE;QACjE,0EAA0E;QAC1E,wEAAwE;QACxE,uEAAuE;QACvE,UAAU,EAAE,CAAC;QACb,SAAS,EAAE,IAAI,CAAC,SAAS,GAAG,CAAC;KAC9B,CAAC,CAAC;IAEH,OAAO;QACL,MAAM,EAAE,IAAI,CAAC,EAAE;QACf,SAAS,EAAE,OAAO,CAAC,EAAE;QACrB,QAAQ,EAAE,OAAO,CAAC,QAAQ;QAC1B,UAAU,EAAE,IAAI,CAAC,MAAM;QACvB,MAAM,EAAE,KAAK,CAAC,aAAa;QAC3B,gBAAgB;QAChB,gBAAgB;QAChB,gBAAgB;QAChB,UAAU;QACV,OAAO;QACP,YAAY,EAAE,EAAE;QAChB,kBAAkB,EAAE,EAAE;QACtB,gBAAgB;QAChB,oBAAoB,EAAE,KAAK;QAC3B,YAAY,EAAE,KAAK,C
AAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG;QAC3C,SAAS,EAAE,EAAE;QACb,SAAS,EAAE,EAAE;QACb,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,UAAU,EAAE,KAAK,CAAC,UAAU;KAC7B,CAAC;AACJ,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,KASxC;IAKC,MAAM,aAAa,GAAsB,EAAE,CAAC;IAC5C,MAAM,YAAY,GAAsB,EAAE,CAAC;IAC3C,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;QAC/B,aAAa,CAAC,IAAI,CAAC,MAAM,KAAK,CAAC,UAAU,CAAC,KAAK,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC;QACjE,YAAY,CAAC,IAAI,CAAC,MAAM,KAAK,CAAC,UAAU,CAAC,KAAK,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;IACjE,CAAC;IACD,MAAM,QAAQ,GAAG,eAAe,CAAC;QAC/B,aAAa;QACb,YAAY;QACZ,WAAW,EAAE,KAAK,CAAC,WAAW,IAAI,KAAK;QACvC,IAAI,EAAE,KAAK,CAAC,IAAI;QAChB,OAAO,EAAE,KAAK,CAAC,OAAO;QACtB,QAAQ,EAAE,KAAK,CAAC,QAAQ;KACzB,CAAC,CAAC;IACH,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,QAAQ,EAAE,CAAC;AACnD,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,0BAA0B,CAAC,KAUhD;IAKC,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACvC,IAAI,CAAC,KAAK,CAAC,EAAE,EAAE,CAAC;QACd,MAAM,IAAI,KAAK,CACb,sCAAsC,KAAK,CAAC,QAAQ,kBAAkB,KAAK,CAAC,MAAM,EAAE,CACrF,CAAC;IACJ,CAAC;IAED,MAAM,IAAI,GAAG,IAAI,GAAG,CAAyB;QAC3C,CAAC,KAAK,CAAC,MAAM,CAAC,EAAE,EAAE,KAAK,CAAC,MAAM,CAAC;QAC/B,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,EAAE,KAAK,CAAC,KAAK,CAAC;KAC9B,CAAC,CAAC;IACH,MAAM,UAAU,GAAiB,CAAC,SAAS,EAAE,IAAI,EAAE,EAAE;QACnD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACpC,IAAI,CAAC,OAAO;YAAE,MAAM,IAAI,KAAK,CAAC,mBAAmB,SAAS,EAAE,CAAC,CAAC;QAC9D,OAAO,iBAAiB,CAAC,OAAO,EAAE,KAAK,CAAC,OAAO,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;IACrE,CAAC,CAAC;IAEF,OAAO,kBAAkB,CAAC;QACxB,QAAQ,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE;QACzB,OAAO,EAAE,KAAK,CAAC,KAAK,CAAC,EAAE;QACvB,KAAK,EAAE,KAAK,CAAC,KAAK,CAAC,KAAK;QACxB,UAAU;QACV,WAAW,EAAE,KAAK,CAAC,WAAW;QAC9B,IAAI,EAAE,KAAK,CAAC,IAAI;QAChB,OAAO,EAAE,KAAK,CAAC,OAAO;QACtB,QAAQ,EAAE,KAAK,CAAC,QAAQ;KACzB,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,16 @@
1
+ import type { BenchScore, BenchScoreInput } from './types.js';
2
/**
 * Compute the ADR-076 score for one benchmark result.
 *
 * `verifiedSolve` holds only when the public, hidden, and regression tests all
 * pass AND the run is safe (no safety violations and no blocked-file touches).
 * It carries a weight of 0.40 in the base score and dominates it, so the bulk
 * of the score can only be earned by a clean, bounded, non-regressing solve.
 * A penalty layer then subtracts for safety violations, blocked-file touches,
 * regression failure, hallucinated file references, and excessive cost; any of
 * these can push `finalScore` below zero.
 *
 * @param input pure, I/O-free inputs (booleans + metered cost/latency).
 * @returns a fully-rounded, deterministic BenchScore.
 */
export declare function scoreBenchmark(
    input: BenchScoreInput,
): BenchScore;
16
+ //# sourceMappingURL=score.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"score.d.ts","sourceRoot":"","sources":["../../src/bench/score.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAW9D;;;;;;;;;;;;GAYG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,eAAe,GAAG,UAAU,CA4DjE"}