rad-experiment 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +87 -0
  2. package/dist/cli/commands/list.d.ts +1 -0
  3. package/dist/cli/commands/list.js +35 -0
  4. package/dist/cli/commands/publish.d.ts +1 -0
  5. package/dist/cli/commands/publish.js +63 -0
  6. package/dist/cli/commands/reproduce.d.ts +1 -0
  7. package/dist/cli/commands/reproduce.js +45 -0
  8. package/dist/cli/commands/show.d.ts +1 -0
  9. package/dist/cli/commands/show.js +61 -0
  10. package/dist/cli/format.d.ts +9 -0
  11. package/dist/cli/format.js +21 -0
  12. package/dist/cli/helpers.d.ts +49 -0
  13. package/dist/cli/helpers.js +90 -0
  14. package/dist/cli/rad.d.ts +11 -0
  15. package/dist/cli/rad.js +64 -0
  16. package/dist/cob/actions.d.ts +35 -0
  17. package/dist/cob/actions.js +57 -0
  18. package/dist/cob/state.d.ts +7 -0
  19. package/dist/cob/state.js +97 -0
  20. package/dist/rad-cob-experiment.d.ts +2 -0
  21. package/dist/rad-cob-experiment.js +33 -0
  22. package/dist/rad-experiment.d.ts +2 -0
  23. package/dist/rad-experiment.js +74 -0
  24. package/dist/types.d.ts +102 -0
  25. package/dist/types.js +9 -0
  26. package/package.json +24 -0
  27. package/src/__tests__/actions.test.ts +122 -0
  28. package/src/__tests__/cob-protocol.test.ts +138 -0
  29. package/src/__tests__/fixtures.ts +119 -0
  30. package/src/__tests__/format.test.ts +55 -0
  31. package/src/__tests__/golden/publish-action.json +46 -0
  32. package/src/__tests__/golden/publish-minimal.json +25 -0
  33. package/src/__tests__/golden/publish-with-samples.json +38 -0
  34. package/src/__tests__/golden/reproduce-action.json +19 -0
  35. package/src/__tests__/golden/reproduce-minimal.json +18 -0
  36. package/src/__tests__/helpers.test.ts +138 -0
  37. package/src/__tests__/integration.test.ts +124 -0
  38. package/src/__tests__/serialization.test.ts +175 -0
  39. package/src/__tests__/state.test.ts +191 -0
  40. package/src/cli/commands/list.ts +45 -0
  41. package/src/cli/commands/publish.ts +68 -0
  42. package/src/cli/commands/reproduce.ts +52 -0
  43. package/src/cli/commands/show.ts +70 -0
  44. package/src/cli/format.ts +27 -0
  45. package/src/cli/helpers.ts +101 -0
  46. package/src/cli/rad.ts +87 -0
  47. package/src/cob/actions.ts +100 -0
  48. package/src/cob/state.ts +120 -0
  49. package/src/rad-cob-experiment.ts +39 -0
  50. package/src/rad-experiment.ts +85 -0
  51. package/src/types.ts +133 -0
  52. package/tsconfig.json +16 -0
@@ -0,0 +1,119 @@
1
+ // Shared test data builders. Each returns a fresh object with sensible defaults.
2
+
3
+ import type { Experiment, Measurement, MetricValue, Op, Reproduction } from "../types.js";
4
+ // Fixed sample values reused by the builders below: SAMPLE_OID fills `base`/`oid`, SAMPLE_AUTHOR_KEY fills `author` and the `did:key:` ids.
5
+ export const SAMPLE_OID = "7554439853c68bc7ec063d6a7fcdc80f89e5b1fb";
6
+ export const SAMPLE_AUTHOR_KEY = "z6Mkgoz7YreoB5v1jLvNCyX3MXLaza28Gs6d6yu3ABXE3cdY";
7
+ /** Build a Measurement with defaults n=5, medianX1000=59340, stdX1000=1200; fields in `overrides` replace the defaults. */
8
+ export function makeMeasurement(overrides?: Partial<Measurement>): Measurement {
9
+ return { n: 5, medianX1000: 59340, stdX1000: 1200, ...overrides };
10
+ }
11
+ /** Build a MetricValue (default: `binary_size` in bytes, not regressed); fields in `overrides` replace the defaults. */
12
+ export function makeMetricValue(overrides?: Partial<MetricValue>): MetricValue {
13
+ return {
14
+ name: "binary_size",
15
+ unit: "bytes",
16
+ baseline: { n: 1, medianX1000: 1000000, stdX1000: 0 },
17
+ candidate: { n: 1, medianX1000: 950000, stdX1000: 0 },
18
+ deltaPctX100: -500,
19
+ regressed: false,
20
+ ...overrides, // shallow merge: an overridden `baseline`/`candidate` replaces the whole nested object
21
+ };
22
+ }
23
+
24
+ /** Build a raw publish action with snake_case fields (as stored in COBs). Keys in `overrides` replace or extend the defaults (shallow merge). */
25
+ export function makeRawPublishAction(overrides?: Record<string, unknown>): Record<string, unknown> {
26
+ return {
27
+ type: "publish",
28
+ description: "SIMD vectorization in parser loop",
29
+ base: SAMPLE_OID,
30
+ oid: SAMPLE_OID, // same value as `base` by default
31
+ metric_name: "wall_time",
32
+ metric_unit: "ms",
33
+ direction: "lower_is_better",
34
+ runner_class: "arm64",
35
+ os: "linux",
36
+ cpu: "apple-m2",
37
+ baseline: makeMeasurement(), // nested measurement keys stay camelCase (n, medianX1000, stdX1000)
38
+ candidate: makeMeasurement({ n: 5, medianX1000: 45200, stdX1000: 950 }),
39
+ delta_pct_x100: 2378,
40
+ build_ok: true,
41
+ tests_ok: true,
42
+ sanitizers_ok: false,
43
+ agent_system: "claude-code",
44
+ agent_model: "claude-opus-4-6",
45
+ ...overrides,
46
+ };
47
+ }
48
+
49
+ /** Build a raw reproduce action with snake_case fields. Keys in `overrides` replace or extend the defaults (shallow merge). */
50
+ export function makeRawReproduceAction(overrides?: Record<string, unknown>): Record<string, unknown> {
51
+ return {
52
+ type: "reproduce",
53
+ verdict: "confirmed",
54
+ runner_class: "amd64",
55
+ baseline: makeMeasurement({ n: 10, medianX1000: 59000, stdX1000: 800 }), // nested measurement keys stay camelCase
56
+ candidate: makeMeasurement({ n: 10, medianX1000: 45500, stdX1000: 600 }),
57
+ delta_pct_x100: 2288,
58
+ build_ok: true,
59
+ tests_ok: true,
60
+ notes: "Reproduced on CI",
61
+ ...overrides,
62
+ };
63
+ }
64
+
65
+ /** Build a minimal valid Op containing a publish action. `overrides.actions` replaces the default action list; any other override field replaces the matching Op field. */
66
+ export function makeOp(overrides?: Partial<Op> & { actions?: Record<string, unknown>[] }): Op {
67
+ const { actions, ...rest } = overrides ?? {}; // split `actions` off so it can be defaulted and cast separately
68
+ return {
69
+ id: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
70
+ actions: (actions ?? [makeRawPublishAction()]) as Op["actions"], // raw records are cast, not validated
71
+ author: SAMPLE_AUTHOR_KEY,
72
+ timestamp: 1711300000000,
73
+ parents: [],
74
+ related: [],
75
+ identity: null,
76
+ manifest: { typeName: "cc.experiment", version: 1 },
77
+ ...rest,
78
+ };
79
+ }
80
+ /** Build a camelCase Reproduction whose measurement defaults mirror makeRawReproduceAction; fields in `overrides` replace the defaults. */
81
+ export function makeReproduction(overrides?: Partial<Reproduction>): Reproduction {
82
+ return {
83
+ verdict: "confirmed",
84
+ runnerClass: "amd64",
85
+ baseline: makeMeasurement({ n: 10, medianX1000: 59000, stdX1000: 800 }),
86
+ candidate: makeMeasurement({ n: 10, medianX1000: 45500, stdX1000: 600 }),
87
+ deltaPctX100: 2288,
88
+ author: { id: `did:key:${SAMPLE_AUTHOR_KEY}` }, // author id is the sample key with a `did:key:` prefix
89
+ timestamp: 1711300001000,
90
+ ...overrides,
91
+ };
92
+ }
93
+
94
+ /** Build a camelCase Experiment state (as produced by the helper). Defaults mirror makeRawPublishAction with no reproductions; fields in `overrides` replace them. */
95
+ export function makeExperiment(overrides?: Partial<Experiment>): Experiment {
96
+ return {
97
+ description: "SIMD vectorization in parser loop",
98
+ base: SAMPLE_OID,
99
+ oid: SAMPLE_OID,
100
+ metricName: "wall_time",
101
+ metricUnit: "ms",
102
+ direction: "lower_is_better",
103
+ runnerClass: "arm64",
104
+ os: "linux",
105
+ cpu: "apple-m2",
106
+ baseline: makeMeasurement(),
107
+ candidate: makeMeasurement({ n: 5, medianX1000: 45200, stdX1000: 950 }),
108
+ deltaPctX100: 2378,
109
+ buildOk: true,
110
+ testsOk: true,
111
+ sanitizersOk: false,
112
+ agentSystem: "claude-code",
113
+ agentModel: "claude-opus-4-6",
114
+ reproductions: [],
115
+ author: { id: `did:key:${SAMPLE_AUTHOR_KEY}` },
116
+ createdAt: 1711300000000, // same value as the default `timestamp` in makeOp
117
+ ...overrides,
118
+ };
119
+ }
@@ -0,0 +1,55 @@
1
+ import { describe, it } from "node:test";
2
+ import assert from "node:assert/strict";
3
+
4
+ import { deltaDisplay, measurementDisplay, shortId, confirmedCount } from "../cli/format.js";
5
+ import { makeExperiment, makeReproduction } from "./fixtures.js";
6
+ // deltaDisplay: these cases expect the input divided by 100, printed with two decimals, an explicit sign and a trailing "%".
7
+ describe("deltaDisplay", () => {
8
+ it("formats positive value", () => assert.equal(deltaDisplay(547), "+5.47%"));
9
+ it("formats zero", () => assert.equal(deltaDisplay(0), "+0.00%")); // zero is expected to carry a "+" sign
10
+ it("formats negative value", () => assert.equal(deltaDisplay(-78), "-0.78%"));
11
+ it("formats single-digit fractional", () => assert.equal(deltaDisplay(5), "+0.05%"));
12
+ it("formats exact percent", () => assert.equal(deltaDisplay(1000), "+10.00%"));
13
+ it("formats large value", () => assert.equal(deltaDisplay(23780), "+237.80%"));
14
+ });
15
+ // measurementDisplay: these cases expect medianX1000 / 1000 followed by a space and the unit; n and stdX1000 do not appear in the output.
16
+ describe("measurementDisplay", () => {
17
+ it("formats normal value", () => {
18
+ assert.equal(measurementDisplay({ n: 10, medianX1000: 59340, stdX1000: 120 }, "ms"), "59.340 ms");
19
+ });
20
+ it("formats sub-unit", () => {
21
+ assert.equal(measurementDisplay({ n: 5, medianX1000: 500, stdX1000: 10 }, "ms"), "0.500 ms");
22
+ });
23
+ it("formats zero", () => {
24
+ assert.equal(measurementDisplay({ n: 1, medianX1000: 0, stdX1000: 0 }, "s"), "0.0 s"); // NOTE(review): one decimal here vs. three in the cases above; confirm this is intended
25
+ });
26
+ });
27
+ // shortId: these cases expect the first 7 characters of the id, and input shorter than that returned unchanged.
28
+ describe("shortId", () => {
29
+ it("truncates to 7 chars", () => assert.equal(shortId("abcdef1234567890abcd"), "abcdef1"));
30
+ it("handles short input", () => assert.equal(shortId("abc"), "abc"));
31
+ });
32
+ // confirmedCount: these cases expect the number of reproductions on the experiment whose verdict is "confirmed".
33
+ describe("confirmedCount", () => {
34
+ it("counts confirmed reproductions", () => {
35
+ const exp = makeExperiment({
36
+ reproductions: [
37
+ makeReproduction({ verdict: "confirmed" }),
38
+ makeReproduction({ verdict: "failed" }),
39
+ makeReproduction({ verdict: "confirmed" }),
40
+ ],
41
+ });
42
+ assert.equal(confirmedCount(exp), 2);
43
+ });
44
+
45
+ it("returns 0 when none confirmed", () => {
46
+ const exp = makeExperiment({
47
+ reproductions: [makeReproduction({ verdict: "inconclusive" })],
48
+ });
49
+ assert.equal(confirmedCount(exp), 0);
50
+ });
51
+
52
+ it("returns 0 for empty reproductions", () => {
53
+ assert.equal(confirmedCount(makeExperiment()), 0); // makeExperiment defaults to an empty reproductions list
54
+ });
55
+ });
@@ -0,0 +1,46 @@
1
+ {
2
+ "agent_model": "claude-opus-4-6",
3
+ "agent_system": "claude-code",
4
+ "base": "7554439853c68bc7ec063d6a7fcdc80f89e5b1fb",
5
+ "baseline": {
6
+ "medianX1000": 59340,
7
+ "n": 5,
8
+ "stdX1000": 1200
9
+ },
10
+ "build_ok": true,
11
+ "candidate": {
12
+ "medianX1000": 45200,
13
+ "n": 5,
14
+ "stdX1000": 950
15
+ },
16
+ "cpu": "apple-m2",
17
+ "delta_pct_x100": 2378,
18
+ "description": "SIMD vectorization in parser loop",
19
+ "direction": "lower_is_better",
20
+ "metric_name": "wall_time",
21
+ "metric_unit": "ms",
22
+ "oid": "7554439853c68bc7ec063d6a7fcdc80f89e5b1fb",
23
+ "os": "linux",
24
+ "runner_class": "arm64",
25
+ "sanitizers_ok": false,
26
+ "secondary_metrics": [
27
+ {
28
+ "baseline": {
29
+ "medianX1000": 1000000,
30
+ "n": 1,
31
+ "stdX1000": 0
32
+ },
33
+ "candidate": {
34
+ "medianX1000": 950000,
35
+ "n": 1,
36
+ "stdX1000": 0
37
+ },
38
+ "deltaPctX100": -500,
39
+ "name": "binary_size",
40
+ "regressed": false,
41
+ "unit": "bytes"
42
+ }
43
+ ],
44
+ "tests_ok": true,
45
+ "type": "publish"
46
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "agent_model": "claude-opus-4-6",
3
+ "agent_system": "claude-code",
4
+ "base": "defd08dcff239ec2a3a57ec82f541d21210e98f9",
5
+ "baseline": {
6
+ "medianX1000": 50000,
7
+ "n": 10,
8
+ "stdX1000": 0
9
+ },
10
+ "build_ok": true,
11
+ "candidate": {
12
+ "medianX1000": 55000,
13
+ "n": 10,
14
+ "stdX1000": 0
15
+ },
16
+ "delta_pct_x100": 1000,
17
+ "direction": "higher_is_better",
18
+ "metric_name": "throughput",
19
+ "metric_unit": "ops/s",
20
+ "oid": "defd08dcff239ec2a3a57ec82f541d21210e98f9",
21
+ "runner_class": "amd64",
22
+ "sanitizers_ok": false,
23
+ "tests_ok": true,
24
+ "type": "publish"
25
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "agent_model": "claude-opus-4-6",
3
+ "agent_system": "claude-code",
4
+ "base": "defd08dcff239ec2a3a57ec82f541d21210e98f9",
5
+ "baseline": {
6
+ "medianX1000": 59340,
7
+ "n": 3,
8
+ "samplesX1000": [
9
+ 58000,
10
+ 59340,
11
+ 60680
12
+ ],
13
+ "stdX1000": 1200
14
+ },
15
+ "build_ok": true,
16
+ "candidate": {
17
+ "medianX1000": 45200,
18
+ "n": 3,
19
+ "samplesX1000": [
20
+ 44500,
21
+ 45200,
22
+ 45900
23
+ ],
24
+ "stdX1000": 950
25
+ },
26
+ "cpu": "apple-m2",
27
+ "delta_pct_x100": 2378,
28
+ "description": "With per-run samples",
29
+ "direction": "lower_is_better",
30
+ "metric_name": "wall_time",
31
+ "metric_unit": "ms",
32
+ "oid": "defd08dcff239ec2a3a57ec82f541d21210e98f9",
33
+ "os": "linux",
34
+ "runner_class": "arm64",
35
+ "sanitizers_ok": false,
36
+ "tests_ok": true,
37
+ "type": "publish"
38
+ }
@@ -0,0 +1,19 @@
1
+ {
2
+ "baseline": {
3
+ "medianX1000": 59000,
4
+ "n": 10,
5
+ "stdX1000": 800
6
+ },
7
+ "build_ok": true,
8
+ "candidate": {
9
+ "medianX1000": 45500,
10
+ "n": 10,
11
+ "stdX1000": 600
12
+ },
13
+ "delta_pct_x100": 2288,
14
+ "notes": "Reproduced on CI",
15
+ "runner_class": "amd64",
16
+ "tests_ok": true,
17
+ "type": "reproduce",
18
+ "verdict": "confirmed"
19
+ }
@@ -0,0 +1,18 @@
1
+ {
2
+ "baseline": {
3
+ "medianX1000": 50500,
4
+ "n": 5,
5
+ "stdX1000": 0
6
+ },
7
+ "build_ok": true,
8
+ "candidate": {
9
+ "medianX1000": 50800,
10
+ "n": 5,
11
+ "stdX1000": 0
12
+ },
13
+ "delta_pct_x100": 59,
14
+ "runner_class": "arm64",
15
+ "tests_ok": true,
16
+ "type": "reproduce",
17
+ "verdict": "inconclusive"
18
+ }
@@ -0,0 +1,138 @@
1
+ import { describe, it } from "node:test";
2
+ import assert from "node:assert/strict";
3
+
4
+ import {
5
+ CliError,
6
+ requireArg,
7
+ requireInt,
8
+ optionalInt,
9
+ buildMeasurement,
10
+ parseSecondary,
11
+ } from "../cli/helpers.js";
12
+ // requireArg(args, key): these cases expect the string value back, and a CliError when the key is missing, empty or null.
13
+ describe("requireArg", () => {
14
+ it("returns string value", () => {
15
+ assert.equal(requireArg({ foo: "bar" }, "foo"), "bar");
16
+ });
17
+
18
+ it("throws CliError on missing key", () => {
19
+ assert.throws(() => requireArg({}, "foo"), CliError);
20
+ });
21
+
22
+ it("throws CliError on empty string", () => {
23
+ assert.throws(() => requireArg({ foo: "" }, "foo"), CliError);
24
+ });
25
+
26
+ it("throws CliError on null", () => {
27
+ assert.throws(() => requireArg({ foo: null }, "foo"), CliError);
28
+ });
29
+ });
30
+ // requireInt(args, key): these cases expect a parsed integer (negatives included), and a CliError when the value is missing or not an integer.
31
+ describe("requireInt", () => {
32
+ it("parses integer string", () => {
33
+ assert.equal(requireInt({ n: "42" }, "n"), 42);
34
+ });
35
+
36
+ it("parses negative integer", () => {
37
+ assert.equal(requireInt({ n: "-7" }, "n"), -7);
38
+ });
39
+
40
+ it("throws CliError on non-integer", () => {
41
+ assert.throws(() => requireInt({ n: "abc" }, "n"), CliError);
42
+ });
43
+
44
+ it("throws CliError on missing", () => {
45
+ assert.throws(() => requireInt({}, "n"), CliError);
46
+ });
47
+ });
48
+ // optionalInt(args, key, fallback): these cases expect the fallback when the key is missing or empty, the parsed integer when present, and a CliError on non-integers.
49
+ describe("optionalInt", () => {
50
+ it("returns default when missing", () => {
51
+ assert.equal(optionalInt({}, "x", 99), 99);
52
+ });
53
+
54
+ it("returns default when empty string", () => {
55
+ assert.equal(optionalInt({ x: "" }, "x", 99), 99);
56
+ });
57
+
58
+ it("parses when present", () => {
59
+ assert.equal(optionalInt({ x: "7" }, "x", 99), 7);
60
+ });
61
+
62
+ it("throws CliError on non-integer", () => {
63
+ assert.throws(() => optionalInt({ x: "abc" }, "x", 99), CliError);
64
+ });
65
+ });
66
+ // buildMeasurement(args, prefix): these cases feed `<prefix>-median`, `<prefix>-std`, `<prefix>-n` and comma-separated `<prefix>-samples` and check the resulting measurement fields.
67
+ describe("buildMeasurement", () => {
68
+ it("builds baseline measurement", () => {
69
+ const m = buildMeasurement(
70
+ { "baseline-median": "5000", "baseline-std": "100", "baseline-n": "10", "baseline-samples": "1000,2000,3000" },
71
+ "baseline",
72
+ );
73
+ assert.equal(m.n, 10);
74
+ assert.equal(m.medianX1000, 5000);
75
+ assert.equal(m.stdX1000, 100);
76
+ assert.deepEqual(m.samplesX1000, [1000, 2000, 3000]);
77
+ });
78
+
79
+ it("builds candidate measurement", () => {
80
+ const m = buildMeasurement(
81
+ { "candidate-median": "4000", "candidate-n": "5" },
82
+ "candidate",
83
+ );
84
+ assert.equal(m.n, 5);
85
+ assert.equal(m.medianX1000, 4000);
86
+ assert.equal(m.stdX1000, 0); // default
87
+ });
88
+
89
+ it("defaults std to 0", () => {
90
+ const m = buildMeasurement({ "baseline-median": "100", "baseline-n": "1" }, "baseline");
91
+ assert.equal(m.stdX1000, 0);
92
+ });
93
+
94
+ it("omits samplesX1000 when empty", () => {
95
+ const m = buildMeasurement(
96
+ { "baseline-median": "100", "baseline-n": "1", "baseline-samples": "" },
97
+ "baseline",
98
+ );
99
+ assert.equal("samplesX1000" in m, false); // the key must be absent, not merely undefined
100
+ });
101
+ });
102
+ // parseSecondary: these cases use colon-separated input `name:unit:baseline:candidate:delta[:regressed]` and check the parsed metric value.
103
+ describe("parseSecondary", () => {
104
+ it("parses 5-field format", () => {
105
+ const mv = parseSecondary("rss:KB:5000:4500:-1000");
106
+ assert.equal(mv.name, "rss");
107
+ assert.equal(mv.unit, "KB");
108
+ assert.equal(mv.baseline.medianX1000, 5000);
109
+ assert.equal(mv.candidate.medianX1000, 4500);
110
+ assert.equal(mv.deltaPctX100, -1000);
111
+ assert.equal(mv.regressed, false);
112
+ assert.equal(mv.baseline.n, 1); // n and stdX1000 are not part of the input string
113
+ assert.equal(mv.baseline.stdX1000, 0);
114
+ });
115
+
116
+ it("parses 6-field format with regressed=true", () => {
117
+ const mv = parseSecondary("mem:MB:41000:42000:243:true");
118
+ assert.equal(mv.regressed, true);
119
+ });
120
+
121
+ it("defaults regressed to false", () => {
122
+ const mv = parseSecondary("x:y:1:2:3");
123
+ assert.equal(mv.regressed, false);
124
+ });
125
+
126
+ it("throws CliError on too few fields", () => {
127
+ assert.throws(() => parseSecondary("a:b:c"), CliError);
128
+ });
129
+
130
+ it("throws CliError on non-integer numeric fields", () => {
131
+ assert.throws(() => parseSecondary("a:b:notnum:2:3"), CliError);
132
+ });
133
+
134
+ it("handles negative delta", () => {
135
+ const mv = parseSecondary("x:y:100:200:-500");
136
+ assert.equal(mv.deltaPctX100, -500);
137
+ });
138
+ });
@@ -0,0 +1,124 @@
1
+ // End-to-end integration tests. Requires `rad` on PATH with a configured identity.
2
+ // Creates a temp radicle repo, runs CLI commands, verifies output.
3
+
4
+ import { test } from "node:test";
5
+ import assert from "node:assert/strict";
6
+ import { execFileSync } from "node:child_process";
7
+ import { mkdtempSync, rmSync } from "node:fs";
8
+ import { join, dirname } from "node:path";
9
+ import { tmpdir } from "node:os";
10
+ import { fileURLToPath } from "node:url";
11
+ // Paths resolved relative to this test file: the CLI entry point one directory up, and the tsx binary under node_modules two directories up.
12
+ const __dirname = dirname(fileURLToPath(import.meta.url));
13
+ const CLI = join(__dirname, "..", "rad-experiment.ts");
14
+ const TSX = join(__dirname, "..", "..", "node_modules", ".bin", "tsx");
15
+ /** Run the CLI source through tsx with `args` and return its stdout as a UTF-8 string (15 s timeout). */
16
+ function radExperiment(...args: string[]): string {
17
+ return execFileSync(TSX, [CLI, ...args], { encoding: "utf-8", timeout: 15000 });
18
+ }
19
+ /** Run `cmd` with `args` and return its stdout as a UTF-8 string; `opts` may set the working directory and a timeout. */
20
+ function shell(cmd: string, args: string[], opts?: { cwd?: string; timeout?: number }): string {
21
+ return execFileSync(cmd, args, { encoding: "utf-8", ...opts });
22
+ }
23
+ /** Run `cmd` with stdio inherited from this process; nothing is captured or returned. */
24
+ function shellQuiet(cmd: string, args: string[], opts?: { cwd?: string; timeout?: number }): void {
25
+ execFileSync(cmd, args, { stdio: "inherit", ...opts });
26
+ }
27
+ /** Return true when `rad self` runs without throwing; the result decides whether the integration test below is skipped. */
28
+ function canRunIntegration(): boolean {
29
+ try {
30
+ shell("rad", ["self"]);
31
+ return true;
32
+ } catch {
33
+ return false;
34
+ }
35
+ }
36
+ // One sequential scenario against a throwaway repo: publish, list, show, show --json, reproduce, list filters, then the error path.
37
+ test("integration: full CLI workflow", { skip: !canRunIntegration() && "rad not available" }, () => { // skip is false when rad works, otherwise the reason string
38
+ const tempDir = mkdtempSync(join(tmpdir(), "rad-exp-test-"));
39
+
40
+ try {
41
+ // Setup: init git + radicle repo (use inherited stdio — rad init fails with piped stdio)
42
+ // Unique commit message avoids radicle identity collision across test runs.
43
+ shellQuiet("git", ["init", "-b", "main"], { cwd: tempDir });
44
+ shellQuiet("git", ["commit", "--allow-empty", "-m", `init-${Date.now()}`], { cwd: tempDir });
45
+ shellQuiet("rad", ["init", "--name", `test-${Date.now()}`, "--description", "test", "--default-branch", "main", "--public"], {
46
+ cwd: tempDir,
47
+ timeout: 15000,
48
+ });
49
+ const commit = shell("git", ["rev-parse", "HEAD"], { cwd: tempDir }).trim();
50
+
51
+ // publish
52
+ const publishOutput = radExperiment(
53
+ "--repo", tempDir,
54
+ "publish",
55
+ "--base", commit, "--head", commit,
56
+ "--metric", "wall_time", "--unit", "ms",
57
+ "--direction", "lower_is_better",
58
+ "--runner", "arm64",
59
+ "--baseline-median", "59340", "--baseline-n", "5",
60
+ "--candidate-median", "45200", "--candidate-n", "5",
61
+ "--delta", "2378",
62
+ "-d", "Integration test experiment",
63
+ );
64
+ assert.match(publishOutput, /Experiment published: ([a-f0-9]{40})/);
65
+ assert.match(publishOutput, /\+23\.78%/);
66
+ const publishedId = publishOutput.match(/Experiment published: ([a-f0-9]{40})/)![1]; // non-null assertion is backed by the assert.match on the same pattern above
67
+
68
+ // list
69
+ const listOutput = radExperiment("--repo", tempDir, "list");
70
+ assert.match(listOutput, new RegExp(publishedId.slice(0, 7))); // list output is expected to contain the 7-character short id
71
+ assert.match(listOutput, /wall_time/);
72
+
73
+ // show (text)
74
+ const showOutput = radExperiment("--repo", tempDir, "show", publishedId);
75
+ assert.match(showOutput, /Integration test experiment/);
76
+ assert.match(showOutput, /wall_time \(ms\)/);
77
+ assert.match(showOutput, /59\.340 ms/);
78
+ assert.match(showOutput, /\+23\.78%/);
79
+
80
+ // show --json
81
+ const jsonOutput = radExperiment("--repo", tempDir, "show", "--json", publishedId);
82
+ const exp = JSON.parse(jsonOutput);
83
+ assert.equal(exp.metricName, "wall_time");
84
+ assert.equal(exp.deltaPctX100, 2378);
85
+ assert.ok(Array.isArray(exp.reproductions));
86
+
87
+ // reproduce
88
+ const reproOutput = radExperiment(
89
+ "--repo", tempDir,
90
+ "reproduce", publishedId,
91
+ "--verdict", "confirmed",
92
+ "--runner", "amd64",
93
+ "--baseline-median", "59000", "--baseline-n", "5",
94
+ "--candidate-median", "45500", "--candidate-n", "5",
95
+ "--delta", "2288",
96
+ );
97
+ assert.match(reproOutput, /Reproduction added/);
98
+
99
+ // verify reproduction in show --json
100
+ const afterReproJson = radExperiment("--repo", tempDir, "show", "--json", publishedId);
101
+ const expAfter = JSON.parse(afterReproJson);
102
+ assert.equal(expAfter.reproductions.length, 1);
103
+ assert.equal(expAfter.reproductions[0].verdict, "confirmed");
104
+
105
+ // list --reproduced
106
+ const reproList = radExperiment("--repo", tempDir, "list", "--reproduced");
107
+ assert.match(reproList, /verified/);
108
+
109
+ // list --unverified
110
+ const unverifiedList = radExperiment("--repo", tempDir, "list", "--unverified");
111
+ assert.match(unverifiedList, /No experiments found/); // the only experiment now has a confirmed reproduction
112
+
113
+ // error path
114
+ assert.throws(
115
+ () => radExperiment("--repo", tempDir, "publish"),
116
+ (err: unknown) => {
117
+ const e = err as { status: number; stderr: Buffer }; // NOTE(review): radExperiment sets encoding "utf-8", so stderr is presumably a string already; toString() works either way
118
+ return e.status !== 0 && e.stderr.toString().includes("missing required argument");
119
+ },
120
+ );
121
+ } finally {
122
+ try { rmSync(tempDir, { recursive: true }); } catch { /* ignore */ }
123
+ }
124
+ });