rad-experiment 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +87 -0
- package/dist/cli/commands/list.d.ts +1 -0
- package/dist/cli/commands/list.js +35 -0
- package/dist/cli/commands/publish.d.ts +1 -0
- package/dist/cli/commands/publish.js +63 -0
- package/dist/cli/commands/reproduce.d.ts +1 -0
- package/dist/cli/commands/reproduce.js +45 -0
- package/dist/cli/commands/show.d.ts +1 -0
- package/dist/cli/commands/show.js +61 -0
- package/dist/cli/format.d.ts +9 -0
- package/dist/cli/format.js +21 -0
- package/dist/cli/helpers.d.ts +49 -0
- package/dist/cli/helpers.js +90 -0
- package/dist/cli/rad.d.ts +11 -0
- package/dist/cli/rad.js +64 -0
- package/dist/cob/actions.d.ts +35 -0
- package/dist/cob/actions.js +57 -0
- package/dist/cob/state.d.ts +7 -0
- package/dist/cob/state.js +97 -0
- package/dist/rad-cob-experiment.d.ts +2 -0
- package/dist/rad-cob-experiment.js +33 -0
- package/dist/rad-experiment.d.ts +2 -0
- package/dist/rad-experiment.js +74 -0
- package/dist/types.d.ts +102 -0
- package/dist/types.js +9 -0
- package/package.json +24 -0
- package/src/__tests__/actions.test.ts +122 -0
- package/src/__tests__/cob-protocol.test.ts +138 -0
- package/src/__tests__/fixtures.ts +119 -0
- package/src/__tests__/format.test.ts +55 -0
- package/src/__tests__/golden/publish-action.json +46 -0
- package/src/__tests__/golden/publish-minimal.json +25 -0
- package/src/__tests__/golden/publish-with-samples.json +38 -0
- package/src/__tests__/golden/reproduce-action.json +19 -0
- package/src/__tests__/golden/reproduce-minimal.json +18 -0
- package/src/__tests__/helpers.test.ts +138 -0
- package/src/__tests__/integration.test.ts +124 -0
- package/src/__tests__/serialization.test.ts +175 -0
- package/src/__tests__/state.test.ts +191 -0
- package/src/cli/commands/list.ts +45 -0
- package/src/cli/commands/publish.ts +68 -0
- package/src/cli/commands/reproduce.ts +52 -0
- package/src/cli/commands/show.ts +70 -0
- package/src/cli/format.ts +27 -0
- package/src/cli/helpers.ts +101 -0
- package/src/cli/rad.ts +87 -0
- package/src/cob/actions.ts +100 -0
- package/src/cob/state.ts +120 -0
- package/src/rad-cob-experiment.ts +39 -0
- package/src/rad-experiment.ts +85 -0
- package/src/types.ts +133 -0
- package/tsconfig.json +16 -0
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
// Shared test data builders. Each returns a fresh object with sensible defaults.
|
|
2
|
+
|
|
3
|
+
import type { Experiment, Measurement, MetricValue, Op, Reproduction } from "../types.js";
|
|
4
|
+
|
|
5
|
+
/** Fixed 40-character hex object id; the publish-action and experiment builders in this file use it for both `base` and `oid`. */
export const SAMPLE_OID = "7554439853c68bc7ec063d6a7fcdc80f89e5b1fb";
|
|
6
|
+
/** Fixed author key string; used verbatim as `author` by makeOp and embedded as `did:key:<key>` by makeReproduction / makeExperiment. */
export const SAMPLE_AUTHOR_KEY = "z6Mkgoz7YreoB5v1jLvNCyX3MXLaza28Gs6d6yu3ABXE3cdY";
|
|
7
|
+
|
|
8
|
+
/**
 * Create a fresh Measurement fixture.
 *
 * Defaults are n = 5, medianX1000 = 59340 and stdX1000 = 1200; every field
 * present in `overrides` replaces the matching default.
 */
export function makeMeasurement(overrides?: Partial<Measurement>): Measurement {
  const defaults: Measurement = { n: 5, medianX1000: 59340, stdX1000: 1200 };
  return { ...defaults, ...overrides };
}
|
|
11
|
+
|
|
12
|
+
/**
 * Create a fresh MetricValue fixture.
 *
 * The defaults describe a "binary_size" metric in "bytes": baseline median
 * 1000000, candidate median 950000 (both n = 1, std 0), `deltaPctX100` of -500
 * and `regressed: false`. `overrides` is merged shallowly on top, so a nested
 * measurement passed in replaces the default one as a whole.
 */
export function makeMetricValue(overrides?: Partial<MetricValue>): MetricValue {
  const defaults: MetricValue = {
    name: "binary_size",
    unit: "bytes",
    baseline: { n: 1, medianX1000: 1000000, stdX1000: 0 },
    candidate: { n: 1, medianX1000: 950000, stdX1000: 0 },
    deltaPctX100: -500,
    regressed: false,
  };
  return { ...defaults, ...overrides };
}
|
|
23
|
+
|
|
24
|
+
/**
 * Build a raw "publish" action whose top-level keys are snake_case, i.e. the
 * shape stored in COBs. `overrides` is shallow-merged over the defaults.
 */
export function makeRawPublishAction(overrides?: Record<string, unknown>): Record<string, unknown> {
  const defaults: Record<string, unknown> = {
    type: "publish",
    description: "SIMD vectorization in parser loop",
    base: SAMPLE_OID,
    oid: SAMPLE_OID,
    metric_name: "wall_time",
    metric_unit: "ms",
    direction: "lower_is_better",
    runner_class: "arm64",
    os: "linux",
    cpu: "apple-m2",
    baseline: makeMeasurement(),
    candidate: makeMeasurement({ n: 5, medianX1000: 45200, stdX1000: 950 }),
    delta_pct_x100: 2378,
    build_ok: true,
    tests_ok: true,
    sanitizers_ok: false,
    agent_system: "claude-code",
    agent_model: "claude-opus-4-6",
  };
  return { ...defaults, ...overrides };
}
|
|
48
|
+
|
|
49
|
+
/**
 * Build a raw "reproduce" action with snake_case top-level keys.
 * `overrides` is shallow-merged over the defaults.
 */
export function makeRawReproduceAction(overrides?: Record<string, unknown>): Record<string, unknown> {
  const defaults: Record<string, unknown> = {
    type: "reproduce",
    verdict: "confirmed",
    runner_class: "amd64",
    baseline: makeMeasurement({ n: 10, medianX1000: 59000, stdX1000: 800 }),
    candidate: makeMeasurement({ n: 10, medianX1000: 45500, stdX1000: 600 }),
    delta_pct_x100: 2288,
    build_ok: true,
    tests_ok: true,
    notes: "Reproduced on CI",
  };
  return { ...defaults, ...overrides };
}
|
|
64
|
+
|
|
65
|
+
/**
 * Build a minimal Op fixture.
 *
 * When no `actions` override is supplied (or it is undefined) the op carries a
 * single default publish action. `actions` accepts raw records and is cast to
 * the Op action type; all other override fields are shallow-merged last.
 */
export function makeOp(overrides?: Partial<Op> & { actions?: Record<string, unknown>[] }): Op {
  const { actions: rawActions, ...fields } = overrides ?? {};
  const base: Op = {
    id: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    actions: (rawActions ?? [makeRawPublishAction()]) as Op["actions"],
    author: SAMPLE_AUTHOR_KEY,
    timestamp: 1711300000000,
    parents: [],
    related: [],
    identity: null,
    manifest: { typeName: "cc.experiment", version: 1 },
  };
  return { ...base, ...fields };
}
|
|
80
|
+
|
|
81
|
+
/**
 * Create a fresh camelCase Reproduction fixture: a "confirmed" verdict from an
 * "amd64" runner, authored by `did:key:<SAMPLE_AUTHOR_KEY>`. `overrides` is
 * shallow-merged over the defaults.
 */
export function makeReproduction(overrides?: Partial<Reproduction>): Reproduction {
  const defaults: Reproduction = {
    verdict: "confirmed",
    runnerClass: "amd64",
    baseline: makeMeasurement({ n: 10, medianX1000: 59000, stdX1000: 800 }),
    candidate: makeMeasurement({ n: 10, medianX1000: 45500, stdX1000: 600 }),
    deltaPctX100: 2288,
    author: { id: `did:key:${SAMPLE_AUTHOR_KEY}` },
    timestamp: 1711300001000,
  };
  return { ...defaults, ...overrides };
}
|
|
93
|
+
|
|
94
|
+
/**
 * Build a camelCase Experiment state fixture (the shape produced by the helper).
 *
 * Starts with an empty `reproductions` list and an author of
 * `did:key:<SAMPLE_AUTHOR_KEY>`; `overrides` is shallow-merged over the defaults.
 */
export function makeExperiment(overrides?: Partial<Experiment>): Experiment {
  const defaults: Experiment = {
    description: "SIMD vectorization in parser loop",
    base: SAMPLE_OID,
    oid: SAMPLE_OID,
    metricName: "wall_time",
    metricUnit: "ms",
    direction: "lower_is_better",
    runnerClass: "arm64",
    os: "linux",
    cpu: "apple-m2",
    baseline: makeMeasurement(),
    candidate: makeMeasurement({ n: 5, medianX1000: 45200, stdX1000: 950 }),
    deltaPctX100: 2378,
    buildOk: true,
    testsOk: true,
    sanitizersOk: false,
    agentSystem: "claude-code",
    agentModel: "claude-opus-4-6",
    reproductions: [],
    author: { id: `did:key:${SAMPLE_AUTHOR_KEY}` },
    createdAt: 1711300000000,
  };
  return { ...defaults, ...overrides };
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { describe, it } from "node:test";
|
|
2
|
+
import assert from "node:assert/strict";
|
|
3
|
+
|
|
4
|
+
import { deltaDisplay, measurementDisplay, shortId, confirmedCount } from "../cli/format.js";
|
|
5
|
+
import { makeExperiment, makeReproduction } from "./fixtures.js";
|
|
6
|
+
|
|
7
|
+
describe("deltaDisplay", () => {
  // Each row: [test title, deltaPctX100 input, expected rendered string].
  const cases: ReadonlyArray<readonly [string, number, string]> = [
    ["formats positive value", 547, "+5.47%"],
    ["formats zero", 0, "+0.00%"],
    ["formats negative value", -78, "-0.78%"],
    ["formats single-digit fractional", 5, "+0.05%"],
    ["formats exact percent", 1000, "+10.00%"],
    ["formats large value", 23780, "+237.80%"],
  ];

  for (const [title, input, expected] of cases) {
    it(title, () => {
      assert.equal(deltaDisplay(input), expected);
    });
  }
});
|
|
15
|
+
|
|
16
|
+
describe("measurementDisplay", () => {
  // Only the median and the unit appear in the expected strings below.
  it("formats normal value", () => {
    const rendered = measurementDisplay({ n: 10, medianX1000: 59340, stdX1000: 120 }, "ms");
    assert.equal(rendered, "59.340 ms");
  });

  it("formats sub-unit", () => {
    const rendered = measurementDisplay({ n: 5, medianX1000: 500, stdX1000: 10 }, "ms");
    assert.equal(rendered, "0.500 ms");
  });

  it("formats zero", () => {
    const rendered = measurementDisplay({ n: 1, medianX1000: 0, stdX1000: 0 }, "s");
    assert.equal(rendered, "0.0 s");
  });
});
|
|
27
|
+
|
|
28
|
+
describe("shortId", () => {
  it("truncates to 7 chars", () => {
    const fullId = "abcdef1234567890abcd";
    assert.equal(shortId(fullId), "abcdef1");
  });

  it("handles short input", () => {
    // Input shorter than 7 characters is returned unchanged.
    assert.equal(shortId("abc"), "abc");
  });
});
|
|
32
|
+
|
|
33
|
+
describe("confirmedCount", () => {
  it("counts confirmed reproductions", () => {
    // Two of the three verdicts are "confirmed".
    const reproductions = (["confirmed", "failed", "confirmed"] as const).map((verdict) =>
      makeReproduction({ verdict }),
    );
    const experiment = makeExperiment({ reproductions });
    assert.equal(confirmedCount(experiment), 2);
  });

  it("returns 0 when none confirmed", () => {
    const onlyInconclusive = [makeReproduction({ verdict: "inconclusive" })];
    const experiment = makeExperiment({ reproductions: onlyInconclusive });
    assert.equal(confirmedCount(experiment), 0);
  });

  it("returns 0 for empty reproductions", () => {
    // The default experiment fixture has no reproductions.
    const experiment = makeExperiment();
    assert.equal(confirmedCount(experiment), 0);
  });
});
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"agent_model": "claude-opus-4-6",
|
|
3
|
+
"agent_system": "claude-code",
|
|
4
|
+
"base": "7554439853c68bc7ec063d6a7fcdc80f89e5b1fb",
|
|
5
|
+
"baseline": {
|
|
6
|
+
"medianX1000": 59340,
|
|
7
|
+
"n": 5,
|
|
8
|
+
"stdX1000": 1200
|
|
9
|
+
},
|
|
10
|
+
"build_ok": true,
|
|
11
|
+
"candidate": {
|
|
12
|
+
"medianX1000": 45200,
|
|
13
|
+
"n": 5,
|
|
14
|
+
"stdX1000": 950
|
|
15
|
+
},
|
|
16
|
+
"cpu": "apple-m2",
|
|
17
|
+
"delta_pct_x100": 2378,
|
|
18
|
+
"description": "SIMD vectorization in parser loop",
|
|
19
|
+
"direction": "lower_is_better",
|
|
20
|
+
"metric_name": "wall_time",
|
|
21
|
+
"metric_unit": "ms",
|
|
22
|
+
"oid": "7554439853c68bc7ec063d6a7fcdc80f89e5b1fb",
|
|
23
|
+
"os": "linux",
|
|
24
|
+
"runner_class": "arm64",
|
|
25
|
+
"sanitizers_ok": false,
|
|
26
|
+
"secondary_metrics": [
|
|
27
|
+
{
|
|
28
|
+
"baseline": {
|
|
29
|
+
"medianX1000": 1000000,
|
|
30
|
+
"n": 1,
|
|
31
|
+
"stdX1000": 0
|
|
32
|
+
},
|
|
33
|
+
"candidate": {
|
|
34
|
+
"medianX1000": 950000,
|
|
35
|
+
"n": 1,
|
|
36
|
+
"stdX1000": 0
|
|
37
|
+
},
|
|
38
|
+
"deltaPctX100": -500,
|
|
39
|
+
"name": "binary_size",
|
|
40
|
+
"regressed": false,
|
|
41
|
+
"unit": "bytes"
|
|
42
|
+
}
|
|
43
|
+
],
|
|
44
|
+
"tests_ok": true,
|
|
45
|
+
"type": "publish"
|
|
46
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"agent_model": "claude-opus-4-6",
|
|
3
|
+
"agent_system": "claude-code",
|
|
4
|
+
"base": "defd08dcff239ec2a3a57ec82f541d21210e98f9",
|
|
5
|
+
"baseline": {
|
|
6
|
+
"medianX1000": 50000,
|
|
7
|
+
"n": 10,
|
|
8
|
+
"stdX1000": 0
|
|
9
|
+
},
|
|
10
|
+
"build_ok": true,
|
|
11
|
+
"candidate": {
|
|
12
|
+
"medianX1000": 55000,
|
|
13
|
+
"n": 10,
|
|
14
|
+
"stdX1000": 0
|
|
15
|
+
},
|
|
16
|
+
"delta_pct_x100": 1000,
|
|
17
|
+
"direction": "higher_is_better",
|
|
18
|
+
"metric_name": "throughput",
|
|
19
|
+
"metric_unit": "ops/s",
|
|
20
|
+
"oid": "defd08dcff239ec2a3a57ec82f541d21210e98f9",
|
|
21
|
+
"runner_class": "amd64",
|
|
22
|
+
"sanitizers_ok": false,
|
|
23
|
+
"tests_ok": true,
|
|
24
|
+
"type": "publish"
|
|
25
|
+
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"agent_model": "claude-opus-4-6",
|
|
3
|
+
"agent_system": "claude-code",
|
|
4
|
+
"base": "defd08dcff239ec2a3a57ec82f541d21210e98f9",
|
|
5
|
+
"baseline": {
|
|
6
|
+
"medianX1000": 59340,
|
|
7
|
+
"n": 3,
|
|
8
|
+
"samplesX1000": [
|
|
9
|
+
58000,
|
|
10
|
+
59340,
|
|
11
|
+
60680
|
|
12
|
+
],
|
|
13
|
+
"stdX1000": 1200
|
|
14
|
+
},
|
|
15
|
+
"build_ok": true,
|
|
16
|
+
"candidate": {
|
|
17
|
+
"medianX1000": 45200,
|
|
18
|
+
"n": 3,
|
|
19
|
+
"samplesX1000": [
|
|
20
|
+
44500,
|
|
21
|
+
45200,
|
|
22
|
+
45900
|
|
23
|
+
],
|
|
24
|
+
"stdX1000": 950
|
|
25
|
+
},
|
|
26
|
+
"cpu": "apple-m2",
|
|
27
|
+
"delta_pct_x100": 2378,
|
|
28
|
+
"description": "With per-run samples",
|
|
29
|
+
"direction": "lower_is_better",
|
|
30
|
+
"metric_name": "wall_time",
|
|
31
|
+
"metric_unit": "ms",
|
|
32
|
+
"oid": "defd08dcff239ec2a3a57ec82f541d21210e98f9",
|
|
33
|
+
"os": "linux",
|
|
34
|
+
"runner_class": "arm64",
|
|
35
|
+
"sanitizers_ok": false,
|
|
36
|
+
"tests_ok": true,
|
|
37
|
+
"type": "publish"
|
|
38
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
{
|
|
2
|
+
"baseline": {
|
|
3
|
+
"medianX1000": 59000,
|
|
4
|
+
"n": 10,
|
|
5
|
+
"stdX1000": 800
|
|
6
|
+
},
|
|
7
|
+
"build_ok": true,
|
|
8
|
+
"candidate": {
|
|
9
|
+
"medianX1000": 45500,
|
|
10
|
+
"n": 10,
|
|
11
|
+
"stdX1000": 600
|
|
12
|
+
},
|
|
13
|
+
"delta_pct_x100": 2288,
|
|
14
|
+
"notes": "Reproduced on CI",
|
|
15
|
+
"runner_class": "amd64",
|
|
16
|
+
"tests_ok": true,
|
|
17
|
+
"type": "reproduce",
|
|
18
|
+
"verdict": "confirmed"
|
|
19
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
{
|
|
2
|
+
"baseline": {
|
|
3
|
+
"medianX1000": 50500,
|
|
4
|
+
"n": 5,
|
|
5
|
+
"stdX1000": 0
|
|
6
|
+
},
|
|
7
|
+
"build_ok": true,
|
|
8
|
+
"candidate": {
|
|
9
|
+
"medianX1000": 50800,
|
|
10
|
+
"n": 5,
|
|
11
|
+
"stdX1000": 0
|
|
12
|
+
},
|
|
13
|
+
"delta_pct_x100": 59,
|
|
14
|
+
"runner_class": "arm64",
|
|
15
|
+
"tests_ok": true,
|
|
16
|
+
"type": "reproduce",
|
|
17
|
+
"verdict": "inconclusive"
|
|
18
|
+
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import { describe, it } from "node:test";
|
|
2
|
+
import assert from "node:assert/strict";
|
|
3
|
+
|
|
4
|
+
import {
|
|
5
|
+
CliError,
|
|
6
|
+
requireArg,
|
|
7
|
+
requireInt,
|
|
8
|
+
optionalInt,
|
|
9
|
+
buildMeasurement,
|
|
10
|
+
parseSecondary,
|
|
11
|
+
} from "../cli/helpers.js";
|
|
12
|
+
|
|
13
|
+
describe("requireArg", () => {
  it("returns string value", () => {
    const value = requireArg({ foo: "bar" }, "foo");
    assert.equal(value, "bar");
  });

  it("throws CliError on missing key", () => {
    const lookup = () => requireArg({}, "foo");
    assert.throws(lookup, CliError);
  });

  it("throws CliError on empty string", () => {
    const lookup = () => requireArg({ foo: "" }, "foo");
    assert.throws(lookup, CliError);
  });

  it("throws CliError on null", () => {
    const lookup = () => requireArg({ foo: null }, "foo");
    assert.throws(lookup, CliError);
  });
});
|
|
30
|
+
|
|
31
|
+
describe("requireInt", () => {
  it("parses integer string", () => {
    const parsed = requireInt({ n: "42" }, "n");
    assert.equal(parsed, 42);
  });

  it("parses negative integer", () => {
    const parsed = requireInt({ n: "-7" }, "n");
    assert.equal(parsed, -7);
  });

  it("throws CliError on non-integer", () => {
    const parse = () => requireInt({ n: "abc" }, "n");
    assert.throws(parse, CliError);
  });

  it("throws CliError on missing", () => {
    const parse = () => requireInt({}, "n");
    assert.throws(parse, CliError);
  });
});
|
|
48
|
+
|
|
49
|
+
describe("optionalInt", () => {
  // Fallback passed as the third argument in every case below.
  const fallback = 99;

  it("returns default when missing", () => {
    assert.equal(optionalInt({}, "x", fallback), 99);
  });

  it("returns default when empty string", () => {
    assert.equal(optionalInt({ x: "" }, "x", fallback), 99);
  });

  it("parses when present", () => {
    assert.equal(optionalInt({ x: "7" }, "x", fallback), 7);
  });

  it("throws CliError on non-integer", () => {
    const parse = () => optionalInt({ x: "abc" }, "x", fallback);
    assert.throws(parse, CliError);
  });
});
|
|
66
|
+
|
|
67
|
+
describe("buildMeasurement", () => {
  // Argument keys are "<prefix>-median", "<prefix>-std", "<prefix>-n" and
  // "<prefix>-samples", where the prefix is the second parameter.
  it("builds baseline measurement", () => {
    const { n, medianX1000, stdX1000, samplesX1000 } = buildMeasurement(
      { "baseline-median": "5000", "baseline-std": "100", "baseline-n": "10", "baseline-samples": "1000,2000,3000" },
      "baseline",
    );
    assert.equal(n, 10);
    assert.equal(medianX1000, 5000);
    assert.equal(stdX1000, 100);
    assert.deepEqual(samplesX1000, [1000, 2000, 3000]);
  });

  it("builds candidate measurement", () => {
    const { n, medianX1000, stdX1000 } = buildMeasurement(
      { "candidate-median": "4000", "candidate-n": "5" },
      "candidate",
    );
    assert.equal(n, 5);
    assert.equal(medianX1000, 4000);
    assert.equal(stdX1000, 0); // default
  });

  it("defaults std to 0", () => {
    const measurement = buildMeasurement({ "baseline-median": "100", "baseline-n": "1" }, "baseline");
    assert.equal(measurement.stdX1000, 0);
  });

  it("omits samplesX1000 when empty", () => {
    const measurement = buildMeasurement(
      { "baseline-median": "100", "baseline-n": "1", "baseline-samples": "" },
      "baseline",
    );
    // The key must be absent, not merely undefined.
    const hasSamples = "samplesX1000" in measurement;
    assert.equal(hasSamples, false);
  });
});
|
|
102
|
+
|
|
103
|
+
describe("parseSecondary", () => {
  // Spec strings are colon-separated: name:unit:baseline:candidate:delta[:regressed].
  it("parses 5-field format", () => {
    const { name, unit, baseline, candidate, deltaPctX100, regressed } =
      parseSecondary("rss:KB:5000:4500:-1000");
    assert.equal(name, "rss");
    assert.equal(unit, "KB");
    assert.equal(baseline.medianX1000, 5000);
    assert.equal(candidate.medianX1000, 4500);
    assert.equal(deltaPctX100, -1000);
    assert.equal(regressed, false);
    assert.equal(baseline.n, 1);
    assert.equal(baseline.stdX1000, 0);
  });

  it("parses 6-field format with regressed=true", () => {
    const { regressed } = parseSecondary("mem:MB:41000:42000:243:true");
    assert.equal(regressed, true);
  });

  it("defaults regressed to false", () => {
    const { regressed } = parseSecondary("x:y:1:2:3");
    assert.equal(regressed, false);
  });

  it("throws CliError on too few fields", () => {
    const parse = () => parseSecondary("a:b:c");
    assert.throws(parse, CliError);
  });

  it("throws CliError on non-integer numeric fields", () => {
    const parse = () => parseSecondary("a:b:notnum:2:3");
    assert.throws(parse, CliError);
  });

  it("handles negative delta", () => {
    const { deltaPctX100 } = parseSecondary("x:y:100:200:-500");
    assert.equal(deltaPctX100, -500);
  });
});
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
// End-to-end integration tests. Requires `rad` on PATH with a configured identity.
|
|
2
|
+
// Creates a temp radicle repo, runs CLI commands, verifies output.
|
|
3
|
+
|
|
4
|
+
import { test } from "node:test";
|
|
5
|
+
import assert from "node:assert/strict";
|
|
6
|
+
import { execFileSync } from "node:child_process";
|
|
7
|
+
import { mkdtempSync, rmSync } from "node:fs";
|
|
8
|
+
import { join, dirname } from "node:path";
|
|
9
|
+
import { tmpdir } from "node:os";
|
|
10
|
+
import { fileURLToPath } from "node:url";
|
|
11
|
+
|
|
12
|
+
// Directory containing this test file, derived from the module URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
13
|
+
// Path to the CLI entry point (TypeScript source), one directory above this file.
const CLI = join(__dirname, "..", "rad-experiment.ts");
|
|
14
|
+
// Path to the `tsx` binary under node_modules/.bin, two directories above this file; used to execute CLI.
const TSX = join(__dirname, "..", "..", "node_modules", ".bin", "tsx");
|
|
15
|
+
|
|
16
|
+
/**
 * Run the rad-experiment CLI through tsx with the given arguments and return
 * its stdout as UTF-8 text. The child process is given a 15 000 ms timeout;
 * execFileSync throws if the process fails or times out.
 */
function radExperiment(...args: string[]): string {
  const argv = [CLI, ...args];
  const stdout = execFileSync(TSX, argv, { encoding: "utf-8", timeout: 15000 });
  return stdout;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Run `cmd` with `args` synchronously and return its stdout decoded as UTF-8.
 * `opts` (cwd / timeout) is spread after the encoding and passed to execFileSync.
 */
function shell(cmd: string, args: string[], opts?: { cwd?: string; timeout?: number }): string {
  const stdout = execFileSync(cmd, args, { encoding: "utf-8", ...opts });
  return stdout;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Run `cmd` with `args` synchronously with the parent's stdio inherited, so no
 * output is captured or returned. `opts` (cwd / timeout) is spread after the
 * stdio setting and passed to execFileSync.
 */
function shellQuiet(cmd: string, args: string[], opts?: { cwd?: string; timeout?: number }): void {
  const spawnOptions = { stdio: "inherit" as const, ...opts };
  execFileSync(cmd, args, spawnOptions);
}
|
|
27
|
+
|
|
28
|
+
/**
 * Report whether the integration test can run: true when `rad self` executes
 * without throwing, false on any error.
 */
function canRunIntegration(): boolean {
  let available = true;
  try {
    shell("rad", ["self"]);
  } catch {
    available = false;
  }
  return available;
}
|
|
36
|
+
|
|
37
|
+
// Full CLI workflow against a throwaway radicle repo:
// publish -> list -> show (text and JSON) -> reproduce -> filtered lists -> error path.
// Skipped with the reason "rad not available" when canRunIntegration() returns false.
test("integration: full CLI workflow", { skip: !canRunIntegration() && "rad not available" }, () => {
  const tempDir = mkdtempSync(join(tmpdir(), "rad-exp-test-"));

  try {
    // Setup: init git + radicle repo (use inherited stdio — rad init fails with piped stdio)
    // Unique commit message avoids radicle identity collision across test runs.
    shellQuiet("git", ["init", "-b", "main"], { cwd: tempDir });
    shellQuiet("git", ["commit", "--allow-empty", "-m", `init-${Date.now()}`], { cwd: tempDir });
    shellQuiet(
      "rad",
      ["init", "--name", `test-${Date.now()}`, "--description", "test", "--default-branch", "main", "--public"],
      { cwd: tempDir, timeout: 15000 },
    );
    const commit = shell("git", ["rev-parse", "HEAD"], { cwd: tempDir }).trim();

    // publish
    const publishOutput = radExperiment(
      "--repo", tempDir,
      "publish",
      "--base", commit, "--head", commit,
      "--metric", "wall_time", "--unit", "ms",
      "--direction", "lower_is_better",
      "--runner", "arm64",
      "--baseline-median", "59340", "--baseline-n", "5",
      "--candidate-median", "45200", "--candidate-n", "5",
      "--delta", "2378",
      "-d", "Integration test experiment",
    );
    // Run the id pattern once; assert.ok both fails the test on a miss and
    // narrows the capture to a string, so no non-null assertion is needed.
    const publishedId = /Experiment published: ([a-f0-9]{40})/.exec(publishOutput)?.[1];
    assert.ok(publishedId, `expected "Experiment published: <40-hex id>" in publish output, got: ${publishOutput}`);
    assert.match(publishOutput, /\+23\.78%/);

    // list
    const listOutput = radExperiment("--repo", tempDir, "list");
    assert.match(listOutput, new RegExp(publishedId.slice(0, 7)));
    assert.match(listOutput, /wall_time/);

    // show (text)
    const showOutput = radExperiment("--repo", tempDir, "show", publishedId);
    assert.match(showOutput, /Integration test experiment/);
    assert.match(showOutput, /wall_time \(ms\)/);
    assert.match(showOutput, /59\.340 ms/);
    assert.match(showOutput, /\+23\.78%/);

    // show --json
    const jsonOutput = radExperiment("--repo", tempDir, "show", "--json", publishedId);
    const exp = JSON.parse(jsonOutput);
    assert.equal(exp.metricName, "wall_time");
    assert.equal(exp.deltaPctX100, 2378);
    assert.ok(Array.isArray(exp.reproductions));

    // reproduce
    const reproOutput = radExperiment(
      "--repo", tempDir,
      "reproduce", publishedId,
      "--verdict", "confirmed",
      "--runner", "amd64",
      "--baseline-median", "59000", "--baseline-n", "5",
      "--candidate-median", "45500", "--candidate-n", "5",
      "--delta", "2288",
    );
    assert.match(reproOutput, /Reproduction added/);

    // verify reproduction in show --json
    const afterReproJson = radExperiment("--repo", tempDir, "show", "--json", publishedId);
    const expAfter = JSON.parse(afterReproJson);
    assert.equal(expAfter.reproductions.length, 1);
    assert.equal(expAfter.reproductions[0].verdict, "confirmed");

    // list --reproduced
    const reproList = radExperiment("--repo", tempDir, "list", "--reproduced");
    assert.match(reproList, /verified/);

    // list --unverified
    const unverifiedList = radExperiment("--repo", tempDir, "list", "--unverified");
    assert.match(unverifiedList, /No experiments found/);

    // error path: publish without its required arguments must exit non-zero
    assert.throws(
      () => radExperiment("--repo", tempDir, "publish"),
      (err: unknown) => {
        // radExperiment spawns with `encoding: "utf-8"`, so stderr is a string
        // here; String() also covers a Buffer.
        const { status, stderr } = err as { status: number | null; stderr: string | Buffer };
        return status !== 0 && String(stderr).includes("missing required argument");
      },
    );
  } finally {
    // Best-effort cleanup: a failure to remove the temp dir must not mask the test result.
    try { rmSync(tempDir, { recursive: true }); } catch { /* ignore */ }
  }
});
|