ppef 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -125
- package/dist/__tests__/cli/evaluate-command.integration.test.d.ts +8 -0
- package/dist/__tests__/cli/evaluate-command.integration.test.d.ts.map +1 -0
- package/dist/__tests__/cli/evaluate-command.integration.test.js +308 -0
- package/dist/__tests__/cli/evaluate-command.integration.test.js.map +1 -0
- package/dist/__tests__/evaluators/claims-evaluator.unit.test.d.ts +8 -0
- package/dist/__tests__/evaluators/claims-evaluator.unit.test.d.ts.map +1 -0
- package/dist/__tests__/evaluators/claims-evaluator.unit.test.js +405 -0
- package/dist/__tests__/evaluators/claims-evaluator.unit.test.js.map +1 -0
- package/dist/__tests__/evaluators/metrics-evaluator.unit.test.d.ts +8 -0
- package/dist/__tests__/evaluators/metrics-evaluator.unit.test.d.ts.map +1 -0
- package/dist/__tests__/evaluators/metrics-evaluator.unit.test.js +424 -0
- package/dist/__tests__/evaluators/metrics-evaluator.unit.test.js.map +1 -0
- package/dist/__tests__/evaluators/registry.unit.test.d.ts +7 -0
- package/dist/__tests__/evaluators/registry.unit.test.d.ts.map +1 -0
- package/dist/__tests__/evaluators/registry.unit.test.js +173 -0
- package/dist/__tests__/evaluators/registry.unit.test.js.map +1 -0
- package/dist/__tests__/evaluators/robustness-evaluator.unit.test.d.ts +8 -0
- package/dist/__tests__/evaluators/robustness-evaluator.unit.test.d.ts.map +1 -0
- package/dist/__tests__/evaluators/robustness-evaluator.unit.test.js +260 -0
- package/dist/__tests__/evaluators/robustness-evaluator.unit.test.js.map +1 -0
- package/dist/__tests__/framework-pipeline.integration.test.js +36 -9
- package/dist/__tests__/framework-pipeline.integration.test.js.map +1 -1
- package/dist/__tests__/index-exports.unit.test.js +9 -12
- package/dist/__tests__/index-exports.unit.test.js.map +1 -1
- package/dist/aggregation/pipeline.d.ts.map +1 -1
- package/dist/aggregation/pipeline.js +40 -3
- package/dist/aggregation/pipeline.js.map +1 -1
- package/dist/claims/index.d.ts +6 -3
- package/dist/claims/index.d.ts.map +1 -1
- package/dist/claims/index.js +6 -3
- package/dist/claims/index.js.map +1 -1
- package/dist/cli/__tests__/aggregate.command.unit.test.js +3 -0
- package/dist/cli/__tests__/aggregate.command.unit.test.js.map +1 -1
- package/dist/cli/__tests__/binary-sut.integration.test.d.ts +8 -0
- package/dist/cli/__tests__/binary-sut.integration.test.d.ts.map +1 -0
- package/dist/cli/__tests__/binary-sut.integration.test.js +165 -0
- package/dist/cli/__tests__/binary-sut.integration.test.js.map +1 -0
- package/dist/cli/__tests__/config-loader.unit.test.d.ts +7 -0
- package/dist/cli/__tests__/config-loader.unit.test.d.ts.map +1 -0
- package/dist/cli/__tests__/config-loader.unit.test.js +611 -0
- package/dist/cli/__tests__/config-loader.unit.test.js.map +1 -0
- package/dist/cli/command-deps.d.ts +13 -1
- package/dist/cli/command-deps.d.ts.map +1 -1
- package/dist/cli/commands/aggregate.d.ts.map +1 -1
- package/dist/cli/commands/aggregate.js +3 -0
- package/dist/cli/commands/aggregate.js.map +1 -1
- package/dist/cli/commands/evaluate.d.ts +41 -0
- package/dist/cli/commands/evaluate.d.ts.map +1 -0
- package/dist/cli/commands/evaluate.js +287 -0
- package/dist/cli/commands/evaluate.js.map +1 -0
- package/dist/cli/commands/run.d.ts.map +1 -1
- package/dist/cli/commands/run.js +93 -1
- package/dist/cli/commands/run.js.map +1 -1
- package/dist/cli/index.d.ts +2 -1
- package/dist/cli/index.d.ts.map +1 -1
- package/dist/cli/index.js +3 -1
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/module-loader.d.ts +23 -1
- package/dist/cli/module-loader.d.ts.map +1 -1
- package/dist/cli/module-loader.js +19 -1
- package/dist/cli/module-loader.js.map +1 -1
- package/dist/cli/types.d.ts +19 -0
- package/dist/cli/types.d.ts.map +1 -1
- package/dist/evaluators/claims-evaluator.d.ts +87 -0
- package/dist/evaluators/claims-evaluator.d.ts.map +1 -0
- package/dist/evaluators/claims-evaluator.js +289 -0
- package/dist/evaluators/claims-evaluator.js.map +1 -0
- package/dist/evaluators/exploratory-evaluator.d.ts +136 -0
- package/dist/evaluators/exploratory-evaluator.d.ts.map +1 -0
- package/dist/evaluators/exploratory-evaluator.js +545 -0
- package/dist/evaluators/exploratory-evaluator.js.map +1 -0
- package/dist/evaluators/index.d.ts +13 -0
- package/dist/evaluators/index.d.ts.map +1 -0
- package/dist/evaluators/index.js +14 -0
- package/dist/evaluators/index.js.map +1 -0
- package/dist/evaluators/metrics-evaluator.d.ts +114 -0
- package/dist/evaluators/metrics-evaluator.d.ts.map +1 -0
- package/dist/evaluators/metrics-evaluator.js +433 -0
- package/dist/evaluators/metrics-evaluator.js.map +1 -0
- package/dist/evaluators/registry.d.ts +106 -0
- package/dist/evaluators/registry.d.ts.map +1 -0
- package/dist/evaluators/registry.js +148 -0
- package/dist/evaluators/registry.js.map +1 -0
- package/dist/evaluators/robustness-evaluator.d.ts +57 -0
- package/dist/evaluators/robustness-evaluator.d.ts.map +1 -0
- package/dist/evaluators/robustness-evaluator.js +186 -0
- package/dist/evaluators/robustness-evaluator.js.map +1 -0
- package/dist/executor/__tests__/binary-sut.unit.test.d.ts +8 -0
- package/dist/executor/__tests__/binary-sut.unit.test.d.ts.map +1 -0
- package/dist/executor/__tests__/binary-sut.unit.test.js +313 -0
- package/dist/executor/__tests__/binary-sut.unit.test.js.map +1 -0
- package/dist/executor/__tests__/checkpoint-storage.unit.test.js +43 -0
- package/dist/executor/__tests__/checkpoint-storage.unit.test.js.map +1 -1
- package/dist/executor/__tests__/executor.unit.test.js +56 -9
- package/dist/executor/__tests__/executor.unit.test.js.map +1 -1
- package/dist/executor/__tests__/resource-calculator.unit.test.d.ts +10 -0
- package/dist/executor/__tests__/resource-calculator.unit.test.d.ts.map +1 -0
- package/dist/executor/__tests__/resource-calculator.unit.test.js +104 -0
- package/dist/executor/__tests__/resource-calculator.unit.test.js.map +1 -0
- package/dist/executor/__tests__/worker-threads-executor.unit.test.d.ts +8 -0
- package/dist/executor/__tests__/worker-threads-executor.unit.test.d.ts.map +1 -0
- package/dist/executor/__tests__/worker-threads-executor.unit.test.js +276 -0
- package/dist/executor/__tests__/worker-threads-executor.unit.test.js.map +1 -0
- package/dist/executor/binary-sut.d.ts +105 -0
- package/dist/executor/binary-sut.d.ts.map +1 -0
- package/dist/executor/binary-sut.js +174 -0
- package/dist/executor/binary-sut.js.map +1 -0
- package/dist/executor/checkpoint-storage.d.ts.map +1 -1
- package/dist/executor/checkpoint-storage.js +6 -4
- package/dist/executor/checkpoint-storage.js.map +1 -1
- package/dist/executor/executor.d.ts +28 -0
- package/dist/executor/executor.d.ts.map +1 -1
- package/dist/executor/executor.js +85 -24
- package/dist/executor/executor.js.map +1 -1
- package/dist/executor/index.d.ts +4 -0
- package/dist/executor/index.d.ts.map +1 -1
- package/dist/executor/index.js +4 -0
- package/dist/executor/index.js.map +1 -1
- package/dist/executor/resource-calculator.d.ts +49 -0
- package/dist/executor/resource-calculator.d.ts.map +1 -0
- package/dist/executor/resource-calculator.js +129 -0
- package/dist/executor/resource-calculator.js.map +1 -0
- package/dist/executor/worker-entry.js +26 -10
- package/dist/executor/worker-entry.js.map +1 -1
- package/dist/executor/worker-executor.d.ts +104 -3
- package/dist/executor/worker-executor.d.ts.map +1 -1
- package/dist/executor/worker-executor.js +224 -4
- package/dist/executor/worker-executor.js.map +1 -1
- package/dist/executor/worker-threads-executor.d.ts +245 -0
- package/dist/executor/worker-threads-executor.d.ts.map +1 -0
- package/dist/executor/worker-threads-executor.js +332 -0
- package/dist/executor/worker-threads-executor.js.map +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -2
- package/dist/index.js.map +1 -1
- package/dist/renderers/latex-renderer.d.ts +60 -0
- package/dist/renderers/latex-renderer.d.ts.map +1 -1
- package/dist/renderers/latex-renderer.js +299 -0
- package/dist/renderers/latex-renderer.js.map +1 -1
- package/dist/renderers/types.d.ts +9 -0
- package/dist/renderers/types.d.ts.map +1 -1
- package/dist/renderers/types.js.map +1 -1
- package/dist/robustness/index.d.ts +5 -2
- package/dist/robustness/index.d.ts.map +1 -1
- package/dist/robustness/index.js +4 -2
- package/dist/robustness/index.js.map +1 -1
- package/dist/types/evaluator.d.ts +449 -0
- package/dist/types/evaluator.d.ts.map +1 -0
- package/dist/types/evaluator.js +9 -0
- package/dist/types/evaluator.js.map +1 -0
- package/dist/types/result.d.ts +2 -0
- package/dist/types/result.d.ts.map +1 -1
- package/package.json +2 -2
- package/dist/claims/__tests__/evaluator.unit.test.d.ts +0 -12
- package/dist/claims/__tests__/evaluator.unit.test.d.ts.map +0 -1
- package/dist/claims/__tests__/evaluator.unit.test.js +0 -801
- package/dist/claims/__tests__/evaluator.unit.test.js.map +0 -1
- package/dist/claims/evaluator.d.ts +0 -33
- package/dist/claims/evaluator.d.ts.map +0 -1
- package/dist/claims/evaluator.js +0 -174
- package/dist/claims/evaluator.js.map +0 -1
- package/dist/robustness/__tests__/analyzer.unit.test.d.ts +0 -11
- package/dist/robustness/__tests__/analyzer.unit.test.d.ts.map +0 -1
- package/dist/robustness/__tests__/analyzer.unit.test.js +0 -455
- package/dist/robustness/__tests__/analyzer.unit.test.js.map +0 -1
- package/dist/robustness/analyzer.d.ts +0 -61
- package/dist/robustness/analyzer.d.ts.map +0 -1
- package/dist/robustness/analyzer.js +0 -191
- package/dist/robustness/analyzer.js.map +0 -1
|
@@ -1,801 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Unit tests for Claims Evaluator
|
|
3
|
-
*
|
|
4
|
-
* Tests the evaluateClaim function and related functionality including:
|
|
5
|
-
* - Satisfied claims with various directions
|
|
6
|
-
* - Violated claims
|
|
7
|
-
* - Inconclusive results from missing data
|
|
8
|
-
* - Evidence computation (delta, ratio, pValue, effectSize)
|
|
9
|
-
* - Scope filtering with caseClass constraints
|
|
10
|
-
*/
|
|
11
|
-
import { describe, it } from "node:test";
|
|
12
|
-
import { strict as assert } from "node:assert";
|
|
13
|
-
import { createMockAggregate, createMockSummaryStats } from "../../__tests__/test-helpers.js";
|
|
14
|
-
import { evaluateClaim, evaluateClaims, createClaimSummary } from "../evaluator.js";
|
|
15
|
-
/**
|
|
16
|
-
* Test helpers
|
|
17
|
-
*/
|
|
18
|
-
/**
|
|
19
|
-
* Create a test claim with defaults.
|
|
20
|
-
*/
|
|
21
|
-
const createTestClaim = (overrides) => ({
|
|
22
|
-
claimId: "C001",
|
|
23
|
-
description: "Test claim",
|
|
24
|
-
sut: "primary-sut",
|
|
25
|
-
baseline: "baseline-sut",
|
|
26
|
-
metric: "execution-time",
|
|
27
|
-
direction: "less",
|
|
28
|
-
scope: "global",
|
|
29
|
-
...overrides,
|
|
30
|
-
});
|
|
31
|
-
/**
|
|
32
|
-
* Test suites
|
|
33
|
-
*/
|
|
34
|
-
describe("evaluateClaim", () => {
|
|
35
|
-
describe("satisfied claims - direction 'less'", () => {
|
|
36
|
-
it("should satisfy claim when primary < baseline", () => {
|
|
37
|
-
const claim = createTestClaim({
|
|
38
|
-
sut: "fast-sut",
|
|
39
|
-
baseline: "slow-sut",
|
|
40
|
-
direction: "less",
|
|
41
|
-
});
|
|
42
|
-
const aggregates = [
|
|
43
|
-
createMockAggregate("fast-sut", "primary", undefined, {
|
|
44
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
45
|
-
}),
|
|
46
|
-
createMockAggregate("slow-sut", "baseline", undefined, {
|
|
47
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
48
|
-
}),
|
|
49
|
-
];
|
|
50
|
-
const result = evaluateClaim(claim, aggregates);
|
|
51
|
-
assert.equal(result.status, "satisfied");
|
|
52
|
-
assert.equal(result.claim.claimId, "C001");
|
|
53
|
-
assert.equal(result.evidence.primaryValue, 85);
|
|
54
|
-
assert.equal(result.evidence.baselineValue, 125);
|
|
55
|
-
assert.equal(result.evidence.delta, -40);
|
|
56
|
-
});
|
|
57
|
-
it("should satisfy claim with threshold when primary <= baseline - threshold", () => {
|
|
58
|
-
const claim = createTestClaim({
|
|
59
|
-
sut: "fast-sut",
|
|
60
|
-
baseline: "slow-sut",
|
|
61
|
-
direction: "less",
|
|
62
|
-
threshold: 10,
|
|
63
|
-
});
|
|
64
|
-
const aggregates = [
|
|
65
|
-
createMockAggregate("fast-sut", "primary", undefined, {
|
|
66
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
67
|
-
}),
|
|
68
|
-
createMockAggregate("slow-sut", "baseline", undefined, {
|
|
69
|
-
"execution-time": createMockSummaryStats([100, 105, 110]),
|
|
70
|
-
}),
|
|
71
|
-
];
|
|
72
|
-
const result = evaluateClaim(claim, aggregates);
|
|
73
|
-
assert.equal(result.status, "satisfied");
|
|
74
|
-
assert.equal(result.evidence.delta, -20);
|
|
75
|
-
assert.equal(result.evidence.delta, -20); // -20 <= -10 threshold
|
|
76
|
-
});
|
|
77
|
-
});
|
|
78
|
-
describe("violated claims - direction 'less'", () => {
|
|
79
|
-
it("should violate claim when primary > baseline", () => {
|
|
80
|
-
const claim = createTestClaim({
|
|
81
|
-
sut: "slow-sut",
|
|
82
|
-
baseline: "fast-sut",
|
|
83
|
-
direction: "less",
|
|
84
|
-
});
|
|
85
|
-
const aggregates = [
|
|
86
|
-
createMockAggregate("slow-sut", "primary", undefined, {
|
|
87
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
88
|
-
}),
|
|
89
|
-
createMockAggregate("fast-sut", "baseline", undefined, {
|
|
90
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
91
|
-
}),
|
|
92
|
-
];
|
|
93
|
-
const result = evaluateClaim(claim, aggregates);
|
|
94
|
-
assert.equal(result.status, "violated");
|
|
95
|
-
assert.equal(result.evidence.delta, 40);
|
|
96
|
-
});
|
|
97
|
-
it("should violate claim with threshold when delta not sufficient", () => {
|
|
98
|
-
const claim = createTestClaim({
|
|
99
|
-
sut: "fast-sut",
|
|
100
|
-
baseline: "slow-sut",
|
|
101
|
-
direction: "less",
|
|
102
|
-
threshold: 50,
|
|
103
|
-
});
|
|
104
|
-
const aggregates = [
|
|
105
|
-
createMockAggregate("fast-sut", "primary", undefined, {
|
|
106
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
107
|
-
}),
|
|
108
|
-
createMockAggregate("slow-sut", "baseline", undefined, {
|
|
109
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
110
|
-
}),
|
|
111
|
-
];
|
|
112
|
-
const result = evaluateClaim(claim, aggregates);
|
|
113
|
-
assert.equal(result.status, "violated");
|
|
114
|
-
assert.equal(result.evidence.delta, -40); // -40 > -50 threshold
|
|
115
|
-
});
|
|
116
|
-
});
|
|
117
|
-
describe("satisfied claims - direction 'greater'", () => {
|
|
118
|
-
it("should satisfy claim when primary > baseline", () => {
|
|
119
|
-
const claim = createTestClaim({
|
|
120
|
-
sut: "high-quality",
|
|
121
|
-
baseline: "low-quality",
|
|
122
|
-
metric: "accuracy",
|
|
123
|
-
direction: "greater",
|
|
124
|
-
});
|
|
125
|
-
const aggregates = [
|
|
126
|
-
createMockAggregate("high-quality", "primary", undefined, {
|
|
127
|
-
accuracy: createMockSummaryStats([0.9, 0.92, 0.95]),
|
|
128
|
-
}),
|
|
129
|
-
createMockAggregate("low-quality", "baseline", undefined, {
|
|
130
|
-
accuracy: createMockSummaryStats([0.7, 0.75, 0.8]),
|
|
131
|
-
}),
|
|
132
|
-
];
|
|
133
|
-
const result = evaluateClaim(claim, aggregates);
|
|
134
|
-
assert.equal(result.status, "satisfied");
|
|
135
|
-
assert.ok(Math.abs(result.evidence.delta - 0.17) < 0.01);
|
|
136
|
-
});
|
|
137
|
-
it("should satisfy claim with threshold", () => {
|
|
138
|
-
const claim = createTestClaim({
|
|
139
|
-
sut: "high-quality",
|
|
140
|
-
baseline: "low-quality",
|
|
141
|
-
metric: "accuracy",
|
|
142
|
-
direction: "greater",
|
|
143
|
-
threshold: 0.1,
|
|
144
|
-
});
|
|
145
|
-
const aggregates = [
|
|
146
|
-
createMockAggregate("high-quality", "primary", undefined, {
|
|
147
|
-
accuracy: createMockSummaryStats([0.9, 0.92, 0.95]),
|
|
148
|
-
}),
|
|
149
|
-
createMockAggregate("low-quality", "baseline", undefined, {
|
|
150
|
-
accuracy: createMockSummaryStats([0.7, 0.75, 0.8]),
|
|
151
|
-
}),
|
|
152
|
-
];
|
|
153
|
-
const result = evaluateClaim(claim, aggregates);
|
|
154
|
-
assert.equal(result.status, "satisfied");
|
|
155
|
-
assert.ok(Math.abs(result.evidence.delta - 0.17) < 0.01); // 0.17 >= 0.1
|
|
156
|
-
});
|
|
157
|
-
});
|
|
158
|
-
describe("violated claims - direction 'greater'", () => {
|
|
159
|
-
it("should violate claim when primary < baseline", () => {
|
|
160
|
-
const claim = createTestClaim({
|
|
161
|
-
sut: "low-quality",
|
|
162
|
-
baseline: "high-quality",
|
|
163
|
-
metric: "accuracy",
|
|
164
|
-
direction: "greater",
|
|
165
|
-
});
|
|
166
|
-
const aggregates = [
|
|
167
|
-
createMockAggregate("low-quality", "primary", undefined, {
|
|
168
|
-
accuracy: createMockSummaryStats([0.7, 0.75, 0.8]),
|
|
169
|
-
}),
|
|
170
|
-
createMockAggregate("high-quality", "baseline", undefined, {
|
|
171
|
-
accuracy: createMockSummaryStats([0.9, 0.92, 0.95]),
|
|
172
|
-
}),
|
|
173
|
-
];
|
|
174
|
-
const result = evaluateClaim(claim, aggregates);
|
|
175
|
-
assert.equal(result.status, "violated");
|
|
176
|
-
assert.ok(Math.abs(result.evidence.delta - -0.17) < 0.01);
|
|
177
|
-
});
|
|
178
|
-
});
|
|
179
|
-
describe("satisfied claims - direction 'equal'", () => {
|
|
180
|
-
it("should satisfy claim when values are approximately equal", () => {
|
|
181
|
-
const claim = createTestClaim({
|
|
182
|
-
sut: "sut-a",
|
|
183
|
-
baseline: "sut-b",
|
|
184
|
-
metric: "output-size",
|
|
185
|
-
direction: "equal",
|
|
186
|
-
});
|
|
187
|
-
const aggregates = [
|
|
188
|
-
createMockAggregate("sut-a", "primary", undefined, {
|
|
189
|
-
"output-size": createMockSummaryStats([100, 100.1, 99.9]),
|
|
190
|
-
}),
|
|
191
|
-
createMockAggregate("sut-b", "baseline", undefined, {
|
|
192
|
-
"output-size": createMockSummaryStats([100, 100.05, 99.95]),
|
|
193
|
-
}),
|
|
194
|
-
];
|
|
195
|
-
const result = evaluateClaim(claim, aggregates);
|
|
196
|
-
assert.equal(result.status, "satisfied");
|
|
197
|
-
assert.ok(Math.abs(result.evidence.delta) <= 0.001);
|
|
198
|
-
});
|
|
199
|
-
it("should satisfy claim with custom threshold", () => {
|
|
200
|
-
const claim = createTestClaim({
|
|
201
|
-
sut: "sut-a",
|
|
202
|
-
baseline: "sut-b",
|
|
203
|
-
metric: "output-size",
|
|
204
|
-
direction: "equal",
|
|
205
|
-
threshold: 5,
|
|
206
|
-
});
|
|
207
|
-
const aggregates = [
|
|
208
|
-
createMockAggregate("sut-a", "primary", undefined, {
|
|
209
|
-
"output-size": createMockSummaryStats([100, 102, 98]),
|
|
210
|
-
}),
|
|
211
|
-
createMockAggregate("sut-b", "baseline", undefined, {
|
|
212
|
-
"output-size": createMockSummaryStats([103, 105, 97]),
|
|
213
|
-
}),
|
|
214
|
-
];
|
|
215
|
-
const result = evaluateClaim(claim, aggregates);
|
|
216
|
-
assert.equal(result.status, "satisfied");
|
|
217
|
-
assert.ok(Math.abs(result.evidence.delta) <= 5);
|
|
218
|
-
});
|
|
219
|
-
});
|
|
220
|
-
describe("violated claims - direction 'equal'", () => {
|
|
221
|
-
it("should violate claim when values differ significantly", () => {
|
|
222
|
-
const claim = createTestClaim({
|
|
223
|
-
sut: "sut-a",
|
|
224
|
-
baseline: "sut-b",
|
|
225
|
-
metric: "output-size",
|
|
226
|
-
direction: "equal",
|
|
227
|
-
});
|
|
228
|
-
const aggregates = [
|
|
229
|
-
createMockAggregate("sut-a", "primary", undefined, {
|
|
230
|
-
"output-size": createMockSummaryStats([100, 110, 90]),
|
|
231
|
-
}),
|
|
232
|
-
createMockAggregate("sut-b", "baseline", undefined, {
|
|
233
|
-
"output-size": createMockSummaryStats([150, 160, 140]),
|
|
234
|
-
}),
|
|
235
|
-
];
|
|
236
|
-
const result = evaluateClaim(claim, aggregates);
|
|
237
|
-
assert.equal(result.status, "violated");
|
|
238
|
-
assert.ok(Math.abs(result.evidence.delta) > 0.001);
|
|
239
|
-
});
|
|
240
|
-
});
|
|
241
|
-
describe("inconclusive - missing primary or baseline", () => {
|
|
242
|
-
it("should be inconclusive when primary SUT not found", () => {
|
|
243
|
-
const claim = createTestClaim({
|
|
244
|
-
sut: "missing-sut",
|
|
245
|
-
baseline: "baseline-sut",
|
|
246
|
-
});
|
|
247
|
-
const aggregates = [
|
|
248
|
-
createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
249
|
-
"execution-time": createMockSummaryStats([100, 110, 120]),
|
|
250
|
-
}),
|
|
251
|
-
];
|
|
252
|
-
const result = evaluateClaim(claim, aggregates);
|
|
253
|
-
assert.equal(result.status, "inconclusive");
|
|
254
|
-
assert.ok(result.inconclusiveReason?.includes("Primary SUT not found"));
|
|
255
|
-
});
|
|
256
|
-
it("should be inconclusive when baseline SUT not found", () => {
|
|
257
|
-
const claim = createTestClaim({
|
|
258
|
-
sut: "primary-sut",
|
|
259
|
-
baseline: "missing-sut",
|
|
260
|
-
});
|
|
261
|
-
const aggregates = [
|
|
262
|
-
createMockAggregate("primary-sut", "primary", undefined, {
|
|
263
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
264
|
-
}),
|
|
265
|
-
];
|
|
266
|
-
const result = evaluateClaim(claim, aggregates);
|
|
267
|
-
assert.equal(result.status, "inconclusive");
|
|
268
|
-
assert.ok(result.inconclusiveReason?.includes("Baseline SUT not found"));
|
|
269
|
-
});
|
|
270
|
-
it("should be inconclusive when both SUTs not found", () => {
|
|
271
|
-
const claim = createTestClaim({
|
|
272
|
-
sut: "missing-primary",
|
|
273
|
-
baseline: "missing-baseline",
|
|
274
|
-
});
|
|
275
|
-
const aggregates = [
|
|
276
|
-
createMockAggregate("other-sut", "baseline", undefined, {
|
|
277
|
-
"execution-time": createMockSummaryStats([100, 110, 120]),
|
|
278
|
-
}),
|
|
279
|
-
];
|
|
280
|
-
const result = evaluateClaim(claim, aggregates);
|
|
281
|
-
assert.equal(result.status, "inconclusive");
|
|
282
|
-
assert.ok(result.inconclusiveReason?.includes("Primary SUT not found"));
|
|
283
|
-
assert.ok(result.inconclusiveReason?.includes("Baseline SUT not found"));
|
|
284
|
-
});
|
|
285
|
-
});
|
|
286
|
-
describe("inconclusive - missing metrics", () => {
|
|
287
|
-
it("should be inconclusive when metric missing from primary", () => {
|
|
288
|
-
const claim = createTestClaim({
|
|
289
|
-
metric: "memory-usage",
|
|
290
|
-
});
|
|
291
|
-
const aggregates = [
|
|
292
|
-
createMockAggregate("primary-sut", "primary", undefined, {
|
|
293
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
294
|
-
}),
|
|
295
|
-
createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
296
|
-
"memory-usage": createMockSummaryStats([100, 110, 120]),
|
|
297
|
-
}),
|
|
298
|
-
];
|
|
299
|
-
const result = evaluateClaim(claim, aggregates);
|
|
300
|
-
assert.equal(result.status, "inconclusive");
|
|
301
|
-
assert.ok(result.inconclusiveReason?.includes("Metric not found"));
|
|
302
|
-
});
|
|
303
|
-
it("should be inconclusive when metric missing from baseline", () => {
|
|
304
|
-
const claim = createTestClaim({
|
|
305
|
-
metric: "memory-usage",
|
|
306
|
-
});
|
|
307
|
-
const aggregates = [
|
|
308
|
-
createMockAggregate("primary-sut", "primary", undefined, {
|
|
309
|
-
"memory-usage": createMockSummaryStats([80, 85, 90]),
|
|
310
|
-
}),
|
|
311
|
-
createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
312
|
-
"execution-time": createMockSummaryStats([100, 110, 120]),
|
|
313
|
-
}),
|
|
314
|
-
];
|
|
315
|
-
const result = evaluateClaim(claim, aggregates);
|
|
316
|
-
assert.equal(result.status, "inconclusive");
|
|
317
|
-
assert.ok(result.inconclusiveReason?.includes("Metric not found"));
|
|
318
|
-
});
|
|
319
|
-
it("should be inconclusive when metric missing from both", () => {
|
|
320
|
-
const claim = createTestClaim({
|
|
321
|
-
metric: "memory-usage",
|
|
322
|
-
});
|
|
323
|
-
const aggregates = [
|
|
324
|
-
createMockAggregate("primary-sut", "primary", undefined, {}),
|
|
325
|
-
createMockAggregate("baseline-sut", "baseline", undefined, {}),
|
|
326
|
-
];
|
|
327
|
-
const result = evaluateClaim(claim, aggregates);
|
|
328
|
-
assert.equal(result.status, "inconclusive");
|
|
329
|
-
assert.ok(result.inconclusiveReason?.includes("Metric not found"));
|
|
330
|
-
});
|
|
331
|
-
});
|
|
332
|
-
describe("evidence computation", () => {
|
|
333
|
-
it("should compute delta correctly", () => {
|
|
334
|
-
const claim = createTestClaim({
|
|
335
|
-
sut: "primary-sut",
|
|
336
|
-
baseline: "baseline-sut",
|
|
337
|
-
metric: "execution-time",
|
|
338
|
-
});
|
|
339
|
-
const aggregates = [
|
|
340
|
-
createMockAggregate("primary-sut", "primary", undefined, {
|
|
341
|
-
"execution-time": createMockSummaryStats([80, 90, 100]),
|
|
342
|
-
}),
|
|
343
|
-
createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
344
|
-
"execution-time": createMockSummaryStats([120, 130, 140]),
|
|
345
|
-
}),
|
|
346
|
-
];
|
|
347
|
-
const result = evaluateClaim(claim, aggregates);
|
|
348
|
-
assert.equal(result.evidence.primaryValue, 90);
|
|
349
|
-
assert.equal(result.evidence.baselineValue, 130);
|
|
350
|
-
assert.equal(result.evidence.delta, -40);
|
|
351
|
-
});
|
|
352
|
-
it("should compute ratio correctly", () => {
|
|
353
|
-
const claim = createTestClaim({
|
|
354
|
-
sut: "primary-sut",
|
|
355
|
-
baseline: "baseline-sut",
|
|
356
|
-
});
|
|
357
|
-
const aggregates = [
|
|
358
|
-
createMockAggregate("primary-sut", "primary", undefined, {
|
|
359
|
-
"execution-time": createMockSummaryStats([100]),
|
|
360
|
-
}),
|
|
361
|
-
createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
362
|
-
"execution-time": createMockSummaryStats([200]),
|
|
363
|
-
}),
|
|
364
|
-
];
|
|
365
|
-
const result = evaluateClaim(claim, aggregates);
|
|
366
|
-
assert.equal(result.evidence.ratio, 0.5);
|
|
367
|
-
});
|
|
368
|
-
it("should handle zero baseline for ratio", () => {
|
|
369
|
-
const claim = createTestClaim({
|
|
370
|
-
sut: "primary-sut",
|
|
371
|
-
baseline: "baseline-sut",
|
|
372
|
-
metric: "errors",
|
|
373
|
-
});
|
|
374
|
-
const aggregates = [
|
|
375
|
-
createMockAggregate("primary-sut", "primary", undefined, {
|
|
376
|
-
errors: createMockSummaryStats([0]),
|
|
377
|
-
}),
|
|
378
|
-
createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
379
|
-
errors: createMockSummaryStats([0]),
|
|
380
|
-
}),
|
|
381
|
-
];
|
|
382
|
-
const result = evaluateClaim(claim, aggregates);
|
|
383
|
-
assert.equal(result.evidence.ratio, Infinity);
|
|
384
|
-
});
|
|
385
|
-
it("should compute n as sum of sample sizes", () => {
|
|
386
|
-
const claim = createTestClaim();
|
|
387
|
-
const aggregates = [
|
|
388
|
-
createMockAggregate("primary-sut", "primary", undefined, {
|
|
389
|
-
"execution-time": createMockSummaryStats([100, 110, 120]),
|
|
390
|
-
}),
|
|
391
|
-
createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
392
|
-
"execution-time": createMockSummaryStats([200, 210]),
|
|
393
|
-
}),
|
|
394
|
-
];
|
|
395
|
-
const result = evaluateClaim(claim, aggregates);
|
|
396
|
-
assert.equal(result.evidence.n, 5); // 3 + 2
|
|
397
|
-
});
|
|
398
|
-
it("should include pValue from comparisons", () => {
|
|
399
|
-
const claim = createTestClaim();
|
|
400
|
-
const primaryAgg = createMockAggregate("primary-sut", "primary", undefined, {
|
|
401
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
402
|
-
});
|
|
403
|
-
const baselineAgg = createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
404
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
405
|
-
});
|
|
406
|
-
// Add comparison data
|
|
407
|
-
primaryAgg.comparisons = {
|
|
408
|
-
"baseline-sut": {
|
|
409
|
-
deltas: { "execution-time": -40 },
|
|
410
|
-
ratios: { "execution-time": 0.68 },
|
|
411
|
-
pValue: 0.01,
|
|
412
|
-
effectSize: 1.5,
|
|
413
|
-
},
|
|
414
|
-
};
|
|
415
|
-
const aggregates = [primaryAgg, baselineAgg];
|
|
416
|
-
const result = evaluateClaim(claim, aggregates);
|
|
417
|
-
assert.equal(result.evidence.pValue, 0.01);
|
|
418
|
-
assert.equal(result.evidence.effectSize, 1.5);
|
|
419
|
-
});
|
|
420
|
-
it("should handle missing pValue and effectSize", () => {
|
|
421
|
-
const claim = createTestClaim();
|
|
422
|
-
const aggregates = [
|
|
423
|
-
createMockAggregate("primary-sut", "primary", undefined, {
|
|
424
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
425
|
-
}),
|
|
426
|
-
createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
427
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
428
|
-
}),
|
|
429
|
-
];
|
|
430
|
-
const result = evaluateClaim(claim, aggregates);
|
|
431
|
-
assert.equal(result.evidence.pValue, undefined);
|
|
432
|
-
assert.equal(result.evidence.effectSize, undefined);
|
|
433
|
-
});
|
|
434
|
-
});
|
|
435
|
-
describe("statistical significance", () => {
|
|
436
|
-
it("should be inconclusive when pValue exceeds significance level", () => {
|
|
437
|
-
const claim = createTestClaim({
|
|
438
|
-
significanceLevel: 0.05,
|
|
439
|
-
});
|
|
440
|
-
const primaryAgg = createMockAggregate("primary-sut", "primary", undefined, {
|
|
441
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
442
|
-
});
|
|
443
|
-
const baselineAgg = createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
444
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
445
|
-
});
|
|
446
|
-
primaryAgg.comparisons = {
|
|
447
|
-
"baseline-sut": {
|
|
448
|
-
deltas: { "execution-time": -40 },
|
|
449
|
-
ratios: { "execution-time": 0.68 },
|
|
450
|
-
pValue: 0.1, // Not significant
|
|
451
|
-
effectSize: 1.5,
|
|
452
|
-
},
|
|
453
|
-
};
|
|
454
|
-
const result = evaluateClaim(claim, [primaryAgg, baselineAgg]);
|
|
455
|
-
assert.equal(result.status, "inconclusive");
|
|
456
|
-
});
|
|
457
|
-
it("should be satisfied when pValue meets significance level", () => {
|
|
458
|
-
const claim = createTestClaim({
|
|
459
|
-
direction: "less",
|
|
460
|
-
significanceLevel: 0.05,
|
|
461
|
-
});
|
|
462
|
-
const primaryAgg = createMockAggregate("primary-sut", "primary", undefined, {
|
|
463
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
464
|
-
});
|
|
465
|
-
const baselineAgg = createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
466
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
467
|
-
});
|
|
468
|
-
primaryAgg.comparisons = {
|
|
469
|
-
"baseline-sut": {
|
|
470
|
-
deltas: { "execution-time": -40 },
|
|
471
|
-
ratios: { "execution-time": 0.68 },
|
|
472
|
-
pValue: 0.01, // Significant
|
|
473
|
-
effectSize: 1.5,
|
|
474
|
-
},
|
|
475
|
-
};
|
|
476
|
-
const result = evaluateClaim(claim, [primaryAgg, baselineAgg]);
|
|
477
|
-
assert.equal(result.status, "satisfied");
|
|
478
|
-
});
|
|
479
|
-
it("should be inconclusive when effectSize below minimum", () => {
|
|
480
|
-
const claim = createTestClaim({
|
|
481
|
-
direction: "less",
|
|
482
|
-
minEffectSize: 0.8,
|
|
483
|
-
});
|
|
484
|
-
const primaryAgg = createMockAggregate("primary-sut", "primary", undefined, {
|
|
485
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
486
|
-
});
|
|
487
|
-
const baselineAgg = createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
488
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
489
|
-
});
|
|
490
|
-
primaryAgg.comparisons = {
|
|
491
|
-
"baseline-sut": {
|
|
492
|
-
deltas: { "execution-time": -40 },
|
|
493
|
-
ratios: { "execution-time": 0.68 },
|
|
494
|
-
pValue: 0.01,
|
|
495
|
-
effectSize: 0.5, // Below minimum
|
|
496
|
-
},
|
|
497
|
-
};
|
|
498
|
-
const result = evaluateClaim(claim, [primaryAgg, baselineAgg]);
|
|
499
|
-
assert.equal(result.status, "inconclusive");
|
|
500
|
-
});
|
|
501
|
-
});
|
|
502
|
-
describe("scope filtering - byClass with caseClass constraints", () => {
|
|
503
|
-
it("should filter aggregates to matching caseClass", () => {
|
|
504
|
-
const claim = createTestClaim({
|
|
505
|
-
sut: "primary-sut",
|
|
506
|
-
baseline: "baseline-sut",
|
|
507
|
-
scope: "caseClass",
|
|
508
|
-
scopeConstraints: {
|
|
509
|
-
caseClass: "scale-free",
|
|
510
|
-
},
|
|
511
|
-
});
|
|
512
|
-
const aggregates = [
|
|
513
|
-
createMockAggregate("primary-sut", "primary", "scale-free", {
|
|
514
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
515
|
-
}),
|
|
516
|
-
createMockAggregate("primary-sut", "primary", "small-world", {
|
|
517
|
-
"execution-time": createMockSummaryStats([200, 210, 220]),
|
|
518
|
-
}),
|
|
519
|
-
createMockAggregate("baseline-sut", "baseline", "scale-free", {
|
|
520
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
521
|
-
}),
|
|
522
|
-
createMockAggregate("baseline-sut", "baseline", "small-world", {
|
|
523
|
-
"execution-time": createMockSummaryStats([250, 260, 270]),
|
|
524
|
-
}),
|
|
525
|
-
];
|
|
526
|
-
const result = evaluateClaim(claim, aggregates);
|
|
527
|
-
assert.equal(result.status, "satisfied");
|
|
528
|
-
assert.equal(result.evidence.primaryValue, 85); // From scale-free aggregate
|
|
529
|
-
assert.equal(result.evidence.baselineValue, 125); // From scale-free aggregate
|
|
530
|
-
});
|
|
531
|
-
it("should be inconclusive when caseClass not found for primary", () => {
|
|
532
|
-
const claim = createTestClaim({
|
|
533
|
-
sut: "primary-sut",
|
|
534
|
-
baseline: "baseline-sut",
|
|
535
|
-
scope: "caseClass",
|
|
536
|
-
scopeConstraints: {
|
|
537
|
-
caseClass: "missing-class",
|
|
538
|
-
},
|
|
539
|
-
});
|
|
540
|
-
const aggregates = [
|
|
541
|
-
createMockAggregate("primary-sut", "primary", "scale-free", {
|
|
542
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
543
|
-
}),
|
|
544
|
-
createMockAggregate("baseline-sut", "baseline", "scale-free", {
|
|
545
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
546
|
-
}),
|
|
547
|
-
];
|
|
548
|
-
const result = evaluateClaim(claim, aggregates);
|
|
549
|
-
assert.equal(result.status, "inconclusive");
|
|
550
|
-
assert.ok(result.inconclusiveReason?.includes("Primary SUT not found"));
|
|
551
|
-
});
|
|
552
|
-
it("should handle multiple caseClass values", () => {
|
|
553
|
-
const claim = createTestClaim({
|
|
554
|
-
sut: "primary-sut",
|
|
555
|
-
baseline: "baseline-sut",
|
|
556
|
-
scope: "caseClass",
|
|
557
|
-
scopeConstraints: {
|
|
558
|
-
caseClass: ["scale-free", "random"],
|
|
559
|
-
},
|
|
560
|
-
});
|
|
561
|
-
const aggregates = [
|
|
562
|
-
createMockAggregate("primary-sut", "primary", "scale-free", {
|
|
563
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
564
|
-
}),
|
|
565
|
-
createMockAggregate("primary-sut", "primary", "random", {
|
|
566
|
-
"execution-time": createMockSummaryStats([70, 75, 80]),
|
|
567
|
-
}),
|
|
568
|
-
createMockAggregate("baseline-sut", "baseline", "scale-free", {
|
|
569
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
570
|
-
}),
|
|
571
|
-
createMockAggregate("baseline-sut", "baseline", "random", {
|
|
572
|
-
"execution-time": createMockSummaryStats([110, 115, 120]),
|
|
573
|
-
}),
|
|
574
|
-
createMockAggregate("primary-sut", "primary", "small-world", {
|
|
575
|
-
"execution-time": createMockSummaryStats([200, 210, 220]),
|
|
576
|
-
}),
|
|
577
|
-
];
|
|
578
|
-
const result = evaluateClaim(claim, aggregates);
|
|
579
|
-
// Should find scale-free aggregates (first match)
|
|
580
|
-
assert.equal(result.status, "satisfied");
|
|
581
|
-
assert.equal(result.evidence.primaryValue, 85);
|
|
582
|
-
assert.equal(result.evidence.baselineValue, 125);
|
|
583
|
-
});
|
|
584
|
-
it("should not filter when no scopeConstraints", () => {
|
|
585
|
-
const claim = createTestClaim({
|
|
586
|
-
sut: "primary-sut",
|
|
587
|
-
baseline: "baseline-sut",
|
|
588
|
-
scope: "global",
|
|
589
|
-
});
|
|
590
|
-
const aggregates = [
|
|
591
|
-
createMockAggregate("primary-sut", "primary", "scale-free", {
|
|
592
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
593
|
-
}),
|
|
594
|
-
createMockAggregate("baseline-sut", "baseline", "scale-free", {
|
|
595
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
596
|
-
}),
|
|
597
|
-
];
|
|
598
|
-
const result = evaluateClaim(claim, aggregates);
|
|
599
|
-
assert.equal(result.status, "satisfied");
|
|
600
|
-
});
|
|
601
|
-
});
|
|
602
|
-
describe("edge cases", () => {
|
|
603
|
-
it("should handle NaN values in evidence", () => {
|
|
604
|
-
const claim = createTestClaim();
|
|
605
|
-
const primaryAgg = createMockAggregate("primary-sut", "primary", undefined, {});
|
|
606
|
-
const baselineAgg = createMockAggregate("baseline-sut", "baseline", undefined, {});
|
|
607
|
-
// Add metric with NaN mean
|
|
608
|
-
primaryAgg.metrics["test-metric"] = createMockSummaryStats([]);
|
|
609
|
-
baselineAgg.metrics["test-metric"] = createMockSummaryStats([]);
|
|
610
|
-
claim.metric = "test-metric";
|
|
611
|
-
const result = evaluateClaim(claim, [primaryAgg, baselineAgg]);
|
|
612
|
-
assert.ok(Number.isNaN(result.evidence.primaryValue));
|
|
613
|
-
assert.ok(Number.isNaN(result.evidence.baselineValue));
|
|
614
|
-
assert.ok(Number.isNaN(result.evidence.delta));
|
|
615
|
-
assert.ok(Number.isNaN(result.evidence.ratio));
|
|
616
|
-
});
|
|
617
|
-
it("should handle identical primary and baseline values", () => {
|
|
618
|
-
const claim = createTestClaim({
|
|
619
|
-
direction: "greater",
|
|
620
|
-
});
|
|
621
|
-
const aggregates = [
|
|
622
|
-
createMockAggregate("primary-sut", "primary", undefined, {
|
|
623
|
-
"execution-time": createMockSummaryStats([100, 100, 100]),
|
|
624
|
-
}),
|
|
625
|
-
createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
626
|
-
"execution-time": createMockSummaryStats([100, 100, 100]),
|
|
627
|
-
}),
|
|
628
|
-
];
|
|
629
|
-
const result = evaluateClaim(claim, aggregates);
|
|
630
|
-
assert.equal(result.status, "violated");
|
|
631
|
-
assert.equal(result.evidence.delta, 0);
|
|
632
|
-
});
|
|
633
|
-
it("should handle boundary case for greater with exact threshold", () => {
|
|
634
|
-
const claim = createTestClaim({
|
|
635
|
-
sut: "primary-sut",
|
|
636
|
-
baseline: "baseline-sut",
|
|
637
|
-
metric: "accuracy",
|
|
638
|
-
direction: "greater",
|
|
639
|
-
threshold: 0.17,
|
|
640
|
-
});
|
|
641
|
-
const aggregates = [
|
|
642
|
-
createMockAggregate("primary-sut", "primary", undefined, {
|
|
643
|
-
accuracy: createMockSummaryStats([0.9, 0.92, 0.95]),
|
|
644
|
-
}),
|
|
645
|
-
createMockAggregate("baseline-sut", "baseline", undefined, {
|
|
646
|
-
accuracy: createMockSummaryStats([0.7, 0.75, 0.8]),
|
|
647
|
-
}),
|
|
648
|
-
];
|
|
649
|
-
const result = evaluateClaim(claim, aggregates);
|
|
650
|
-
assert.equal(result.status, "satisfied");
|
|
651
|
-
assert.ok(result.evidence.delta >= 0.17);
|
|
652
|
-
});
|
|
653
|
-
});
|
|
654
|
-
describe("scope filtering - unknown constraint keys", () => {
|
|
655
|
-
it("should ignore unknown constraint keys", () => {
|
|
656
|
-
const claim = createTestClaim({
|
|
657
|
-
sut: "primary-sut",
|
|
658
|
-
baseline: "baseline-sut",
|
|
659
|
-
scope: "caseClass",
|
|
660
|
-
scopeConstraints: {
|
|
661
|
-
unknownKey: "some-value",
|
|
662
|
-
},
|
|
663
|
-
});
|
|
664
|
-
const aggregates = [
|
|
665
|
-
createMockAggregate("primary-sut", "primary", "scale-free", {
|
|
666
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
667
|
-
}),
|
|
668
|
-
createMockAggregate("baseline-sut", "baseline", "scale-free", {
|
|
669
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
670
|
-
}),
|
|
671
|
-
];
|
|
672
|
-
const result = evaluateClaim(claim, aggregates);
|
|
673
|
-
assert.equal(result.status, "satisfied");
|
|
674
|
-
});
|
|
675
|
-
});
|
|
676
|
-
});
|
|
677
|
-
describe("evaluateClaims", () => {
|
|
678
|
-
it("should evaluate multiple claims", () => {
|
|
679
|
-
const claims = [
|
|
680
|
-
createTestClaim({
|
|
681
|
-
claimId: "C001",
|
|
682
|
-
sut: "fast-sut",
|
|
683
|
-
baseline: "slow-sut",
|
|
684
|
-
direction: "less",
|
|
685
|
-
}),
|
|
686
|
-
createTestClaim({
|
|
687
|
-
claimId: "C002",
|
|
688
|
-
sut: "high-quality",
|
|
689
|
-
baseline: "low-quality",
|
|
690
|
-
metric: "accuracy",
|
|
691
|
-
direction: "greater",
|
|
692
|
-
}),
|
|
693
|
-
];
|
|
694
|
-
const aggregates = [
|
|
695
|
-
createMockAggregate("fast-sut", "primary", undefined, {
|
|
696
|
-
"execution-time": createMockSummaryStats([80, 85, 90]),
|
|
697
|
-
}),
|
|
698
|
-
createMockAggregate("slow-sut", "baseline", undefined, {
|
|
699
|
-
"execution-time": createMockSummaryStats([120, 125, 130]),
|
|
700
|
-
}),
|
|
701
|
-
createMockAggregate("high-quality", "primary", undefined, {
|
|
702
|
-
accuracy: createMockSummaryStats([0.9, 0.92, 0.95]),
|
|
703
|
-
}),
|
|
704
|
-
createMockAggregate("low-quality", "baseline", undefined, {
|
|
705
|
-
accuracy: createMockSummaryStats([0.7, 0.75, 0.8]),
|
|
706
|
-
}),
|
|
707
|
-
];
|
|
708
|
-
const results = evaluateClaims(claims, aggregates);
|
|
709
|
-
assert.equal(results.length, 2);
|
|
710
|
-
assert.equal(results[0].claim.claimId, "C001");
|
|
711
|
-
assert.equal(results[0].status, "satisfied");
|
|
712
|
-
assert.equal(results[1].claim.claimId, "C002");
|
|
713
|
-
assert.equal(results[1].status, "satisfied");
|
|
714
|
-
});
|
|
715
|
-
});
|
|
716
|
-
describe("createClaimSummary", () => {
|
|
717
|
-
it("should create summary with correct counts", () => {
|
|
718
|
-
const evaluations = [
|
|
719
|
-
{
|
|
720
|
-
claim: createTestClaim({ claimId: "C001" }),
|
|
721
|
-
status: "satisfied",
|
|
722
|
-
evidence: {
|
|
723
|
-
primaryValue: 100,
|
|
724
|
-
baselineValue: 120,
|
|
725
|
-
delta: -20,
|
|
726
|
-
ratio: 0.83,
|
|
727
|
-
},
|
|
728
|
-
},
|
|
729
|
-
{
|
|
730
|
-
claim: createTestClaim({ claimId: "C002" }),
|
|
731
|
-
status: "violated",
|
|
732
|
-
evidence: {
|
|
733
|
-
primaryValue: 150,
|
|
734
|
-
baselineValue: 100,
|
|
735
|
-
delta: 50,
|
|
736
|
-
ratio: 1.5,
|
|
737
|
-
},
|
|
738
|
-
},
|
|
739
|
-
{
|
|
740
|
-
claim: createTestClaim({ claimId: "C003" }),
|
|
741
|
-
status: "inconclusive",
|
|
742
|
-
evidence: {
|
|
743
|
-
primaryValue: Number.NaN,
|
|
744
|
-
baselineValue: Number.NaN,
|
|
745
|
-
delta: Number.NaN,
|
|
746
|
-
ratio: Number.NaN,
|
|
747
|
-
},
|
|
748
|
-
inconclusiveReason: "Missing data",
|
|
749
|
-
},
|
|
750
|
-
];
|
|
751
|
-
const summary = createClaimSummary(evaluations);
|
|
752
|
-
assert.equal(summary.version, "1.0.0");
|
|
753
|
-
assert.equal(summary.summary.total, 3);
|
|
754
|
-
assert.equal(summary.summary.satisfied, 1);
|
|
755
|
-
assert.equal(summary.summary.violated, 1);
|
|
756
|
-
assert.equal(summary.summary.inconclusive, 1);
|
|
757
|
-
assert.equal(summary.summary.satisfactionRate, 0.5); // 1 / (1 + 1)
|
|
758
|
-
assert.ok(summary.timestamp);
|
|
759
|
-
});
|
|
760
|
-
it("should handle empty evaluations array", () => {
|
|
761
|
-
const summary = createClaimSummary([]);
|
|
762
|
-
assert.equal(summary.summary.total, 0);
|
|
763
|
-
assert.equal(summary.summary.satisfied, 0);
|
|
764
|
-
assert.equal(summary.summary.violated, 0);
|
|
765
|
-
assert.equal(summary.summary.inconclusive, 0);
|
|
766
|
-
assert.equal(summary.summary.satisfactionRate, 0);
|
|
767
|
-
});
|
|
768
|
-
it("should handle all inconclusive evaluations", () => {
|
|
769
|
-
const evaluations = [
|
|
770
|
-
{
|
|
771
|
-
claim: createTestClaim({ claimId: "C001" }),
|
|
772
|
-
status: "inconclusive",
|
|
773
|
-
evidence: {
|
|
774
|
-
primaryValue: Number.NaN,
|
|
775
|
-
baselineValue: Number.NaN,
|
|
776
|
-
delta: Number.NaN,
|
|
777
|
-
ratio: Number.NaN,
|
|
778
|
-
},
|
|
779
|
-
inconclusiveReason: "Missing data",
|
|
780
|
-
},
|
|
781
|
-
{
|
|
782
|
-
claim: createTestClaim({ claimId: "C002" }),
|
|
783
|
-
status: "inconclusive",
|
|
784
|
-
evidence: {
|
|
785
|
-
primaryValue: Number.NaN,
|
|
786
|
-
baselineValue: Number.NaN,
|
|
787
|
-
delta: Number.NaN,
|
|
788
|
-
ratio: Number.NaN,
|
|
789
|
-
},
|
|
790
|
-
inconclusiveReason: "Missing metric",
|
|
791
|
-
},
|
|
792
|
-
];
|
|
793
|
-
const summary = createClaimSummary(evaluations);
|
|
794
|
-
assert.equal(summary.summary.total, 2);
|
|
795
|
-
assert.equal(summary.summary.satisfied, 0);
|
|
796
|
-
assert.equal(summary.summary.violated, 0);
|
|
797
|
-
assert.equal(summary.summary.inconclusive, 2);
|
|
798
|
-
assert.equal(summary.summary.satisfactionRate, 0); // No definitive results
|
|
799
|
-
});
|
|
800
|
-
});
|
|
801
|
-
//# sourceMappingURL=evaluator.unit.test.js.map
|