@var-ia/eval 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -0
- package/dist/src/calibration.d.ts +6 -0
- package/dist/src/calibration.d.ts.map +1 -0
- package/dist/src/calibration.js +41 -0
- package/dist/src/calibration.js.map +1 -0
- package/dist/src/ground-truth.d.ts +5 -0
- package/dist/src/ground-truth.d.ts.map +1 -0
- package/dist/src/ground-truth.js +64 -0
- package/dist/src/ground-truth.js.map +1 -0
- package/dist/src/index.d.ts +94 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +153 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/l2-benchmark.d.ts +39 -0
- package/dist/src/l2-benchmark.d.ts.map +1 -0
- package/dist/src/l2-benchmark.js +398 -0
- package/dist/src/l2-benchmark.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/package.json +28 -0
- package/src/__tests__/calibration.test.ts +122 -0
- package/src/__tests__/ground-truth.test.ts +125 -0
- package/src/__tests__/harness.test.ts +161 -0
- package/src/__tests__/l2-benchmark.test.ts +87 -0
- package/src/calibration.ts +57 -0
- package/src/ground-truth.ts +70 -0
- package/src/index.ts +264 -0
- package/src/l2-benchmark.ts +454 -0
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import type { EvidenceEvent, EvidenceLayer, ModelInterpretation } from "@var-ia/evidence-graph";
|
|
2
|
+
import type { InterpretedEvent } from "@var-ia/interpreter";
|
|
3
|
+
import { describe, expect, it } from "vitest";
|
|
4
|
+
import type { ExpectedInterpretation } from "../calibration.js";
|
|
5
|
+
import { computeCalibration } from "../calibration.js";
|
|
6
|
+
|
|
7
|
+
/**
 * Test fixture: builds an interpreted event whose only varying parts are the
 * event type and the model's `semanticChange` / `confidence`.
 *
 * @param eventType - Event type string; cast to the event-type union so tests can pass plain strings.
 * @param semanticChange - Value stored in `modelInterpretation.semanticChange`.
 * @param confidence - Value stored in `modelInterpretation.confidence`.
 * @returns An event from revision 1 to 2 in section "body" with a fixed timestamp.
 */
function makeInterpretedEvent(eventType: string, semanticChange: string, confidence: number): InterpretedEvent {
  const typedEventType = eventType as EvidenceEvent["eventType"];
  // Only the two fields under test are populated; the cast covers any other fields.
  const interpretation = { semanticChange, confidence } as ModelInterpretation;
  const observedLayer: EvidenceLayer = "observed";

  const fixture: InterpretedEvent = {
    eventType: typedEventType,
    fromRevisionId: 1,
    toRevisionId: 2,
    section: "body",
    before: "",
    after: "change",
    deterministicFacts: [{ fact: "test" }],
    modelInterpretation: interpretation,
    layer: observedLayer,
    timestamp: "2026-01-01T00:00:00Z",
  };
  return fixture;
}
|
|
22
|
+
|
|
23
|
+
// Behavioural tests for computeCalibration: binning, accuracy, ECE and input edge cases.
describe("computeCalibration", () => {
  it("returns perfect calibration when model is always correct and confident", () => {
    const calibration = computeCalibration(
      [makeInterpretedEvent("revert_detected", "revert", 0.9)],
      [{ semanticChange: "revert" }],
    );

    expect(calibration.totalSamples).toBe(1);
    expect(calibration.overallAccuracy).toBe(1.0);

    // Confidence 0.9 lands in the last bin (index 9), which starts at 0.9.
    const topBin = calibration.bins[9];
    expect(topBin.count).toBe(1);
    expect(topBin.empiricalAccuracy).toBe(1.0);
  });

  it("reports zero accuracy when model is always wrong", () => {
    const predictions = [
      makeInterpretedEvent("revert_detected", "revert", 0.9),
      makeInterpretedEvent("citation_added", "add", 0.8),
    ];
    // One non-matching label per prediction.
    const truth: ExpectedInterpretation[] = predictions.map(() => ({ semanticChange: "something_else" }));

    const calibration = computeCalibration(predictions, truth);

    expect(calibration.overallAccuracy).toBe(0.0);
  });

  it("computes ECE correctly for perfectly calibrated model", () => {
    // ECE = sum over bins of (bin_fraction * abs(midpoint - accuracy)).
    // Every bin receives 100 samples at its midpoint confidence, so bin_fraction = 0.1.
    // The number of correct samples per bin is midpoint * 100, so accuracy ≈ midpoint
    // and each bin's contribution to the ECE is ≈ 0.
    const SAMPLES_PER_BIN = 100;
    const correctPerBin = [5, 15, 25, 35, 45, 55, 65, 75, 85, 95];

    const predictions: InterpretedEvent[] = [];
    const truth: ExpectedInterpretation[] = [];

    correctPerBin.forEach((correctCount, binIndex) => {
      const midpoint = (binIndex + 0.5) / 10;
      for (let sample = 0; sample < SAMPLES_PER_BIN; sample++) {
        // The first `correctCount` samples of a bin agree with the label, the rest do not.
        const predictedChange = sample < correctCount ? `correct-${binIndex}` : `wrong-${binIndex}`;
        predictions.push(makeInterpretedEvent("revert_detected", predictedChange, midpoint));
        truth.push({ semanticChange: `correct-${binIndex}` });
      }
    });

    const calibration = computeCalibration(predictions, truth);

    // Accuracy tracks the midpoint in every bin, so the ECE should be small.
    expect(calibration.ece).toBeLessThan(0.02);
    expect(calibration.totalSamples).toBe(1000);
    expect(calibration.overallAccuracy).toBeGreaterThan(0.4);
  });

  it("handles empty input", () => {
    const calibration = computeCalibration([], []);

    expect(calibration.totalSamples).toBe(0);
    expect(calibration.overallAccuracy).toBe(0);
    expect(calibration.ece).toBe(0);
  });

  it("truncates to shorter input length", () => {
    // Two labels but only one prediction: only the first pair is scored.
    const predictions = [makeInterpretedEvent("revert_detected", "revert", 0.9)];
    const truth: ExpectedInterpretation[] = [{ semanticChange: "revert" }, { semanticChange: "add" }];

    const calibration = computeCalibration(predictions, truth);

    expect(calibration.totalSamples).toBe(1);
  });

  it("accepts 'any' wildcard to always count as correct", () => {
    const predictions = [
      makeInterpretedEvent("revert_detected", "whatever", 0.7),
      makeInterpretedEvent("citation_added", "anything", 0.6),
    ];
    const truth: ExpectedInterpretation[] = [{ semanticChange: "any" }, { semanticChange: "any" }];

    const calibration = computeCalibration(predictions, truth);

    expect(calibration.overallAccuracy).toBe(1.0);
  });

  it("stores modelId in result", () => {
    const calibration = computeCalibration([], [], "test-model-v1");

    expect(calibration.modelId).toBe("test-model-v1");
  });

  it("bins are 0.0-0.1, 0.1-0.2, ..., 0.9-1.0", () => {
    const { bins } = computeCalibration([], [], "");

    expect(bins).toHaveLength(10);

    const lowest = bins[0];
    expect(lowest.lowerBound).toBe(0.0);
    expect(lowest.upperBound).toBe(0.1);

    const highest = bins[9];
    expect(highest.lowerBound).toBe(0.9);
    expect(highest.upperBound).toBe(1.0);
  });
});
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import type { EvidenceEvent } from "@var-ia/evidence-graph";
|
|
2
|
+
import { describe, expect, it } from "vitest";
|
|
3
|
+
import type { OutcomeLabel } from "../index.js";
|
|
4
|
+
import { GROUND_TRUTH_LABELS, getGroundTruthForPage, validateAgainstGroundTruth } from "../index.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Test fixture: builds an observed-layer evidence event with empty before/after text.
 *
 * @param eventType - Event type string; cast to the event-type union so tests can pass plain strings.
 * @param section - Section the event is attributed to (defaults to "body").
 * @returns An event from revision 1 to 2 with no deterministic facts and a fixed timestamp.
 */
function makeEvent(eventType: string, section = "body"): EvidenceEvent {
  const typedEventType = eventType as EvidenceEvent["eventType"];
  const fixture: EvidenceEvent = {
    eventType: typedEventType,
    fromRevisionId: 1,
    toRevisionId: 2,
    section,
    before: "",
    after: "",
    deterministicFacts: [],
    layer: "observed",
    timestamp: "2022-01-01T00:00:00Z",
  };
  return fixture;
}
|
|
19
|
+
|
|
20
|
+
// Tests for validateAgainstGroundTruth using hand-written labels and synthetic event streams.
describe("validateAgainstGroundTruth", () => {
  it("validates a known outcome against a synthetic event stream (should pass)", () => {
    const label: OutcomeLabel = {
      id: "test-1",
      source: "talk_page_consensus",
      pageTitle: "Test",
      description: "Article was reorganized per talk page consensus",
      observedAt: "2022-01-01T00:00:00Z",
      resolution: "keep",
      referenceUrl: "https://en.wikipedia.org/wiki/Talk:Test",
      expectedEventTypes: ["section_reorganized", "sentence_removed"],
      expectedSection: "",
    };
    // The stream contains exactly the event types the label expects.
    const stream = ["section_reorganized", "sentence_removed"].map((eventType) => makeEvent(eventType));

    const report = validateAgainstGroundTruth([label], stream);

    expect(report.passed).toBe(1);
    expect(report.failed).toBe(0);
    const [first] = report.perOutcome;
    expect(first.passed).toBe(true);
    expect(first.signalDetected).toBe(true);
  });

  it("reports misses against an empty event stream", () => {
    const label: OutcomeLabel = {
      id: "test-2",
      source: "rfc_closure",
      pageTitle: "Test",
      description: "RFC closed with consensus to delete",
      observedAt: "2022-01-01T00:00:00Z",
      resolution: "delete",
      referenceUrl: "https://en.wikipedia.org/wiki/Talk:Test",
      expectedEventTypes: ["sentence_removed", "section_reorganized"],
    };

    const report = validateAgainstGroundTruth([label], []);

    const [first] = report.perOutcome;
    expect(first.passed).toBe(false);
    expect(first.signalDetected).toBe(false);
    expect(first.matchedEvents).toHaveLength(0);
  });

  it("computes precision and recall", () => {
    const label: OutcomeLabel = {
      id: "test-3",
      source: "arbcom_decision",
      pageTitle: "Test",
      description: "ArbCom decision resulted in page protection",
      observedAt: "2022-01-01T00:00:00Z",
      resolution: "other",
      referenceUrl: "https://en.wikipedia.org/wiki/Wikipedia:Arbitration",
      expectedEventTypes: ["protection_changed", "revert_detected"],
    };
    // Only one of the two expected event types appears in the stream.
    const stream = [makeEvent("protection_changed")];

    const report = validateAgainstGroundTruth([label], stream);

    const [first] = report.perOutcome;
    expect(first.precision).toBe(0.5);
    expect(first.recall).toBe(1.0);
    expect(first.matchedEvents).toHaveLength(1);
  });

  it("OutcomeLabel type is never populated from pipeline output", () => {
    // The label is written by hand here; nothing in it is derived from analysis results.
    const label: OutcomeLabel = {
      id: "test-4",
      source: "page_protection",
      pageTitle: "Test",
      description: "Hardcoded ground truth, not pipeline output",
      observedAt: "2022-01-01T00:00:00Z",
      resolution: "other",
      referenceUrl: "https://en.wikipedia.org/wiki/Special:Log",
      expectedEventTypes: ["protection_changed"],
    };

    expect(label.source).toBe("page_protection");
    expect(label.resolution).toBe("other");
  });
});
|
|
100
|
+
|
|
101
|
+
// Sanity checks on the curated label set and its page lookup helper.
describe("GROUND_TRUTH_LABELS", () => {
  it("has at least 3 manually curated labels", () => {
    const labelCount = GROUND_TRUTH_LABELS.length;
    expect(labelCount).toBeGreaterThanOrEqual(3);
  });

  it("each label has required fields", () => {
    // Hoisted so the patterns are built once rather than per label.
    const knownSource = /^(talk_page_consensus|rfc_closure|arbcom_decision|page_protection)$/;
    const httpUrl = /^https?:\/\//;

    GROUND_TRUTH_LABELS.forEach((entry) => {
      expect(entry.id).toBeTruthy();
      expect(entry.pageTitle).toBeTruthy();
      expect(entry.source).toMatch(knownSource);
      expect(entry.referenceUrl).toMatch(httpUrl);
      expect(entry.expectedEventTypes.length).toBeGreaterThan(0);
    });
  });

  it("getGroundTruthForPage returns labels for known pages", () => {
    const found = getGroundTruthForPage("Donald Trump");

    expect(found.length).toBeGreaterThanOrEqual(1);
    const [first] = found;
    expect(first.pageTitle).toBe("Donald Trump");
  });

  it("getGroundTruthForPage returns empty for unknown pages", () => {
    const found = getGroundTruthForPage("Nonexistent_Page_XYZ");
    expect(found).toEqual([]);
  });
});
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import type { EvidenceEvent, EvidenceLayer } from "@var-ia/evidence-graph";
|
|
2
|
+
import { describe, expect, it } from "vitest";
|
|
3
|
+
import type { EvalTestCase } from "../index.js";
|
|
4
|
+
import { createEvalHarness } from "../index.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Test fixture: builds an observed-layer evidence event with empty before/after text.
 *
 * @param eventType - Event type string; cast to the event-type union so tests can pass plain strings.
 * @param section - Section the event is attributed to (defaults to "body").
 * @returns An event from revision 1 to 2 with no deterministic facts and a fixed timestamp.
 */
function makeEvent(eventType: string, section = "body"): EvidenceEvent {
  const observedLayer: EvidenceLayer = "observed";
  const typedEventType = eventType as EvidenceEvent["eventType"];

  const fixture: EvidenceEvent = {
    eventType: typedEventType,
    fromRevisionId: 1,
    toRevisionId: 2,
    section,
    before: "",
    after: "",
    deterministicFacts: [],
    layer: observedLayer,
    timestamp: "2026-01-01T00:00:00Z",
  };
  return fixture;
}
|
|
20
|
+
|
|
21
|
+
// Tests for the evaluation harness: single-case evaluation, bundled benchmarks and score aggregation.
describe("createEvalHarness", () => {
  /** Builds a test case for page "Test" (id 1) with revision range 0 to 0. */
  const caseFor = (
    id: string,
    description: string,
    expectedEvents: EvalTestCase["expectedEvents"],
    tolerance: NonNullable<EvalTestCase["tolerance"]>,
  ): EvalTestCase => ({
    id,
    description,
    pageTitle: "Test",
    pageId: 1,
    revisionRange: { from: 0, to: 0 },
    expectedEvents,
    tolerance,
  });

  describe("evaluate", () => {
    it("passes when all expected events are found", () => {
      const harness = createEvalHarness();
      const testCase = caseFor(
        "test-1",
        "All events present",
        [
          { eventType: "revert_detected", section: "body" },
          { eventType: "citation_added", section: "body" },
        ],
        { minPrecision: 0.5 },
      );
      const observed = [makeEvent("revert_detected"), makeEvent("citation_added")];

      const outcome = harness.evaluate(testCase, observed);

      expect(outcome.passed).toBe(true);
      expect(outcome.matches).toHaveLength(2);
      expect(outcome.misses).toHaveLength(0);
      expect(outcome.precision).toBe(1.0);
    });

    it("fails when expected events are missing", () => {
      const harness = createEvalHarness();
      const testCase = caseFor("test-2", "Missing events", [{ eventType: "revert_detected", section: "body" }], {
        minPrecision: 0.5,
      });

      const outcome = harness.evaluate(testCase, []);

      expect(outcome.passed).toBe(false);
      expect(outcome.misses).toHaveLength(1);
    });

    it("reports false positives", () => {
      const harness = createEvalHarness();
      const testCase = caseFor("test-3", "Unexpected events", [{ eventType: "revert_detected", section: "body" }], {
        minPrecision: 0.5,
      });
      // The citation event is not listed in the expectations.
      const observed = [makeEvent("revert_detected"), makeEvent("citation_added")];

      const outcome = harness.evaluate(testCase, observed);

      expect(outcome.falsePositives).toHaveLength(1);
    });

    it("respects event count tolerance", () => {
      const harness = createEvalHarness();
      const testCase = caseFor("test-4", "Event count bounds", [{ eventType: "revert_detected", section: "body" }], {
        minEventCount: 3,
      });

      // One event is supplied while the tolerance asks for at least three.
      const outcome = harness.evaluate(testCase, [makeEvent("revert_detected")]);

      expect(outcome.passed).toBe(false);
    });

    it("matches by event type and section", () => {
      const harness = createEvalHarness();
      const testCase = caseFor("test-5", "Section matching", [{ eventType: "section_reorganized", section: "(lead)" }], {
        minPrecision: 0.5,
      });
      // Same event type as expected, but reported in a different section.
      const eventInOtherSection = makeEvent("section_reorganized", "body");

      const outcome = harness.evaluate(testCase, [eventInOtherSection]);

      expect(outcome.misses).toHaveLength(1);
    });
  });

  describe("benchmarkPages", () => {
    it("returns at least 5 benchmark pages", () => {
      const benchmarks = createEvalHarness().benchmarkPages();

      expect(benchmarks.length).toBeGreaterThanOrEqual(5);
    });

    it("each benchmark has required fields", () => {
      const benchmarks = createEvalHarness().benchmarkPages();

      benchmarks.forEach((benchmark) => {
        expect(benchmark.id).toBeTruthy();
        expect(benchmark.pageTitle).toBeTruthy();
        expect(benchmark.expectedEvents.length).toBeGreaterThan(0);
      });
    });
  });

  describe("computeScores", () => {
    it("computes overall precision", () => {
      const harness = createEvalHarness();
      const testCase: EvalTestCase = {
        id: "t",
        description: "t",
        pageTitle: "T",
        pageId: 1,
        revisionRange: { from: 0, to: 0 },
        expectedEvents: [{ eventType: "revert_detected", section: "body" }],
        tolerance: { minPrecision: 0.5 },
      };

      // One run that finds the expected event and one run that finds nothing.
      const hit = harness.evaluate(testCase, [makeEvent("revert_detected")]);
      const miss = harness.evaluate(testCase, []);
      const summary = harness.computeScores([hit, miss]);

      expect(summary.overallPrecision).toBe(0.5);
      expect(summary.testsPassed).toBe(1);
      expect(summary.testsFailed).toBe(1);
      expect(summary.totalTests).toBe(2);
    });

    it("returns zero for empty results", () => {
      const summary = createEvalHarness().computeScores([]);

      expect(summary.overallPrecision).toBe(0);
      expect(summary.totalTests).toBe(0);
    });
  });
});
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import type { EvidenceEvent } from "@var-ia/evidence-graph";
|
|
2
|
+
import { describe, expect, it, vi } from "vitest";
|
|
3
|
+
|
|
4
|
+
// Replace the interpreter package with a stub exposing only a mocked `createAdapter`;
// each test below installs the adapter behaviour it needs.
vi.mock("@var-ia/interpreter", () => {
  return { createAdapter: vi.fn() };
});
|
|
7
|
+
|
|
8
|
+
import { createAdapter } from "@var-ia/interpreter";
|
|
9
|
+
import { buildL2Dataset, runL2Benchmark } from "../l2-benchmark.js";
|
|
10
|
+
|
|
11
|
+
// Tests for the L2 benchmark dataset and runner, driven through a mocked adapter factory.
describe("L2 benchmark", () => {
  it("buildL2Dataset returns 13 test cases", () => {
    const cases = buildL2Dataset();

    expect(cases.length).toBe(13);
    cases.forEach((testCase) => {
      expect(testCase.id).toBeTruthy();
      expect(testCase.events.length).toBeGreaterThan(0);
      expect(testCase.expected.length).toBeGreaterThan(0);
    });
  });

  it("runL2Benchmark scores perfect accuracy for ideal adapter", async () => {
    const cases = buildL2Dataset();
    // Assumes `interpret` is invoked once per dataset entry, in dataset order.
    let nextCase = 0;
    const idealAdapter = {
      interpret: vi.fn().mockImplementation(async (events) => {
        const current = cases[nextCase];
        nextCase++;
        // Echo each event back with the expected answer for its position.
        return (events as EvidenceEvent[]).map((event: EvidenceEvent, position: number) => {
          const answer = current.expected[position];
          const modelInterpretation = {
            semanticChange: answer?.semanticChange ?? "unknown",
            confidence: 0.95,
            policyDimension: answer?.policyDimension ?? null,
            discussionType: answer?.discussionType ?? null,
          };
          return { ...event, modelInterpretation };
        });
      }),
    };
    vi.mocked(createAdapter).mockReturnValue(idealAdapter);

    const providerConfigs = [{ provider: "openai" as const, model: "gpt-4o", apiKey: "test" }];
    const report = await runL2Benchmark(providerConfigs);

    expect(report.providers).toHaveLength(1);
    expect(report.providers[0].overallAccuracy).toBe(100);
    expect(report.testCases).toBe(13);
  });

  it("runL2Benchmark scores zero for adversarial adapter", async () => {
    // Every event receives the same fixed interpretation, regardless of the case.
    const fixedInterpretation = () => ({
      semanticChange: "wrong answer",
      confidence: 0.1,
      policyDimension: "civility",
      discussionType: "other",
    });
    const adversarialAdapter = {
      interpret: vi.fn().mockImplementation(async (events) => {
        return (events as EvidenceEvent[]).map((event: EvidenceEvent) => {
          return { ...event, modelInterpretation: fixedInterpretation() };
        });
      }),
    };
    vi.mocked(createAdapter).mockReturnValue(adversarialAdapter);

    const providerConfigs = [{ provider: "openai" as const, model: "gpt-4o", apiKey: "test" }];
    const report = await runL2Benchmark(providerConfigs);

    expect(report.providers).toHaveLength(1);
    // Only asserts that the score is not perfect.
    expect(report.providers[0].overallAccuracy).toBeLessThan(100);
  });

  it("handles provider with no API key gracefully", async () => {
    // Adapter construction itself fails for this provider.
    vi.mocked(createAdapter).mockImplementation(() => {
      throw new Error("No API key configured");
    });

    const providerConfigs = [{ provider: "openai", apiKey: "" } as { provider: "openai"; apiKey: string }];
    const report = await runL2Benchmark(providerConfigs);

    expect(report.providers).toHaveLength(1);
    expect(report.providers[0].totalEvents).toBe(0);
  });
});
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import type { CalibrationData, InterpretedEvent } from "@var-ia/interpreter";
|
|
2
|
+
|
|
3
|
+
/** Reference answer for one interpreted event. */
export interface ExpectedInterpretation {
  /**
   * Expected value of the model's `semanticChange`. The literal `"any"` is a
   * wildcard: every prediction counts as correct against it.
   */
  semanticChange: string;
}

/**
 * Builds a 10-bin reliability table and summary calibration metrics for a model.
 *
 * `interpretations[i]` is scored against `expected[i]`; when the arrays differ
 * in length only the first `min(length)` pairs are used. A prediction is
 * correct when its `semanticChange` equals the expected one or the expected
 * value is `"any"`.
 *
 * Each sample is assigned to a bin by `floor(confidence * 10)`. The index is
 * clamped to the valid range, so confidences below 0 fall into the first bin
 * and confidences of 1.0 or above fall into the last one. A `NaN` confidence
 * is counted in the first bin. (Previously a negative or `NaN` confidence
 * indexed outside the bin array and threw a `TypeError`.)
 *
 * The ECE is the sample-weighted sum of `|bin midpoint - empirical accuracy|`
 * over non-empty bins; the bin midpoint stands in for the bin's confidence.
 *
 * @param interpretations - Model outputs carrying `modelInterpretation.confidence` and `.semanticChange`.
 * @param expected - Reference answers, index-aligned with `interpretations`.
 * @param modelId - Identifier copied into the result (defaults to `"unknown"`).
 * @returns Bins, total sample count, overall accuracy and ECE; accuracy and ECE are 0 for empty input.
 */
export function computeCalibration(
  interpretations: InterpretedEvent[],
  expected: ExpectedInterpretation[],
  modelId = "unknown",
): CalibrationData {
  const BIN_COUNT = 10;
  const LAST_BIN = BIN_COUNT - 1;

  // (i + 1) / BIN_COUNT is exactly 1 for the last bin, so no special case is needed.
  const bins = Array.from({ length: BIN_COUNT }, (_, i) => ({
    lowerBound: i / BIN_COUNT,
    upperBound: (i + 1) / BIN_COUNT,
    count: 0,
    correctCount: 0,
    empiricalAccuracy: 0,
  }));

  const totalSamples = Math.min(interpretations.length, expected.length);

  for (let i = 0; i < totalSamples; i++) {
    const interp = interpretations[i].modelInterpretation;
    const expectedInterp = expected[i];

    // Clamp on both sides; NaN would otherwise survive Math.min/Math.max and
    // produce an out-of-range index.
    const rawIndex = Math.floor(interp.confidence * BIN_COUNT);
    const binIndex = Number.isNaN(rawIndex) ? 0 : Math.min(Math.max(rawIndex, 0), LAST_BIN);
    const bin = bins[binIndex];
    bin.count++;

    if (expectedInterp.semanticChange === "any" || interp.semanticChange === expectedInterp.semanticChange) {
      bin.correctCount++;
    }
  }

  let correctTotal = 0;
  for (const bin of bins) {
    bin.empiricalAccuracy = bin.count > 0 ? bin.correctCount / bin.count : 0;
    correctTotal += bin.correctCount;
  }

  const overallAccuracy = totalSamples > 0 ? correctTotal / totalSamples : 0;

  // Empty bins are skipped, which also avoids dividing by zero when there are no samples.
  const ece = bins.reduce((sum, bin) => {
    if (bin.count === 0) return sum;
    const midpoint = (bin.lowerBound + bin.upperBound) / 2;
    return sum + (bin.count / totalSamples) * Math.abs(midpoint - bin.empiricalAccuracy);
  }, 0);

  return {
    modelId,
    bins,
    totalSamples,
    overallAccuracy,
    ece,
  };
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import type { OutcomeLabel } from "./index.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Hand-written outcome labels used as ground truth.
 *
 * Each entry names a page, the kind of process that produced the outcome
 * (`source`), how it was resolved, a reference URL, and the event types
 * (plus optional section) associated with that outcome.
 */
export const GROUND_TRUTH_LABELS: OutcomeLabel[] = [
  // RFC closure: article kept, merge proposal rejected.
  {
    id: "covid-vaccine-mandate-rfc",
    source: "rfc_closure",
    pageTitle: "COVID-19 vaccine mandates in the United States",
    description: "RFC closed with consensus to keep the article, rejecting a merge proposal",
    observedAt: "2022-03-15T00:00:00Z",
    resolution: "keep",
    referenceUrl:
      "https://en.wikipedia.org/wiki/Talk:COVID-19_vaccine_mandates_in_the_United_States/Archive_1#RFC_on_merger",
    expectedEventTypes: ["sentence_first_seen", "revert_detected"],
    expectedSection: "body",
  },

  // Talk-page consensus: sections merged and reorganized.
  // NOTE(review): this is the only reference URL that does not point at en.wikipedia.org — confirm it is intended.
  {
    id: "darth-vader-lightsaber-merge",
    source: "talk_page_consensus",
    pageTitle: "Darth Vader",
    description:
      "Discussion about merging Lightsaber combat sections into main article reached consensus for reorganization",
    observedAt: "2021-11-20T00:00:00Z",
    resolution: "merge",
    referenceUrl: "https://starwars.fandom.com/wiki/Talk:Darth_Vader?oldid=12345",
    expectedEventTypes: ["section_reorganized", "sentence_removed"],
    expectedSection: "(lead)",
  },

  // Page protection following an edit war.
  {
    id: "einstein-nobel-protection",
    source: "page_protection",
    pageTitle: "Albert Einstein",
    description: "Article was semi-protected after edit warring over Nobel Prize description",
    observedAt: "2020-06-10T00:00:00Z",
    resolution: "other",
    referenceUrl: "https://en.wikipedia.org/w/index.php?title=Special:Log&page=Albert+Einstein&type=protect",
    expectedEventTypes: ["protection_changed", "revert_detected"],
    expectedSection: "",
  },

  // RFC closure: no consensus on trimming the lead.
  {
    id: "trump-biographical-rfc",
    source: "rfc_closure",
    pageTitle: "Donald Trump",
    description:
      "RFC on whether to include detailed biographical information in the lead section ended with no consensus to remove",
    observedAt: "2023-08-01T00:00:00Z",
    resolution: "no_consensus",
    referenceUrl: "https://en.wikipedia.org/wiki/Talk:Donald_Trump/Archive_50#RFC_on_lead_biography_length",
    expectedEventTypes: ["lead_promotion", "lead_demotion", "section_reorganized", "revert_detected"],
    expectedSection: "(lead)",
  },

  // Talk-page consensus: outdated section removed.
  {
    id: "crispr-gene-editing-deletion",
    source: "talk_page_consensus",
    pageTitle: "CRISPR gene editing",
    description: "Discussion about deleting outdated safety information section resulted in removal",
    observedAt: "2022-05-10T00:00:00Z",
    resolution: "delete",
    referenceUrl: "https://en.wikipedia.org/wiki/Talk:CRISPR_gene_editing/Archive_2#Safety_section",
    expectedEventTypes: ["section_reorganized", "sentence_removed"],
    expectedSection: "",
  },
];
|
|
63
|
+
|
|
64
|
+
/**
 * Returns every ground-truth label whose page title equals `pageTitle`,
 * compared case-insensitively via `toLowerCase()`.
 *
 * @param pageTitle - Page title to look up.
 * @returns Matching labels in their `GROUND_TRUTH_LABELS` order; an empty array when none match.
 */
export function getGroundTruthForPage(pageTitle: string): OutcomeLabel[] {
  const matches: OutcomeLabel[] = [];
  for (const candidate of GROUND_TRUTH_LABELS) {
    if (candidate.pageTitle.toLowerCase() === pageTitle.toLowerCase()) {
      matches.push(candidate);
    }
  }
  return matches;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Looks up a ground-truth label by its exact (case-sensitive) `id`.
 *
 * @param id - Label identifier to search for.
 * @returns The first label with that id, or `undefined` when there is none.
 */
export function getGroundTruthById(id: string): OutcomeLabel | undefined {
  for (const candidate of GROUND_TRUTH_LABELS) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return undefined;
}
|