@var-ia/eval 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -0
- package/dist/src/calibration.d.ts +6 -0
- package/dist/src/calibration.d.ts.map +1 -0
- package/dist/src/calibration.js +41 -0
- package/dist/src/calibration.js.map +1 -0
- package/dist/src/ground-truth.d.ts +5 -0
- package/dist/src/ground-truth.d.ts.map +1 -0
- package/dist/src/ground-truth.js +64 -0
- package/dist/src/ground-truth.js.map +1 -0
- package/dist/src/index.d.ts +94 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +153 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/l2-benchmark.d.ts +39 -0
- package/dist/src/l2-benchmark.d.ts.map +1 -0
- package/dist/src/l2-benchmark.js +398 -0
- package/dist/src/l2-benchmark.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/package.json +28 -0
- package/src/__tests__/calibration.test.ts +122 -0
- package/src/__tests__/ground-truth.test.ts +125 -0
- package/src/__tests__/harness.test.ts +161 -0
- package/src/__tests__/l2-benchmark.test.ts +87 -0
- package/src/calibration.ts +57 -0
- package/src/ground-truth.ts +70 -0
- package/src/index.ts +264 -0
- package/src/l2-benchmark.ts +454 -0
package/README.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# @var-ia/eval
|
|
2
|
+
|
|
3
|
+
Generic evaluation harness for L2 model quality — benchmarks, calibration, L3 ground truth validation.
|
|
4
|
+
|
|
5
|
+
## Exports
|
|
6
|
+
|
|
7
|
+
### Harness
|
|
8
|
+
|
|
9
|
+
- `createEvalHarness()` — returns an `EvalHarness` with `evaluate()`, `benchmarkPages()`, and `computeScores()`
|
|
10
|
+
- `EvalHarness` — interface for running test cases against evidence events
|
|
11
|
+
- `EvalTestCase` — a single benchmark case (page, revision range, expected events)
|
|
12
|
+
- `EvalResult` — per-test result with precision, matches, misses, false positives
|
|
13
|
+
- `EvalScoreSummary` — aggregate scores across all tests
|
|
14
|
+
|
|
15
|
+
### L2 Benchmark
|
|
16
|
+
|
|
17
|
+
- `runL2Benchmark()` — run L2 interpretation benchmark across a synthetic dataset
|
|
18
|
+
- `buildL2Dataset()` — construct a benchmark dataset of test cases
|
|
19
|
+
- `printBenchmarkResult()` — format benchmark results for display
|
|
20
|
+
|
|
21
|
+
### Calibration
|
|
22
|
+
|
|
23
|
+
- `computeCalibration()` — compute calibration scores for model interpretations against expected labels
|
|
24
|
+
- `ExpectedInterpretation` — expected interpretation for a calibration case
|
|
25
|
+
|
|
26
|
+
### L3 Ground Truth
|
|
27
|
+
|
|
28
|
+
- `validateAgainstGroundTruth()` — validate L1 events against L3 outcome labels
|
|
29
|
+
- `GROUND_TRUTH_LABELS` — built-in ground truth labels
|
|
30
|
+
- `getGroundTruthById()` / `getGroundTruthForPage()` — lookup helpers
|
|
31
|
+
- `OutcomeLabel` — L3 ground truth label type
|
|
32
|
+
- `L3ValidationResult` / `L3ValidationSummary` — validation result types
|
|
33
|
+
|
|
34
|
+
## License
|
|
35
|
+
|
|
36
|
+
AGPL-3.0
|
|
37
|
+
|
|
38
|
+
[Varia](https://github.com/var-ia/var-ia) · [Docs](https://github.com/var-ia/varia-docs)
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { CalibrationData, InterpretedEvent } from "@var-ia/interpreter";
|
|
2
|
+
/**
 * Expected label for one calibration sample.
 *
 * `computeCalibration` pairs `expected[i]` with `interpretations[i]` by index.
 */
export interface ExpectedInterpretation {
    /**
     * Semantic-change label the model is expected to produce.
     * The literal value `"any"` is a wildcard: every model answer counts as correct.
     */
    semanticChange: string;
}
/**
 * Buckets interpretations into ten confidence bins and scores them against `expected`.
 *
 * @param interpretations Model outputs; only `modelInterpretation.confidence` and
 *   `modelInterpretation.semanticChange` are read by the compiled implementation.
 * @param expected Expected labels, aligned with `interpretations` by index.
 * @param modelId Identifier copied into the result (the implementation defaults it to `"unknown"`).
 * @returns Per-bin counts and accuracy, overall accuracy and the calibration error.
 */
export declare function computeCalibration(interpretations: InterpretedEvent[], expected: ExpectedInterpretation[], modelId?: string): CalibrationData;
|
|
6
|
+
//# sourceMappingURL=calibration.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"calibration.d.ts","sourceRoot":"","sources":["../../src/calibration.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAE7E,MAAM,WAAW,sBAAsB;IACrC,cAAc,EAAE,MAAM,CAAC;CACxB;AAED,wBAAgB,kBAAkB,CAChC,eAAe,EAAE,gBAAgB,EAAE,EACnC,QAAQ,EAAE,sBAAsB,EAAE,EAClC,OAAO,SAAY,GAClB,eAAe,CA8CjB"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
 * Builds a ten-bin calibration table for a set of model interpretations.
 *
 * Samples are paired by index; when the two arrays differ in length only the
 * first `min(interpretations.length, expected.length)` pairs are scored.
 * A sample is correct when the expected label is the wildcard `"any"` or
 * equals the model's `semanticChange`.
 *
 * @param interpretations Items exposing `modelInterpretation.confidence` and
 *   `modelInterpretation.semanticChange`.
 * @param expected Expected labels (`{ semanticChange }`), aligned by index.
 * @param modelId Identifier echoed back in the result.
 * @returns `{ modelId, bins, totalSamples, overallAccuracy, ece }`, where `ece`
 *   is the sample-weighted mean of |bin midpoint - bin accuracy| over non-empty bins.
 */
export function computeCalibration(interpretations, expected, modelId = "unknown") {
    const BIN_COUNT = 10;
    const bins = Array.from({ length: BIN_COUNT }, (_, i) => ({
        lowerBound: i / BIN_COUNT,
        upperBound: i === BIN_COUNT - 1 ? 1.0 : (i + 1) / BIN_COUNT,
        count: 0,
        correctCount: 0,
        empiricalAccuracy: 0,
    }));
    const sampleCount = Math.min(interpretations.length, expected.length);
    for (let i = 0; i < sampleCount; i++) {
        const interp = interpretations[i].modelInterpretation;
        const expectedInterp = expected[i];
        // Clamp into [0, BIN_COUNT - 1]: a confidence below 0 (or NaN) would otherwise
        // produce an index with no bin behind it and crash on `bin.count++`.
        const rawIndex = Math.floor(interp.confidence * BIN_COUNT);
        const binIndex = Number.isNaN(rawIndex) ? 0 : Math.min(Math.max(rawIndex, 0), BIN_COUNT - 1);
        const bin = bins[binIndex];
        bin.count++;
        if (expectedInterp.semanticChange === "any" || interp.semanticChange === expectedInterp.semanticChange) {
            bin.correctCount++;
        }
    }
    for (const bin of bins) {
        bin.empiricalAccuracy = bin.count > 0 ? bin.correctCount / bin.count : 0;
    }
    const totalSamples = sampleCount;
    const correctTotal = bins.reduce((s, b) => s + b.correctCount, 0);
    const overallAccuracy = totalSamples > 0 ? correctTotal / totalSamples : 0;
    const ece = bins.reduce((s, b) => {
        // Empty bins contribute nothing (this also avoids 0/0 when there are no samples).
        if (b.count === 0)
            return s;
        // The bin midpoint stands in for the bin's confidence.
        const mid = (b.lowerBound + b.upperBound) / 2;
        return s + (b.count / totalSamples) * Math.abs(mid - b.empiricalAccuracy);
    }, 0);
    return {
        modelId,
        bins,
        totalSamples,
        overallAccuracy,
        ece,
    };
}
|
|
41
|
+
//# sourceMappingURL=calibration.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"calibration.js","sourceRoot":"","sources":["../../src/calibration.ts"],"names":[],"mappings":"AAMA,MAAM,UAAU,kBAAkB,CAChC,eAAmC,EACnC,QAAkC,EAClC,OAAO,GAAG,SAAS;IAEnB,MAAM,SAAS,GAAG,EAAE,CAAC;IACrB,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACxD,UAAU,EAAE,CAAC,GAAG,SAAS;QACzB,UAAU,EAAE,CAAC,KAAK,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,SAAS;QAC3D,KAAK,EAAE,CAAC;QACR,YAAY,EAAE,CAAC;QACf,iBAAiB,EAAE,CAAC;KACrB,CAAC,CAAC,CAAC;IAEJ,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,eAAe,CAAC,MAAM,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;IAEtE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,MAAM,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC,mBAAmB,CAAC;QACtD,MAAM,cAAc,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;QAEnC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,UAAU,GAAG,SAAS,CAAC,EAAE,SAAS,GAAG,CAAC,CAAC,CAAC;QACpF,MAAM,GAAG,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC3B,GAAG,CAAC,KAAK,EAAE,CAAC;QAEZ,IAAI,cAAc,CAAC,cAAc,KAAK,KAAK,IAAI,MAAM,CAAC,cAAc,KAAK,cAAc,CAAC,cAAc,EAAE,CAAC;YACvG,GAAG,CAAC,YAAY,EAAE,CAAC;QACrB,CAAC;IACH,CAAC;IAED,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,GAAG,CAAC,iBAAiB,GAAG,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,YAAY,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAC3E,CAAC;IAED,MAAM,YAAY,GAAG,WAAW,CAAC;IACjC,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC;IAClE,MAAM,eAAe,GAAG,YAAY,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;IAE3E,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC/B,IAAI,CAAC,CAAC,KAAK,KAAK,CAAC;YAAE,OAAO,CAAC,CAAC;QAC5B,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC;QAC9C,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,GAAG,YAAY,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,iBAAiB,CAAC,CAAC;IAC5E,CAAC,EAAE,CAAC,CAAC,CAAC;IAEN,OAAO;QACL,OAAO;QACP,IAAI;QACJ,YAAY;QACZ,eAAe;QACf,GAAG;KACJ,CAAC;AA
CJ,CAAC"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { OutcomeLabel } from "./index.js";
|
|
2
|
+
/** Built-in ground-truth outcome labels shipped with the package. */
export declare const GROUND_TRUTH_LABELS: OutcomeLabel[];
/**
 * Returns every built-in label whose `pageTitle` matches `pageTitle`
 * (the compiled implementation compares case-insensitively).
 */
export declare function getGroundTruthForPage(pageTitle: string): OutcomeLabel[];
/** Returns the built-in label with exactly this `id`, or `undefined` when there is none. */
export declare function getGroundTruthById(id: string): OutcomeLabel | undefined;
|
|
5
|
+
//# sourceMappingURL=ground-truth.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ground-truth.d.ts","sourceRoot":"","sources":["../../src/ground-truth.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAE/C,eAAO,MAAM,mBAAmB,EAAE,YAAY,EA2D7C,CAAC;AAEF,wBAAgB,qBAAqB,CAAC,SAAS,EAAE,MAAM,GAAG,YAAY,EAAE,CAEvE;AAED,wBAAgB,kBAAkB,CAAC,EAAE,EAAE,MAAM,GAAG,YAAY,GAAG,SAAS,CAEvE"}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
 * Built-in ground-truth outcome labels.
 *
 * Each entry names a page, the observed outcome, and the event types expected
 * to accompany it. An empty `expectedSection` is treated by
 * `validateAgainstGroundTruth` as "no section filter".
 */
export const GROUND_TRUTH_LABELS = [
    {
        id: "covid-vaccine-mandate-rfc",
        source: "rfc_closure",
        pageTitle: "COVID-19 vaccine mandates in the United States",
        description: "RFC closed with consensus to keep the article, rejecting a merge proposal",
        observedAt: "2022-03-15T00:00:00Z",
        resolution: "keep",
        referenceUrl: "https://en.wikipedia.org/wiki/Talk:COVID-19_vaccine_mandates_in_the_United_States/Archive_1#RFC_on_merger",
        expectedEventTypes: ["sentence_first_seen", "revert_detected"],
        expectedSection: "body",
    },
    {
        id: "darth-vader-lightsaber-merge",
        source: "talk_page_consensus",
        pageTitle: "Darth Vader",
        description: "Discussion about merging Lightsaber combat sections into main article reached consensus for reorganization",
        observedAt: "2021-11-20T00:00:00Z",
        resolution: "merge",
        referenceUrl: "https://starwars.fandom.com/wiki/Talk:Darth_Vader?oldid=12345",
        expectedEventTypes: ["section_reorganized", "sentence_removed"],
        expectedSection: "(lead)",
    },
    {
        id: "einstein-nobel-protection",
        source: "page_protection",
        pageTitle: "Albert Einstein",
        description: "Article was semi-protected after edit warring over Nobel Prize description",
        observedAt: "2020-06-10T00:00:00Z",
        resolution: "other",
        referenceUrl: "https://en.wikipedia.org/w/index.php?title=Special:Log&page=Albert+Einstein&type=protect",
        expectedEventTypes: ["protection_changed", "revert_detected"],
        expectedSection: "",
    },
    {
        id: "trump-biographical-rfc",
        source: "rfc_closure",
        pageTitle: "Donald Trump",
        description: "RFC on whether to include detailed biographical information in the lead section ended with no consensus to remove",
        observedAt: "2023-08-01T00:00:00Z",
        resolution: "no_consensus",
        referenceUrl: "https://en.wikipedia.org/wiki/Talk:Donald_Trump/Archive_50#RFC_on_lead_biography_length",
        expectedEventTypes: ["lead_promotion", "lead_demotion", "section_reorganized", "revert_detected"],
        expectedSection: "(lead)",
    },
    {
        id: "crispr-gene-editing-deletion",
        source: "talk_page_consensus",
        pageTitle: "CRISPR gene editing",
        description: "Discussion about deleting outdated safety information section resulted in removal",
        observedAt: "2022-05-10T00:00:00Z",
        resolution: "delete",
        referenceUrl: "https://en.wikipedia.org/wiki/Talk:CRISPR_gene_editing/Archive_2#Safety_section",
        expectedEventTypes: ["section_reorganized", "sentence_removed"],
        expectedSection: "",
    },
];
/**
 * Canonical comparison form of a page title: underscores and whitespace runs
 * become single spaces, surrounding whitespace is dropped, case is folded.
 */
function normalizePageTitle(title) {
    return title.replace(/[_\s]+/g, " ").trim().toLowerCase();
}
/**
 * Returns every built-in label recorded for a page.
 *
 * Matching ignores case and treats underscores as spaces, so URL-style titles
 * such as "Donald_Trump" find the label stored as "Donald Trump".
 *
 * @param pageTitle Page title in either display ("Albert Einstein") or URL ("Albert_Einstein") form.
 * @returns Matching labels in declaration order; an empty array when none match.
 */
export function getGroundTruthForPage(pageTitle) {
    // Normalise the query once instead of once per label.
    const wanted = normalizePageTitle(pageTitle);
    return GROUND_TRUTH_LABELS.filter((label) => normalizePageTitle(label.pageTitle) === wanted);
}
/**
 * Looks up a built-in label by its exact `id`.
 *
 * @param id Label identifier (case-sensitive).
 * @returns The label, or `undefined` when no label has that id.
 */
export function getGroundTruthById(id) {
    return GROUND_TRUTH_LABELS.find((label) => label.id === id);
}
|
|
64
|
+
//# sourceMappingURL=ground-truth.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ground-truth.js","sourceRoot":"","sources":["../../src/ground-truth.ts"],"names":[],"mappings":"AAEA,MAAM,CAAC,MAAM,mBAAmB,GAAmB;IACjD;QACE,EAAE,EAAE,2BAA2B;QAC/B,MAAM,EAAE,aAAa;QACrB,SAAS,EAAE,gDAAgD;QAC3D,WAAW,EAAE,2EAA2E;QACxF,UAAU,EAAE,sBAAsB;QAClC,UAAU,EAAE,MAAM;QAClB,YAAY,EACV,2GAA2G;QAC7G,kBAAkB,EAAE,CAAC,qBAAqB,EAAE,iBAAiB,CAAC;QAC9D,eAAe,EAAE,MAAM;KACxB;IACD;QACE,EAAE,EAAE,8BAA8B;QAClC,MAAM,EAAE,qBAAqB;QAC7B,SAAS,EAAE,aAAa;QACxB,WAAW,EACT,4GAA4G;QAC9G,UAAU,EAAE,sBAAsB;QAClC,UAAU,EAAE,OAAO;QACnB,YAAY,EAAE,+DAA+D;QAC7E,kBAAkB,EAAE,CAAC,qBAAqB,EAAE,kBAAkB,CAAC;QAC/D,eAAe,EAAE,QAAQ;KAC1B;IACD;QACE,EAAE,EAAE,2BAA2B;QAC/B,MAAM,EAAE,iBAAiB;QACzB,SAAS,EAAE,iBAAiB;QAC5B,WAAW,EAAE,4EAA4E;QACzF,UAAU,EAAE,sBAAsB;QAClC,UAAU,EAAE,OAAO;QACnB,YAAY,EAAE,0FAA0F;QACxG,kBAAkB,EAAE,CAAC,oBAAoB,EAAE,iBAAiB,CAAC;QAC7D,eAAe,EAAE,EAAE;KACpB;IACD;QACE,EAAE,EAAE,wBAAwB;QAC5B,MAAM,EAAE,aAAa;QACrB,SAAS,EAAE,cAAc;QACzB,WAAW,EACT,mHAAmH;QACrH,UAAU,EAAE,sBAAsB;QAClC,UAAU,EAAE,cAAc;QAC1B,YAAY,EAAE,yFAAyF;QACvG,kBAAkB,EAAE,CAAC,gBAAgB,EAAE,eAAe,EAAE,qBAAqB,EAAE,iBAAiB,CAAC;QACjG,eAAe,EAAE,QAAQ;KAC1B;IACD;QACE,EAAE,EAAE,8BAA8B;QAClC,MAAM,EAAE,qBAAqB;QAC7B,SAAS,EAAE,qBAAqB;QAChC,WAAW,EAAE,mFAAmF;QAChG,UAAU,EAAE,sBAAsB;QAClC,UAAU,EAAE,QAAQ;QACpB,YAAY,EAAE,iFAAiF;QAC/F,kBAAkB,EAAE,CAAC,qBAAqB,EAAE,kBAAkB,CAAC;QAC/D,eAAe,EAAE,EAAE;KACpB;CACF,CAAC;AAEF,MAAM,UAAU,qBAAqB,CAAC,SAAiB;IACrD,OAAO,mBAAmB,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,SAAS,CAAC,WAAW,EAAE,KAAK,SAAS,CAAC,WAAW,EAAE,CAAC,CAAC;AAC1G,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,EAAU;IAC3C,OAAO,mBAAmB,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;AAC9D,CAAC"}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import type { EvidenceEvent } from "@var-ia/evidence-graph";
|
|
2
|
+
/** A ground-truth outcome observed for a page, used to validate detected events. */
export interface OutcomeLabel {
    /** Unique identifier; `getGroundTruthById` matches on it exactly. */
    id: string;
    /** Kind of process that produced the outcome. */
    source: "talk_page_consensus" | "rfc_closure" | "arbcom_decision" | "page_protection";
    /** Title of the page the outcome concerns. */
    pageTitle: string;
    /** Human-readable summary of the outcome. */
    description: string;
    /** Timestamp string for when the outcome was observed (built-in labels use ISO 8601 UTC). */
    observedAt: string;
    /** How the underlying discussion or action was resolved. */
    resolution: "keep" | "merge" | "delete" | "no_consensus" | "redirect" | "other";
    /** Link to the discussion or log entry documenting the outcome. */
    referenceUrl: string;
    /** Event types expected to appear among the detected events. */
    expectedEventTypes: string[];
    /**
     * Section an event must belong to in order to count.
     * When absent or empty, `validateAgainstGroundTruth` applies no section filter.
     */
    expectedSection?: string;
}
/** Result of checking one `OutcomeLabel` against a list of events. */
export interface L3ValidationResult {
    /** `id` of the outcome that was checked. */
    outcomeId: string;
    /** Same value as `signalDetected`. */
    passed: boolean;
    /** Description copied from the outcome. */
    description: string;
    /** True when at least one event matched the outcome. */
    signalDetected: boolean;
    /** Events whose type is expected and whose section passes the section filter. */
    matchedEvents: EvidenceEvent[];
    /** Expected event types copied from the outcome. */
    expectedEventTypes: string[];
    /** Fraction of expected event types that have at least one matched event (0 when nothing matched). */
    precision: number;
    /** 1 when any event matched, otherwise 0. */
    recall: number;
}
/** Aggregate of `L3ValidationResult`s over a set of outcomes. */
export interface L3ValidationSummary {
    /** Number of outcomes checked. */
    totalOutcomes: number;
    /** Number of outcomes whose result passed. */
    passed: number;
    /** `totalOutcomes - passed`. */
    failed: number;
    /** Mean of the per-outcome `precision` values (0 when there are no outcomes). */
    overallPrecision: number;
    /** Mean of the per-outcome `recall` values (0 when there are no outcomes). */
    overallRecall: number;
    /** One result per outcome, in input order. */
    perOutcome: L3ValidationResult[];
}
/**
 * Checks each outcome label against the supplied events.
 *
 * @param outcomes Labels to validate.
 * @param events Detected events; every outcome is compared with the full list.
 * @returns Per-outcome results plus aggregate counts and averages.
 */
export declare function validateAgainstGroundTruth(outcomes: OutcomeLabel[], events: EvidenceEvent[]): L3ValidationSummary;
/** A single benchmark case: a page, a revision range and the events expected from it. */
export interface EvalTestCase {
    /** Identifier echoed back as `EvalResult.testId`. */
    id: string;
    /** Human-readable statement of what the case checks. */
    description: string;
    /** Page title the case targets (not read by `evaluate()`). */
    pageTitle: string;
    /** Numeric page id the case targets (not read by `evaluate()`). */
    pageId: number;
    /** Revision range the case targets (not read by `evaluate()`). */
    revisionRange: { from: number; to: number };
    /** Events that should be present; matched on `eventType` and `section`. */
    expectedEvents: ExpectedEvent[];
    /** Pass/fail thresholds; defaults apply to omitted fields. */
    tolerance?: EvalTolerance;
}
/** One event a test case expects to find. */
export interface ExpectedEvent {
    /** Event type that must match exactly. */
    eventType: string;
    /** Section that must match exactly. */
    section: string;
    /** NOTE(review): not consulted by `evaluate()` in the compiled index.js — confirm whether it is enforced elsewhere. */
    minConfidence?: number;
}
/** Thresholds that decide whether an `EvalResult` passes. */
export interface EvalTolerance {
    /** Minimum number of actual events; `evaluate()` defaults it to 0. */
    minEventCount?: number;
    /** Maximum number of actual events; `evaluate()` defaults it to `Infinity`. */
    maxEventCount?: number;
    /** Minimum `precision` required to pass; `evaluate()` defaults it to 0.5. */
    minPrecision?: number;
}
/** Outcome of evaluating one test case against a list of events. */
export interface EvalResult {
    /** `id` of the evaluated test case. */
    testId: string;
    /** True when precision and the event count satisfy the tolerance. */
    passed: boolean;
    /**
     * Matched expected events divided by total expected events.
     * With no expected events it is 1 when there are no actual events, otherwise 0.
     */
    precision: number;
    /** Number of expected events and number of actual events supplied. */
    eventCount: { expected: number; actual: number };
    /** Expected events paired with the first actual event that satisfied them. */
    matches: EventMatch[];
    /** Expected events for which no actual event was found. */
    misses: MissingEvent[];
    /** Actual events whose `eventType` is not among the expected event types. */
    falsePositives: UnexpectedEvent[];
}
/** An expected event together with the actual event that satisfied it. */
export interface EventMatch {
    expected: ExpectedEvent;
    actual: EvidenceEvent;
}
/** An expected event that was not found. */
export interface MissingEvent {
    expected: ExpectedEvent;
}
/** An actual event of a type the test case did not expect. */
export interface UnexpectedEvent {
    event: EvidenceEvent;
}
/** Operations exposed by the harness returned from `createEvalHarness()`. */
export interface EvalHarness {
    /** Scores `events` against one test case. */
    evaluate(test: EvalTestCase, events: EvidenceEvent[]): EvalResult;
    /** Returns the built-in benchmark test cases. */
    benchmarkPages(): EvalTestCase[];
    /** Aggregates individual results into a summary. */
    computeScores(results: EvalResult[]): EvalScoreSummary;
}
/** Aggregate scores over a list of `EvalResult`s. */
export interface EvalScoreSummary {
    /** Mean of the per-test `precision` values (0 when there are no results). */
    overallPrecision: number;
    /** Number of results that passed. */
    testsPassed: number;
    /** `totalTests - testsPassed`. */
    testsFailed: number;
    /** Number of results aggregated. */
    totalTests: number;
    /** Id, precision and pass flag of each result, in input order. */
    perTest: Array<{
        id: string;
        precision: number;
        passed: boolean;
    }>;
}
/** Creates a harness object implementing `EvalHarness`. */
export declare function createEvalHarness(): EvalHarness;
|
|
93
|
+
export { GROUND_TRUTH_LABELS, getGroundTruthById, getGroundTruthForPage } from "./ground-truth.js";
|
|
94
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,wBAAwB,CAAC;AAI5D,MAAM,WAAW,YAAY;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,qBAAqB,GAAG,aAAa,GAAG,iBAAiB,GAAG,iBAAiB,CAAC;IACtF,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,cAAc,GAAG,UAAU,GAAG,OAAO,CAAC;IAChF,YAAY,EAAE,MAAM,CAAC;IACrB,kBAAkB,EAAE,MAAM,EAAE,CAAC;IAC7B,eAAe,CAAC,EAAE,MAAM,CAAC;CAC1B;AAED,MAAM,WAAW,kBAAkB;IACjC,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,OAAO,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,cAAc,EAAE,OAAO,CAAC;IACxB,aAAa,EAAE,aAAa,EAAE,CAAC;IAC/B,kBAAkB,EAAE,MAAM,EAAE,CAAC;IAC7B,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,mBAAmB;IAClC,aAAa,EAAE,MAAM,CAAC;IACtB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,gBAAgB,EAAE,MAAM,CAAC;IACzB,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,kBAAkB,EAAE,CAAC;CAClC;AAED,wBAAgB,0BAA0B,CAAC,QAAQ,EAAE,YAAY,EAAE,EAAE,MAAM,EAAE,aAAa,EAAE,GAAG,mBAAmB,CAsCjH;AAED,MAAM,WAAW,YAAY;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,aAAa,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,EAAE,EAAE,MAAM,CAAA;KAAE,CAAC;IAC5C,cAAc,EAAE,aAAa,EAAE,CAAC;IAChC,SAAS,CAAC,EAAE,aAAa,CAAC;CAC3B;AAED,MAAM,WAAW,aAAa;IAC5B,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,aAAa,CAAC,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,WAAW,aAAa;IAC5B,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,UAAU;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,OAAO,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;IACjD,OAAO,EAAE,UAAU,EAAE,CAAC;IACtB,MAAM,EAAE,YAAY,EAAE,CAAC;IACvB,cAAc,EAAE,eAAe,EAAE,CAAC;CACnC;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,EAAE,aAAa,CAAC;IACxB,MAAM,EAAE,aAAa,CAAC;CACvB;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,EAAE,aAAa,CAAC;CACzB;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,aAAa,CAAC;CACtB;AAED,MAAM,WAAW,
WAAW;IAC1B,QAAQ,CAAC,IAAI,EAAE,YAAY,EAAE,MAAM,EAAE,aAAa,EAAE,GAAG,UAAU,CAAC;IAClE,cAAc,IAAI,YAAY,EAAE,CAAC;IACjC,aAAa,CAAC,OAAO,EAAE,UAAU,EAAE,GAAG,gBAAgB,CAAC;CACxD;AAED,MAAM,WAAW,gBAAgB;IAC/B,gBAAgB,EAAE,MAAM,CAAC;IACzB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,KAAK,CAAC;QAAE,EAAE,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,OAAO,CAAA;KAAE,CAAC,CAAC;CACpE;AAED,wBAAgB,iBAAiB,IAAI,WAAW,CA8H/C;AAED,OAAO,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC"}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/**
 * Checks every ground-truth outcome against the full list of detected events.
 *
 * An event matches an outcome when its `eventType` is one of the outcome's
 * `expectedEventTypes` and, if the outcome has a non-empty `expectedSection`,
 * its `section` equals it. An outcome passes as soon as one event matches.
 *
 * NOTE(review): `precision` here is the share of expected event types that
 * were seen, and `recall` is binary (1 if anything matched). These differ from
 * the conventional precision/recall definitions — confirm this is intended.
 *
 * @param outcomes Outcome labels to validate.
 * @param events Detected events (objects exposing `eventType` and `section`).
 * @returns Counts, mean precision/recall and the per-outcome results in input order.
 */
export function validateAgainstGroundTruth(outcomes, events) {
    const results = outcomes.map((outcome) => {
        const expected = outcome.expectedEventTypes;
        // Set lookups keep matching linear in the number of events.
        const expectedTypes = new Set(expected);
        const matched = events.filter((e) => expectedTypes.has(e.eventType) &&
            (!outcome.expectedSection || e.section === outcome.expectedSection));
        const matchedTypes = new Set(matched.map((m) => m.eventType));
        const signalDetected = matched.length > 0;
        // Guarding on signalDetected also avoids 0/0 for an empty expected list.
        const precision = signalDetected
            ? expected.filter((et) => matchedTypes.has(et)).length / expected.length
            : 0;
        const recall = signalDetected ? 1.0 : 0.0;
        return {
            outcomeId: outcome.id,
            passed: signalDetected,
            description: outcome.description,
            signalDetected,
            matchedEvents: matched,
            expectedEventTypes: expected,
            precision,
            recall,
        };
    });
    const passed = results.filter((r) => r.passed);
    const avgPrecision = results.length > 0 ? results.reduce((s, r) => s + r.precision, 0) / results.length : 0;
    const avgRecall = results.length > 0 ? results.reduce((s, r) => s + r.recall, 0) / results.length : 0;
    return {
        totalOutcomes: outcomes.length,
        passed: passed.length,
        failed: outcomes.length - passed.length,
        overallPrecision: avgPrecision,
        overallRecall: avgRecall,
        perOutcome: results,
    };
}
|
|
33
|
+
/**
 * Creates the evaluation harness.
 *
 * @returns An object with `evaluate`, `benchmarkPages` and `computeScores`.
 */
export function createEvalHarness() {
    return {
        /**
         * Scores a list of events against one test case.
         *
         * An expected event is matched by the first event with the same
         * `eventType` and `section`. `precision` is matched / expected; with no
         * expected events it is 1 when `events` is empty and 0 otherwise.
         * The case passes when precision >= `minPrecision` (default 0.5) and the
         * event count lies within [`minEventCount` (default 0), `maxEventCount`
         * (default Infinity)].
         *
         * NOTE(review): false positives are decided on `eventType` alone, so an
         * event of an expected type in a different section is reported as
         * neither a match nor a false positive — confirm this is intended.
         */
        evaluate(test, events) {
            const matches = [];
            const misses = [];
            for (const expected of test.expectedEvents) {
                const found = events.find((e) => e.eventType === expected.eventType && e.section === expected.section);
                if (found) {
                    matches.push({ expected, actual: found });
                }
                else {
                    misses.push({ expected });
                }
            }
            // One Set lookup per event instead of rescanning expectedEvents each time.
            const expectedTypes = new Set(test.expectedEvents.map((e) => e.eventType));
            const falsePositives = events
                .filter((event) => !expectedTypes.has(event.eventType))
                .map((event) => ({ event }));
            const matchedCount = matches.length;
            const totalExpected = test.expectedEvents.length;
            const precision = totalExpected > 0 ? matchedCount / totalExpected : events.length === 0 ? 1.0 : 0.0;
            const tolerance = test.tolerance ?? {};
            const minEventCount = tolerance.minEventCount ?? 0;
            const maxEventCount = tolerance.maxEventCount ?? Infinity;
            const minPrecision = tolerance.minPrecision ?? 0.5;
            const passed = precision >= minPrecision && events.length >= minEventCount && events.length <= maxEventCount;
            return {
                testId: test.id,
                passed,
                precision,
                eventCount: { expected: totalExpected, actual: events.length },
                matches,
                misses,
                falsePositives,
            };
        },
        /** Returns the built-in benchmark cases; a fresh array is built on every call. */
        benchmarkPages() {
            return [
                {
                    id: "page-has-revisions",
                    description: "Any active Wikipedia page returns at least 2 revisions and generates section events",
                    pageTitle: "Earth",
                    pageId: 9228,
                    revisionRange: { from: 0, to: 0 },
                    expectedEvents: [{ eventType: "section_reorganized", section: "(lead)" }],
                    tolerance: { minEventCount: 1, minPrecision: 0.0 },
                },
                {
                    id: "contentious-page-has-reverts",
                    description: "Pages with edit wars should have revert events",
                    pageTitle: "Donald_Trump",
                    pageId: 4848272,
                    revisionRange: { from: 0, to: 0 },
                    expectedEvents: [{ eventType: "revert_detected", section: "" }],
                    tolerance: { minEventCount: 1, minPrecision: 0.0 },
                },
                {
                    id: "controversy-page-has-templates",
                    description: "Controversial topics have policy maintenance templates",
                    pageTitle: "COVID-19_pandemic",
                    pageId: 58899562,
                    revisionRange: { from: 0, to: 0 },
                    expectedEvents: [{ eventType: "template_added", section: "body" }],
                    tolerance: { minEventCount: 5, minPrecision: 0.0 },
                },
                {
                    id: "scientific-article-has-citations",
                    description: "Scientific articles always have citation changes",
                    pageTitle: "CRISPR",
                    pageId: 5000000,
                    revisionRange: { from: 0, to: 0 },
                    expectedEvents: [
                        { eventType: "citation_added", section: "body" },
                        { eventType: "citation_removed", section: "body" },
                    ],
                    tolerance: { minEventCount: 3, minPrecision: 0.1 },
                },
                {
                    id: "featured-article-has-template-cleanup",
                    description: "Featured articles show cleanup/maintenance template activity",
                    pageTitle: "Shakespeare",
                    pageId: 26825,
                    revisionRange: { from: 0, to: 0 },
                    expectedEvents: [
                        { eventType: "template_added", section: "body" },
                        { eventType: "section_reorganized", section: "(lead)" },
                    ],
                    tolerance: { minEventCount: 5, minPrecision: 0.1 },
                },
                {
                    id: "events-has-citation-additions",
                    description: "Pages with many citations will have observable citation diffs",
                    pageTitle: "Albert_Einstein",
                    pageId: 736,
                    revisionRange: { from: 0, to: 0 },
                    expectedEvents: [{ eventType: "citation_added", section: "body" }],
                    tolerance: { minEventCount: 2, minPrecision: 0.0 },
                },
            ];
        },
        /**
         * Aggregates results: pass/fail counts, mean precision (0 for an empty
         * list) and a per-test digest in input order.
         */
        computeScores(results) {
            const passed = results.filter((r) => r.passed);
            const totalPrecision = results.length > 0 ? results.reduce((sum, r) => sum + r.precision, 0) / results.length : 0;
            return {
                overallPrecision: totalPrecision,
                testsPassed: passed.length,
                testsFailed: results.length - passed.length,
                totalTests: results.length,
                perTest: results.map((r) => ({
                    id: r.testId,
                    precision: r.precision,
                    passed: r.passed,
                })),
            };
        },
    };
}
|
|
152
|
+
export { GROUND_TRUTH_LABELS, getGroundTruthById, getGroundTruthForPage } from "./ground-truth.js";
|
|
153
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAoCA,MAAM,UAAU,0BAA0B,CAAC,QAAwB,EAAE,MAAuB;IAC1F,MAAM,OAAO,GAAyB,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE;QAC7D,MAAM,QAAQ,GAAG,OAAO,CAAC,kBAAkB,CAAC;QAC5C,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAC3B,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,eAAe,IAAI,CAAC,CAAC,OAAO,KAAK,OAAO,CAAC,eAAe,CAAC,CAC7G,CAAC;QAEF,MAAM,cAAc,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;QAC1C,MAAM,SAAS,GACb,OAAO,CAAC,MAAM,GAAG,CAAC;YAChB,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,QAAQ,CAAC,MAAM;YAC3F,CAAC,CAAC,CAAC,CAAC;QACR,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;QAE9C,OAAO;YACL,SAAS,EAAE,OAAO,CAAC,EAAE;YACrB,MAAM,EAAE,cAAc;YACtB,WAAW,EAAE,OAAO,CAAC,WAAW;YAChC,cAAc;YACd,aAAa,EAAE,OAAO;YACtB,kBAAkB,EAAE,QAAQ;YAC5B,SAAS;YACT,MAAM;SACP,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAC/C,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC5G,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAEtG,OAAO;QACL,aAAa,EAAE,QAAQ,CAAC,MAAM;QAC9B,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,MAAM,EAAE,QAAQ,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM;QACvC,gBAAgB,EAAE,YAAY;QAC9B,aAAa,EAAE,SAAS;QACxB,UAAU,EAAE,OAAO;KACpB,CAAC;AACJ,CAAC;AA6DD,MAAM,UAAU,iBAAiB;IAC/B,OAAO;QACL,QAAQ,CAAC,IAAI,EAAE,MAAM;YACnB,MAAM,OAAO,GAAiB,EAAE,CAAC;YACjC,MAAM,MAAM,GAAmB,EAAE,CAAC;YAClC,MAAM,cAAc,GAAsB,EAAE,CAAC;YAE7C,KAAK,MAAM,QAAQ,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;gBAC3C,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE
,CAAC,CAAC,CAAC,SAAS,KAAK,QAAQ,CAAC,SAAS,IAAI,CAAC,CAAC,OAAO,KAAK,QAAQ,CAAC,OAAO,CAAC,CAAC;gBACvG,IAAI,KAAK,EAAE,CAAC;oBACV,OAAO,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,CAAC;gBAC5C,CAAC;qBAAM,CAAC;oBACN,MAAM,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC;gBAC5B,CAAC;YACH,CAAC;YAED,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,KAAK,CAAC,SAAS,CAAC,EAAE,CAAC;oBACtE,cAAc,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;gBACjC,CAAC;YACH,CAAC;YAED,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC;YACpC,MAAM,aAAa,GAAG,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC;YACjD,MAAM,SAAS,GAAG,aAAa,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,GAAG,aAAa,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;YAErG,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;YACvC,MAAM,aAAa,GAAG,SAAS,CAAC,aAAa,IAAI,CAAC,CAAC;YACnD,MAAM,aAAa,GAAG,SAAS,CAAC,aAAa,IAAI,QAAQ,CAAC;YAC1D,MAAM,YAAY,GAAG,SAAS,CAAC,YAAY,IAAI,GAAG,CAAC;YAEnD,MAAM,MAAM,GAAG,SAAS,IAAI,YAAY,IAAI,MAAM,CAAC,MAAM,IAAI,aAAa,IAAI,MAAM,CAAC,MAAM,IAAI,aAAa,CAAC;YAE7G,OAAO;gBACL,MAAM,EAAE,IAAI,CAAC,EAAE;gBACf,MAAM;gBACN,SAAS;gBACT,UAAU,EAAE,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE;gBAC9D,OAAO;gBACP,MAAM;gBACN,cAAc;aACf,CAAC;QACJ,CAAC;QAED,cAAc;YACZ,OAAO;gBACL;oBACE,EAAE,EAAE,oBAAoB;oBACxB,WAAW,EAAE,qFAAqF;oBAClG,SAAS,EAAE,OAAO;oBAClB,MAAM,EAAE,IAAI;oBACZ,aAAa,EAAE,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE;oBACjC,cAAc,EAAE,CAAC,EAAE,SAAS,EAAE,qBAAqB,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC;oBACzE,SAAS,EAAE,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,GAAG,EAAE;iBACnD;gBACD;oBACE,EAAE,EAAE,8BAA8B;oBAClC,WAAW,EAAE,gDAAgD;oBAC7D,SAAS,EAAE,cAAc;oBACzB,MAAM,EAAE,OAAO;oBACf,aAAa,EAAE,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE;oBACjC,cAAc,EAAE,CAAC,EAAE,SAAS,EAAE,iBAAiB,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC;oBAC/D,SAAS,EAAE,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,GAAG,EAAE;iBACnD;gBACD;oBACE,EAAE,EAAE,gCAAgC;oBACpC,WAAW,EAAE,wDAAwD;oBACrE,SAAS,EAAE,mBAAmB;oBAC9B,MAAM,EAAE,QAAQ;oBAChB,aAAa,EAAE,EAAE,
IAAI,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE;oBACjC,cAAc,EAAE,CAAC,EAAE,SAAS,EAAE,gBAAgB,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;oBAClE,SAAS,EAAE,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,GAAG,EAAE;iBACnD;gBACD;oBACE,EAAE,EAAE,kCAAkC;oBACtC,WAAW,EAAE,kDAAkD;oBAC/D,SAAS,EAAE,QAAQ;oBACnB,MAAM,EAAE,OAAO;oBACf,aAAa,EAAE,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE;oBACjC,cAAc,EAAE;wBACd,EAAE,SAAS,EAAE,gBAAgB,EAAE,OAAO,EAAE,MAAM,EAAE;wBAChD,EAAE,SAAS,EAAE,kBAAkB,EAAE,OAAO,EAAE,MAAM,EAAE;qBACnD;oBACD,SAAS,EAAE,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,GAAG,EAAE;iBACnD;gBACD;oBACE,EAAE,EAAE,uCAAuC;oBAC3C,WAAW,EAAE,8DAA8D;oBAC3E,SAAS,EAAE,aAAa;oBACxB,MAAM,EAAE,KAAK;oBACb,aAAa,EAAE,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE;oBACjC,cAAc,EAAE;wBACd,EAAE,SAAS,EAAE,gBAAgB,EAAE,OAAO,EAAE,MAAM,EAAE;wBAChD,EAAE,SAAS,EAAE,qBAAqB,EAAE,OAAO,EAAE,QAAQ,EAAE;qBACxD;oBACD,SAAS,EAAE,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,GAAG,EAAE;iBACnD;gBACD;oBACE,EAAE,EAAE,+BAA+B;oBACnC,WAAW,EAAE,+DAA+D;oBAC5E,SAAS,EAAE,iBAAiB;oBAC5B,MAAM,EAAE,GAAG;oBACX,aAAa,EAAE,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE;oBACjC,cAAc,EAAE,CAAC,EAAE,SAAS,EAAE,gBAAgB,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;oBAClE,SAAS,EAAE,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,GAAG,EAAE;iBACnD;aACF,CAAC;QACJ,CAAC;QAED,aAAa,CAAC,OAAO;YACnB,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;YAC/C,MAAM,cAAc,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;YAElH,OAAO;gBACL,gBAAgB,EAAE,cAAc;gBAChC,WAAW,EAAE,MAAM,CAAC,MAAM;gBAC1B,WAAW,EAAE,OAAO,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM;gBAC3C,UAAU,EAAE,OAAO,CAAC,MAAM;gBAC1B,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;oBAC3B,EAAE,EAAE,CAAC,CAAC,MAAM;oBACZ,SAAS,EAAE,CAAC,CAAC,SAAS;oBACtB,MAAM,EAAE,CAAC,CAAC,MAAM;iBACjB,CAAC,CAAC;aACJ,CAAC;QACJ,CAAC;KACF,CAAC;AACJ,CAAC;AAED,OAAO,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC"}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import type { EvidenceEvent } from "@var-ia/evidence-graph";
|
|
2
|
+
import type { ModelConfig } from "@var-ia/interpreter";
|
|
3
|
+
/** One L2 benchmark case: a set of events plus the interpretations expected for them. */
export interface L2TestCase {
    /** Identifier of the test case. */
    id: string;
    /** Human-readable description of the case. */
    description: string;
    /** Events handed to the model under test. */
    events: EvidenceEvent[];
    /** Expected interpretations for (some of) the events. */
    expected: L2ExpectedInterpretation[];
}
/** Expected interpretation for a single event; every field except `eventIndex` is optional. */
export interface L2ExpectedInterpretation {
    /** Presumably an index into `L2TestCase.events` — TODO confirm against l2-benchmark.ts. */
    eventIndex: number;
    semanticChange?: string;
    confidence?: number;
    policyDimension?: string;
    discussionType?: string;
}
/** Score for one named metric. */
export interface L2MetricScore {
    /** Name of the metric. */
    metric: string;
    /** Number of correct answers counted for the metric. */
    correct: number;
    /** Number of answers considered for the metric. */
    total: number;
    /** Presumably `correct / total` — TODO confirm against l2-benchmark.ts. */
    accuracy: number;
}
/** Benchmark outcome for one provider/model pair. */
export interface L2ProviderResult {
    provider: string;
    model: string;
    /** Per-metric scores. */
    metrics: L2MetricScore[];
    avgConfidence: number;
    overallAccuracy: number;
    totalEvents: number;
}
/** Complete output of an L2 benchmark run. */
export interface L2BenchmarkResult {
    /** Timestamp string for when the result was generated. */
    generatedAt: string;
    /** Number of test cases in the run. */
    testCases: number;
    /** Number of events in the run. */
    totalEvents: number;
    /** One entry per benchmarked provider. */
    providers: L2ProviderResult[];
}
/** Runs the L2 interpretation benchmark for the given model configurations. */
export declare function runL2Benchmark(providers: ModelConfig[]): Promise<L2BenchmarkResult>;
/** Formats a benchmark result for display (per the package README); returns nothing. */
export declare function printBenchmarkResult(result: L2BenchmarkResult): void;
/** Constructs the benchmark dataset of test cases (per the package README). */
export declare function buildL2Dataset(): L2TestCase[];
|
|
39
|
+
//# sourceMappingURL=l2-benchmark.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"l2-benchmark.d.ts","sourceRoot":"","sources":["../../src/l2-benchmark.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,wBAAwB,CAAC;AAC5D,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAGvD,MAAM,WAAW,UAAU;IACzB,EAAE,EAAE,MAAM,CAAC;IACX,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,QAAQ,EAAE,wBAAwB,EAAE,CAAC;CACtC;AAED,MAAM,WAAW,wBAAwB;IACvC,UAAU,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,aAAa;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,aAAa,EAAE,CAAC;IACzB,aAAa,EAAE,MAAM,CAAC;IACtB,eAAe,EAAE,MAAM,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,iBAAiB;IAChC,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,gBAAgB,EAAE,CAAC;CAC/B;AAED,wBAAsB,cAAc,CAAC,SAAS,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,iBAAiB,CAAC,CA4BzF;AAmFD,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,iBAAiB,GAAG,IAAI,CAmBpE;AAiBD,wBAAgB,cAAc,IAAI,UAAU,EAAE,CAwQ7C"}
|