npm - @var-ia/eval - Versions diffs - 0.1.1 - Mend

@var-ia/eval 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/README.md +38 -0
package/dist/src/calibration.d.ts +6 -0
package/dist/src/calibration.d.ts.map +1 -0
package/dist/src/calibration.js +41 -0
package/dist/src/calibration.js.map +1 -0
package/dist/src/ground-truth.d.ts +5 -0
package/dist/src/ground-truth.d.ts.map +1 -0
package/dist/src/ground-truth.js +64 -0
package/dist/src/ground-truth.js.map +1 -0
package/dist/src/index.d.ts +94 -0
package/dist/src/index.d.ts.map +1 -0
package/dist/src/index.js +153 -0
package/dist/src/index.js.map +1 -0
package/dist/src/l2-benchmark.d.ts +39 -0
package/dist/src/l2-benchmark.d.ts.map +1 -0
package/dist/src/l2-benchmark.js +398 -0
package/dist/src/l2-benchmark.js.map +1 -0
package/dist/tsconfig.tsbuildinfo +1 -0
package/package.json +28 -0
package/src/__tests__/calibration.test.ts +122 -0
package/src/__tests__/ground-truth.test.ts +125 -0
package/src/__tests__/harness.test.ts +161 -0
package/src/__tests__/l2-benchmark.test.ts +87 -0
package/src/calibration.ts +57 -0
package/src/ground-truth.ts +70 -0
package/src/index.ts +264 -0
package/src/l2-benchmark.ts +454 -0

package/src/index.ts ADDED Viewed

@@ -0,0 +1,264 @@
+import type { EvidenceEvent } from "@var-ia/evidence-graph";
+// ── L3 Ground Truth Types ──────────────────────────────────────────
+export interface OutcomeLabel { // One ground-truth outcome that validateAgainstGroundTruth checks detected events against.
+  id: string; // Label identifier; copied into L3ValidationResult.outcomeId.
+  source: "talk_page_consensus" | "rfc_closure" | "arbcom_decision" | "page_protection"; // Origin of the label; presumably the process that produced the outcome.
+  pageTitle: string; // Page the outcome concerns; not read by the validator in this file.
+  description: string; // Human-readable summary; copied into the validation result.
+  observedAt: string; // Presumably a timestamp string; its format is not established in this file.
+  resolution: "keep" | "merge" | "delete" | "no_consensus" | "redirect" | "other"; // Recorded resolution; not read by the validator in this file.
+  referenceUrl: string; // Reference link for the label; not read by the validator in this file.
+  expectedEventTypes: string[]; // Event types that count as a signal; compared with EvidenceEvent.eventType.
+  expectedSection?: string; // When set and non-empty, only events whose section equals this value match.
+}
+export interface L3ValidationResult { // Per-outcome result produced by validateAgainstGroundTruth.
+  outcomeId: string; // id of the OutcomeLabel that was checked.
+  passed: boolean; // validateAgainstGroundTruth sets this to the same value as signalDetected.
+  description: string; // Copied from the OutcomeLabel.
+  signalDetected: boolean; // True when at least one event matched the label.
+  matchedEvents: EvidenceEvent[]; // Events whose type (and section, when the label requires one) matched.
+  expectedEventTypes: string[]; // Copied from the OutcomeLabel.
+  precision: number; // As computed in this file: share of expectedEventTypes found among matchedEvents; 0 when nothing matched.
+  recall: number; // As computed in this file: 1 when anything matched, otherwise 0.
+}
+export interface L3ValidationSummary { // Aggregate returned by validateAgainstGroundTruth.
+  totalOutcomes: number; // Number of outcome labels that were validated.
+  passed: number; // Count of outcomes whose result has passed === true.
+  failed: number; // totalOutcomes minus passed.
+  overallPrecision: number; // Unweighted mean of per-outcome precision; 0 when there are no outcomes.
+  overallRecall: number; // Unweighted mean of per-outcome recall; 0 when there are no outcomes.
+  perOutcome: L3ValidationResult[]; // One result per outcome, in input order.
+}
+/**
+ * Checks detected evidence events against ground-truth outcome labels.
+ *
+ * An event matches an outcome when its `eventType` is listed in the outcome's
+ * `expectedEventTypes` and, if the outcome names an `expectedSection`, the
+ * event's `section` equals it. An outcome passes as soon as one event matches.
+ *
+ * Metric definitions used here (coarser than the textbook ones):
+ * - precision: share of `expectedEventTypes` entries seen among the matched events;
+ * - recall: 1 when anything matched, otherwise 0.
+ *
+ * @param outcomes Ground-truth labels to validate.
+ * @param events Detected events searched for matching signals.
+ * @returns Pass/fail counts, the unweighted means of per-outcome precision and
+ *   recall (both 0 when `outcomes` is empty) and the per-outcome details.
+ */
+export function validateAgainstGroundTruth(outcomes: OutcomeLabel[], events: EvidenceEvent[]): L3ValidationSummary {
+  const perOutcome: L3ValidationResult[] = [];
+  let passedCount = 0;
+  let precisionTotal = 0;
+  let recallTotal = 0;
+  for (const label of outcomes) {
+    const wantedTypes = label.expectedEventTypes;
+    const hits: EvidenceEvent[] = [];
+    const hitTypes = new Set<string>();
+    for (const candidate of events) {
+      if (!wantedTypes.includes(candidate.eventType)) continue;
+      // The section filter only applies when the label names a (non-empty) section.
+      if (label.expectedSection && candidate.section !== label.expectedSection) continue;
+      hits.push(candidate);
+      hitTypes.add(candidate.eventType);
+    }
+    const anyHit = hits.length > 0;
+    let precision = 0;
+    if (anyHit) {
+      const coveredTypes = wantedTypes.filter((wanted) => hitTypes.has(wanted)).length;
+      precision = coveredTypes / wantedTypes.length;
+    }
+    const recall = anyHit ? 1.0 : 0.0;
+    if (anyHit) passedCount += 1;
+    precisionTotal += precision;
+    recallTotal += recall;
+    perOutcome.push({
+      outcomeId: label.id,
+      passed: anyHit,
+      description: label.description,
+      signalDetected: anyHit,
+      matchedEvents: hits,
+      expectedEventTypes: wantedTypes,
+      precision,
+      recall,
+    });
+  }
+  const scored = perOutcome.length;
+  return {
+    totalOutcomes: outcomes.length,
+    passed: passedCount,
+    failed: outcomes.length - passedCount,
+    overallPrecision: scored > 0 ? precisionTotal / scored : 0,
+    overallRecall: scored > 0 ? recallTotal / scored : 0,
+    perOutcome,
+  };
+}
+export interface EvalTestCase { // One benchmark case scored by EvalHarness.evaluate.
+  id: string; // Case identifier; copied into EvalResult.testId.
+  description: string; // Human-readable statement of what the case checks.
+  pageTitle: string; // Title of the page under test; not read by evaluate in this file.
+  pageId: number; // NOTE(review): presumably the wiki page id — confirm; not read by evaluate in this file.
+  revisionRange: { from: number; to: number }; // Not read by evaluate in this file; the built-in cases all use { from: 0, to: 0 }.
+  expectedEvents: ExpectedEvent[]; // Events the case expects to find among the detected events.
+  tolerance?: EvalTolerance; // Optional pass thresholds; evaluate falls back to defaults when omitted.
+}
+export interface ExpectedEvent { // An event a test case expects to be detected.
+  eventType: string; // Must equal the detected event's eventType to match.
+  section: string; // Must equal the detected event's section to match.
+  minConfidence?: number; // NOTE(review): declared but never read by the evaluation code visible in this file.
+}
+export interface EvalTolerance { // Pass thresholds applied by EvalHarness.evaluate.
+  minEventCount?: number; // Minimum number of detected events; evaluate defaults this to 0.
+  maxEventCount?: number; // Maximum number of detected events; evaluate defaults this to Infinity.
+  minPrecision?: number; // Minimum precision required to pass; evaluate defaults this to 0.5.
+}
+export interface EvalResult { // Outcome of evaluating one EvalTestCase.
+  testId: string; // id of the evaluated test case.
+  passed: boolean; // True when precision and the detected-event count are within tolerance.
+  precision: number; // As computed by evaluate: matched expectations divided by total expectations.
+  eventCount: { expected: number; actual: number }; // Number of expected events vs. number of detected events supplied.
+  matches: EventMatch[]; // Expectations for which a detected event was found.
+  misses: MissingEvent[]; // Expectations with no matching detected event.
+  falsePositives: UnexpectedEvent[]; // Detected events whose eventType is not among the expected event types.
+}
+export interface EventMatch { // Pairing of an expectation with the detected event that satisfied it.
+  expected: ExpectedEvent; // The expectation that was satisfied.
+  actual: EvidenceEvent; // The first detected event with the same eventType and section.
+}
+export interface MissingEvent { // An expectation that no detected event satisfied.
+  expected: ExpectedEvent; // The unsatisfied expectation.
+}
+export interface UnexpectedEvent { // A detected event reported as a false positive.
+  event: EvidenceEvent; // The detected event whose eventType was not expected by the test case.
+}
+export interface EvalHarness { // Operations exposed by the object that createEvalHarness returns.
+  evaluate(test: EvalTestCase, events: EvidenceEvent[]): EvalResult; // Scores one test case against a list of detected events.
+  benchmarkPages(): EvalTestCase[]; // Returns the built-in list of benchmark test cases.
+  computeScores(results: EvalResult[]): EvalScoreSummary; // Aggregates individual results into a summary.
+}
+export interface EvalScoreSummary { // Aggregate produced by EvalHarness.computeScores.
+  overallPrecision: number; // Unweighted mean of per-test precision; 0 when there are no results.
+  testsPassed: number; // Number of results with passed === true.
+  testsFailed: number; // totalTests minus testsPassed.
+  totalTests: number; // Number of results that were aggregated.
+  perTest: Array<{ id: string; precision: number; passed: boolean }>; // Condensed view of each result, in input order.
+}
+/**
+ * Creates the default evaluation harness.
+ *
+ * - `evaluate` pairs every expected event with the first detected event that has
+ *   the same `eventType` and `section`. It reports detected events whose type is
+ *   not expected at all as false positives, and passes the case when precision
+ *   and the detected-event count are within tolerance.
+ * - `benchmarkPages` returns a fresh copy of the built-in benchmark cases.
+ * - `computeScores` condenses a list of results into totals and a mean precision.
+ *
+ * @returns A harness object; it keeps no state between calls.
+ */
+export function createEvalHarness(): EvalHarness {
+  // Builds one built-in case; every built-in case uses the same placeholder revision range.
+  const benchmarkCase = (
+    id: string,
+    description: string,
+    pageTitle: string,
+    pageId: number,
+    expectedEvents: ExpectedEvent[],
+    tolerance: EvalTolerance,
+  ): EvalTestCase => ({
+    id,
+    description,
+    pageTitle,
+    pageId,
+    revisionRange: { from: 0, to: 0 },
+    expectedEvents,
+    tolerance,
+  });
+  const harness: EvalHarness = {
+    evaluate(test, events) {
+      const wanted = test.expectedEvents;
+      const matches: EventMatch[] = [];
+      const misses: MissingEvent[] = [];
+      wanted.forEach((expected) => {
+        const actual = events.find(
+          (candidate) => candidate.eventType === expected.eventType && candidate.section === expected.section,
+        );
+        if (actual === undefined) {
+          misses.push({ expected });
+          return;
+        }
+        matches.push({ expected, actual });
+      });
+      // Only the event type decides a false positive; the section is not considered here.
+      const falsePositives: UnexpectedEvent[] = events
+        .filter((candidate) => wanted.every((expected) => expected.eventType !== candidate.eventType))
+        .map((event) => ({ event }));
+      let precision: number;
+      if (wanted.length > 0) {
+        precision = matches.length / wanted.length;
+      } else {
+        // With nothing expected, only an empty event list counts as a perfect score.
+        precision = events.length === 0 ? 1.0 : 0.0;
+      }
+      const lowerCount = test.tolerance?.minEventCount ?? 0;
+      const upperCount = test.tolerance?.maxEventCount ?? Infinity;
+      const precisionFloor = test.tolerance?.minPrecision ?? 0.5;
+      const precisionOk = precision >= precisionFloor;
+      const countOk = events.length >= lowerCount && events.length <= upperCount;
+      return {
+        testId: test.id,
+        passed: precisionOk && countOk,
+        precision,
+        eventCount: { expected: wanted.length, actual: events.length },
+        matches,
+        misses,
+        falsePositives,
+      };
+    },
+    benchmarkPages(): EvalTestCase[] {
+      return [
+        benchmarkCase(
+          "page-has-revisions",
+          "Any active Wikipedia page returns at least 2 revisions and generates section events",
+          "Earth",
+          9228,
+          [{ eventType: "section_reorganized", section: "(lead)" }],
+          { minEventCount: 1, minPrecision: 0.0 },
+        ),
+        benchmarkCase(
+          "contentious-page-has-reverts",
+          "Pages with edit wars should have revert events",
+          "Donald_Trump",
+          4848272,
+          [{ eventType: "revert_detected", section: "" }],
+          { minEventCount: 1, minPrecision: 0.0 },
+        ),
+        benchmarkCase(
+          "controversy-page-has-templates",
+          "Controversial topics have policy maintenance templates",
+          "COVID-19_pandemic",
+          58899562,
+          [{ eventType: "template_added", section: "body" }],
+          { minEventCount: 5, minPrecision: 0.0 },
+        ),
+        benchmarkCase(
+          "scientific-article-has-citations",
+          "Scientific articles always have citation changes",
+          "CRISPR",
+          5000000,
+          [
+            { eventType: "citation_added", section: "body" },
+            { eventType: "citation_removed", section: "body" },
+          ],
+          { minEventCount: 3, minPrecision: 0.1 },
+        ),
+        benchmarkCase(
+          "featured-article-has-template-cleanup",
+          "Featured articles show cleanup/maintenance template activity",
+          "Shakespeare",
+          26825,
+          [
+            { eventType: "template_added", section: "body" },
+            { eventType: "section_reorganized", section: "(lead)" },
+          ],
+          { minEventCount: 5, minPrecision: 0.1 },
+        ),
+        benchmarkCase(
+          "events-has-citation-additions",
+          "Pages with many citations will have observable citation diffs",
+          "Albert_Einstein",
+          736,
+          [{ eventType: "citation_added", section: "body" }],
+          { minEventCount: 2, minPrecision: 0.0 },
+        ),
+      ];
+    },
+    computeScores(results) {
+      let passedCount = 0;
+      let precisionTotal = 0;
+      const perTest: EvalScoreSummary["perTest"] = [];
+      for (const outcome of results) {
+        if (outcome.passed) passedCount += 1;
+        precisionTotal += outcome.precision;
+        perTest.push({ id: outcome.testId, precision: outcome.precision, passed: outcome.passed });
+      }
+      return {
+        overallPrecision: results.length > 0 ? precisionTotal / results.length : 0,
+        testsPassed: passedCount,
+        testsFailed: results.length - passedCount,
+        totalTests: results.length,
+        perTest,
+      };
+    },
+  };
+  return harness;
+}
+export { GROUND_TRUTH_LABELS, getGroundTruthById, getGroundTruthForPage } from "./ground-truth.js";

package/src/l2-benchmark.ts ADDED Viewed

@@ -0,0 +1,454 @@
+import type { EvidenceEvent } from "@var-ia/evidence-graph";
+import type { ModelConfig } from "@var-ia/interpreter";
+import { createAdapter } from "@var-ia/interpreter";
+export interface L2TestCase { // One interpretation benchmark case: input events plus the expected readings.
+  id: string; // Case identifier.
+  description: string; // Human-readable summary of the scenario.
+  events: EvidenceEvent[]; // Events handed to the adapter's interpret() call as one batch.
+  expected: ExpectedInterpretation[]; // Expectations, each tied to one event through eventIndex.
+}
+export interface ExpectedInterpretation { // Expected model interpretation for one event of an L2TestCase.
+  eventIndex: number; // Index into the array returned by the adapter's interpret() call.
+  semanticChange?: string; // When set, must strictly equal the model's semanticChange.
+  confidence?: number; // NOTE(review): never compared numerically against the model's confidence in the visible code.
+  policyDimension?: string; // When set, must strictly equal the model's policyDimension.
+  discussionType?: string; // When set, must strictly equal the model's discussionType.
+}
+export interface L2MetricScore { // Accuracy of one interpretation field for one provider.
+  metric: string; // Field name: "semanticChange", "policyDimension" or "discussionType".
+  correct: number; // Expectations for which the model value equalled the expected value.
+  total: number; // Expectations that specified this field and were scored.
+  accuracy: number; // correct / total as a percentage, rounded to two decimals.
+}
+export interface L2ProviderResult { // Benchmark outcome for a single provider configuration.
+  provider: string; // Copied from ModelConfig.provider.
+  model: string; // ModelConfig.model, or a fallback label when the config names no model.
+  metrics: L2MetricScore[]; // Per-field scores; empty when the provider run failed.
+  avgConfidence: number; // Mean of accumulated model confidences, rounded to two decimals; 0 when none were accumulated.
+  overallAccuracy: number; // Percentage of scored expectations whose every checked field matched, rounded to two decimals.
+  totalEvents: number; // Number of expectations that had a model interpretation and were scored; 0 when the run failed.
+}
+export interface L2BenchmarkResult { // Full report returned by runL2Benchmark.
+  generatedAt: string; // Creation time as produced by Date.prototype.toISOString().
+  testCases: number; // Number of cases in the dataset.
+  totalEvents: number; // Total number of events across all cases in the dataset.
+  providers: L2ProviderResult[]; // One entry per provider config, in input order.
+}
+/**
+ * Runs the built-in L2 dataset against every given provider, one after another.
+ *
+ * A provider whose adapter cannot be created, or whose run throws, does not
+ * abort the benchmark. The failure is reported with `console.warn` and the
+ * provider is recorded with empty metrics and `totalEvents: 0`, which
+ * `printBenchmarkResult` renders as skipped.
+ *
+ * @param providers Model configurations to benchmark, processed in order.
+ * @returns The report, with one provider entry per input configuration.
+ */
+export async function runL2Benchmark(providers: ModelConfig[]): Promise<L2BenchmarkResult> {
+  const testCases = buildL2Dataset();
+  const totalEvents = testCases.reduce((s, tc) => s + tc.events.length, 0);
+  const providerResults: L2ProviderResult[] = [];
+  for (const config of providers) {
+    const modelLabel = config.model ?? "unknown";
+    try {
+      const adapter = createAdapter(config);
+      const results = await runProviderBenchmark(config, adapter, testCases);
+      providerResults.push(results);
+    } catch (err) {
+      // Surface the reason so a failed provider can be told apart from one
+      // that merely produced no interpretations.
+      const reason = err instanceof Error ? err.message : String(err);
+      console.warn(`L2 benchmark: provider ${config.provider}/${modelLabel} failed: ${reason}`);
+      providerResults.push({
+        provider: config.provider,
+        model: modelLabel,
+        metrics: [],
+        avgConfidence: 0,
+        overallAccuracy: 0,
+        totalEvents: 0,
+      });
+    }
+  }
+  return {
+    generatedAt: new Date().toISOString(),
+    testCases: testCases.length,
+    totalEvents,
+    providers: providerResults,
+  };
+}
+/**
+ * Scores one provider's interpretations against the expectations of every test case.
+ *
+ * For each test case the adapter interprets the whole event batch once. Each
+ * expectation is then compared field by field (`semanticChange`,
+ * `policyDimension`, `discussionType`) using strict equality; only fields the
+ * expectation sets to a non-empty value are checked. An expectation counts as
+ * correct when every checked field matches.
+ *
+ * Expectations whose event came back without a `modelInterpretation` are
+ * skipped entirely: they count neither as scored nor as wrong.
+ *
+ * The average confidence covers every scored interpretation that reports a
+ * finite numeric confidence. It is not gated on the expectation carrying a
+ * `confidence` value, and a missing model confidence cannot turn the sum into NaN.
+ *
+ * @param config Provider configuration; supplies the provider and model labels.
+ * @param adapter Adapter whose `interpret` method is called once per test case.
+ * @param testCases Cases to run.
+ * @returns Per-field metrics (only fields with at least one scored expectation),
+ *   the average confidence rounded to two decimals, and the overall accuracy as
+ *   a percentage rounded to two decimals.
+ */
+async function runProviderBenchmark(
+  config: ModelConfig,
+  adapter: ReturnType<typeof createAdapter>,
+  testCases: L2TestCase[],
+): Promise<L2ProviderResult> {
+  const scoredFields = ["semanticChange", "policyDimension", "discussionType"] as const;
+  const metricBuckets = {
+    semanticChange: { correct: 0, total: 0 },
+    policyDimension: { correct: 0, total: 0 },
+    discussionType: { correct: 0, total: 0 },
+  };
+  let eventsCorrect = 0;
+  let totalEvents = 0;
+  let totalConfidence = 0;
+  let confidenceCount = 0;
+  for (const tc of testCases) {
+    const interpreted = await adapter.interpret(tc.events);
+    for (const exp of tc.expected) {
+      const interpretation = interpreted[exp.eventIndex]?.modelInterpretation;
+      if (!interpretation) continue;
+      totalEvents++;
+      let eventCorrect = true;
+      for (const field of scoredFields) {
+        const wanted = exp[field];
+        // Unset (or empty) expectation fields are not scored.
+        if (!wanted) continue;
+        const bucket = metricBuckets[field];
+        bucket.total++;
+        if (interpretation[field] === wanted) {
+          bucket.correct++;
+        } else {
+          eventCorrect = false;
+        }
+      }
+      if (eventCorrect) eventsCorrect++;
+      const confidence: unknown = interpretation.confidence;
+      if (typeof confidence === "number" && Number.isFinite(confidence)) {
+        totalConfidence += confidence;
+        confidenceCount++;
+      }
+    }
+  }
+  const metrics: L2MetricScore[] = scoredFields
+    .filter((field) => metricBuckets[field].total > 0)
+    .map((field) => {
+      const { correct, total } = metricBuckets[field];
+      return {
+        metric: field,
+        correct,
+        total,
+        accuracy: Math.round((correct / total) * 10000) / 100,
+      };
+    });
+  return {
+    provider: config.provider,
+    model: config.model ?? "default",
+    metrics,
+    avgConfidence: confidenceCount > 0 ? Math.round((totalConfidence / confidenceCount) * 100) / 100 : 0,
+    overallAccuracy: totalEvents > 0 ? Math.round((eventsCorrect / totalEvents) * 10000) / 100 : 0,
+    totalEvents,
+  };
+}
+/**
+ * Writes a human-readable benchmark report to stdout via `console.log`.
+ *
+ * Prints a header, then one section per provider. A provider with
+ * `totalEvents === 0` is shown as skipped; otherwise its overall accuracy,
+ * average confidence and per-metric scores are listed, followed by a blank line.
+ *
+ * @param result Report to print.
+ */
+export function printBenchmarkResult(result: L2BenchmarkResult): void {
+  const { testCases, totalEvents, providers } = result;
+  console.log(`\n=== L2 Quality Benchmarks ===`);
+  console.log(`Test cases: ${testCases} (${totalEvents} events)`);
+  console.log(`Providers:  ${providers.length}`);
+  console.log();
+  providers.forEach((entry) => {
+    console.log(`── ${entry.provider}/${entry.model} ──`);
+    const hasResults = entry.totalEvents !== 0;
+    if (!hasResults) {
+      console.log("  (skipped — no results)");
+      return;
+    }
+    console.log(`  Overall accuracy: ${entry.overallAccuracy}%`);
+    console.log(`  Avg confidence:  ${entry.avgConfidence}`);
+    entry.metrics.forEach(({ metric, correct, total, accuracy }) => {
+      console.log(`  ${metric}: ${correct}/${total} (${accuracy}%)`);
+    });
+    console.log();
+  });
+}
+/**
+ * Builds an evidence event for the dataset from fixed defaults.
+ *
+ * @param overrides Fields that replace the defaults; omitted fields keep them.
+ * @returns A new event object; the default `deterministicFacts` array is
+ *   created per call, so results never share it.
+ */
+function makeEvent(overrides: Partial<EvidenceEvent> = {}): EvidenceEvent {
+  const base: EvidenceEvent = {
+    eventType: "claim_first_seen",
+    fromRevisionId: 1,
+    toRevisionId: 2,
+    section: "lead",
+    before: "",
+    after: "",
+    deterministicFacts: [],
+    layer: "observed",
+    timestamp: "2024-01-01T00:00:00Z",
+  };
+  // `base` is freshly created above, so copying the overrides onto it is safe.
+  return Object.assign(base, overrides);
+}
+/**
+ * Returns the built-in L2 interpretation dataset.
+ *
+ * Every case contains exactly one synthetic event (built with `makeEvent`) and
+ * one expectation that points at it through `eventIndex: 0`. A new array with
+ * new objects is created on every call.
+ *
+ * @returns The thirteen benchmark cases, in a fixed order.
+ */
+export function buildL2Dataset(): L2TestCase[] {
+  // Assembles a one-event case; the single expectation always targets index 0.
+  const singleEventCase = (
+    id: string,
+    description: string,
+    event: Partial<EvidenceEvent>,
+    expectation: Omit<ExpectedInterpretation, "eventIndex">,
+  ): L2TestCase => ({
+    id,
+    description,
+    events: [makeEvent(event)],
+    expected: [{ eventIndex: 0, ...expectation }],
+  });
+  return [
+    singleEventCase(
+      "simple-claim-add",
+      "A new factual claim appears in the lead section",
+      {
+        eventType: "claim_first_seen",
+        section: "lead",
+        after: "Earth is the third planet from the Sun",
+        deterministicFacts: [{ fact: "claim_detected", detail: "sentence_length=42" }],
+      },
+      { semanticChange: "factual claim introduced", policyDimension: "verifiability" },
+    ),
+    singleEventCase(
+      "claim-removal",
+      "A claim is removed between revisions",
+      {
+        eventType: "claim_removed",
+        section: "body",
+        before: "Some scientists believe the theory is flawed",
+        deterministicFacts: [{ fact: "claim_removed", detail: "sentence_length=42" }],
+      },
+      { semanticChange: "factual claim removed", policyDimension: "verifiability" },
+    ),
+    singleEventCase(
+      "claim-softened",
+      "A claim is softened with hedging language",
+      {
+        eventType: "claim_softened",
+        section: "body",
+        before: "The update resolves the outage",
+        after: "The update may reduce the outage",
+        deterministicFacts: [{ fact: "claim_changed", detail: "change=softened" }],
+      },
+      { semanticChange: "claim softened with hedging", policyDimension: "npov" },
+    ),
+    singleEventCase(
+      "claim-strengthened",
+      "A claim is strengthened with more definitive language",
+      {
+        eventType: "claim_strengthened",
+        section: "body",
+        before: "The event may have occurred in 1920",
+        after: "The event occurred in 1920",
+        deterministicFacts: [{ fact: "claim_changed", detail: "change=strengthened" }],
+      },
+      { semanticChange: "claim strengthened with definitive language" },
+    ),
+    singleEventCase(
+      "citation-added",
+      "A citation is added to support a claim",
+      {
+        eventType: "citation_added",
+        section: "body",
+        after: '<ref name="smith2023">{{cite journal |title=Study}}</ref>',
+        deterministicFacts: [{ fact: "citation_changed", detail: "type=added" }],
+      },
+      { semanticChange: "citation added to support claim", policyDimension: "verifiability" },
+    ),
+    singleEventCase(
+      "citation-removed",
+      "A citation is removed from a claim",
+      {
+        eventType: "citation_removed",
+        section: "body",
+        before: '<ref name="old2020">{{cite web |title=Old}}</ref>',
+        deterministicFacts: [{ fact: "citation_changed", detail: "type=removed" }],
+      },
+      { semanticChange: "citation removed from article", policyDimension: "verifiability" },
+    ),
+    singleEventCase(
+      "template-npov",
+      "A POV template is added, indicating neutrality concern",
+      {
+        eventType: "template_added",
+        section: "body",
+        after: "POV",
+        deterministicFacts: [
+          { fact: "template_changed", detail: "name=POV type=added" },
+          { fact: "policy_signal", detail: "dimension=npov signal=pov" },
+        ],
+        layer: "policy_coded",
+      },
+      { semanticChange: "neutrality concern template added", policyDimension: "npov" },
+    ),
+    singleEventCase(
+      "blp-template",
+      "A BLP template is added to a biography",
+      {
+        eventType: "template_added",
+        section: "body",
+        after: "BLP sources",
+        deterministicFacts: [
+          { fact: "template_changed", detail: "name=BLP sources type=added" },
+          { fact: "policy_signal", detail: "dimension=blp signal=blp_sources" },
+        ],
+        layer: "policy_coded",
+      },
+      { semanticChange: "BLP sourcing concern template added", policyDimension: "blp" },
+    ),
+    singleEventCase(
+      "revert-detected",
+      "A revert is detected in edit war",
+      {
+        eventType: "revert_detected",
+        section: "",
+        after: "Undid revision 123456 by UserX",
+        deterministicFacts: [
+          { fact: "revert_detected", detail: "Undid revision 123456 by UserX" },
+          { fact: "policy_signal", detail: "dimension=edit_warring signal=revert_detected" },
+        ],
+        layer: "policy_coded",
+      },
+      { semanticChange: "revert detected indicating edit warring", policyDimension: "edit_warring" },
+    ),
+    singleEventCase(
+      "talk-sourcing-dispute",
+      "Talk page discussion about sourcing",
+      {
+        eventType: "talk_page_correlated",
+        section: "Sources",
+        deterministicFacts: [{ fact: "talk_revision_match", detail: "type=discussion" }],
+      },
+      { semanticChange: "talk page discussion about sources", discussionType: "sourcing_dispute" },
+    ),
+    singleEventCase(
+      "talk-notability",
+      "Talk page discussion about notability",
+      {
+        eventType: "talk_page_correlated",
+        section: "Notability",
+        deterministicFacts: [{ fact: "talk_revision_match", detail: "type=discussion" }],
+      },
+      { semanticChange: "talk page discussion about notability", discussionType: "notability_challenge" },
+    ),
+    singleEventCase(
+      "category-change",
+      "Category added, changing page classification",
+      {
+        eventType: "category_added",
+        section: "",
+        after: "Living people",
+        deterministicFacts: [{ fact: "category_added", detail: "category=Living people" }],
+      },
+      { semanticChange: "category added to page classification" },
+    ),
+    singleEventCase(
+      "protection-change",
+      "Page protection level changed",
+      {
+        eventType: "protection_changed",
+        section: "",
+        after: "protect",
+        deterministicFacts: [
+          { fact: "protection_changed", detail: "name=pp-protect type=added" },
+          { fact: "policy_signal", detail: "dimension=protection signal=page_protected" },
+        ],
+        layer: "policy_coded",
+      },
+      { semanticChange: "page protection level changed", policyDimension: "protection" },
+    ),
+  ];
+}