@irisrun/evals 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ import type { EngineDeps, Json, TurnOutcome } from "@irisrun/core";
2
+ import { type SessionInspection } from "@irisrun/inspect";
3
/**
 * A scenario that the eval runners in this package can execute.
 *
 * The runners call `build()` once per run, execute `turns` sequential turns against
 * the returned deps/session, and copy `name` onto the result.
 */
export interface EvalCase<S extends Json> {
  /** Label copied verbatim onto the result produced for this case. */
  name: string;
  /** Produces the engine dependencies and the session id used for one run. */
  build(): { deps: EngineDeps<S>; sessionId: string };
  /** How many sequential turns to run; the runners treat an omitted value as 1. */
  turns?: number;
}
11
/**
 * Turns a finished run into a JSON score.
 *
 * Receives the inspection of the recorded session and the outcome of the last turn
 * that was run.
 */
export type Scorer<S extends Json> = (
  inspection: SessionInspection,
  outcome: TurnOutcome<S>,
) => Json;
12
/** The scored result of running one eval case. */
export interface EvalResult {
  /** Name of the case this result belongs to. */
  name: string;
  /** Value returned by the scorer; `null` when no turn produced an outcome. */
  score: Json;
  /** Status of the last turn outcome, or "open" when no turn produced an outcome. */
  status: "open" | TurnOutcome<Json>["status"];
}
17
/** Aggregate returned by `runSuite`. */
export interface SuiteResult {
  /** One result per case, in the order the cases were supplied. */
  results: EvalResult[];
}
20
/**
 * Run one eval case and score it.
 *
 * Deterministic and reproducible across invocations for a deterministic case.
 *
 * @param c - The case to run; its `build()` is invoked once.
 * @param scorer - Scores the inspected session together with the last turn outcome.
 * @returns The case name, the score, and the status of the last turn.
 */
export declare function runEval<S extends Json>(
  c: EvalCase<S>,
  scorer: Scorer<S>,
): Promise<EvalResult>;
22
/**
 * Run a suite of cases under a single scorer and collect the results.
 *
 * @param cases - The cases to run.
 * @param scorer - The scorer applied to every case.
 * @returns The per-case results, in input order.
 */
export declare function runSuite<S extends Json>(
  cases: EvalCase<S>[],
  scorer: Scorer<S>,
): Promise<SuiteResult>;
package/dist/evals.js ADDED
@@ -0,0 +1,33 @@
1
+ // The reproducible-eval arbiter (spec 03 §7): "reproducible evals are the arbiter,
2
+ // not editorial taste." An EvalCase is a DETERMINISTIC scenario; a Scorer reads the
3
+ // recorded session (via @irisrun/inspect) and the last turn outcome. runEval calls
4
+ // case.build() on EVERY invocation, so it gets a FRESH store AND fresh performers
5
+ // (the scripted-model/-tool closure index resets to 0); within a single run the
6
+ // built performers PERSIST across the `turns` (a park→resume advances the index).
7
+ // It runs EXACTLY `turns` (default 1) sequential runTurn calls — NEVER
8
+ // loop-until-finished (a perpetually-parking case must not hang) — and scores the
9
+ // LAST outcome. Same case+scorer → byte-identical score; swapped tactic → different.
10
+ // Runner is core runTurn (no host needed). READ-ONLY scoring.
11
+ import { runTurn } from "@irisrun/core";
12
+ import { inspectSession } from "@irisrun/inspect";
13
/**
 * Run one eval case and score it.
 *
 * Builds the case once, runs `c.turns ?? 1` sequential turns against the same deps
 * and session id, inspects the recorded session, and hands the inspection plus the
 * last outcome to the scorer.
 *
 * @param c - The case to run.
 * @param scorer - Called with the inspection and the last turn outcome.
 * @returns `{ name, score, status }`; score is `null` and status is "open" when no
 *   turn produced an outcome.
 */
export async function runEval(c, scorer) {
    // One build per invocation; the same deps are reused for every turn below.
    const built = c.build();
    const deps = built.deps;
    const sessionId = built.sessionId;
    const turnBudget = c.turns ?? 1;

    // A fixed number of turns, never "until finished".
    let lastOutcome = null;
    let turnsDone = 0;
    while (turnsDone < turnBudget) {
        lastOutcome = await runTurn(deps, sessionId);
        turnsDone += 1;
    }

    const inspection = await inspectSession(deps.store, sessionId);

    // No outcome to score: report the session as still open.
    if (!lastOutcome) {
        return { name: c.name, score: null, status: "open" };
    }

    const status = lastOutcome.status;
    const score = scorer(inspection, lastOutcome);
    return { name: c.name, score, status };
}
27
/**
 * Run every case under one scorer and collect the results.
 *
 * Cases run strictly one after another, each through `runEval`.
 *
 * @param cases - The cases to run, in order.
 * @param scorer - The scorer applied to every case.
 * @returns `{ results }`, with one entry per case in input order.
 */
export async function runSuite(cases, scorer) {
    const collected = [];
    for (const evalCase of cases) {
        const result = await runEval(evalCase, scorer);
        collected.push(result);
    }
    return { results: collected };
}
@@ -0,0 +1,5 @@
1
+ export declare const PACKAGE = "@irisrun/evals";
2
+ export { runEval, runSuite } from "./evals.js";
3
+ export type { EvalCase, Scorer, EvalResult, SuiteResult } from "./evals.js";
4
+ export { reproduce } from "./reproduce.js";
5
+ export type { ReproReport } from "./reproduce.js";
package/dist/index.js ADDED
@@ -0,0 +1,4 @@
1
+ // @irisrun/evals — public surface (host; reproducible-eval arbiter, read-only scoring).
2
+ export const PACKAGE = "@irisrun/evals";
3
+ export { runEval, runSuite } from "./evals.js";
4
+ export { reproduce } from "./reproduce.js";
@@ -0,0 +1,18 @@
1
+ import type { Json } from "@irisrun/core";
2
+ import type { EvalCase, Scorer, EvalResult } from "./evals.js";
3
/** Verdict returned by `reproduce` for one eval case. */
export type ReproReport = {
  /** Name of the case that was run. */
  name: string;
  /** True when no run differed from the first run. */
  reproducible: boolean;
  /**
   * Requested run count after clamping to a minimum of 2. Execution stops at the
   * first divergence, so fewer runs may actually have executed.
   */
  runs: number;
  /** Result of the first run. */
  result: EvalResult;
  /** Hex digest of the first run's retained journal. */
  journalDigest: string;
  /** Present only when a run differed from the first run. */
  divergence?: {
    /** Zero-based index of the first run that differed from run 0. */
    run: number;
    /** First differing field, checked in the order score, status, journal. */
    field: "score" | "status" | "journal";
  };
};
14
/**
 * Run an EvalCase N≥2 times and prove byte-identical results.
 *
 * Deterministic given a deterministic case; otherwise the first divergence is
 * located and reported.
 *
 * @param c - The case to run repeatedly.
 * @param scorer - The scorer applied to every run.
 * @param opts - Optional settings; `runs` is the requested number of runs.
 * @returns A report comparing every run against the first.
 */
export declare function reproduce<S extends Json>(
  c: EvalCase<S>,
  scorer: Scorer<S>,
  opts?: { runs?: number },
): Promise<ReproReport>;
@@ -0,0 +1,69 @@
1
+ // reproduce() (roadmap P2-8): makes "reproducible evals" an EXPLICIT, provable
2
+ // feature, not just an implicit property. It runs an EvalCase N independent times
3
+ // (each `case.build()` is a fresh store + performers, index reset to 0 — the EvalCase
4
+ // contract) and proves byte-identical {score, status, FULL-journal digest} across
5
+ // runs. The journal digest is the strong claim: not only does the score match, the
6
+ // entire recorded session is byte-identical run-to-run. First divergence is located.
7
+ //
8
+ // PRECONDITION: the full-journal digest reads from seq 0, so it covers the COMPLETE
9
+ // journal only when the case does not truncate (the eval norm — short cases, default
10
+ // no snapshot, or keepHistory). If a case truncates, the digest covers the retained
11
+ // tail; the reproducibility verdict stays sound (both runs truncate identically).
12
+ import { runTurn, canonicalize, decode } from "@irisrun/core";
13
+ import { inspectSession } from "@irisrun/inspect";
14
/**
 * 32-bit FNV-1a over the UTF-16 code units of `s`, rendered as 8 lowercase hex digits.
 *
 * A short fingerprint, not a security hash. Inlined so this package needs no extra
 * dependency.
 */
function fnv1a32hex(s) {
    const OFFSET_BASIS = 0x811c9dc5;
    const PRIME = 0x01000193;
    let hash = OFFSET_BASIS;
    let pos = 0;
    while (pos < s.length) {
        // XOR in the code unit, multiply by the prime in 32-bit integer arithmetic,
        // then reinterpret the result as unsigned.
        hash = Math.imul(hash ^ s.charCodeAt(pos), PRIME) >>> 0;
        pos += 1;
    }
    return hash.toString(16).padStart(8, "0");
}
24
/**
 * Run an EvalCase several times and report whether every run matched the first.
 *
 * Each run calls `c.build()` for its own deps and session id, executes
 * `c.turns ?? 1` sequential `runTurn` calls, scores the last outcome (score `null`
 * and status "open" when no turn produced an outcome), and digests the journal read
 * back from seq 0. Every later run is compared with the first in the order
 * score → status → journal; the first mismatch is recorded and ends the loop.
 *
 * @param c - The case to run; `build()` is invoked once per run.
 * @param scorer - Called with the inspection and the last outcome of each run.
 * @param opts - `runs` is the requested run count. It is coerced to a number,
 *   rounded up to an integer and clamped to at least 2; a non-finite value
 *   (NaN, ±Infinity) falls back to 2.
 * @returns The first run's result and journal digest, the effective run count, and
 *   the first divergence if any run differed from the first.
 */
export async function reproduce(c, scorer, opts) {
    // Math.max(2, NaN) is NaN, which would skip the loop and leave nothing to report,
    // and Infinity would never terminate for a deterministic case. Normalize to a
    // finite integer ≥ 2 so the first run always executes and `runs` matches the
    // number of iterations the loop is allowed to perform.
    const requested = Number(opts?.runs ?? 2);
    const runs = Number.isFinite(requested) ? Math.max(2, Math.ceil(requested)) : 2;
    const turns = c.turns ?? 1;
    let firstResult = null;
    let canonical = null;
    let divergence;
    for (let i = 0; i < runs; i++) {
        // One build per run; the same deps are reused for every turn of that run.
        const { deps, sessionId } = c.build();
        let outcome = null;
        for (let t = 0; t < turns; t++)
            outcome = await runTurn(deps, sessionId);
        const inspection = await inspectSession(deps.store, sessionId);
        const status = outcome ? outcome.status : "open";
        const score = outcome ? scorer(inspection, outcome) : null;
        // Digest whatever the store returns when the journal is read from seq 0.
        const rows = await deps.store.readJournal(sessionId, 0);
        const journalDigest = fnv1a32hex(canonicalize(rows.map((r) => decode(r.bytes))));
        // Compare canonical strings rather than raw values.
        const sig = { score: canonicalize(score), status: canonicalize(status), journalDigest };
        if (i === 0) {
            firstResult = { name: c.name, score, status };
            canonical = sig;
        }
        else if (canonical) {
            // Field precedence score → status → journal: a run that differs in several
            // fields reports the first one by this order.
            if (sig.score !== canonical.score)
                divergence = { run: i, field: "score" };
            else if (sig.status !== canonical.status)
                divergence = { run: i, field: "status" };
            else if (sig.journalDigest !== canonical.journalDigest)
                divergence = { run: i, field: "journal" };
            if (divergence)
                break;
        }
    }
    return {
        name: c.name,
        reproducible: divergence === undefined,
        runs,
        // runs is a finite integer ≥ 2, so the i === 0 branch ran and both are set.
        result: firstResult,
        journalDigest: canonical.journalDigest,
        divergence,
    };
}
package/package.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "name": "@irisrun/evals",
3
+ "version": "0.1.0",
4
+ "type": "module",
5
+ "description": "Iris reproducible-eval arbiter (spec 03 §7) — run a deterministic scenario (fresh store + scripted performers per run), then score the recorded session via @irisrun/inspect. The arbiter is reproducibility, not taste: the same case+scorer re-runs byte-identically; a swapped tactic scores differently but reproducibly. Runner is core runTurn; deps @irisrun/core + @irisrun/inspect.",
6
+ "exports": {
7
+ ".": {
8
+ "iris-src": "./src/index.ts",
9
+ "types": "./dist/index.d.ts",
10
+ "default": "./dist/index.js"
11
+ }
12
+ },
13
+ "dependencies": {
14
+ "@irisrun/core": "^0.1.0",
15
+ "@irisrun/inspect": "^0.1.0"
16
+ },
17
+ "license": "MIT",
18
+ "engines": {
19
+ "node": ">=24"
20
+ },
21
+ "publishConfig": {
22
+ "access": "public"
23
+ },
24
+ "repository": {
25
+ "type": "git",
26
+ "url": "git+https://github.com/xoai/iris.git",
27
+ "directory": "packages/evals"
28
+ },
29
+ "homepage": "https://github.com/xoai/iris#readme",
30
+ "files": [
31
+ "dist"
32
+ ]
33
+ }