useathena 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/README.md +258 -0
  2. package/apps/chrome-extension/README.md +35 -0
  3. package/apps/chrome-extension/background.js +97 -0
  4. package/apps/chrome-extension/gmail.js +107 -0
  5. package/apps/chrome-extension/linkedin.js +123 -0
  6. package/apps/chrome-extension/manifest.json +27 -0
  7. package/apps/chrome-extension/options.html +60 -0
  8. package/apps/chrome-extension/options.js +36 -0
  9. package/apps/chrome-extension/popup.html +37 -0
  10. package/apps/chrome-extension/popup.js +22 -0
  11. package/bin/athena +28 -0
  12. package/dist/api/server.js +145 -0
  13. package/dist/capture/ingest.js +85 -0
  14. package/dist/cli/commands.js +201 -0
  15. package/dist/cli/format.js +76 -0
  16. package/dist/cli/setup.js +316 -0
  17. package/dist/cli.js +291 -0
  18. package/dist/config.js +26 -0
  19. package/dist/core/fixtures.js +65 -0
  20. package/dist/core/ids.js +34 -0
  21. package/dist/core/refs.js +25 -0
  22. package/dist/core/types.js +10 -0
  23. package/dist/engine/engine.js +136 -0
  24. package/dist/engine/parse.js +76 -0
  25. package/dist/engine/prompts.js +64 -0
  26. package/dist/eval/harness.js +123 -0
  27. package/dist/eval/judge.js +75 -0
  28. package/dist/eval/run-eval.js +46 -0
  29. package/dist/eval/scenarios.js +470 -0
  30. package/dist/mcp/server.js +107 -0
  31. package/dist/mcp-server.js +7 -0
  32. package/dist/model/api-model-client.js +99 -0
  33. package/dist/model/cli-model-client.js +111 -0
  34. package/dist/model/model-client.js +28 -0
  35. package/dist/model/registry.js +67 -0
  36. package/dist/sensors/claude-code-hook.js +131 -0
  37. package/dist/serve/brief.js +95 -0
  38. package/dist/serve/outcome.js +56 -0
  39. package/dist/store/open.js +19 -0
  40. package/dist/store/store.js +269 -0
  41. package/docs/schema.md +368 -0
  42. package/package.json +43 -0
  43. package/scripts/prepare.mjs +20 -0
package/dist/config.js ADDED
@@ -0,0 +1,26 @@
1
+ import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
2
+ import { homedir } from "node:os";
3
+ import { dirname, join } from "node:path";
4
/**
 * Resolve where the athena database lives.
 *
 * athena is a personal layer, so there is one store per user by default. The
 * ATHENA_DB environment variable overrides the location, which tests and
 * project-scoped setups rely on.
 *
 * @returns {string} ATHENA_DB when it is set, otherwise `~/.athena/athena.db`.
 */
export function dbPath() {
  const override = process.env.ATHENA_DB;
  if (override !== undefined && override !== null) {
    return override;
  }
  return join(homedir(), ".athena", "athena.db");
}
11
/**
 * Resolve where the athena config file lives.
 *
 * @returns {string} ATHENA_CONFIG when it is set, otherwise `~/.athena/config.json`.
 */
export function configPath() {
  const override = process.env.ATHENA_CONFIG;
  if (override !== undefined && override !== null) {
    return override;
  }
  return join(homedir(), ".athena", "config.json");
}
14
/**
 * Read the persisted config from `configPath()`.
 *
 * Loading is deliberately best-effort: a missing, unreadable or malformed file
 * yields an empty config instead of an error.
 *
 * @returns {object} the parsed config object, or `{}` when none can be loaded.
 */
export function loadConfig() {
  let parsed;
  try {
    parsed = JSON.parse(readFileSync(configPath(), "utf8"));
  } catch {
    return {};
  }
  // Valid JSON is not necessarily an object: `null`, `42`, `"x"` or `[]` would
  // otherwise be handed to callers that expect to read properties off a config.
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    return {};
  }
  return parsed;
}
22
/**
 * Persist `config` as pretty-printed JSON at `configPath()`.
 *
 * The parent directory is created when missing, and the file is written with
 * mode 0o600 requested.
 *
 * @param {object} config value to serialize.
 */
export function saveConfig(config) {
  const target = configPath();
  const parentDir = dirname(target);
  mkdirSync(parentDir, { recursive: true });
  const serialized = JSON.stringify(config, null, 2);
  writeFileSync(target, `${serialized}\n`, { mode: 0o600 });
}
@@ -0,0 +1,65 @@
1
+ import { newId } from "./ids.js";
2
+ /**
3
+ * Fixture builders for tests and the eval harness. Defaults are valid records;
4
+ * override only what the scenario needs.
5
+ */
6
+ export const TEST_ACTOR = "act_0000000000TESTACTOR0000XX";
7
/**
 * Build a valid instance record for tests and the eval harness.
 *
 * The defaults describe a small "correction" in the `email.outreach` domain.
 * `overrides` is merged shallowly on top, so overriding a nested object such
 * as `situation` replaces it wholesale.
 *
 * @param {object} [overrides] top-level fields to replace.
 * @returns {object} a fresh instance record with a newly generated id.
 */
export function makeInstance(overrides = {}) {
  const epoch = new Date(0).toISOString();
  const situation = {
    summary: "drafting cold outreach to enterprise CTO",
    domain: "email.outreach",
    cues: [],
    objectIds: [],
  };
  const before = {
    mediaType: "text/plain",
    content: "Hi Priya, hope you're doing great!",
    contentHash: "hash-before",
  };
  const after = {
    mediaType: "text/plain",
    content: "Dear Ms. Shah, I hope this finds you well.",
    contentHash: "hash-after",
  };
  const diff = {
    summary: "changed greeting from casual first-name to formal title",
    hunks: [{ before: "Hi Priya", after: "Dear Ms. Shah" }],
    magnitude: "minor",
  };
  return {
    id: newId("ins"),
    kind: "correction",
    observedAt: epoch,
    situation,
    before,
    after,
    diff,
    probeAnswers: [],
    sensorId: "sen_test",
    actorId: TEST_ACTOR,
    sourceRefs: [],
    objectIds: [],
    visibility: "user_private_raw",
    canPromote: false,
    canUseForAgents: false,
    // Caller-supplied fields win over every default above.
    ...overrides,
  };
}
44
/**
 * Build a valid hypothesis record for tests and the eval harness.
 *
 * The defaults describe an unreviewed "candidate" rule in `email.outreach`,
 * created at the Unix epoch and going stale 90 days after it. `overrides` is
 * merged shallowly on top.
 *
 * @param {object} [overrides] top-level fields to replace.
 * @returns {object} a fresh hypothesis record with newly generated ids.
 */
export function makeHypothesis(overrides = {}) {
  const DAY_MS = 24 * 3600 * 1000;
  const createdAt = new Date(0).toISOString();
  const staleAfter = new Date(90 * DAY_MS).toISOString();
  return {
    id: newId("hyp"),
    status: "candidate",
    rule: "When drafting outreach to enterprise contacts, use formal titles in greetings.",
    cues: ["recipient is senior at an enterprise account", "first written contact"],
    expectancies: ["recipient replies without tone friction"],
    domain: "email.outreach",
    appliesWhen: ["first or early-stage outreach", "recipient seniority is director+"],
    doesNotApplyWhen: ["established casual rapport with the recipient"],
    supportingInstanceIds: [newId("ins")],
    counterexampleInstanceIds: [],
    confidence: 0.5,
    validity: { fires: 0, upheld: 0, overridden: 0 },
    replay: { tested: 0, reproduced: 0 },
    createdAt,
    staleAfter,
    visibility: "user_private",
    review: { state: "unreviewed" },
    // Caller-supplied fields win over every default above.
    ...overrides,
  };
}
@@ -0,0 +1,34 @@
1
+ import { randomBytes } from "node:crypto";
2
/**
 * Prefixed ULIDs: time-sortable, collision-safe, and self-describing
 * (`ins_01J...` is an instance wherever it appears).
 *
 * Layout: `<prefix>_` + TIME_LEN timestamp characters + RANDOM_LEN random
 * characters, all in Crockford base32.
 */
const ENCODING = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"; // Crockford base32
const TIME_LEN = 10;
const RANDOM_LEN = 16;
// Largest value TIME_LEN base32 digits can hold; anything above would lose its
// high digits and break time ordering.
const MAX_TIME = 32 ** TIME_LEN - 1;

/**
 * Encode a millisecond timestamp as TIME_LEN base32 characters, most
 * significant digit first, so lexicographic order matches chronological order.
 *
 * @param {number} now milliseconds since the epoch; fractions are floored.
 * @returns {string} the zero-padded base32 timestamp.
 * @throws {RangeError} when `now` is NaN, infinite, negative or too large to
 *   encode. Without this check such values index outside ENCODING and the id
 *   ends up containing the text "undefined".
 */
function encodeTime(now) {
  let value = Math.floor(Number(now));
  if (!Number.isFinite(value) || value < 0 || value > MAX_TIME) {
    throw new RangeError(`ulid timestamp must be between 0 and ${MAX_TIME}, got ${String(now)}`);
  }
  let out = "";
  for (let i = 0; i < TIME_LEN; i++) {
    out = ENCODING[value % 32] + out;
    value = Math.floor(value / 32);
  }
  return out;
}

/**
 * Produce RANDOM_LEN random base32 characters.
 *
 * Each character takes one random byte modulo 32; 256 is a multiple of 32, so
 * every symbol is equally likely.
 *
 * @returns {string} the random suffix.
 */
function encodeRandom() {
  return Array.from(randomBytes(RANDOM_LEN), (byte) => ENCODING[byte % 32]).join("");
}

/**
 * Generate a ULID.
 *
 * @param {number} [now] timestamp in milliseconds; defaults to the current time.
 * @returns {string} TIME_LEN + RANDOM_LEN base32 characters.
 * @throws {RangeError} when `now` cannot be encoded.
 */
export function ulid(now = Date.now()) {
  return encodeTime(now) + encodeRandom();
}

/**
 * Generate a prefixed id such as `ins_01J...`.
 *
 * @param {string} prefix record-type prefix, without the underscore.
 * @param {number} [now] timestamp in milliseconds; defaults to the current time.
 * @returns {string} `<prefix>_<ulid>`.
 * @throws {RangeError} when `now` cannot be encoded.
 */
export function newId(prefix, now) {
  return `${prefix}_${ulid(now)}`;
}

/**
 * Check that `id` starts with `<prefix>_` and has exactly the length of a
 * prefixed ULID. The characters after the prefix are not validated.
 *
 * @param {string} id candidate id.
 * @param {string} prefix expected prefix, without the underscore.
 * @returns {boolean} whether the id has the expected prefix and length.
 */
export function hasPrefix(id, prefix) {
  const expectedLength = prefix.length + 1 + TIME_LEN + RANDOM_LEN;
  return id.startsWith(`${prefix}_`) && id.length === expectedLength;
}
@@ -0,0 +1,25 @@
1
/** Record kind for each id prefix; the prefix is the part of an id before `_`. */
const KIND_BY_PREFIX = {
  ins: "instance",
  hyp: "hypothesis",
  obj: "object",
  src: "source",
  out: "outcome",
  brf: "brief",
};

/**
 * Look up the kind for an id prefix using own properties only, so inherited
 * names such as "constructor" or "toString" are never mistaken for prefixes.
 */
function kindForPrefix(prefix) {
  return Object.hasOwn(KIND_BY_PREFIX, prefix) ? KIND_BY_PREFIX[prefix] : undefined;
}

/**
 * Build an `athena://<kind>/<id>[#fragment]` reference for a prefixed id.
 *
 * @param {string} id prefixed id such as `ins_01J...`.
 * @param {string} [fragment] optional fragment; omitted when empty.
 * @returns {string} the reference URI.
 * @throws {Error} when the id's prefix is not a known record prefix.
 */
export function refTo(id, fragment) {
  const [prefix] = id.split("_");
  const kind = kindForPrefix(prefix);
  if (!kind) {
    throw new Error(`cannot build ref for unknown id prefix: ${id}`);
  }
  return `athena://${kind}/${id}${fragment ? `#${fragment}` : ""}`;
}

/**
 * Parse an `athena://` reference back into its parts.
 *
 * @param {string} ref reference produced by `refTo`.
 * @returns {{kind: string, id: string, fragment?: string}} the parsed parts;
 *   `fragment` is present only when the ref carries one.
 * @throws {Error} when the ref is malformed, names an unknown kind, or its kind
 *   disagrees with the id's prefix (e.g. `athena://instance/hyp_...`).
 */
export function parseRef(ref) {
  const match = /^athena:\/\/([a-z]+)\/([a-z]{3}_[0-9A-HJKMNP-TV-Z]+)(?:#(.+))?$/.exec(ref);
  if (!match) {
    throw new Error(`invalid athena ref: ${ref}`);
  }
  const [, kind, id, fragment] = match;
  if (!Object.values(KIND_BY_PREFIX).includes(kind)) {
    throw new Error(`invalid athena ref kind: ${ref}`);
  }
  // Ids are self-describing, so a kind that contradicts the id's prefix would
  // send the caller to the wrong kind of record.
  if (kindForPrefix(id.slice(0, id.indexOf("_"))) !== kind) {
    throw new Error(`athena ref kind does not match id prefix: ${ref}`);
  }
  return fragment === undefined ? { kind, id } : { kind, id, fragment };
}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * athena core domain types. Contract: docs/schema.md.
3
+ *
4
+ * Design rules enforced here:
5
+ * - Instances are immutable evidence; hypotheses are revisable views over evidence.
6
+ * - Everything cites: hypotheses carry instance ids, brief facts carry refs.
7
+ * - Privacy-zone fields are mandatory on capture-derived records.
8
+ * - Enums start minimal; widen only when real data forces it.
9
+ */
10
+ export {};
@@ -0,0 +1,136 @@
1
+ import { newId } from "../core/ids.js";
2
+ import { INFERENCE_SYSTEM, REPLAY_SYSTEM, inferencePrompt, replayPrompt } from "./prompts.js";
3
+ import { parseInferenceResponse, parseReplayResponse } from "./parse.js";
4
+ /**
5
+ * The hypothesis engine: instances in, evidence-cited hypotheses out.
6
+ *
7
+ * Per domain cluster:
8
+ * 1. hold out the most recent correction (the engine never sees its "after"),
9
+ * 2. one contrastive inference pass over the rest,
10
+ * 3. replay every candidate against the holdout — the validation gate.
11
+ *
12
+ * Candidates that reproduce the held-out edit become "validated"; candidates
13
+ * that fail keep "candidate" status with docked confidence. There is no
14
+ * deterministic fallback: no model, no inference.
15
+ */
16
+ const MIN_CLUSTER_SIZE = 2;
17
+ const MIN_CORRECTIONS_FOR_HOLDOUT = 3;
18
+ const STALE_AFTER_DAYS = 90;
19
+ const FAILED_REPLAY_CONFIDENCE_CAP = 0.35;
20
+ const DEFAULT_REPLAY_SAMPLES = 3;
21
/**
 * Turns observed instances into evidence-cited hypotheses using a model client
 * that exposes `generateJson({ system, prompt })`.
 */
export class LlmHypothesisEngine {
  model;
  now;
  replaySamples;

  /**
   * @param {object} model client with an async `generateJson({ system, prompt })`.
   * @param {object} [options]
   * @param {() => Date} [options.now] clock; defaults to the wall clock.
   * @param {number} [options.replaySamples] replay votes per held-out instance
   *   (at least 1); defaults to DEFAULT_REPLAY_SAMPLES.
   */
  constructor(model, options = {}) {
    this.model = model;
    this.now = options.now ?? (() => new Date());
    // A NaN sample count would make the majority loop exit without asking the
    // model at all, and an infinite one would never terminate; use the default.
    const requested = Number(options.replaySamples ?? DEFAULT_REPLAY_SAMPLES);
    this.replaySamples = Number.isFinite(requested) ? Math.max(1, requested) : DEFAULT_REPLAY_SAMPLES;
  }

  /**
   * Infer hypotheses for every domain cluster that has at least
   * MIN_CLUSTER_SIZE instances.
   *
   * @param {object[]} instances observed instances across any number of domains.
   * @returns {Promise<object[]>} hypotheses from all qualifying clusters.
   */
  async infer(instances) {
    const hypotheses = [];
    for (const [domain, cluster] of clusterByDomain(instances)) {
      if (cluster.length < MIN_CLUSTER_SIZE) {
        continue;
      }
      hypotheses.push(...(await this.inferCluster(domain, cluster)));
    }
    return hypotheses;
  }

  /**
   * Run one inference pass over a cluster's training split, then replay each
   * cited candidate against the holdout.
   *
   * A candidate that reproduces every held-out edit becomes "validated"; one
   * that reproduces none keeps "candidate" status with its confidence capped
   * at FAILED_REPLAY_CONFIDENCE_CAP.
   *
   * @param {string} domain the cluster's domain.
   * @param {object[]} cluster instances of that domain.
   * @returns {Promise<object[]>} the hypotheses for this cluster.
   */
  async inferCluster(domain, cluster) {
    const { train, holdout } = splitHoldout(cluster);
    const raw = await this.model.generateJson({
      system: INFERENCE_SYSTEM,
      prompt: inferencePrompt(domain, train),
    });
    const candidates = parseInferenceResponse(raw, train.length);
    const results = [];
    for (const candidate of candidates) {
      // Cited numbers are 1-based positions in `train`. The model may cite the
      // same instance twice, so collapse repeats into one piece of evidence.
      const supportingInstanceIds = [...new Set(candidate.supporting.map((index) => train[index - 1].id))];
      if (supportingInstanceIds.length === 0) {
        continue; // everything cites — uncited rules are dropped
      }
      const hypothesis = this.toHypothesis(domain, candidate, supportingInstanceIds, train);
      for (const heldOut of holdout) {
        const verdict = await this.replayVerdict(hypothesis, heldOut);
        hypothesis.replay.tested += 1;
        if (verdict) {
          hypothesis.replay.reproduced += 1;
        }
      }
      if (hypothesis.replay.tested > 0) {
        // Stamp the run time only when a replay actually happened, so a
        // cluster without a holdout is not recorded as having been replayed.
        hypothesis.replay.lastRunAt = this.now().toISOString();
        if (hypothesis.replay.reproduced === hypothesis.replay.tested) {
          hypothesis.status = "validated";
        } else if (hypothesis.replay.reproduced === 0) {
          hypothesis.confidence = Math.min(hypothesis.confidence, FAILED_REPLAY_CONFIDENCE_CAP);
        }
      }
      results.push(hypothesis);
    }
    return results;
  }

  /**
   * Majority vote across replay samples. Sampling stops as soon as either
   * side has reached the majority.
   *
   * @param {object} hypothesis rule under test.
   * @param {object} heldOut held-out instance to replay against.
   * @returns {Promise<boolean>} whether the majority said the rule matches.
   */
  async replayVerdict(hypothesis, heldOut) {
    const needed = Math.floor(this.replaySamples / 2) + 1;
    let matches = 0;
    let misses = 0;
    while (matches < needed && misses < needed) {
      const verdict = parseReplayResponse(
        await this.model.generateJson({ system: REPLAY_SYSTEM, prompt: replayPrompt(hypothesis, heldOut) }),
      );
      if (verdict) {
        matches += 1;
      } else {
        misses += 1;
      }
    }
    return matches >= needed;
  }

  /**
   * Convert a parsed candidate into a hypothesis record with zeroed replay
   * and validity counters, stale STALE_AFTER_DAYS after creation.
   *
   * @param {string} domain the cluster's domain.
   * @param {object} candidate parsed model output for one rule.
   * @param {string[]} supportingInstanceIds ids of the cited supporting instances.
   * @param {object[]} train training instances, used to resolve counterexample numbers.
   * @returns {object} a new "candidate" hypothesis.
   */
  toHypothesis(domain, candidate, supportingInstanceIds, train) {
    const now = this.now();
    const counterexampleInstanceIds = [...new Set(candidate.counterexamples.map((index) => train[index - 1].id))];
    return {
      id: newId("hyp", now.getTime()),
      status: "candidate",
      rule: candidate.rule,
      cues: candidate.cues,
      expectancies: candidate.expectancies,
      ...(candidate.goal !== undefined ? { goal: candidate.goal } : {}),
      domain,
      appliesWhen: candidate.appliesWhen,
      doesNotApplyWhen: candidate.doesNotApplyWhen,
      supportingInstanceIds,
      counterexampleInstanceIds,
      inferredRationale: candidate.rationale,
      confidence: candidate.confidence,
      validity: { fires: 0, upheld: 0, overridden: 0 },
      replay: { tested: 0, reproduced: 0 },
      createdAt: now.toISOString(),
      staleAfter: new Date(now.getTime() + STALE_AFTER_DAYS * 24 * 3600 * 1000).toISOString(),
      visibility: "user_private",
      review: { state: "unreviewed" },
    };
  }
}
110
/**
 * Group instances by `situation.domain` and order each group by `observedAt`
 * (ascending string comparison).
 *
 * @param {Iterable<object>} instances instances to group.
 * @returns {Map<string, object[]>} domain → instances, keyed in first-seen order.
 */
function clusterByDomain(instances) {
  const byDomain = new Map();
  for (const entry of instances) {
    const key = entry.situation.domain;
    let group = byDomain.get(key);
    if (group === undefined || group === null) {
      group = [];
    }
    group.push(entry);
    byDomain.set(key, group);
  }
  byDomain.forEach((group) => {
    group.sort((left, right) => left.observedAt.localeCompare(right.observedAt));
  });
  return byDomain;
}
122
/**
 * Split a cluster into training and holdout sets.
 *
 * The last correction in the cluster is held out so that replay tests
 * generalization rather than recall. Approvals are never held out — they are
 * boundary evidence, not edits to predict. With fewer than
 * MIN_CORRECTIONS_FOR_HOLDOUT corrections nothing is held out.
 *
 * @param {object[]} cluster instances of one domain; the caller is expected to
 *   pass them oldest first, since "last" is taken by array position.
 * @returns {{train: object[], holdout: object[]}} the split.
 */
export function splitHoldout(cluster) {
  const isCorrection = (item) => item.kind !== "approval" && item.before && item.after;
  const corrections = cluster.filter(isCorrection);
  if (corrections.length >= MIN_CORRECTIONS_FOR_HOLDOUT) {
    const latest = corrections[corrections.length - 1];
    const remaining = cluster.filter((item) => item.id !== latest.id);
    return { train: remaining, holdout: [latest] };
  }
  return { train: cluster, holdout: [] };
}
@@ -0,0 +1,76 @@
1
/**
 * Strictly parse the model's inference response.
 *
 * Model output is untrusted: the envelope and every entry are validated, cited
 * instance numbers are checked against `instanceCount`, and a malformed
 * response throws with a message naming what was wrong.
 *
 * @param {unknown} raw decoded model response; expected `{ hypotheses: [...] }`.
 * @param {number} instanceCount number of instances shown to the model.
 * @returns {object[]} one parsed candidate per entry.
 * @throws {Error} when the envelope or any entry is invalid.
 */
export function parseInferenceResponse(raw, instanceCount) {
  const isRecord = raw !== null && typeof raw === "object";
  if (!isRecord || !Array.isArray(raw.hypotheses)) {
    throw new Error(`inference response missing "hypotheses" array: ${preview(raw)}`);
  }
  const entries = raw.hypotheses;
  return entries.map((entry, position) => parseHypothesis(entry, position, instanceCount));
}
12
/**
 * Validate one entry of the inference response and normalize it into a
 * candidate.
 *
 * `rule`, `confidence` and the cited index arrays are strict (invalid values
 * throw). The descriptive string arrays are lenient and fall back to `[]`,
 * `rationale` falls back to `""`, and `goal` is kept only when it is a
 * non-empty string. Confidence is clamped into 0.05..0.95.
 *
 * @param {unknown} entry one element of `hypotheses`.
 * @param {number} position index of the entry, used in error messages.
 * @param {number} instanceCount number of instances shown to the model.
 * @returns {object} the normalized candidate.
 * @throws {Error} when the entry is not an object or a strict field is invalid.
 */
function parseHypothesis(entry, position, instanceCount) {
  if (typeof entry !== "object" || entry === null) {
    throw new Error(`hypothesis ${position} is not an object`);
  }
  const record = entry;
  const rule = requireString(record, "rule", position);
  const confidence = record.confidence;
  // NaN is a "number" and fails neither `< 0` nor `> 1`, so it has to be
  // rejected explicitly or it would pass through clamp() unchanged.
  if (typeof confidence !== "number" || !Number.isFinite(confidence) || confidence < 0 || confidence > 1) {
    throw new Error(`hypothesis ${position} ("${rule.slice(0, 40)}") has invalid confidence`);
  }
  const supporting = requireIndexArray(record, "supporting", position, instanceCount);
  return {
    rule,
    cues: stringArray(record.cues),
    expectancies: stringArray(record.expectancies),
    ...(typeof record.goal === "string" && record.goal.length > 0 ? { goal: record.goal } : {}),
    appliesWhen: stringArray(record.appliesWhen),
    doesNotApplyWhen: stringArray(record.doesNotApplyWhen),
    supporting,
    counterexamples: requireIndexArray(record, "counterexamples", position, instanceCount),
    rationale: typeof record.rationale === "string" ? record.rationale : "",
    confidence: clamp(confidence, 0.05, 0.95),
  };
}
36
/**
 * Extract the boolean verdict from a replay response.
 *
 * @param {unknown} raw decoded model response; expected `{ match: boolean }`.
 * @returns {boolean} the `match` value.
 * @throws {Error} when `raw` is not an object carrying a boolean `match`.
 */
export function parseReplayResponse(raw) {
  const verdict = raw !== null && typeof raw === "object" ? raw.match : undefined;
  if (typeof verdict !== "boolean") {
    throw new Error(`replay response missing boolean "match": ${preview(raw)}`);
  }
  return verdict;
}
42
/**
 * Read a mandatory, non-blank string field from a hypothesis entry.
 *
 * @param {object} record the hypothesis entry.
 * @param {string} key field name.
 * @param {number} position index of the entry, used in the error message.
 * @returns {string} the field value, untrimmed.
 * @throws {Error} when the field is missing, not a string, or only whitespace.
 */
function requireString(record, key, position) {
  const candidate = record[key];
  const isUsable = typeof candidate === "string" && candidate.trim().length !== 0;
  if (isUsable) {
    return candidate;
  }
  throw new Error(`hypothesis ${position} missing string "${key}"`);
}
49
/**
 * Read an optional array of 1-based instance numbers from a hypothesis entry.
 *
 * @param {object} record the hypothesis entry.
 * @param {string} key field name.
 * @param {number} position index of the entry, used in error messages.
 * @param {number} instanceCount highest valid instance number.
 * @returns {number[]} the cited numbers; `[]` when the field is absent.
 * @throws {Error} when the field is not an array of integers, or cites a number
 *   outside 1..instanceCount.
 */
function requireIndexArray(record, key, position, instanceCount) {
  const cited = record[key];
  if (cited === undefined) {
    return [];
  }
  const allIntegers = Array.isArray(cited) && cited.every((n) => Number.isInteger(n));
  if (!allIntegers) {
    throw new Error(`hypothesis ${position} has invalid "${key}" (must be instance numbers)`);
  }
  // Report the first citation that points outside the instances shown.
  const offender = cited.find((index) => index < 1 || index > instanceCount);
  if (offender !== undefined) {
    throw new Error(`hypothesis ${position} cites instance ${offender}, but only 1..${instanceCount} exist`);
  }
  return cited;
}
64
/**
 * Leniently read an array of strings: anything that is not an array made up
 * entirely of strings is treated as absent.
 *
 * @param {unknown} value candidate value.
 * @returns {string[]} `value` itself when valid, otherwise `[]`.
 */
function stringArray(value) {
  const isValid = Array.isArray(value) && value.every((item) => typeof item === "string");
  return isValid ? value : [];
}
71
/**
 * Limit `value` to the range `low..high`.
 *
 * @param {number} value number to limit.
 * @param {number} low lower bound.
 * @param {number} high upper bound.
 * @returns {number} the limited value.
 */
function clamp(value, low, high) {
  const raisedToLow = Math.max(low, value);
  return Math.min(high, raisedToLow);
}
74
/**
 * Render a short preview of a value for error messages: its JSON form cut to
 * 200 characters, or `String(raw)` when the value has no JSON form.
 *
 * @param {unknown} raw value to preview.
 * @returns {string} the preview text.
 */
function preview(raw) {
  const json = JSON.stringify(raw);
  if (json === undefined || json === null) {
    return String(raw);
  }
  return json.slice(0, 200);
}
@@ -0,0 +1,64 @@
1
+ /**
2
+ * Prompts for the hypothesis engine. The design encodes the capture doctrine
3
+ * (docs/research): behavior over self-report, bands over one-way rules,
4
+ * approvals as boundary evidence, no rules beyond evidence.
5
+ */
6
+ export const INFERENCE_SYSTEM = `You extract tacit judgment rules from observed corrections of agent-drafted work.
7
+
8
+ You receive numbered instances from ONE domain. Corrections show a draft and the human's version. Approvals show a draft the human deliberately kept unchanged.
9
+
10
+ Extract the transferable judgment rules behind these edits. Requirements:
11
+
12
+ - A rule is a transferable judgment ("in situations like S, do X because Y"), never a description of a single edit.
13
+ - If corrections move in OPPOSITE directions (some make text warmer, some less salesy; some add an ask, some soften an ask), that is ONE calibration-band rule with both limits stated — not two rules, and never a one-directional rule.
14
+ - Approvals are deliberate keep-decisions and they OUTRANK your generalizations. If an approval keeps something your rule would change (a long reply kept, formality kept, a pitch kept), work out from that approval's situation what licensed the exception, and state that trigger explicitly in doesNotApplyWhen (e.g. "the thread explicitly asked for a detailed write-up"). A rule that would re-edit an approved draft in its own training set is wrong as stated.
15
+ - Only propose a rule supported by at least 2 instances, unless a probe answer explicitly states the human said it is a general rule.
16
+ - Do not invent rules about anything that was never corrected. Fewer, better-supported rules beat many speculative ones.
17
+ - cues = what signals that a situation is this kind of situation. expectancies = what you would observe when the rule is working, and what signals it is misfiring.
18
+ - Cite supporting instances by their number. Cite counterexamples (instances that cut against the rule) by number too.
19
+ - confidence is 0..1: your honest estimate that this rule reflects a stable preference rather than coincidence.
20
+
21
+ Respond with STRICT JSON only, no prose, matching:
22
+ {"hypotheses": [{"rule": string, "cues": string[], "expectancies": string[], "goal": string?, "appliesWhen": string[], "doesNotApplyWhen": string[], "supporting": number[], "counterexamples": number[], "rationale": string, "confidence": number}]}`;
23
/**
 * Build the inference prompt: a domain header followed by every instance,
 * numbered from 1 and separated by blank lines.
 *
 * @param {string} domain domain shared by all instances.
 * @param {object[]} instances instances to show the model.
 * @returns {string} the prompt text.
 */
export function inferencePrompt(domain, instances) {
  const rendered = [];
  instances.forEach((instance, offset) => {
    rendered.push(renderInstance(instance, offset + 1));
  });
  const body = rendered.join("\n\n");
  return `Domain: ${domain}\n\n${body}`;
}
27
/**
 * Render one instance as a numbered text block for the inference prompt.
 *
 * Approvals are labelled as kept-unchanged drafts and never show a "Human's
 * version". Dismissed probe answers are left out.
 *
 * @param {object} instance instance to render.
 * @param {number} index number shown to the model.
 * @returns {string} the rendered block, one field per line.
 */
function renderInstance(instance, index) {
  const isApproval = instance.kind === "approval";
  const label = isApproval ? "approval — the human kept this draft unchanged" : instance.kind;
  const { situation, before, after, diff } = instance;

  const probeLines = [];
  for (const probe of instance.probeAnswers) {
    if (probe.dismissed) {
      continue;
    }
    probeLines.push(`Probe (${probe.probe}) "${probe.question}" → "${probe.answer}"`);
  }

  return [
    `Instance ${index} (${label}):`,
    `Situation: ${situation.summary}`,
    ...(situation.task ? [`Task: ${situation.task}`] : []),
    ...(before ? [`Draft before:\n"""\n${before.content}\n"""`] : []),
    ...(after && !isApproval ? [`Human's version:\n"""\n${after.content}\n"""`] : []),
    ...(diff ? [`Edit summary: ${diff.summary}`] : []),
    ...probeLines,
  ].join("\n");
}
47
+ export const REPLAY_SYSTEM = `You judge whether a stored judgment rule would have prevented a correction.
48
+
49
+ You see a rule, a situation, the agent's draft, and the human's actual version. Question: would an agent that knew and followed this rule have produced the human's version (or avoided the mistake) on its own? The rule must genuinely prescribe the change that was made — topical overlap is not enough.
50
+
51
+ Respond with STRICT JSON only: {"match": true} or {"match": false}.`;
52
/**
 * Build the replay prompt: the rule and its scope, a blank line, then the
 * held-out situation with the agent's draft and the human's version.
 *
 * @param {object} hypothesis rule under test (`rule`, `appliesWhen`, `doesNotApplyWhen`).
 * @param {object} instance held-out instance (`situation`, optional `before`/`after`).
 * @returns {string} the prompt text.
 */
export function replayPrompt(hypothesis, instance) {
  // Only the scope lines are optional. Filtering the whole list for empty
  // strings would also strip the blank separator between rule and situation.
  const scope = [];
  if (hypothesis.appliesWhen.length) {
    scope.push(`Applies when: ${hypothesis.appliesWhen.join("; ")}`);
  }
  if (hypothesis.doesNotApplyWhen.length) {
    scope.push(`Does not apply when: ${hypothesis.doesNotApplyWhen.join("; ")}`);
  }
  return [
    `Rule: ${hypothesis.rule}`,
    ...scope,
    "",
    `Situation: ${instance.situation.summary}`,
    `Agent's draft:\n"""\n${instance.before?.content ?? "(none)"}\n"""`,
    `Human's version:\n"""\n${instance.after?.content ?? "(unchanged)"}\n"""`,
  ].join("\n");
}
@@ -0,0 +1,123 @@
1
/**
 * Run one eval scenario: infer hypotheses from the scenario's training
 * instances and score them against its planted rules.
 *
 * @param {object} engine object with an async `infer(instances)`.
 * @param {object} scenario `{ id, train, planted, alsoAcceptable?, heldOut }`.
 * @param {object} judge object with async `ruleMatches`, `boundaryCovered`
 *   and `predictsEdit`.
 * @returns {Promise<object>} the scenario report: recall, false hypotheses,
 *   boundary coverage and replay counts.
 */
export async function runScenario(engine, scenario, judge) {
  const hypotheses = await engine.infer(scenario.train);

  // Judge every hypothesis against every planted rule once; all later steps
  // read from this map.
  const matchesByKey = new Map();
  const matchedIds = new Set();
  for (const planted of scenario.planted) {
    const hits = [];
    for (const hypothesis of hypotheses) {
      const isMatch = await judge.ruleMatches(hypothesis, planted);
      if (!isMatch) {
        continue;
      }
      hits.push(hypothesis);
      matchedIds.add(hypothesis.id);
    }
    matchesByKey.set(planted.key, hits);
  }
  const candidatesFor = (key) => matchesByKey.get(key) ?? [];

  const found = [];
  const missed = [];
  for (const planted of scenario.planted) {
    const bucket = candidatesFor(planted.key).length > 0 ? found : missed;
    bucket.push(planted.key);
  }

  // A hypothesis that matches an alsoAcceptable pattern is a real discovery:
  // it does not count toward recall, but it is not false either.
  const falseHypothesisIds = [];
  const extras = scenario.alsoAcceptable ?? [];
  for (const hypothesis of hypotheses) {
    if (matchedIds.has(hypothesis.id)) {
      continue;
    }
    let isAcceptable = false;
    for (const extra of extras) {
      if (await judge.ruleMatches(hypothesis, extra)) {
        isAcceptable = true;
        break;
      }
    }
    if (!isAcceptable) {
      falseHypothesisIds.push(hypothesis.id);
    }
  }

  const boundaries = { covered: 0, total: 0 };
  for (const planted of scenario.planted) {
    for (const boundary of planted.boundaries ?? []) {
      boundaries.total += 1;
      const verdicts = await Promise.all(
        candidatesFor(planted.key).map((h) => judge.boundaryCovered(h, boundary.mustMention)),
      );
      if (verdicts.some(Boolean)) {
        boundaries.covered += 1;
      }
    }
  }

  const replay = { applyReproduced: 0, applyTotal: 0, boundaryRespected: 0, boundaryTotal: 0 };
  for (const heldOut of scenario.heldOut) {
    const verdicts = await Promise.all(
      candidatesFor(heldOut.ruleKey).map((h) => judge.predictsEdit(h, heldOut)),
    );
    const hit = verdicts.some(Boolean);
    if (heldOut.kind === "apply") {
      replay.applyTotal += 1;
      replay.applyReproduced += hit ? 1 : 0;
    } else {
      replay.boundaryTotal += 1;
      replay.boundaryRespected += hit ? 1 : 0;
    }
  }

  return {
    scenarioId: scenario.id,
    hypothesisCount: hypotheses.length,
    recall: { found, missed },
    falseHypothesisIds,
    boundaries,
    replay,
  };
}
73
/**
 * Run every scenario in order and aggregate the per-scenario reports.
 *
 * @param {object} engine object with an async `infer(instances)`.
 * @param {object[]} scenarios scenarios to run, one after another.
 * @param {object} judge judge passed through to `runScenario`.
 * @returns {Promise<{scenarios: object[], totals: object}>} the per-scenario
 *   reports plus totals, each a fraction between 0 and 1.
 */
export async function runEval(engine, scenarios, judge) {
  const reports = [];
  for (const scenario of scenarios) {
    reports.push(await runScenario(engine, scenario, judge));
  }
  const sum = (pick) => reports.reduce((total, report) => total + pick(report), 0);

  const foundTotal = sum((r) => r.recall.found.length);
  const plantedTotal = sum((r) => r.recall.found.length + r.recall.missed.length);
  const hypothesisTotal = sum((r) => r.hypothesisCount);
  const falseTotal = sum((r) => r.falseHypothesisIds.length);
  const boundaryTotal = sum((r) => r.boundaries.total);
  const boundaryCovered = sum((r) => r.boundaries.covered);
  const applyTotal = sum((r) => r.replay.applyTotal);
  const applyReproduced = sum((r) => r.replay.applyReproduced);
  const respectTotal = sum((r) => r.replay.boundaryTotal);
  const respected = sum((r) => r.replay.boundaryRespected);

  return {
    scenarios: reports,
    totals: {
      ruleRecall: ratio(foundTotal, plantedTotal),
      // ratio() scores an empty denominator as 1, which is right for the
      // higher-is-better metrics. For this lower-is-better one, no hypotheses
      // means no false rules, so the empty case must be 0.
      falseRuleRate: hypothesisTotal === 0 ? 0 : ratio(falseTotal, hypothesisTotal),
      boundaryCoverage: ratio(boundaryCovered, boundaryTotal),
      replayReproduction: ratio(applyReproduced, applyTotal),
      boundaryRespect: ratio(respected, respectTotal),
    },
  };
}
99
/**
 * Render an eval report as text: one line per scenario, then a totals line.
 *
 * @param {{scenarios: object[], totals: object}} report result of `runEval`.
 * @returns {string} the newline-joined report.
 */
export function formatReport(report) {
  const scenarioLines = report.scenarios.map((s) => {
    const { found, missed } = s.recall;
    const flags = [];
    if (missed.length) {
      flags.push(`missed: ${missed.join(", ")}`);
    }
    if (s.falseHypothesisIds.length) {
      flags.push(`${s.falseHypothesisIds.length} false`);
    }
    const suffix = flags.length ? ` [${flags.join("; ")}]` : "";
    return (
      `${s.scenarioId.padEnd(20)} rules ${found.length}/${found.length + missed.length}` +
      ` boundaries ${s.boundaries.covered}/${s.boundaries.total}` +
      ` replay ${s.replay.applyReproduced}/${s.replay.applyTotal}` +
      ` respect ${s.replay.boundaryRespected}/${s.replay.boundaryTotal}` +
      suffix
    );
  });
  const totals = report.totals;
  const totalsLine =
    `TOTALS recall ${pct(totals.ruleRecall)} false-rule ${pct(totals.falseRuleRate)} ` +
    `boundary ${pct(totals.boundaryCoverage)} replay ${pct(totals.replayReproduction)} respect ${pct(totals.boundaryRespect)}`;
  return [...scenarioLines, totalsLine].join("\n");
}
118
/**
 * Divide `numerator` by `denominator`, scoring an empty denominator as 1.
 *
 * @param {number} numerator count of hits.
 * @param {number} denominator count of opportunities.
 * @returns {number} the quotient, or 1 when `denominator` is 0.
 */
function ratio(numerator, denominator) {
  if (denominator === 0) {
    return 1;
  }
  return numerator / denominator;
}
121
/**
 * Format a fraction as a whole-number percentage.
 *
 * @param {number} value fraction, e.g. 0.5.
 * @returns {string} the rounded percentage, e.g. "50%".
 */
function pct(value) {
  const rounded = Math.round(value * 100);
  return `${rounded}%`;
}
@@ -0,0 +1,75 @@
1
/**
 * Flatten all of a hypothesis's text into one lowercased, newline-joined
 * string for keyword matching: rule, goal, rationale, cues, expectancies and
 * both scope lists. A missing goal or rationale contributes an empty line.
 *
 * @param {object} hypothesis hypothesis to flatten.
 * @returns {string} the lowercased text.
 */
export function hypothesisText(hypothesis) {
  const { rule, goal, inferredRationale, cues, expectancies, appliesWhen, doesNotApplyWhen } = hypothesis;
  const header = [rule, goal ?? "", inferredRationale ?? ""];
  const parts = header.concat(cues, expectancies, appliesWhen, doesNotApplyWhen);
  return parts.join("\n").toLowerCase();
}
14
/**
 * Flatten only a hypothesis's scope lists (`appliesWhen` followed by
 * `doesNotApplyWhen`) into one lowercased, newline-joined string.
 *
 * @param {object} hypothesis hypothesis to flatten.
 * @returns {string} the lowercased scope text.
 */
export function boundaryText(hypothesis) {
  const scope = hypothesis.appliesWhen.concat(hypothesis.doesNotApplyWhen);
  return scope.join("\n").toLowerCase();
}
17
/**
 * Check that `text` covers every group of alternatives. A group is covered
 * when at least one of its alternatives, lowercased, occurs in `text`.
 *
 * @param {string} text text to search; callers pass it already lowercased.
 * @param {string[][]} mustMention groups of alternative phrasings.
 * @returns {boolean} true when every group is covered (also for no groups).
 */
export function mentionsAll(text, mustMention) {
  const groupHits = (group) => {
    for (const alternative of group) {
      if (text.includes(alternative.toLowerCase())) {
        return true;
      }
    }
    return false;
  };
  for (const group of mustMention) {
    if (!groupHits(group)) {
      return false;
    }
  }
  return true;
}
21
/**
 * Keyword-based judge: every verdict is a substring check of the hypothesis
 * text against required mention groups. Each method returns an
 * already-resolved promise.
 */
export class RubricJudge {
  /**
   * A hypothesis matches a planted rule when the domains are equal and its
   * full text covers the rule's `mustMention` groups.
   */
  ruleMatches(hypothesis, planted) {
    const sameDomain = hypothesis.domain === planted.domain;
    const verdict = sameDomain && mentionsAll(hypothesisText(hypothesis), planted.mustMention);
    return Promise.resolve(verdict);
  }

  /** A boundary is covered when the hypothesis's scope text covers `mustMention`. */
  boundaryCovered(hypothesis, mustMention) {
    const scopeText = boundaryText(hypothesis);
    return Promise.resolve(mentionsAll(scopeText, mustMention));
  }

  /**
   * Boundary cases are checked against the scope text only; every other case
   * is checked against the full hypothesis text.
   */
  predictsEdit(hypothesis, heldOut) {
    let haystack;
    if (heldOut.kind === "boundary") {
      haystack = boundaryText(hypothesis);
    } else {
      haystack = hypothesisText(hypothesis);
    }
    return Promise.resolve(mentionsAll(haystack, heldOut.expectMention));
  }
}
33
/**
 * Model-backed judge: each question is put to a model client through
 * `generateJson`, and the reply must be `{ "match": boolean }`.
 */
export class LlmJudge {
  model;

  /** @param {object} model client with an `id` and an async `generateJson({ system, prompt })`. */
  constructor(model) {
    this.model = model;
  }

  /** Ask whether the inferred rule expresses the same rule as `planted.statement`. */
  async ruleMatches(hypothesis, planted) {
    const system =
      "You judge whether an inferred judgment rule expresses the same latent rule as a reference rule. " +
      "Same behavioral prescription and same scope counts as a match; stylistic differences do not matter.";
    const prompt = `Reference rule: ${planted.statement}\n\nInferred rule: ${renderHypothesis(hypothesis)}`;
    return this.verdict(system, prompt);
  }

  /** Ask whether the inferred rule carries the given boundary concepts. */
  async boundaryCovered(hypothesis, mustMention) {
    const concepts = mustMention.map((g) => g.join(" / ")).join("; ");
    const system = "You judge whether an inferred judgment rule carries a given exception/boundary condition.";
    const prompt =
      `Boundary concepts (any phrasing): ${concepts}\n\n` +
      `Inferred rule: ${renderHypothesis(hypothesis)}`;
    return this.verdict(system, prompt);
  }

  /**
   * Ask whether the rule would have handled a held-out case: leaving the draft
   * alone for boundary cases, or producing the human's version otherwise.
   */
  async predictsEdit(hypothesis, heldOut) {
    const { instance } = heldOut;
    let question;
    if (heldOut.kind === "boundary") {
      question = "Would an agent following this rule correctly leave the draft alone (the rule's boundaries exclude this case)?";
    } else {
      question = "Would an agent following this rule have produced the human's version (or avoided the mistake) on its own?";
    }
    const system = `You judge whether a stored judgment rule would have prevented a correction. ${question}`;
    const prompt =
      `Situation: ${instance.situation.summary} (domain: ${instance.situation.domain})\n\n` +
      `Draft before: ${instance.before?.content ?? "(none)"}\n\n` +
      `Human's version: ${instance.after?.content ?? "(unchanged)"}\n\n` +
      `Rule: ${renderHypothesis(hypothesis)}`;
    return this.verdict(system, prompt);
  }

  /**
   * Send one question to the model and read back its boolean verdict.
   *
   * @throws {Error} when the reply is not an object with a boolean `match`.
   */
  async verdict(system, prompt) {
    const request = {
      system: `${system} Respond with JSON: {"match": true|false}.`,
      prompt,
    };
    const raw = await this.model.generateJson(request);
    const isVerdict = typeof raw === "object" && raw !== null && typeof raw.match === "boolean";
    if (!isVerdict) {
      throw new Error(`LlmJudge: invalid verdict from ${this.model.id}: ${JSON.stringify(raw)}`);
    }
    return raw.match;
  }
}
67
/**
 * Render a hypothesis for the judge prompts: the rule, then an optional
 * "Applies when" line and an optional "Does not apply when" line. Empty parts
 * are left out.
 *
 * @param {object} hypothesis hypothesis to render.
 * @returns {string} the newline-joined text.
 */
function renderHypothesis(hypothesis) {
  const lines = [];
  if (hypothesis.rule) {
    lines.push(hypothesis.rule);
  }
  if (hypothesis.appliesWhen.length) {
    lines.push(`Applies when: ${hypothesis.appliesWhen.join("; ")}`);
  }
  if (hypothesis.doesNotApplyWhen.length) {
    lines.push(`Does not apply when: ${hypothesis.doesNotApplyWhen.join("; ")}`);
  }
  return lines.join("\n");
}