@united-workforce/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. package/LICENSE +21 -0
  2. package/dist/cli.d.ts +3 -0
  3. package/dist/cli.d.ts.map +1 -0
  4. package/dist/cli.js +14 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/commands/diff.d.ts +3 -0
  7. package/dist/commands/diff.d.ts.map +1 -0
  8. package/dist/commands/diff.js +36 -0
  9. package/dist/commands/diff.js.map +1 -0
  10. package/dist/commands/format.d.ts +11 -0
  11. package/dist/commands/format.d.ts.map +1 -0
  12. package/dist/commands/format.js +114 -0
  13. package/dist/commands/format.js.map +1 -0
  14. package/dist/commands/index.d.ts +8 -0
  15. package/dist/commands/index.d.ts.map +1 -0
  16. package/dist/commands/index.js +7 -0
  17. package/dist/commands/index.js.map +1 -0
  18. package/dist/commands/list.d.ts +3 -0
  19. package/dist/commands/list.d.ts.map +1 -0
  20. package/dist/commands/list.js +35 -0
  21. package/dist/commands/list.js.map +1 -0
  22. package/dist/commands/read.d.ts +10 -0
  23. package/dist/commands/read.d.ts.map +1 -0
  24. package/dist/commands/read.js +37 -0
  25. package/dist/commands/read.js.map +1 -0
  26. package/dist/commands/report.d.ts +3 -0
  27. package/dist/commands/report.d.ts.map +1 -0
  28. package/dist/commands/report.js +30 -0
  29. package/dist/commands/report.js.map +1 -0
  30. package/dist/commands/run.d.ts +3 -0
  31. package/dist/commands/run.d.ts.map +1 -0
  32. package/dist/commands/run.js +64 -0
  33. package/dist/commands/run.js.map +1 -0
  34. package/dist/commands/types.d.ts +9 -0
  35. package/dist/commands/types.d.ts.map +1 -0
  36. package/dist/commands/types.js +2 -0
  37. package/dist/commands/types.js.map +1 -0
  38. package/dist/index.d.ts +8 -0
  39. package/dist/index.d.ts.map +1 -0
  40. package/dist/index.js +6 -0
  41. package/dist/index.js.map +1 -0
  42. package/dist/judge/builtin/frontmatter.d.ts +8 -0
  43. package/dist/judge/builtin/frontmatter.d.ts.map +1 -0
  44. package/dist/judge/builtin/frontmatter.js +75 -0
  45. package/dist/judge/builtin/frontmatter.js.map +1 -0
  46. package/dist/judge/builtin/hallucination.d.ts +10 -0
  47. package/dist/judge/builtin/hallucination.d.ts.map +1 -0
  48. package/dist/judge/builtin/hallucination.js +16 -0
  49. package/dist/judge/builtin/hallucination.js.map +1 -0
  50. package/dist/judge/builtin/index.d.ts +7 -0
  51. package/dist/judge/builtin/index.d.ts.map +1 -0
  52. package/dist/judge/builtin/index.js +6 -0
  53. package/dist/judge/builtin/index.js.map +1 -0
  54. package/dist/judge/builtin/read-steps.d.ts +4 -0
  55. package/dist/judge/builtin/read-steps.d.ts.map +1 -0
  56. package/dist/judge/builtin/read-steps.js +12 -0
  57. package/dist/judge/builtin/read-steps.js.map +1 -0
  58. package/dist/judge/builtin/token-stats.d.ts +8 -0
  59. package/dist/judge/builtin/token-stats.d.ts.map +1 -0
  60. package/dist/judge/builtin/token-stats.js +35 -0
  61. package/dist/judge/builtin/token-stats.js.map +1 -0
  62. package/dist/judge/builtin/types.d.ts +15 -0
  63. package/dist/judge/builtin/types.d.ts.map +1 -0
  64. package/dist/judge/builtin/types.js +2 -0
  65. package/dist/judge/builtin/types.js.map +1 -0
  66. package/dist/judge/builtin/upstream.d.ts +10 -0
  67. package/dist/judge/builtin/upstream.d.ts.map +1 -0
  68. package/dist/judge/builtin/upstream.js +16 -0
  69. package/dist/judge/builtin/upstream.js.map +1 -0
  70. package/dist/judge/index.d.ts +3 -0
  71. package/dist/judge/index.d.ts.map +1 -0
  72. package/dist/judge/index.js +2 -0
  73. package/dist/judge/index.js.map +1 -0
  74. package/dist/judge/types.d.ts +15 -0
  75. package/dist/judge/types.d.ts.map +1 -0
  76. package/dist/judge/types.js +2 -0
  77. package/dist/judge/types.js.map +1 -0
  78. package/dist/runner/collect.d.ts +16 -0
  79. package/dist/runner/collect.d.ts.map +1 -0
  80. package/dist/runner/collect.js +129 -0
  81. package/dist/runner/collect.js.map +1 -0
  82. package/dist/runner/execute.d.ts +9 -0
  83. package/dist/runner/execute.d.ts.map +1 -0
  84. package/dist/runner/execute.js +72 -0
  85. package/dist/runner/execute.js.map +1 -0
  86. package/dist/runner/index.d.ts +5 -0
  87. package/dist/runner/index.d.ts.map +1 -0
  88. package/dist/runner/index.js +4 -0
  89. package/dist/runner/index.js.map +1 -0
  90. package/dist/runner/prepare.d.ts +7 -0
  91. package/dist/runner/prepare.d.ts.map +1 -0
  92. package/dist/runner/prepare.js +38 -0
  93. package/dist/runner/prepare.js.map +1 -0
  94. package/dist/runner/types.d.ts +70 -0
  95. package/dist/runner/types.d.ts.map +1 -0
  96. package/dist/runner/types.js +2 -0
  97. package/dist/runner/types.js.map +1 -0
  98. package/dist/storage/index.d.ts +4 -0
  99. package/dist/storage/index.d.ts.map +1 -0
  100. package/dist/storage/index.js +3 -0
  101. package/dist/storage/index.js.map +1 -0
  102. package/dist/storage/schemas.d.ts +7 -0
  103. package/dist/storage/schemas.d.ts.map +1 -0
  104. package/dist/storage/schemas.js +118 -0
  105. package/dist/storage/schemas.js.map +1 -0
  106. package/dist/storage/store.d.ts +10 -0
  107. package/dist/storage/store.d.ts.map +1 -0
  108. package/dist/storage/store.js +36 -0
  109. package/dist/storage/store.js.map +1 -0
  110. package/dist/storage/types.d.ts +30 -0
  111. package/dist/storage/types.d.ts.map +1 -0
  112. package/dist/storage/types.js +2 -0
  113. package/dist/storage/types.js.map +1 -0
  114. package/dist/task/index.d.ts +3 -0
  115. package/dist/task/index.d.ts.map +1 -0
  116. package/dist/task/index.js +2 -0
  117. package/dist/task/index.js.map +1 -0
  118. package/dist/task/loader.d.ts +6 -0
  119. package/dist/task/loader.d.ts.map +1 -0
  120. package/dist/task/loader.js +69 -0
  121. package/dist/task/loader.js.map +1 -0
  122. package/dist/task/types.d.ts +27 -0
  123. package/dist/task/types.d.ts.map +1 -0
  124. package/dist/task/types.js +2 -0
  125. package/dist/task/types.js.map +1 -0
  126. package/package.json +45 -0
  127. package/src/cli.ts +22 -0
  128. package/src/commands/diff.ts +38 -0
  129. package/src/commands/format.ts +148 -0
  130. package/src/commands/index.ts +7 -0
  131. package/src/commands/list.ts +43 -0
  132. package/src/commands/read.ts +41 -0
  133. package/src/commands/report.ts +32 -0
  134. package/src/commands/run.ts +84 -0
  135. package/src/commands/types.ts +9 -0
  136. package/src/index.ts +34 -0
  137. package/src/judge/builtin/frontmatter.ts +95 -0
  138. package/src/judge/builtin/hallucination.ts +17 -0
  139. package/src/judge/builtin/index.ts +6 -0
  140. package/src/judge/builtin/read-steps.ts +14 -0
  141. package/src/judge/builtin/token-stats.ts +53 -0
  142. package/src/judge/builtin/types.ts +16 -0
  143. package/src/judge/builtin/upstream.ts +17 -0
  144. package/src/judge/index.ts +10 -0
  145. package/src/judge/types.ts +15 -0
  146. package/src/runner/collect.ts +172 -0
  147. package/src/runner/execute.ts +87 -0
  148. package/src/runner/index.ts +15 -0
  149. package/src/runner/prepare.ts +45 -0
  150. package/src/runner/types.ts +85 -0
  151. package/src/storage/index.ts +9 -0
  152. package/src/storage/schemas.ts +123 -0
  153. package/src/storage/store.ts +42 -0
  154. package/src/storage/types.ts +33 -0
  155. package/src/task/index.ts +2 -0
  156. package/src/task/loader.ts +74 -0
  157. package/src/task/types.ts +28 -0
@@ -0,0 +1,148 @@
1
+ import type { EvalRunPayload } from "../storage/index.js";
2
+ import type { EvalListEntry } from "./types.js";
3
+
4
+ const NAME_WIDTH = 28;
5
+ const SCORE_WIDTH = 10;
6
+ const TIMESTAMP_WIDTH = 26;
7
+
8
/**
 * Render a score or weight as a decimal string with exactly four fractional
 * digits (for example `0.5` becomes `"0.5000"`).
 */
function formatScore(value: number): string {
  const fractionDigits = 4;
  return value.toFixed(fractionDigits);
}
12
+
13
/**
 * Convert an epoch-milliseconds value into its ISO-8601 UTC string form.
 * `Date.prototype.toISOString` throws a RangeError for an invalid date.
 */
function formatTimestamp(ms: number): string {
  const instant = new Date(ms);
  return instant.toISOString();
}
17
+
18
/**
 * Lay a cell out in a column of `width` characters. Values shorter than the
 * column are right-padded with spaces; values that already fill or overflow
 * it are kept whole and followed by one space so adjacent cells never touch.
 */
function pad(value: string, width: number): string {
  if (value.length >= width) {
    return value + " ";
  }
  return value.padEnd(width);
}
22
+
23
/**
 * Directional indicator for a score delta (B relative to A).
 *
 * The direction is decided on the value *as displayed* (four fractional
 * digits), not on the raw float. Floating-point noise or a difference smaller
 * than the display precision therefore renders as `= 0.0000` rather than the
 * contradictory `▲ +0.0000` / `▼ -0.0000`.
 *
 * @param delta Raw difference between two scores.
 * @returns `▲ +x.xxxx` for an increase, `▼ -x.xxxx` for a decrease, and
 *   `= 0.0000` when the rounded difference is zero (or `delta` is NaN).
 */
function formatDelta(delta: number): string {
  const shown = formatScore(delta);
  // Re-parse the rounded text so the sign test agrees with what is printed.
  const rounded = Number(shown);
  if (rounded > 0) {
    return `▲ +${shown}`;
  }
  if (rounded < 0) {
    return `▼ ${shown}`;
  }
  return `= ${formatScore(0)}`;
}
33
+
34
/**
 * Build the plain-text report for one eval run: a header (task, overall score,
 * timestamp), the run configuration, one table row per judge (name, score,
 * weight), and finally the thread id and run hash. The result always ends with
 * a newline.
 */
export function formatReport(payload: EvalRunPayload, runHash: string): string {
  const { config } = payload;

  const judgeRows = payload.judges.map(
    (entry) =>
      ` ${pad(entry.name, NAME_WIDTH)}${pad(formatScore(entry.score), SCORE_WIDTH)}${formatScore(entry.weight)}`,
  );

  const report: string[] = [
    "=== Eval Report ===",
    `Task: ${payload.task}`,
    `Overall: ${formatScore(payload.overall)}`,
    `Timestamp: ${formatTimestamp(payload.timestamp)}`,
    "",
    "Config:",
    ` Agent: ${config.agent}`,
    ` Model: ${config.model}`,
    ` Engine: ${config.engineVersion}`,
    "",
    "Judges:",
    ` ${pad("NAME", NAME_WIDTH)}${pad("SCORE", SCORE_WIDTH)}WEIGHT`,
    ...judgeRows,
    "",
    `Thread: ${payload.threadId}`,
    `Run: ${runHash}`,
  ];

  return report.join("\n") + "\n";
}
59
+
60
/**
 * Build the plain-text comparison of two eval runs (A and B): both run hashes
 * with their task names, the overall scores with a delta, one row per config
 * field, and a judge table. The table lists A's judges first, then judges that
 * only B has. A judge missing from one run is shown as `—` and gets no delta.
 * The result always ends with a newline.
 */
export function formatDiff(
  payloadA: EvalRunPayload,
  hashA: string,
  payloadB: EvalRunPayload,
  hashB: string,
): string {
  const byNameA = new Map(payloadA.judges.map((entry) => [entry.name, entry.score]));
  const byNameB = new Map(payloadB.judges.map((entry) => [entry.name, entry.score]));

  const judgeRows = unionJudgeNames(payloadA, payloadB).map((judgeName) => {
    const left = byNameA.get(judgeName);
    const right = byNameB.get(judgeName);
    const leftCell = left === undefined ? "—" : formatScore(left);
    const rightCell = right === undefined ? "—" : formatScore(right);
    // A delta is only meaningful when the judge ran on both sides.
    const change = left === undefined || right === undefined ? "" : formatDelta(right - left);
    return ` ${pad(judgeName, NAME_WIDTH)}${pad(leftCell, SCORE_WIDTH)}${pad(rightCell, SCORE_WIDTH)}${change}`;
  });

  const configA = payloadA.config;
  const configB = payloadB.config;

  const out: string[] = [
    "=== Eval Diff ===",
    `A: ${hashA} (${payloadA.task})`,
    `B: ${hashB} (${payloadB.task})`,
    "",
    "Overall:",
    ` A=${formatScore(payloadA.overall)} B=${formatScore(payloadB.overall)} ${formatDelta(payloadB.overall - payloadA.overall)}`,
    "",
    "Config:",
    configLine("Agent", configA.agent, configB.agent),
    configLine("Model", configA.model, configB.model),
    configLine("Engine", configA.engineVersion, configB.engineVersion),
    "",
    "Judges:",
    ` ${pad("NAME", NAME_WIDTH)}${pad("A", SCORE_WIDTH)}${pad("B", SCORE_WIDTH)}DELTA`,
    ...judgeRows,
  ];

  return out.join("\n") + "\n";
}
102
+
103
/**
 * Build the plain-text table for the `list` command: a header row followed by
 * one row per entry (task, overall score, ISO timestamp, hash). When there are
 * no entries a placeholder line is printed under the header instead. The
 * result always ends with a newline.
 */
export function formatList(entries: ReadonlyArray<EvalListEntry>): string {
  const header = ` ${pad("TASK", NAME_WIDTH)}${pad("OVERALL", SCORE_WIDTH)}${pad("TIMESTAMP", TIMESTAMP_WIDTH)}HASH`;

  const body =
    entries.length === 0
      ? [" (no eval runs found)"]
      : entries.map(
          (row) =>
            ` ${pad(row.task, NAME_WIDTH)}${pad(formatScore(row.overall), SCORE_WIDTH)}${pad(formatTimestamp(row.timestamp), TIMESTAMP_WIDTH)}${row.hash}`,
        );

  return [header, ...body].join("\n") + "\n";
}
119
+
120
/**
 * Pick the entries to display. A copy of the input is ordered by descending
 * timestamp; it is then narrowed to `task` (exact match) when a task is given,
 * and truncated to the first `limit` rows when a limit is given. The input
 * array itself is never mutated.
 */
export function selectEntries(
  entries: ReadonlyArray<EvalListEntry>,
  task: string | null,
  limit: number | null,
): EvalListEntry[] {
  let picked = entries.slice();
  picked.sort((left, right) => right.timestamp - left.timestamp);

  if (task !== null) {
    picked = picked.filter((row) => row.task === task);
  }
  if (limit !== null) {
    picked = picked.slice(0, limit);
  }
  return picked;
}
130
+
131
/**
 * Collect every judge name that appears in either run, without duplicates.
 * Names keep first-seen order: all of A's judges in their order, followed by
 * judges that only B has.
 */
function unionJudgeNames(payloadA: EvalRunPayload, payloadB: EvalRunPayload): string[] {
  const inOrder = [...payloadA.judges, ...payloadB.judges].map((entry) => entry.name);
  // A Set iterates in insertion order, so de-duplicating keeps first-seen order.
  return Array.from(new Set<string>(inOrder));
}
143
+
144
/**
 * Render one row of the diff's config section: the padded `label:` cell, a
 * marker (`=` when both sides hold the same string, `≠` otherwise), and the
 * two values.
 */
function configLine(label: string, valueA: string, valueB: string): string {
  const labelCell = pad(`${label}:`, SCORE_WIDTH);
  let marker = "≠";
  if (valueA === valueB) {
    marker = "=";
  }
  return ` ${labelCell}${marker} A=${valueA} B=${valueB}`;
}
@@ -0,0 +1,7 @@
1
+ export { registerDiffCommand } from "./diff.js";
2
+ export { formatDiff, formatList, formatReport, selectEntries } from "./format.js";
3
+ export { registerListCommand } from "./list.js";
4
+ export { readEvalEntries, readEvalRun } from "./read.js";
5
+ export { registerReportCommand } from "./report.js";
6
+ export { registerRunCommand } from "./run.js";
7
+ export type { EvalListEntry } from "./types.js";
@@ -0,0 +1,43 @@
1
+ import { createLogger } from "@united-workforce/util";
2
+ import type { Command } from "commander";
3
+
4
+ import { createEvalStore } from "../storage/index.js";
5
+ import { formatList, selectEntries } from "./format.js";
6
+ import { readEvalEntries } from "./read.js";
7
+
8
+ const log = createLogger({ sink: { kind: "stderr" } });
9
+ const LOG_LIST = "L5KX9R2B";
10
+
11
/** Raw option values handed to the `list` action by commander. */
type ListCliOptions = {
  /** Value of `--task <name>`; undefined when the flag is omitted. */
  task: string | undefined;
  /** Value of `--limit <n>` as typed on the command line (defaults to "20"). */
  limit: string;
};
15
+
16
/**
 * Register the `list` subcommand, which prints a table of past eval runs.
 *
 * Options:
 * - `--task <name>`: only show runs of that task.
 * - `--limit <n>`: maximum number of rows (default 20). Must be a positive
 *   decimal integer; anything else (including `"5abc"` or `"1.9"`, which a
 *   bare `parseInt` would silently truncate) is rejected.
 *
 * On invalid input or any failure the message is written to stderr and
 * `process.exitCode` is set to 1; the action itself never throws.
 */
export function registerListCommand(program: Command): void {
  program
    .command("list")
    .description("List past eval runs")
    .option("--task <name>", "filter by task name")
    .option("--limit <n>", "max results", "20")
    .action(async (opts: ListCliOptions) => {
      // Require the whole argument to be digits: parseInt stops at the first
      // non-digit, so it alone would accept malformed values.
      const rawLimit = opts.limit.trim();
      const limit = /^\+?\d+$/.test(rawLimit) ? Number.parseInt(rawLimit, 10) : Number.NaN;
      if (!Number.isSafeInteger(limit) || limit < 1) {
        process.stderr.write("--limit must be a positive integer\n");
        process.exitCode = 1;
        return;
      }

      try {
        const evalStore = await createEvalStore();
        const entries = readEvalEntries(evalStore);
        const task = opts.task ?? null;
        const selected = selectEntries(entries, task, limit);
        log(LOG_LIST, `list task=${task ?? "*"} found=${entries.length} shown=${selected.length}`);
        process.stdout.write(formatList(selected));
      } catch (e) {
        const message = e instanceof Error ? e.message : String(e);
        process.stderr.write(`${message}\n`);
        process.exitCode = 1;
      }
    });
}
@@ -0,0 +1,41 @@
1
+ import type { EvalRunPayload, EvalStore } from "../storage/index.js";
2
+ import type { EvalListEntry } from "./types.js";
3
+
4
+ /** Variable prefix and suffix for eval run pointers (`@uwf/eval/<task>/latest`). */
5
+ const EVAL_VAR_PREFIX = "@uwf/eval/";
6
+ const EVAL_VAR_SUFFIX = "/latest";
7
+
8
/**
 * Look up one eval run in CAS by hash and return its payload, or null when the
 * store has no node for that hash. The payload is cast to `EvalRunPayload`
 * without runtime validation.
 */
export function readEvalRun(evalStore: EvalStore, hash: string): EvalRunPayload | null {
  const casNode = evalStore.store.cas.get(hash);
  return casNode === null ? null : (casNode.payload as EvalRunPayload);
}
16
+
17
/**
 * List every indexed eval run. Variables whose names start with
 * `@uwf/eval/` and end with `/latest` are treated as pointers; each pointer's
 * value is looked up in CAS and summarised. Pointers whose node is missing
 * are skipped silently. Payloads are cast without runtime validation.
 */
export function readEvalEntries(evalStore: EvalStore): EvalListEntry[] {
  const { store, varStore } = evalStore;
  const found: EvalListEntry[] = [];

  for (const pointer of varStore.list()) {
    const isEvalPointer =
      pointer.name.startsWith(EVAL_VAR_PREFIX) && pointer.name.endsWith(EVAL_VAR_SUFFIX);
    if (!isEvalPointer) {
      continue;
    }

    const casNode = store.cas.get(pointer.value);
    if (casNode !== null) {
      const run = casNode.payload as EvalRunPayload;
      found.push({
        task: run.task,
        overall: run.overall,
        timestamp: run.timestamp,
        hash: pointer.value,
      });
    }
  }

  return found;
}
@@ -0,0 +1,32 @@
1
+ import { createLogger } from "@united-workforce/util";
2
+ import type { Command } from "commander";
3
+
4
+ import { createEvalStore } from "../storage/index.js";
5
+ import { formatReport } from "./format.js";
6
+ import { readEvalRun } from "./read.js";
7
+
8
+ const log = createLogger({ sink: { kind: "stderr" } });
9
+ const LOG_REPORT = "R7QP2M4K";
10
+
11
/**
 * Register the `report <hash>` subcommand, which prints the report for one
 * stored eval run. An unknown hash, or any thrown error, writes a message to
 * stderr and sets `process.exitCode` to 1; the action itself never throws.
 */
export function registerReportCommand(program: Command): void {
  const showReport = async (hash: string): Promise<void> => {
    try {
      const payload = readEvalRun(await createEvalStore(), hash);
      if (payload !== null) {
        log(LOG_REPORT, `report task=${payload.task} hash=${hash}`);
        process.stdout.write(formatReport(payload, hash));
        return;
      }
      process.stderr.write(`eval run not found: ${hash}\n`);
      process.exitCode = 1;
    } catch (err) {
      const text = err instanceof Error ? err.message : String(err);
      process.stderr.write(`${text}\n`);
      process.exitCode = 1;
    }
  };

  program.command("report <hash>").description("Show eval run results").action(showReport);
}
@@ -0,0 +1,84 @@
1
+ import { resolve } from "node:path";
2
+
3
+ import type { Command } from "commander";
4
+ import type { RunResult } from "../runner/index.js";
5
+ import { collect, execute, getEngineVersion, prepare } from "../runner/index.js";
6
+ import type { EvalRunConfig } from "../storage/index.js";
7
+ import { createEvalStore } from "../storage/index.js";
8
+
9
/** Raw option values handed to the `run` action by commander. */
type RunCliOptions = {
  /** Value of `--agent <name>` (defaults to "hermes"). */
  agent: string;
  /** Value of `--model <model>`; undefined when the flag is omitted. */
  model: string | undefined;
  /** Value of `--count <n>` as typed on the command line (defaults to "1"). */
  count: string;
};
14
+
15
/**
 * Perform one complete eval run for a task: prepare the task, execute its
 * workflow with the chosen agent, then collect results into a freshly opened
 * eval store.
 *
 * NOTE(review): `model` is recorded in the run config but is not among the
 * arguments passed to `execute` here — confirm whether the override is meant
 * to reach the agent.
 *
 * @returns The run hash, overall score, task name (from the manifest) and
 *   per-judge results.
 */
async function runOnce(
  taskDir: string,
  agent: string,
  model: string,
  engineVersion: string,
): Promise<RunResult> {
  const prepared = await prepare(taskDir);
  const manifest = prepared.manifest;
  const workDir = prepared.workDir;

  const executed = await execute({
    workDir,
    workflow: manifest.workflow,
    prompt: manifest.prompt,
    agent,
    maxSteps: manifest.limits.maxSteps,
  });

  const evalStore = await createEvalStore();
  const config: EvalRunConfig = { agent, model, engineVersion };
  const { runHash, overall, judges } = await collect({
    evalStore,
    taskDir: prepared.taskDir,
    workDir,
    threadId: executed.threadId,
    manifest,
    config,
  });

  return { runHash, overall, task: manifest.name, judges };
}
50
+
51
/**
 * Register the `run <task>` subcommand, which evaluates a task one or more
 * times and prints the result as JSON on stdout (a single object when
 * `--count` is 1, otherwise an array in run order).
 *
 * Options:
 * - `--agent <name>`: agent adapter (default "hermes").
 * - `--model <model>`: model override; recorded as "" when omitted.
 * - `--count <n>`: number of sequential runs (default 1). Must be a positive
 *   decimal integer; anything else (including `"2x"` or `"1.5"`, which a bare
 *   `parseInt` would silently truncate) is rejected.
 *
 * On invalid input or any failure the message is written to stderr and
 * `process.exitCode` is set to 1.
 */
export function registerRunCommand(program: Command): void {
  program
    .command("run <task>")
    .description("Run eval on a task directory or tarball")
    .option("--agent <name>", "agent adapter to use", "hermes")
    .option("--model <model>", "model override")
    .option("--count <n>", "number of eval runs", "1")
    .action(async (task: string, opts: RunCliOptions) => {
      const taskDir = resolve(task);
      const agent = opts.agent;
      const model = opts.model ?? "";
      // Require the whole argument to be digits: parseInt stops at the first
      // non-digit, so it alone would accept malformed values.
      const rawCount = opts.count.trim();
      const count = /^\+?\d+$/.test(rawCount) ? Number.parseInt(rawCount, 10) : Number.NaN;
      if (!Number.isSafeInteger(count) || count < 1) {
        process.stderr.write("--count must be a positive integer\n");
        process.exitCode = 1;
        return;
      }

      const engineVersion = getEngineVersion();

      try {
        // Runs are executed one after another, not in parallel.
        const results: RunResult[] = [];
        for (let i = 0; i < count; i++) {
          results.push(await runOnce(taskDir, agent, model, engineVersion));
        }
        const output = count === 1 ? results[0] : results;
        process.stdout.write(`${JSON.stringify(output)}\n`);
      } catch (e) {
        const message = e instanceof Error ? e.message : String(e);
        process.stderr.write(`${message}\n`);
        process.exitCode = 1;
      }
    });
}
@@ -0,0 +1,9 @@
1
+ import type { CasRef } from "@united-workforce/protocol";
2
+
3
/** One row of the `list` command's output, summarising a single indexed eval run. */
export type EvalListEntry = {
  /** Task name taken from the run payload. */
  task: string;
  /** Overall score taken from the run payload. */
  overall: number;
  /** Run timestamp; rendered by the list formatter as epoch milliseconds. */
  timestamp: number;
  /** CAS reference of the run payload. */
  hash: CasRef;
};
package/src/index.ts ADDED
@@ -0,0 +1,34 @@
1
+ // Judge types
2
+ export type { JudgeInput, JudgeOutput } from "./judge/index.js";
3
+ export type {
4
+ CollectInput,
5
+ CollectResult,
6
+ ExecuteInput,
7
+ ExecuteResult,
8
+ JudgeRunner,
9
+ JudgeRunOutput,
10
+ JudgeSummary,
11
+ PrepareResult,
12
+ RunOptions,
13
+ RunResult,
14
+ } from "./runner/index.js";
15
+ // Runner (prepare → execute → collect)
16
+ export { collect, computeOverall, execute, getEngineVersion, prepare } from "./runner/index.js";
17
+ export type {
18
+ EvalJudgeRecord,
19
+ EvalRunConfig,
20
+ EvalRunPayload,
21
+ EvalStore,
22
+ } from "./storage/index.js";
23
+ // Storage schemas and types
24
+ export {
25
+ createEvalStore,
26
+ EVAL_JUDGE_FRONTMATTER_SCHEMA,
27
+ EVAL_JUDGE_HALLUCINATION_SCHEMA,
28
+ EVAL_JUDGE_TOKEN_STATS_SCHEMA,
29
+ EVAL_JUDGE_UPSTREAM_SCHEMA,
30
+ EVAL_RUN_SCHEMA,
31
+ setEvalLatest,
32
+ } from "./storage/index.js";
33
+ export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js";
34
+ export { loadTaskManifest, parseTaskManifest } from "./task/index.js";
@@ -0,0 +1,95 @@
1
+ import { createLogger } from "@united-workforce/util";
2
+ import { parse as parseYaml } from "yaml";
3
+
4
+ import { EVAL_JUDGE_FRONTMATTER_SCHEMA } from "../../storage/index.js";
5
+ import { readThreadSteps } from "./read-steps.js";
6
+ import type { BuiltinJudgeOutput } from "./types.js";
7
+
8
+ const log = createLogger({ sink: { kind: "stderr" } });
9
+
10
+ const LOG_RESULT = "F2QH7R4M";
11
+
12
+ const FENCE = "---";
13
+
14
/** Record of one step whose output failed frontmatter validation. */
type InvalidStep = {
  /** Zero-based position of the step in the list returned by `readThreadSteps`. */
  stepIndex: number;
  /** The step's `role` value. */
  role: string;
  /** Validation error messages for this step (never empty). */
  errors: string[];
};
19
+
20
/**
 * Extract the YAML frontmatter block from a step output.
 *
 * The output must be a string whose first line is exactly `---`. The block
 * ends at the first later line that consists solely of `---` (trailing
 * whitespace tolerated). A line that merely starts with the fence, such as
 * `----` or `---foo`, does not close the block. Both LF and CRLF line endings
 * are accepted; the returned YAML always uses LF.
 *
 * @param output Raw step output of unknown type.
 * @returns The text between the fences (possibly empty), or null when the
 *   output is not a string, does not open with a fence line, or has no
 *   closing fence line.
 */
function extractFrontmatterYaml(output: unknown): string | null {
  if (typeof output !== "string") {
    return null;
  }
  const lines = output.split(/\r?\n/);
  // The opening fence must be the entire first line and be followed by a newline.
  if (lines.length < 2 || lines[0] !== FENCE) {
    return null;
  }
  for (let i = 1; i < lines.length; i++) {
    if (lines[i].trimEnd() === FENCE) {
      return lines.slice(1, i).join("\n");
    }
  }
  return null;
}
39
+
40
/**
 * Check one step output's frontmatter. Checks run in order and stop at the
 * first failure: a frontmatter block must be present, it must parse as YAML,
 * the parsed value must be a mapping (not null, not an array), and the mapping
 * must carry a `$status` string that is non-empty after trimming.
 *
 * @returns An empty array when the output is valid, otherwise a one-element
 *   array describing the first failed check.
 */
function validateStepFrontmatter(output: unknown): string[] {
  const block = extractFrontmatterYaml(output);
  if (block === null) {
    return ["output does not begin with a valid '---' frontmatter block"];
  }

  let doc: unknown;
  try {
    doc = parseYaml(block);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return [`frontmatter YAML failed to parse: ${reason}`];
  }

  const isMapping = typeof doc === "object" && doc !== null && !Array.isArray(doc);
  if (!isMapping) {
    return ["frontmatter is not a YAML mapping"];
  }

  const { $status } = doc as Record<string, unknown>;
  const hasStatus = typeof $status === "string" && $status.trim() !== "";
  return hasStatus ? [] : ["$status field is missing or not a non-empty string"];
}
66
+
67
/**
 * Deterministic judge over a thread's steps: a step passes when its output
 * carries valid YAML frontmatter with a non-empty `$status`. The score is the
 * fraction of passing steps, or 0 for a thread with no steps. Failing steps
 * are reported with their index, role and error messages.
 */
export async function runFrontmatterJudge(threadId: string): Promise<BuiltinJudgeOutput> {
  const steps = readThreadSteps(threadId);

  const invalidSteps: InvalidStep[] = [];
  steps.forEach((step, stepIndex) => {
    const errors = validateStepFrontmatter(step.output);
    if (errors.length === 0) {
      return;
    }
    invalidSteps.push({ stepIndex, role: step.role, errors });
  });

  const stepsTotal = steps.length;
  const stepsValid = stepsTotal - invalidSteps.length;
  // Guard the division: an empty thread scores 0 rather than NaN.
  const score = stepsTotal === 0 ? 0 : stepsValid / stepsTotal;

  log(LOG_RESULT, `frontmatter thread=${threadId} valid=${stepsValid}/${stepsTotal}`);

  const data = { stepsTotal, stepsValid, invalidSteps };
  return { score, data, schema: EVAL_JUDGE_FRONTMATTER_SCHEMA };
}
@@ -0,0 +1,17 @@
1
+ import { EVAL_JUDGE_HALLUCINATION_SCHEMA } from "../../storage/index.js";
2
+ import type { BuiltinJudgeOutput } from "./types.js";
3
+
4
/**
 * LLM-as-judge placeholder for hallucination detection (claims in a step's
 * output that are not grounded in the available context).
 *
 * TODO: LLM-as-judge — needs provider config to call LLM API. Until that is
 * wired up, the thread id is ignored and a fixed stub result is returned:
 * score 0 with an empty `perStep` list.
 */
export async function runHallucinationJudge(_threadId: string): Promise<BuiltinJudgeOutput> {
  const placeholder: BuiltinJudgeOutput = {
    score: 0,
    data: { perStep: [] },
    schema: EVAL_JUDGE_HALLUCINATION_SCHEMA,
  };
  return placeholder;
}
@@ -0,0 +1,6 @@
1
+ export { runFrontmatterJudge } from "./frontmatter.js";
2
+ export { runHallucinationJudge } from "./hallucination.js";
3
+ export { readThreadSteps } from "./read-steps.js";
4
+ export { runTokenStatsJudge } from "./token-stats.js";
5
+ export type { BuiltinJudge, BuiltinJudgeOutput } from "./types.js";
6
+ export { runUpstreamJudge } from "./upstream.js";
@@ -0,0 +1,14 @@
1
+ import { execFileSync } from "node:child_process";
2
+
3
+ import type { StepEntry, ThreadStepsOutput } from "@united-workforce/protocol";
4
+
5
/**
 * Shell out to `uwf step list <threadId>` and return the parsed step entries,
 * excluding the leading start entry.
 *
 * @param threadId Thread whose steps should be listed.
 * @returns Every entry after the first one in the command's `steps` array.
 * @throws Error when the command fails, when its stdout is not valid JSON, or
 *   when the JSON has no `steps` array.
 */
export function readThreadSteps(threadId: string): StepEntry[] {
  // Step listings embed full agent outputs, so they can exceed Node's default
  // 1 MiB maxBuffer, which would abort the call with ENOBUFS.
  const maxBuffer = 256 * 1024 * 1024;
  const stdout = execFileSync("uwf", ["step", "list", threadId], {
    encoding: "utf8",
    stdio: ["ignore", "pipe", "pipe"],
    maxBuffer,
  }).trim();

  let parsed: unknown;
  try {
    parsed = JSON.parse(stdout);
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    throw new Error(`uwf step list ${threadId}: output is not valid JSON: ${reason}`);
  }

  const steps =
    typeof parsed === "object" && parsed !== null
      ? (parsed as { steps?: unknown }).steps
      : undefined;
  if (!Array.isArray(steps)) {
    throw new Error(`uwf step list ${threadId}: output has no "steps" array`);
  }

  // steps[0] is the StartEntry; the rest are StepEntry records.
  return steps.slice(1) as StepEntry[];
}
@@ -0,0 +1,53 @@
1
+ import { createLogger } from "@united-workforce/util";
2
+
3
+ import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
4
+ import { readThreadSteps } from "./read-steps.js";
5
+ import type { BuiltinJudgeOutput } from "./types.js";
6
+
7
+ const log = createLogger({ sink: { kind: "stderr" } });
8
+
9
+ const LOG_RESULT = "T7KQ3M9P";
10
+
11
/** Usage figures reported for one step; every number is 0 when the step has no usage record. */
type PerStepStats = {
  /** The step's `role` value. */
  role: string;
  /** Input tokens reported by the step's usage record. */
  inputTokens: number;
  /** Output tokens reported by the step's usage record. */
  outputTokens: number;
  /** Turn count reported by the step's usage record. */
  turns: number;
  /** Duration reported by the step's usage record (unit not visible here — TODO confirm). */
  duration: number;
};
18
+
19
/**
 * Informational judge: reports token usage for a thread and always scores 1.0,
 * so it never lowers a run's result. Input tokens, output tokens and turns are
 * summed across all steps; each step's own figures (including duration, which
 * is not summed) are listed in `perStep`. A step whose usage is null counts as
 * all zeros.
 */
export async function runTokenStatsJudge(threadId: string): Promise<BuiltinJudgeOutput> {
  const steps = readThreadSteps(threadId);

  let totalInput = 0;
  let totalOutput = 0;
  let totalTurns = 0;

  const perStep: PerStepStats[] = steps.map((step) => {
    const usage = step.usage;
    const row: PerStepStats =
      usage !== null
        ? {
            role: step.role,
            inputTokens: usage.inputTokens,
            outputTokens: usage.outputTokens,
            turns: usage.turns,
            duration: usage.duration,
          }
        : { role: step.role, inputTokens: 0, outputTokens: 0, turns: 0, duration: 0 };

    totalInput += row.inputTokens;
    totalOutput += row.outputTokens;
    totalTurns += row.turns;
    return row;
  });

  log(LOG_RESULT, `token-stats thread=${threadId} in=${totalInput} out=${totalOutput}`);

  const data = { totalInput, totalOutput, totalTurns, perStep };
  return { score: 1.0, data, schema: EVAL_JUDGE_TOKEN_STATS_SCHEMA };
}
@@ -0,0 +1,16 @@
1
+ import type { JSONSchema } from "@ocas/core";
2
+
3
+ /**
4
+ * Output produced by a builtin judge. Structurally identical to the runner's
5
+ * `JudgeRunOutput`; defined locally to keep the judge module free of a
6
+ * dependency on the runner module.
7
+ */
8
+ export type BuiltinJudgeOutput = {
9
+ score: number;
10
+ data: unknown;
11
+ /** Schema describing `data`, used when persisting to CAS. */
12
+ schema: JSONSchema;
13
+ };
14
+
15
+ /** A builtin judge analyzes a thread's steps and returns a scored result. */
16
+ export type BuiltinJudge = (threadId: string) => Promise<BuiltinJudgeOutput>;
@@ -0,0 +1,17 @@
1
+ import { EVAL_JUDGE_UPSTREAM_SCHEMA } from "../../storage/index.js";
2
+ import type { BuiltinJudgeOutput } from "./types.js";
3
+
4
/**
 * LLM-as-judge placeholder for upstream consumption (how well each role used
 * the relevant outputs of upstream steps).
 *
 * TODO: LLM-as-judge — needs provider config to call LLM API. Until that is
 * wired up, the thread id is ignored and a fixed stub result is returned:
 * score 0 with an empty `perStep` list.
 */
export async function runUpstreamJudge(_threadId: string): Promise<BuiltinJudgeOutput> {
  const placeholder: BuiltinJudgeOutput = {
    score: 0,
    data: { perStep: [] },
    schema: EVAL_JUDGE_UPSTREAM_SCHEMA,
  };
  return placeholder;
}
@@ -0,0 +1,10 @@
1
+ export {
2
+ type BuiltinJudge,
3
+ type BuiltinJudgeOutput,
4
+ readThreadSteps,
5
+ runFrontmatterJudge,
6
+ runHallucinationJudge,
7
+ runTokenStatsJudge,
8
+ runUpstreamJudge,
9
+ } from "./builtin/index.js";
10
+ export type { JudgeInput, JudgeOutput } from "./types.js";
@@ -0,0 +1,15 @@
1
+ /** Output shape every judge must produce on stdout (JSON). */
2
+ export type JudgeOutput<T = unknown> = {
3
+ /** Score between 0.0 and 1.0. */
4
+ score: number;
5
+ /** Judge-specific structured data, stored in CAS with its own schema. */
6
+ data: T;
7
+ };
8
+
9
+ /** Input context passed to judge scripts via argv. */
10
+ export type JudgeInput = {
11
+ /** Working directory where the task was executed. */
12
+ cwd: string;
13
+ /** Thread ID of the eval run. */
14
+ threadId: string;
15
+ };