@united-workforce/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +14 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/diff.d.ts +3 -0
- package/dist/commands/diff.d.ts.map +1 -0
- package/dist/commands/diff.js +36 -0
- package/dist/commands/diff.js.map +1 -0
- package/dist/commands/format.d.ts +11 -0
- package/dist/commands/format.d.ts.map +1 -0
- package/dist/commands/format.js +114 -0
- package/dist/commands/format.js.map +1 -0
- package/dist/commands/index.d.ts +8 -0
- package/dist/commands/index.d.ts.map +1 -0
- package/dist/commands/index.js +7 -0
- package/dist/commands/index.js.map +1 -0
- package/dist/commands/list.d.ts +3 -0
- package/dist/commands/list.d.ts.map +1 -0
- package/dist/commands/list.js +35 -0
- package/dist/commands/list.js.map +1 -0
- package/dist/commands/read.d.ts +10 -0
- package/dist/commands/read.d.ts.map +1 -0
- package/dist/commands/read.js +37 -0
- package/dist/commands/read.js.map +1 -0
- package/dist/commands/report.d.ts +3 -0
- package/dist/commands/report.d.ts.map +1 -0
- package/dist/commands/report.js +30 -0
- package/dist/commands/report.js.map +1 -0
- package/dist/commands/run.d.ts +3 -0
- package/dist/commands/run.d.ts.map +1 -0
- package/dist/commands/run.js +64 -0
- package/dist/commands/run.js.map +1 -0
- package/dist/commands/types.d.ts +9 -0
- package/dist/commands/types.d.ts.map +1 -0
- package/dist/commands/types.js +2 -0
- package/dist/commands/types.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/dist/judge/builtin/frontmatter.d.ts +8 -0
- package/dist/judge/builtin/frontmatter.d.ts.map +1 -0
- package/dist/judge/builtin/frontmatter.js +75 -0
- package/dist/judge/builtin/frontmatter.js.map +1 -0
- package/dist/judge/builtin/hallucination.d.ts +10 -0
- package/dist/judge/builtin/hallucination.d.ts.map +1 -0
- package/dist/judge/builtin/hallucination.js +16 -0
- package/dist/judge/builtin/hallucination.js.map +1 -0
- package/dist/judge/builtin/index.d.ts +7 -0
- package/dist/judge/builtin/index.d.ts.map +1 -0
- package/dist/judge/builtin/index.js +6 -0
- package/dist/judge/builtin/index.js.map +1 -0
- package/dist/judge/builtin/read-steps.d.ts +4 -0
- package/dist/judge/builtin/read-steps.d.ts.map +1 -0
- package/dist/judge/builtin/read-steps.js +12 -0
- package/dist/judge/builtin/read-steps.js.map +1 -0
- package/dist/judge/builtin/token-stats.d.ts +8 -0
- package/dist/judge/builtin/token-stats.d.ts.map +1 -0
- package/dist/judge/builtin/token-stats.js +35 -0
- package/dist/judge/builtin/token-stats.js.map +1 -0
- package/dist/judge/builtin/types.d.ts +15 -0
- package/dist/judge/builtin/types.d.ts.map +1 -0
- package/dist/judge/builtin/types.js +2 -0
- package/dist/judge/builtin/types.js.map +1 -0
- package/dist/judge/builtin/upstream.d.ts +10 -0
- package/dist/judge/builtin/upstream.d.ts.map +1 -0
- package/dist/judge/builtin/upstream.js +16 -0
- package/dist/judge/builtin/upstream.js.map +1 -0
- package/dist/judge/index.d.ts +3 -0
- package/dist/judge/index.d.ts.map +1 -0
- package/dist/judge/index.js +2 -0
- package/dist/judge/index.js.map +1 -0
- package/dist/judge/types.d.ts +15 -0
- package/dist/judge/types.d.ts.map +1 -0
- package/dist/judge/types.js +2 -0
- package/dist/judge/types.js.map +1 -0
- package/dist/runner/collect.d.ts +16 -0
- package/dist/runner/collect.d.ts.map +1 -0
- package/dist/runner/collect.js +129 -0
- package/dist/runner/collect.js.map +1 -0
- package/dist/runner/execute.d.ts +9 -0
- package/dist/runner/execute.d.ts.map +1 -0
- package/dist/runner/execute.js +72 -0
- package/dist/runner/execute.js.map +1 -0
- package/dist/runner/index.d.ts +5 -0
- package/dist/runner/index.d.ts.map +1 -0
- package/dist/runner/index.js +4 -0
- package/dist/runner/index.js.map +1 -0
- package/dist/runner/prepare.d.ts +7 -0
- package/dist/runner/prepare.d.ts.map +1 -0
- package/dist/runner/prepare.js +38 -0
- package/dist/runner/prepare.js.map +1 -0
- package/dist/runner/types.d.ts +70 -0
- package/dist/runner/types.d.ts.map +1 -0
- package/dist/runner/types.js +2 -0
- package/dist/runner/types.js.map +1 -0
- package/dist/storage/index.d.ts +4 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/dist/storage/index.js +3 -0
- package/dist/storage/index.js.map +1 -0
- package/dist/storage/schemas.d.ts +7 -0
- package/dist/storage/schemas.d.ts.map +1 -0
- package/dist/storage/schemas.js +118 -0
- package/dist/storage/schemas.js.map +1 -0
- package/dist/storage/store.d.ts +10 -0
- package/dist/storage/store.d.ts.map +1 -0
- package/dist/storage/store.js +36 -0
- package/dist/storage/store.js.map +1 -0
- package/dist/storage/types.d.ts +30 -0
- package/dist/storage/types.d.ts.map +1 -0
- package/dist/storage/types.js +2 -0
- package/dist/storage/types.js.map +1 -0
- package/dist/task/index.d.ts +3 -0
- package/dist/task/index.d.ts.map +1 -0
- package/dist/task/index.js +2 -0
- package/dist/task/index.js.map +1 -0
- package/dist/task/loader.d.ts +6 -0
- package/dist/task/loader.d.ts.map +1 -0
- package/dist/task/loader.js +69 -0
- package/dist/task/loader.js.map +1 -0
- package/dist/task/types.d.ts +27 -0
- package/dist/task/types.d.ts.map +1 -0
- package/dist/task/types.js +2 -0
- package/dist/task/types.js.map +1 -0
- package/package.json +45 -0
- package/src/cli.ts +22 -0
- package/src/commands/diff.ts +38 -0
- package/src/commands/format.ts +148 -0
- package/src/commands/index.ts +7 -0
- package/src/commands/list.ts +43 -0
- package/src/commands/read.ts +41 -0
- package/src/commands/report.ts +32 -0
- package/src/commands/run.ts +84 -0
- package/src/commands/types.ts +9 -0
- package/src/index.ts +34 -0
- package/src/judge/builtin/frontmatter.ts +95 -0
- package/src/judge/builtin/hallucination.ts +17 -0
- package/src/judge/builtin/index.ts +6 -0
- package/src/judge/builtin/read-steps.ts +14 -0
- package/src/judge/builtin/token-stats.ts +53 -0
- package/src/judge/builtin/types.ts +16 -0
- package/src/judge/builtin/upstream.ts +17 -0
- package/src/judge/index.ts +10 -0
- package/src/judge/types.ts +15 -0
- package/src/runner/collect.ts +172 -0
- package/src/runner/execute.ts +87 -0
- package/src/runner/index.ts +15 -0
- package/src/runner/prepare.ts +45 -0
- package/src/runner/types.ts +85 -0
- package/src/storage/index.ts +9 -0
- package/src/storage/schemas.ts +123 -0
- package/src/storage/store.ts +42 -0
- package/src/storage/types.ts +33 -0
- package/src/task/index.ts +2 -0
- package/src/task/loader.ts +74 -0
- package/src/task/types.ts +28 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import { execFileSync } from "node:child_process";
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
import { resolve } from "node:path";
|
|
4
|
+
|
|
5
|
+
import type { JSONSchema, Store } from "@ocas/core";
|
|
6
|
+
import { putSchema } from "@ocas/core";
|
|
7
|
+
import type { CasRef } from "@united-workforce/protocol";
|
|
8
|
+
import { createLogger } from "@united-workforce/util";
|
|
9
|
+
|
|
10
|
+
import type { JudgeOutput } from "../judge/index.js";
|
|
11
|
+
import {
|
|
12
|
+
runFrontmatterJudge,
|
|
13
|
+
runHallucinationJudge,
|
|
14
|
+
runTokenStatsJudge,
|
|
15
|
+
runUpstreamJudge,
|
|
16
|
+
} from "../judge/index.js";
|
|
17
|
+
import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js";
|
|
18
|
+
import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js";
|
|
19
|
+
import type { JudgeEntry } from "../task/index.js";
|
|
20
|
+
import type {
|
|
21
|
+
CollectInput,
|
|
22
|
+
CollectResult,
|
|
23
|
+
JudgeRunner,
|
|
24
|
+
JudgeRunOutput,
|
|
25
|
+
JudgeSummary,
|
|
26
|
+
} from "./types.js";
|
|
27
|
+
|
|
28
|
+
const log = createLogger({ sink: { kind: "stderr" } });
|
|
29
|
+
|
|
30
|
+
const LOG_JUDGE = "CT6N3P2K";
|
|
31
|
+
const LOG_STORED = "CT9V2Q7M";
|
|
32
|
+
|
|
33
|
+
/** Permissive schema for judge data without a dedicated schema (e.g. builtin placeholders). */
|
|
34
|
+
const GENERIC_DATA_SCHEMA: JSONSchema = { type: "object" };
|
|
35
|
+
|
|
36
|
+
/**
 * Compute the weighted overall score as `sum(score * weight) / sum(weight)`.
 *
 * Judges with weight 0 are informational and are skipped entirely, so they
 * affect neither the numerator nor the denominator.
 *
 * @param judges - Per-judge score and weight pairs.
 * @returns The weighted mean, or 0 when the total weight is not positive.
 */
export function computeOverall(judges: ReadonlyArray<{ score: number; weight: number }>): number {
  let totalWeight = 0;
  let weighted = 0;
  for (const judge of judges) {
    // Skip rather than multiply by zero: `NaN * 0` and `Infinity * 0` are both
    // NaN, which would let an informational judge poison the overall score.
    if (judge.weight === 0) {
      continue;
    }
    totalWeight += judge.weight;
    weighted += judge.score * judge.weight;
  }
  return totalWeight > 0 ? weighted / totalWeight : 0;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Run a task-provided judge script: `node <entry> <cwd> <threadId>`.
 *
 * The script is spawned synchronously. The last line of its trimmed stdout is
 * parsed as JSON and must be an object carrying a numeric `score`; its `data`
 * field is passed through unchanged.
 *
 * @param taskDir - Directory that `judge.entry` and `judge.schema` are resolved against.
 * @param workDir - Passed to the script as its first argument.
 * @param threadId - Passed to the script as its second argument.
 * @param judge - Manifest entry describing the judge.
 * @returns The score, the raw data, and the schema for `data` (the judge's own
 *   schema file when declared, otherwise the permissive generic schema).
 * @throws Error when the judge has no entry, the script fails, stdout's last
 *   line is not valid JSON, or the output lacks a numeric score.
 */
async function runTaskJudge(
  taskDir: string,
  workDir: string,
  threadId: string,
  judge: JudgeEntry,
): Promise<JudgeRunOutput> {
  if (judge.entry === null) {
    throw new Error(`judge "${judge.name}" is not builtin but has no entry`);
  }
  const entryPath = resolve(taskDir, judge.entry);

  let stdout: string;
  try {
    stdout = execFileSync("node", [entryPath, workDir, threadId], {
      encoding: "utf8",
      stdio: ["ignore", "pipe", "pipe"],
      maxBuffer: 50 * 1024 * 1024,
    });
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e);
    throw new Error(`judge "${judge.name}" failed: ${message}`);
  }

  // Only the final line is parsed, so anything printed before it is ignored.
  const line = stdout.trim().split("\n").pop()?.trim() ?? "";
  let parsed: unknown;
  try {
    parsed = JSON.parse(line);
  } catch {
    throw new Error(`judge "${judge.name}" stdout is not valid JSON: ${line || "(empty)"}`);
  }
  // `JSON.parse("null")` succeeds; without this guard the property access
  // below would throw a bare TypeError instead of a judge-specific error.
  if (parsed === null || typeof parsed !== "object") {
    throw new Error(`judge "${judge.name}" output missing numeric score`);
  }
  const output = parsed as JudgeOutput;
  if (typeof output.score !== "number") {
    throw new Error(`judge "${judge.name}" output missing numeric score`);
  }

  const schema =
    judge.schema !== null ? await loadSchema(resolve(taskDir, judge.schema)) : GENERIC_DATA_SCHEMA;
  return { score: output.score, data: output.data, schema };
}
|
|
91
|
+
|
|
92
|
+
/**
 * Load and parse an OCAS JSON Schema file.
 *
 * @param path - Path of the schema file to read as UTF-8.
 * @returns The parsed file content, typed as a JSON Schema (not validated).
 * @throws Error naming the file when its content is not valid JSON; read
 *   failures from `readFile` propagate unchanged.
 */
async function loadSchema(path: string): Promise<JSONSchema> {
  const text = await readFile(path, "utf8");
  try {
    return JSON.parse(text) as JSONSchema;
  } catch (e) {
    // A bare SyntaxError does not say which file was malformed.
    const message = e instanceof Error ? e.message : String(e);
    throw new Error(`schema file "${path}" is not valid JSON: ${message}`);
  }
}
|
|
97
|
+
|
|
98
|
+
/**
 * Route a builtin judge name to its implementation.
 *
 * @param name - Builtin judge identifier from the task manifest.
 * @param threadId - Thread whose output the judge evaluates.
 * @returns The selected judge's output.
 * @throws Error when `name` is not one of the known builtin judges.
 */
async function runBuiltinJudge(name: string, threadId: string): Promise<JudgeRunOutput> {
  if (name === "frontmatter-compliance") {
    return runFrontmatterJudge(threadId);
  }
  if (name === "upstream-consumption") {
    return runUpstreamJudge(threadId);
  }
  if (name === "hallucination") {
    return runHallucinationJudge(threadId);
  }
  if (name === "token-stats") {
    return runTokenStatsJudge(threadId);
  }
  throw new Error(`unknown builtin judge "${name}"`);
}
|
|
113
|
+
|
|
114
|
+
/**
 * Judge runner used when the caller supplies none: entries flagged `builtin`
 * are dispatched by name, all others run their entry script.
 */
const defaultJudgeRunner: JudgeRunner = async (taskDir, workDir, threadId, judge) =>
  judge.builtin
    ? runBuiltinJudge(judge.name, threadId)
    : runTaskJudge(taskDir, workDir, threadId, judge);
|
|
124
|
+
|
|
125
|
+
/**
 * Write a judge's data into CAS under the given schema.
 *
 * @param store - OCAS store that receives both the schema and the data.
 * @param schema - Schema describing `data`; registered via `putSchema` first.
 * @param data - Judge data to persist.
 * @returns The value returned by `store.cas.put`, typed as a CAS reference.
 */
async function storeJudgeData(store: Store, schema: JSONSchema, data: unknown): Promise<CasRef> {
  const schemaRef = await putSchema(store, schema);
  const stored = await store.cas.put(schemaRef, data);
  return stored as CasRef;
}
|
|
130
|
+
|
|
131
|
+
/**
 * Collect phase of an eval run.
 *
 * Runs every judge in the manifest one after another, stores each judge's data
 * in CAS, writes the overall eval-run record, and points the task's latest-run
 * variable at it.
 *
 * @param input - Stores, directories, thread id, manifest and config snapshot.
 * @param runJudge - Judge execution strategy; defaults to the builtin/script runner.
 * @returns The run record's hash, the weighted overall score, and per-judge summaries.
 */
export async function collect(
  input: CollectInput,
  runJudge: JudgeRunner = defaultJudgeRunner,
): Promise<CollectResult> {
  const { evalStore, taskDir, workDir, threadId, manifest, config } = input;
  const { store, varStore } = evalStore;

  // Sequential on purpose: each judge is run, stored and logged before the next.
  const records: EvalJudgeRecord[] = [];
  for (const judge of manifest.judges) {
    const { score, data, schema } = await runJudge(taskDir, workDir, threadId, judge);
    const dataHash = await storeJudgeData(store, schema, data);
    records.push({ name: judge.name, score, weight: judge.weight, dataHash });
    log(LOG_JUDGE, `judge=${judge.name} score=${score} weight=${judge.weight}`);
  }

  const overall = computeOverall(records);
  const payload: EvalRunPayload = {
    task: manifest.name,
    config,
    threadId,
    judges: records,
    overall,
    timestamp: Date.now(),
  };

  const runSchemaHash = await putSchema(store, EVAL_RUN_SCHEMA);
  const runHash = (await store.cas.put(runSchemaHash, payload)) as string;
  setEvalLatest(varStore, manifest.name, runHash);
  log(LOG_STORED, `stored eval-run task=${manifest.name} hash=${runHash} overall=${overall}`);

  // The summary drops `dataHash`; callers only get name, score and weight.
  const judges: JudgeSummary[] = records.map(({ name, score, weight }) => ({
    name,
    score,
    weight,
  }));
  return { runHash, overall, judges };
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { execFileSync } from "node:child_process";
|
|
2
|
+
|
|
3
|
+
import { createLogger } from "@united-workforce/util";
|
|
4
|
+
|
|
5
|
+
import type { ExecuteInput, ExecuteResult } from "./types.js";
|
|
6
|
+
|
|
7
|
+
const log = createLogger({ sink: { kind: "stderr" } });
|
|
8
|
+
|
|
9
|
+
const LOG_START = "EX5M2T9V";
|
|
10
|
+
const LOG_EXEC = "EX7Q4K2N";
|
|
11
|
+
|
|
12
|
+
/**
 * Name or path of the uwf CLI executable.
 *
 * @returns The `UWF_BIN` environment variable when set and non-empty,
 *   otherwise the plain command name `uwf`.
 */
function uwfBin(): string {
  const fromEnv = process.env.UWF_BIN;
  if (fromEnv === undefined || fromEnv === "") {
    return "uwf";
  }
  return fromEnv;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Run a uwf subcommand synchronously and return its trimmed stdout.
 *
 * @param args - Arguments passed to the uwf binary; the first two are used to
 *   label the error message.
 * @param cwd - Working directory for the child process.
 * @returns Trimmed stdout of the command.
 * @throws Error of the form `uwf <args…> failed: <detail>`. The detail is the
 *   child's stderr when non-empty, otherwise the underlying error message.
 */
function runUwf(args: string[], cwd: string): string {
  try {
    return execFileSync(uwfBin(), args, {
      encoding: "utf8",
      stdio: ["ignore", "pipe", "pipe"],
      maxBuffer: 50 * 1024 * 1024,
      cwd,
    }).trim();
  } catch (e) {
    const err = e as NodeJS.ErrnoException & { stderr?: Buffer | string | null };
    const stderr =
      err.stderr == null
        ? ""
        : typeof err.stderr === "string"
          ? err.stderr
          : err.stderr.toString("utf8");
    // A spawn failure (e.g. the binary is missing) produces no stderr; fall
    // back to the error's own message so the cause is not lost.
    const fallback = e instanceof Error ? e.message.trim() : "";
    const reason = stderr.trim() !== "" ? stderr.trim() : fallback;
    const detail = reason !== "" ? `: ${reason}` : "";
    // Label with at most the first two args; avoids printing "undefined"
    // when fewer than two were supplied.
    const command = args.slice(0, 2).join(" ");
    throw new Error(`uwf ${command} failed${detail}`);
  }
}
|
|
39
|
+
|
|
40
|
+
/**
 * Parse the thread ID from `uwf thread start` JSON output (`{ workflow, thread }`).
 *
 * @param stdout - Raw stdout text of the command.
 * @returns The non-empty `thread` string.
 * @throws Error when `stdout` is not valid JSON, or when the parsed value is
 *   not an object with a non-empty string `thread` property.
 */
function parseThreadId(stdout: string): string {
  let parsed: unknown;
  try {
    parsed = JSON.parse(stdout);
  } catch {
    throw new Error(`uwf thread start did not emit valid JSON: ${stdout || "(empty)"}`);
  }
  // `JSON.parse` also yields null and primitives; reading `.thread` off null
  // would throw a TypeError, so only objects are inspected.
  const thread =
    typeof parsed === "object" && parsed !== null
      ? (parsed as Record<string, unknown>).thread
      : undefined;
  if (typeof thread !== "string" || thread === "") {
    throw new Error(`uwf thread start output missing thread id: ${stdout}`);
  }
  return thread;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Execute phase: start a thread for the workflow, then run it through the uwf
 * CLI (as a child process) with `maxSteps` passed via `-c`.
 *
 * @param input - Working directory, workflow, prompt, agent and step limit.
 * @returns The id of the thread that was started and executed.
 * @throws Error when either uwf invocation fails or the start output carries
 *   no thread id.
 */
export async function execute(input: ExecuteInput): Promise<ExecuteResult> {
  const { workDir, workflow, prompt, agent, maxSteps } = input;

  const startArgs = ["thread", "start", workflow, "-p", prompt, "--cwd", workDir];
  const threadId = parseThreadId(runUwf(startArgs, workDir));
  log(LOG_START, `thread started thread=${threadId} workflow=${workflow}`);

  const execArgs = ["thread", "exec", threadId, "--agent", agent, "-c", String(maxSteps)];
  runUwf(execArgs, workDir);
  log(LOG_EXEC, `thread executed thread=${threadId} maxSteps=${maxSteps}`);

  return { threadId };
}
|
|
76
|
+
|
|
77
|
+
/**
 * Ask the uwf binary for its version via `uwf -V`.
 *
 * @returns Trimmed stdout of the command, or `"unknown"` when it cannot be run
 *   or exits with an error. Never throws.
 */
export function getEngineVersion(): string {
  let version: string;
  try {
    const raw = execFileSync(uwfBin(), ["-V"], {
      encoding: "utf8",
      // stderr is discarded: a failure is reported only as "unknown".
      stdio: ["ignore", "pipe", "ignore"],
    });
    version = raw.trim();
  } catch {
    version = "unknown";
  }
  return version;
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
export { collect, computeOverall } from "./collect.js";
|
|
2
|
+
export { execute, getEngineVersion } from "./execute.js";
|
|
3
|
+
export { prepare } from "./prepare.js";
|
|
4
|
+
export type {
|
|
5
|
+
CollectInput,
|
|
6
|
+
CollectResult,
|
|
7
|
+
ExecuteInput,
|
|
8
|
+
ExecuteResult,
|
|
9
|
+
JudgeRunner,
|
|
10
|
+
JudgeRunOutput,
|
|
11
|
+
JudgeSummary,
|
|
12
|
+
PrepareResult,
|
|
13
|
+
RunOptions,
|
|
14
|
+
RunResult,
|
|
15
|
+
} from "./types.js";
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { access, cp, mkdir, mkdtemp } from "node:fs/promises";
|
|
2
|
+
import { tmpdir } from "node:os";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
|
|
5
|
+
import { createLogger } from "@united-workforce/util";
|
|
6
|
+
|
|
7
|
+
import { loadTaskManifest } from "../task/index.js";
|
|
8
|
+
import type { PrepareResult } from "./types.js";
|
|
9
|
+
|
|
10
|
+
const log = createLogger({ sink: { kind: "stderr" } });
|
|
11
|
+
|
|
12
|
+
const LOG_PREPARE = "PRE4K2NQ";
|
|
13
|
+
const LOG_FIXTURE = "PRE7M3VX";
|
|
14
|
+
|
|
15
|
+
/**
 * Report whether `path` is accessible.
 *
 * @param path - Filesystem path to probe with `access`.
 * @returns `true` when `access` succeeds, `false` when it rejects for any reason.
 */
async function pathExists(path: string): Promise<boolean> {
  return access(path).then(
    () => true,
    () => false,
  );
}
|
|
24
|
+
|
|
25
|
+
/**
 * Prepare phase: load the task manifest and set up a fresh temp working
 * directory, seeded from `<taskDir>/fixture` when that path exists.
 *
 * @param taskDir - Directory containing `task.yaml` and, optionally, `fixture/`.
 * @returns The task directory, the new working directory, and the parsed manifest.
 */
export async function prepare(taskDir: string): Promise<PrepareResult> {
  const manifest = await loadTaskManifest(taskDir);
  log(LOG_PREPARE, `loaded task manifest name=${manifest.name} workflow=${manifest.workflow}`);

  const workDir = await mkdtemp(join(tmpdir(), "uwf-eval-"));
  const fixtureDir = join(taskDir, "fixture");
  const hasFixture = await pathExists(fixtureDir);

  if (!hasFixture) {
    await mkdir(workDir, { recursive: true });
    log(LOG_FIXTURE, `no fixture/ found, using empty workDir=${workDir}`);
    return { taskDir, workDir, manifest };
  }

  await cp(fixtureDir, workDir, { recursive: true });
  log(LOG_FIXTURE, `copied fixture into workDir=${workDir}`);
  return { taskDir, workDir, manifest };
}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import type { JSONSchema } from "@ocas/core";
|
|
2
|
+
|
|
3
|
+
import type { EvalRunConfig, EvalStore } from "../storage/index.js";
|
|
4
|
+
import type { JudgeEntry, TaskManifest } from "../task/index.js";
|
|
5
|
+
|
|
6
|
+
/** Result of the prepare phase: task dir, temp working dir, parsed manifest. */
|
|
7
|
+
export type PrepareResult = {
|
|
8
|
+
taskDir: string;
|
|
9
|
+
workDir: string;
|
|
10
|
+
manifest: TaskManifest;
|
|
11
|
+
};
|
|
12
|
+
|
|
13
|
+
/** Input to the execute phase. */
|
|
14
|
+
export type ExecuteInput = {
|
|
15
|
+
/** Working directory the workflow runs in (the prepared temp dir). */
|
|
16
|
+
workDir: string;
|
|
17
|
+
/** Workflow name or path (from task.yaml). */
|
|
18
|
+
workflow: string;
|
|
19
|
+
/** Initial prompt for the thread. */
|
|
20
|
+
prompt: string;
|
|
21
|
+
/** Agent adapter to use. */
|
|
22
|
+
agent: string;
|
|
23
|
+
/** Maximum number of steps to execute. */
|
|
24
|
+
maxSteps: number;
|
|
25
|
+
};
|
|
26
|
+
|
|
27
|
+
/** Result of the execute phase. */
|
|
28
|
+
export type ExecuteResult = {
|
|
29
|
+
threadId: string;
|
|
30
|
+
};
|
|
31
|
+
|
|
32
|
+
/** Output produced by running a single judge. */
|
|
33
|
+
export type JudgeRunOutput = {
|
|
34
|
+
score: number;
|
|
35
|
+
data: unknown;
|
|
36
|
+
/** Schema describing `data`, used when persisting to CAS. */
|
|
37
|
+
schema: JSONSchema;
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
/** Pluggable judge execution strategy (injectable for testing). */
|
|
41
|
+
export type JudgeRunner = (
|
|
42
|
+
taskDir: string,
|
|
43
|
+
workDir: string,
|
|
44
|
+
threadId: string,
|
|
45
|
+
judge: JudgeEntry,
|
|
46
|
+
) => Promise<JudgeRunOutput>;
|
|
47
|
+
|
|
48
|
+
/** Input to the collect phase. */
|
|
49
|
+
export type CollectInput = {
|
|
50
|
+
evalStore: EvalStore;
|
|
51
|
+
taskDir: string;
|
|
52
|
+
workDir: string;
|
|
53
|
+
threadId: string;
|
|
54
|
+
manifest: TaskManifest;
|
|
55
|
+
config: EvalRunConfig;
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
/** A single judge's summarized result in the run output. */
|
|
59
|
+
export type JudgeSummary = {
|
|
60
|
+
name: string;
|
|
61
|
+
score: number;
|
|
62
|
+
weight: number;
|
|
63
|
+
};
|
|
64
|
+
|
|
65
|
+
/** Result of the collect phase. */
|
|
66
|
+
export type CollectResult = {
|
|
67
|
+
runHash: string;
|
|
68
|
+
overall: number;
|
|
69
|
+
judges: JudgeSummary[];
|
|
70
|
+
};
|
|
71
|
+
|
|
72
|
+
/** Options for a full eval run (from CLI flags). */
|
|
73
|
+
export type RunOptions = {
|
|
74
|
+
agent: string;
|
|
75
|
+
model: string;
|
|
76
|
+
count: number;
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
/** Final result of a full eval run. */
|
|
80
|
+
export type RunResult = {
|
|
81
|
+
runHash: string;
|
|
82
|
+
overall: number;
|
|
83
|
+
task: string;
|
|
84
|
+
judges: JudgeSummary[];
|
|
85
|
+
};
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
export {
|
|
2
|
+
EVAL_JUDGE_FRONTMATTER_SCHEMA,
|
|
3
|
+
EVAL_JUDGE_HALLUCINATION_SCHEMA,
|
|
4
|
+
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
|
|
5
|
+
EVAL_JUDGE_UPSTREAM_SCHEMA,
|
|
6
|
+
EVAL_RUN_SCHEMA,
|
|
7
|
+
} from "./schemas.js";
|
|
8
|
+
export { createEvalStore, setEvalLatest } from "./store.js";
|
|
9
|
+
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload, EvalStore } from "./types.js";
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import type { JSONSchema } from "@ocas/core";
|
|
2
|
+
|
|
3
|
+
export const EVAL_RUN_SCHEMA: JSONSchema = {
|
|
4
|
+
title: "@uwf/eval-run",
|
|
5
|
+
type: "object",
|
|
6
|
+
required: ["task", "config", "threadId", "judges", "overall", "timestamp"],
|
|
7
|
+
properties: {
|
|
8
|
+
task: { type: "string" },
|
|
9
|
+
config: {
|
|
10
|
+
type: "object",
|
|
11
|
+
required: ["agent", "model", "engineVersion"],
|
|
12
|
+
properties: {
|
|
13
|
+
agent: { type: "string" },
|
|
14
|
+
model: { type: "string" },
|
|
15
|
+
engineVersion: { type: "string" },
|
|
16
|
+
},
|
|
17
|
+
},
|
|
18
|
+
threadId: { type: "string" },
|
|
19
|
+
judges: {
|
|
20
|
+
type: "array",
|
|
21
|
+
items: {
|
|
22
|
+
type: "object",
|
|
23
|
+
required: ["name", "score", "weight", "dataHash"],
|
|
24
|
+
properties: {
|
|
25
|
+
name: { type: "string" },
|
|
26
|
+
score: { type: "number" },
|
|
27
|
+
weight: { type: "number" },
|
|
28
|
+
dataHash: { type: "string" },
|
|
29
|
+
},
|
|
30
|
+
},
|
|
31
|
+
},
|
|
32
|
+
overall: { type: "number" },
|
|
33
|
+
timestamp: { type: "integer" },
|
|
34
|
+
},
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
export const EVAL_JUDGE_FRONTMATTER_SCHEMA: JSONSchema = {
|
|
38
|
+
title: "@uwf/eval-judge-frontmatter",
|
|
39
|
+
type: "object",
|
|
40
|
+
required: ["stepsTotal", "stepsValid", "invalidSteps"],
|
|
41
|
+
properties: {
|
|
42
|
+
stepsTotal: { type: "integer" },
|
|
43
|
+
stepsValid: { type: "integer" },
|
|
44
|
+
invalidSteps: {
|
|
45
|
+
type: "array",
|
|
46
|
+
items: {
|
|
47
|
+
type: "object",
|
|
48
|
+
required: ["stepIndex", "role", "errors"],
|
|
49
|
+
properties: {
|
|
50
|
+
stepIndex: { type: "integer" },
|
|
51
|
+
role: { type: "string" },
|
|
52
|
+
errors: { type: "array", items: { type: "string" } },
|
|
53
|
+
},
|
|
54
|
+
},
|
|
55
|
+
},
|
|
56
|
+
},
|
|
57
|
+
};
|
|
58
|
+
|
|
59
|
+
export const EVAL_JUDGE_UPSTREAM_SCHEMA: JSONSchema = {
|
|
60
|
+
title: "@uwf/eval-judge-upstream",
|
|
61
|
+
type: "object",
|
|
62
|
+
required: ["perStep"],
|
|
63
|
+
properties: {
|
|
64
|
+
perStep: {
|
|
65
|
+
type: "array",
|
|
66
|
+
items: {
|
|
67
|
+
type: "object",
|
|
68
|
+
required: ["role", "consumed", "missed", "score"],
|
|
69
|
+
properties: {
|
|
70
|
+
role: { type: "string" },
|
|
71
|
+
consumed: { type: "array", items: { type: "string" } },
|
|
72
|
+
missed: { type: "array", items: { type: "string" } },
|
|
73
|
+
score: { type: "number" },
|
|
74
|
+
},
|
|
75
|
+
},
|
|
76
|
+
},
|
|
77
|
+
},
|
|
78
|
+
};
|
|
79
|
+
|
|
80
|
+
export const EVAL_JUDGE_HALLUCINATION_SCHEMA: JSONSchema = {
|
|
81
|
+
title: "@uwf/eval-judge-hallucination",
|
|
82
|
+
type: "object",
|
|
83
|
+
required: ["perStep"],
|
|
84
|
+
properties: {
|
|
85
|
+
perStep: {
|
|
86
|
+
type: "array",
|
|
87
|
+
items: {
|
|
88
|
+
type: "object",
|
|
89
|
+
required: ["role", "hallucinations", "score"],
|
|
90
|
+
properties: {
|
|
91
|
+
role: { type: "string" },
|
|
92
|
+
hallucinations: { type: "array", items: { type: "string" } },
|
|
93
|
+
score: { type: "number" },
|
|
94
|
+
},
|
|
95
|
+
},
|
|
96
|
+
},
|
|
97
|
+
},
|
|
98
|
+
};
|
|
99
|
+
|
|
100
|
+
export const EVAL_JUDGE_TOKEN_STATS_SCHEMA: JSONSchema = {
|
|
101
|
+
title: "@uwf/eval-judge-token-stats",
|
|
102
|
+
type: "object",
|
|
103
|
+
required: ["totalInput", "totalOutput", "totalTurns", "perStep"],
|
|
104
|
+
properties: {
|
|
105
|
+
totalInput: { type: "integer" },
|
|
106
|
+
totalOutput: { type: "integer" },
|
|
107
|
+
totalTurns: { type: "integer" },
|
|
108
|
+
perStep: {
|
|
109
|
+
type: "array",
|
|
110
|
+
items: {
|
|
111
|
+
type: "object",
|
|
112
|
+
required: ["role", "inputTokens", "outputTokens", "turns", "duration"],
|
|
113
|
+
properties: {
|
|
114
|
+
role: { type: "string" },
|
|
115
|
+
inputTokens: { type: "integer" },
|
|
116
|
+
outputTokens: { type: "integer" },
|
|
117
|
+
turns: { type: "integer" },
|
|
118
|
+
duration: { type: "number" },
|
|
119
|
+
},
|
|
120
|
+
},
|
|
121
|
+
},
|
|
122
|
+
},
|
|
123
|
+
};
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { mkdir } from "node:fs/promises";
|
|
2
|
+
import { homedir } from "node:os";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import type { VarStore } from "@ocas/core";
|
|
5
|
+
import { bootstrap, type Store } from "@ocas/core";
|
|
6
|
+
import { createFsStore, createSqliteVarStore } from "@ocas/fs";
|
|
7
|
+
|
|
8
|
+
import type { EvalStore } from "./types.js";
|
|
9
|
+
|
|
10
|
+
/** Variable name prefix for eval run pointers (`@uwf/eval/<task>/latest`). */
|
|
11
|
+
const EVAL_VAR_PREFIX = "@uwf/eval/";
|
|
12
|
+
|
|
13
|
+
/**
 * Locate the global CAS directory.
 *
 * @returns The `OCAS_HOME` environment variable when set and non-empty,
 *   otherwise `.ocas` inside the current user's home directory.
 */
function getGlobalCasDir(): string {
  const fromEnv = process.env.OCAS_HOME;
  return fromEnv === undefined || fromEnv === "" ? join(homedir(), ".ocas") : fromEnv;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Open the filesystem-backed OCAS store used for eval persistence.
 *
 * Creates the global CAS directory if needed, builds the CAS store plus a
 * SQLite-backed variable/tag store under `<casDir>/vars`, and bootstraps the
 * combined store.
 *
 * @returns The combined store together with its variable store.
 */
export async function createEvalStore(): Promise<EvalStore> {
  const root = getGlobalCasDir();
  await mkdir(root, { recursive: true });

  const cas = createFsStore(root);
  const sqlite = createSqliteVarStore(join(root, "vars"), cas);
  const store: Store = { cas, var: sqlite.var, tag: sqlite.tag };
  bootstrap(store);

  return { store, varStore: sqlite.var };
}
|
|
38
|
+
|
|
39
|
+
/**
 * Point the `@uwf/eval/<task>/latest` variable at a run hash.
 *
 * @param varStore - Variable store to write to.
 * @param taskName - Task name embedded in the variable key.
 * @param runHash - Hash of the eval-run record to reference.
 */
export function setEvalLatest(varStore: VarStore, taskName: string, runHash: string): void {
  const key = `${EVAL_VAR_PREFIX}${taskName}/latest`;
  varStore.set(key, runHash);
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import type { Store, VarStore } from "@ocas/core";
|
|
2
|
+
import type { CasRef } from "@united-workforce/protocol";
|
|
3
|
+
|
|
4
|
+
/** Handle to the OCAS store used for eval persistence. */
|
|
5
|
+
export type EvalStore = {
|
|
6
|
+
store: Store;
|
|
7
|
+
varStore: VarStore;
|
|
8
|
+
};
|
|
9
|
+
|
|
10
|
+
/** A single judge result within an eval run. */
|
|
11
|
+
export type EvalJudgeRecord = {
|
|
12
|
+
name: string;
|
|
13
|
+
score: number;
|
|
14
|
+
weight: number;
|
|
15
|
+
dataHash: CasRef;
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
/** Config snapshot for an eval run. */
|
|
19
|
+
export type EvalRunConfig = {
|
|
20
|
+
agent: string;
|
|
21
|
+
model: string;
|
|
22
|
+
engineVersion: string;
|
|
23
|
+
};
|
|
24
|
+
|
|
25
|
+
/** Full eval run record stored in CAS. */
|
|
26
|
+
export type EvalRunPayload = {
|
|
27
|
+
task: string;
|
|
28
|
+
config: EvalRunConfig;
|
|
29
|
+
threadId: string;
|
|
30
|
+
judges: EvalJudgeRecord[];
|
|
31
|
+
overall: number;
|
|
32
|
+
timestamp: number;
|
|
33
|
+
};
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { readFile } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { parse as parseYaml } from "yaml";
|
|
4
|
+
import type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";
|
|
5
|
+
|
|
6
|
+
/** Type guard: true for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}
|
|
9
|
+
|
|
10
|
+
/**
 * Validate one element of the manifest's `judges` list.
 *
 * Defaults: `weight` is 0 when not a number, `builtin` is true only for a
 * literal `true`, and `entry` / `schema` are null when not strings.
 *
 * @param raw - The raw list element from the parsed YAML.
 * @param index - Position in the list, used in error messages.
 * @returns The normalized judge entry.
 * @throws Error when the element is not a mapping, has no name, has a
 *   non-finite weight, or is non-builtin without an entry.
 */
function parseJudgeEntry(raw: unknown, index: number): JudgeEntry {
  if (!isRecord(raw)) {
    throw new Error(`judges[${index}]: expected object`);
  }
  const name = raw.name;
  if (typeof name !== "string" || name === "") {
    throw new Error(`judges[${index}]: name is required`);
  }
  const weight = typeof raw.weight === "number" ? raw.weight : 0;
  // YAML `.nan` / `.inf` are numbers too; they would turn the weighted overall
  // into NaN and cannot be represented as a JSON number in the stored record.
  if (!Number.isFinite(weight)) {
    throw new Error(`judges[${index}] "${name}": weight must be a finite number`);
  }
  const builtin = raw.builtin === true;
  const entry = typeof raw.entry === "string" ? raw.entry : null;
  const schema = typeof raw.schema === "string" ? raw.schema : null;
  if (!builtin && entry === null) {
    throw new Error(`judges[${index}] "${name}": non-builtin judge must have entry`);
  }
  return { name, weight, builtin, entry, schema };
}
|
|
27
|
+
|
|
28
|
+
/**
 * Read the manifest's `limits` section, filling in defaults.
 *
 * @param raw - The raw `limits` value from the parsed YAML.
 * @returns `maxSteps` (default 20) and `timeoutMinutes` (default 30); a field
 *   falls back to its default when `raw` is not a mapping or the field is not
 *   a number.
 */
function parseLimits(raw: unknown): TaskLimits {
  const source: Record<string, unknown> = isRecord(raw) ? raw : {};
  const maxSteps = typeof source.maxSteps === "number" ? source.maxSteps : 20;
  const timeoutMinutes = typeof source.timeoutMinutes === "number" ? source.timeoutMinutes : 30;
  return { maxSteps, timeoutMinutes };
}
|
|
37
|
+
|
|
38
|
+
/**
 * Turn the text of a task.yaml file into a validated TaskManifest.
 *
 * `name`, `workflow` and `prompt` must be non-empty strings and `judges` a
 * non-empty list; `description` defaults to "" and `limits` to its defaults.
 *
 * @param yamlText - Raw YAML source.
 * @returns The validated manifest.
 * @throws Error when the document is not a mapping, a required field is
 *   missing, or a judge entry is invalid.
 */
export function parseTaskManifest(yamlText: string): TaskManifest {
  const doc = parseYaml(yamlText) as unknown;
  if (!isRecord(doc)) {
    throw new Error("task.yaml must be a YAML mapping");
  }

  const { name, workflow, prompt, judges: judgesRaw } = doc;
  if (typeof name !== "string" || name === "") {
    throw new Error("task.yaml: name is required");
  }
  if (typeof workflow !== "string" || workflow === "") {
    throw new Error("task.yaml: workflow is required");
  }
  if (typeof prompt !== "string" || prompt === "") {
    throw new Error("task.yaml: prompt is required");
  }
  if (!Array.isArray(judgesRaw) || judgesRaw.length === 0) {
    throw new Error("task.yaml: at least one judge is required");
  }

  const description = typeof doc.description === "string" ? doc.description : "";
  const limits = parseLimits(doc.limits);
  // Array.from visits every index in order, like the indexed loop it replaces.
  const judges: JudgeEntry[] = Array.from(judgesRaw, (item: unknown, i: number) =>
    parseJudgeEntry(item, i),
  );
  return { name, description, workflow, prompt, limits, judges };
}
|
|
68
|
+
|
|
69
|
+
/**
 * Read `<taskDir>/task.yaml` as UTF-8 and parse it into a TaskManifest.
 *
 * @param taskDir - Directory expected to contain `task.yaml`.
 * @returns The validated manifest.
 */
export async function loadTaskManifest(taskDir: string): Promise<TaskManifest> {
  return parseTaskManifest(await readFile(join(taskDir, "task.yaml"), "utf8"));
}
|