@united-workforce/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +14 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/diff.d.ts +3 -0
- package/dist/commands/diff.d.ts.map +1 -0
- package/dist/commands/diff.js +36 -0
- package/dist/commands/diff.js.map +1 -0
- package/dist/commands/format.d.ts +11 -0
- package/dist/commands/format.d.ts.map +1 -0
- package/dist/commands/format.js +114 -0
- package/dist/commands/format.js.map +1 -0
- package/dist/commands/index.d.ts +8 -0
- package/dist/commands/index.d.ts.map +1 -0
- package/dist/commands/index.js +7 -0
- package/dist/commands/index.js.map +1 -0
- package/dist/commands/list.d.ts +3 -0
- package/dist/commands/list.d.ts.map +1 -0
- package/dist/commands/list.js +35 -0
- package/dist/commands/list.js.map +1 -0
- package/dist/commands/read.d.ts +10 -0
- package/dist/commands/read.d.ts.map +1 -0
- package/dist/commands/read.js +37 -0
- package/dist/commands/read.js.map +1 -0
- package/dist/commands/report.d.ts +3 -0
- package/dist/commands/report.d.ts.map +1 -0
- package/dist/commands/report.js +30 -0
- package/dist/commands/report.js.map +1 -0
- package/dist/commands/run.d.ts +3 -0
- package/dist/commands/run.d.ts.map +1 -0
- package/dist/commands/run.js +64 -0
- package/dist/commands/run.js.map +1 -0
- package/dist/commands/types.d.ts +9 -0
- package/dist/commands/types.d.ts.map +1 -0
- package/dist/commands/types.js +2 -0
- package/dist/commands/types.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/dist/judge/builtin/frontmatter.d.ts +8 -0
- package/dist/judge/builtin/frontmatter.d.ts.map +1 -0
- package/dist/judge/builtin/frontmatter.js +75 -0
- package/dist/judge/builtin/frontmatter.js.map +1 -0
- package/dist/judge/builtin/hallucination.d.ts +10 -0
- package/dist/judge/builtin/hallucination.d.ts.map +1 -0
- package/dist/judge/builtin/hallucination.js +16 -0
- package/dist/judge/builtin/hallucination.js.map +1 -0
- package/dist/judge/builtin/index.d.ts +7 -0
- package/dist/judge/builtin/index.d.ts.map +1 -0
- package/dist/judge/builtin/index.js +6 -0
- package/dist/judge/builtin/index.js.map +1 -0
- package/dist/judge/builtin/read-steps.d.ts +4 -0
- package/dist/judge/builtin/read-steps.d.ts.map +1 -0
- package/dist/judge/builtin/read-steps.js +12 -0
- package/dist/judge/builtin/read-steps.js.map +1 -0
- package/dist/judge/builtin/token-stats.d.ts +8 -0
- package/dist/judge/builtin/token-stats.d.ts.map +1 -0
- package/dist/judge/builtin/token-stats.js +35 -0
- package/dist/judge/builtin/token-stats.js.map +1 -0
- package/dist/judge/builtin/types.d.ts +15 -0
- package/dist/judge/builtin/types.d.ts.map +1 -0
- package/dist/judge/builtin/types.js +2 -0
- package/dist/judge/builtin/types.js.map +1 -0
- package/dist/judge/builtin/upstream.d.ts +10 -0
- package/dist/judge/builtin/upstream.d.ts.map +1 -0
- package/dist/judge/builtin/upstream.js +16 -0
- package/dist/judge/builtin/upstream.js.map +1 -0
- package/dist/judge/index.d.ts +3 -0
- package/dist/judge/index.d.ts.map +1 -0
- package/dist/judge/index.js +2 -0
- package/dist/judge/index.js.map +1 -0
- package/dist/judge/types.d.ts +15 -0
- package/dist/judge/types.d.ts.map +1 -0
- package/dist/judge/types.js +2 -0
- package/dist/judge/types.js.map +1 -0
- package/dist/runner/collect.d.ts +16 -0
- package/dist/runner/collect.d.ts.map +1 -0
- package/dist/runner/collect.js +129 -0
- package/dist/runner/collect.js.map +1 -0
- package/dist/runner/execute.d.ts +9 -0
- package/dist/runner/execute.d.ts.map +1 -0
- package/dist/runner/execute.js +72 -0
- package/dist/runner/execute.js.map +1 -0
- package/dist/runner/index.d.ts +5 -0
- package/dist/runner/index.d.ts.map +1 -0
- package/dist/runner/index.js +4 -0
- package/dist/runner/index.js.map +1 -0
- package/dist/runner/prepare.d.ts +7 -0
- package/dist/runner/prepare.d.ts.map +1 -0
- package/dist/runner/prepare.js +38 -0
- package/dist/runner/prepare.js.map +1 -0
- package/dist/runner/types.d.ts +70 -0
- package/dist/runner/types.d.ts.map +1 -0
- package/dist/runner/types.js +2 -0
- package/dist/runner/types.js.map +1 -0
- package/dist/storage/index.d.ts +4 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/dist/storage/index.js +3 -0
- package/dist/storage/index.js.map +1 -0
- package/dist/storage/schemas.d.ts +7 -0
- package/dist/storage/schemas.d.ts.map +1 -0
- package/dist/storage/schemas.js +118 -0
- package/dist/storage/schemas.js.map +1 -0
- package/dist/storage/store.d.ts +10 -0
- package/dist/storage/store.d.ts.map +1 -0
- package/dist/storage/store.js +36 -0
- package/dist/storage/store.js.map +1 -0
- package/dist/storage/types.d.ts +30 -0
- package/dist/storage/types.d.ts.map +1 -0
- package/dist/storage/types.js +2 -0
- package/dist/storage/types.js.map +1 -0
- package/dist/task/index.d.ts +3 -0
- package/dist/task/index.d.ts.map +1 -0
- package/dist/task/index.js +2 -0
- package/dist/task/index.js.map +1 -0
- package/dist/task/loader.d.ts +6 -0
- package/dist/task/loader.d.ts.map +1 -0
- package/dist/task/loader.js +69 -0
- package/dist/task/loader.js.map +1 -0
- package/dist/task/types.d.ts +27 -0
- package/dist/task/types.d.ts.map +1 -0
- package/dist/task/types.js +2 -0
- package/dist/task/types.js.map +1 -0
- package/package.json +45 -0
- package/src/cli.ts +22 -0
- package/src/commands/diff.ts +38 -0
- package/src/commands/format.ts +148 -0
- package/src/commands/index.ts +7 -0
- package/src/commands/list.ts +43 -0
- package/src/commands/read.ts +41 -0
- package/src/commands/report.ts +32 -0
- package/src/commands/run.ts +84 -0
- package/src/commands/types.ts +9 -0
- package/src/index.ts +34 -0
- package/src/judge/builtin/frontmatter.ts +95 -0
- package/src/judge/builtin/hallucination.ts +17 -0
- package/src/judge/builtin/index.ts +6 -0
- package/src/judge/builtin/read-steps.ts +14 -0
- package/src/judge/builtin/token-stats.ts +53 -0
- package/src/judge/builtin/types.ts +16 -0
- package/src/judge/builtin/upstream.ts +17 -0
- package/src/judge/index.ts +10 -0
- package/src/judge/types.ts +15 -0
- package/src/runner/collect.ts +172 -0
- package/src/runner/execute.ts +87 -0
- package/src/runner/index.ts +15 -0
- package/src/runner/prepare.ts +45 -0
- package/src/runner/types.ts +85 -0
- package/src/storage/index.ts +9 -0
- package/src/storage/schemas.ts +123 -0
- package/src/storage/store.ts +42 -0
- package/src/storage/types.ts +33 -0
- package/src/task/index.ts +2 -0
- package/src/task/loader.ts +74 -0
- package/src/task/types.ts +28 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prepare.d.ts","sourceRoot":"","sources":["../../src/runner/prepare.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAiBhD;;;GAGG;AACH,wBAAsB,OAAO,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,CAgBrE"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { access, cp, mkdir, mkdtemp } from "node:fs/promises";
|
|
2
|
+
import { tmpdir } from "node:os";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import { createLogger } from "@united-workforce/util";
|
|
5
|
+
import { loadTaskManifest } from "../task/index.js";
|
|
6
|
+
const log = createLogger({ sink: { kind: "stderr" } });
|
|
7
|
+
/** Tag passed as the first argument to log() when prepare() reports the loaded task manifest. */
const LOG_PREPARE = "PRE4K2NQ";
|
|
8
|
+
/** Tag passed as the first argument to log() when prepare() reports how the working directory was populated. */
const LOG_FIXTURE = "PRE7M3VX";
|
|
9
|
+
/**
 * Report whether `path` can be accessed.
 *
 * Any failure of `access()` (missing entry, permission problem, ...) is
 * reported as `false`; the function itself never rejects.
 *
 * @param path File-system path to probe.
 * @returns `true` when `access(path)` succeeds, `false` otherwise.
 */
async function pathExists(path) {
    let reachable = true;
    try {
        await access(path);
    }
    catch {
        reachable = false;
    }
    return reachable;
}
|
|
19
|
+
/**
 * Prepare phase of an eval run.
 *
 * Loads the task manifest from `taskDir`, creates a fresh temporary
 * directory (prefix `uwf-eval-` under the OS temp dir) and, when
 * `<taskDir>/fixture` is accessible, copies it recursively into that
 * directory. Without a fixture the working directory is left empty.
 *
 * @param taskDir Directory containing the task definition.
 * @returns The task directory, the new working directory and the manifest.
 */
export async function prepare(taskDir) {
    const manifest = await loadTaskManifest(taskDir);
    log(LOG_PREPARE, `loaded task manifest name=${manifest.name} workflow=${manifest.workflow}`);
    const workDir = await mkdtemp(join(tmpdir(), "uwf-eval-"));
    const fixtureDir = join(taskDir, "fixture");
    const hasFixture = await pathExists(fixtureDir);
    if (!hasFixture) {
        // No fixture: make sure the (already created) temp dir exists and use it empty.
        await mkdir(workDir, { recursive: true });
        log(LOG_FIXTURE, `no fixture/ found, using empty workDir=${workDir}`);
        return { taskDir, workDir, manifest };
    }
    await cp(fixtureDir, workDir, { recursive: true });
    log(LOG_FIXTURE, `copied fixture into workDir=${workDir}`);
    return { taskDir, workDir, manifest };
}
|
|
38
|
+
//# sourceMappingURL=prepare.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prepare.js","sourceRoot":"","sources":["../../src/runner/prepare.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,EAAE,EAAE,KAAK,EAAE,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AAGpD,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,WAAW,GAAG,UAAU,CAAC;AAC/B,MAAM,WAAW,GAAG,UAAU,CAAC;AAE/B,mCAAmC;AACnC,KAAK,UAAU,UAAU,CAAC,IAAY;IACpC,IAAI,CAAC;QACH,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;QACnB,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,OAAe;IAC3C,MAAM,QAAQ,GAAG,MAAM,gBAAgB,CAAC,OAAO,CAAC,CAAC;IACjD,GAAG,CAAC,WAAW,EAAE,6BAA6B,QAAQ,CAAC,IAAI,aAAa,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC;IAE7F,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,WAAW,CAAC,CAAC,CAAC;IAE3D,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;IAC5C,IAAI,MAAM,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QACjC,MAAM,EAAE,CAAC,UAAU,EAAE,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACnD,GAAG,CAAC,WAAW,EAAE,+BAA+B,OAAO,EAAE,CAAC,CAAC;IAC7D,CAAC;SAAM,CAAC;QACN,MAAM,KAAK,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAC1C,GAAG,CAAC,WAAW,EAAE,0CAA0C,OAAO,EAAE,CAAC,CAAC;IACxE,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC;AACxC,CAAC"}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import type { JSONSchema } from "@ocas/core";
|
|
2
|
+
import type { EvalRunConfig, EvalStore } from "../storage/index.js";
|
|
3
|
+
import type { JudgeEntry, TaskManifest } from "../task/index.js";
|
|
4
|
+
/**
 * What the prepare phase hands to the rest of the run.
 */
export type PrepareResult = {
    /** Directory of the task being evaluated. */
    taskDir: string;
    /** Temporary working directory for this run. */
    workDir: string;
    /** Parsed task manifest. */
    manifest: TaskManifest;
};
|
|
10
|
+
/**
 * Everything the execute phase needs to run a workflow.
 */
export type ExecuteInput = {
    /** Prepared temp directory in which the workflow runs. */
    workDir: string;
    /** Name or path of the workflow, as given in task.yaml. */
    workflow: string;
    /** First prompt sent to the thread. */
    prompt: string;
    /** Which agent adapter to use. */
    agent: string;
    /** Upper bound on the number of steps executed. */
    maxSteps: number;
};
|
|
23
|
+
/**
 * What the execute phase returns.
 */
export type ExecuteResult = {
    /** Identifier of the thread produced by the execution. */
    threadId: string;
};
|
|
27
|
+
/**
 * What a single judge produces when it runs.
 */
export type JudgeRunOutput = {
    /** Numeric score assigned by the judge. */
    score: number;
    /** Judge-specific payload; its shape is described by `schema`. */
    data: unknown;
    /** Schema for `data`, used when the payload is persisted to CAS. */
    schema: JSONSchema;
};
|
|
34
|
+
/**
 * Strategy for executing one judge; replaceable so tests can inject their own.
 *
 * Receives the task directory, the working directory, the thread id and the
 * judge entry, and resolves to that judge's output.
 */
export type JudgeRunner = (
    taskDir: string,
    workDir: string,
    threadId: string,
    judge: JudgeEntry,
) => Promise<JudgeRunOutput>;
|
|
36
|
+
/**
 * Everything the collect phase needs.
 */
export type CollectInput = {
    /** Store handle used for eval persistence. */
    evalStore: EvalStore;
    /** Directory of the task being evaluated. */
    taskDir: string;
    /** Working directory the workflow ran in. */
    workDir: string;
    /** Identifier of the executed thread. */
    threadId: string;
    /** Parsed task manifest. */
    manifest: TaskManifest;
    /** Config snapshot for this run. */
    config: EvalRunConfig;
};
|
|
45
|
+
/**
 * Condensed result of one judge as it appears in run output.
 */
export type JudgeSummary = {
    /** Judge name. */
    name: string;
    /** Score the judge produced. */
    score: number;
    /** Weight associated with the judge. */
    weight: number;
};
|
|
51
|
+
/**
 * What the collect phase returns.
 */
export type CollectResult = {
    /** Hash identifying the run record. */
    runHash: string;
    /** Overall score of the run. */
    overall: number;
    /** One summary per judge. */
    judges: JudgeSummary[];
};
|
|
57
|
+
/**
 * Options of a complete eval run, as supplied through CLI flags.
 */
export type RunOptions = {
    /** Agent to use. */
    agent: string;
    /** Model to use. */
    model: string;
    /** Numeric `count` option (presumably the number of runs - confirm against the CLI). */
    count: number;
};
|
|
63
|
+
/**
 * Outcome of a complete eval run.
 */
export type RunResult = {
    /** Hash identifying the run record. */
    runHash: string;
    /** Overall score of the run. */
    overall: number;
    /** Task the run belongs to. */
    task: string;
    /** One summary per judge. */
    judges: JudgeSummary[];
};
|
|
70
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/runner/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAE7C,OAAO,KAAK,EAAE,aAAa,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AACpE,OAAO,KAAK,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAEjE,gFAAgF;AAChF,MAAM,MAAM,aAAa,GAAG;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,YAAY,CAAC;CACxB,CAAC;AAEF,kCAAkC;AAClC,MAAM,MAAM,YAAY,GAAG;IACzB,sEAAsE;IACtE,OAAO,EAAE,MAAM,CAAC;IAChB,8CAA8C;IAC9C,QAAQ,EAAE,MAAM,CAAC;IACjB,qCAAqC;IACrC,MAAM,EAAE,MAAM,CAAC;IACf,4BAA4B;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,0CAA0C;IAC1C,QAAQ,EAAE,MAAM,CAAC;CAClB,CAAC;AAEF,mCAAmC;AACnC,MAAM,MAAM,aAAa,GAAG;IAC1B,QAAQ,EAAE,MAAM,CAAC;CAClB,CAAC;AAEF,iDAAiD;AACjD,MAAM,MAAM,cAAc,GAAG;IAC3B,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,OAAO,CAAC;IACd,6DAA6D;IAC7D,MAAM,EAAE,UAAU,CAAC;CACpB,CAAC;AAEF,mEAAmE;AACnE,MAAM,MAAM,WAAW,GAAG,CACxB,OAAO,EAAE,MAAM,EACf,OAAO,EAAE,MAAM,EACf,QAAQ,EAAE,MAAM,EAChB,KAAK,EAAE,UAAU,KACd,OAAO,CAAC,cAAc,CAAC,CAAC;AAE7B,kCAAkC;AAClC,MAAM,MAAM,YAAY,GAAG;IACzB,SAAS,EAAE,SAAS,CAAC;IACrB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,YAAY,CAAC;IACvB,MAAM,EAAE,aAAa,CAAC;CACvB,CAAC;AAEF,4DAA4D;AAC5D,MAAM,MAAM,YAAY,GAAG;IACzB,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB,CAAC;AAEF,mCAAmC;AACnC,MAAM,MAAM,aAAa,GAAG;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,YAAY,EAAE,CAAC;CACxB,CAAC;AAEF,oDAAoD;AACpD,MAAM,MAAM,UAAU,GAAG;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;CACf,CAAC;AAEF,uCAAuC;AACvC,MAAM,MAAM,SAAS,GAAG;IACtB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,YAAY,EAAE,CAAC;CACxB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/runner/types.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
export { EVAL_JUDGE_FRONTMATTER_SCHEMA, EVAL_JUDGE_HALLUCINATION_SCHEMA, EVAL_JUDGE_TOKEN_STATS_SCHEMA, EVAL_JUDGE_UPSTREAM_SCHEMA, EVAL_RUN_SCHEMA, } from "./schemas.js";
|
|
2
|
+
export { createEvalStore, setEvalLatest } from "./store.js";
|
|
3
|
+
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload, EvalStore } from "./types.js";
|
|
4
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/storage/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,6BAA6B,EAC7B,+BAA+B,EAC/B,6BAA6B,EAC7B,0BAA0B,EAC1B,eAAe,GAChB,MAAM,cAAc,CAAC;AACtB,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAC5D,YAAY,EAAE,eAAe,EAAE,aAAa,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/storage/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,6BAA6B,EAC7B,+BAA+B,EAC/B,6BAA6B,EAC7B,0BAA0B,EAC1B,eAAe,GAChB,MAAM,cAAc,CAAC;AACtB,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { JSONSchema } from "@ocas/core";
|
|
2
|
+
/** JSON Schema titled `@uwf/eval-run`, describing an eval run record. */
export declare const EVAL_RUN_SCHEMA: JSONSchema;
|
|
3
|
+
/** JSON Schema titled `@uwf/eval-judge-frontmatter` (step counts plus a list of invalid steps). */
export declare const EVAL_JUDGE_FRONTMATTER_SCHEMA: JSONSchema;
|
|
4
|
+
/** JSON Schema titled `@uwf/eval-judge-upstream` (a `perStep` list of consumed/missed items with a score). */
export declare const EVAL_JUDGE_UPSTREAM_SCHEMA: JSONSchema;
|
|
5
|
+
/** JSON Schema titled `@uwf/eval-judge-hallucination` (a `perStep` list of hallucinations with a score). */
export declare const EVAL_JUDGE_HALLUCINATION_SCHEMA: JSONSchema;
|
|
6
|
+
/** JSON Schema titled `@uwf/eval-judge-token-stats` (token/turn totals plus a `perStep` breakdown). */
export declare const EVAL_JUDGE_TOKEN_STATS_SCHEMA: JSONSchema;
|
|
7
|
+
//# sourceMappingURL=schemas.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"schemas.d.ts","sourceRoot":"","sources":["../../src/storage/schemas.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAE7C,eAAO,MAAM,eAAe,EAAE,UAgC7B,CAAC;AAEF,eAAO,MAAM,6BAA6B,EAAE,UAoB3C,CAAC;AAEF,eAAO,MAAM,0BAA0B,EAAE,UAmBxC,CAAC;AAEF,eAAO,MAAM,+BAA+B,EAAE,UAkB7C,CAAC;AAEF,eAAO,MAAM,6BAA6B,EAAE,UAuB3C,CAAC"}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/** Shape of the `config` object inside an eval run record: three required strings. */
const evalRunConfigSchema = {
    type: "object",
    required: ["agent", "model", "engineVersion"],
    properties: {
        agent: { type: "string" },
        model: { type: "string" },
        engineVersion: { type: "string" },
    },
};
/** Shape of one element of the `judges` array inside an eval run record. */
const evalRunJudgeSchema = {
    type: "object",
    required: ["name", "score", "weight", "dataHash"],
    properties: {
        name: { type: "string" },
        score: { type: "number" },
        weight: { type: "number" },
        dataHash: { type: "string" },
    },
};
/**
 * JSON Schema (title `@uwf/eval-run`) for an eval run record: task name,
 * config snapshot, thread id, per-judge results, overall score and an
 * integer timestamp. All six top-level fields are required.
 */
export const EVAL_RUN_SCHEMA = {
    title: "@uwf/eval-run",
    type: "object",
    required: ["task", "config", "threadId", "judges", "overall", "timestamp"],
    properties: {
        task: { type: "string" },
        config: evalRunConfigSchema,
        threadId: { type: "string" },
        judges: { type: "array", items: evalRunJudgeSchema },
        overall: { type: "number" },
        timestamp: { type: "integer" },
    },
};
|
|
34
|
+
/** Shape of one element of `invalidSteps`: step index, role and a list of error strings. */
const frontmatterInvalidStepSchema = {
    type: "object",
    required: ["stepIndex", "role", "errors"],
    properties: {
        stepIndex: { type: "integer" },
        role: { type: "string" },
        errors: { type: "array", items: { type: "string" } },
    },
};
/**
 * JSON Schema (title `@uwf/eval-judge-frontmatter`) for the frontmatter
 * judge's data: total and valid step counts plus the list of invalid steps.
 */
export const EVAL_JUDGE_FRONTMATTER_SCHEMA = {
    title: "@uwf/eval-judge-frontmatter",
    type: "object",
    required: ["stepsTotal", "stepsValid", "invalidSteps"],
    properties: {
        stepsTotal: { type: "integer" },
        stepsValid: { type: "integer" },
        invalidSteps: { type: "array", items: frontmatterInvalidStepSchema },
    },
};
|
|
55
|
+
/** Shape of one `perStep` element: role, consumed and missed string lists, and a score. */
const upstreamStepSchema = {
    type: "object",
    required: ["role", "consumed", "missed", "score"],
    properties: {
        role: { type: "string" },
        consumed: { type: "array", items: { type: "string" } },
        missed: { type: "array", items: { type: "string" } },
        score: { type: "number" },
    },
};
/**
 * JSON Schema (title `@uwf/eval-judge-upstream`) for the upstream judge's
 * data: a required `perStep` array.
 */
export const EVAL_JUDGE_UPSTREAM_SCHEMA = {
    title: "@uwf/eval-judge-upstream",
    type: "object",
    required: ["perStep"],
    properties: {
        perStep: { type: "array", items: upstreamStepSchema },
    },
};
|
|
75
|
+
/** Shape of one `perStep` element: role, a list of hallucination strings, and a score. */
const hallucinationStepSchema = {
    type: "object",
    required: ["role", "hallucinations", "score"],
    properties: {
        role: { type: "string" },
        hallucinations: { type: "array", items: { type: "string" } },
        score: { type: "number" },
    },
};
/**
 * JSON Schema (title `@uwf/eval-judge-hallucination`) for the hallucination
 * judge's data: a required `perStep` array.
 */
export const EVAL_JUDGE_HALLUCINATION_SCHEMA = {
    title: "@uwf/eval-judge-hallucination",
    type: "object",
    required: ["perStep"],
    properties: {
        perStep: { type: "array", items: hallucinationStepSchema },
    },
};
|
|
94
|
+
/** Shape of one `perStep` element: role, integer token/turn counts and a numeric duration. */
const tokenStatsStepSchema = {
    type: "object",
    required: ["role", "inputTokens", "outputTokens", "turns", "duration"],
    properties: {
        role: { type: "string" },
        inputTokens: { type: "integer" },
        outputTokens: { type: "integer" },
        turns: { type: "integer" },
        duration: { type: "number" },
    },
};
/**
 * JSON Schema (title `@uwf/eval-judge-token-stats`) for the token-stats
 * judge's data: integer totals plus a per-step breakdown.
 */
export const EVAL_JUDGE_TOKEN_STATS_SCHEMA = {
    title: "@uwf/eval-judge-token-stats",
    type: "object",
    required: ["totalInput", "totalOutput", "totalTurns", "perStep"],
    properties: {
        totalInput: { type: "integer" },
        totalOutput: { type: "integer" },
        totalTurns: { type: "integer" },
        perStep: { type: "array", items: tokenStatsStepSchema },
    },
};
|
|
118
|
+
//# sourceMappingURL=schemas.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"schemas.js","sourceRoot":"","sources":["../../src/storage/schemas.ts"],"names":[],"mappings":"AAEA,MAAM,CAAC,MAAM,eAAe,GAAe;IACzC,KAAK,EAAE,eAAe;IACtB,IAAI,EAAE,QAAQ;IACd,QAAQ,EAAE,CAAC,MAAM,EAAE,QAAQ,EAAE,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,CAAC;IAC1E,UAAU,EAAE;QACV,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;QACxB,MAAM,EAAE;YACN,IAAI,EAAE,QAAQ;YACd,QAAQ,EAAE,CAAC,OAAO,EAAE,OAAO,EAAE,eAAe,CAAC;YAC7C,UAAU,EAAE;gBACV,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;gBACzB,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;gBACzB,aAAa,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;aAClC;SACF;QACD,QAAQ,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;QAC5B,MAAM,EAAE;YACN,IAAI,EAAE,OAAO;YACb,KAAK,EAAE;gBACL,IAAI,EAAE,QAAQ;gBACd,QAAQ,EAAE,CAAC,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,UAAU,CAAC;gBACjD,UAAU,EAAE;oBACV,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oBACxB,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oBACzB,MAAM,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oBAC1B,QAAQ,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iBAC7B;aACF;SACF;QACD,OAAO,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;QAC3B,SAAS,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;KAC/B;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,6BAA6B,GAAe;IACvD,KAAK,EAAE,6BAA6B;IACpC,IAAI,EAAE,QAAQ;IACd,QAAQ,EAAE,CAAC,YAAY,EAAE,YAAY,EAAE,cAAc,CAAC;IACtD,UAAU,EAAE;QACV,UAAU,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;QAC/B,UAAU,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;QAC/B,YAAY,EAAE;YACZ,IAAI,EAAE,OAAO;YACb,KAAK,EAAE;gBACL,IAAI,EAAE,QAAQ;gBACd,QAAQ,EAAE,CAAC,WAAW,EAAE,MAAM,EAAE,QAAQ,CAAC;gBACzC,UAAU,EAAE;oBACV,SAAS,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;oBAC9B,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oBACxB,MAAM,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE;iBACrD;aACF;SACF;KACF;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,0BAA0B,GAAe;IACpD,KAAK,EAAE,0BAA0B;IACjC,IAAI,EAAE,QAAQ;IACd,QAAQ,EAAE,CAAC,SAAS,CAAC;IACrB,UAAU,EAAE;QACV,OAAO,EAAE;YACP,IAAI,EAAE,OAAO;YACb,KAAK,EAAE;gBACL,IAAI,EAAE,QAAQ;gBACd,QAAQ,EAAE,CAAC,MAAM,EAAE,UAAU,EAAE,QAAQ,EAAE,OAAO,CAAC;gBACjD,UAAU,EAAE;oBACV,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oBACxB,QAAQ,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE;oBACtD,MAAM,EA
AE,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE;oBACpD,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iBAC1B;aACF;SACF;KACF;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,+BAA+B,GAAe;IACzD,KAAK,EAAE,+BAA+B;IACtC,IAAI,EAAE,QAAQ;IACd,QAAQ,EAAE,CAAC,SAAS,CAAC;IACrB,UAAU,EAAE;QACV,OAAO,EAAE;YACP,IAAI,EAAE,OAAO;YACb,KAAK,EAAE;gBACL,IAAI,EAAE,QAAQ;gBACd,QAAQ,EAAE,CAAC,MAAM,EAAE,gBAAgB,EAAE,OAAO,CAAC;gBAC7C,UAAU,EAAE;oBACV,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oBACxB,cAAc,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE;oBAC5D,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iBAC1B;aACF;SACF;KACF;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,6BAA6B,GAAe;IACvD,KAAK,EAAE,6BAA6B;IACpC,IAAI,EAAE,QAAQ;IACd,QAAQ,EAAE,CAAC,YAAY,EAAE,aAAa,EAAE,YAAY,EAAE,SAAS,CAAC;IAChE,UAAU,EAAE;QACV,UAAU,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;QAC/B,WAAW,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;QAChC,UAAU,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;QAC/B,OAAO,EAAE;YACP,IAAI,EAAE,OAAO;YACb,KAAK,EAAE;gBACL,IAAI,EAAE,QAAQ;gBACd,QAAQ,EAAE,CAAC,MAAM,EAAE,aAAa,EAAE,cAAc,EAAE,OAAO,EAAE,UAAU,CAAC;gBACtE,UAAU,EAAE;oBACV,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oBACxB,WAAW,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;oBAChC,YAAY,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;oBACjC,KAAK,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;oBAC1B,QAAQ,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iBAC7B;aACF;SACF;KACF;CACF,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { VarStore } from "@ocas/core";
|
|
2
|
+
import type { EvalStore } from "./types.js";
|
|
3
|
+
/**
|
|
4
|
+
* Open the unified OCAS store on the filesystem.
|
|
5
|
+
* Shares the same CAS + variable backend as the uwf CLI.
|
|
6
|
+
*/
|
|
7
|
+
export declare function createEvalStore(): Promise<EvalStore>;
|
|
8
|
+
/** Set the `@uwf/eval/<task>/latest` variable to point at a run hash. */
|
|
9
|
+
export declare function setEvalLatest(varStore: VarStore, taskName: string, runHash: string): void;
|
|
10
|
+
//# sourceMappingURL=store.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"store.d.ts","sourceRoot":"","sources":["../../src/storage/store.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAI3C,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAiB5C;;;GAGG;AACH,wBAAsB,eAAe,IAAI,OAAO,CAAC,SAAS,CAAC,CAQ1D;AAED,yEAAyE;AACzE,wBAAgB,aAAa,CAAC,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,IAAI,CAEzF"}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { mkdir } from "node:fs/promises";
|
|
2
|
+
import { homedir } from "node:os";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import { bootstrap } from "@ocas/core";
|
|
5
|
+
import { createFsStore, createSqliteVarStore } from "@ocas/fs";
|
|
6
|
+
/** Variable name prefix for eval run pointers (`@uwf/eval/<task>/latest`). */
|
|
7
|
+
const EVAL_VAR_PREFIX = "@uwf/eval/";
|
|
8
|
+
/**
 * Locate the directory that holds the global CAS.
 *
 * A non-empty `OCAS_HOME` environment variable wins; otherwise the result
 * is `.ocas` inside the current user's home directory. (Intended to mirror
 * the uwf CLI's own resolver; that cannot be verified from this module.)
 *
 * @returns The CAS directory path.
 */
function getGlobalCasDir() {
    const override = process.env.OCAS_HOME;
    const hasOverride = override !== undefined && override !== "";
    return hasOverride ? override : join(homedir(), ".ocas");
}
|
|
19
|
+
/**
 * Open the OCAS store used for eval persistence.
 *
 * Ensures the global CAS directory exists, opens a filesystem CAS on it,
 * opens the SQLite-backed variable store at `<casDir>/vars`, bundles the
 * pieces into one store object and runs `bootstrap` on it.
 *
 * @returns The bundled store together with its variable store.
 */
export async function createEvalStore() {
    const rootDir = getGlobalCasDir();
    await mkdir(rootDir, { recursive: true });
    const cas = createFsStore(rootDir);
    const sqliteStores = createSqliteVarStore(join(rootDir, "vars"), cas);
    const varStore = sqliteStores.var;
    const tag = sqliteStores.tag;
    const store = { cas, var: varStore, tag };
    bootstrap(store);
    return { store, varStore };
}
|
|
32
|
+
/** Set the `@uwf/eval/<task>/latest` variable to point at a run hash. */
|
|
33
|
+
export function setEvalLatest(varStore, taskName, runHash) {
|
|
34
|
+
varStore.set(`${EVAL_VAR_PREFIX}${taskName}/latest`, runHash);
|
|
35
|
+
}
|
|
36
|
+
//# sourceMappingURL=store.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"store.js","sourceRoot":"","sources":["../../src/storage/store.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACzC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,SAAS,EAAc,MAAM,YAAY,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,oBAAoB,EAAE,MAAM,UAAU,CAAC;AAI/D,8EAA8E;AAC9E,MAAM,eAAe,GAAG,YAAY,CAAC;AAErC;;;GAGG;AACH,SAAS,eAAe;IACtB,MAAM,OAAO,GAAG,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC;IACtC,IAAI,OAAO,KAAK,SAAS,IAAI,OAAO,KAAK,EAAE,EAAE,CAAC;QAC5C,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,OAAO,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,CAAC,CAAC;AAClC,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe;IACnC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;IACjC,MAAM,KAAK,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzC,MAAM,GAAG,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;IAClC,MAAM,EAAE,GAAG,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,oBAAoB,CAAC,IAAI,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAAG,CAAC,CAAC;IAC/E,MAAM,KAAK,GAAU,EAAE,GAAG,EAAE,GAAG,EAAE,QAAQ,EAAE,GAAG,EAAE,CAAC;IACjD,SAAS,CAAC,KAAK,CAAC,CAAC;IACjB,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;AAC7B,CAAC;AAED,yEAAyE;AACzE,MAAM,UAAU,aAAa,CAAC,QAAkB,EAAE,QAAgB,EAAE,OAAe;IACjF,QAAQ,CAAC,GAAG,CAAC,GAAG,eAAe,GAAG,QAAQ,SAAS,EAAE,OAAO,CAAC,CAAC;AAChE,CAAC"}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { Store, VarStore } from "@ocas/core";
|
|
2
|
+
import type { CasRef } from "@united-workforce/protocol";
|
|
3
|
+
/**
 * Handle bundling the OCAS store objects used for eval persistence.
 */
export type EvalStore = {
    /** The OCAS store. */
    store: Store;
    /** Variable store used for named pointers. */
    varStore: VarStore;
};
|
|
8
|
+
/**
 * One judge's result as recorded inside an eval run.
 */
export type EvalJudgeRecord = {
    /** Judge name. */
    name: string;
    /** Score the judge produced. */
    score: number;
    /** Weight associated with the judge. */
    weight: number;
    /** CAS reference to the judge's data. */
    dataHash: CasRef;
};
|
|
15
|
+
/**
 * Snapshot of the configuration an eval run was made with.
 */
export type EvalRunConfig = {
    /** Agent used for the run. */
    agent: string;
    /** Model used for the run. */
    model: string;
    /** Version string of the engine. */
    engineVersion: string;
};
|
|
21
|
+
/**
 * Complete eval run record as stored in CAS.
 */
export type EvalRunPayload = {
    /** Task the run belongs to. */
    task: string;
    /** Config snapshot for the run. */
    config: EvalRunConfig;
    /** Identifier of the executed thread. */
    threadId: string;
    /** One record per judge. */
    judges: EvalJudgeRecord[];
    /** Overall score of the run. */
    overall: number;
    /** Numeric timestamp of the run (unit not stated here). */
    timestamp: number;
};
|
|
30
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/storage/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAClD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,4BAA4B,CAAC;AAEzD,0DAA0D;AAC1D,MAAM,MAAM,SAAS,GAAG;IACtB,KAAK,EAAE,KAAK,CAAC;IACb,QAAQ,EAAE,QAAQ,CAAC;CACpB,CAAC;AAEF,gDAAgD;AAChD,MAAM,MAAM,eAAe,GAAG;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;CAClB,CAAC;AAEF,uCAAuC;AACvC,MAAM,MAAM,aAAa,GAAG;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,aAAa,EAAE,MAAM,CAAC;CACvB,CAAC;AAEF,0CAA0C;AAC1C,MAAM,MAAM,cAAc,GAAG;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,aAAa,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,eAAe,EAAE,CAAC;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;CACnB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/storage/types.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/task/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAClE,YAAY,EAAE,UAAU,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/task/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { TaskManifest } from "./types.js";
|
|
2
|
+
/** Parse the text of a task.yaml document into a TaskManifest; throws an Error when a required field is missing or invalid. */
|
|
3
|
+
export declare function parseTaskManifest(yamlText: string): TaskManifest;
|
|
4
|
+
/** Load and parse task.yaml from a directory. */
|
|
5
|
+
export declare function loadTaskManifest(taskDir: string): Promise<TaskManifest>;
|
|
6
|
+
//# sourceMappingURL=loader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.d.ts","sourceRoot":"","sources":["../../src/task/loader.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAA0B,YAAY,EAAE,MAAM,YAAY,CAAC;AAkCvE,+DAA+D;AAC/D,wBAAgB,iBAAiB,CAAC,QAAQ,EAAE,MAAM,GAAG,YAAY,CA4BhE;AAED,iDAAiD;AACjD,wBAAsB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,CAI7E"}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import { readFile } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { parse as parseYaml } from "yaml";
|
|
4
|
+
/**
 * Tell whether `value` is a non-null, non-array object.
 *
 * @param value Any value.
 * @returns `true` for objects other than `null` and arrays.
 */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
7
|
+
/**
 * Validate one element of the `judges` list and normalize it into a judge entry.
 *
 * @param raw - Untrusted value taken from the parsed YAML document.
 * @param index - Position of the element inside `judges`; used only in error messages.
 * @returns `{ name, weight, builtin, entry, schema }`, where `entry` and
 *   `schema` are `null` when not provided.
 * @throws Error when `raw` is not a mapping, when `name` is missing or empty,
 *   or when a non-builtin judge has no usable `entry`.
 */
function parseJudgeEntry(raw, index) {
    if (!isRecord(raw)) {
        throw new Error(`judges[${index}]: expected object`);
    }
    const name = raw.name;
    if (typeof name !== "string" || name === "") {
        throw new Error(`judges[${index}]: name is required`);
    }
    // Number.isFinite is false for non-numbers and also for NaN / ±Infinity
    // (which YAML can express as `.nan` / `.inf`), so every unusable weight
    // falls back to 0 instead of leaking a non-finite number to callers.
    const weight = Number.isFinite(raw.weight) ? raw.weight : 0;
    // Only a literal `true` marks a judge as builtin.
    const builtin = raw.builtin === true;
    // An empty path cannot point at a script; treat it as missing, the same
    // way an empty `name` is treated as missing above.
    const entry = typeof raw.entry === "string" && raw.entry !== "" ? raw.entry : null;
    const schema = typeof raw.schema === "string" ? raw.schema : null;
    if (!builtin && entry === null) {
        throw new Error(`judges[${index}] "${name}": non-builtin judge must have entry`);
    }
    return { name, weight, builtin, entry, schema };
}
|
|
24
|
+
/**
 * Normalize the optional `limits` mapping of a task manifest.
 *
 * Any field that is missing or unusable falls back to its default
 * (`maxSteps` 20, `timeoutMinutes` 30). This function never throws.
 *
 * @param raw - Untrusted `limits` value taken from the parsed YAML document.
 * @returns A fresh `{ maxSteps, timeoutMinutes }` object.
 */
function parseLimits(raw) {
    const defaults = { maxSteps: 20, timeoutMinutes: 30 };
    if (!isRecord(raw)) {
        return defaults;
    }
    // `typeof x === "number"` alone would accept NaN, ±Infinity and negative
    // values; none of those is a meaningful limit, so they are treated like
    // any other invalid value and replaced by the default. Zero is kept as is.
    const isUsableLimit = (value) => Number.isFinite(value) && value >= 0;
    return {
        maxSteps: isUsableLimit(raw.maxSteps) ? raw.maxSteps : defaults.maxSteps,
        timeoutMinutes: isUsableLimit(raw.timeoutMinutes) ? raw.timeoutMinutes : defaults.timeoutMinutes,
    };
}
|
|
33
|
+
/**
 * Parse the text of a task.yaml document and validate it into a TaskManifest.
 *
 * Checks run in this order: top-level mapping, `name`, `workflow`, `prompt`,
 * then `judges` (must be a non-empty list, each element validated in turn).
 * `description` defaults to an empty string and `limits` is normalized by
 * `parseLimits`.
 *
 * @param yamlText - Raw YAML text of the manifest.
 * @returns `{ name, description, workflow, prompt, limits, judges }`.
 * @throws Error describing the first validation failure encountered.
 */
export function parseTaskManifest(yamlText) {
    const doc = parseYaml(yamlText);
    if (!isRecord(doc)) {
        throw new Error("task.yaml must be a YAML mapping");
    }
    // Shared check for the mandatory, non-empty string fields.
    const requireText = (value, message) => {
        if (typeof value !== "string" || value === "") {
            throw new Error(message);
        }
        return value;
    };
    const name = requireText(doc.name, "task.yaml: name is required");
    const description = typeof doc.description === "string" ? doc.description : "";
    const workflow = requireText(doc.workflow, "task.yaml: workflow is required");
    const prompt = requireText(doc.prompt, "task.yaml: prompt is required");
    const limits = parseLimits(doc.limits);
    const judgeList = doc.judges;
    if (!Array.isArray(judgeList) || judgeList.length === 0) {
        throw new Error("task.yaml: at least one judge is required");
    }
    // Validate every judge in list order; the index feeds the error messages.
    const judges = Array.from(judgeList, (item, position) => parseJudgeEntry(item, position));
    return { name, description, workflow, prompt, limits, judges };
}
|
|
63
|
+
/**
 * Read `task.yaml` from `taskDir` as UTF-8 text and parse it into a TaskManifest.
 *
 * @param taskDir - Directory that contains the `task.yaml` file.
 * @returns A promise resolving to the parsed manifest; it rejects when the
 *   file cannot be read or when `parseTaskManifest` throws.
 */
export async function loadTaskManifest(taskDir) {
    const manifestText = await readFile(join(taskDir, "task.yaml"), "utf8");
    return parseTaskManifest(manifestText);
}
|
|
69
|
+
//# sourceMappingURL=loader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.js","sourceRoot":"","sources":["../../src/task/loader.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,MAAM,MAAM,CAAC;AAG1C,SAAS,QAAQ,CAAC,KAAc;IAC9B,OAAO,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;AAC9E,CAAC;AAED,SAAS,eAAe,CAAC,GAAY,EAAE,KAAa;IAClD,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CAAC,UAAU,KAAK,oBAAoB,CAAC,CAAC;IACvD,CAAC;IACD,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC;IACtB,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,EAAE,EAAE,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,UAAU,KAAK,qBAAqB,CAAC,CAAC;IACxD,CAAC;IACD,MAAM,MAAM,GAAG,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/D,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,KAAK,IAAI,CAAC;IACrC,MAAM,KAAK,GAAG,OAAO,GAAG,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;IAC/D,MAAM,MAAM,GAAG,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC;IAClE,IAAI,CAAC,OAAO,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;QAC/B,MAAM,IAAI,KAAK,CAAC,UAAU,KAAK,MAAM,IAAI,sCAAsC,CAAC,CAAC;IACnF,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC;AAClD,CAAC;AAED,SAAS,WAAW,CAAC,GAAY;IAC/B,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QACnB,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,cAAc,EAAE,EAAE,EAAE,CAAC;IAC9C,CAAC;IACD,OAAO;QACL,QAAQ,EAAE,OAAO,GAAG,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE;QAC9D,cAAc,EAAE,OAAO,GAAG,CAAC,cAAc,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,CAAC,EAAE;KACjF,CAAC;AACJ,CAAC;AAED,+DAA+D;AAC/D,MAAM,UAAU,iBAAiB,CAAC,QAAgB;IAChD,MAAM,GAAG,GAAG,SAAS,CAAC,QAAQ,CAAY,CAAC;IAC3C,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CAAC,kCAAkC,CAAC,CAAC;IACtD,CAAC;IACD,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC;IACtB,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,EAAE,EAAE,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,6BAA6B,CAAC,CAAC;IACjD,CAAC;IACD,MAAM,WAAW,GAAG,OAAO,GA
AG,CAAC,WAAW,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,CAAC;IAC/E,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,CAAC;IAC9B,IAAI,OAAO,QAAQ,KAAK,QAAQ,IAAI,QAAQ,KAAK,EAAE,EAAE,CAAC;QACpD,MAAM,IAAI,KAAK,CAAC,iCAAiC,CAAC,CAAC;IACrD,CAAC;IACD,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC;IAC1B,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,EAAE,EAAE,CAAC;QAChD,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACnD,CAAC;IACD,MAAM,MAAM,GAAG,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;IACvC,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,CAAC;IAC7B,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxD,MAAM,IAAI,KAAK,CAAC,2CAA2C,CAAC,CAAC;IAC/D,CAAC;IACD,MAAM,MAAM,GAAiB,EAAE,CAAC;IAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,CAAC,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;IAChD,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC;AACjE,CAAC;AAED,iDAAiD;AACjD,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,OAAe;IACpD,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;IAC5C,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAC9C,OAAO,iBAAiB,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/** Judge entry in task.yaml: one element of the manifest's `judges` list (the element type of `TaskManifest.judges`). */
|
|
2
|
+
export type JudgeEntry = {
|
|
3
|
+
/** Judge name; the task loader rejects an entry whose name is missing or empty. */
name: string;
|
|
4
|
+
/** Numeric weight; the task loader stores 0 when task.yaml gives no number. Presumably the judge's share of an aggregate score (TODO confirm against the scoring code). */
weight: number;
|
|
5
|
+
/** True only when task.yaml sets `builtin: true`; the task loader requires `entry` for every other judge. */
builtin: boolean;
|
|
6
|
+
/** Path to judge entry script (relative to task root). Required for non-builtin judges; null when task.yaml gives no string. */
|
|
7
|
+
entry: string | null;
|
|
8
|
+
/** Path to OCAS schema JSON for judge data. Required for non-builtin judges. NOTE(review): the task loader only enforces `entry`, not `schema`, and stores null when it is absent; confirm where this requirement is checked. */
|
|
9
|
+
schema: string | null;
|
|
10
|
+
};
|
|
11
|
+
/** Limits for eval execution, taken from the optional `limits` mapping of task.yaml. */
|
|
12
|
+
export type TaskLimits = {
|
|
13
|
+
/** Step limit; the task loader uses 20 when task.yaml gives no number. Presumably the maximum number of workflow steps per run (TODO confirm against the runner). */
maxSteps: number;
|
|
14
|
+
/** Timeout limit; the task loader uses 30 when task.yaml gives no number. Presumably expressed in minutes, per the field name (TODO confirm against the runner). */
timeoutMinutes: number;
|
|
15
|
+
};
|
|
16
|
+
/** Parsed task.yaml manifest, as produced by the task loader after validation. */
|
|
17
|
+
export type TaskManifest = {
|
|
18
|
+
/** Task name; the task loader rejects a manifest whose name is missing or empty. */
name: string;
|
|
19
|
+
/** Free-form description; the task loader stores an empty string when task.yaml gives no string. */
description: string;
|
|
20
|
+
/** Workflow name or relative path to .yaml file. */
|
|
21
|
+
workflow: string;
|
|
22
|
+
/** Initial prompt for thread start. */
|
|
23
|
+
prompt: string;
|
|
24
|
+
/** Execution limits; always populated, with loader defaults for any field task.yaml omits. */
limits: TaskLimits;
|
|
25
|
+
/** Judges for this task; the task loader rejects a manifest without at least one. */
judges: JudgeEntry[];
|
|
26
|
+
};
|
|
27
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/task/types.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,MAAM,MAAM,UAAU,GAAG;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,OAAO,CAAC;IACjB,2FAA2F;IAC3F,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,gFAAgF;IAChF,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;CACvB,CAAC;AAEF,iCAAiC;AACjC,MAAM,MAAM,UAAU,GAAG;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC;AAEF,iCAAiC;AACjC,MAAM,MAAM,YAAY,GAAG;IACzB,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,oDAAoD;IACpD,QAAQ,EAAE,MAAM,CAAC;IACjB,uCAAuC;IACvC,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,UAAU,CAAC;IACnB,MAAM,EAAE,UAAU,EAAE,CAAC;CACtB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/task/types.ts"],"names":[],"mappings":""}
|
package/package.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@united-workforce/eval",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"private": false,
|
|
5
|
+
"files": [
|
|
6
|
+
"src",
|
|
7
|
+
"dist",
|
|
8
|
+
"package.json"
|
|
9
|
+
],
|
|
10
|
+
"type": "module",
|
|
11
|
+
"bin": {
|
|
12
|
+
"uwf-eval": "./dist/cli.js"
|
|
13
|
+
},
|
|
14
|
+
"exports": {
|
|
15
|
+
".": {
|
|
16
|
+
"types": "./dist/index.d.ts",
|
|
17
|
+
"import": "./dist/index.js"
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
"dependencies": {
|
|
21
|
+
"@ocas/core": "^0.3.0",
|
|
22
|
+
"@ocas/fs": "^0.3.0",
|
|
23
|
+
"commander": "^14.0.3",
|
|
24
|
+
"yaml": "^2.9.0",
|
|
25
|
+
"@united-workforce/protocol": "^0.1.0",
|
|
26
|
+
"@united-workforce/util": "^0.1.0"
|
|
27
|
+
},
|
|
28
|
+
"devDependencies": {
|
|
29
|
+
"typescript": "^5.8.3"
|
|
30
|
+
},
|
|
31
|
+
"repository": {
|
|
32
|
+
"type": "git",
|
|
33
|
+
"url": "https://git.shazhou.work/shazhou/united-workforce.git",
|
|
34
|
+
"directory": "packages/eval"
|
|
35
|
+
},
|
|
36
|
+
"homepage": "https://git.shazhou.work/shazhou/united-workforce#readme",
|
|
37
|
+
"bugs": {
|
|
38
|
+
"url": "https://git.shazhou.work/shazhou/united-workforce/issues"
|
|
39
|
+
},
|
|
40
|
+
"license": "MIT",
|
|
41
|
+
"scripts": {
|
|
42
|
+
"test": "vitest run __tests__/",
|
|
43
|
+
"test:ci": "vitest run __tests__/"
|
|
44
|
+
}
|
|
45
|
+
}
|
package/src/cli.ts
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { Command } from "commander";
|
|
3
|
+
import {
|
|
4
|
+
registerDiffCommand,
|
|
5
|
+
registerListCommand,
|
|
6
|
+
registerReportCommand,
|
|
7
|
+
registerRunCommand,
|
|
8
|
+
} from "./commands/index.js";
|
|
9
|
+
|
|
10
|
+
// Root command of the `uwf-eval` CLI; every subcommand registers itself on it.
const program = new Command();

program
  .name("uwf-eval")
  .description("Evaluate uwf workflow quality with real agents")
  .version("0.1.0");

registerRunCommand(program);
registerReportCommand(program);
registerDiffCommand(program);
registerListCommand(program);

// The registered action handlers are async, so use parseAsync(): the returned
// promise settles once the selected action has finished, and a rejection is
// reported on stderr with a non-zero exit code instead of surfacing as an
// unhandled rejection.
program.parseAsync().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  process.stderr.write(`${message}\n`);
  process.exitCode = 1;
});
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { createLogger } from "@united-workforce/util";
|
|
2
|
+
import type { Command } from "commander";
|
|
3
|
+
|
|
4
|
+
import { createEvalStore } from "../storage/index.js";
|
|
5
|
+
import { formatDiff } from "./format.js";
|
|
6
|
+
import { readEvalRun } from "./read.js";
|
|
7
|
+
|
|
8
|
+
// Module-level logger configured with a stderr sink; presumably this keeps stdout free for the diff output itself (TODO confirm against createLogger).
const log = createLogger({ sink: { kind: "stderr" } });
|
|
9
|
+
// Tag passed as the first argument of the log() call in this module; presumably an opaque identifier for the diff log site (TODO confirm against the logger's conventions).
const LOG_DIFF = "D3WZ8N5T";
|
|
10
|
+
|
|
11
|
+
/**
 * Register the `diff <hash1> <hash2>` subcommand on `program`.
 *
 * The action opens the eval store, looks up both runs in argument order and
 * writes the formatted comparison to stdout. If a run is missing, or any step
 * throws, a message is written to stderr and `process.exitCode` is set to 1.
 *
 * @param program - Commander program the subcommand is attached to.
 */
export function registerDiffCommand(program: Command): void {
  const runDiff = async (hash1: string, hash2: string): Promise<void> => {
    try {
      const store = await createEvalStore();

      // Look up one run; report and flag failure when it does not exist.
      const lookup = (hash: string) => {
        const payload = readEvalRun(store, hash);
        if (payload === null) {
          process.stderr.write(`eval run not found: ${hash}\n`);
          process.exitCode = 1;
        }
        return payload;
      };

      // Stop at the first missing run so the second lookup is not attempted.
      const first = lookup(hash1);
      if (first === null) {
        return;
      }
      const second = lookup(hash2);
      if (second === null) {
        return;
      }

      log(LOG_DIFF, `diff a=${hash1} b=${hash2}`);
      process.stdout.write(formatDiff(first, hash1, second, hash2));
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      process.stderr.write(`${message}\n`);
      process.exitCode = 1;
    }
  };

  program
    .command("diff <hash1> <hash2>")
    .description("Compare two eval runs side-by-side")
    .action(runDiff);
}
|