@united-workforce/eval 0.1.3 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.build-fingerprint +1 -0
- package/dist/commands/list.js +1 -1
- package/dist/judge/builtin/frontmatter.d.ts.map +1 -1
- package/dist/judge/builtin/frontmatter.js +9 -48
- package/dist/judge/builtin/frontmatter.js.map +1 -1
- package/dist/judge/builtin/read-steps.d.ts +37 -3
- package/dist/judge/builtin/read-steps.d.ts.map +1 -1
- package/dist/judge/builtin/read-steps.js +28 -4
- package/dist/judge/builtin/read-steps.js.map +1 -1
- package/dist/judge/builtin/token-stats.d.ts.map +1 -1
- package/dist/judge/builtin/token-stats.js +3 -2
- package/dist/judge/builtin/token-stats.js.map +1 -1
- package/dist/runner/execute.d.ts +4 -0
- package/dist/runner/execute.d.ts.map +1 -1
- package/dist/runner/execute.js +37 -8
- package/dist/runner/execute.js.map +1 -1
- package/package.json +5 -5
- package/src/commands/list.ts +1 -1
- package/src/judge/builtin/frontmatter.ts +8 -54
- package/src/judge/builtin/read-steps.ts +83 -7
- package/src/judge/builtin/token-stats.ts +3 -2
- package/src/runner/execute.ts +37 -8
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ab2ab0e476768b6700c94fa8168e1b7c6fa60254bffe14047335c0b29c523971
|
package/dist/commands/list.js
CHANGED
|
@@ -3,7 +3,7 @@ import { createEvalStore } from "../storage/index.js";
|
|
|
3
3
|
import { formatList, selectEntries } from "./format.js";
|
|
4
4
|
import { readEvalEntries } from "./read.js";
|
|
5
5
|
const log = createLogger({ sink: { kind: "stderr" } });
|
|
6
|
-
const LOG_LIST = "
|
|
6
|
+
const LOG_LIST = "H5KX9R2B";
|
|
7
7
|
export function registerListCommand(program) {
|
|
8
8
|
program
|
|
9
9
|
.command("list")
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"frontmatter.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/frontmatter.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"frontmatter.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/frontmatter.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAwBrD;;;;GAIG;AACH,wBAAsB,mBAAmB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAyBvF"}
|
|
@@ -1,56 +1,14 @@
|
|
|
1
1
|
import { createLogger } from "@united-workforce/util";
|
|
2
|
-
import { parse as parseYaml } from "yaml";
|
|
3
2
|
import { EVAL_JUDGE_FRONTMATTER_SCHEMA } from "../../storage/index.js";
|
|
4
|
-
import { readThreadSteps } from "./read-steps.js";
|
|
3
|
+
import { readStepDetail, readThreadSteps } from "./read-steps.js";
|
|
5
4
|
const log = createLogger({ sink: { kind: "stderr" } });
|
|
6
5
|
const LOG_RESULT = "F2QH7R4M";
|
|
7
|
-
const FENCE = "---";
|
|
8
|
-
/**
|
|
9
|
-
* Extract the YAML frontmatter block from a step output. Returns the inner YAML
|
|
10
|
-
* string when the output starts with a `---\n` block closed by a `\n---` fence,
|
|
11
|
-
* otherwise null.
|
|
12
|
-
*/
|
|
13
|
-
function extractFrontmatterYaml(output) {
|
|
14
|
-
if (typeof output !== "string") {
|
|
15
|
-
return null;
|
|
16
|
-
}
|
|
17
|
-
if (!output.startsWith(`${FENCE}\n`)) {
|
|
18
|
-
return null;
|
|
19
|
-
}
|
|
20
|
-
const rest = output.slice(FENCE.length + 1);
|
|
21
|
-
const closeIndex = rest.indexOf(`\n${FENCE}`);
|
|
22
|
-
if (closeIndex === -1) {
|
|
23
|
-
return null;
|
|
24
|
-
}
|
|
25
|
-
return rest.slice(0, closeIndex);
|
|
26
|
-
}
|
|
27
6
|
/** Validate a single step's frontmatter, returning a list of errors (empty = valid). */
|
|
28
|
-
function validateStepFrontmatter(
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
if (typeof output === "object" && output !== null && !Array.isArray(output)) {
|
|
32
|
-
const status = output.$status;
|
|
33
|
-
if (typeof status !== "string" || status.trim() === "") {
|
|
34
|
-
return ["$status field is missing or not a non-empty string"];
|
|
35
|
-
}
|
|
36
|
-
return [];
|
|
37
|
-
}
|
|
38
|
-
const yaml = extractFrontmatterYaml(output);
|
|
39
|
-
if (yaml === null) {
|
|
40
|
-
return ["output does not begin with a valid '---' frontmatter block"];
|
|
41
|
-
}
|
|
42
|
-
let parsed;
|
|
43
|
-
try {
|
|
44
|
-
parsed = parseYaml(yaml);
|
|
45
|
-
}
|
|
46
|
-
catch (e) {
|
|
47
|
-
const message = e instanceof Error ? e.message : String(e);
|
|
48
|
-
return [`frontmatter YAML failed to parse: ${message}`];
|
|
49
|
-
}
|
|
50
|
-
if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
|
|
51
|
-
return ["frontmatter is not a YAML mapping"];
|
|
7
|
+
function validateStepFrontmatter(frontmatter) {
|
|
8
|
+
if (Object.keys(frontmatter).length === 0) {
|
|
9
|
+
return ["step has no frontmatter"];
|
|
52
10
|
}
|
|
53
|
-
const status =
|
|
11
|
+
const status = frontmatter.$status;
|
|
54
12
|
if (typeof status !== "string" || status.trim() === "") {
|
|
55
13
|
return ["$status field is missing or not a non-empty string"];
|
|
56
14
|
}
|
|
@@ -66,7 +24,10 @@ export async function runFrontmatterJudge(threadId) {
|
|
|
66
24
|
const invalidSteps = [];
|
|
67
25
|
for (let i = 0; i < steps.length; i++) {
|
|
68
26
|
const step = steps[i];
|
|
69
|
-
|
|
27
|
+
if (step === undefined)
|
|
28
|
+
continue;
|
|
29
|
+
const detail = readStepDetail(step.hash);
|
|
30
|
+
const errors = validateStepFrontmatter(detail.frontmatter);
|
|
70
31
|
if (errors.length > 0) {
|
|
71
32
|
invalidSteps.push({ stepIndex: i, role: step.role, errors });
|
|
72
33
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"frontmatter.js","sourceRoot":"","sources":["../../../src/judge/builtin/frontmatter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;
|
|
1
|
+
{"version":3,"file":"frontmatter.js","sourceRoot":"","sources":["../../../src/judge/builtin/frontmatter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEtD,OAAO,EAAE,6BAA6B,EAAE,MAAM,wBAAwB,CAAC;AACvE,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAGlE,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,UAAU,GAAG,UAAU,CAAC;AAQ9B,wFAAwF;AACxF,SAAS,uBAAuB,CAAC,WAAoC;IACnE,IAAI,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1C,OAAO,CAAC,yBAAyB,CAAC,CAAC;IACrC,CAAC;IACD,MAAM,MAAM,GAAG,WAAW,CAAC,OAAO,CAAC;IACnC,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;QACvD,OAAO,CAAC,oDAAoD,CAAC,CAAC;IAChE,CAAC;IACD,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,QAAgB;IACxD,MAAM,KAAK,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;IAExC,MAAM,YAAY,GAAkB,EAAE,CAAC;IACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,IAAI,IAAI,KAAK,SAAS;YAAE,SAAS;QACjC,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,uBAAuB,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;QAC3D,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,YAAY,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,KAAK,CAAC,MAAM,CAAC;IAChC,MAAM,UAAU,GAAG,UAAU,GAAG,YAAY,CAAC,MAAM,CAAC;IACpD,MAAM,KAAK,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;IAE3D,GAAG,CAAC,UAAU,EAAE,sBAAsB,QAAQ,UAAU,UAAU,IAAI,UAAU,EAAE,CAAC,CAAC;IAEpF,OAAO;QACL,KAAK;QACL,IAAI,EAAE,EAAE,UAAU,EAAE,UAAU,EAAE,YAAY,EAAE;QAC9C,MAAM,EAAE,6BAA6B;KACtC,CAAC;AACJ,CAAC"}
|
|
@@ -1,4 +1,38 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
/**
|
|
2
|
+
* A single step entry as exposed by `uwf step list --format raw-json` under 0.6.
|
|
3
|
+
*
|
|
4
|
+
* Richer per-step data (frontmatter, turns, agent, usage) lives in the step
|
|
5
|
+
* detail node and is fetched separately via `readStepDetail(hash)` when needed.
|
|
6
|
+
*/
|
|
7
|
+
export type StepListEntry = {
|
|
8
|
+
hash: string;
|
|
9
|
+
role: string;
|
|
10
|
+
durationMs: number | null;
|
|
11
|
+
};
|
|
12
|
+
/** Shell out to `uwf step list --format raw-json` and return the bare-value payload's items. */
|
|
13
|
+
export declare function readThreadSteps(threadId: string): StepListEntry[];
|
|
14
|
+
/**
|
|
15
|
+
* Per-step detail surface used by builtin judges. Mirrors the
|
|
16
|
+
* `StepDetailPayload` schema (`@uwf/output/step-detail`) but only exposes the
|
|
17
|
+
* fields judges currently consume.
|
|
18
|
+
*/
|
|
19
|
+
export type StepDetail = {
|
|
20
|
+
hash: string;
|
|
21
|
+
role: string;
|
|
22
|
+
agent: string;
|
|
23
|
+
durationMs: number | null;
|
|
24
|
+
frontmatter: Record<string, unknown>;
|
|
25
|
+
usage: {
|
|
26
|
+
turns: number;
|
|
27
|
+
inputTokens: number;
|
|
28
|
+
outputTokens: number;
|
|
29
|
+
duration: number;
|
|
30
|
+
} | null;
|
|
31
|
+
};
|
|
32
|
+
/**
|
|
33
|
+
* Shell out to `uwf step show <hash> --format raw-json` and return the bare-value
|
|
34
|
+
* step-detail payload. Used by judges that need richer per-step data than
|
|
35
|
+
* `readThreadSteps` exposes (e.g. frontmatter contents, token usage).
|
|
36
|
+
*/
|
|
37
|
+
export declare function readStepDetail(stepHash: string): StepDetail;
|
|
4
38
|
//# sourceMappingURL=read-steps.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"read-steps.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/read-steps.ts"],"names":[],"mappings":"AAEA,
|
|
1
|
+
{"version":3,"file":"read-steps.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/read-steps.ts"],"names":[],"mappings":"AAEA;;;;;GAKG;AACH,MAAM,MAAM,aAAa,GAAG;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B,CAAC;AAOF,gGAAgG;AAChG,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa,EAAE,CAQjE;AAED;;;;GAIG;AACH,MAAM,MAAM,UAAU,GAAG;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACrC,KAAK,EAAE;QACL,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,QAAQ,EAAE,MAAM,CAAC;KAClB,GAAG,IAAI,CAAC;CACV,CAAC;AAkBF;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,UAAU,CAmB3D"}
|
|
@@ -1,12 +1,36 @@
|
|
|
1
1
|
import { execFileSync } from "node:child_process";
|
|
2
|
-
/** Shell out to `uwf step list` and return the
|
|
2
|
+
/** Shell out to `uwf step list --format raw-json` and return the bare-value payload's items. */
|
|
3
3
|
export function readThreadSteps(threadId) {
|
|
4
|
-
const stdout = execFileSync("uwf", ["step", "list", threadId], {
|
|
4
|
+
const stdout = execFileSync("uwf", ["--format", "raw-json", "step", "list", threadId], {
|
|
5
5
|
encoding: "utf8",
|
|
6
6
|
stdio: ["ignore", "pipe", "pipe"],
|
|
7
7
|
}).trim();
|
|
8
8
|
const parsed = JSON.parse(stdout);
|
|
9
|
-
//
|
|
10
|
-
return parsed.
|
|
9
|
+
// The 0.6 payload does not include a synthetic start entry — every item is a real step.
|
|
10
|
+
return parsed.items;
|
|
11
|
+
}
|
|
12
|
+
/**
|
|
13
|
+
* Shell out to `uwf step show <hash> --format raw-json` and return the bare-value
|
|
14
|
+
* step-detail payload. Used by judges that need richer per-step data than
|
|
15
|
+
* `readThreadSteps` exposes (e.g. frontmatter contents, token usage).
|
|
16
|
+
*/
|
|
17
|
+
export function readStepDetail(stepHash) {
|
|
18
|
+
const stdout = execFileSync("uwf", ["--format", "raw-json", "step", "show", stepHash], {
|
|
19
|
+
encoding: "utf8",
|
|
20
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
21
|
+
}).trim();
|
|
22
|
+
const parsed = JSON.parse(stdout);
|
|
23
|
+
return {
|
|
24
|
+
hash: parsed.hash,
|
|
25
|
+
role: parsed.role,
|
|
26
|
+
agent: parsed.agent,
|
|
27
|
+
durationMs: parsed.durationMs,
|
|
28
|
+
frontmatter: parsed.frontmatter !== null &&
|
|
29
|
+
typeof parsed.frontmatter === "object" &&
|
|
30
|
+
!Array.isArray(parsed.frontmatter)
|
|
31
|
+
? parsed.frontmatter
|
|
32
|
+
: {},
|
|
33
|
+
usage: parsed.usage,
|
|
34
|
+
};
|
|
11
35
|
}
|
|
12
36
|
//# sourceMappingURL=read-steps.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"read-steps.js","sourceRoot":"","sources":["../../../src/judge/builtin/read-steps.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;
|
|
1
|
+
{"version":3,"file":"read-steps.js","sourceRoot":"","sources":["../../../src/judge/builtin/read-steps.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAmBlD,gGAAgG;AAChG,MAAM,UAAU,eAAe,CAAC,QAAgB;IAC9C,MAAM,MAAM,GAAG,YAAY,CAAC,KAAK,EAAE,CAAC,UAAU,EAAE,UAAU,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE;QACrF,QAAQ,EAAE,MAAM;QAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;KAClC,CAAC,CAAC,IAAI,EAAE,CAAC;IACV,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAoB,CAAC;IACrD,wFAAwF;IACxF,OAAO,MAAM,CAAC,KAAK,CAAC;AACtB,CAAC;AAqCD;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,QAAgB;IAC7C,MAAM,MAAM,GAAG,YAAY,CAAC,KAAK,EAAE,CAAC,UAAU,EAAE,UAAU,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE;QACrF,QAAQ,EAAE,MAAM;QAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;KAClC,CAAC,CAAC,IAAI,EAAE,CAAC;IACV,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAyB,CAAC;IAC1D,OAAO;QACL,IAAI,EAAE,MAAM,CAAC,IAAI;QACjB,IAAI,EAAE,MAAM,CAAC,IAAI;QACjB,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,UAAU,EAAE,MAAM,CAAC,UAAU;QAC7B,WAAW,EACT,MAAM,CAAC,WAAW,KAAK,IAAI;YAC3B,OAAO,MAAM,CAAC,WAAW,KAAK,QAAQ;YACtC,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,WAAW,CAAC;YAChC,CAAC,CAAC,MAAM,CAAC,WAAW;YACpB,CAAC,CAAC,EAAE;QACR,KAAK,EAAE,MAAM,CAAC,KAAK;KACpB,CAAC;AACJ,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"token-stats.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/token-stats.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAcrD;;;;GAIG;AACH,wBAAsB,kBAAkB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC,
|
|
1
|
+
{"version":3,"file":"token-stats.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/token-stats.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAcrD;;;;GAIG;AACH,wBAAsB,kBAAkB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC,CA8BtF"}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { createLogger } from "@united-workforce/util";
|
|
2
2
|
import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
|
|
3
|
-
import { readThreadSteps } from "./read-steps.js";
|
|
3
|
+
import { readStepDetail, readThreadSteps } from "./read-steps.js";
|
|
4
4
|
const log = createLogger({ sink: { kind: "stderr" } });
|
|
5
5
|
const LOG_RESULT = "T7KQ3M9P";
|
|
6
6
|
/**
|
|
@@ -15,7 +15,8 @@ export async function runTokenStatsJudge(threadId) {
|
|
|
15
15
|
let totalTurns = 0;
|
|
16
16
|
const perStep = [];
|
|
17
17
|
for (const step of steps) {
|
|
18
|
-
const
|
|
18
|
+
const detail = readStepDetail(step.hash);
|
|
19
|
+
const usage = detail.usage;
|
|
19
20
|
const inputTokens = usage !== null ? usage.inputTokens : 0;
|
|
20
21
|
const outputTokens = usage !== null ? usage.outputTokens : 0;
|
|
21
22
|
const turns = usage !== null ? usage.turns : 0;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"token-stats.js","sourceRoot":"","sources":["../../../src/judge/builtin/token-stats.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEtD,OAAO,EAAE,6BAA6B,EAAE,MAAM,wBAAwB,CAAC;AACvE,OAAO,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;
|
|
1
|
+
{"version":3,"file":"token-stats.js","sourceRoot":"","sources":["../../../src/judge/builtin/token-stats.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEtD,OAAO,EAAE,6BAA6B,EAAE,MAAM,wBAAwB,CAAC;AACvE,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAGlE,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,UAAU,GAAG,UAAU,CAAC;AAU9B;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,QAAgB;IACvD,MAAM,KAAK,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;IAExC,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,MAAM,OAAO,GAAmB,EAAE,CAAC;IAEnC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC;QAC3B,MAAM,WAAW,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,YAAY,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7D,MAAM,KAAK,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/C,MAAM,QAAQ,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAErD,UAAU,IAAI,WAAW,CAAC;QAC1B,WAAW,IAAI,YAAY,CAAC;QAC5B,UAAU,IAAI,KAAK,CAAC;QAEpB,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,WAAW,EAAE,YAAY,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;IAChF,CAAC;IAED,GAAG,CAAC,UAAU,EAAE,sBAAsB,QAAQ,OAAO,UAAU,QAAQ,WAAW,EAAE,CAAC,CAAC;IAEtF,OAAO;QACL,KAAK,EAAE,GAAG;QACV,IAAI,EAAE,EAAE,UAAU,EAAE,WAAW,EAAE,UAAU,EAAE,OAAO,EAAE;QACtD,MAAM,EAAE,6BAA6B;KACtC,CAAC;AACJ,CAAC"}
|
package/dist/runner/execute.d.ts
CHANGED
|
@@ -2,6 +2,10 @@ import type { ExecuteInput, ExecuteResult } from "./types.js";
|
|
|
2
2
|
/**
|
|
3
3
|
* Execute a workflow: create a thread, then run it for up to `maxSteps` steps.
|
|
4
4
|
* Shells out to the uwf CLI rather than importing it directly.
|
|
5
|
+
*
|
|
6
|
+
* Both `thread start` and `thread exec` are invoked with `--format raw-json`
|
|
7
|
+
* so the legacy bare-value JSON shape is emitted (the 0.6 default is text).
|
|
8
|
+
* See `specs/cli-ocas-envelope-in-repo-consumer-migration.md`.
|
|
5
9
|
*/
|
|
6
10
|
export declare function execute(input: ExecuteInput): Promise<ExecuteResult>;
|
|
7
11
|
/** Best-effort lookup of the uwf engine version (`uwf -V`); "unknown" on failure. */
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../../src/runner/execute.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../../src/runner/execute.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAwD9D;;;;;;;GAOG;AACH,wBAAsB,OAAO,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,aAAa,CAAC,CAmCzE;AAED,qFAAqF;AACrF,wBAAgB,gBAAgB,IAAI,MAAM,CASzC"}
|
package/dist/runner/execute.js
CHANGED
|
@@ -26,10 +26,15 @@ function runUwf(args, cwd) {
|
|
|
26
26
|
? err.stderr
|
|
27
27
|
: err.stderr.toString("utf8");
|
|
28
28
|
const detail = stderr.trim() !== "" ? `: ${stderr.trim()}` : "";
|
|
29
|
-
|
|
29
|
+
// Find the subcommand group + subcommand by skipping leading global flags
|
|
30
|
+
// (e.g. `--format raw-json`). The first non-flag token is the group.
|
|
31
|
+
const groupIdx = args.findIndex((a) => !a.startsWith("--"));
|
|
32
|
+
const group = groupIdx >= 0 ? (args[groupIdx] ?? "") : "";
|
|
33
|
+
const subcmd = groupIdx >= 0 ? (args[groupIdx + 1] ?? "") : "";
|
|
34
|
+
throw new Error(`uwf ${group} ${subcmd} failed${detail}`);
|
|
30
35
|
}
|
|
31
36
|
}
|
|
32
|
-
/** Parse the thread ID from `uwf thread start`
|
|
37
|
+
/** Parse the thread ID from `uwf thread start --format raw-json` output (`{ threadId, workflowHash }`). */
|
|
33
38
|
function parseThreadId(stdout) {
|
|
34
39
|
let parsed;
|
|
35
40
|
try {
|
|
@@ -39,21 +44,45 @@ function parseThreadId(stdout) {
|
|
|
39
44
|
throw new Error(`uwf thread start did not emit valid JSON: ${stdout || "(empty)"}`);
|
|
40
45
|
}
|
|
41
46
|
const obj = parsed;
|
|
42
|
-
const
|
|
43
|
-
if (typeof
|
|
44
|
-
throw new Error(`uwf thread start output missing
|
|
47
|
+
const threadId = obj.threadId;
|
|
48
|
+
if (typeof threadId !== "string" || threadId === "") {
|
|
49
|
+
throw new Error(`uwf thread start output missing threadId: ${stdout}`);
|
|
45
50
|
}
|
|
46
|
-
return
|
|
51
|
+
return threadId;
|
|
47
52
|
}
|
|
48
53
|
/**
|
|
49
54
|
* Execute a workflow: create a thread, then run it for up to `maxSteps` steps.
|
|
50
55
|
* Shells out to the uwf CLI rather than importing it directly.
|
|
56
|
+
*
|
|
57
|
+
* Both `thread start` and `thread exec` are invoked with `--format raw-json`
|
|
58
|
+
* so the legacy bare-value JSON shape is emitted (the 0.6 default is text).
|
|
59
|
+
* See `specs/cli-ocas-envelope-in-repo-consumer-migration.md`.
|
|
51
60
|
*/
|
|
52
61
|
export async function execute(input) {
|
|
53
|
-
const startOut = runUwf([
|
|
62
|
+
const startOut = runUwf([
|
|
63
|
+
"--format",
|
|
64
|
+
"raw-json",
|
|
65
|
+
"thread",
|
|
66
|
+
"start",
|
|
67
|
+
input.workflow,
|
|
68
|
+
"-p",
|
|
69
|
+
input.prompt,
|
|
70
|
+
"--cwd",
|
|
71
|
+
input.workDir,
|
|
72
|
+
], input.workDir);
|
|
54
73
|
const threadId = parseThreadId(startOut);
|
|
55
74
|
log(LOG_START, `thread started thread=${threadId} workflow=${input.workflow}`);
|
|
56
|
-
runUwf([
|
|
75
|
+
runUwf([
|
|
76
|
+
"--format",
|
|
77
|
+
"raw-json",
|
|
78
|
+
"thread",
|
|
79
|
+
"exec",
|
|
80
|
+
threadId,
|
|
81
|
+
"--agent",
|
|
82
|
+
input.agent,
|
|
83
|
+
"-c",
|
|
84
|
+
String(input.maxSteps),
|
|
85
|
+
], input.workDir);
|
|
57
86
|
log(LOG_EXEC, `thread executed thread=${threadId} maxSteps=${input.maxSteps}`);
|
|
58
87
|
return { threadId };
|
|
59
88
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"execute.js","sourceRoot":"","sources":["../../src/runner/execute.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAElD,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAItD,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,SAAS,GAAG,UAAU,CAAC;AAC7B,MAAM,QAAQ,GAAG,UAAU,CAAC;AAE5B,uEAAuE;AACvE,SAAS,MAAM;IACb,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC;IACrC,OAAO,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC;AACtE,CAAC;AAED,sDAAsD;AACtD,SAAS,MAAM,CAAC,IAAc,EAAE,GAAW;IACzC,IAAI,CAAC;QACH,OAAO,YAAY,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE;YAClC,QAAQ,EAAE,MAAM;YAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;YACjC,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI;YAC3B,GAAG;SACJ,CAAC,CAAC,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,GAAG,GAAG,CAAgE,CAAC;QAC7E,MAAM,MAAM,GACV,GAAG,CAAC,MAAM,IAAI,IAAI;YAChB,CAAC,CAAC,EAAE;YACJ,CAAC,CAAC,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ;gBAC9B,CAAC,CAAC,GAAG,CAAC,MAAM;gBACZ,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACpC,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAChE,MAAM,IAAI,
|
|
1
|
+
{"version":3,"file":"execute.js","sourceRoot":"","sources":["../../src/runner/execute.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAElD,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAItD,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,SAAS,GAAG,UAAU,CAAC;AAC7B,MAAM,QAAQ,GAAG,UAAU,CAAC;AAE5B,uEAAuE;AACvE,SAAS,MAAM;IACb,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC;IACrC,OAAO,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC;AACtE,CAAC;AAED,sDAAsD;AACtD,SAAS,MAAM,CAAC,IAAc,EAAE,GAAW;IACzC,IAAI,CAAC;QACH,OAAO,YAAY,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE;YAClC,QAAQ,EAAE,MAAM;YAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;YACjC,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI;YAC3B,GAAG;SACJ,CAAC,CAAC,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,GAAG,GAAG,CAAgE,CAAC;QAC7E,MAAM,MAAM,GACV,GAAG,CAAC,MAAM,IAAI,IAAI;YAChB,CAAC,CAAC,EAAE;YACJ,CAAC,CAAC,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ;gBAC9B,CAAC,CAAC,GAAG,CAAC,MAAM;gBACZ,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACpC,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAChE,0EAA0E;QAC1E,qEAAqE;QACrE,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC;QAC5D,MAAM,KAAK,GAAG,QAAQ,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC1D,MAAM,MAAM,GAAG,QAAQ,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC/D,MAAM,IAAI,KAAK,CAAC,OAAO,KAAK,IAAI,MAAM,UAAU,MAAM,EAAE,CAAC,CAAC;IAC5D,CAAC;AACH,CAAC;AAED,2GAA2G;AAC3G,SAAS,aAAa,CAAC,MAAc;IACnC,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC9B,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,6CAA6C,MAAM,IAAI,SAAS,EAAE,CAAC,CAAC;IACtF,CAAC;IACD,MAAM,GAAG,GAAG,MAAiC,CAAC;IAC9C,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,CAAC;IAC9B,IAAI,OAAO,QAAQ,KAAK,QAAQ,IAAI,QAAQ,KAAK,EAAE,EAAE,CAAC;QACpD,MAAM,IAAI,KAAK,CAAC,6CAA6C,MAAM,EAAE,CAAC,CAAC;IACzE,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,KAAmB;IAC/C,MAAM,QAAQ,GAAG,MAAM,CACrB;QACE,UAAU;QACV,UAAU;QACV,QAAQ;QACR,OAAO;QACP,KAAK,CAAC,QAAQ;QACd,IAAI;QACJ,KAAK,CAAC,MAAM;QACZ,OAAO;QACP,KAAK,CAAC,OAAO;KACd,EACD,KAAK,CAAC,OAAO,CACd,CAAC;IACF,MAAM,QAAQ,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC;IACzC,GAAG,CAAC,SAAS,EAAE,yBAAyB,QAAQ,aAAa,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;IAE/E,MAAM,CACJ;QACE,UAAU;QACV,UAAU;QACV,QAAQ;QACR,MAAM;QACN,QAAQ;QACR,SAAS;QACT,KAAK,CAAC,KAAK;QACX,IAAI;QACJ,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC;KACvB,EACD,KAAK,CAAC,OAAO,CACd,CAAC;IACF,GAAG,CAAC,QAAQ,EAAE,0BAA0B,QAAQ,aAAa,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;IAE/E,OAAO,EAAE,QAAQ,EAAE,CAAC;AACtB,CAAC;AAED,qFAAqF;AACrF,MAAM,UAAU,gBAAgB;IAC9B,IAAI,CAAC;QACH,OAAO,YAAY,CAAC,MAAM,EAAE,EAAE,CAAC,IAAI,CAAC,EAAE;YACpC,QAAQ,EAAE,MAAM;YAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC;SACpC,CAAC,CAAC,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@united-workforce/eval",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.6",
|
|
4
4
|
"private": false,
|
|
5
5
|
"files": [
|
|
6
6
|
"src",
|
|
@@ -18,12 +18,12 @@
|
|
|
18
18
|
}
|
|
19
19
|
},
|
|
20
20
|
"dependencies": {
|
|
21
|
-
"@ocas/core": "^0.
|
|
22
|
-
"@ocas/fs": "^0.
|
|
21
|
+
"@ocas/core": "^0.5.0",
|
|
22
|
+
"@ocas/fs": "^0.4.1",
|
|
23
23
|
"commander": "^14.0.3",
|
|
24
24
|
"yaml": "^2.9.0",
|
|
25
|
-
"@united-workforce/protocol": "^0.
|
|
26
|
-
"@united-workforce/util": "^0.
|
|
25
|
+
"@united-workforce/protocol": "^0.3.0",
|
|
26
|
+
"@united-workforce/util": "^0.2.0"
|
|
27
27
|
},
|
|
28
28
|
"devDependencies": {
|
|
29
29
|
"typescript": "^5.8.3"
|
package/src/commands/list.ts
CHANGED
|
@@ -6,7 +6,7 @@ import { formatList, selectEntries } from "./format.js";
|
|
|
6
6
|
import { readEvalEntries } from "./read.js";
|
|
7
7
|
|
|
8
8
|
const log = createLogger({ sink: { kind: "stderr" } });
|
|
9
|
-
const LOG_LIST = "
|
|
9
|
+
const LOG_LIST = "H5KX9R2B";
|
|
10
10
|
|
|
11
11
|
type ListCliOptions = {
|
|
12
12
|
task: string | undefined;
|
|
@@ -1,76 +1,28 @@
|
|
|
1
1
|
import { createLogger } from "@united-workforce/util";
|
|
2
|
-
import { parse as parseYaml } from "yaml";
|
|
3
2
|
|
|
4
3
|
import { EVAL_JUDGE_FRONTMATTER_SCHEMA } from "../../storage/index.js";
|
|
5
|
-
import { readThreadSteps } from "./read-steps.js";
|
|
4
|
+
import { readStepDetail, readThreadSteps } from "./read-steps.js";
|
|
6
5
|
import type { BuiltinJudgeOutput } from "./types.js";
|
|
7
6
|
|
|
8
7
|
const log = createLogger({ sink: { kind: "stderr" } });
|
|
9
8
|
|
|
10
9
|
const LOG_RESULT = "F2QH7R4M";
|
|
11
10
|
|
|
12
|
-
const FENCE = "---";
|
|
13
|
-
|
|
14
11
|
type InvalidStep = {
|
|
15
12
|
stepIndex: number;
|
|
16
13
|
role: string;
|
|
17
14
|
errors: string[];
|
|
18
15
|
};
|
|
19
16
|
|
|
20
|
-
/**
|
|
21
|
-
* Extract the YAML frontmatter block from a step output. Returns the inner YAML
|
|
22
|
-
* string when the output starts with a `---\n` block closed by a `\n---` fence,
|
|
23
|
-
* otherwise null.
|
|
24
|
-
*/
|
|
25
|
-
function extractFrontmatterYaml(output: unknown): string | null {
|
|
26
|
-
if (typeof output !== "string") {
|
|
27
|
-
return null;
|
|
28
|
-
}
|
|
29
|
-
if (!output.startsWith(`${FENCE}\n`)) {
|
|
30
|
-
return null;
|
|
31
|
-
}
|
|
32
|
-
const rest = output.slice(FENCE.length + 1);
|
|
33
|
-
const closeIndex = rest.indexOf(`\n${FENCE}`);
|
|
34
|
-
if (closeIndex === -1) {
|
|
35
|
-
return null;
|
|
36
|
-
}
|
|
37
|
-
return rest.slice(0, closeIndex);
|
|
38
|
-
}
|
|
39
|
-
|
|
40
17
|
/** Validate a single step's frontmatter, returning a list of errors (empty = valid). */
|
|
41
|
-
function validateStepFrontmatter(
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
if (typeof output === "object" && output !== null && !Array.isArray(output)) {
|
|
45
|
-
const status = (output as Record<string, unknown>).$status;
|
|
46
|
-
if (typeof status !== "string" || status.trim() === "") {
|
|
47
|
-
return ["$status field is missing or not a non-empty string"];
|
|
48
|
-
}
|
|
49
|
-
return [];
|
|
18
|
+
function validateStepFrontmatter(frontmatter: Record<string, unknown>): string[] {
|
|
19
|
+
if (Object.keys(frontmatter).length === 0) {
|
|
20
|
+
return ["step has no frontmatter"];
|
|
50
21
|
}
|
|
51
|
-
|
|
52
|
-
const yaml = extractFrontmatterYaml(output);
|
|
53
|
-
if (yaml === null) {
|
|
54
|
-
return ["output does not begin with a valid '---' frontmatter block"];
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
let parsed: unknown;
|
|
58
|
-
try {
|
|
59
|
-
parsed = parseYaml(yaml);
|
|
60
|
-
} catch (e) {
|
|
61
|
-
const message = e instanceof Error ? e.message : String(e);
|
|
62
|
-
return [`frontmatter YAML failed to parse: ${message}`];
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
|
|
66
|
-
return ["frontmatter is not a YAML mapping"];
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
const status = (parsed as Record<string, unknown>).$status;
|
|
22
|
+
const status = frontmatter.$status;
|
|
70
23
|
if (typeof status !== "string" || status.trim() === "") {
|
|
71
24
|
return ["$status field is missing or not a non-empty string"];
|
|
72
25
|
}
|
|
73
|
-
|
|
74
26
|
return [];
|
|
75
27
|
}
|
|
76
28
|
|
|
@@ -85,7 +37,9 @@ export async function runFrontmatterJudge(threadId: string): Promise<BuiltinJudg
|
|
|
85
37
|
const invalidSteps: InvalidStep[] = [];
|
|
86
38
|
for (let i = 0; i < steps.length; i++) {
|
|
87
39
|
const step = steps[i];
|
|
88
|
-
|
|
40
|
+
if (step === undefined) continue;
|
|
41
|
+
const detail = readStepDetail(step.hash);
|
|
42
|
+
const errors = validateStepFrontmatter(detail.frontmatter);
|
|
89
43
|
if (errors.length > 0) {
|
|
90
44
|
invalidSteps.push({ stepIndex: i, role: step.role, errors });
|
|
91
45
|
}
|
|
@@ -1,14 +1,90 @@
|
|
|
1
1
|
import { execFileSync } from "node:child_process";
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
/**
|
|
4
|
+
* A single step entry as exposed by `uwf step list --format raw-json` under 0.6.
|
|
5
|
+
*
|
|
6
|
+
* Richer per-step data (frontmatter, turns, agent, usage) lives in the step
|
|
7
|
+
* detail node and is fetched separately via `readStepDetail(hash)` when needed.
|
|
8
|
+
*/
|
|
9
|
+
export type StepListEntry = {
|
|
10
|
+
hash: string;
|
|
11
|
+
role: string;
|
|
12
|
+
durationMs: number | null;
|
|
13
|
+
};
|
|
4
14
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
15
|
+
type StepListPayload = {
|
|
16
|
+
threadId: string;
|
|
17
|
+
items: StepListEntry[];
|
|
18
|
+
};
|
|
19
|
+
|
|
20
|
+
/** Shell out to `uwf step list --format raw-json` and return the bare-value payload's items. */
|
|
21
|
+
export function readThreadSteps(threadId: string): StepListEntry[] {
|
|
22
|
+
const stdout = execFileSync("uwf", ["--format", "raw-json", "step", "list", threadId], {
|
|
23
|
+
encoding: "utf8",
|
|
24
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
25
|
+
}).trim();
|
|
26
|
+
const parsed = JSON.parse(stdout) as StepListPayload;
|
|
27
|
+
// The 0.6 payload does not include a synthetic start entry — every item is a real step.
|
|
28
|
+
return parsed.items;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* Per-step detail surface used by builtin judges. Mirrors the
|
|
33
|
+
* `StepDetailPayload` schema (`@uwf/output/step-detail`) but only exposes the
|
|
34
|
+
* fields judges currently consume.
|
|
35
|
+
*/
|
|
36
|
+
export type StepDetail = {
|
|
37
|
+
hash: string;
|
|
38
|
+
role: string;
|
|
39
|
+
agent: string;
|
|
40
|
+
durationMs: number | null;
|
|
41
|
+
frontmatter: Record<string, unknown>;
|
|
42
|
+
usage: {
|
|
43
|
+
turns: number;
|
|
44
|
+
inputTokens: number;
|
|
45
|
+
outputTokens: number;
|
|
46
|
+
duration: number;
|
|
47
|
+
} | null;
|
|
48
|
+
};
|
|
49
|
+
|
|
50
|
+
type StepDetailRawPayload = {
|
|
51
|
+
hash: string;
|
|
52
|
+
role: string;
|
|
53
|
+
agent: string;
|
|
54
|
+
durationMs: number | null;
|
|
55
|
+
frontmatter: Record<string, unknown>;
|
|
56
|
+
// Usage is not exposed by the @uwf/output/step-detail schema yet; judges fall
|
|
57
|
+
// back to zeros when the field is null. Kept on the type for forward compat.
|
|
58
|
+
usage: {
|
|
59
|
+
turns: number;
|
|
60
|
+
inputTokens: number;
|
|
61
|
+
outputTokens: number;
|
|
62
|
+
duration: number;
|
|
63
|
+
} | null;
|
|
64
|
+
};
|
|
65
|
+
|
|
66
|
+
/**
|
|
67
|
+
* Shell out to `uwf step show <hash> --format raw-json` and return the bare-value
|
|
68
|
+
* step-detail payload. Used by judges that need richer per-step data than
|
|
69
|
+
* `readThreadSteps` exposes (e.g. frontmatter contents, token usage).
|
|
70
|
+
*/
|
|
71
|
+
export function readStepDetail(stepHash: string): StepDetail {
|
|
72
|
+
const stdout = execFileSync("uwf", ["--format", "raw-json", "step", "show", stepHash], {
|
|
8
73
|
encoding: "utf8",
|
|
9
74
|
stdio: ["ignore", "pipe", "pipe"],
|
|
10
75
|
}).trim();
|
|
11
|
-
const parsed = JSON.parse(stdout) as
|
|
12
|
-
|
|
13
|
-
|
|
76
|
+
const parsed = JSON.parse(stdout) as StepDetailRawPayload;
|
|
77
|
+
return {
|
|
78
|
+
hash: parsed.hash,
|
|
79
|
+
role: parsed.role,
|
|
80
|
+
agent: parsed.agent,
|
|
81
|
+
durationMs: parsed.durationMs,
|
|
82
|
+
frontmatter:
|
|
83
|
+
parsed.frontmatter !== null &&
|
|
84
|
+
typeof parsed.frontmatter === "object" &&
|
|
85
|
+
!Array.isArray(parsed.frontmatter)
|
|
86
|
+
? parsed.frontmatter
|
|
87
|
+
: {},
|
|
88
|
+
usage: parsed.usage,
|
|
89
|
+
};
|
|
14
90
|
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { createLogger } from "@united-workforce/util";
|
|
2
2
|
|
|
3
3
|
import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
|
|
4
|
-
import { readThreadSteps } from "./read-steps.js";
|
|
4
|
+
import { readStepDetail, readThreadSteps } from "./read-steps.js";
|
|
5
5
|
import type { BuiltinJudgeOutput } from "./types.js";
|
|
6
6
|
|
|
7
7
|
const log = createLogger({ sink: { kind: "stderr" } });
|
|
@@ -30,7 +30,8 @@ export async function runTokenStatsJudge(threadId: string): Promise<BuiltinJudge
|
|
|
30
30
|
const perStep: PerStepStats[] = [];
|
|
31
31
|
|
|
32
32
|
for (const step of steps) {
|
|
33
|
-
const
|
|
33
|
+
const detail = readStepDetail(step.hash);
|
|
34
|
+
const usage = detail.usage;
|
|
34
35
|
const inputTokens = usage !== null ? usage.inputTokens : 0;
|
|
35
36
|
const outputTokens = usage !== null ? usage.outputTokens : 0;
|
|
36
37
|
const turns = usage !== null ? usage.turns : 0;
|
package/src/runner/execute.ts
CHANGED
|
@@ -33,11 +33,16 @@ function runUwf(args: string[], cwd: string): string {
|
|
|
33
33
|
? err.stderr
|
|
34
34
|
: err.stderr.toString("utf8");
|
|
35
35
|
const detail = stderr.trim() !== "" ? `: ${stderr.trim()}` : "";
|
|
36
|
-
|
|
36
|
+
// Find the subcommand group + subcommand by skipping leading global flags
|
|
37
|
+
// (e.g. `--format raw-json`). The first non-flag token is the group.
|
|
38
|
+
const groupIdx = args.findIndex((a) => !a.startsWith("--"));
|
|
39
|
+
const group = groupIdx >= 0 ? (args[groupIdx] ?? "") : "";
|
|
40
|
+
const subcmd = groupIdx >= 0 ? (args[groupIdx + 1] ?? "") : "";
|
|
41
|
+
throw new Error(`uwf ${group} ${subcmd} failed${detail}`);
|
|
37
42
|
}
|
|
38
43
|
}
|
|
39
44
|
|
|
40
|
-
/** Parse the thread ID from `uwf thread start`
|
|
45
|
+
/** Parse the thread ID from `uwf thread start --format raw-json` output (`{ threadId, workflowHash }`). */
|
|
41
46
|
function parseThreadId(stdout: string): string {
|
|
42
47
|
let parsed: unknown;
|
|
43
48
|
try {
|
|
@@ -46,27 +51,51 @@ function parseThreadId(stdout: string): string {
|
|
|
46
51
|
throw new Error(`uwf thread start did not emit valid JSON: ${stdout || "(empty)"}`);
|
|
47
52
|
}
|
|
48
53
|
const obj = parsed as Record<string, unknown>;
|
|
49
|
-
const
|
|
50
|
-
if (typeof
|
|
51
|
-
throw new Error(`uwf thread start output missing
|
|
54
|
+
const threadId = obj.threadId;
|
|
55
|
+
if (typeof threadId !== "string" || threadId === "") {
|
|
56
|
+
throw new Error(`uwf thread start output missing threadId: ${stdout}`);
|
|
52
57
|
}
|
|
53
|
-
return
|
|
58
|
+
return threadId;
|
|
54
59
|
}
|
|
55
60
|
|
|
56
61
|
/**
|
|
57
62
|
* Execute a workflow: create a thread, then run it for up to `maxSteps` steps.
|
|
58
63
|
* Shells out to the uwf CLI rather than importing it directly.
|
|
64
|
+
*
|
|
65
|
+
* Both `thread start` and `thread exec` are invoked with `--format raw-json`
|
|
66
|
+
* so the legacy bare-value JSON shape is emitted (the 0.6 default is text).
|
|
67
|
+
* See `specs/cli-ocas-envelope-in-repo-consumer-migration.md`.
|
|
59
68
|
*/
|
|
60
69
|
export async function execute(input: ExecuteInput): Promise<ExecuteResult> {
|
|
61
70
|
const startOut = runUwf(
|
|
62
|
-
[
|
|
71
|
+
[
|
|
72
|
+
"--format",
|
|
73
|
+
"raw-json",
|
|
74
|
+
"thread",
|
|
75
|
+
"start",
|
|
76
|
+
input.workflow,
|
|
77
|
+
"-p",
|
|
78
|
+
input.prompt,
|
|
79
|
+
"--cwd",
|
|
80
|
+
input.workDir,
|
|
81
|
+
],
|
|
63
82
|
input.workDir,
|
|
64
83
|
);
|
|
65
84
|
const threadId = parseThreadId(startOut);
|
|
66
85
|
log(LOG_START, `thread started thread=${threadId} workflow=${input.workflow}`);
|
|
67
86
|
|
|
68
87
|
runUwf(
|
|
69
|
-
[
|
|
88
|
+
[
|
|
89
|
+
"--format",
|
|
90
|
+
"raw-json",
|
|
91
|
+
"thread",
|
|
92
|
+
"exec",
|
|
93
|
+
threadId,
|
|
94
|
+
"--agent",
|
|
95
|
+
input.agent,
|
|
96
|
+
"-c",
|
|
97
|
+
String(input.maxSteps),
|
|
98
|
+
],
|
|
70
99
|
input.workDir,
|
|
71
100
|
);
|
|
72
101
|
log(LOG_EXEC, `thread executed thread=${threadId} maxSteps=${input.maxSteps}`);
|