@united-workforce/eval 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ ab2ab0e476768b6700c94fa8168e1b7c6fa60254bffe14047335c0b29c523971
@@ -1 +1 @@
1
- {"version":3,"file":"frontmatter.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/frontmatter.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAuErD;;;;GAIG;AACH,wBAAsB,mBAAmB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAuBvF"}
1
+ {"version":3,"file":"frontmatter.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/frontmatter.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAwBrD;;;;GAIG;AACH,wBAAsB,mBAAmB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAyBvF"}
@@ -1,56 +1,14 @@
1
1
  import { createLogger } from "@united-workforce/util";
2
- import { parse as parseYaml } from "yaml";
3
2
  import { EVAL_JUDGE_FRONTMATTER_SCHEMA } from "../../storage/index.js";
4
- import { readThreadSteps } from "./read-steps.js";
3
+ import { readStepDetail, readThreadSteps } from "./read-steps.js";
5
4
  const log = createLogger({ sink: { kind: "stderr" } });
6
5
  const LOG_RESULT = "F2QH7R4M";
7
- const FENCE = "---";
8
- /**
9
- * Extract the YAML frontmatter block from a step output. Returns the inner YAML
10
- * string when the output starts with a `---\n` block closed by a `\n---` fence,
11
- * otherwise null.
12
- */
13
- function extractFrontmatterYaml(output) {
14
- if (typeof output !== "string") {
15
- return null;
16
- }
17
- if (!output.startsWith(`${FENCE}\n`)) {
18
- return null;
19
- }
20
- const rest = output.slice(FENCE.length + 1);
21
- const closeIndex = rest.indexOf(`\n${FENCE}`);
22
- if (closeIndex === -1) {
23
- return null;
24
- }
25
- return rest.slice(0, closeIndex);
26
- }
27
6
  /** Validate a single step's frontmatter, returning a list of errors (empty = valid). */
28
- function validateStepFrontmatter(output) {
29
- // CAS stores the extracted output as a JSON object after the extract pipeline.
30
- // Accept both: parsed object (from step.output) or raw markdown string.
31
- if (typeof output === "object" && output !== null && !Array.isArray(output)) {
32
- const status = output.$status;
33
- if (typeof status !== "string" || status.trim() === "") {
34
- return ["$status field is missing or not a non-empty string"];
35
- }
36
- return [];
37
- }
38
- const yaml = extractFrontmatterYaml(output);
39
- if (yaml === null) {
40
- return ["output does not begin with a valid '---' frontmatter block"];
41
- }
42
- let parsed;
43
- try {
44
- parsed = parseYaml(yaml);
45
- }
46
- catch (e) {
47
- const message = e instanceof Error ? e.message : String(e);
48
- return [`frontmatter YAML failed to parse: ${message}`];
49
- }
50
- if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
51
- return ["frontmatter is not a YAML mapping"];
7
+ function validateStepFrontmatter(frontmatter) {
8
+ if (Object.keys(frontmatter).length === 0) {
9
+ return ["step has no frontmatter"];
52
10
  }
53
- const status = parsed.$status;
11
+ const status = frontmatter.$status;
54
12
  if (typeof status !== "string" || status.trim() === "") {
55
13
  return ["$status field is missing or not a non-empty string"];
56
14
  }
@@ -66,7 +24,10 @@ export async function runFrontmatterJudge(threadId) {
66
24
  const invalidSteps = [];
67
25
  for (let i = 0; i < steps.length; i++) {
68
26
  const step = steps[i];
69
- const errors = validateStepFrontmatter(step.output);
27
+ if (step === undefined)
28
+ continue;
29
+ const detail = readStepDetail(step.hash);
30
+ const errors = validateStepFrontmatter(detail.frontmatter);
70
31
  if (errors.length > 0) {
71
32
  invalidSteps.push({ stepIndex: i, role: step.role, errors });
72
33
  }
@@ -1 +1 @@
1
- {"version":3,"file":"frontmatter.js","sourceRoot":"","sources":["../../../src/judge/builtin/frontmatter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AACtD,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,MAAM,MAAM,CAAC;AAE1C,OAAO,EAAE,6BAA6B,EAAE,MAAM,wBAAwB,CAAC;AACvE,OAAO,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAGlD,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,UAAU,GAAG,UAAU,CAAC;AAE9B,MAAM,KAAK,GAAG,KAAK,CAAC;AAQpB;;;;GAIG;AACH,SAAS,sBAAsB,CAAC,MAAe;IAC7C,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;QAC/B,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,GAAG,KAAK,IAAI,CAAC,EAAE,CAAC;QACrC,OAAO,IAAI,CAAC;IACd,CAAC;IACD,MAAM,IAAI,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC5C,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,KAAK,EAAE,CAAC,CAAC;IAC9C,IAAI,UAAU,KAAK,CAAC,CAAC,EAAE,CAAC;QACtB,OAAO,IAAI,CAAC;IACd,CAAC;IACD,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;AACnC,CAAC;AAED,wFAAwF;AACxF,SAAS,uBAAuB,CAAC,MAAe;IAC9C,+EAA+E;IAC/E,wEAAwE;IACxE,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC5E,MAAM,MAAM,GAAI,MAAkC,CAAC,OAAO,CAAC;QAC3D,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;YACvD,OAAO,CAAC,oDAAoD,CAAC,CAAC;QAChE,CAAC;QACD,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,IAAI,GAAG,sBAAsB,CAAC,MAAM,CAAC,CAAC;IAC5C,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,OAAO,CAAC,4DAA4D,CAAC,CAAC;IACxE,CAAC;IAED,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC3B,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,OAAO,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QAC3D,OAAO,CAAC,qCAAqC,OAAO,EAAE,CAAC,CAAC;IAC1D,CAAC;IAED,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC3E,OAAO,CAAC,mCAAmC,CAAC,CAAC;IAC/C,CAAC;IAED,MAAM,MAAM,GAAI,MAAkC,CAAC,OAAO,CAAC;IAC3D,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;QACvD,OAAO,CAAC,oDAAoD,CAAC,CAAC;IAChE,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,QAAgB;IACxD,MAAM,KAAK,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;IAExC,MAAM,YAAY,GAAkB,EAAE,CAAC;IACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,MAAM,GAAG,uBAAuB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACpD,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,YAAY,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,KAAK,CAAC,MAAM,CAAC;IAChC,MAAM,UAAU,GAAG,UAAU,GAAG,YAAY,CAAC,MAAM,CAAC;IACpD,MAAM,KAAK,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;IAE3D,GAAG,CAAC,UAAU,EAAE,sBAAsB,QAAQ,UAAU,UAAU,IAAI,UAAU,EAAE,CAAC,CAAC;IAEpF,OAAO;QACL,KAAK;QACL,IAAI,EAAE,EAAE,UAAU,EAAE,UAAU,EAAE,YAAY,EAAE;QAC9C,MAAM,EAAE,6BAA6B;KACtC,CAAC;AACJ,CAAC"}
1
+ {"version":3,"file":"frontmatter.js","sourceRoot":"","sources":["../../../src/judge/builtin/frontmatter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEtD,OAAO,EAAE,6BAA6B,EAAE,MAAM,wBAAwB,CAAC;AACvE,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAGlE,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,UAAU,GAAG,UAAU,CAAC;AAQ9B,wFAAwF;AACxF,SAAS,uBAAuB,CAAC,WAAoC;IACnE,IAAI,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1C,OAAO,CAAC,yBAAyB,CAAC,CAAC;IACrC,CAAC;IACD,MAAM,MAAM,GAAG,WAAW,CAAC,OAAO,CAAC;IACnC,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;QACvD,OAAO,CAAC,oDAAoD,CAAC,CAAC;IAChE,CAAC;IACD,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,QAAgB;IACxD,MAAM,KAAK,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;IAExC,MAAM,YAAY,GAAkB,EAAE,CAAC;IACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,IAAI,IAAI,KAAK,SAAS;YAAE,SAAS;QACjC,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,uBAAuB,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;QAC3D,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,YAAY,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,KAAK,CAAC,MAAM,CAAC;IAChC,MAAM,UAAU,GAAG,UAAU,GAAG,YAAY,CAAC,MAAM,CAAC;IACpD,MAAM,KAAK,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;IAE3D,GAAG,CAAC,UAAU,EAAE,sBAAsB,QAAQ,UAAU,UAAU,IAAI,UAAU,EAAE,CAAC,CAAC;IAEpF,OAAO;QACL,KAAK;QACL,IAAI,EAAE,EAAE,UAAU,EAAE,UAAU,EAAE,YAAY,EAAE;QAC9C,MAAM,EAAE,6BAA6B;KACtC,CAAC;AACJ,CAAC"}
@@ -1,4 +1,38 @@
1
- import type { StepEntry } from "@united-workforce/protocol";
2
- /** Shell out to `uwf step list` and return the parsed step entries (excludes start entry). */
3
- export declare function readThreadSteps(threadId: string): StepEntry[];
1
+ /**
2
+ * A single step entry as exposed by `uwf step list --format raw-json` under 0.6.
3
+ *
4
+ * Richer per-step data (frontmatter, turns, agent, usage) lives in the step
5
+ * detail node and is fetched separately via `readStepDetail(hash)` when needed.
6
+ */
7
+ export type StepListEntry = {
8
+ hash: string;
9
+ role: string;
10
+ durationMs: number | null;
11
+ };
12
+ /** Shell out to `uwf step list --format raw-json` and return the bare-value payload's items. */
13
+ export declare function readThreadSteps(threadId: string): StepListEntry[];
14
+ /**
15
+ * Per-step detail surface used by builtin judges. Mirrors the
16
+ * `StepDetailPayload` schema (`@uwf/output/step-detail`) but only exposes the
17
+ * fields judges currently consume.
18
+ */
19
+ export type StepDetail = {
20
+ hash: string;
21
+ role: string;
22
+ agent: string;
23
+ durationMs: number | null;
24
+ frontmatter: Record<string, unknown>;
25
+ usage: {
26
+ turns: number;
27
+ inputTokens: number;
28
+ outputTokens: number;
29
+ duration: number;
30
+ } | null;
31
+ };
32
+ /**
33
+ * Shell out to `uwf step show <hash> --format raw-json` and return the bare-value
34
+ * step-detail payload. Used by judges that need richer per-step data than
35
+ * `readThreadSteps` exposes (e.g. frontmatter contents, token usage).
36
+ */
37
+ export declare function readStepDetail(stepHash: string): StepDetail;
4
38
  //# sourceMappingURL=read-steps.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"read-steps.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/read-steps.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAqB,MAAM,4BAA4B,CAAC;AAE/E,8FAA8F;AAC9F,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,SAAS,EAAE,CAQ7D"}
1
+ {"version":3,"file":"read-steps.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/read-steps.ts"],"names":[],"mappings":"AAEA;;;;;GAKG;AACH,MAAM,MAAM,aAAa,GAAG;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B,CAAC;AAOF,gGAAgG;AAChG,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa,EAAE,CAQjE;AAED;;;;GAIG;AACH,MAAM,MAAM,UAAU,GAAG;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACrC,KAAK,EAAE;QACL,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,QAAQ,EAAE,MAAM,CAAC;KAClB,GAAG,IAAI,CAAC;CACV,CAAC;AAkBF;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,UAAU,CAmB3D"}
@@ -1,12 +1,36 @@
1
1
  import { execFileSync } from "node:child_process";
2
- /** Shell out to `uwf step list` and return the parsed step entries (excludes start entry). */
2
+ /** Shell out to `uwf step list --format raw-json` and return the bare-value payload's items. */
3
3
  export function readThreadSteps(threadId) {
4
- const stdout = execFileSync("uwf", ["step", "list", threadId], {
4
+ const stdout = execFileSync("uwf", ["--format", "raw-json", "step", "list", threadId], {
5
5
  encoding: "utf8",
6
6
  stdio: ["ignore", "pipe", "pipe"],
7
7
  }).trim();
8
8
  const parsed = JSON.parse(stdout);
9
- // steps[0] is the StartEntry; the rest are StepEntry records.
10
- return parsed.steps.slice(1);
9
+ // The 0.6 payload does not include a synthetic start entry — every item is a real step.
10
+ return parsed.items;
11
+ }
12
+ /**
13
+ * Shell out to `uwf step show <hash> --format raw-json` and return the bare-value
14
+ * step-detail payload. Used by judges that need richer per-step data than
15
+ * `readThreadSteps` exposes (e.g. frontmatter contents, token usage).
16
+ */
17
+ export function readStepDetail(stepHash) {
18
+ const stdout = execFileSync("uwf", ["--format", "raw-json", "step", "show", stepHash], {
19
+ encoding: "utf8",
20
+ stdio: ["ignore", "pipe", "pipe"],
21
+ }).trim();
22
+ const parsed = JSON.parse(stdout);
23
+ return {
24
+ hash: parsed.hash,
25
+ role: parsed.role,
26
+ agent: parsed.agent,
27
+ durationMs: parsed.durationMs,
28
+ frontmatter: parsed.frontmatter !== null &&
29
+ typeof parsed.frontmatter === "object" &&
30
+ !Array.isArray(parsed.frontmatter)
31
+ ? parsed.frontmatter
32
+ : {},
33
+ usage: parsed.usage,
34
+ };
11
35
  }
12
36
  //# sourceMappingURL=read-steps.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"read-steps.js","sourceRoot":"","sources":["../../../src/judge/builtin/read-steps.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAIlD,8FAA8F;AAC9F,MAAM,UAAU,eAAe,CAAC,QAAgB;IAC9C,MAAM,MAAM,GAAG,YAAY,CAAC,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE;QAC7D,QAAQ,EAAE,MAAM;QAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;KAClC,CAAC,CAAC,IAAI,EAAE,CAAC;IACV,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAsB,CAAC;IACvD,8DAA8D;IAC9D,OAAO,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAgB,CAAC;AAC9C,CAAC"}
1
+ {"version":3,"file":"read-steps.js","sourceRoot":"","sources":["../../../src/judge/builtin/read-steps.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAmBlD,gGAAgG;AAChG,MAAM,UAAU,eAAe,CAAC,QAAgB;IAC9C,MAAM,MAAM,GAAG,YAAY,CAAC,KAAK,EAAE,CAAC,UAAU,EAAE,UAAU,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE;QACrF,QAAQ,EAAE,MAAM;QAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;KAClC,CAAC,CAAC,IAAI,EAAE,CAAC;IACV,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAoB,CAAC;IACrD,wFAAwF;IACxF,OAAO,MAAM,CAAC,KAAK,CAAC;AACtB,CAAC;AAqCD;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,QAAgB;IAC7C,MAAM,MAAM,GAAG,YAAY,CAAC,KAAK,EAAE,CAAC,UAAU,EAAE,UAAU,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE;QACrF,QAAQ,EAAE,MAAM;QAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;KAClC,CAAC,CAAC,IAAI,EAAE,CAAC;IACV,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAyB,CAAC;IAC1D,OAAO;QACL,IAAI,EAAE,MAAM,CAAC,IAAI;QACjB,IAAI,EAAE,MAAM,CAAC,IAAI;QACjB,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,UAAU,EAAE,MAAM,CAAC,UAAU;QAC7B,WAAW,EACT,MAAM,CAAC,WAAW,KAAK,IAAI;YAC3B,OAAO,MAAM,CAAC,WAAW,KAAK,QAAQ;YACtC,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,WAAW,CAAC;YAChC,CAAC,CAAC,MAAM,CAAC,WAAW;YACpB,CAAC,CAAC,EAAE;QACR,KAAK,EAAE,MAAM,CAAC,KAAK;KACpB,CAAC;AACJ,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"token-stats.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/token-stats.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAcrD;;;;GAIG;AACH,wBAAsB,kBAAkB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC,CA6BtF"}
1
+ {"version":3,"file":"token-stats.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/token-stats.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAcrD;;;;GAIG;AACH,wBAAsB,kBAAkB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC,CA8BtF"}
@@ -1,6 +1,6 @@
1
1
  import { createLogger } from "@united-workforce/util";
2
2
  import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
3
- import { readThreadSteps } from "./read-steps.js";
3
+ import { readStepDetail, readThreadSteps } from "./read-steps.js";
4
4
  const log = createLogger({ sink: { kind: "stderr" } });
5
5
  const LOG_RESULT = "T7KQ3M9P";
6
6
  /**
@@ -15,7 +15,8 @@ export async function runTokenStatsJudge(threadId) {
15
15
  let totalTurns = 0;
16
16
  const perStep = [];
17
17
  for (const step of steps) {
18
- const usage = step.usage;
18
+ const detail = readStepDetail(step.hash);
19
+ const usage = detail.usage;
19
20
  const inputTokens = usage !== null ? usage.inputTokens : 0;
20
21
  const outputTokens = usage !== null ? usage.outputTokens : 0;
21
22
  const turns = usage !== null ? usage.turns : 0;
@@ -1 +1 @@
1
- {"version":3,"file":"token-stats.js","sourceRoot":"","sources":["../../../src/judge/builtin/token-stats.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEtD,OAAO,EAAE,6BAA6B,EAAE,MAAM,wBAAwB,CAAC;AACvE,OAAO,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAGlD,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,UAAU,GAAG,UAAU,CAAC;AAU9B;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,QAAgB;IACvD,MAAM,KAAK,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;IAExC,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,MAAM,OAAO,GAAmB,EAAE,CAAC;IAEnC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC;QACzB,MAAM,WAAW,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,YAAY,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7D,MAAM,KAAK,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/C,MAAM,QAAQ,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAErD,UAAU,IAAI,WAAW,CAAC;QAC1B,WAAW,IAAI,YAAY,CAAC;QAC5B,UAAU,IAAI,KAAK,CAAC;QAEpB,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,WAAW,EAAE,YAAY,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;IAChF,CAAC;IAED,GAAG,CAAC,UAAU,EAAE,sBAAsB,QAAQ,OAAO,UAAU,QAAQ,WAAW,EAAE,CAAC,CAAC;IAEtF,OAAO;QACL,KAAK,EAAE,GAAG;QACV,IAAI,EAAE,EAAE,UAAU,EAAE,WAAW,EAAE,UAAU,EAAE,OAAO,EAAE;QACtD,MAAM,EAAE,6BAA6B;KACtC,CAAC;AACJ,CAAC"}
1
+ {"version":3,"file":"token-stats.js","sourceRoot":"","sources":["../../../src/judge/builtin/token-stats.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEtD,OAAO,EAAE,6BAA6B,EAAE,MAAM,wBAAwB,CAAC;AACvE,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAGlE,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,UAAU,GAAG,UAAU,CAAC;AAU9B;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,QAAgB;IACvD,MAAM,KAAK,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;IAExC,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,MAAM,OAAO,GAAmB,EAAE,CAAC;IAEnC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC;QAC3B,MAAM,WAAW,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,YAAY,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7D,MAAM,KAAK,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/C,MAAM,QAAQ,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAErD,UAAU,IAAI,WAAW,CAAC;QAC1B,WAAW,IAAI,YAAY,CAAC;QAC5B,UAAU,IAAI,KAAK,CAAC;QAEpB,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,WAAW,EAAE,YAAY,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;IAChF,CAAC;IAED,GAAG,CAAC,UAAU,EAAE,sBAAsB,QAAQ,OAAO,UAAU,QAAQ,WAAW,EAAE,CAAC,CAAC;IAEtF,OAAO;QACL,KAAK,EAAE,GAAG;QACV,IAAI,EAAE,EAAE,UAAU,EAAE,WAAW,EAAE,UAAU,EAAE,OAAO,EAAE;QACtD,MAAM,EAAE,6BAA6B;KACtC,CAAC;AACJ,CAAC"}
@@ -2,6 +2,10 @@ import type { ExecuteInput, ExecuteResult } from "./types.js";
2
2
  /**
3
3
  * Execute a workflow: create a thread, then run it for up to `maxSteps` steps.
4
4
  * Shells out to the uwf CLI rather than importing it directly.
5
+ *
6
+ * Both `thread start` and `thread exec` are invoked with `--format raw-json`
7
+ * so the legacy bare-value JSON shape is emitted (the 0.6 default is text).
8
+ * See `specs/cli-ocas-envelope-in-repo-consumer-migration.md`.
5
9
  */
6
10
  export declare function execute(input: ExecuteInput): Promise<ExecuteResult>;
7
11
  /** Best-effort lookup of the uwf engine version (`uwf -V`); "unknown" on failure. */
@@ -1 +1 @@
1
- {"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../../src/runner/execute.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAmD9D;;;GAGG;AACH,wBAAsB,OAAO,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,aAAa,CAAC,CAezE;AAED,qFAAqF;AACrF,wBAAgB,gBAAgB,IAAI,MAAM,CASzC"}
1
+ {"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../../src/runner/execute.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAwD9D;;;;;;;GAOG;AACH,wBAAsB,OAAO,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,aAAa,CAAC,CAmCzE;AAED,qFAAqF;AACrF,wBAAgB,gBAAgB,IAAI,MAAM,CASzC"}
@@ -26,10 +26,15 @@ function runUwf(args, cwd) {
26
26
  ? err.stderr
27
27
  : err.stderr.toString("utf8");
28
28
  const detail = stderr.trim() !== "" ? `: ${stderr.trim()}` : "";
29
- throw new Error(`uwf ${args[0]} ${args[1]} failed${detail}`);
29
+ // Find the subcommand group + subcommand by skipping leading global flags
30
+ // (e.g. `--format raw-json`). The first non-flag token is the group.
31
+ const groupIdx = args.findIndex((a) => !a.startsWith("--"));
32
+ const group = groupIdx >= 0 ? (args[groupIdx] ?? "") : "";
33
+ const subcmd = groupIdx >= 0 ? (args[groupIdx + 1] ?? "") : "";
34
+ throw new Error(`uwf ${group} ${subcmd} failed${detail}`);
30
35
  }
31
36
  }
32
- /** Parse the thread ID from `uwf thread start` JSON output (`{ workflow, thread }`). */
37
+ /** Parse the thread ID from `uwf thread start --format raw-json` output (`{ threadId, workflowHash }`). */
33
38
  function parseThreadId(stdout) {
34
39
  let parsed;
35
40
  try {
@@ -39,21 +44,45 @@ function parseThreadId(stdout) {
39
44
  throw new Error(`uwf thread start did not emit valid JSON: ${stdout || "(empty)"}`);
40
45
  }
41
46
  const obj = parsed;
42
- const thread = obj.thread;
43
- if (typeof thread !== "string" || thread === "") {
44
- throw new Error(`uwf thread start output missing thread id: ${stdout}`);
47
+ const threadId = obj.threadId;
48
+ if (typeof threadId !== "string" || threadId === "") {
49
+ throw new Error(`uwf thread start output missing threadId: ${stdout}`);
45
50
  }
46
- return thread;
51
+ return threadId;
47
52
  }
48
53
  /**
49
54
  * Execute a workflow: create a thread, then run it for up to `maxSteps` steps.
50
55
  * Shells out to the uwf CLI rather than importing it directly.
56
+ *
57
+ * Both `thread start` and `thread exec` are invoked with `--format raw-json`
58
+ * so the legacy bare-value JSON shape is emitted (the 0.6 default is text).
59
+ * See `specs/cli-ocas-envelope-in-repo-consumer-migration.md`.
51
60
  */
52
61
  export async function execute(input) {
53
- const startOut = runUwf(["thread", "start", input.workflow, "-p", input.prompt, "--cwd", input.workDir], input.workDir);
62
+ const startOut = runUwf([
63
+ "--format",
64
+ "raw-json",
65
+ "thread",
66
+ "start",
67
+ input.workflow,
68
+ "-p",
69
+ input.prompt,
70
+ "--cwd",
71
+ input.workDir,
72
+ ], input.workDir);
54
73
  const threadId = parseThreadId(startOut);
55
74
  log(LOG_START, `thread started thread=${threadId} workflow=${input.workflow}`);
56
- runUwf(["thread", "exec", threadId, "--agent", input.agent, "-c", String(input.maxSteps)], input.workDir);
75
+ runUwf([
76
+ "--format",
77
+ "raw-json",
78
+ "thread",
79
+ "exec",
80
+ threadId,
81
+ "--agent",
82
+ input.agent,
83
+ "-c",
84
+ String(input.maxSteps),
85
+ ], input.workDir);
57
86
  log(LOG_EXEC, `thread executed thread=${threadId} maxSteps=${input.maxSteps}`);
58
87
  return { threadId };
59
88
  }
@@ -1 +1 @@
1
- {"version":3,"file":"execute.js","sourceRoot":"","sources":["../../src/runner/execute.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAElD,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAItD,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,SAAS,GAAG,UAAU,CAAC;AAC7B,MAAM,QAAQ,GAAG,UAAU,CAAC;AAE5B,uEAAuE;AACvE,SAAS,MAAM;IACb,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC;IACrC,OAAO,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC;AACtE,CAAC;AAED,sDAAsD;AACtD,SAAS,MAAM,CAAC,IAAc,EAAE,GAAW;IACzC,IAAI,CAAC;QACH,OAAO,YAAY,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE;YAClC,QAAQ,EAAE,MAAM;YAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;YACjC,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI;YAC3B,GAAG;SACJ,CAAC,CAAC,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,GAAG,GAAG,CAAgE,CAAC;QAC7E,MAAM,MAAM,GACV,GAAG,CAAC,MAAM,IAAI,IAAI;YAChB,CAAC,CAAC,EAAE;YACJ,CAAC,CAAC,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ;gBAC9B,CAAC,CAAC,GAAG,CAAC,MAAM;gBACZ,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACpC,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAChE,MAAM,IAAI,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC,UAAU,MAAM,EAAE,CAAC,CAAC;IAC/D,CAAC;AACH,CAAC;AAED,wFAAwF;AACxF,SAAS,aAAa,CAAC,MAAc;IACnC,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC9B,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,6CAA6C,MAAM,IAAI,SAAS,EAAE,CAAC,CAAC;IACtF,CAAC;IACD,MAAM,GAAG,GAAG,MAAiC,CAAC;IAC9C,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC;IAC1B,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,EAAE,EAAE,CAAC;QAChD,MAAM,IAAI,KAAK,CAAC,8CAA8C,MAAM,EAAE,CAAC,CAAC;IAC1E,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,KAAmB;IAC/C,MAAM,QAAQ,GAAG,MAAM,CACrB,CAAC,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAC,QAAQ,EAAE,IAAI,EAAE,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC,EAC/E,KAAK,CAAC,OAAO,CACd,CAAC;IACF,MAAM,QAAQ,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC;IACzC,GAAG,CAAC,SAAS,EAAE,yBAAyB,QAAQ,aAAa,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;IAE/E,MAAM,CACJ,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,KAAK,CAAC,KAAK,EAAE,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,EAClF,KAAK,CAAC,OAAO,CACd,CAAC;IACF,GAAG,CAAC,QAAQ,EAAE,0BAA0B,QAAQ,aAAa,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;IAE/E,OAAO,EAAE,QAAQ,EAAE,CAAC;AACtB,CAAC;AAED,qFAAqF;AACrF,MAAM,UAAU,gBAAgB;IAC9B,IAAI,CAAC;QACH,OAAO,YAAY,CAAC,MAAM,EAAE,EAAE,CAAC,IAAI,CAAC,EAAE;YACpC,QAAQ,EAAE,MAAM;YAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC;SACpC,CAAC,CAAC,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC"}
1
+ {"version":3,"file":"execute.js","sourceRoot":"","sources":["../../src/runner/execute.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAElD,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAItD,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,SAAS,GAAG,UAAU,CAAC;AAC7B,MAAM,QAAQ,GAAG,UAAU,CAAC;AAE5B,uEAAuE;AACvE,SAAS,MAAM;IACb,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC;IACrC,OAAO,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC;AACtE,CAAC;AAED,sDAAsD;AACtD,SAAS,MAAM,CAAC,IAAc,EAAE,GAAW;IACzC,IAAI,CAAC;QACH,OAAO,YAAY,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE;YAClC,QAAQ,EAAE,MAAM;YAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;YACjC,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI;YAC3B,GAAG;SACJ,CAAC,CAAC,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,GAAG,GAAG,CAAgE,CAAC;QAC7E,MAAM,MAAM,GACV,GAAG,CAAC,MAAM,IAAI,IAAI;YAChB,CAAC,CAAC,EAAE;YACJ,CAAC,CAAC,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ;gBAC9B,CAAC,CAAC,GAAG,CAAC,MAAM;gBACZ,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACpC,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAChE,0EAA0E;QAC1E,qEAAqE;QACrE,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC;QAC5D,MAAM,KAAK,GAAG,QAAQ,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC1D,MAAM,MAAM,GAAG,QAAQ,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC/D,MAAM,IAAI,KAAK,CAAC,OAAO,KAAK,IAAI,MAAM,UAAU,MAAM,EAAE,CAAC,CAAC;IAC5D,CAAC;AACH,CAAC;AAED,2GAA2G;AAC3G,SAAS,aAAa,CAAC,MAAc;IACnC,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC9B,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,6CAA6C,MAAM,IAAI,SAAS,EAAE,CAAC,CAAC;IACtF,CAAC;IACD,MAAM,GAAG,GAAG,MAAiC,CAAC;IAC9C,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,CAAC;IAC9B,IAAI,OAAO,QAAQ,KAAK,QAAQ,IAAI,QAAQ,KAAK,EAAE,EAAE,CAAC;QACpD,MAAM,IAAI,KAAK,CAAC,6CAA6C,MAAM,EAAE,CAAC,CAAC;IACzE,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,KAAmB;IAC/C,MAAM,QAAQ,GAAG,MAAM,CACrB;QACE,UAAU;QACV,UAAU;QACV,QAAQ;QACR,OAAO;QACP,KAAK,CAAC,QAAQ;QACd,IAAI;QACJ,KAAK,CAAC,MAAM;QACZ,OAAO;QACP,KAAK,CAAC,OAAO;KACd,EACD,KAAK,CAAC,OAAO,CACd,CAAC;IACF,MAAM,QAAQ,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC;IACzC,GAAG,CAAC,SAAS,EAAE,yBAAyB,QAAQ,aAAa,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;IAE/E,MAAM,CACJ;QACE,UAAU;QACV,UAAU;QACV,QAAQ;QACR,MAAM;QACN,QAAQ;QACR,SAAS;QACT,KAAK,CAAC,KAAK;QACX,IAAI;QACJ,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC;KACvB,EACD,KAAK,CAAC,OAAO,CACd,CAAC;IACF,GAAG,CAAC,QAAQ,EAAE,0BAA0B,QAAQ,aAAa,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;IAE/E,OAAO,EAAE,QAAQ,EAAE,CAAC;AACtB,CAAC;AAED,qFAAqF;AACrF,MAAM,UAAU,gBAAgB;IAC9B,IAAI,CAAC;QACH,OAAO,YAAY,CAAC,MAAM,EAAE,EAAE,CAAC,IAAI,CAAC,EAAE;YACpC,QAAQ,EAAE,MAAM;YAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC;SACpC,CAAC,CAAC,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@united-workforce/eval",
3
- "version": "0.1.5",
3
+ "version": "0.1.6",
4
4
  "private": false,
5
5
  "files": [
6
6
  "src",
@@ -18,12 +18,12 @@
18
18
  }
19
19
  },
20
20
  "dependencies": {
21
- "@ocas/core": "^0.3.0",
22
- "@ocas/fs": "^0.3.0",
21
+ "@ocas/core": "^0.5.0",
22
+ "@ocas/fs": "^0.4.1",
23
23
  "commander": "^14.0.3",
24
24
  "yaml": "^2.9.0",
25
- "@united-workforce/protocol": "^0.1.0",
26
- "@united-workforce/util": "^0.1.4"
25
+ "@united-workforce/protocol": "^0.3.0",
26
+ "@united-workforce/util": "^0.2.0"
27
27
  },
28
28
  "devDependencies": {
29
29
  "typescript": "^5.8.3"
@@ -1,76 +1,28 @@
1
1
  import { createLogger } from "@united-workforce/util";
2
- import { parse as parseYaml } from "yaml";
3
2
 
4
3
  import { EVAL_JUDGE_FRONTMATTER_SCHEMA } from "../../storage/index.js";
5
- import { readThreadSteps } from "./read-steps.js";
4
+ import { readStepDetail, readThreadSteps } from "./read-steps.js";
6
5
  import type { BuiltinJudgeOutput } from "./types.js";
7
6
 
8
7
  const log = createLogger({ sink: { kind: "stderr" } });
9
8
 
10
9
  const LOG_RESULT = "F2QH7R4M";
11
10
 
12
- const FENCE = "---";
13
-
14
11
  type InvalidStep = {
15
12
  stepIndex: number;
16
13
  role: string;
17
14
  errors: string[];
18
15
  };
19
16
 
20
- /**
21
- * Extract the YAML frontmatter block from a step output. Returns the inner YAML
22
- * string when the output starts with a `---\n` block closed by a `\n---` fence,
23
- * otherwise null.
24
- */
25
- function extractFrontmatterYaml(output: unknown): string | null {
26
- if (typeof output !== "string") {
27
- return null;
28
- }
29
- if (!output.startsWith(`${FENCE}\n`)) {
30
- return null;
31
- }
32
- const rest = output.slice(FENCE.length + 1);
33
- const closeIndex = rest.indexOf(`\n${FENCE}`);
34
- if (closeIndex === -1) {
35
- return null;
36
- }
37
- return rest.slice(0, closeIndex);
38
- }
39
-
40
17
  /** Validate a single step's frontmatter, returning a list of errors (empty = valid). */
41
- function validateStepFrontmatter(output: unknown): string[] {
42
- // CAS stores the extracted output as a JSON object after the extract pipeline.
43
- // Accept both: parsed object (from step.output) or raw markdown string.
44
- if (typeof output === "object" && output !== null && !Array.isArray(output)) {
45
- const status = (output as Record<string, unknown>).$status;
46
- if (typeof status !== "string" || status.trim() === "") {
47
- return ["$status field is missing or not a non-empty string"];
48
- }
49
- return [];
18
+ function validateStepFrontmatter(frontmatter: Record<string, unknown>): string[] {
19
+ if (Object.keys(frontmatter).length === 0) {
20
+ return ["step has no frontmatter"];
50
21
  }
51
-
52
- const yaml = extractFrontmatterYaml(output);
53
- if (yaml === null) {
54
- return ["output does not begin with a valid '---' frontmatter block"];
55
- }
56
-
57
- let parsed: unknown;
58
- try {
59
- parsed = parseYaml(yaml);
60
- } catch (e) {
61
- const message = e instanceof Error ? e.message : String(e);
62
- return [`frontmatter YAML failed to parse: ${message}`];
63
- }
64
-
65
- if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
66
- return ["frontmatter is not a YAML mapping"];
67
- }
68
-
69
- const status = (parsed as Record<string, unknown>).$status;
22
+ const status = frontmatter.$status;
70
23
  if (typeof status !== "string" || status.trim() === "") {
71
24
  return ["$status field is missing or not a non-empty string"];
72
25
  }
73
-
74
26
  return [];
75
27
  }
76
28
 
@@ -85,7 +37,9 @@ export async function runFrontmatterJudge(threadId: string): Promise<BuiltinJudg
85
37
  const invalidSteps: InvalidStep[] = [];
86
38
  for (let i = 0; i < steps.length; i++) {
87
39
  const step = steps[i];
88
- const errors = validateStepFrontmatter(step.output);
40
+ if (step === undefined) continue;
41
+ const detail = readStepDetail(step.hash);
42
+ const errors = validateStepFrontmatter(detail.frontmatter);
89
43
  if (errors.length > 0) {
90
44
  invalidSteps.push({ stepIndex: i, role: step.role, errors });
91
45
  }
@@ -1,14 +1,90 @@
1
1
  import { execFileSync } from "node:child_process";
2
2
 
3
- import type { StepEntry, ThreadStepsOutput } from "@united-workforce/protocol";
3
+ /**
4
+ * A single step entry as exposed by `uwf step list --format raw-json` under 0.6.
5
+ *
6
+ * Richer per-step data (frontmatter, turns, agent, usage) lives in the step
7
+ * detail node and is fetched separately via `readStepDetail(hash)` when needed.
8
+ */
9
+ export type StepListEntry = {
10
+ hash: string;
11
+ role: string;
12
+ durationMs: number | null;
13
+ };
4
14
 
5
- /** Shell out to `uwf step list` and return the parsed step entries (excludes start entry). */
6
- export function readThreadSteps(threadId: string): StepEntry[] {
7
- const stdout = execFileSync("uwf", ["step", "list", threadId], {
15
+ type StepListPayload = {
16
+ threadId: string;
17
+ items: StepListEntry[];
18
+ };
19
+
20
+ /** Shell out to `uwf step list --format raw-json` and return the bare-value payload's items. */
21
+ export function readThreadSteps(threadId: string): StepListEntry[] {
22
+ const stdout = execFileSync("uwf", ["--format", "raw-json", "step", "list", threadId], {
23
+ encoding: "utf8",
24
+ stdio: ["ignore", "pipe", "pipe"],
25
+ }).trim();
26
+ const parsed = JSON.parse(stdout) as StepListPayload;
27
+ // The 0.6 payload does not include a synthetic start entry — every item is a real step.
28
+ return parsed.items;
29
+ }
30
+
31
+ /**
32
+ * Per-step detail surface used by builtin judges. Mirrors the
33
+ * `StepDetailPayload` schema (`@uwf/output/step-detail`) but only exposes the
34
+ * fields judges currently consume.
35
+ */
36
+ export type StepDetail = {
37
+ hash: string;
38
+ role: string;
39
+ agent: string;
40
+ durationMs: number | null;
41
+ frontmatter: Record<string, unknown>;
42
+ usage: {
43
+ turns: number;
44
+ inputTokens: number;
45
+ outputTokens: number;
46
+ duration: number;
47
+ } | null;
48
+ };
49
+
50
+ type StepDetailRawPayload = {
51
+ hash: string;
52
+ role: string;
53
+ agent: string;
54
+ durationMs: number | null;
55
+ frontmatter: Record<string, unknown>;
56
+ // Usage is not exposed by the @uwf/output/step-detail schema yet; judges fall
57
+ // back to zeros when the field is null. Kept on the type for forward compat.
58
+ usage: {
59
+ turns: number;
60
+ inputTokens: number;
61
+ outputTokens: number;
62
+ duration: number;
63
+ } | null;
64
+ };
65
+
66
+ /**
67
+ * Shell out to `uwf step show <hash> --format raw-json` and return the bare-value
68
+ * step-detail payload. Used by judges that need richer per-step data than
69
+ * `readThreadSteps` exposes (e.g. frontmatter contents, token usage).
70
+ */
71
+ export function readStepDetail(stepHash: string): StepDetail {
72
+ const stdout = execFileSync("uwf", ["--format", "raw-json", "step", "show", stepHash], {
8
73
  encoding: "utf8",
9
74
  stdio: ["ignore", "pipe", "pipe"],
10
75
  }).trim();
11
- const parsed = JSON.parse(stdout) as ThreadStepsOutput;
12
- // steps[0] is the StartEntry; the rest are StepEntry records.
13
- return parsed.steps.slice(1) as StepEntry[];
76
+ const parsed = JSON.parse(stdout) as StepDetailRawPayload;
77
+ return {
78
+ hash: parsed.hash,
79
+ role: parsed.role,
80
+ agent: parsed.agent,
81
+ durationMs: parsed.durationMs,
82
+ frontmatter:
83
+ parsed.frontmatter !== null &&
84
+ typeof parsed.frontmatter === "object" &&
85
+ !Array.isArray(parsed.frontmatter)
86
+ ? parsed.frontmatter
87
+ : {},
88
+ usage: parsed.usage,
89
+ };
14
90
  }
@@ -1,7 +1,7 @@
1
1
  import { createLogger } from "@united-workforce/util";
2
2
 
3
3
  import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
4
- import { readThreadSteps } from "./read-steps.js";
4
+ import { readStepDetail, readThreadSteps } from "./read-steps.js";
5
5
  import type { BuiltinJudgeOutput } from "./types.js";
6
6
 
7
7
  const log = createLogger({ sink: { kind: "stderr" } });
@@ -30,7 +30,8 @@ export async function runTokenStatsJudge(threadId: string): Promise<BuiltinJudge
30
30
  const perStep: PerStepStats[] = [];
31
31
 
32
32
  for (const step of steps) {
33
- const usage = step.usage;
33
+ const detail = readStepDetail(step.hash);
34
+ const usage = detail.usage;
34
35
  const inputTokens = usage !== null ? usage.inputTokens : 0;
35
36
  const outputTokens = usage !== null ? usage.outputTokens : 0;
36
37
  const turns = usage !== null ? usage.turns : 0;
@@ -33,11 +33,16 @@ function runUwf(args: string[], cwd: string): string {
33
33
  ? err.stderr
34
34
  : err.stderr.toString("utf8");
35
35
  const detail = stderr.trim() !== "" ? `: ${stderr.trim()}` : "";
36
- throw new Error(`uwf ${args[0]} ${args[1]} failed${detail}`);
36
+ // Find the subcommand group + subcommand by skipping leading global flags
37
+ // (e.g. `--format raw-json`). The first non-flag token is the group.
38
+ const groupIdx = args.findIndex((a) => !a.startsWith("--"));
39
+ const group = groupIdx >= 0 ? (args[groupIdx] ?? "") : "";
40
+ const subcmd = groupIdx >= 0 ? (args[groupIdx + 1] ?? "") : "";
41
+ throw new Error(`uwf ${group} ${subcmd} failed${detail}`);
37
42
  }
38
43
  }
39
44
 
40
- /** Parse the thread ID from `uwf thread start` JSON output (`{ workflow, thread }`). */
45
+ /** Parse the thread ID from `uwf thread start --format raw-json` output (`{ threadId, workflowHash }`). */
41
46
  function parseThreadId(stdout: string): string {
42
47
  let parsed: unknown;
43
48
  try {
@@ -46,27 +51,51 @@ function parseThreadId(stdout: string): string {
46
51
  throw new Error(`uwf thread start did not emit valid JSON: ${stdout || "(empty)"}`);
47
52
  }
48
53
  const obj = parsed as Record<string, unknown>;
49
- const thread = obj.thread;
50
- if (typeof thread !== "string" || thread === "") {
51
- throw new Error(`uwf thread start output missing thread id: ${stdout}`);
54
+ const threadId = obj.threadId;
55
+ if (typeof threadId !== "string" || threadId === "") {
56
+ throw new Error(`uwf thread start output missing threadId: ${stdout}`);
52
57
  }
53
- return thread;
58
+ return threadId;
54
59
  }
55
60
 
56
61
  /**
57
62
  * Execute a workflow: create a thread, then run it for up to `maxSteps` steps.
58
63
  * Shells out to the uwf CLI rather than importing it directly.
64
+ *
65
+ * Both `thread start` and `thread exec` are invoked with `--format raw-json`
66
+ * so the legacy bare-value JSON shape is emitted (the 0.6 default is text).
67
+ * See `specs/cli-ocas-envelope-in-repo-consumer-migration.md`.
59
68
  */
60
69
  export async function execute(input: ExecuteInput): Promise<ExecuteResult> {
61
70
  const startOut = runUwf(
62
- ["thread", "start", input.workflow, "-p", input.prompt, "--cwd", input.workDir],
71
+ [
72
+ "--format",
73
+ "raw-json",
74
+ "thread",
75
+ "start",
76
+ input.workflow,
77
+ "-p",
78
+ input.prompt,
79
+ "--cwd",
80
+ input.workDir,
81
+ ],
63
82
  input.workDir,
64
83
  );
65
84
  const threadId = parseThreadId(startOut);
66
85
  log(LOG_START, `thread started thread=${threadId} workflow=${input.workflow}`);
67
86
 
68
87
  runUwf(
69
- ["thread", "exec", threadId, "--agent", input.agent, "-c", String(input.maxSteps)],
88
+ [
89
+ "--format",
90
+ "raw-json",
91
+ "thread",
92
+ "exec",
93
+ threadId,
94
+ "--agent",
95
+ input.agent,
96
+ "-c",
97
+ String(input.maxSteps),
98
+ ],
70
99
  input.workDir,
71
100
  );
72
101
  log(LOG_EXEC, `thread executed thread=${threadId} maxSteps=${input.maxSteps}`);