@united-workforce/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. package/LICENSE +21 -0
  2. package/dist/cli.d.ts +3 -0
  3. package/dist/cli.d.ts.map +1 -0
  4. package/dist/cli.js +14 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/commands/diff.d.ts +3 -0
  7. package/dist/commands/diff.d.ts.map +1 -0
  8. package/dist/commands/diff.js +36 -0
  9. package/dist/commands/diff.js.map +1 -0
  10. package/dist/commands/format.d.ts +11 -0
  11. package/dist/commands/format.d.ts.map +1 -0
  12. package/dist/commands/format.js +114 -0
  13. package/dist/commands/format.js.map +1 -0
  14. package/dist/commands/index.d.ts +8 -0
  15. package/dist/commands/index.d.ts.map +1 -0
  16. package/dist/commands/index.js +7 -0
  17. package/dist/commands/index.js.map +1 -0
  18. package/dist/commands/list.d.ts +3 -0
  19. package/dist/commands/list.d.ts.map +1 -0
  20. package/dist/commands/list.js +35 -0
  21. package/dist/commands/list.js.map +1 -0
  22. package/dist/commands/read.d.ts +10 -0
  23. package/dist/commands/read.d.ts.map +1 -0
  24. package/dist/commands/read.js +37 -0
  25. package/dist/commands/read.js.map +1 -0
  26. package/dist/commands/report.d.ts +3 -0
  27. package/dist/commands/report.d.ts.map +1 -0
  28. package/dist/commands/report.js +30 -0
  29. package/dist/commands/report.js.map +1 -0
  30. package/dist/commands/run.d.ts +3 -0
  31. package/dist/commands/run.d.ts.map +1 -0
  32. package/dist/commands/run.js +64 -0
  33. package/dist/commands/run.js.map +1 -0
  34. package/dist/commands/types.d.ts +9 -0
  35. package/dist/commands/types.d.ts.map +1 -0
  36. package/dist/commands/types.js +2 -0
  37. package/dist/commands/types.js.map +1 -0
  38. package/dist/index.d.ts +8 -0
  39. package/dist/index.d.ts.map +1 -0
  40. package/dist/index.js +6 -0
  41. package/dist/index.js.map +1 -0
  42. package/dist/judge/builtin/frontmatter.d.ts +8 -0
  43. package/dist/judge/builtin/frontmatter.d.ts.map +1 -0
  44. package/dist/judge/builtin/frontmatter.js +75 -0
  45. package/dist/judge/builtin/frontmatter.js.map +1 -0
  46. package/dist/judge/builtin/hallucination.d.ts +10 -0
  47. package/dist/judge/builtin/hallucination.d.ts.map +1 -0
  48. package/dist/judge/builtin/hallucination.js +16 -0
  49. package/dist/judge/builtin/hallucination.js.map +1 -0
  50. package/dist/judge/builtin/index.d.ts +7 -0
  51. package/dist/judge/builtin/index.d.ts.map +1 -0
  52. package/dist/judge/builtin/index.js +6 -0
  53. package/dist/judge/builtin/index.js.map +1 -0
  54. package/dist/judge/builtin/read-steps.d.ts +4 -0
  55. package/dist/judge/builtin/read-steps.d.ts.map +1 -0
  56. package/dist/judge/builtin/read-steps.js +12 -0
  57. package/dist/judge/builtin/read-steps.js.map +1 -0
  58. package/dist/judge/builtin/token-stats.d.ts +8 -0
  59. package/dist/judge/builtin/token-stats.d.ts.map +1 -0
  60. package/dist/judge/builtin/token-stats.js +35 -0
  61. package/dist/judge/builtin/token-stats.js.map +1 -0
  62. package/dist/judge/builtin/types.d.ts +15 -0
  63. package/dist/judge/builtin/types.d.ts.map +1 -0
  64. package/dist/judge/builtin/types.js +2 -0
  65. package/dist/judge/builtin/types.js.map +1 -0
  66. package/dist/judge/builtin/upstream.d.ts +10 -0
  67. package/dist/judge/builtin/upstream.d.ts.map +1 -0
  68. package/dist/judge/builtin/upstream.js +16 -0
  69. package/dist/judge/builtin/upstream.js.map +1 -0
  70. package/dist/judge/index.d.ts +3 -0
  71. package/dist/judge/index.d.ts.map +1 -0
  72. package/dist/judge/index.js +2 -0
  73. package/dist/judge/index.js.map +1 -0
  74. package/dist/judge/types.d.ts +15 -0
  75. package/dist/judge/types.d.ts.map +1 -0
  76. package/dist/judge/types.js +2 -0
  77. package/dist/judge/types.js.map +1 -0
  78. package/dist/runner/collect.d.ts +16 -0
  79. package/dist/runner/collect.d.ts.map +1 -0
  80. package/dist/runner/collect.js +129 -0
  81. package/dist/runner/collect.js.map +1 -0
  82. package/dist/runner/execute.d.ts +9 -0
  83. package/dist/runner/execute.d.ts.map +1 -0
  84. package/dist/runner/execute.js +72 -0
  85. package/dist/runner/execute.js.map +1 -0
  86. package/dist/runner/index.d.ts +5 -0
  87. package/dist/runner/index.d.ts.map +1 -0
  88. package/dist/runner/index.js +4 -0
  89. package/dist/runner/index.js.map +1 -0
  90. package/dist/runner/prepare.d.ts +7 -0
  91. package/dist/runner/prepare.d.ts.map +1 -0
  92. package/dist/runner/prepare.js +38 -0
  93. package/dist/runner/prepare.js.map +1 -0
  94. package/dist/runner/types.d.ts +70 -0
  95. package/dist/runner/types.d.ts.map +1 -0
  96. package/dist/runner/types.js +2 -0
  97. package/dist/runner/types.js.map +1 -0
  98. package/dist/storage/index.d.ts +4 -0
  99. package/dist/storage/index.d.ts.map +1 -0
  100. package/dist/storage/index.js +3 -0
  101. package/dist/storage/index.js.map +1 -0
  102. package/dist/storage/schemas.d.ts +7 -0
  103. package/dist/storage/schemas.d.ts.map +1 -0
  104. package/dist/storage/schemas.js +118 -0
  105. package/dist/storage/schemas.js.map +1 -0
  106. package/dist/storage/store.d.ts +10 -0
  107. package/dist/storage/store.d.ts.map +1 -0
  108. package/dist/storage/store.js +36 -0
  109. package/dist/storage/store.js.map +1 -0
  110. package/dist/storage/types.d.ts +30 -0
  111. package/dist/storage/types.d.ts.map +1 -0
  112. package/dist/storage/types.js +2 -0
  113. package/dist/storage/types.js.map +1 -0
  114. package/dist/task/index.d.ts +3 -0
  115. package/dist/task/index.d.ts.map +1 -0
  116. package/dist/task/index.js +2 -0
  117. package/dist/task/index.js.map +1 -0
  118. package/dist/task/loader.d.ts +6 -0
  119. package/dist/task/loader.d.ts.map +1 -0
  120. package/dist/task/loader.js +69 -0
  121. package/dist/task/loader.js.map +1 -0
  122. package/dist/task/types.d.ts +27 -0
  123. package/dist/task/types.d.ts.map +1 -0
  124. package/dist/task/types.js +2 -0
  125. package/dist/task/types.js.map +1 -0
  126. package/package.json +45 -0
  127. package/src/cli.ts +22 -0
  128. package/src/commands/diff.ts +38 -0
  129. package/src/commands/format.ts +148 -0
  130. package/src/commands/index.ts +7 -0
  131. package/src/commands/list.ts +43 -0
  132. package/src/commands/read.ts +41 -0
  133. package/src/commands/report.ts +32 -0
  134. package/src/commands/run.ts +84 -0
  135. package/src/commands/types.ts +9 -0
  136. package/src/index.ts +34 -0
  137. package/src/judge/builtin/frontmatter.ts +95 -0
  138. package/src/judge/builtin/hallucination.ts +17 -0
  139. package/src/judge/builtin/index.ts +6 -0
  140. package/src/judge/builtin/read-steps.ts +14 -0
  141. package/src/judge/builtin/token-stats.ts +53 -0
  142. package/src/judge/builtin/types.ts +16 -0
  143. package/src/judge/builtin/upstream.ts +17 -0
  144. package/src/judge/index.ts +10 -0
  145. package/src/judge/types.ts +15 -0
  146. package/src/runner/collect.ts +172 -0
  147. package/src/runner/execute.ts +87 -0
  148. package/src/runner/index.ts +15 -0
  149. package/src/runner/prepare.ts +45 -0
  150. package/src/runner/types.ts +85 -0
  151. package/src/storage/index.ts +9 -0
  152. package/src/storage/schemas.ts +123 -0
  153. package/src/storage/store.ts +42 -0
  154. package/src/storage/types.ts +33 -0
  155. package/src/task/index.ts +2 -0
  156. package/src/task/loader.ts +74 -0
  157. package/src/task/types.ts +28 -0
@@ -0,0 +1,75 @@
1
+ import { createLogger } from "@united-workforce/util";
2
+ import { parse as parseYaml } from "yaml";
3
+ import { EVAL_JUDGE_FRONTMATTER_SCHEMA } from "../../storage/index.js";
4
+ import { readThreadSteps } from "./read-steps.js";
5
+ const log = createLogger({ sink: { kind: "stderr" } });
6
+ const LOG_RESULT = "F2QH7R4M";
7
const FENCE = "---";

/**
 * Extract the YAML frontmatter block from a step output.
 *
 * The output must open with a line that is exactly `---`, and a later line
 * consisting of `---` (trailing whitespace tolerated) closes the block. Both LF
 * and CRLF line endings are accepted.
 *
 * @param output - Raw step output; any non-string value yields null.
 * @returns The YAML between the two fences with its lines joined by `\n` (an
 *   empty string for an empty block), or null when no well-formed block exists.
 */
function extractFrontmatterYaml(output) {
  if (typeof output !== "string") {
    return null;
  }
  // Split on LF or CRLF so Windows-style output is not rejected outright.
  const lines = output.split(/\r?\n/);
  // A lone "---" with no newline after it is not an opening fence.
  if (lines.length < 2 || lines[0] !== FENCE) {
    return null;
  }
  // The closing fence must sit on its own line: a line that merely starts with
  // "---" (for example "---note" or "-----") does not terminate the block.
  const closeLine = lines.findIndex((line, index) => index > 0 && line.trimEnd() === FENCE);
  if (closeLine === -1) {
    return null;
  }
  return lines.slice(1, closeLine).join("\n");
}
27
/**
 * Check a single step's output for well-formed frontmatter.
 *
 * @param output - Raw step output to inspect.
 * @returns A list of error messages; an empty list means the step is valid.
 *   At most one message is reported, for the first check that fails.
 */
function validateStepFrontmatter(output) {
  const frontmatter = extractFrontmatterYaml(output);
  if (frontmatter === null) {
    return ["output does not begin with a valid '---' frontmatter block"];
  }

  let document;
  try {
    document = parseYaml(frontmatter);
  } catch (cause) {
    const detail = cause instanceof Error ? cause.message : String(cause);
    return [`frontmatter YAML failed to parse: ${detail}`];
  }

  // Only a plain mapping can carry a `$status` key; scalars, null and sequences cannot.
  const isMapping = document !== null && typeof document === "object" && !Array.isArray(document);
  if (!isMapping) {
    return ["frontmatter is not a YAML mapping"];
  }

  const { $status: status } = document;
  const hasStatus = typeof status === "string" && status.trim() !== "";
  return hasStatus ? [] : ["$status field is missing or not a non-empty string"];
}
50
/**
 * Deterministic judge requiring every step's agent output to carry valid YAML
 * frontmatter with a non-empty `$status` field.
 *
 * @param threadId - Thread whose steps are read via `readThreadSteps`.
 * @returns Score `stepsValid / stepsTotal` (0 when the thread has no steps),
 *   plus the counts and the per-step error details for every invalid step.
 */
export async function runFrontmatterJudge(threadId) {
  const steps = readThreadSteps(threadId);

  // Keep only the failing steps, remembering their position in the thread.
  const invalidSteps = steps.flatMap((step, stepIndex) => {
    const errors = validateStepFrontmatter(step.output);
    return errors.length > 0 ? [{ stepIndex, role: step.role, errors }] : [];
  });

  const stepsTotal = steps.length;
  const stepsValid = stepsTotal - invalidSteps.length;
  log(LOG_RESULT, `frontmatter thread=${threadId} valid=${stepsValid}/${stepsTotal}`);

  return {
    score: stepsTotal > 0 ? stepsValid / stepsTotal : 0,
    data: { stepsTotal, stepsValid, invalidSteps },
    schema: EVAL_JUDGE_FRONTMATTER_SCHEMA,
  };
}
75
+ //# sourceMappingURL=frontmatter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"frontmatter.js","sourceRoot":"","sources":["../../../src/judge/builtin/frontmatter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AACtD,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,MAAM,MAAM,CAAC;AAE1C,OAAO,EAAE,6BAA6B,EAAE,MAAM,wBAAwB,CAAC;AACvE,OAAO,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAGlD,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,UAAU,GAAG,UAAU,CAAC;AAE9B,MAAM,KAAK,GAAG,KAAK,CAAC;AAQpB;;;;GAIG;AACH,SAAS,sBAAsB,CAAC,MAAe;IAC7C,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;QAC/B,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,GAAG,KAAK,IAAI,CAAC,EAAE,CAAC;QACrC,OAAO,IAAI,CAAC;IACd,CAAC;IACD,MAAM,IAAI,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC5C,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,KAAK,EAAE,CAAC,CAAC;IAC9C,IAAI,UAAU,KAAK,CAAC,CAAC,EAAE,CAAC;QACtB,OAAO,IAAI,CAAC;IACd,CAAC;IACD,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;AACnC,CAAC;AAED,wFAAwF;AACxF,SAAS,uBAAuB,CAAC,MAAe;IAC9C,MAAM,IAAI,GAAG,sBAAsB,CAAC,MAAM,CAAC,CAAC;IAC5C,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,OAAO,CAAC,4DAA4D,CAAC,CAAC;IACxE,CAAC;IAED,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC3B,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,OAAO,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QAC3D,OAAO,CAAC,qCAAqC,OAAO,EAAE,CAAC,CAAC;IAC1D,CAAC;IAED,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC3E,OAAO,CAAC,mCAAmC,CAAC,CAAC;IAC/C,CAAC;IAED,MAAM,MAAM,GAAI,MAAkC,CAAC,OAAO,CAAC;IAC3D,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;QACvD,OAAO,CAAC,oDAAoD,CAAC,CAAC;IAChE,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,QAAgB;IACxD,MAAM,KAAK,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;IAExC,MAAM,YAAY,GAAkB,EAAE,CAAC;IACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,MAA
M,GAAG,uBAAuB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACpD,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,YAAY,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,KAAK,CAAC,MAAM,CAAC;IAChC,MAAM,UAAU,GAAG,UAAU,GAAG,YAAY,CAAC,MAAM,CAAC;IACpD,MAAM,KAAK,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;IAE3D,GAAG,CAAC,UAAU,EAAE,sBAAsB,QAAQ,UAAU,UAAU,IAAI,UAAU,EAAE,CAAC,CAAC;IAEpF,OAAO;QACL,KAAK;QACL,IAAI,EAAE,EAAE,UAAU,EAAE,UAAU,EAAE,YAAY,EAAE;QAC9C,MAAM,EAAE,6BAA6B;KACtC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,10 @@
1
+ import type { BuiltinJudgeOutput } from "./types.js";
2
+ /**
3
+ * LLM-as-judge: detects claims in each step's output that are not grounded in
4
+ * the available context (hallucinations).
5
+ *
6
+ * TODO: LLM-as-judge — needs provider config to call LLM API. Returns a stub
7
+ * (score 0, empty perStep) until the LLM call path is wired up.
+ *
+ * @param _threadId - Thread to judge; the current stub implementation ignores it.
+ * @returns A promise resolving to the stub output described above.
8
+ */
9
+ export declare function runHallucinationJudge(_threadId: string): Promise<BuiltinJudgeOutput>;
10
+ //# sourceMappingURL=hallucination.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"hallucination.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/hallucination.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAErD;;;;;;GAMG;AACH,wBAAsB,qBAAqB,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAM1F"}
@@ -0,0 +1,16 @@
1
+ import { EVAL_JUDGE_HALLUCINATION_SCHEMA } from "../../storage/index.js";
2
/**
 * LLM-as-judge placeholder for hallucination detection, i.e. claims in a
 * step's output that are not grounded in the available context.
 *
 * TODO: LLM-as-judge — needs provider config to call LLM API. Until that call
 * path is wired up this resolves to a fixed stub: score 0 and an empty
 * `perStep` list.
 *
 * @param _threadId - Unused by the stub.
 */
export async function runHallucinationJudge(_threadId) {
  const perStep = [];
  const stub = {
    score: 0,
    data: { perStep },
    schema: EVAL_JUDGE_HALLUCINATION_SCHEMA,
  };
  return stub;
}
16
+ //# sourceMappingURL=hallucination.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"hallucination.js","sourceRoot":"","sources":["../../../src/judge/builtin/hallucination.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,+BAA+B,EAAE,MAAM,wBAAwB,CAAC;AAGzE;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,qBAAqB,CAAC,SAAiB;IAC3D,OAAO;QACL,KAAK,EAAE,CAAC;QACR,IAAI,EAAE,EAAE,OAAO,EAAE,EAAE,EAAE;QACrB,MAAM,EAAE,+BAA+B;KACxC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,7 @@
1
+ export { runFrontmatterJudge } from "./frontmatter.js";
2
+ export { runHallucinationJudge } from "./hallucination.js";
3
+ export { readThreadSteps } from "./read-steps.js";
4
+ export { runTokenStatsJudge } from "./token-stats.js";
5
+ export type { BuiltinJudge, BuiltinJudgeOutput } from "./types.js";
6
+ export { runUpstreamJudge } from "./upstream.js";
7
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,kBAAkB,CAAC;AACvD,OAAO,EAAE,qBAAqB,EAAE,MAAM,oBAAoB,CAAC;AAC3D,OAAO,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAClD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,YAAY,EAAE,YAAY,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AACnE,OAAO,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC"}
@@ -0,0 +1,6 @@
1
+ export { runFrontmatterJudge } from "./frontmatter.js";
2
+ export { runHallucinationJudge } from "./hallucination.js";
3
+ export { readThreadSteps } from "./read-steps.js";
4
+ export { runTokenStatsJudge } from "./token-stats.js";
5
+ export { runUpstreamJudge } from "./upstream.js";
6
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/judge/builtin/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,kBAAkB,CAAC;AACvD,OAAO,EAAE,qBAAqB,EAAE,MAAM,oBAAoB,CAAC;AAC3D,OAAO,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAClD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAEtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC"}
@@ -0,0 +1,4 @@
1
+ import type { StepEntry } from "@united-workforce/protocol";
2
+ /** Shell out to `uwf step list <threadId>`, parse its stdout as JSON and return the step entries with the leading start entry dropped. Throws if the command fails or its stdout is not valid JSON. */
3
+ export declare function readThreadSteps(threadId: string): StepEntry[];
4
+ //# sourceMappingURL=read-steps.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"read-steps.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/read-steps.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAqB,MAAM,4BAA4B,CAAC;AAE/E,8FAA8F;AAC9F,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,SAAS,EAAE,CAQ7D"}
@@ -0,0 +1,12 @@
1
+ import { execFileSync } from "node:child_process";
2
/**
 * Shell out to `uwf step list <threadId>` and return the parsed step entries,
 * excluding the leading start entry.
 *
 * @param threadId - Thread whose steps are listed.
 * @returns Every element of the payload's `steps` array after the first.
 * @throws When the command fails, its stdout is not valid JSON, or the parsed
 *   payload has no `steps` array.
 */
export function readThreadSteps(threadId) {
  const stdout = execFileSync("uwf", ["step", "list", threadId], {
    encoding: "utf8",
    stdio: ["ignore", "pipe", "pipe"],
    // Step entries embed full agent outputs, so the listing can easily exceed
    // Node's 1 MiB default; use the same 50 MiB cap as the task-judge runner.
    maxBuffer: 50 * 1024 * 1024,
  }).trim();
  const parsed = JSON.parse(stdout);
  // Fail with a descriptive error rather than a TypeError on `.steps.slice`.
  if (parsed === null || typeof parsed !== "object" || !Array.isArray(parsed.steps)) {
    throw new Error(`uwf step list ${threadId} did not return a JSON object with a "steps" array`);
  }
  // steps[0] is the StartEntry; the rest are StepEntry records.
  return parsed.steps.slice(1);
}
12
+ //# sourceMappingURL=read-steps.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"read-steps.js","sourceRoot":"","sources":["../../../src/judge/builtin/read-steps.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAIlD,8FAA8F;AAC9F,MAAM,UAAU,eAAe,CAAC,QAAgB;IAC9C,MAAM,MAAM,GAAG,YAAY,CAAC,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE;QAC7D,QAAQ,EAAE,MAAM;QAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;KAClC,CAAC,CAAC,IAAI,EAAE,CAAC;IACV,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAsB,CAAC;IACvD,8DAA8D;IAC9D,OAAO,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAgB,CAAC;AAC9C,CAAC"}
@@ -0,0 +1,8 @@
1
+ import type { BuiltinJudgeOutput } from "./types.js";
2
+ /**
3
+ * Informational judge: aggregate token usage across every step. Always scores
4
+ * 1.0 — it never penalizes a run, it only reports usage. Steps with null usage
5
+ * contribute zeros.
+ *
+ * @param threadId - Thread whose steps are read and summed.
+ * @returns A promise resolving to score 1.0 with the input/output/turn totals
+ * and a per-step breakdown as `data`.
6
+ */
7
+ export declare function runTokenStatsJudge(threadId: string): Promise<BuiltinJudgeOutput>;
8
+ //# sourceMappingURL=token-stats.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"token-stats.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/token-stats.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAcrD;;;;GAIG;AACH,wBAAsB,kBAAkB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC,CA6BtF"}
@@ -0,0 +1,35 @@
1
+ import { createLogger } from "@united-workforce/util";
2
+ import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
3
+ import { readThreadSteps } from "./read-steps.js";
4
+ const log = createLogger({ sink: { kind: "stderr" } });
5
+ const LOG_RESULT = "T7KQ3M9P";
6
/**
 * Informational judge: aggregate token usage across every step. Always scores
 * 1.0 — it never penalizes a run, it only reports usage.
 *
 * Steps whose `usage` is null or absent contribute zeros, and so does any
 * individual counter missing from a usage record, so the totals never become
 * NaN because of an incomplete entry.
 *
 * @param threadId - Thread whose steps are read via `readThreadSteps`.
 * @returns Score 1.0 with `{ totalInput, totalOutput, totalTurns, perStep }`,
 *   where `perStep` lists role, token counts, turns and duration per step.
 */
export async function runTokenStatsJudge(threadId) {
  const steps = readThreadSteps(threadId);
  let totalInput = 0;
  let totalOutput = 0;
  let totalTurns = 0;
  const perStep = [];
  for (const step of steps) {
    // The entries come from parsed CLI JSON, so `usage` may be null or simply
    // missing; optional chaining covers both without throwing.
    const usage = step.usage;
    const inputTokens = usage?.inputTokens ?? 0;
    const outputTokens = usage?.outputTokens ?? 0;
    const turns = usage?.turns ?? 0;
    const duration = usage?.duration ?? 0;
    totalInput += inputTokens;
    totalOutput += outputTokens;
    totalTurns += turns;
    perStep.push({ role: step.role, inputTokens, outputTokens, turns, duration });
  }
  log(LOG_RESULT, `token-stats thread=${threadId} in=${totalInput} out=${totalOutput}`);
  return {
    score: 1.0,
    data: { totalInput, totalOutput, totalTurns, perStep },
    schema: EVAL_JUDGE_TOKEN_STATS_SCHEMA,
  };
}
35
+ //# sourceMappingURL=token-stats.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"token-stats.js","sourceRoot":"","sources":["../../../src/judge/builtin/token-stats.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEtD,OAAO,EAAE,6BAA6B,EAAE,MAAM,wBAAwB,CAAC;AACvE,OAAO,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAGlD,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,UAAU,GAAG,UAAU,CAAC;AAU9B;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,QAAgB;IACvD,MAAM,KAAK,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;IAExC,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,MAAM,OAAO,GAAmB,EAAE,CAAC;IAEnC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC;QACzB,MAAM,WAAW,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,YAAY,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7D,MAAM,KAAK,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/C,MAAM,QAAQ,GAAG,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAErD,UAAU,IAAI,WAAW,CAAC;QAC1B,WAAW,IAAI,YAAY,CAAC;QAC5B,UAAU,IAAI,KAAK,CAAC;QAEpB,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,WAAW,EAAE,YAAY,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;IAChF,CAAC;IAED,GAAG,CAAC,UAAU,EAAE,sBAAsB,QAAQ,OAAO,UAAU,QAAQ,WAAW,EAAE,CAAC,CAAC;IAEtF,OAAO;QACL,KAAK,EAAE,GAAG;QACV,IAAI,EAAE,EAAE,UAAU,EAAE,WAAW,EAAE,UAAU,EAAE,OAAO,EAAE;QACtD,MAAM,EAAE,6BAA6B;KACtC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,15 @@
1
+ import type { JSONSchema } from "@ocas/core";
2
+ /**
3
+ * Output produced by a builtin judge. Structurally identical to the runner's
4
+ * `JudgeRunOutput`; defined locally to keep the judge module free of a
5
+ * dependency on the runner module.
+ *
+ * `score` is the judge's numeric result and `data` is its judge-specific
+ * payload, which is persisted under `schema`.
6
+ */
7
+ export type BuiltinJudgeOutput = {
8
+ score: number;
9
+ data: unknown;
10
+ /** Schema describing `data`, used when persisting to CAS. */
11
+ schema: JSONSchema;
12
+ };
13
+ /** A builtin judge takes a thread ID, analyzes that thread's steps and resolves to a scored result. */
14
+ export type BuiltinJudge = (threadId: string) => Promise<BuiltinJudgeOutput>;
15
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAE7C;;;;GAIG;AACH,MAAM,MAAM,kBAAkB,GAAG;IAC/B,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,OAAO,CAAC;IACd,6DAA6D;IAC7D,MAAM,EAAE,UAAU,CAAC;CACpB,CAAC;AAEF,6EAA6E;AAC7E,MAAM,MAAM,YAAY,GAAG,CAAC,QAAQ,EAAE,MAAM,KAAK,OAAO,CAAC,kBAAkB,CAAC,CAAC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/judge/builtin/types.ts"],"names":[],"mappings":""}
@@ -0,0 +1,10 @@
1
+ import type { BuiltinJudgeOutput } from "./types.js";
2
+ /**
3
+ * LLM-as-judge: measures how well each role consumed the relevant outputs from
4
+ * upstream steps.
5
+ *
6
+ * TODO: LLM-as-judge — needs provider config to call LLM API. Returns a stub
7
+ * (score 0, empty perStep) until the LLM call path is wired up.
+ *
+ * @param _threadId - Thread to judge; the current stub implementation ignores it.
+ * @returns A promise resolving to the stub output described above.
8
+ */
9
+ export declare function runUpstreamJudge(_threadId: string): Promise<BuiltinJudgeOutput>;
10
+ //# sourceMappingURL=upstream.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"upstream.d.ts","sourceRoot":"","sources":["../../../src/judge/builtin/upstream.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAErD;;;;;;GAMG;AACH,wBAAsB,gBAAgB,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAMrF"}
@@ -0,0 +1,16 @@
1
+ import { EVAL_JUDGE_UPSTREAM_SCHEMA } from "../../storage/index.js";
2
/**
 * LLM-as-judge placeholder for upstream consumption, i.e. how well each role
 * used the relevant outputs of the steps before it.
 *
 * TODO: LLM-as-judge — needs provider config to call LLM API. Until that call
 * path is wired up this resolves to a fixed stub: score 0 and an empty
 * `perStep` list.
 *
 * @param _threadId - Unused by the stub.
 */
export async function runUpstreamJudge(_threadId) {
  const perStep = [];
  const stub = {
    score: 0,
    data: { perStep },
    schema: EVAL_JUDGE_UPSTREAM_SCHEMA,
  };
  return stub;
}
16
+ //# sourceMappingURL=upstream.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"upstream.js","sourceRoot":"","sources":["../../../src/judge/builtin/upstream.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,0BAA0B,EAAE,MAAM,wBAAwB,CAAC;AAGpE;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,SAAiB;IACtD,OAAO;QACL,KAAK,EAAE,CAAC;QACR,IAAI,EAAE,EAAE,OAAO,EAAE,EAAE,EAAE;QACrB,MAAM,EAAE,0BAA0B;KACnC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,3 @@
1
+ export { type BuiltinJudge, type BuiltinJudgeOutput, readThreadSteps, runFrontmatterJudge, runHallucinationJudge, runTokenStatsJudge, runUpstreamJudge, } from "./builtin/index.js";
2
+ export type { JudgeInput, JudgeOutput } from "./types.js";
3
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/judge/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,YAAY,EACjB,KAAK,kBAAkB,EACvB,eAAe,EACf,mBAAmB,EACnB,qBAAqB,EACrB,kBAAkB,EAClB,gBAAgB,GACjB,MAAM,oBAAoB,CAAC;AAC5B,YAAY,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC"}
@@ -0,0 +1,2 @@
1
+ export { readThreadSteps, runFrontmatterJudge, runHallucinationJudge, runTokenStatsJudge, runUpstreamJudge, } from "./builtin/index.js";
2
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/judge/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAGL,eAAe,EACf,mBAAmB,EACnB,qBAAqB,EACrB,kBAAkB,EAClB,gBAAgB,GACjB,MAAM,oBAAoB,CAAC"}
@@ -0,0 +1,15 @@
1
+ /** Output shape every judge must produce on stdout (JSON). The runner parses the last line of the judge's trimmed stdout as this object. */
2
+ export type JudgeOutput<T = unknown> = {
3
+ /** Score between 0.0 and 1.0. The runner only checks that this is a number; the range itself is not enforced. */
4
+ score: number;
5
+ /** Judge-specific structured data, stored in CAS with its own schema. */
6
+ data: T;
7
+ };
8
+ /** Input context passed to judge scripts via argv, in the order `node <entry> <cwd> <threadId>`. */
9
+ export type JudgeInput = {
10
+ /** Working directory where the task was executed. */
11
+ cwd: string;
12
+ /** Thread ID of the eval run. */
13
+ threadId: string;
14
+ };
15
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/judge/types.ts"],"names":[],"mappings":"AAAA,8DAA8D;AAC9D,MAAM,MAAM,WAAW,CAAC,CAAC,GAAG,OAAO,IAAI;IACrC,iCAAiC;IACjC,KAAK,EAAE,MAAM,CAAC;IACd,yEAAyE;IACzE,IAAI,EAAE,CAAC,CAAC;CACT,CAAC;AAEF,sDAAsD;AACtD,MAAM,MAAM,UAAU,GAAG;IACvB,qDAAqD;IACrD,GAAG,EAAE,MAAM,CAAC;IACZ,iCAAiC;IACjC,QAAQ,EAAE,MAAM,CAAC;CAClB,CAAC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/judge/types.ts"],"names":[],"mappings":""}
@@ -0,0 +1,16 @@
1
+ import type { CollectInput, CollectResult, JudgeRunner } from "./types.js";
2
+ /**
3
+ * Compute the weighted overall score. Judges with weight 0 are informational
4
+ * and do not affect the result (they contribute 0 to both numerator and
5
+ * denominator). Returns 0 when total weight is 0.
+ *
+ * @param judges - Score/weight pairs, one per judge.
+ * @returns sum(score * weight) / sum(weight), or 0 when sum(weight) is not positive.
6
+ */
7
+ export declare function computeOverall(judges: ReadonlyArray<{
8
+ score: number;
9
+ weight: number;
10
+ }>): number;
11
+ /**
12
+ * Run all judges, store their data and the overall eval-run record in CAS, then
13
+ * index the run under `@uwf/eval/<task>/latest`.
+ *
+ * @param input - Stores, directories, thread ID, task manifest and config for the run.
+ * @param runJudge - Optional judge runner; the default dispatches builtin judges
+ * by name and spawns the entry script of task-provided judges.
+ * @returns The stored run's hash, the overall score and each judge's name, score and weight.
14
+ */
15
+ export declare function collect(input: CollectInput, runJudge?: JudgeRunner): Promise<CollectResult>;
16
+ //# sourceMappingURL=collect.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"collect.d.ts","sourceRoot":"","sources":["../../src/runner/collect.ts"],"names":[],"mappings":"AAmBA,OAAO,KAAK,EACV,YAAY,EACZ,aAAa,EACb,WAAW,EAGZ,MAAM,YAAY,CAAC;AAUpB;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,aAAa,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,CAAC,GAAG,MAAM,CAQ/F;AAkFD;;;GAGG;AACH,wBAAsB,OAAO,CAC3B,KAAK,EAAE,YAAY,EACnB,QAAQ,GAAE,WAAgC,GACzC,OAAO,CAAC,aAAa,CAAC,CAkCxB"}
@@ -0,0 +1,129 @@
1
+ import { execFileSync } from "node:child_process";
2
+ import { readFile } from "node:fs/promises";
3
+ import { resolve } from "node:path";
4
+ import { putSchema } from "@ocas/core";
5
+ import { createLogger } from "@united-workforce/util";
6
+ import { runFrontmatterJudge, runHallucinationJudge, runTokenStatsJudge, runUpstreamJudge, } from "../judge/index.js";
7
+ import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js";
8
+ const log = createLogger({ sink: { kind: "stderr" } });
9
+ const LOG_JUDGE = "CT6N3P2K";
10
+ const LOG_STORED = "CT9V2Q7M";
11
+ /** Permissive schema for judge data without a dedicated schema (e.g. builtin placeholders). */
12
+ const GENERIC_DATA_SCHEMA = { type: "object" };
13
/**
 * Weighted mean of the judges' scores.
 *
 * A judge with weight 0 is informational: it adds nothing to either the
 * weighted sum or the total weight, so it cannot move the result.
 *
 * @param judges - Score/weight pairs, one per judge.
 * @returns sum(score * weight) / sum(weight), or 0 when the total weight is
 *   not positive.
 */
export function computeOverall(judges) {
  const [weightedSum, weightSum] = judges.reduce(
    ([scoreAcc, weightAcc], { score, weight }) => [scoreAcc + score * weight, weightAcc + weight],
    [0, 0],
  );
  if (weightSum > 0) {
    return weightedSum / weightSum;
  }
  return 0;
}
27
/**
 * Run a task-provided judge script as `node <entry> <cwd> <threadId>`.
 *
 * The script's stdout is captured, and the last line of the trimmed output is
 * parsed as JSON. That value must be an object carrying a numeric `score`; its
 * `data` is passed through untouched.
 *
 * @param taskDir - Directory that `judge.entry` and `judge.schema` are resolved against.
 * @param workDir - Passed to the script as its first argument.
 * @param threadId - Passed to the script as its second argument.
 * @param judge - Judge definition; `name`, `entry` and `schema` are read.
 * @returns `{ score, data, schema }`, where `schema` is loaded from
 *   `judge.schema` when set and is the permissive generic schema otherwise.
 * @throws Error when the judge has no entry, the script fails, its last stdout
 *   line is not valid JSON, or the parsed value lacks a numeric `score`.
 */
async function runTaskJudge(taskDir, workDir, threadId, judge) {
  if (judge.entry === null) {
    throw new Error(`judge "${judge.name}" is not builtin but has no entry`);
  }
  const entryPath = resolve(taskDir, judge.entry);
  let stdout;
  try {
    stdout = execFileSync("node", [entryPath, workDir, threadId], {
      encoding: "utf8",
      stdio: ["ignore", "pipe", "pipe"],
      maxBuffer: 50 * 1024 * 1024,
    });
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e);
    throw new Error(`judge "${judge.name}" failed: ${message}`);
  }
  // Only the final line counts, so a script may print diagnostics before it.
  const line = stdout.trim().split("\n").pop()?.trim() ?? "";
  let parsed;
  try {
    parsed = JSON.parse(line);
  } catch {
    throw new Error(`judge "${judge.name}" stdout is not valid JSON: ${line || "(empty)"}`);
  }
  const output = parsed;
  // `null` is valid JSON; reject it here so it surfaces as the judge error
  // below rather than a TypeError from reading `.score` on null.
  if (output === null || typeof output !== "object" || typeof output.score !== "number") {
    throw new Error(`judge "${judge.name}" output missing numeric score`);
  }
  const schema =
    judge.schema !== null ? await loadSchema(resolve(taskDir, judge.schema)) : GENERIC_DATA_SCHEMA;
  return { score: output.score, data: output.data, schema };
}
60
/**
 * Read an OCAS JSON Schema file and parse it.
 *
 * @param path - Location of the schema file, read as UTF-8.
 * @returns The parsed JSON value; rejects when the file cannot be read or is
 *   not valid JSON.
 */
async function loadSchema(path) {
  return JSON.parse(await readFile(path, "utf8"));
}
65
/**
 * Look up a builtin judge by its manifest name and run it on the thread.
 *
 * @param name - Builtin judge name from the manifest.
 * @param threadId - Thread handed to the selected judge.
 * @returns The selected judge's output.
 * @throws Error when `name` does not match any builtin judge.
 */
async function runBuiltinJudge(name, threadId) {
  const builtins = new Map([
    ["frontmatter-compliance", runFrontmatterJudge],
    ["upstream-consumption", runUpstreamJudge],
    ["hallucination", runHallucinationJudge],
    ["token-stats", runTokenStatsJudge],
  ]);
  const judge = builtins.get(name);
  if (judge === undefined) {
    throw new Error(`unknown builtin judge "${name}"`);
  }
  return judge(threadId);
}
80
/**
 * Default judge runner: a judge flagged `builtin` is dispatched by name, and
 * any other judge has its entry script spawned.
 */
const defaultJudgeRunner = async (taskDir, workDir, threadId, judge) =>
  judge.builtin
    ? runBuiltinJudge(judge.name, threadId)
    : runTaskJudge(taskDir, workDir, threadId, judge);
90
/**
 * Persist a judge's data to CAS under its schema.
 *
 * @param store - Store that receives both the schema and the data.
 * @param schema - Schema describing `data`; stored first to obtain its hash.
 * @param data - Judge payload to store under that schema hash.
 * @returns The CAS hash returned for the stored data.
 */
async function storeJudgeData(store, schema, data) {
  const schemaHash = await putSchema(store, schema);
  const dataHash = await store.cas.put(schemaHash, data);
  return dataHash;
}
95
/**
 * Run every judge in the manifest, store each judge's data and the overall
 * eval-run record in CAS, then index the run under `@uwf/eval/<task>/latest`.
 *
 * Judges run one after another in manifest order.
 *
 * @param input - Stores, directories, thread ID, task manifest and config.
 * @param runJudge - Judge runner; defaults to `defaultJudgeRunner`.
 * @returns The stored run's hash, the overall score and each judge's name,
 *   score and weight.
 */
export async function collect(input, runJudge = defaultJudgeRunner) {
  const { taskDir, workDir, threadId, manifest, config } = input;
  const { store, varStore } = input.evalStore;

  const records = [];
  for (const judge of manifest.judges) {
    const outcome = await runJudge(taskDir, workDir, threadId, judge);
    const dataHash = await storeJudgeData(store, outcome.schema, outcome.data);
    records.push({ name: judge.name, score: outcome.score, weight: judge.weight, dataHash });
    log(LOG_JUDGE, `judge=${judge.name} score=${outcome.score} weight=${judge.weight}`);
  }

  const overall = computeOverall(records);
  // Build the record, timestamp included, before touching the store again.
  const runRecord = {
    task: manifest.name,
    config,
    threadId,
    judges: records,
    overall,
    timestamp: Date.now(),
  };
  const runSchemaHash = await putSchema(store, EVAL_RUN_SCHEMA);
  const runHash = await store.cas.put(runSchemaHash, runRecord);
  setEvalLatest(varStore, manifest.name, runHash);
  log(LOG_STORED, `stored eval-run task=${manifest.name} hash=${runHash} overall=${overall}`);

  // The returned summary omits each judge's data hash.
  const judges = records.map(({ name, score, weight }) => ({ name, score, weight }));
  return { runHash, overall, judges };
}
129
+ //# sourceMappingURL=collect.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"collect.js","sourceRoot":"","sources":["../../src/runner/collect.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAGpC,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAEvC,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAGtD,OAAO,EACL,mBAAmB,EACnB,qBAAqB,EACrB,kBAAkB,EAClB,gBAAgB,GACjB,MAAM,mBAAmB,CAAC;AAE3B,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAUrE,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,SAAS,GAAG,UAAU,CAAC;AAC7B,MAAM,UAAU,GAAG,UAAU,CAAC;AAE9B,+FAA+F;AAC/F,MAAM,mBAAmB,GAAe,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;AAE3D;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,MAAwD;IACrF,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,WAAW,IAAI,KAAK,CAAC,MAAM,CAAC;QAC5B,QAAQ,IAAI,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC;IACzC,CAAC;IACD,OAAO,WAAW,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;AACtD,CAAC;AAED,yEAAyE;AACzE,KAAK,UAAU,YAAY,CACzB,OAAe,EACf,OAAe,EACf,QAAgB,EAChB,KAAiB;IAEjB,IAAI,KAAK,CAAC,KAAK,KAAK,IAAI,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,UAAU,KAAK,CAAC,IAAI,mCAAmC,CAAC,CAAC;IAC3E,CAAC;IACD,MAAM,SAAS,GAAG,OAAO,CAAC,OAAO,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC;IAEhD,IAAI,MAAc,CAAC;IACnB,IAAI,CAAC;QACH,MAAM,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,SAAS,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE;YAC5D,QAAQ,EAAE,MAAM;YAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;YACjC,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI;SAC5B,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,OAAO,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,IAAI,KAAK,CAAC,UAAU,KAAK,CAAC,IAAI,aAAa,OAAO,EAAE,CAAC,CAAC;IAC9D,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC3D,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC5B,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,UAAU,KA
AK,CAAC,IAAI,+BAA+B,IAAI,IAAI,SAAS,EAAE,CAAC,CAAC;IAC1F,CAAC;IACD,MAAM,MAAM,GAAG,MAAqB,CAAC;IACrC,IAAI,OAAO,MAAM,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;QACrC,MAAM,IAAI,KAAK,CAAC,UAAU,KAAK,CAAC,IAAI,gCAAgC,CAAC,CAAC;IACxE,CAAC;IAED,MAAM,MAAM,GACV,KAAK,CAAC,MAAM,KAAK,IAAI,CAAC,CAAC,CAAC,MAAM,UAAU,CAAC,OAAO,CAAC,OAAO,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,mBAAmB,CAAC;IACjG,OAAO,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,MAAM,EAAE,CAAC;AAC5D,CAAC;AAED,+CAA+C;AAC/C,KAAK,UAAU,UAAU,CAAC,IAAY;IACpC,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAC1C,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAe,CAAC;AACxC,CAAC;AAED,2EAA2E;AAC3E,KAAK,UAAU,eAAe,CAAC,IAAY,EAAE,QAAgB;IAC3D,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,wBAAwB;YAC3B,OAAO,mBAAmB,CAAC,QAAQ,CAAC,CAAC;QACvC,KAAK,sBAAsB;YACzB,OAAO,gBAAgB,CAAC,QAAQ,CAAC,CAAC;QACpC,KAAK,eAAe;YAClB,OAAO,qBAAqB,CAAC,QAAQ,CAAC,CAAC;QACzC,KAAK,aAAa;YAChB,OAAO,kBAAkB,CAAC,QAAQ,CAAC,CAAC;QACtC;YACE,MAAM,IAAI,KAAK,CAAC,0BAA0B,IAAI,GAAG,CAAC,CAAC;IACvD,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,MAAM,kBAAkB,GAAgB,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,EAAE;IAClF,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC;QAClB,OAAO,eAAe,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IAC/C,CAAC;IACD,OAAO,YAAY,CAAC,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;AACzD,CAAC,CAAC;AAEF,0EAA0E;AAC1E,KAAK,UAAU,cAAc,CAAC,KAAY,EAAE,MAAkB,EAAE,IAAa;IAC3E,MAAM,UAAU,GAAG,MAAM,SAAS,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;IAClD,OAAO,CAAC,MAAM,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,EAAE,IAAI,CAAC,CAAW,CAAC;AAC3D,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,KAAmB,EACnB,WAAwB,kBAAkB;IAE1C,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,KAAK,CAAC;IAC1E,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC;IAEtC,MAAM,OAAO,GAAsB,EAAE,CAAC;IACtC,KAAK,MAAM,KAAK,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC;QACpC,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;QACjE,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;QACzE,OAAO,CAAC,IAAI,CAAC,EAAE,IAA
I,EAAE,KAAK,CAAC,IAAI,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,CAAC,MAAM,EAAE,QAAQ,EAAE,CAAC,CAAC;QACxF,GAAG,CAAC,SAAS,EAAE,SAAS,KAAK,CAAC,IAAI,UAAU,MAAM,CAAC,KAAK,WAAW,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;IACrF,CAAC;IAED,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;IAExC,MAAM,OAAO,GAAmB;QAC9B,IAAI,EAAE,QAAQ,CAAC,IAAI;QACnB,MAAM;QACN,QAAQ;QACR,MAAM,EAAE,OAAO;QACf,OAAO;QACP,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE;KACtB,CAAC;IAEF,MAAM,UAAU,GAAG,MAAM,SAAS,CAAC,KAAK,EAAE,eAAe,CAAC,CAAC;IAC3D,MAAM,OAAO,GAAG,CAAC,MAAM,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,EAAE,OAAO,CAAC,CAAW,CAAC;IACrE,aAAa,CAAC,QAAQ,EAAE,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IAChD,GAAG,CAAC,UAAU,EAAE,wBAAwB,QAAQ,CAAC,IAAI,SAAS,OAAO,YAAY,OAAO,EAAE,CAAC,CAAC;IAE5F,MAAM,MAAM,GAAmB,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACjD,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,KAAK,EAAE,CAAC,CAAC,KAAK;QACd,MAAM,EAAE,CAAC,CAAC,MAAM;KACjB,CAAC,CAAC,CAAC;IACJ,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;AACtC,CAAC"}
@@ -0,0 +1,9 @@
1
+ import type { ExecuteInput, ExecuteResult } from "./types.js";
2
+ /**
3
+ * Execute a workflow: create a thread, then run it for up to `maxSteps` steps.
4
+ * Shells out to the uwf CLI rather than importing it directly. Resolves with the id of the thread that was started; rejects if either uwf invocation fails.
5
+ */
6
+ export declare function execute(input: ExecuteInput): Promise<ExecuteResult>;
7
+ /** Best-effort lookup of the uwf engine version: runs `uwf -V` (or the `UWF_BIN` override) synchronously and returns its trimmed stdout; returns "unknown" if the command cannot be run or fails. */
8
+ export declare function getEngineVersion(): string;
9
+ //# sourceMappingURL=execute.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../../src/runner/execute.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAmD9D;;;GAGG;AACH,wBAAsB,OAAO,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,aAAa,CAAC,CAezE;AAED,qFAAqF;AACrF,wBAAgB,gBAAgB,IAAI,MAAM,CASzC"}
@@ -0,0 +1,72 @@
1
+ import { execFileSync } from "node:child_process";
2
+ import { createLogger } from "@united-workforce/util";
3
+ const log = createLogger({ sink: { kind: "stderr" } }); // module logger, created with a stderr sink
4
+ const LOG_START = "EX5M2T9V"; // tag passed to log() once `uwf thread start` has returned a thread id
5
+ const LOG_EXEC = "EX7Q4K2N"; // tag passed to log() once `uwf thread exec` has returned
6
/**
 * Name or path of the uwf executable to spawn.
 *
 * Returns the `UWF_BIN` environment variable when it is set to a non-empty
 * value (used to point tests at a stand-in binary); otherwise `"uwf"`.
 */
function uwfBin() {
    const { UWF_BIN: fromEnv } = process.env;
    if (fromEnv === undefined || fromEnv === "") {
        return "uwf";
    }
    return fromEnv;
}
11
+ /** Run a uwf subcommand and return trimmed stdout. */
12
+ function runUwf(args, cwd) {
13
+ try {
14
+ return execFileSync(uwfBin(), args, {
15
+ encoding: "utf8",
16
+ stdio: ["ignore", "pipe", "pipe"],
17
+ maxBuffer: 50 * 1024 * 1024,
18
+ cwd,
19
+ }).trim();
20
+ }
21
+ catch (e) {
22
+ const err = e;
23
+ const stderr = err.stderr == null
24
+ ? ""
25
+ : typeof err.stderr === "string"
26
+ ? err.stderr
27
+ : err.stderr.toString("utf8");
28
+ const detail = stderr.trim() !== "" ? `: ${stderr.trim()}` : "";
29
+ throw new Error(`uwf ${args[0]} ${args[1]} failed${detail}`);
30
+ }
31
+ }
32
/**
 * Extract the thread id from the JSON printed by `uwf thread start`
 * (`{ workflow, thread }`).
 *
 * @param stdout Raw (trimmed) stdout of `uwf thread start`.
 * @returns The non-empty `thread` string.
 * @throws {Error} If `stdout` is not valid JSON, or if the parsed value has no
 *   non-empty string `thread` property. This includes valid JSON that is not
 *   an object, such as `null` or a number.
 */
function parseThreadId(stdout) {
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch {
        throw new Error(`uwf thread start did not emit valid JSON: ${stdout || "(empty)"}`);
    }
    // JSON.parse may legally return null or a primitive; reading `.thread` off
    // null would throw a bare TypeError instead of the descriptive error below.
    const thread = parsed !== null && typeof parsed === "object" ? parsed.thread : undefined;
    if (typeof thread !== "string" || thread === "") {
        throw new Error(`uwf thread start output missing thread id: ${stdout}`);
    }
    return thread;
}
48
/**
 * Run a workflow end to end through the uwf CLI.
 *
 * Starts a thread with `uwf thread start` (the thread id is parsed from its
 * output), then drives it with `uwf thread exec` for at most `maxSteps`
 * steps. Both commands run with `input.workDir` as their working directory.
 *
 * @param input Workflow name, prompt, agent, step limit and working directory.
 * @returns An object holding the id of the thread that was started.
 */
export async function execute(input) {
    const { workflow, prompt, workDir, agent, maxSteps } = input;

    const startArgs = ["thread", "start", workflow, "-p", prompt, "--cwd", workDir];
    const threadId = parseThreadId(runUwf(startArgs, workDir));
    log(LOG_START, `thread started thread=${threadId} workflow=${workflow}`);

    const execArgs = ["thread", "exec", threadId, "--agent", agent, "-c", String(maxSteps)];
    runUwf(execArgs, workDir);
    log(LOG_EXEC, `thread executed thread=${threadId} maxSteps=${maxSteps}`);

    return { threadId };
}
60
+ /** Best-effort lookup of the uwf engine version (`uwf -V`); "unknown" on failure. */
61
+ export function getEngineVersion() {
62
+ try {
63
+ return execFileSync(uwfBin(), ["-V"], {
64
+ encoding: "utf8",
65
+ stdio: ["ignore", "pipe", "ignore"],
66
+ }).trim();
67
+ }
68
+ catch {
69
+ return "unknown";
70
+ }
71
+ }
72
+ //# sourceMappingURL=execute.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"execute.js","sourceRoot":"","sources":["../../src/runner/execute.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAElD,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAItD,MAAM,GAAG,GAAG,YAAY,CAAC,EAAE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC;AAEvD,MAAM,SAAS,GAAG,UAAU,CAAC;AAC7B,MAAM,QAAQ,GAAG,UAAU,CAAC;AAE5B,uEAAuE;AACvE,SAAS,MAAM;IACb,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC;IACrC,OAAO,QAAQ,KAAK,SAAS,IAAI,QAAQ,KAAK,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC;AACtE,CAAC;AAED,sDAAsD;AACtD,SAAS,MAAM,CAAC,IAAc,EAAE,GAAW;IACzC,IAAI,CAAC;QACH,OAAO,YAAY,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE;YAClC,QAAQ,EAAE,MAAM;YAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;YACjC,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI;YAC3B,GAAG;SACJ,CAAC,CAAC,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,GAAG,GAAG,CAAgE,CAAC;QAC7E,MAAM,MAAM,GACV,GAAG,CAAC,MAAM,IAAI,IAAI;YAChB,CAAC,CAAC,EAAE;YACJ,CAAC,CAAC,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ;gBAC9B,CAAC,CAAC,GAAG,CAAC,MAAM;gBACZ,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACpC,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAChE,MAAM,IAAI,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC,UAAU,MAAM,EAAE,CAAC,CAAC;IAC/D,CAAC;AACH,CAAC;AAED,wFAAwF;AACxF,SAAS,aAAa,CAAC,MAAc;IACnC,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC9B,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,6CAA6C,MAAM,IAAI,SAAS,EAAE,CAAC,CAAC;IACtF,CAAC;IACD,MAAM,GAAG,GAAG,MAAiC,CAAC;IAC9C,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC;IAC1B,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,EAAE,EAAE,CAAC;QAChD,MAAM,IAAI,KAAK,CAAC,8CAA8C,MAAM,EAAE,CAAC,CAAC;IAC1E,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,KAAmB;IAC/C,MAAM,QAAQ,GAAG,MAAM,CACrB,CAAC,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAC,QAAQ,EAAE,IAAI,EAAE,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC,EAC/E,KAAK,CAAC,OAAO,CACd,CAAC;IACF,MAAM,QAAQ,GAAG,
aAAa,CAAC,QAAQ,CAAC,CAAC;IACzC,GAAG,CAAC,SAAS,EAAE,yBAAyB,QAAQ,aAAa,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;IAE/E,MAAM,CACJ,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,KAAK,CAAC,KAAK,EAAE,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,EAClF,KAAK,CAAC,OAAO,CACd,CAAC;IACF,GAAG,CAAC,QAAQ,EAAE,0BAA0B,QAAQ,aAAa,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;IAE/E,OAAO,EAAE,QAAQ,EAAE,CAAC;AACtB,CAAC;AAED,qFAAqF;AACrF,MAAM,UAAU,gBAAgB;IAC9B,IAAI,CAAC;QACH,OAAO,YAAY,CAAC,MAAM,EAAE,EAAE,CAAC,IAAI,CAAC,EAAE;YACpC,QAAQ,EAAE,MAAM;YAChB,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC;SACpC,CAAC,CAAC,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC"}
@@ -0,0 +1,5 @@
1
+ export { collect, computeOverall } from "./collect.js";
2
+ export { execute, getEngineVersion } from "./execute.js";
3
+ export { prepare } from "./prepare.js";
4
+ export type { CollectInput, CollectResult, ExecuteInput, ExecuteResult, JudgeRunner, JudgeRunOutput, JudgeSummary, PrepareResult, RunOptions, RunResult, } from "./types.js";
5
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/runner/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AACvD,OAAO,EAAE,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AACzD,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,YAAY,EACV,YAAY,EACZ,aAAa,EACb,YAAY,EACZ,aAAa,EACb,WAAW,EACX,cAAc,EACd,YAAY,EACZ,aAAa,EACb,UAAU,EACV,SAAS,GACV,MAAM,YAAY,CAAC"}
@@ -0,0 +1,4 @@
1
+ export { collect, computeOverall } from "./collect.js";
2
+ export { execute, getEngineVersion } from "./execute.js";
3
+ export { prepare } from "./prepare.js";
4
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/runner/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AACvD,OAAO,EAAE,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AACzD,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { PrepareResult } from "./types.js";
2
+ /**
3
+ * Prepare a task for execution: read its manifest and copy the fixture
4
+ * directory into a fresh temp working directory. NOTE(review): removal of that directory is not visible from this declaration — confirm who cleans it up.
5
+ */
6
+ export declare function prepare(taskDir: string): Promise<PrepareResult>;
7
+ //# sourceMappingURL=prepare.d.ts.map