@workbench-ai/workbench-core 0.0.66 → 0.0.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dist/execution-graph.d.ts +4 -3
  2. package/dist/execution-graph.d.ts.map +1 -1
  3. package/dist/execution-graph.js +15 -14
  4. package/dist/execution-jobs.d.ts +5 -20
  5. package/dist/execution-jobs.d.ts.map +1 -1
  6. package/dist/execution-jobs.js +7 -91
  7. package/dist/execution-outputs.d.ts +2 -2
  8. package/dist/execution-outputs.d.ts.map +1 -1
  9. package/dist/execution-outputs.js +10 -10
  10. package/dist/execution-runtime-types.d.ts +1 -1
  11. package/dist/execution-runtime-types.d.ts.map +1 -1
  12. package/dist/execution-scheduler.d.ts.map +1 -1
  13. package/dist/execution-scheduler.js +4 -1
  14. package/dist/execution-traces.js +1 -1
  15. package/dist/generic-spec.d.ts +29 -29
  16. package/dist/generic-spec.d.ts.map +1 -1
  17. package/dist/generic-spec.js +94 -92
  18. package/dist/index.d.ts +325 -220
  19. package/dist/index.d.ts.map +1 -1
  20. package/dist/index.js +5784 -3856
  21. package/dist/runtime-dockerfile.d.ts +1 -1
  22. package/dist/runtime-dockerfile.d.ts.map +1 -1
  23. package/dist/runtime-dockerfile.js +4 -4
  24. package/dist/runtime-utils.d.ts +1 -1
  25. package/dist/runtime-utils.d.ts.map +1 -1
  26. package/dist/runtime-utils.js +3 -3
  27. package/dist/sandbox-backends/docker.js +7 -5
  28. package/dist/sandbox-inputs.js +3 -3
  29. package/dist/sandbox-plane.d.ts.map +1 -1
  30. package/dist/sandbox-plane.js +13 -9
  31. package/dist/skill-patch.d.ts +8 -0
  32. package/dist/skill-patch.d.ts.map +1 -0
  33. package/dist/{candidate-patch.js → skill-patch.js} +5 -5
  34. package/package.json +3 -3
  35. package/worker/sandbox-adapter-runner.cjs +2 -2
  36. package/dist/candidate-patch.d.ts +0 -8
  37. package/dist/candidate-patch.d.ts.map +0 -1
  38. package/dist/execution-evidence.d.ts +0 -22
  39. package/dist/execution-evidence.d.ts.map +0 -1
  40. package/dist/execution-evidence.js +0 -302
  41. package/dist/inspection.d.ts +0 -111
  42. package/dist/inspection.d.ts.map +0 -1
  43. package/dist/inspection.js +0 -217
@@ -1,8 +1,8 @@
1
1
  import { createHash } from "node:crypto";
2
2
  import { isWorkbenchExecutionNetworkEgress, } from "@workbench-ai/workbench-contract";
3
3
  import YAML from "yaml";
4
- export const BENCHMARK_SPEC_FILE = "benchmark.yaml";
5
- export const CANDIDATE_SPEC_FILE = "candidate.yaml";
4
+ export const EVAL_SPEC_FILE = "eval.yaml";
5
+ export const SKILL_SPEC_FILE = "skill.yaml";
6
6
  export const DEFAULT_EXECUTION_RESOURCES = {
7
7
  cpu: 2,
8
8
  memoryGb: 4,
@@ -14,7 +14,7 @@ export function validateWorkbenchResolvedSourceYaml(source) {
14
14
  const warnings = [];
15
15
  const trimmed = source.trim();
16
16
  if (!trimmed) {
17
- errors.push("Resolved Workbench source cannot be empty.");
17
+ errors.push("Resolved Workbench spec cannot be empty.");
18
18
  }
19
19
  if (trimmed) {
20
20
  try {
@@ -31,25 +31,25 @@ export function validateWorkbenchResolvedSourceYaml(source) {
31
31
  };
32
32
  }
33
33
  export function resolveWorkbenchResolvedSourceYaml(source) {
34
- const parsed = parseYamlRecord(source, "resolved Workbench source");
34
+ const parsed = parseYamlRecord(source, "resolved Workbench spec");
35
35
  const errors = [];
36
- rejectUnknownKeys(parsed, "resolved Workbench source", [
36
+ rejectUnknownKeys(parsed, "resolved Workbench spec", [
37
37
  "version",
38
- "benchmark",
39
- "candidate",
38
+ "eval",
39
+ "skill",
40
40
  ], errors);
41
41
  if (parsed.version !== 4) {
42
- throw new Error("Resolved Workbench source version must be 4.");
42
+ throw new Error("Resolved Workbench spec version must be 4.");
43
43
  }
44
- const benchmark = normalizeBenchmarkRecord(readRequiredRecord(parsed.benchmark, "resolved Workbench source.benchmark", errors), "benchmark.yaml", errors);
45
- const candidate = normalizeCandidateRecord(readRequiredRecord(parsed.candidate, "resolved Workbench source.candidate", errors), "resolved Workbench source.candidate", errors);
44
+ const evalSpec = normalizeEvalRecord(readRequiredRecord(parsed.eval, EVAL_SPEC_FILE, errors), EVAL_SPEC_FILE, "resolved", errors);
45
+ const skill = normalizeSkillRecord(readRequiredRecord(parsed.skill, "resolved Workbench spec.skill", errors), "resolved Workbench spec.skill", "resolved", errors);
46
46
  if (errors.length > 0) {
47
47
  throw new Error(errors.join("\n"));
48
48
  }
49
49
  return genericSpecFromAuthoredBundle({
50
50
  version: 4,
51
- benchmark: benchmark,
52
- candidate: candidate,
51
+ eval: evalSpec,
52
+ skill: skill,
53
53
  });
54
54
  }
55
55
  export function engineResolveBindingForSourceYaml(source) {
@@ -58,7 +58,7 @@ export function engineResolveBindingForSourceYaml(source) {
58
58
  export function engineResolveBindingForSpec(spec) {
59
59
  const resolver = engineResolveInvocationForSpec(spec);
60
60
  return {
61
- engine: spec.benchmark.engine.use,
61
+ engine: spec.eval.engine.use,
62
62
  resolver: {
63
63
  use: resolver.use,
64
64
  withFingerprint: fingerprintJson(resolver.with ?? {}),
@@ -67,29 +67,29 @@ export function engineResolveBindingForSpec(spec) {
67
67
  }
68
68
  export function resolveWorkbenchSourceFiles(args) {
69
69
  return genericSpecFromAuthoredBundle(parseWorkbenchSourceFiles({
70
- benchmarkSource: args.benchmarkSource,
71
- candidateSource: args.candidateSource,
72
- runId: args.runId,
70
+ evalSource: args.evalSource,
71
+ skillSource: args.skillSource,
72
+ selectedAgentId: args.selectedAgentId,
73
73
  }));
74
74
  }
75
75
  export function parseWorkbenchSourceFiles(args) {
76
76
  const errors = [];
77
- const benchmark = normalizeBenchmarkRecord(parseYamlRecord(args.benchmarkSource, BENCHMARK_SPEC_FILE), BENCHMARK_SPEC_FILE, errors);
78
- const candidate = normalizeCandidateRecord(parseYamlRecord(args.candidateSource ?? "", "candidate YAML"), "candidate YAML", errors, args.runId ?? undefined);
77
+ const evalSpec = normalizeEvalRecord(parseYamlRecord(args.evalSource, EVAL_SPEC_FILE), EVAL_SPEC_FILE, "authored", errors);
78
+ const skill = normalizeSkillRecord(parseYamlRecord(args.skillSource ?? "", "skill YAML"), "skill YAML", "authored", errors, args.selectedAgentId ?? undefined);
79
79
  if (errors.length > 0) {
80
80
  throw new Error(errors.join("\n"));
81
81
  }
82
82
  return {
83
83
  version: 4,
84
- benchmark: benchmark,
85
- candidate: candidate,
84
+ eval: evalSpec,
85
+ skill: skill,
86
86
  };
87
87
  }
88
88
  export function serializeWorkbenchResolvedSourceYaml(source) {
89
89
  return YAML.stringify(source).trimEnd() + "\n";
90
90
  }
91
- export function isWorkbenchCandidateManifestPath(filePath) {
92
- return /^candidates\/[^/]+\/candidate\.ya?ml$/iu.test(filePath.replace(/\\/gu, "/").replace(/^\/+/u, "").replace(/^(?:\.\/)+/u, ""));
91
+ export function isWorkbenchSkillManifestPath(filePath) {
92
+ return /^skills\/[^/]+\/skill\.ya?ml$/iu.test(filePath.replace(/\\/gu, "/").replace(/^\/+/u, "").replace(/^(?:\.\/)+/u, ""));
93
93
  }
94
94
  export function resolveEngineCaseExecutionConfig(args) {
95
95
  return {
@@ -131,38 +131,38 @@ export function runtimeSandboxRef(runtime) {
131
131
  return `dockerfile://${runtime.dockerfile}`;
132
132
  }
133
133
  function genericSpecFromAuthoredBundle(source) {
134
- const engineRuntime = engineRuntimeFromConfig(source.benchmark.engine);
135
- const engineRun = cloneEngineInvocation(source.benchmark.engine);
136
- const engineResolve = cloneEngineInvocation(source.benchmark.engine);
137
- const candidate = source.candidate;
138
- const selectedRun = candidate.runs[candidate.selectedRunId];
139
- if (!selectedRun) {
140
- throw new Error(`Candidate run not found: ${candidate.selectedRunId}`);
134
+ const engineRuntime = engineRuntimeFromConfig(source.eval.engine);
135
+ const engineRun = cloneEngineInvocation(source.eval.engine);
136
+ const engineResolve = cloneEngineInvocation(source.eval.engine);
137
+ const skill = source.skill;
138
+ const selectedAgent = skill.agents[skill.selectedAgentId];
139
+ if (!selectedAgent) {
140
+ throw new Error(`Skill agent not found: ${skill.selectedAgentId}`);
141
141
  }
142
142
  return {
143
143
  version: 4,
144
- name: source.benchmark.name,
145
- description: source.benchmark.description,
146
- benchmark: {
147
- name: source.benchmark.name,
148
- description: source.benchmark.description,
149
- engine: cloneJson(source.benchmark.engine),
144
+ name: source.eval.name,
145
+ description: source.eval.description,
146
+ eval: {
147
+ name: source.eval.name,
148
+ description: source.eval.description,
149
+ engine: cloneJson(source.eval.engine),
150
150
  },
151
- candidate: {
152
- name: candidate.name,
153
- ...(candidate.description ? { description: candidate.description } : {}),
154
- files: cloneJson(candidate.files),
155
- ...(candidate.prepare ? { prepare: cloneJson(candidate.prepare) } : {}),
156
- defaultRun: candidate.defaultRun ?? candidate.selectedRunId,
157
- selectedRunId: candidate.selectedRunId,
158
- selectedRunName: selectedRun.name,
159
- runs: cloneJson(candidate.runs),
160
- ...(candidate.improve
151
+ skill: {
152
+ name: skill.name,
153
+ ...(skill.description ? { description: skill.description } : {}),
154
+ files: cloneJson(skill.files),
155
+ ...(skill.prepare ? { prepare: cloneJson(skill.prepare) } : {}),
156
+ defaultAgent: skill.defaultAgent ?? skill.selectedAgentId,
157
+ selectedAgentId: skill.selectedAgentId,
158
+ selectedAgentName: selectedAgent.name,
159
+ agents: cloneJson(skill.agents),
160
+ ...(skill.improve
161
161
  ? {
162
162
  improve: {
163
- edits: [...candidate.improve.edits],
164
- ...(candidate.improve.optimizeOn ? { optimizeOn: cloneJson(candidate.improve.optimizeOn) } : {}),
165
- ...(candidate.improve.selectBy ? { selectBy: cloneJson(candidate.improve.selectBy) } : {}),
163
+ edits: [...skill.improve.edits],
164
+ ...(skill.improve.optimizeOn ? { optimizeOn: cloneJson(skill.improve.optimizeOn) } : {}),
165
+ ...(skill.improve.selectBy ? { selectBy: cloneJson(skill.improve.selectBy) } : {}),
166
166
  },
167
167
  }
168
168
  : {}),
@@ -170,18 +170,18 @@ function genericSpecFromAuthoredBundle(source) {
170
170
  environment: cloneJson(engineRuntime),
171
171
  adapters: [
172
172
  ...new Set([
173
- ...source.benchmark.adapters,
174
- ...candidate.adapters,
173
+ ...source.eval.adapters,
174
+ ...skill.adapters,
175
175
  ]),
176
176
  ],
177
- engine: cloneJson(source.benchmark.engine),
177
+ engine: cloneJson(source.eval.engine),
178
178
  engineResolve: cloneJson(engineResolve),
179
- ...(candidate.improve ? { improve: clonePhaseAdapter(candidate.improve) } : {}),
180
- run: clonePhaseAdapter(selectedRun),
179
+ ...(skill.improve ? { improve: clonePhaseAdapter(skill.improve) } : {}),
180
+ run: clonePhaseAdapter(selectedAgent),
181
181
  engineRun: cloneJson(engineRun),
182
182
  };
183
183
  }
184
- function normalizeBenchmarkRecord(record, label, errors) {
184
+ function normalizeEvalRecord(record, label, mode, errors) {
185
185
  if (!record) {
186
186
  return null;
187
187
  }
@@ -192,7 +192,7 @@ function normalizeBenchmarkRecord(record, label, errors) {
192
192
  "adapters",
193
193
  "engine",
194
194
  ], errors);
195
- requireVersionFour(record.version, label, errors);
195
+ requireSpecVersion(record.version, label, mode === "authored" ? 1 : 4, errors);
196
196
  const name = readRequiredString(record.name, `${label}.name`, errors);
197
197
  const description = readRequiredString(record.description, `${label}.description`, errors);
198
198
  const adapters = normalizeAdapterSources(record.adapters, `${label}.adapters`, errors);
@@ -222,7 +222,7 @@ function normalizeEngineRuntimeConfig(engine, label, errors) {
222
222
  }
223
223
  }
224
224
  }
225
- function normalizeCandidateRecord(record, label, errors, selectedRunId) {
225
+ function normalizeSkillRecord(record, label, mode, errors, selectedAgentId) {
226
226
  if (!record) {
227
227
  return null;
228
228
  }
@@ -233,26 +233,28 @@ function normalizeCandidateRecord(record, label, errors, selectedRunId) {
233
233
  "files",
234
234
  "prepare",
235
235
  "adapters",
236
- "defaultRun",
237
- "runs",
236
+ "defaultAgent",
237
+ "agents",
238
+ ...(mode === "resolved" ? ["selectedAgentId"] : []),
238
239
  "improve",
239
- "selectedRunId",
240
240
  ], errors);
241
- requireVersionFour(record.version, label, errors);
241
+ requireSpecVersion(record.version, label, mode === "authored" ? 1 : 4, errors);
242
242
  const name = readRequiredString(record.name, `${label}.name`, errors);
243
243
  const description = readOptionalString(record.description, `${label}.description`, errors);
244
244
  const files = normalizePathRef(record.files, `${label}.files`, errors);
245
- const prepare = normalizeCandidatePrepare(record.prepare, `${label}.prepare`, errors);
245
+ const prepare = normalizeSkillPrepare(record.prepare, `${label}.prepare`, errors);
246
246
  const adapters = normalizeAdapterSources(record.adapters, `${label}.adapters`, errors);
247
- const runs = normalizeCandidateRuns(record.runs, `${label}.runs`, errors);
248
- const defaultRun = readOptionalString(record.defaultRun, `${label}.defaultRun`, errors);
249
- const embeddedSelectedRun = readOptionalString(record.selectedRunId, `${label}.selectedRunId`, errors);
250
- const selected = selectedRunId ?? embeddedSelectedRun ?? defaultRun ?? Object.keys(runs).sort()[0];
251
- if (selected && !runs[selected]) {
252
- errors.push(`${label}.selectedRunId references unknown run ${selected}.`);
253
- }
254
- const improve = normalizeCandidateImprove(record.improve, `${label}.improve`, errors);
255
- return name && files && selected && Object.keys(runs).length > 0
247
+ const agents = normalizeSkillAgents(record.agents, `${label}.agents`, errors);
248
+ const defaultAgent = readOptionalString(record.defaultAgent, `${label}.defaultAgent`, errors);
249
+ const embeddedSelectedAgent = mode === "resolved"
250
+ ? readOptionalString(record.selectedAgentId, `${label}.selectedAgentId`, errors)
251
+ : undefined;
252
+ const selected = selectedAgentId ?? embeddedSelectedAgent ?? defaultAgent ?? Object.keys(agents).sort()[0];
253
+ if (selected && !agents[selected]) {
254
+ errors.push(`${label}.${mode === "authored" ? "defaultAgent" : "selectedAgentId"} references unknown agent ${selected}.`);
255
+ }
256
+ const improve = normalizeSkillImprove(record.improve, `${label}.improve`, errors);
257
+ return name && files && selected && Object.keys(agents).length > 0
256
258
  ? {
257
259
  version: 4,
258
260
  name,
@@ -260,14 +262,14 @@ function normalizeCandidateRecord(record, label, errors, selectedRunId) {
260
262
  files,
261
263
  ...(prepare ? { prepare } : {}),
262
264
  adapters,
263
- ...(defaultRun ? { defaultRun } : {}),
264
- runs,
265
+ ...(defaultAgent ? { defaultAgent } : {}),
266
+ agents,
265
267
  ...(improve ? { improve } : {}),
266
- selectedRunId: selected,
268
+ selectedAgentId: selected,
267
269
  }
268
270
  : null;
269
271
  }
270
- function normalizeCandidatePrepare(value, label, errors) {
272
+ function normalizeSkillPrepare(value, label, errors) {
271
273
  if (value === undefined) {
272
274
  return undefined;
273
275
  }
@@ -279,37 +281,37 @@ function normalizeCandidatePrepare(value, label, errors) {
279
281
  const command = readRequiredString(record.command, `${label}.command`, errors);
280
282
  return command ? { command } : undefined;
281
283
  }
282
- function normalizeCandidateRuns(value, label, errors) {
284
+ function normalizeSkillAgents(value, label, errors) {
283
285
  const record = readRequiredRecord(value, label, errors);
284
286
  if (!record) {
285
287
  return {};
286
288
  }
287
- const runs = {};
288
- for (const [runId, runValue] of Object.entries(record).sort(([left], [right]) => left.localeCompare(right))) {
289
- if (!/^[a-zA-Z0-9][a-zA-Z0-9._-]*$/u.test(runId)) {
290
- errors.push(`${label}.${runId} must use letters, numbers, dots, underscores, or dashes.`);
289
+ const agents = {};
290
+ for (const [agentId, agentValue] of Object.entries(record).sort(([left], [right]) => left.localeCompare(right))) {
291
+ if (!/^[a-zA-Z0-9][a-zA-Z0-9._-]*$/u.test(agentId)) {
292
+ errors.push(`${label}.${agentId} must use letters, numbers, dots, underscores, or dashes.`);
291
293
  continue;
292
294
  }
293
- const runRecord = readRequiredRecord(runValue, `${label}.${runId}`, errors);
294
- if (!runRecord) {
295
+ const agentRecord = readRequiredRecord(agentValue, `${label}.${agentId}`, errors);
296
+ if (!agentRecord) {
295
297
  continue;
296
298
  }
297
- rejectUnknownKeys(runRecord, `${label}.${runId}`, ["name", "use", "with", "auth"], errors);
298
- const name = readRequiredString(runRecord.name, `${label}.${runId}.name`, errors);
299
- const invocation = normalizePhaseAdapter(adapterRecordFrom(runRecord), `${label}.${runId}`, errors);
299
+ rejectUnknownKeys(agentRecord, `${label}.${agentId}`, ["name", "use", "with", "auth"], errors);
300
+ const name = readRequiredString(agentRecord.name, `${label}.${agentId}.name`, errors);
301
+ const invocation = normalizePhaseAdapter(adapterRecordFrom(agentRecord), `${label}.${agentId}`, errors);
300
302
  if (name && invocation) {
301
- runs[runId] = {
303
+ agents[agentId] = {
302
304
  name,
303
305
  ...invocation,
304
306
  };
305
307
  }
306
308
  }
307
- if (Object.keys(runs).length === 0) {
308
- errors.push(`${label} must declare at least one run.`);
309
+ if (Object.keys(agents).length === 0) {
310
+ errors.push(`${label} must declare at least one agent.`);
309
311
  }
310
- return runs;
312
+ return agents;
311
313
  }
312
- function normalizeCandidateImprove(value, label, errors) {
314
+ function normalizeSkillImprove(value, label, errors) {
313
315
  if (value === undefined) {
314
316
  return undefined;
315
317
  }
@@ -385,9 +387,9 @@ function adapterRecordFrom(record) {
385
387
  ...(record.auth !== undefined ? { auth: record.auth } : {}),
386
388
  };
387
389
  }
388
- function requireVersionFour(value, label, errors) {
389
- if (value !== 4) {
390
- errors.push(`${label}.version must be 4.`);
390
+ function requireSpecVersion(value, label, version, errors) {
391
+ if (value !== version) {
392
+ errors.push(`${label}.version must be ${version}.`);
391
393
  }
392
394
  }
393
395
  function normalizeRuntime(value, label, errors) {