@rtrentjones/greenlight 0.5.1 → 0.6.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  verifyAgentWeb
3
- } from "./chunk-KVOI4UL2.js";
3
+ } from "./chunk-IYEIZYI5.js";
4
4
  import "./chunk-QFKE5JKC.js";
5
5
  export {
6
6
  verifyAgentWeb
package/dist/bin.js CHANGED
@@ -7,13 +7,14 @@ import {
7
7
  loadConfig,
8
8
  resolveUrl,
9
9
  scanSqlFiles,
10
+ toExportResult,
10
11
  verifyAll
11
- } from "./chunk-OBWWE7GE.js";
12
+ } from "./chunk-FZH2YQPJ.js";
12
13
  import "./chunk-HX7VA25D.js";
13
14
  import "./chunk-N3IKUCSF.js";
14
15
  import "./chunk-KP3Y6WRU.js";
15
- import "./chunk-KVOI4UL2.js";
16
- import "./chunk-XWTOJHLV.js";
16
+ import "./chunk-IYEIZYI5.js";
17
+ import "./chunk-3A6F2JNP.js";
17
18
  import "./chunk-QFKE5JKC.js";
18
19
 
19
20
  // src/commands/add.ts
@@ -601,7 +602,7 @@ function tokensForTool(tool) {
601
602
  }
602
603
 
603
604
  // src/version.ts
604
- var MODULE_REF = "v0.5.1";
605
+ var MODULE_REF = "v0.6.0";
605
606
  var MODULE_SOURCE_BASE = "git::https://github.com/RTrentJones/greenlight.git//infra/modules";
606
607
  function moduleSource(module, ref = MODULE_REF) {
607
608
  return `${MODULE_SOURCE_BASE}/${module}?ref=${ref}`;
@@ -2894,7 +2895,7 @@ import { setTimeout as sleep } from "timers/promises";
2894
2895
 
2895
2896
  // src/commands/verify.ts
2896
2897
  import { spawnSync } from "child_process";
2897
- import { resolve as resolve9 } from "path";
2898
+ import { basename, resolve as resolve9 } from "path";
2898
2899
  function defaultSpec(lane) {
2899
2900
  switch (lane) {
2900
2901
  case "astro":
@@ -2918,21 +2919,34 @@ function defaultSpec(lane) {
2918
2919
  };
2919
2920
  }
2920
2921
  }
2921
- function printReport(report) {
2922
- console.log(`verify ${report.mode} ${report.url}
2922
+ function printReport(report, log = console.log) {
2923
+ log(`verify ${report.mode} ${report.url}
2923
2924
  `);
2924
2925
  for (const c of report.checks) {
2925
- console.log(` ${c.pass ? "\u2714" : "\u2718"} ${c.name}${c.detail ? ` \u2014 ${c.detail}` : ""}`);
2926
+ log(` ${c.pass ? "\u2714" : "\u2718"} ${c.name}${c.detail ? ` \u2014 ${c.detail}` : ""}`);
2926
2927
  }
2927
- console.log(`
2928
+ log(`
2928
2929
  ${report.pass ? "\u2714 PASS" : "\u2718 FAIL"}`);
2929
2930
  if (!report.pass && report.logs) {
2930
- console.log(`
2931
+ log(`
2931
2932
  --- recent logs (${report.mode}) ---
2932
2933
  ${report.logs}
2933
2934
  --- end logs ---`);
2934
2935
  }
2935
2936
  }
2937
+ function emitReports(reports, json, ctx) {
2938
+ const log = json ? console.error : console.log;
2939
+ for (const report of reports) printReport(report, log);
2940
+ const pass = allPass(reports);
2941
+ if (reports.length > 1) log(`
2942
+ ${pass ? "\u2714 ALL PASS" : "\u2718 FAIL"} (${reports.length} specs)`);
2943
+ if (json) process.stdout.write(`${JSON.stringify(toExportResult(reports, ctx))}
2944
+ `);
2945
+ process.exit(pass ? 0 : 1);
2946
+ }
2947
+ function gitSha() {
2948
+ return process.env.VERCEL_GIT_COMMIT_SHA ?? process.env.GITHUB_SHA ?? null;
2949
+ }
2936
2950
  var LOG_TAIL_LINES = 50;
2937
2951
  function redactSecrets(text, env = process.env) {
2938
2952
  let out = text;
@@ -2973,6 +2987,9 @@ function flag6(args, name) {
2973
2987
  const i = args.indexOf(name);
2974
2988
  return i >= 0 ? args[i + 1] : void 0;
2975
2989
  }
2990
+ function jsonFlag(args) {
2991
+ return args.includes("--json") || process.env.GREENLIGHT_VERIFY_JSON === "1";
2992
+ }
2976
2993
  async function verifyCommand(args) {
2977
2994
  const specPath = flag6(args, "--spec");
2978
2995
  if (specPath) {
@@ -2987,12 +3004,12 @@ async function verifyCommand(args) {
2987
3004
  toolDir: process.cwd()
2988
3005
  });
2989
3006
  attachFailureLogs(reports2, specs2, process.cwd());
2990
- for (const report of reports2) printReport(report);
2991
- const pass2 = allPass(reports2);
2992
- if (reports2.length > 1)
2993
- console.log(`
2994
- ${pass2 ? "\u2714 ALL PASS" : "\u2718 FAIL"} (${reports2.length} specs)`);
2995
- process.exit(pass2 ? 0 : 1);
3007
+ const tool = flag6(args, "--tool") ?? basename(specPath).replace(/\.config\.[tj]s$/, "");
3008
+ emitReports(reports2, jsonFlag(args), {
3009
+ tool,
3010
+ env: flag6(args, "--env") ?? "preview",
3011
+ gitSha: gitSha()
3012
+ });
2996
3013
  }
2997
3014
  const name = args[0];
2998
3015
  if (!name || name.startsWith("-")) {
@@ -3025,12 +3042,11 @@ ${pass2 ? "\u2714 ALL PASS" : "\u2718 FAIL"} (${reports2.length} specs)`);
3025
3042
  const toolDir = resolve9(process.cwd(), entry.dir ?? ".");
3026
3043
  const reports = await verifyAll(url, specs, { reachableTimeoutMs, toolDir });
3027
3044
  attachFailureLogs(reports, specs, toolDir);
3028
- for (const report of reports) printReport(report);
3029
- const pass = allPass(reports);
3030
- if (reports.length > 1)
3031
- console.log(`
3032
- ${pass ? "\u2714 ALL PASS" : "\u2718 FAIL"} (${reports.length} specs)`);
3033
- process.exit(pass ? 0 : 1);
3045
+ emitReports(reports, jsonFlag(args), {
3046
+ tool: entry.name ?? name,
3047
+ env: override ? "preview" : flag6(args, "--env") ?? "preview",
3048
+ gitSha: gitSha()
3049
+ });
3034
3050
  }
3035
3051
 
3036
3052
  // src/commands/preview.ts
@@ -3387,7 +3403,7 @@ var HELP = `greenlight <command>
3387
3403
  config load & validate the manifest, then print it
3388
3404
  deploy <name> --env <env> build + deploy an entry via its target adapter
3389
3405
  preview <name> [--port <n>] build + serve locally + verify (one command)
3390
- verify <name> [--env <env> | --url <url>] run the verify harness against the URL
3406
+ verify <name> [--env <env> | --url <url>] [--json] run the verify harness (--json: standards-shaped result to stdout)
3391
3407
  promote <name> [--perform] [--push] gated develop -> main fast-forward
3392
3408
  status <name> last ship/deploy/verify run for a tool (via gh)
3393
3409
  secrets gather <name> [--repo o/r] [--env e] guided, link-first token prompts -> GitHub secrets (no disk/logs)
@@ -14,6 +14,10 @@ function resultText(res) {
14
14
  }
15
15
  return JSON.stringify(res);
16
16
  }
17
+ var clamp01 = (n) => {
18
+ const v = typeof n === "number" ? n : Number(n);
19
+ return Number.isFinite(v) ? Math.min(1, Math.max(0, v)) : 0;
20
+ };
17
21
  function llmJudge(model) {
18
22
  return async ({ rubric, result }) => {
19
23
  if (!process.env.ANTHROPIC_API_KEY) throw new Error("ANTHROPIC_API_KEY not set");
@@ -27,7 +31,7 @@ function llmJudge(model) {
27
31
  const resp = await client.messages.create({
28
32
  model,
29
33
  max_tokens: 512,
30
- system: 'You are a strict evaluation judge. Score how well RESULT satisfies RUBRIC on a 1\u20135 scale (5 = fully satisfies). Reply ONLY with JSON: {"score": <1-5>, "pass": <bool>, "reason": "<short>"}.',
34
+ system: 'You are a strict evaluation judge. Score how well RESULT satisfies RUBRIC on a 0..1 scale (1 = fully satisfies). Reply ONLY with JSON: {"score": <0..1>, "pass": <bool>, "rationale": "<one sentence>"}.',
31
35
  messages: [{ role: "user", content: `RUBRIC:
32
36
  ${rubric}
33
37
 
@@ -38,12 +42,23 @@ ${result}` }]
38
42
  const json = text.match(/\{[\s\S]*\}/);
39
43
  if (!json) throw new Error(`judge returned no JSON: ${text.slice(0, 120)}`);
40
44
  const parsed = JSON.parse(json[0]);
41
- return { score: Number(parsed.score) || 0, pass: Boolean(parsed.pass), reason: parsed.reason };
45
+ return {
46
+ score: clamp01(parsed.score),
47
+ pass: Boolean(parsed.pass),
48
+ rationale: parsed.rationale ?? parsed.reason,
49
+ // `reason` = deprecated alias, one release
50
+ tokensIn: resp.usage?.input_tokens,
51
+ tokensOut: resp.usage?.output_tokens
52
+ };
42
53
  };
43
54
  }
44
55
  async function verifyEval(baseUrl, spec, judge) {
45
- const score = judge ?? llmJudge(spec.model ?? "claude-sonnet-4-6");
56
+ const model = spec.model ?? "claude-sonnet-4-6";
57
+ const score = judge ?? llmJudge(model);
46
58
  const checks = [];
59
+ const started = Date.now();
60
+ let tokensIn = 0;
61
+ let tokensOut = 0;
47
62
  const client = new Client({ name: "greenlight-verify", version: "0.0.0" });
48
63
  const transport = new StreamableHTTPClientTransport(new URL(baseUrl));
49
64
  try {
@@ -53,15 +68,22 @@ async function verifyEval(baseUrl, spec, judge) {
53
68
  }
54
69
  try {
55
70
  for (const c of spec.cases) {
56
- const min = c.minScore ?? 4;
71
+ const min = c.minScore ?? 0.8;
57
72
  try {
58
73
  const res = await client.callTool({ name: c.tool, arguments: c.args ?? {} });
59
- const verdict = await score({ rubric: c.rubric, result: resultText(res) });
74
+ const result = resultText(res);
75
+ const verdict = await score({ rubric: c.rubric, result });
60
76
  const pass = verdict.pass && verdict.score >= min;
77
+ const rationale = verdict.rationale ?? verdict.reason;
78
+ tokensIn += verdict.tokensIn ?? 0;
79
+ tokensOut += verdict.tokensOut ?? 0;
61
80
  checks.push({
62
81
  name: `eval: ${c.name}`,
63
82
  pass,
64
- detail: `score ${verdict.score}/5 (min ${min})${verdict.reason ? ` \u2014 ${verdict.reason}` : ""}`
83
+ score: verdict.score,
84
+ explanation: rationale,
85
+ output: result,
86
+ detail: `score ${verdict.score.toFixed(2)} (min ${min})${rationale ? ` \u2014 ${rationale}` : ""}`
65
87
  });
66
88
  } catch (e) {
67
89
  checks.push({ name: `eval: ${c.name}`, pass: false, detail: msg(e) });
@@ -70,10 +92,16 @@ async function verifyEval(baseUrl, spec, judge) {
70
92
  } finally {
71
93
  await client.close();
72
94
  }
73
- return report("eval", baseUrl, checks);
95
+ return {
96
+ ...report("eval", baseUrl, checks),
97
+ model,
98
+ durationMs: Date.now() - started,
99
+ ...tokensIn || tokensOut ? { tokensIn, tokensOut } : {}
100
+ };
74
101
  }
75
102
 
76
103
  export {
104
+ clamp01,
77
105
  llmJudge,
78
106
  verifyEval
79
107
  };
@@ -421,6 +421,53 @@ async function verifyApi(baseUrl, spec) {
421
421
  );
422
422
  }
423
423
 
424
+ // ../packages/verify/src/export.ts
425
+ var clamp01 = (n) => Math.min(1, Math.max(0, n));
426
+ function sumDefined(xs) {
427
+ const present = xs.filter((x) => typeof x === "number");
428
+ return present.length ? present.reduce((a, b) => a + b, 0) : void 0;
429
+ }
430
+ function toExportResult(reports, ctx) {
431
+ const checks = [];
432
+ for (const r of reports) {
433
+ for (const c of r.checks) {
434
+ checks.push({
435
+ name: c.name,
436
+ passed: c.pass,
437
+ input: c.input ?? null,
438
+ expected: c.expected ?? null,
439
+ output: c.output ?? null,
440
+ "eval.score": c.score != null ? clamp01(c.score) : c.pass ? 1 : 0,
441
+ "eval.explanation": c.explanation ?? null
442
+ });
443
+ }
444
+ }
445
+ const passed = reports.length > 0 && reports.every((r) => r.pass);
446
+ const passRate = checks.length === 0 ? 0 : checks.filter((c) => c.passed).length / checks.length;
447
+ const model = reports.find((r) => r.model)?.model;
448
+ const tokensIn = sumDefined(reports.map((r) => r.tokensIn));
449
+ const tokensOut = sumDefined(reports.map((r) => r.tokensOut));
450
+ const cost = sumDefined(reports.map((r) => r.costUsd));
451
+ const durationMs = sumDefined(reports.map((r) => r.durationMs));
452
+ const attributes = {};
453
+ if (model) attributes["gen_ai.request.model"] = model;
454
+ if (tokensIn != null) attributes["gen_ai.usage.input_tokens"] = tokensIn;
455
+ if (tokensOut != null) attributes["gen_ai.usage.output_tokens"] = tokensOut;
456
+ if (cost != null) attributes["gen_ai.response.cost"] = cost;
457
+ return {
458
+ schemaVersion: "1",
459
+ tool: ctx.tool,
460
+ mode: reports.map((r) => r.mode).join("+") || "verify",
461
+ env: ctx.env,
462
+ git_sha: ctx.gitSha ?? null,
463
+ passed,
464
+ pass_rate: passRate,
465
+ duration_ms: durationMs ?? null,
466
+ ...Object.keys(attributes).length ? { attributes } : {},
467
+ checks
468
+ };
469
+ }
470
+
424
471
  // ../packages/verify/src/index.ts
425
472
  function defineVerify(spec) {
426
473
  return spec;
@@ -456,11 +503,11 @@ async function verify(baseUrl, spec, opts) {
456
503
  return verifyTest2(spec, opts?.toolDir ?? process.cwd());
457
504
  }
458
505
  case "agent-web": {
459
- const { verifyAgentWeb: verifyAgentWeb2 } = await import("./agent-web-3FTO2TLJ.js");
506
+ const { verifyAgentWeb: verifyAgentWeb2 } = await import("./agent-web-BG5ZIVAB.js");
460
507
  return verifyAgentWeb2(baseUrl, spec);
461
508
  }
462
509
  case "eval": {
463
- const { verifyEval: verifyEval2 } = await import("./eval-44S2BATV.js");
510
+ const { verifyEval: verifyEval2 } = await import("./eval-YZXJSUKH.js");
464
511
  return verifyEval2(baseUrl, spec);
465
512
  }
466
513
  }
@@ -487,6 +534,7 @@ export {
487
534
  loadConfig,
488
535
  resolveUrl,
489
536
  scanSqlFiles,
537
+ toExportResult,
490
538
  defineVerify,
491
539
  verifyAll,
492
540
  allPass
@@ -124,6 +124,8 @@ async function runScenario(client, page, base, spec, scenario) {
124
124
  const messages = [{ role: "user", content: `Task: ${scenario.task}` }];
125
125
  const maxSteps = spec.maxSteps ?? 12;
126
126
  let finish = null;
127
+ let tokensIn = 0;
128
+ let tokensOut = 0;
127
129
  for (let step = 0; step < maxSteps && !finish; step++) {
128
130
  const resp = await client.messages.create({
129
131
  model: spec.model ?? "claude-sonnet-4-6",
@@ -132,6 +134,8 @@ async function runScenario(client, page, base, spec, scenario) {
132
134
  tools: TOOLS,
133
135
  messages
134
136
  });
137
+ tokensIn += resp.usage?.input_tokens ?? 0;
138
+ tokensOut += resp.usage?.output_tokens ?? 0;
135
139
  const blocks = resp.content;
136
140
  messages.push({ role: "assistant", content: blocks });
137
141
  const toolUses = blocks.filter((b) => b.type === "tool_use");
@@ -160,7 +164,7 @@ async function runScenario(client, page, base, spec, scenario) {
160
164
  checks.push({ ...c, name: `${tag} ${c.name}` });
161
165
  }
162
166
  if (checks.length === 0) checks.push({ name: `${tag} agent succeeded`, pass: true });
163
- return checks;
167
+ return { checks, tokensIn, tokensOut };
164
168
  }
165
169
  async function verifyAgentWeb(baseUrl, spec) {
166
170
  const base = baseUrl.replace(/\/+$/, "");
@@ -213,11 +217,17 @@ async function verifyAgentWeb(baseUrl, spec) {
213
217
  ]);
214
218
  }
215
219
  const checks = [];
220
+ const started = Date.now();
221
+ let tokensIn = 0;
222
+ let tokensOut = 0;
216
223
  try {
217
224
  for (const scenario of spec.scenarios) {
218
225
  const page = await browser.newPage();
219
226
  try {
220
- checks.push(...await runScenario(client, page, base, spec, scenario));
227
+ const r = await runScenario(client, page, base, spec, scenario);
228
+ checks.push(...r.checks);
229
+ tokensIn += r.tokensIn;
230
+ tokensOut += r.tokensOut;
221
231
  } catch (e) {
222
232
  checks.push({ name: `[${scenario.name}]`, pass: false, detail: msg(e) });
223
233
  } finally {
@@ -227,7 +237,12 @@ async function verifyAgentWeb(baseUrl, spec) {
227
237
  } finally {
228
238
  await browser.close();
229
239
  }
230
- return report("agent-web", baseUrl, checks);
240
+ return {
241
+ ...report("agent-web", baseUrl, checks),
242
+ model: spec.model ?? "claude-sonnet-4-6",
243
+ durationMs: Date.now() - started,
244
+ ...tokensIn || tokensOut ? { tokensIn, tokensOut } : {}
245
+ };
231
246
  }
232
247
 
233
248
  export {
@@ -1,9 +1,11 @@
1
1
  import {
2
+ clamp01,
2
3
  llmJudge,
3
4
  verifyEval
4
- } from "./chunk-XWTOJHLV.js";
5
+ } from "./chunk-3A6F2JNP.js";
5
6
  import "./chunk-QFKE5JKC.js";
6
7
  export {
8
+ clamp01,
7
9
  llmJudge,
8
10
  verifyEval
9
11
  };
package/dist/index.js CHANGED
@@ -2,12 +2,12 @@ import {
2
2
  defineConfig,
3
3
  defineVerify,
4
4
  loadConfig
5
- } from "./chunk-OBWWE7GE.js";
5
+ } from "./chunk-FZH2YQPJ.js";
6
6
  import "./chunk-HX7VA25D.js";
7
7
  import "./chunk-N3IKUCSF.js";
8
8
  import "./chunk-KP3Y6WRU.js";
9
- import "./chunk-KVOI4UL2.js";
10
- import "./chunk-XWTOJHLV.js";
9
+ import "./chunk-IYEIZYI5.js";
10
+ import "./chunk-3A6F2JNP.js";
11
11
  import "./chunk-QFKE5JKC.js";
12
12
  export {
13
13
  defineConfig,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@rtrentjones/greenlight",
3
- "version": "0.5.1",
3
+ "version": "0.6.0",
4
4
  "description": "Greenlight CLI — setup and lifecycle for the harness.",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -31,10 +31,10 @@
31
31
  "@anthropic-ai/sdk": "^0.69.0"
32
32
  },
33
33
  "devDependencies": {
34
- "@rtrentjones/greenlight-adapters": "0.5.1",
35
- "@rtrentjones/greenlight-loop": "0.5.1",
36
- "@rtrentjones/greenlight-verify": "0.5.1",
37
- "@rtrentjones/greenlight-shared": "0.5.1"
34
+ "@rtrentjones/greenlight-adapters": "0.6.0",
35
+ "@rtrentjones/greenlight-shared": "0.6.0",
36
+ "@rtrentjones/greenlight-verify": "0.6.0",
37
+ "@rtrentjones/greenlight-loop": "0.6.0"
38
38
  },
39
39
  "scripts": {
40
40
  "build": "node scripts/copy-assets.mjs && tsup",